Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TIKA-4294 -- improve serialization of ParseContext #1886

Merged
merged 1 commit into from
Aug 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.serialization;

import static org.apache.tika.serialization.ParseContextSerializer.PARSE_CONTEXT;

import java.io.IOException;
import java.util.Iterator;
import java.util.Map;

import com.fasterxml.jackson.core.JacksonException;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.databind.DeserializationContext;
import com.fasterxml.jackson.databind.JsonDeserializer;
import com.fasterxml.jackson.databind.JsonNode;
import org.apache.commons.io.IOExceptionWithCause;

import org.apache.tika.parser.ParseContext;

/**
 * Jackson deserializer that rebuilds a {@link ParseContext} from JSON.
 * <p>
 * Each field of the context object is read as one entry: the field name is the
 * name of the class the entry is registered under, and the field value is an
 * object that carries the concrete class to instantiate under
 * {@link TikaJsonSerializer#INSTANTIATED_CLASS_KEY}.
 */
public class ParseContextDeserializer extends JsonDeserializer<ParseContext> {

    /**
     * Reads the current JSON value as a tree and converts it into a {@link ParseContext}.
     *
     * @param jsonParser parser positioned at the value to read
     * @param deserializationContext unused
     * @return the populated parse context
     * @throws IOException if the JSON cannot be read or an entry cannot be rebuilt
     */
    @Override
    public ParseContext deserialize(JsonParser jsonParser, DeserializationContext deserializationContext) throws IOException, JacksonException {
        JsonNode root = jsonParser.getCodec().readTree(jsonParser);
        return readParseContext(root);
    }

    /**
     * Builds a {@link ParseContext} from a JSON node.
     * <p>
     * The node may either wrap the entries in a {@code "parseContext"} field
     * (e.g. <code>{ "parseContext": {} }</code>) or be the entries object itself.
     *
     * @param jsonNode the wrapper node or the contents node
     * @return a new parse context holding one value per field of the contents node
     * @throws IOException if an entry lacks the instantiated-class key, or if a class
     *         cannot be loaded or instantiated
     */
    public static ParseContext readParseContext(JsonNode jsonNode) throws IOException {
        //some use cases include the wrapper node, e.g. { "parseContext": {}}
        //some include the contents only.
        //Try to find "parseContext" to start. If that doesn't exist, assume the jsonNode is the contents.
        JsonNode contextNode = jsonNode.get(PARSE_CONTEXT);
        if (contextNode == null) {
            contextNode = jsonNode;
        }

        ParseContext parseContext = new ParseContext();
        Iterator<Map.Entry<String, JsonNode>> it = contextNode.fields();
        while (it.hasNext()) {
            Map.Entry<String, JsonNode> e = it.next();
            //the field name is the key the value was stored under in the ParseContext
            String superClassName = e.getKey();
            JsonNode obj = e.getValue();
            String className = readVal(TikaJsonSerializer.INSTANTIATED_CLASS_KEY, obj, null, true);
            try {
                //NOTE(review): class names come straight from the JSON; callers should
                //only feed this method input from a trusted source.
                Class clazz = Class.forName(className);
                Class superClazz = className.equals(superClassName) ? clazz : Class.forName(superClassName);
                //register under the key class (the field name), not the concrete class,
                //so that a lookup by the original key finds the value again
                parseContext.set(superClazz, TikaJsonDeserializer.deserialize(clazz, superClazz, obj));
            } catch (ReflectiveOperationException ex) {
                throw new IOExceptionWithCause(
                        "couldn't deserialize parse context entry '" + superClassName + "' as " + className, ex);
            }
        }
        return parseContext;
    }

    /**
     * Reads the text value stored under {@code key}.
     *
     * @param key field name to look up
     * @param jsonObj node to read from
     * @param defaultRet value returned when the field is absent and not required
     * @param isRequired whether a missing field is an error
     * @return the field's text value, or {@code defaultRet} if absent and optional
     * @throws IOException if the field is required but absent or JSON null
     */
    private static String readVal(String key, JsonNode jsonObj, String defaultRet, boolean isRequired) throws IOException {
        JsonNode valNode = jsonObj.get(key);
        //an explicit JSON null would otherwise be turned into the string "null" by asText()
        if (valNode == null || valNode.isNull()) {
            if (isRequired) {
                throw new IOException("required value for key '" + key + "', but it was missing");
            }
            return defaultRet;
        }
        return valNode.asText();
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -25,16 +25,17 @@
import org.apache.tika.parser.ParseContext;

public class ParseContextSerializer extends JsonSerializer<ParseContext> {
public static final String PARSE_CONTEXT = "parseContext";


@Override
public void serialize(ParseContext parseContext, JsonGenerator jsonGenerator, SerializerProvider serializerProvider) throws IOException {
jsonGenerator.writeFieldName("parseContext");
jsonGenerator.writeFieldName(PARSE_CONTEXT);
jsonGenerator.writeStartObject();
for (String className : parseContext.keySet()) {
try {
Class clazz = Class.forName(className);
TikaJsonSerializer.serialize(className, parseContext.get(clazz), clazz, jsonGenerator);
TikaJsonSerializer.serialize(className, parseContext.get(clazz), null, jsonGenerator);
} catch (TikaSerializationException e) {
throw new IOException(e);
} catch (ClassNotFoundException e) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ public static void serializeObject(String fieldName, Object obj, Class superClas
jsonGenerator.writeStringField(INSTANTIATED_CLASS_KEY, obj
.getClass()
.getName());
if (!obj
if (superClass != null && !obj
.getClass()
.equals(superClass)) {
jsonGenerator.writeStringField(SUPER_CLASS_KEY, superClass.getName());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
*/
package org.apache.tika.serialization.pipes;

import static org.apache.tika.serialization.ParseContextSerializer.PARSE_CONTEXT;

import java.io.IOException;
import java.io.Reader;
import java.io.StringWriter;
Expand All @@ -28,17 +30,15 @@
import com.fasterxml.jackson.core.JsonGenerator;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.apache.commons.io.IOExceptionWithCause;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.pipes.FetchEmitTuple;
import org.apache.tika.pipes.emitter.EmitKey;
import org.apache.tika.pipes.fetcher.FetchKey;
import org.apache.tika.serialization.JsonMetadata;
import org.apache.tika.serialization.ParseContextDeserializer;
import org.apache.tika.serialization.ParseContextSerializer;
import org.apache.tika.serialization.TikaJsonDeserializer;
import org.apache.tika.serialization.TikaJsonSerializer;
import org.apache.tika.utils.StringUtils;

public class JsonFetchEmitTuple {
Expand All @@ -52,7 +52,6 @@ public class JsonFetchEmitTuple {
public static final String EMITKEY = "emitKey";
public static final String METADATAKEY = "metadata";
public static final String ON_PARSE_EXCEPTION = "onParseException";
public static final String PARSE_CONTEXT = "parseContext";

public static FetchEmitTuple fromJson(Reader reader) throws IOException {
//try (JsonParser jParser = new JsonFactory().setStreamReadConstraints(StreamReadConstraints.builder()
Expand All @@ -72,7 +71,8 @@ static FetchEmitTuple parseFetchEmitTuple(JsonNode root) throws IOException {
long fetchRangeStart = readLong(FETCH_RANGE_START, root, -1l, false);
long fetchRangeEnd = readLong(FETCH_RANGE_END, root, -1l, false);
Metadata metadata = readMetadata(root);
ParseContext parseContext = readParseContext(root);
JsonNode parseContextNode = root.get(PARSE_CONTEXT);
ParseContext parseContext = parseContextNode == null ? new ParseContext() : ParseContextDeserializer.readParseContext(parseContextNode);
FetchEmitTuple.ON_PARSE_EXCEPTION onParseException = readOnParseException(root);

return new FetchEmitTuple(id, new FetchKey(fetcherName, fetchKey, fetchRangeStart, fetchRangeEnd), new EmitKey(emitterName, emitKey), metadata, parseContext,
Expand All @@ -94,30 +94,6 @@ private static FetchEmitTuple.ON_PARSE_EXCEPTION readOnParseException(JsonNode r
}
}

/**
 * Reads the optional {@code parseContext} field of {@code root} into a {@link ParseContext}.
 * <p>
 * Each field of the context object is one entry. Its value must carry the concrete
 * class name under {@link TikaJsonSerializer#INSTANTIATED_CLASS_KEY} and may carry a
 * super class name under {@link TikaJsonSerializer#SUPER_CLASS_KEY}; when the latter
 * is absent the concrete class name is used for both.
 *
 * @param root node that may contain a {@code parseContext} field
 * @return the populated context, or an empty one if the field is absent
 * @throws IOException if a required value is missing or a class cannot be loaded or instantiated
 */
private static ParseContext readParseContext(JsonNode root) throws IOException {
    JsonNode contextNode = root.get(PARSE_CONTEXT);
    if (contextNode == null) {
        return new ParseContext();
    }
    ParseContext parseContext = new ParseContext();
    Iterator<Map.Entry<String, JsonNode>> it = contextNode.fields();
    while (it.hasNext()) {
        JsonNode obj = it.next().getValue();
        String className = readVal(TikaJsonSerializer.INSTANTIATED_CLASS_KEY, obj, null, true);
        String superClassName = readVal(TikaJsonSerializer.SUPER_CLASS_KEY, obj, className, false);
        try {
            Class clazz = Class.forName(className);
            //compare the two names; comparing the Class object to a String is never true
            Class superClazz = className.equals(superClassName) ? clazz : Class.forName(superClassName);
            //NOTE(review): the value is registered under the concrete class; confirm whether
            //callers expect it under the super class instead.
            parseContext.set(clazz, TikaJsonDeserializer.deserialize(clazz, superClazz, obj));
        } catch (ReflectiveOperationException ex) {
            throw new IOExceptionWithCause(ex);
        }
    }
    return parseContext;
}

private static Metadata readMetadata(JsonNode root) {
JsonNode metadataNode = root.get(METADATAKEY);
if (metadataNode == null) {
Expand Down
Loading