Skip to content

Commit

Permalink
TIKA-4287 -- enable serialization by wrapping the PDFBox ImageType ob…
Browse files Browse the repository at this point in the history
…ject (#1866)
  • Loading branch information
tballison authored Jul 30, 2024
1 parent 4b3f0a4 commit a53af59
Show file tree
Hide file tree
Showing 5 changed files with 23 additions and 65 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -673,7 +673,7 @@ private RenderResult noContextRenderCurrentPage(Metadata pageMetadata,

try {
BufferedImage image =
renderer.renderImageWithDPI(pageIndex, dpi, config.getOcrImageType());
renderer.renderImageWithDPI(pageIndex, dpi, config.getOcrImageType().getImageType());

//TODO -- get suffix based on OcrImageType
tmpFile = tmpResources.createTempFile();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1099,7 +1099,7 @@ private void initRenderer(PDFParserConfig config, ParseContext context) {
//set a default renderer if nothing was defined
PDFBoxRenderer pdfBoxRenderer = new PDFBoxRenderer();
pdfBoxRenderer.setDPI(config.getOcrDPI());
pdfBoxRenderer.setImageType(config.getOcrImageType());
pdfBoxRenderer.setImageType(config.getOcrImageType().getImageType());
pdfBoxRenderer.setImageFormatName(config.getOcrImageFormatName());
config.setRenderer(pdfBoxRenderer);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@
import java.lang.reflect.Modifier;
import java.util.HashSet;
import java.util.Locale;
import java.util.Objects;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
Expand All @@ -45,6 +44,18 @@
*/
public class PDFParserConfig implements Serializable {

/**
 * Tika-side enum that wraps the PDFBox {@link ImageType} constants used when
 * rendering a page image for OCR. The config stores this enum instead of the
 * PDFBox type; callers that need the PDFBox value unwrap it via
 * {@link #getImageType()}.
 */
public enum TikaImageType {
    RGB(ImageType.RGB),
    GRAY(ImageType.GRAY);

    // Assigned once per constant in the constructor; final so it can never be reassigned.
    private final ImageType imageType;

    /**
     * @param imageType the PDFBox image type this constant stands for
     */
    TikaImageType(ImageType imageType) {
        this.imageType = imageType;
    }

    /**
     * @return the underlying PDFBox image type for this constant
     */
    public ImageType getImageType() {
        return imageType;
    }
}

private static final long serialVersionUID = 6492570218190936986L;
private final Set<String> userConfigured = new HashSet<>();
Expand Down Expand Up @@ -114,7 +125,7 @@ public class PDFParserConfig implements Serializable {
private OCR_RENDERING_STRATEGY ocrRenderingStrategy = OCR_RENDERING_STRATEGY.ALL;

private int ocrDPI = 300;
private ImageType ocrImageType = ImageType.GRAY;
private TikaImageType ocrImageType = TikaImageType.GRAY;
private String ocrImageFormatName = "png";
private float ocrImageQuality = 1.0f;

Expand Down Expand Up @@ -623,9 +634,9 @@ public void setOcrImageFormatName(String ocrImageFormatName) {
* Image type used to render the page image for OCR.
*
* @return image type
* @see #setOcrImageType(ImageType)
* @see #setOcrImageType(TikaImageType)
*/
public ImageType getOcrImageType() {
public TikaImageType getOcrImageType() {
return ocrImageType;
}

Expand All @@ -634,15 +645,15 @@ public ImageType getOcrImageType() {
*
* @param ocrImageType
*/
public void setOcrImageType(ImageType ocrImageType) {
public void setOcrImageType(TikaImageType ocrImageType) {
this.ocrImageType = ocrImageType;
userConfigured.add("ocrImageType");
}

/**
* Image type used to render the page image for OCR.
*
* @see #setOcrImageType(ImageType)
* @see #setOcrImageType(TikaImageType)
*/
public void setOcrImageType(String ocrImageTypeString) {
setOcrImageType(parseImageType(ocrImageTypeString));
Expand Down Expand Up @@ -749,8 +760,8 @@ public void setSetKCMS(boolean setKCMS) {
userConfigured.add("setKCMS");
}

private ImageType parseImageType(String ocrImageType) {
for (ImageType t : ImageType.values()) {
private TikaImageType parseImageType(String ocrImageType) {
for (TikaImageType t : TikaImageType.values()) {
if (ocrImageType.equalsIgnoreCase(t.toString())) {
return t;
}
Expand Down Expand Up @@ -806,58 +817,6 @@ public PDFParserConfig cloneAndUpdate(PDFParserConfig updates) throws TikaExcept
return updated;
}

/**
 * Field-by-field equality for two configs of exactly the same class.
 *
 * @param o the object to compare against
 * @return {@code true} if {@code o} is this instance, or is a non-null object of
 *         the same runtime class whose fields listed below all match
 */
@Override
public boolean equals(Object o) {
if (this == o) {
return true;
}
// Exact class match (not instanceof): a subclass instance is never equal to this.
if (o == null || getClass() != o.getClass()) {
return false;
}
PDFParserConfig config = (PDFParserConfig) o;
// Float.compare is used for the floating-point fields rather than ==.
// The fields compared with == that are not primitives are presumably enum-typed
// (reference comparison) -- NOTE(review): confirm for imageStrategy.
return enableAutoSpace == config.enableAutoSpace &&
suppressDuplicateOverlappingText == config.suppressDuplicateOverlappingText &&
extractAnnotationText == config.extractAnnotationText &&
sortByPosition == config.sortByPosition &&
extractAcroFormContent == config.extractAcroFormContent &&
extractBookmarksText == config.extractBookmarksText &&
extractInlineImages == config.extractInlineImages &&
extractInlineImageMetadataOnly == config.extractInlineImageMetadataOnly &&
extractUniqueInlineImagesOnly == config.extractUniqueInlineImagesOnly &&
extractMarkedContent == config.extractMarkedContent &&
Float.compare(config.dropThreshold, dropThreshold) == 0 &&
ifXFAExtractOnlyXFA == config.ifXFAExtractOnlyXFA && ocrDPI == config.ocrDPI &&
Float.compare(config.ocrImageQuality, ocrImageQuality) == 0 &&
catchIntermediateIOExceptions == config.catchIntermediateIOExceptions &&
extractActions == config.extractActions &&
extractFontNames == config.extractFontNames &&
maxMainMemoryBytes == config.maxMainMemoryBytes && setKCMS == config.setKCMS &&
detectAngles == config.detectAngles &&
Objects.equals(userConfigured, config.userConfigured) &&
Objects.equals(averageCharTolerance, config.averageCharTolerance) &&
Objects.equals(spacingTolerance, config.spacingTolerance) &&
ocrStrategy == config.ocrStrategy &&
Objects.equals(ocrStrategyAuto, config.ocrStrategyAuto) &&
ocrRenderingStrategy == config.ocrRenderingStrategy &&
ocrImageType == config.ocrImageType &&
Objects.equals(ocrImageFormatName, config.ocrImageFormatName) &&
imageStrategy == config.imageStrategy &&
Objects.equals(accessChecker, config.accessChecker) &&
Objects.equals(renderer, config.renderer);
}

/**
 * Hash code built with {@link Objects#hash(Object...)} over the same set of
 * fields that {@link #equals(Object)} compares, so equal configs hash alike.
 *
 * @return the combined hash of the listed fields
 */
@Override
public int hashCode() {
// Keep this list in sync with equals(): any field added there belongs here too.
return Objects.hash(userConfigured, enableAutoSpace, suppressDuplicateOverlappingText,
extractAnnotationText, sortByPosition, extractAcroFormContent, extractBookmarksText,
extractInlineImages, extractInlineImageMetadataOnly, extractUniqueInlineImagesOnly,
extractMarkedContent, averageCharTolerance, spacingTolerance, dropThreshold,
ifXFAExtractOnlyXFA, ocrStrategy, ocrStrategyAuto, ocrRenderingStrategy, ocrDPI,
ocrImageType, ocrImageFormatName, ocrImageQuality, imageStrategy, accessChecker,
catchIntermediateIOExceptions, extractActions, extractFontNames, maxMainMemoryBytes,
setKCMS, detectAngles, renderer);
}

public void setRenderer(Renderer renderer) {
this.renderer = renderer;
userConfigured.add("renderer");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,7 @@ protected ImageType getImageType(ParseContext parseContext) {
if (pdfParserConfig == null) {
return defaultImageType;
}
return pdfParserConfig.getOcrImageType();
return pdfParserConfig.getOcrImageType().getImageType();
}

protected String getImageFormatName(ParseContext parseContext) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@
import java.util.regex.Pattern;

import org.apache.commons.io.IOUtils;
import org.apache.pdfbox.rendering.ImageType;
import org.junit.jupiter.api.AfterAll;
import org.junit.jupiter.api.BeforeAll;
import org.junit.jupiter.api.Disabled;
Expand Down Expand Up @@ -1129,7 +1128,7 @@ public void testInitializationOfNonPrimitivesViaConfig() throws Exception {
pdfParser.getClass().getName());
assertEquals(PDFParserConfig.OCR_STRATEGY.OCR_ONLY,
((PDFParser) pdfParser).getPDFParserConfig().getOcrStrategy());
assertEquals(ImageType.RGB,
assertEquals(PDFParserConfig.TikaImageType.GRAY.RGB,
((PDFParser) pdfParser).getPDFParserConfig().getOcrImageType());
}
}
Expand Down

0 comments on commit a53af59

Please sign in to comment.