diff --git a/.github/workflows/maven.yml b/.github/workflows/maven.yml index cae86054cd8..a0095048c97 100644 --- a/.github/workflows/maven.yml +++ b/.github/workflows/maven.yml @@ -70,18 +70,6 @@ jobs: with: maven-version: 3.9.9 - - name: Set up cache date - run: echo "CACHE_DATE=$(date +'%Y-%m-%d')" >> $GITHUB_ENV - - - name: Cache Maven repository - id: maven-cache - uses: actions/cache@1bd1e32a3bdc45362d1e726936510720a7c30a57 # v4.2.0 - with: - path: ~/.m2/repository - key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }}-${{ env.CACHE_DATE }} - restore-keys: | - ${{ runner.os }}-maven- - - name: Cache Docker layers uses: actions/cache@1bd1e32a3bdc45362d1e726936510720a7c30a57 # v4.2.0 with: diff --git a/inception/inception-io-xml/src/main/java/de/tudarmstadt/ukp/inception/io/xml/dkprocore/CasXmlHandler.java b/inception/inception-io-xml/src/main/java/de/tudarmstadt/ukp/inception/io/xml/dkprocore/CasXmlHandler.java index 5d0bec1d0e2..52dc4e01584 100644 --- a/inception/inception-io-xml/src/main/java/de/tudarmstadt/ukp/inception/io/xml/dkprocore/CasXmlHandler.java +++ b/inception/inception-io-xml/src/main/java/de/tudarmstadt/ukp/inception/io/xml/dkprocore/CasXmlHandler.java @@ -57,6 +57,7 @@ public class CasXmlHandler private XmlDocument docNode; private boolean captureText = true; private boolean splitSentencesInBlockElements = true; + private boolean commitText = true; private final Set listeners = new LinkedHashSet<>(); @@ -67,6 +68,11 @@ public CasXmlHandler(JCas aJCas) stack = new ArrayDeque<>(); } + public void setCommitText(boolean aCommitText) + { + commitText = aCommitText; + } + public void addListener(ElementListener aListener) { listeners.add(aListener); @@ -125,7 +131,9 @@ public void endDocument() throws SAXException l.endDocument(docNode); } - jcas.setDocumentText(text.toString()); + if (commitText) { + jcas.setDocumentText(text.toString()); + } if (!blockElements.isEmpty()) { if (splitSentencesInBlockElements) { @@ -213,7 +221,7 @@ public void 
endElement(String aUri, String aLocalName, String aQName) throws SAX } @Override - public void characters(char[] aCh, int aStart, int aLength) throws SAXException + public void characters(char[] aCh, int aStart, int aLength) { if (stack.isEmpty()) { // We ignore any characters outside the root elements. These could include e.g. @@ -241,7 +249,7 @@ public void characters(char[] aCh, int aStart, int aLength) throws SAXException } @Override - public void ignorableWhitespace(char[] aCh, int aStart, int aLength) throws SAXException + public void ignorableWhitespace(char[] aCh, int aStart, int aLength) { characters(aCh, aStart, aLength); } diff --git a/inception/inception-pdf-editor2/pom.xml b/inception/inception-pdf-editor2/pom.xml index dc79b304a75..16b3c55425c 100644 --- a/inception/inception-pdf-editor2/pom.xml +++ b/inception/inception-pdf-editor2/pom.xml @@ -73,6 +73,10 @@ de.tudarmstadt.ukp.inception.app inception-external-editor + + de.tudarmstadt.ukp.inception.app + inception-io-xml + org.apache.commons diff --git a/inception/inception-pdf-editor2/src/main/java/de/tudarmstadt/ukp/inception/pdfeditor2/deprecated/LegacyPDFAndPDFGraphicsStreamEngine.java b/inception/inception-pdf-editor2/src/main/java/de/tudarmstadt/ukp/inception/pdfeditor2/deprecated/LegacyPDFAndPDFGraphicsStreamEngine.java deleted file mode 100644 index f9d904cf709..00000000000 --- a/inception/inception-pdf-editor2/src/main/java/de/tudarmstadt/ukp/inception/pdfeditor2/deprecated/LegacyPDFAndPDFGraphicsStreamEngine.java +++ /dev/null @@ -1,327 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.inception.pdfeditor2.deprecated; - -import java.awt.geom.Point2D; -import java.io.IOException; -import java.io.InputStream; -import java.util.Map; -import java.util.WeakHashMap; - -import org.apache.fontbox.ttf.TrueTypeFont; -import org.apache.fontbox.util.BoundingBox; -import org.apache.pdfbox.contentstream.PDFGraphicsStreamEngine; -import org.apache.pdfbox.cos.COSDictionary; -import org.apache.pdfbox.pdmodel.PDPage; -import org.apache.pdfbox.pdmodel.common.PDRectangle; -import org.apache.pdfbox.pdmodel.font.PDCIDFont; -import org.apache.pdfbox.pdmodel.font.PDCIDFontType2; -import org.apache.pdfbox.pdmodel.font.PDFont; -import org.apache.pdfbox.pdmodel.font.PDFontDescriptor; -import org.apache.pdfbox.pdmodel.font.PDSimpleFont; -import org.apache.pdfbox.pdmodel.font.PDTrueTypeFont; -import org.apache.pdfbox.pdmodel.font.PDType0Font; -import org.apache.pdfbox.pdmodel.font.PDType3Font; -import org.apache.pdfbox.pdmodel.font.encoding.GlyphList; -import org.apache.pdfbox.pdmodel.graphics.state.PDGraphicsState; -import org.apache.pdfbox.text.TextPosition; -import org.apache.pdfbox.util.Matrix; -import org.apache.pdfbox.util.Vector; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * Adapts code from the PDFBox {@code LegacyPDFStreamEngine} but uses it in the context of a - * {@link PDFGraphicsStreamEngine}. 
- */ -@Deprecated -public abstract class LegacyPDFAndPDFGraphicsStreamEngine - extends PDFGraphicsStreamEngineAdapter -{ - private static final Logger LOG = LoggerFactory - .getLogger(LegacyPDFAndPDFGraphicsStreamEngine.class); - - private static final GlyphList GLYPHLIST; - - private final Map fontHeightMap = new WeakHashMap(); - - private int pageRotation; - private PDRectangle pageSize; - private Matrix translateMatrix; - - // Source: - // https://github.com/apache/pdfbox/blob/10d1e91af4eb9a06af7e95460533bf3ebc1b1280/pdfbox/src/main/java/org/apache/pdfbox/text/LegacyPDFStreamEngine.java#L89 - static { - // load additional glyph list for Unicode mapping - String path = "/org/apache/pdfbox/resources/glyphlist/additional.txt"; - InputStream input = GlyphList.class.getResourceAsStream(path); - try { - GLYPHLIST = new GlyphList(GlyphList.getAdobeGlyphList(), input); - input.close(); - } - catch (IOException ex) { - throw new RuntimeException(ex); - } - } - - protected LegacyPDFAndPDFGraphicsStreamEngine() - { - super(null); - } - - // source: - // https://github.com/apache/pdfbox/blob/10d1e91af4eb9a06af7e95460533bf3ebc1b1280/pdfbox/src/main/java/org/apache/pdfbox/text/LegacyPDFStreamEngine.java#L141 - /** - * This will initialize and process the contents of the stream. - * - * @param page - * the page to process - * @throws java.io.IOException - * if there is an error accessing the stream. 
- */ - @Override - public void processPage(PDPage page) throws IOException - { - this.pageRotation = page.getRotation(); - this.pageSize = page.getCropBox(); - - if (pageSize.getLowerLeftX() == 0 && pageSize.getLowerLeftY() == 0) { - translateMatrix = null; - } - else { - // translation matrix for cropbox - translateMatrix = Matrix.getTranslateInstance(-pageSize.getLowerLeftX(), - -pageSize.getLowerLeftY()); - } - super.processPage(page); - } - - // source: - // https://github.com/apache/pdfbox/blob/10d1e91af4eb9a06af7e95460533bf3ebc1b1280/pdfbox/src/main/java/org/apache/pdfbox/text/LegacyPDFStreamEngine.java#L158 - /** - * Called when a glyph is to be processed. The heuristic calculations here were originally - * written by Ben Litchfield for PDFStreamEngine. - */ - @Override - protected void showGlyph(Matrix textRenderingMatrix, PDFont font, int code, Vector displacement) - throws IOException - { - // - // legacy calculations which were previously in PDFStreamEngine - // - // DO NOT USE THIS CODE UNLESS YOU ARE WORKING WITH PDFTextStripper. - // THIS CODE IS DELIBERATELY INCORRECT - // - - PDGraphicsState state = getGraphicsState(); - Matrix ctm = state.getCurrentTransformationMatrix(); - float fontSize = state.getTextState().getFontSize(); - float horizontalScaling = state.getTextState().getHorizontalScaling() / 100f; - Matrix textMatrix = getTextMatrix(); - - float displacementX = displacement.getX(); - // the sorting algorithm is based on the width of the character. 
As the displacement - // for vertical characters doesn't provide any suitable value for it, we have to - // calculate our own - if (font.isVertical()) { - displacementX = font.getWidth(code) / 1000; - // there may be an additional scaling factor for true type fonts - TrueTypeFont ttf = null; - if (font instanceof PDTrueTypeFont) { - ttf = ((PDTrueTypeFont) font).getTrueTypeFont(); - } - else if (font instanceof PDType0Font) { - PDCIDFont cidFont = ((PDType0Font) font).getDescendantFont(); - if (cidFont instanceof PDCIDFontType2) { - ttf = ((PDCIDFontType2) cidFont).getTrueTypeFont(); - } - } - if (ttf != null && ttf.getUnitsPerEm() != 1000) { - displacementX *= 1000f / ttf.getUnitsPerEm(); - } - } - - // - // legacy calculations which were previously in PDFStreamEngine - // - // DO NOT USE THIS CODE UNLESS YOU ARE WORKING WITH PDFTextStripper. - // THIS CODE IS DELIBERATELY INCORRECT - // - - // (modified) combined displacement, this is calculated *without* taking the character - // spacing and word spacing into account, due to legacy code in TextStripper - float tx = displacementX * fontSize * horizontalScaling; - float ty = displacement.getY() * fontSize; - - // (modified) combined displacement matrix - Matrix td = Matrix.getTranslateInstance(tx, ty); - - // (modified) text rendering matrix - Matrix nextTextRenderingMatrix = td.multiply(textMatrix).multiply(ctm); // text space -> - // device space - float nextX = nextTextRenderingMatrix.getTranslateX(); - float nextY = nextTextRenderingMatrix.getTranslateY(); - - // (modified) width and height calculations - float dxDisplay = nextX - textRenderingMatrix.getTranslateX(); - Float fontHeight = fontHeightMap.get(font.getCOSObject()); - if (fontHeight == null) { - fontHeight = computeFontHeight(font); - fontHeightMap.put(font.getCOSObject(), fontHeight); - } - float dyDisplay = fontHeight * textRenderingMatrix.getScalingFactorY(); - - // - // start of the original method - // - - // Note on variable names. 
There are three different units being used in this code. - // Character sizes are given in glyph units, text locations are initially given in text - // units, and we want to save the data in display units. The variable names should end with - // Text or Disp to represent if the values are in text or disp units (no glyph units are - // saved). - - float glyphSpaceToTextSpaceFactor = 1 / 1000f; - if (font instanceof PDType3Font) { - glyphSpaceToTextSpaceFactor = font.getFontMatrix().getScaleX(); - } - - float spaceWidthText = 0; - try { - // to avoid crash as described in PDFBOX-614, see what the space displacement should be - spaceWidthText = font.getSpaceWidth() * glyphSpaceToTextSpaceFactor; - } - catch (Throwable exception) { - LOG.warn("Unable to calculate spaceWidthText", exception); - } - - if (spaceWidthText == 0) { - spaceWidthText = font.getAverageFontWidth() * glyphSpaceToTextSpaceFactor; - // the average space width appears to be higher than necessary so make it smaller - spaceWidthText *= .80f; - } - if (spaceWidthText == 0) { - spaceWidthText = 1.0f; // if could not find font, use a generic value - } - - // the space width has to be transformed into display units - float spaceWidthDisplay = spaceWidthText * textRenderingMatrix.getScalingFactorX(); - - // use our additional glyph list for Unicode mapping - String unicodeMapping = font.toUnicode(code, GLYPHLIST); - - // when there is no Unicode mapping available, Acrobat simply coerces the character code - // into Unicode, so we do the same. Subclasses of PDFStreamEngine don't necessarily want - // this, which is why we leave it until this point in PDFTextStreamEngine. - if (unicodeMapping == null) { - if (font instanceof PDSimpleFont) { - char c = (char) code; - unicodeMapping = new String(new char[] { c }); - } - else { - // Acrobat doesn't seem to coerce composite font's character codes, instead it - // skips them. See the "allah2.pdf" TestTextStripper file. 
- return; - } - } - - // adjust for cropbox if needed - Matrix translatedTextRenderingMatrix; - if (translateMatrix == null) { - translatedTextRenderingMatrix = textRenderingMatrix; - } - else { - translatedTextRenderingMatrix = Matrix.concatenate(translateMatrix, - textRenderingMatrix); - nextX -= pageSize.getLowerLeftX(); - nextY -= pageSize.getLowerLeftY(); - } - - processTextPosition( - new TextPosition(pageRotation, pageSize.getWidth(), pageSize.getHeight(), - translatedTextRenderingMatrix, nextX, nextY, Math.abs(dyDisplay), dxDisplay, - Math.abs(spaceWidthDisplay), unicodeMapping, new int[] { code }, font, - fontSize, (int) (fontSize * textMatrix.getScalingFactorX())) - // , textRenderingMatrix - ); - } - - // Source: - // https://github.com/apache/pdfbox/blob/10d1e91af4eb9a06af7e95460533bf3ebc1b1280/pdfbox/src/main/java/org/apache/pdfbox/text/LegacyPDFStreamEngine.java#L329 - /** - * Compute the font height. Override this if you want to use own calculations. - * - * @param font - * the font. - * @return the font height. - * - * @throws IOException - * if there is an error while getting the font bounding box. - */ - protected float computeFontHeight(PDFont font) throws IOException - { - BoundingBox bbox = font.getBoundingBox(); - if (bbox.getLowerLeftY() < Short.MIN_VALUE) { - // PDFBOX-2158 and PDFBOX-3130 - // files by Salmat eSolutions / ClibPDF Library - bbox.setLowerLeftY(-(bbox.getLowerLeftY() + 65536)); - } - // 1/2 the bbox is used as the height todo: why? 
- float glyphHeight = bbox.getHeight() / 2; - - // sometimes the bbox has very high values, but CapHeight is OK - PDFontDescriptor fontDescriptor = font.getFontDescriptor(); - if (fontDescriptor != null) { - float capHeight = fontDescriptor.getCapHeight(); - if (Float.compare(capHeight, 0) != 0 - && (capHeight < glyphHeight || Float.compare(glyphHeight, 0) == 0)) { - glyphHeight = capHeight; - } - // PDFBOX-3464, PDFBOX-4480, PDFBOX-4553: - // sometimes even CapHeight has very high value, but Ascent and Descent are ok - float ascent = fontDescriptor.getAscent(); - float descent = fontDescriptor.getDescent(); - if (capHeight > ascent && ascent > 0 && descent < 0 - && ((ascent - descent) / 2 < glyphHeight - || Float.compare(glyphHeight, 0) == 0)) { - glyphHeight = (ascent - descent) / 2; - } - } - - // transformPoint from glyph space -> text space - float height; - if (font instanceof PDType3Font) { - height = font.getFontMatrix().transformPoint(0, glyphHeight).y; - } - else { - height = glyphHeight / 1000; - } - - return height; - } - - @Override - public Point2D getCurrentPoint() throws IOException - { - return new Point2D.Float(0.0f, 0.0f); - } - - protected abstract void processTextPosition( - TextPosition aTextPosition /* , Matrix aTextRenderingMatrix */) - throws IOException; - -} diff --git a/inception/inception-pdf-editor2/src/main/java/de/tudarmstadt/ukp/inception/pdfeditor2/deprecated/PDFGraphicsStreamEngineAdapter.java b/inception/inception-pdf-editor2/src/main/java/de/tudarmstadt/ukp/inception/pdfeditor2/deprecated/PDFGraphicsStreamEngineAdapter.java deleted file mode 100644 index 54f931ecad3..00000000000 --- a/inception/inception-pdf-editor2/src/main/java/de/tudarmstadt/ukp/inception/pdfeditor2/deprecated/PDFGraphicsStreamEngineAdapter.java +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Licensed to the Technische Universität Darmstadt under one - * or more contributor license agreements. 
See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt - * licenses this file to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package de.tudarmstadt.ukp.inception.pdfeditor2.deprecated; - -import java.awt.geom.Point2D; -import java.io.IOException; - -import org.apache.pdfbox.contentstream.PDFGraphicsStreamEngine; -import org.apache.pdfbox.cos.COSName; -import org.apache.pdfbox.pdmodel.PDPage; -import org.apache.pdfbox.pdmodel.graphics.image.PDImage; - -/** - * Base class that removes the need for subclasses to implement all the abstract methods of - * {@link PDFGraphicsStreamEngine} that they might not even care about. 
- */ -@Deprecated -public abstract class PDFGraphicsStreamEngineAdapter - extends PDFGraphicsStreamEngine -{ - protected PDFGraphicsStreamEngineAdapter(PDPage aPage) - { - super(aPage); - } - - @Override - public void appendRectangle(Point2D aP0, Point2D aP1, Point2D aP2, Point2D aP3) - throws IOException - { - // No action - } - - @Override - public void drawImage(PDImage aPdImage) throws IOException - { - // No action - } - - @Override - public void clip(int aWindingRule) throws IOException - { - // No action - } - - @Override - public void moveTo(float aX, float aY) throws IOException - { - // No action - } - - @Override - public void lineTo(float aX, float aY) throws IOException - { - // No action - } - - @Override - public void curveTo(float aX1, float aY1, float aX2, float aY2, float aX3, float aY3) - throws IOException - { - // No action - } - - @Override - public void closePath() throws IOException - { - // No action - } - - @Override - public void endPath() throws IOException - { - // No action - } - - @Override - public void strokePath() throws IOException - { - // No action - } - - @Override - public void fillPath(int aWindingRule) throws IOException - { - // No action - } - - @Override - public void fillAndStrokePath(int aWindingRule) throws IOException - { - // No action - } - - @Override - public void shadingFill(COSName aShadingName) throws IOException - { - // No action - } -} diff --git a/inception/inception-pdf-editor2/src/main/java/de/tudarmstadt/ukp/inception/pdfeditor2/format/PdfFormatSupport.java b/inception/inception-pdf-editor2/src/main/java/de/tudarmstadt/ukp/inception/pdfeditor2/format/PdfFormatSupport.java index b590d681ea7..00eaa6fd581 100644 --- a/inception/inception-pdf-editor2/src/main/java/de/tudarmstadt/ukp/inception/pdfeditor2/format/PdfFormatSupport.java +++ b/inception/inception-pdf-editor2/src/main/java/de/tudarmstadt/ukp/inception/pdfeditor2/format/PdfFormatSupport.java @@ -63,6 +63,7 @@ public CollectionReaderDescription 
getReaderDescription(Project aProject, TypeSystemDescription aTSD) throws ResourceInitializationException { - return createReaderDescription(VisualPdfReader.class, aTSD); + return createReaderDescription(VisualPdfReader.class, aTSD, // + VisualPdfReader.PARAM_GENERATE_HTML_STRUCTURE, true); } } diff --git a/inception/inception-pdf-editor2/src/main/java/de/tudarmstadt/ukp/inception/pdfeditor2/format/VisualPdfReader.java b/inception/inception-pdf-editor2/src/main/java/de/tudarmstadt/ukp/inception/pdfeditor2/format/VisualPdfReader.java index b5a28fd012d..63fe8f8ccf8 100644 --- a/inception/inception-pdf-editor2/src/main/java/de/tudarmstadt/ukp/inception/pdfeditor2/format/VisualPdfReader.java +++ b/inception/inception-pdf-editor2/src/main/java/de/tudarmstadt/ukp/inception/pdfeditor2/format/VisualPdfReader.java @@ -35,7 +35,11 @@ import org.dkpro.core.api.io.JCasResourceCollectionReader_ImplBase; import org.dkpro.core.api.pdf.type.PdfChunk; import org.dkpro.core.api.pdf.type.PdfPage; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.AttributesImpl; +import de.tudarmstadt.ukp.inception.io.xml.dkprocore.CasXmlHandler; +import de.tudarmstadt.ukp.inception.pdfeditor2.visual.PdfEventHandler; import de.tudarmstadt.ukp.inception.pdfeditor2.visual.VisualPDFTextStripper; import de.tudarmstadt.ukp.inception.pdfeditor2.visual.model.VChunk; import de.tudarmstadt.ukp.inception.pdfeditor2.visual.model.VGlyph; @@ -130,6 +134,10 @@ public class VisualPdfReader @ConfigurationParameter(name = PARAM_SPACING_TOLERANCE, defaultValue = "0.5") private float spacingTolerance; + public static final String PARAM_GENERATE_HTML_STRUCTURE = "generateHtmlStructure"; + @ConfigurationParameter(name = PARAM_GENERATE_HTML_STRUCTURE, defaultValue = "false") + private boolean generateHtmlStructure; + @Override public void getNext(JCas aJCas) throws IOException, CollectionException { @@ -155,6 +163,12 @@ public void getNext(JCas aJCas) throws IOException, CollectionException 
stripper.setAverageCharTolerance(averageCharTolerance); stripper.setSpacingTolerance(spacingTolerance); + if (generateHtmlStructure) { + var capture = new PdfStructureCapturer(aJCas); + textBuffer = capture; + stripper.setEventHandler(capture); + } + stripper.writeText(doc, textBuffer); vModel = stripper.getVisualModel(); @@ -295,4 +309,81 @@ public static VModel visualModelFromCas(CAS cas, List pdfPages) return vModel; } + private static class PdfStructureCapturer + extends StringWriter + implements PdfEventHandler + { + private final JCas jCas; + private final CasXmlHandler xmlCas; + + private PdfStructureCapturer(JCas aJCas) + { + jCas = aJCas; + xmlCas = new CasXmlHandler(jCas); + xmlCas.setCommitText(false); + } + + @Override + public void documentStart() throws SAXException + { + xmlCas.startDocument(); + xmlCas.startElement("", "", "html", new AttributesImpl()); + xmlCas.startElement("", "", "body", new AttributesImpl()); + } + + @Override + public void afterStartParagraph() throws Exception + { + xmlCas.startElement("", "", "p", new AttributesImpl()); + } + + @Override + public void beforeEndParagraph() throws Exception + { + xmlCas.endElement("", "", "p"); + } + + @Override + public void documentEnd() throws SAXException + { + xmlCas.endElement("", "", "body"); + xmlCas.endElement("", "", "html"); + xmlCas.endDocument(); + } + + @Override + public void write(String aStr, int aOff, int aLen) + { + super.write(aStr, aOff, aLen); + xmlCas.characters(aStr.toCharArray(), aOff, aLen); + } + + @Override + public void write(char[] aCbuf) throws IOException + { + // Delegates to write(char[], int, int), which already forwards the text to xmlCas + super.write(aCbuf); + } + + @Override + public void write(char[] aCbuf, int aOff, int aLen) + { + super.write(aCbuf, aOff, aLen); + xmlCas.characters(aCbuf, aOff, aLen); + } + + @Override + public void write(int aC) + { + super.write(aC); + xmlCas.characters(new char[] { (char) aC }, 0, 1); + } + + @Override + public void write(String aStr) + { + super.write(aStr); + 
xmlCas.characters(aStr.toCharArray(), 0, aStr.length()); + } + } } diff --git a/inception/inception-pdf-editor2/src/main/java/de/tudarmstadt/ukp/inception/pdfeditor2/visual/PdfEventHandler.java b/inception/inception-pdf-editor2/src/main/java/de/tudarmstadt/ukp/inception/pdfeditor2/visual/PdfEventHandler.java index eade59cbf31..8a2c8532cc0 100644 --- a/inception/inception-pdf-editor2/src/main/java/de/tudarmstadt/ukp/inception/pdfeditor2/visual/PdfEventHandler.java +++ b/inception/inception-pdf-editor2/src/main/java/de/tudarmstadt/ukp/inception/pdfeditor2/visual/PdfEventHandler.java @@ -19,23 +19,53 @@ public interface PdfEventHandler { - void documentStart(); + default void documentStart() throws Exception + { + // Nothing by default + } - void documentEnd(); + default void documentEnd() throws Exception + { + // Nothing by default + } - void beforeStartParagraph(CharSequence aCharSequence); + default void beforeStartParagraph() throws Exception + { + // Nothing by default + } - void afterEndParagraph(CharSequence aCharSequence); + default void afterEndParagraph() throws Exception + { + // Nothing by default + } - void beforeStartPage(CharSequence aCharSequence); + default void beforeStartPage() throws Exception + { + // Nothing by default + } - void afterEndPage(CharSequence aCharSequence); + default void afterEndPage() throws Exception + { + // Nothing by default + } - void afterStartParagraph(CharSequence aCharSequence); + default void afterStartParagraph() throws Exception + { + // Nothing by default + } - void beforeEndParagraph(CharSequence aCharSequence); + default void beforeEndParagraph() throws Exception + { + // Nothing by default + } - void afterStartPage(CharSequence aCharSequence); + default void afterStartPage() throws Exception + { + // Nothing by default + } - void beforeEndPage(CharSequence aCharSequence); + default void beforeEndPage() throws Exception + { + // Nothing by default + } } diff --git
a/inception/inception-pdf-editor2/src/main/java/de/tudarmstadt/ukp/inception/pdfeditor2/visual/VisualPDFTextStripper.java b/inception/inception-pdf-editor2/src/main/java/de/tudarmstadt/ukp/inception/pdfeditor2/visual/VisualPDFTextStripper.java index 95ad9c5eecd..22e13b78e58 100644 --- a/inception/inception-pdf-editor2/src/main/java/de/tudarmstadt/ukp/inception/pdfeditor2/visual/VisualPDFTextStripper.java +++ b/inception/inception-pdf-editor2/src/main/java/de/tudarmstadt/ukp/inception/pdfeditor2/visual/VisualPDFTextStripper.java @@ -24,7 +24,6 @@ import static java.util.Comparator.comparing; import static java.util.stream.Collectors.joining; -import java.awt.Shape; import java.awt.geom.AffineTransform; import java.awt.geom.Rectangle2D; import java.io.IOException; @@ -76,6 +75,11 @@ public VisualPDFTextStripper() throws IOException setShouldSeparateByBeads(true); } + public void setEventHandler(PdfEventHandler aEventHandler) + { + eventHandler = aEventHandler; + } + private CharSequence getBuffer() { return ((StringWriter) output).getBuffer(); @@ -127,8 +131,8 @@ protected void writePage() throws IOException private void calculateCharacterPositions() throws IOException { for (List article : charactersByArticle) { - for (TextPosition tp : article) { - Shape fontShape = calculateFontBounds(tp, flipAT, rotateAT); + for (var tp : article) { + var fontShape = calculateFontBounds(tp, flipAT, rotateAT); Rectangle2D.Double f = (Rectangle2D.Double) fontShape.getBounds2D(); fontPositionCache.put(tp, f); } @@ -280,7 +284,12 @@ protected void writeString(String aText, List aTextPositions) thro protected void startDocument(PDDocument aDocument) throws IOException { if (eventHandler != null) { - eventHandler.documentStart(); + try { + eventHandler.documentStart(); + } + catch (Exception e) { + throw handleEventHandlerException(e); + } } super.startDocument(aDocument); @@ -292,7 +301,12 @@ protected void endDocument(PDDocument aDocument) throws IOException 
super.endDocument(aDocument); if (eventHandler != null) { - eventHandler.documentEnd(); + try { + eventHandler.documentEnd(); + } + catch (Exception e) { + throw handleEventHandlerException(e); + } } } @@ -300,13 +314,24 @@ protected void endDocument(PDDocument aDocument) throws IOException protected void writeParagraphStart() throws IOException { if (eventHandler != null) { - eventHandler.beforeStartParagraph(getBuffer()); + try { + eventHandler.beforeStartParagraph(); + } + catch (Exception e) { + throw handleEventHandlerException(e); + } + } super.writeParagraphStart(); if (eventHandler != null) { - eventHandler.afterStartParagraph(getBuffer()); + try { + eventHandler.afterStartParagraph(); + } + catch (Exception e) { + throw handleEventHandlerException(e); + } } } @@ -314,13 +339,23 @@ protected void writeParagraphStart() throws IOException protected void writeParagraphEnd() throws IOException { if (eventHandler != null) { - eventHandler.beforeEndParagraph(getBuffer()); + try { + eventHandler.beforeEndParagraph(); + } + catch (Exception e) { + throw handleEventHandlerException(e); + } } super.writeParagraphEnd(); if (eventHandler != null) { - eventHandler.afterEndParagraph(getBuffer()); + try { + eventHandler.afterEndParagraph(); + } + catch (Exception e) { + throw handleEventHandlerException(e); + } } } @@ -328,13 +363,23 @@ protected void writeParagraphEnd() throws IOException protected void writePageStart() throws IOException { if (eventHandler != null) { - eventHandler.beforeStartPage(getBuffer()); + try { + eventHandler.beforeStartPage(); + } + catch (Exception e) { + throw handleEventHandlerException(e); + } } super.writePageStart(); if (eventHandler != null) { - eventHandler.afterStartPage(getBuffer()); + try { + eventHandler.afterStartPage(); + } + catch (Exception e) { + throw handleEventHandlerException(e); + } } } @@ -342,16 +387,35 @@ protected void writePageStart() throws IOException protected void writePageEnd() throws IOException { if 
(eventHandler != null) { - eventHandler.beforeEndPage(getBuffer()); + try { + eventHandler.beforeEndPage(); + } + catch (Exception e) { + throw handleEventHandlerException(e); + } } super.writePageEnd(); if (eventHandler != null) { - eventHandler.afterEndPage(getBuffer()); + try { + eventHandler.afterEndPage(); + } + catch (Exception e) { + throw handleEventHandlerException(e); + } } } + private IOException handleEventHandlerException(Exception aException) + { + if (aException instanceof IOException ioException) { + return ioException; + } + + return new IOException(aException); + } + private String reconcileGlyphWithText(String aText, boolean rtl, String normalizedUnicode, int begin) { diff --git a/inception/inception-pdf-editor2/src/test/java/de/tudarmstadt/ukp/inception/pdfeditor2/format/VisualPdfReaderTest.java b/inception/inception-pdf-editor2/src/test/java/de/tudarmstadt/ukp/inception/pdfeditor2/format/VisualPdfReaderTest.java index 39d40ad55f5..a933cc9e759 100644 --- a/inception/inception-pdf-editor2/src/test/java/de/tudarmstadt/ukp/inception/pdfeditor2/format/VisualPdfReaderTest.java +++ b/inception/inception-pdf-editor2/src/test/java/de/tudarmstadt/ukp/inception/pdfeditor2/format/VisualPdfReaderTest.java @@ -26,13 +26,13 @@ import java.io.StringWriter; import org.apache.pdfbox.Loader; -import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.uima.cas.CAS; import org.apache.uima.fit.factory.CasFactory; import org.apache.uima.fit.factory.JCasFactory; -import org.apache.uima.jcas.JCas; import org.dkpro.core.api.pdf.type.PdfChunk; import org.dkpro.core.api.pdf.type.PdfPage; +import org.dkpro.core.api.xml.type.XmlDocument; +import org.dkpro.core.api.xml.type.XmlElement; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.condition.OS; @@ -74,6 +74,20 @@ void thatCoordinatesAreStoredInCas() throws Exception } } + @Test + void thatHtmlStructureIsGenerated() throws Exception + { + var reader = 
createReader( // + VisualPdfReader.class, // + VisualPdfReader.PARAM_SORT_BY_POSITION, true, // + VisualPdfReader.PARAM_GENERATE_HTML_STRUCTURE, true, // + VisualPdfReader.PARAM_SOURCE_LOCATION, testFilesBase + "eu-001.pdf"); + reader.getNext(cas); + + assertThat(cas.select(XmlDocument.class).asList()).hasSize(1); + assertThat(cas.select(XmlElement.class).asList()).hasSize(44); + } + @Test void thatRtlCoordinatesMakeSenseSorting1() throws Exception { @@ -348,7 +362,7 @@ void thatRtlCoordinatesMakeSenseNoSorting2() throws Exception { VModel expected; var textBuffer = new StringWriter(); - try (PDDocument doc = Loader.loadPDF(new File(testFilesBase + "FC60_Times.pdf"))) { + try (var doc = Loader.loadPDF(new File(testFilesBase + "FC60_Times.pdf"))) { var extractor = new VisualPDFTextStripper(); extractor.setSortByPosition(false); extractor.writeText(doc, textBuffer); @@ -365,7 +379,7 @@ void thatRtlCoordinatesMakeSenseNoSorting2() throws Exception // assertThat(textBuffer.toString()) // // .isEqualTo(expectedText); - JCas jCas = JCasFactory.createJCas(); + var jCas = JCasFactory.createJCas(); jCas.setDocumentText(textBuffer.toString()); VisualPdfReader.visualModelToCas(expected, jCas); @@ -401,7 +415,7 @@ void thatRtlCoordinatesMakeSenseNoSorting2() throws Exception 90.0f, // new float[] { 90.0f })); - VModel actual = VisualPdfReader.visualModelFromCas(jCas.getCas(), + var actual = VisualPdfReader.visualModelFromCas(jCas.getCas(), jCas.select(PdfPage.class).asList()); assertThat(actual.getPages().get(0).getChunks()) // diff --git a/inception/inception-support/src/main/java/de/tudarmstadt/ukp/inception/support/xml/TextSanitizingContentHandler.java b/inception/inception-support/src/main/java/de/tudarmstadt/ukp/inception/support/xml/TextSanitizingContentHandler.java index 75bdc403afa..a360b627bae 100644 --- a/inception/inception-support/src/main/java/de/tudarmstadt/ukp/inception/support/xml/TextSanitizingContentHandler.java +++ 
b/inception/inception-support/src/main/java/de/tudarmstadt/ukp/inception/support/xml/TextSanitizingContentHandler.java @@ -44,20 +44,9 @@ public void ignorableWhitespace(char[] aCh, int aStart, int aLength) throws SAXE private String sanitizeVisibleText(String aText) { - char[] chars = aText.toCharArray(); - for (int i = 0; i < chars.length; i++) { - switch (chars[i]) { - // Replace newline characters before sending to the browser to avoid the character - // offsets in the browser to get out-of-sync with the server-side offsets. E.g. some - // browsers tend to completely discard the `\r`. - case '\r': - chars[i] = ' '; - break; - default: - // Nothing to do - } - } - - return new String(chars); + // Replace newline characters before sending to the browser to avoid the character + // offsets in the browser to get out-of-sync with the server-side offsets. E.g. some + // browsers tend to completely discard the `\r`. + return aText.replace('\r', ' '); } }