Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

#5265 - Extract paragraph structure from PDF files #5266

Merged
merged 3 commits into from
Jan 28, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 0 additions & 12 deletions .github/workflows/maven.yml
Original file line number Diff line number Diff line change
Expand Up @@ -70,18 +70,6 @@ jobs:
with:
maven-version: 3.9.9

- name: Set up cache date
run: echo "CACHE_DATE=$(date +'%Y-%m-%d')" >> $GITHUB_ENV

- name: Cache Maven repository
id: maven-cache
uses: actions/cache@1bd1e32a3bdc45362d1e726936510720a7c30a57 # v4.2.0
with:
path: ~/.m2/repository
key: ${{ runner.os }}-maven-${{ hashFiles('**/pom.xml') }}-${{ env.CACHE_DATE }}
restore-keys: |
${{ runner.os }}-maven-

- name: Cache Docker layers
uses: actions/cache@1bd1e32a3bdc45362d1e726936510720a7c30a57 # v4.2.0
with:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ public class CasXmlHandler
private XmlDocument docNode;
private boolean captureText = true;
private boolean splitSentencesInBlockElements = true;
private boolean commitText = true;

private final Set<ElementListener> listeners = new LinkedHashSet<>();

Expand All @@ -67,6 +68,11 @@ public CasXmlHandler(JCas aJCas)
stack = new ArrayDeque<>();
}

/**
 * Sets whether the text collected by this handler is written to the CAS as its document text
 * when the end of the document is reached. The flag is initialized to {@code true}.
 *
 * @param aCommitText
 *            {@code true} to have the handler set the document text at the end of the
 *            document, {@code false} to have the handler skip that call.
 */
public void setCommitText(boolean aCommitText)
{
    this.commitText = aCommitText;
}

public void addListener(ElementListener aListener)
{
listeners.add(aListener);
Expand Down Expand Up @@ -125,7 +131,9 @@ public void endDocument() throws SAXException
l.endDocument(docNode);
}

jcas.setDocumentText(text.toString());
if (commitText) {
jcas.setDocumentText(text.toString());
}

if (!blockElements.isEmpty()) {
if (splitSentencesInBlockElements) {
Expand Down Expand Up @@ -213,7 +221,7 @@ public void endElement(String aUri, String aLocalName, String aQName) throws SAX
}

@Override
public void characters(char[] aCh, int aStart, int aLength) throws SAXException
public void characters(char[] aCh, int aStart, int aLength)
{
if (stack.isEmpty()) {
// We ignore any characters outside the root elements. These could include e.g.
Expand Down Expand Up @@ -241,7 +249,7 @@ public void characters(char[] aCh, int aStart, int aLength) throws SAXException
}

@Override
public void ignorableWhitespace(char[] aCh, int aStart, int aLength) throws SAXException
public void ignorableWhitespace(char[] aCh, int aStart, int aLength)
{
// Ignorable whitespace is routed through the same path as regular character data.
characters(aCh, aStart, aLength);
}
Expand Down
4 changes: 4 additions & 0 deletions inception/inception-pdf-editor2/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,10 @@
<groupId>de.tudarmstadt.ukp.inception.app</groupId>
<artifactId>inception-external-editor</artifactId>
</dependency>
<dependency>
<groupId>de.tudarmstadt.ukp.inception.app</groupId>
<artifactId>inception-io-xml</artifactId>
</dependency>

<dependency>
<groupId>org.apache.commons</groupId>
Expand Down
Loading
Loading