From 41308c4a9600e3d2ddb14b684ce1dca2e8115b8f Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Mon, 27 Jan 2025 20:29:52 +0100 Subject: [PATCH] #5263 - Assistant index does not update correctly when documents are added or removed - Add document IDs as searchable fields so that the updater can check if a document is already present or not - Instead of doing nothing when no document remains in the project, make sure that any documents still in the index get properly removed --- .../documents/UpdateDocumentIndexTask.java | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/inception/inception-assistant/src/main/java/de/tudarmstadt/ukp/inception/assistant/documents/UpdateDocumentIndexTask.java b/inception/inception-assistant/src/main/java/de/tudarmstadt/ukp/inception/assistant/documents/UpdateDocumentIndexTask.java index 511513c181..1bd03ef08e 100644 --- a/inception/inception-assistant/src/main/java/de/tudarmstadt/ukp/inception/assistant/documents/UpdateDocumentIndexTask.java +++ b/inception/inception-assistant/src/main/java/de/tudarmstadt/ukp/inception/assistant/documents/UpdateDocumentIndexTask.java @@ -50,7 +50,6 @@ import org.apache.lucene.document.LongPoint; import org.apache.lucene.document.StoredField; import org.apache.lucene.index.DirectoryReader; -import org.apache.lucene.search.FieldExistsQuery; import org.apache.lucene.search.IndexSearcher; import org.apache.uima.cas.CAS; import org.slf4j.Logger; @@ -100,6 +99,10 @@ public void execute() throws Exception { var documents = documentService.listSourceDocuments(getProject()); if (documents.isEmpty()) { + try (var index = documentQueryService.borrowIndex(getProject())) { + index.getIndexWriter().deleteAll(); + index.getIndexWriter().commit(); + } return; } @@ -114,15 +117,16 @@ public void execute() throws Exception try (var index = documentQueryService.borrowIndex(getProject())) { try (var reader = DirectoryReader.open(index.getIndexWriter())) { var searcher = new 
IndexSearcher(reader); - var query = new FieldExistsQuery(FIELD_SOURCE_DOC_COMPLETE); + var query = LongPoint.newRangeQuery(FIELD_SOURCE_DOC_COMPLETE, Long.MIN_VALUE, + Long.MAX_VALUE); var result = searcher.search(query, Integer.MAX_VALUE); var documentsToIndex = new HashMap(); for (var doc : documents) { documentsToIndex.put(doc.getId(), doc); } - var documentsToUnindex = new ArrayList(); + var documentsToUnindex = new ArrayList(); for (var doc : result.scoreDocs) { var fields = searcher.getIndexReader().storedFields().document(doc.doc); var sourceDocId = fields.getField(FIELD_SOURCE_DOC_COMPLETE).numericValue() @@ -181,6 +185,8 @@ private void unindexDocument(LuceneIndexPool.PooledIndex aIndex, long aSourceDoc try { aIndex.getIndexWriter().deleteDocuments( LongPoint.newExactQuery(FIELD_SOURCE_DOC_ID, aSourceDocumentId)); + aIndex.getIndexWriter().deleteDocuments( + LongPoint.newExactQuery(FIELD_SOURCE_DOC_COMPLETE, aSourceDocumentId)); } catch (IOException e) { LOG.error("Error unindexing document [{}]", aSourceDocumentId, e); @@ -232,6 +238,7 @@ private void markDocumentAsIndexed(LuceneIndexPool.PooledIndex aIndex, throws IOException { var doc = new Document(); + doc.add(new LongPoint(FIELD_SOURCE_DOC_COMPLETE, aSourceDocument.getId())); doc.add(new StoredField(FIELD_SOURCE_DOC_COMPLETE, aSourceDocument.getId())); aIndex.getIndexWriter().addDocument(doc); } @@ -251,6 +258,7 @@ private void indexChunks(LuceneIndexPool.PooledIndex aIndex, SourceDocument aSou var normalizedEmbedding = l2normalize(aEmbeddedChunks.getValue(), false); doc.add(new KnnFloatVectorField(FIELD_EMBEDDING, normalizedEmbedding, DOT_PRODUCT)); doc.add(new IntRange(FIELD_RANGE, new int[] { chunk.begin() }, new int[] { chunk.end() })); + doc.add(new LongPoint(FIELD_SOURCE_DOC_ID, aSourceDocument.getId())); doc.add(new StoredField(FIELD_SOURCE_DOC_ID, aSourceDocument.getId())); doc.add(new StoredField(FIELD_SECTION, "")); doc.add(new StoredField(FIELD_TEXT, chunk.text()));