Skip to content

Commit

Permalink
Added extraction of search snippets from Fulltext Search for the UI, fixed S…
Browse files Browse the repository at this point in the history
…RL search.
  • Loading branch information
TheItCrOw committed Oct 23, 2024
1 parent aba15df commit 7ad39a8
Show file tree
Hide file tree
Showing 12 changed files with 122 additions and 64 deletions.
62 changes: 39 additions & 23 deletions database/createSearchLayerFulltextProcedure.sql
Original file line number Diff line number Diff line change
Expand Up @@ -15,36 +15,45 @@ CREATE OR REPLACE FUNCTION uce_search_layer_fulltext(
OUT document_ids integer[],
OUT named_entities_found text[][],
OUT time_found text[][],
OUT taxons_found text[][]
OUT taxons_found text[][],
OUT snippets_found text[] -- Add an output for snippets
)
RETURNS record AS $$
DECLARE
-- Declare variables to hold total count, document IDs, named entities, time, and taxons
-- Declare variables to hold total count, document IDs, named entities, time, taxons, and snippets
total_count_temp integer;
document_ids_temp integer[];
named_entities_temp text[][];
time_temp text[][];
taxons_temp text[][];
snippets_temp text[];
BEGIN
-- Common table expression to define the set of documents
WITH documents_query AS (
SELECT DISTINCT d.id
FROM document d
WHERE d.corpusid = corpus_id and (d.documenttitle = ANY(input1) OR d.language = ANY(input1) OR d.fulltext ~* input2)

UNION ALL

SELECT DISTINCT d.id
FROM document d
JOIN metadatatitleinfo me ON d.id = me.id
WHERE d.corpusid = corpus_id and (me.title ~* input2 OR me.published ~* input2)
),
-- Common table expression to define the set of documents and matching snippets
WITH documents_query AS (
SELECT DISTINCT d.id,
unnest(regexp_matches(d.fulltext, input2, 'gi')) AS match,
substring(d.fulltext from position(unnest(regexp_matches(d.fulltext, input2, 'gi')) in d.fulltext) - 250 for 500) AS snippet
FROM document d
WHERE d.corpusid = corpus_id
AND (d.documenttitle = ANY(input1) OR d.language = ANY(input1) OR d.fulltext ~* input2)

UNION ALL

SELECT DISTINCT d.id,
unnest(regexp_matches(me.title || ' ' || coalesce(me.published, ''), input2, 'gi')) AS match,
substring(me.title || ' ' || coalesce(me.published, '') from position(unnest(regexp_matches(me.title || ' ' || coalesce(me.published, ''), input2, 'gi')) in me.title || ' ' || coalesce(me.published, '')) - 50 for 100) AS snippet
FROM document d
JOIN metadatatitleinfo me ON d.id = me.id
WHERE d.corpusid = corpus_id
AND (me.title ~* input2 OR me.published ~* input2)
),

-- Count all found documents
counted_documents AS (
SELECT COUNT(*) AS total_count FROM documents_query
)

-- Retrieve total count, document IDs, named entities, time, and taxons (conditionally)
-- Retrieve total count, document IDs, named entities, time, taxons, and snippets (conditionally)
SELECT
CASE WHEN count_all THEN (SELECT total_count FROM counted_documents) ELSE NULL END AS total_count,
ARRAY(
Expand All @@ -54,7 +63,7 @@ BEGIN
FROM documents_query dq
JOIN metadatatitleinfo me ON dq.id = me.id

-- This ordering is a bit scuffed, but it finally works. A lot of copy pasting when adding new cases, but that should happend often. --
-- This ordering is a bit scuffed, but it finally works. A lot of copy pasting when adding new cases, but that shouldn't happen often. --
ORDER BY
CASE
WHEN order_by_column = 'title' THEN
Expand Down Expand Up @@ -83,7 +92,7 @@ BEGIN
) AS dq
) AS document_ids_temp,

-- Count the occurences of all the found entities, taxons etc. --
-- Count the occurrences of all the found entities, taxons etc. --

CASE WHEN count_all THEN
ARRAY(
Expand All @@ -92,7 +101,7 @@ BEGIN
JOIN namedentity ne ON dq.id = ne.document_id
GROUP BY ne.coveredtext, ne.typee, ne.document_id
)
ELSE ARRAY[]::text[][][][]
ELSE ARRAY[]::text[][]
END AS named_entities_temp,

CASE WHEN count_all THEN
Expand All @@ -102,7 +111,7 @@ BEGIN
JOIN time t ON dq.id = t.document_id
GROUP BY t.coveredtext, t.valuee, t.document_id
)
ELSE ARRAY[]::text[][][][]
ELSE ARRAY[]::text[][]
END AS time_temp,

CASE WHEN count_all THEN
Expand All @@ -112,10 +121,16 @@ BEGIN
JOIN taxon ta ON dq.id = ta.document_id
GROUP BY ta.coveredtext, ta.valuee, ta.document_id
)
ELSE ARRAY[]::text[][][][]
END AS taxons_temp
ELSE ARRAY[]::text[][]
END AS taxons_temp,

INTO total_count_temp, document_ids_temp, named_entities_temp, time_temp, taxons_temp
-- Fetch the snippets for the matched input2
ARRAY(
SELECT dq.snippet
FROM documents_query dq
) AS snippets_temp -- New snippet logic

INTO total_count_temp, document_ids_temp, named_entities_temp, time_temp, taxons_temp, snippets_temp
FROM (SELECT 1) AS dummy;

-- Set out parameters
Expand All @@ -124,5 +139,6 @@ BEGIN
named_entities_found := named_entities_temp;
time_found := time_temp;
taxons_found := taxons_temp;
snippets_found := snippets_temp; -- Set the snippets output
END;
$$ LANGUAGE plpgsql;
16 changes: 8 additions & 8 deletions database/createSemanticRoleSearchProcedure.sql
Original file line number Diff line number Diff line change
Expand Up @@ -40,19 +40,19 @@ BEGIN
AND (LOWER(sr.figurecoveredtext) = verb OR verb = '')
-- Pre-filter based on ARG0, ARG1, ARG2, or ARGM conditions in WHERE clause
AND (
((sr.relationtype = 'I-ARG0' OR sr.relationtype = 'B-C-ARG0') AND LOWER(sr.groundcoveredtext) = ANY(arg0)) OR
((sr.relationtype = 'I-ARG1' OR sr.relationtype = 'B-C-ARG1') AND LOWER(sr.groundcoveredtext) = ANY(arg1)) OR
((sr.relationtype = 'I-ARG2' OR sr.relationtype = 'B-C-ARG2') AND LOWER(sr.groundcoveredtext) = ANY(arg2)) OR
((sr.relationtype = 'I-ARGM-LOC' OR sr.relationtype = 'B-C-ARGM-LOC') AND LOWER(sr.groundcoveredtext) = ANY(argm))
((sr.relationtype = 'ARG0' OR sr.relationtype = 'I-ARG0' OR sr.relationtype = 'B-C-ARG0') AND LOWER(sr.groundcoveredtext) = ANY(arg0)) OR
((sr.relationtype = 'ARG1' OR sr.relationtype = 'I-ARG1' OR sr.relationtype = 'B-C-ARG1') AND LOWER(sr.groundcoveredtext) = ANY(arg1)) OR
((sr.relationtype = 'ARG2' OR sr.relationtype = 'I-ARG2' OR sr.relationtype = 'B-C-ARG2') AND LOWER(sr.groundcoveredtext) = ANY(arg2)) OR
((sr.relationtype = 'ARGM' OR sr.relationtype = 'ARGM-LOC' OR sr.relationtype = 'B-C-ARGM-LOC') AND LOWER(sr.groundcoveredtext) = ANY(argm))
)
GROUP BY d.id, sr.figurebegin
HAVING
COUNT(DISTINCT
CASE
WHEN ((sr.relationtype = 'I-ARG0' OR sr.relationtype = 'B-C-ARG0') AND LOWER(sr.groundcoveredtext) = ANY(arg0)) THEN sr.relationtype
WHEN ((sr.relationtype = 'I-ARG1' OR sr.relationtype = 'B-C-ARG1') AND LOWER(sr.groundcoveredtext) = ANY(arg1)) THEN sr.relationtype
WHEN ((sr.relationtype = 'I-ARG2' OR sr.relationtype = 'B-C-ARG2') AND LOWER(sr.groundcoveredtext) = ANY(arg2)) THEN sr.relationtype
WHEN ((sr.relationtype = 'I-ARGM-LOC' OR sr.relationtype = 'B-C-ARGM-LOC') AND LOWER(sr.groundcoveredtext) = ANY(argm)) THEN sr.relationtype
WHEN ((sr.relationtype = 'ARG0' OR sr.relationtype = 'I-ARG0' OR sr.relationtype = 'B-C-ARG0') AND LOWER(sr.groundcoveredtext) = ANY(arg0)) THEN sr.relationtype
WHEN ((sr.relationtype = 'ARG1' OR sr.relationtype = 'I-ARG1' OR sr.relationtype = 'B-C-ARG1') AND LOWER(sr.groundcoveredtext) = ANY(arg1)) THEN sr.relationtype
WHEN ((sr.relationtype = 'ARG2' OR sr.relationtype = 'I-ARG2' OR sr.relationtype = 'B-C-ARG2') AND LOWER(sr.groundcoveredtext) = ANY(arg2)) THEN sr.relationtype
WHEN ((sr.relationtype = 'ARGM' OR sr.relationtype = 'ARGM-LOC' OR sr.relationtype = 'I-ARGM-LOC' OR sr.relationtype = 'B-C-ARGM-LOC') AND LOWER(sr.groundcoveredtext) = ANY(argm)) THEN sr.relationtype
ELSE NULL
END
) =
Expand Down
17 changes: 5 additions & 12 deletions docs/index.md
Original file line number Diff line number Diff line change
@@ -1,17 +1,10 @@
# Welcome to MkDocs
# Unified Corpus Explorer (UCE)

For full documentation visit [mkdocs.org](https://www.mkdocs.org).

## Commands
UCE is a dynamic Natural Language Processing (NLP) application for exploring large, UIMA-annotated text corpora. It imports a corpus, adapts to its specifics, and builds its UI around it.

* `mkdocs new [dir-name]` - Create a new project.
* `mkdocs serve` - Start the live-reloading docs server.
* `mkdocs build` - Build the documentation site.
* `mkdocs -h` - Print help message and exit.
-----------------

## Project layout
## Quick Start

mkdocs.yml # The configuration file.
docs/
index.md # The documentation homepage.
... # Other markdown pages, images and other files.
To be added.
3 changes: 3 additions & 0 deletions docs/publications.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Publications

To be added.
7 changes: 7 additions & 0 deletions uce.portal/resources/templates/css/search-redesign.css
Original file line number Diff line number Diff line change
Expand Up @@ -338,6 +338,13 @@
transition: 0.15s;
}

.document-card .highlighted-token{
color:black;
text-decoration: underline;
text-decoration-color: var(--secondary);
text-decoration-thickness: 3px;
}

.document-card .embedding-text {
font-size: small;
font-style: italic;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,23 @@

<div class="snippet-content flexed align-items-center justify-content-between h-100">
<p class="mb-0 small-font text font-italic mr-2">
"${document.getFullTextSnippet(85)}..."
<#assign snippet = searchState.getPossibleSnippetOfDocumentIdx(documentIdx)!>
<#if !snippet?has_content>
<#assign snippet = document.getFullTextSnippet(85)!>
</#if>

<!-- Get the list of search tokens -->
<#assign searchTokens = searchState.getSearchTokens()!>

<!-- Initialize the highlighted snippet -->
<#assign highlightedSnippet = snippet>

<!-- Loop through each search token and highlight it -->
<#list searchTokens as searchToken>
<#assign highlightedSnippet = highlightedSnippet?replace(searchToken, "<span class='highlighted-token'>${searchToken}</span>", "i")>
</#list>

<!-- Render the highlighted snippet -->
${highlightedSnippet}...
</p>
</div>
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
<div>
<#list searchState.getCurrentDocuments() as document>
<#assign documentIdx = document?index>
<div class="flexed justify-content-center">
<div class="document-card" data-id="${document.getId()?string?replace('.', '')?replace(',', '')}">
<div>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ public List<Paragraph> getParagraphs() {
// I have no idea why, but sometimes, the lazy load of empty paragraphs throws an error.
// There is nothing wrong with the document or page - it just throws an error here.
// It's not a problem if the paragraphs are empty! So we just catch the error and return empty...
// It's also not worth logging to the DB or file logger.
System.err.println("Opened a document with unloadable lazy paragraphs.");
return new ArrayList<>();
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
package org.texttechnologylab.models.search;

import java.util.ArrayList;
import java.util.List;
import java.util.HashMap;
import java.util.Map;

public class DocumentSearchResult {

Expand All @@ -11,13 +12,22 @@ public class DocumentSearchResult {
private ArrayList<AnnotationSearchResult> foundNamedEntities;
private ArrayList<AnnotationSearchResult> foundTimes;
private ArrayList<AnnotationSearchResult> foundTaxons;
private HashMap<Integer, String> searchSnippets;

public DocumentSearchResult(int documentCount,
ArrayList<Integer> documentIds) {
this.documentCount = documentCount;
this.documentIds = documentIds;
}

public HashMap<Integer, String> getSearchSnippets() {
return searchSnippets;
}

public void setSearchSnippets(HashMap<Integer, String> searchSnippets) {
this.searchSnippets = searchSnippets;
}

public void setDocumentCount(int documentCount) {
this.documentCount = documentCount;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -327,17 +327,14 @@ public DocumentSearchResult defaultSearchForDocuments(int skip,
search.setFoundTaxons(parseAnnotationOccurrences(result.getArray("taxons_found").getResultSet()));
search.setFoundTimes(parseAnnotationOccurrences(result.getArray("time_found").getResultSet()));

/* This was an attempt to count the hits of each document, but it took waaay too long.
For now, this is not used!
// Finally, parse how many matches/hits we have per document
var matchCountsResults = result.getArray("match_counts").getResultSet();
var documentHits = new ArrayList<Integer>();
while(matchCountsResults.next()){
var idx = matchCountsResults.getInt(1);
var hits = matchCountsResults.getInt(2);
documentHits.add(hits);
// Finally, parse the found snippets of the search
// This is only done for the fulltext search
if(layer == SearchLayer.FULLTEXT){
var resultSet = result.getArray("snippets_found").getResultSet();
var foundSnippets = new HashMap<Integer, String>();
while (resultSet.next()) foundSnippets.put(resultSet.getInt(1) - 1, resultSet.getString(2));
search.setSearchSnippets(foundSnippets);
}
search.setDocumentHits(documentHits);*/
}
return search;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,7 @@
import org.texttechnologylab.models.corpus.Document;
import org.texttechnologylab.models.search.*;

import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.UUID;
import java.util.*;

/**
* A class that holds all states of a biofid search. We can use this class to serialize the search. It shouldn't hold any services.
Expand Down Expand Up @@ -53,12 +50,22 @@ public class SearchState {
*/
@Obsolete
private List<Integer> currentDocumentHits;
private HashMap<Integer, String> documentIdxToSnippet;

public SearchState(SearchType searchType) {
this.searchType = searchType;
this.searchId = UUID.randomUUID();
}

public String getPossibleSnippetOfDocumentIdx(Integer idx){
if(this.documentIdxToSnippet != null && this.documentIdxToSnippet.containsKey(idx)) return this.documentIdxToSnippet.get(idx);
return null;
}

public void setDocumentIdxToSnippet(HashMap<Integer, String> map) {
this.documentIdxToSnippet = map;
}

public List<Integer> getCurrentDocumentHits() {
return currentDocumentHits;
}
Expand All @@ -67,9 +74,13 @@ public void setCurrentDocumentHits(List<Integer> currentDocumentHits) {
this.currentDocumentHits = currentDocumentHits;
}

public CorpusConfig getCorpusConfig() { return corpusConfig;}
public CorpusConfig getCorpusConfig() {
return corpusConfig;
}

public void setCorpusConfig(CorpusConfig corpusConfig) { this.corpusConfig = corpusConfig; }
public void setCorpusConfig(CorpusConfig corpusConfig) {
this.corpusConfig = corpusConfig;
}

public KeywordInContextState getKeywordInContextState() {
return keywordInContextState;
Expand Down Expand Up @@ -144,11 +155,11 @@ public void setTotalHits(Integer totalHits) {
this.totalHits = totalHits;
}

public int getSearchHitsOfDocument(int documentId){
try{
public int getSearchHitsOfDocument(int documentId) {
try {
var documentIdx = currentDocuments.indexOf(currentDocuments.stream().filter(d -> d.getId() == documentId).findFirst().get());
return currentDocumentHits.get(documentIdx);
} catch (Exception ex){
} catch (Exception ex) {
// This exception should never happen!
return -1;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ public SearchState initSearch() {
if (documents == null) return null;
searchState.setCurrentDocuments(documents);
searchState.setCurrentDocumentHits(documentSearchResult.getDocumentHits());
searchState.setDocumentIdxToSnippet(documentSearchResult.getSearchSnippets());
searchState.setTotalHits(documentSearchResult.getDocumentCount());
searchState.setFoundNamedEntities(documentSearchResult.getFoundNamedEntities());
searchState.setFoundTaxons(documentSearchResult.getFoundTaxons());
Expand Down Expand Up @@ -139,6 +140,7 @@ public SearchState getSearchHitsForPage(int page) {
if (documents == null) return searchState;
searchState.setCurrentDocuments(documents);
searchState.setCurrentDocumentHits(documentSearchResult.getDocumentHits());
searchState.setDocumentIdxToSnippet(documentSearchResult.getSearchSnippets());
return searchState;
}

Expand Down

0 comments on commit 7ad39a8

Please sign in to comment.