Skip to content

Commit

Permalink
Added extraction of search snippets from Fulltext Search for the UI, fixed S…
Browse files Browse the repository at this point in the history
…RL search.
  • Loading branch information
TheItCrOw committed Oct 23, 2024
1 parent aba15df commit 7ad39a8
Show file tree
Hide file tree
Showing 12 changed files with 122 additions and 64 deletions.
62 changes: 39 additions & 23 deletions database/createSearchLayerFulltextProcedure.sql
Original file line number Diff line number Diff line change
Expand Up @@ -15,36 +15,45 @@ CREATE OR REPLACE FUNCTION uce_search_layer_fulltext(
OUT document_ids integer[],
OUT named_entities_found text[][],
OUT time_found text[][],
OUT taxons_found text[][]
OUT taxons_found text[][],
OUT snippets_found text[] -- Add an output for snippets
)
RETURNS record AS $$
DECLARE
-- Declare variables to hold total count, document IDs, named entities, time, and taxons
-- Declare variables to hold total count, document IDs, named entities, time, taxons, and snippets
total_count_temp integer;
document_ids_temp integer[];
named_entities_temp text[][];
time_temp text[][];
taxons_temp text[][];
snippets_temp text[];
BEGIN
-- Common table expression to define the set of documents
WITH documents_query AS (
SELECT DISTINCT d.id
FROM document d
WHERE d.corpusid = corpus_id and (d.documenttitle = ANY(input1) OR d.language = ANY(input1) OR d.fulltext ~* input2)

UNION ALL

SELECT DISTINCT d.id
FROM document d
JOIN metadatatitleinfo me ON d.id = me.id
WHERE d.corpusid = corpus_id and (me.title ~* input2 OR me.published ~* input2)
),
-- Common table expression to define the set of documents and matching snippets
WITH documents_query AS (
SELECT DISTINCT d.id,
unnest(regexp_matches(d.fulltext, input2, 'gi')) AS match,
substring(d.fulltext from position(unnest(regexp_matches(d.fulltext, input2, 'gi')) in d.fulltext) - 250 for 500) AS snippet
FROM document d
WHERE d.corpusid = corpus_id
AND (d.documenttitle = ANY(input1) OR d.language = ANY(input1) OR d.fulltext ~* input2)

UNION ALL

SELECT DISTINCT d.id,
unnest(regexp_matches(me.title || ' ' || coalesce(me.published, ''), input2, 'gi')) AS match,
substring(me.title || ' ' || coalesce(me.published, '') from position(unnest(regexp_matches(me.title || ' ' || coalesce(me.published, ''), input2, 'gi')) in me.title || ' ' || coalesce(me.published, '')) - 50 for 100) AS snippet
FROM document d
JOIN metadatatitleinfo me ON d.id = me.id
WHERE d.corpusid = corpus_id
AND (me.title ~* input2 OR me.published ~* input2)
),

-- Count all found documents
counted_documents AS (
SELECT COUNT(*) AS total_count FROM documents_query
)

-- Retrieve total count, document IDs, named entities, time, and taxons (conditionally)
-- Retrieve total count, document IDs, named entities, time, taxons, and snippets (conditionally)
SELECT
CASE WHEN count_all THEN (SELECT total_count FROM counted_documents) ELSE NULL END AS total_count,
ARRAY(
Expand All @@ -54,7 +63,7 @@ BEGIN
FROM documents_query dq
JOIN metadatatitleinfo me ON dq.id = me.id

-- This ordering is a bit scuffed, but it finally works. A lot of copy pasting when adding new cases, but that should happend often. --
-- This ordering is a bit scuffed, but it finally works. A lot of copy pasting when adding new cases, but that shouldn't happen often. --
ORDER BY
CASE
WHEN order_by_column = 'title' THEN
Expand Down Expand Up @@ -83,7 +92,7 @@ BEGIN
) AS dq
) AS document_ids_temp,

-- Count the occurences of all the found entities, taxons etc. --
-- Count the occurrences of all the found entities, taxons etc. --

CASE WHEN count_all THEN
ARRAY(
Expand All @@ -92,7 +101,7 @@ BEGIN
JOIN namedentity ne ON dq.id = ne.document_id
GROUP BY ne.coveredtext, ne.typee, ne.document_id
)
ELSE ARRAY[]::text[][][][]
ELSE ARRAY[]::text[][]
END AS named_entities_temp,

CASE WHEN count_all THEN
Expand All @@ -102,7 +111,7 @@ BEGIN
JOIN time t ON dq.id = t.document_id
GROUP BY t.coveredtext, t.valuee, t.document_id
)
ELSE ARRAY[]::text[][][][]
ELSE ARRAY[]::text[][]
END AS time_temp,

CASE WHEN count_all THEN
Expand All @@ -112,10 +121,16 @@ BEGIN
JOIN taxon ta ON dq.id = ta.document_id
GROUP BY ta.coveredtext, ta.valuee, ta.document_id
)
ELSE ARRAY[]::text[][][][]
END AS taxons_temp
ELSE ARRAY[]::text[][]
END AS taxons_temp,

INTO total_count_temp, document_ids_temp, named_entities_temp, time_temp, taxons_temp
-- Fetch the snippets for the matched input2
ARRAY(
SELECT dq.snippet
FROM documents_query dq
) AS snippets_temp -- New snippet logic

INTO total_count_temp, document_ids_temp, named_entities_temp, time_temp, taxons_temp, snippets_temp
FROM (SELECT 1) AS dummy;

-- Set out parameters
Expand All @@ -124,5 +139,6 @@ BEGIN
named_entities_found := named_entities_temp;
time_found := time_temp;
taxons_found := taxons_temp;
snippets_found := snippets_temp; -- Set the snippets output
END;
$$ LANGUAGE plpgsql;
16 changes: 8 additions & 8 deletions database/createSemanticRoleSearchProcedure.sql
Original file line number Diff line number Diff line change
Expand Up @@ -40,19 +40,19 @@ BEGIN
AND (LOWER(sr.figurecoveredtext) = verb OR verb = '')
-- Pre-filter based on ARG0, ARG1, ARG2, or ARGM conditions in WHERE clause
AND (
((sr.relationtype = 'I-ARG0' OR sr.relationtype = 'B-C-ARG0') AND LOWER(sr.groundcoveredtext) = ANY(arg0)) OR
((sr.relationtype = 'I-ARG1' OR sr.relationtype = 'B-C-ARG1') AND LOWER(sr.groundcoveredtext) = ANY(arg1)) OR
((sr.relationtype = 'I-ARG2' OR sr.relationtype = 'B-C-ARG2') AND LOWER(sr.groundcoveredtext) = ANY(arg2)) OR
((sr.relationtype = 'I-ARGM-LOC' OR sr.relationtype = 'B-C-ARGM-LOC') AND LOWER(sr.groundcoveredtext) = ANY(argm))
((sr.relationtype = 'ARG0' OR sr.relationtype = 'I-ARG0' OR sr.relationtype = 'B-C-ARG0') AND LOWER(sr.groundcoveredtext) = ANY(arg0)) OR
((sr.relationtype = 'ARG1' OR sr.relationtype = 'I-ARG1' OR sr.relationtype = 'B-C-ARG1') AND LOWER(sr.groundcoveredtext) = ANY(arg1)) OR
((sr.relationtype = 'ARG2' OR sr.relationtype = 'I-ARG2' OR sr.relationtype = 'B-C-ARG2') AND LOWER(sr.groundcoveredtext) = ANY(arg2)) OR
((sr.relationtype = 'ARGM' OR sr.relationtype = 'ARGM-LOC' OR sr.relationtype = 'B-C-ARGM-LOC') AND LOWER(sr.groundcoveredtext) = ANY(argm))
)
GROUP BY d.id, sr.figurebegin
HAVING
COUNT(DISTINCT
CASE
WHEN ((sr.relationtype = 'I-ARG0' OR sr.relationtype = 'B-C-ARG0') AND LOWER(sr.groundcoveredtext) = ANY(arg0)) THEN sr.relationtype
WHEN ((sr.relationtype = 'I-ARG1' OR sr.relationtype = 'B-C-ARG1') AND LOWER(sr.groundcoveredtext) = ANY(arg1)) THEN sr.relationtype
WHEN ((sr.relationtype = 'I-ARG2' OR sr.relationtype = 'B-C-ARG2') AND LOWER(sr.groundcoveredtext) = ANY(arg2)) THEN sr.relationtype
WHEN ((sr.relationtype = 'I-ARGM-LOC' OR sr.relationtype = 'B-C-ARGM-LOC') AND LOWER(sr.groundcoveredtext) = ANY(argm)) THEN sr.relationtype
WHEN ((sr.relationtype = 'ARG0' OR sr.relationtype = 'I-ARG0' OR sr.relationtype = 'B-C-ARG0') AND LOWER(sr.groundcoveredtext) = ANY(arg0)) THEN sr.relationtype
WHEN ((sr.relationtype = 'ARG1' OR sr.relationtype = 'I-ARG1' OR sr.relationtype = 'B-C-ARG1') AND LOWER(sr.groundcoveredtext) = ANY(arg1)) THEN sr.relationtype
WHEN ((sr.relationtype = 'ARG2' OR sr.relationtype = 'I-ARG2' OR sr.relationtype = 'B-C-ARG2') AND LOWER(sr.groundcoveredtext) = ANY(arg2)) THEN sr.relationtype
WHEN ((sr.relationtype = 'ARGM' OR sr.relationtype = 'ARGM-LOC' OR sr.relationtype = 'I-ARGM-LOC' OR sr.relationtype = 'B-C-ARGM-LOC') AND LOWER(sr.groundcoveredtext) = ANY(argm)) THEN sr.relationtype
ELSE NULL
END
) =
Expand Down
17 changes: 5 additions & 12 deletions docs/index.md
Original file line number Diff line number Diff line change
@@ -1,17 +1,10 @@
# Welcome to MkDocs
# Unified Corpus Explorer (UCE)

For full documentation visit [mkdocs.org](https://www.mkdocs.org).

## Commands
UCE is a dynamic Natural Language Processing (NLP) application for exploring large, UIMA-annotated text corpora. It imports a corpus, adapts to its specifics, and builds its UI around it.

* `mkdocs new [dir-name]` - Create a new project.
* `mkdocs serve` - Start the live-reloading docs server.
* `mkdocs build` - Build the documentation site.
* `mkdocs -h` - Print help message and exit.
-----------------

## Project layout
## Quick Start

mkdocs.yml # The configuration file.
docs/
index.md # The documentation homepage.
... # Other markdown pages, images and other files.
To be added.
3 changes: 3 additions & 0 deletions docs/publications.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Publications

To be added.
7 changes: 7 additions & 0 deletions uce.portal/resources/templates/css/search-redesign.css
Original file line number Diff line number Diff line change
Expand Up @@ -338,6 +338,13 @@
transition: 0.15s;
}

.document-card .highlighted-token{
color:black;
text-decoration: underline;
text-decoration-color: var(--secondary);
text-decoration-thickness: 3px;
}

.document-card .embedding-text {
font-size: small;
font-style: italic;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,23 @@

<div class="snippet-content flexed align-items-center justify-content-between h-100">
<p class="mb-0 small-font text font-italic mr-2">
"${document.getFullTextSnippet(85)}..."
<#assign snippet = searchState.getPossibleSnippetOfDocumentIdx(documentIdx)!>
<#if !snippet?has_content>
<#assign snippet = document.getFullTextSnippet(85)!>
</#if>

<!-- Get the list of search tokens -->
<#assign searchTokens = searchState.getSearchTokens()!>

<!-- Initialize the highlighted snippet -->
<#assign highlightedSnippet = snippet>

<!-- Loop through each search token and highlight it -->
<#list searchTokens as searchToken>
<#assign highlightedSnippet = highlightedSnippet?replace(searchToken, "<span class='highlighted-token'>${searchToken}</span>", "i")>
</#list>

<!-- Render the highlighted snippet -->
${highlightedSnippet}...
</p>
</div>
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
<div>
<#list searchState.getCurrentDocuments() as document>
<#assign documentIdx = document?index>
<div class="flexed justify-content-center">
<div class="document-card" data-id="${document.getId()?string?replace('.', '')?replace(',', '')}">
<div>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ public List<Paragraph> getParagraphs() {
// I have no idea why, but sometimes, the lazy load of empty paragraphs throws an error.
// There is nothing wrong with the document or page - it just throws an error here.
// It's not a problem if the paragraphs are empty! So we just catch the error and return empty...
// It's also not worth logging to the DB or file logger.
System.err.println("Opened a document with unloadable lazy paragraphs.");
return new ArrayList<>();
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
package org.texttechnologylab.models.search;

import java.util.ArrayList;
import java.util.List;
import java.util.HashMap;
import java.util.Map;

public class DocumentSearchResult {

Expand All @@ -11,13 +12,22 @@ public class DocumentSearchResult {
private ArrayList<AnnotationSearchResult> foundNamedEntities;
private ArrayList<AnnotationSearchResult> foundTimes;
private ArrayList<AnnotationSearchResult> foundTaxons;
private HashMap<Integer, String> searchSnippets;

public DocumentSearchResult(int documentCount,
ArrayList<Integer> documentIds) {
this.documentCount = documentCount;
this.documentIds = documentIds;
}

public HashMap<Integer, String> getSearchSnippets() {
return searchSnippets;
}

public void setSearchSnippets(HashMap<Integer, String> searchSnippets) {
this.searchSnippets = searchSnippets;
}

public void setDocumentCount(int documentCount) {
this.documentCount = documentCount;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -327,17 +327,14 @@ public DocumentSearchResult defaultSearchForDocuments(int skip,
search.setFoundTaxons(parseAnnotationOccurrences(result.getArray("taxons_found").getResultSet()));
search.setFoundTimes(parseAnnotationOccurrences(result.getArray("time_found").getResultSet()));

/* This was an attempt to count the hits of each document, but it took waaay too long.
For now, this is not used!
// Finally, parse how many matches/hits we have per document
var matchCountsResults = result.getArray("match_counts").getResultSet();
var documentHits = new ArrayList<Integer>();
while(matchCountsResults.next()){
var idx = matchCountsResults.getInt(1);
var hits = matchCountsResults.getInt(2);
documentHits.add(hits);
// Finally, parse the found snippets of the search
// This is only done for the fulltext search
if(layer == SearchLayer.FULLTEXT){
var resultSet = result.getArray("snippets_found").getResultSet();
var foundSnippets = new HashMap<Integer, String>();
while (resultSet.next()) foundSnippets.put(resultSet.getInt(1) - 1, resultSet.getString(2));
search.setSearchSnippets(foundSnippets);
}
search.setDocumentHits(documentHits);*/
}
return search;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,7 @@
import org.texttechnologylab.models.corpus.Document;
import org.texttechnologylab.models.search.*;

import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.UUID;
import java.util.*;

/**
* A class that holds all states of a biofid search. We can use this class to serialize the search. It shouldn't hold any services.
Expand Down Expand Up @@ -53,12 +50,22 @@ public class SearchState {
*/
@Obsolete
private List<Integer> currentDocumentHits;
private HashMap<Integer, String> documentIdxToSnippet;

public SearchState(SearchType searchType) {
this.searchType = searchType;
this.searchId = UUID.randomUUID();
}

public String getPossibleSnippetOfDocumentIdx(Integer idx){
if(this.documentIdxToSnippet != null && this.documentIdxToSnippet.containsKey(idx)) return this.documentIdxToSnippet.get(idx);
return null;
}

public void setDocumentIdxToSnippet(HashMap<Integer, String> map) {
this.documentIdxToSnippet = map;
}

public List<Integer> getCurrentDocumentHits() {
return currentDocumentHits;
}
Expand All @@ -67,9 +74,13 @@ public void setCurrentDocumentHits(List<Integer> currentDocumentHits) {
this.currentDocumentHits = currentDocumentHits;
}

public CorpusConfig getCorpusConfig() { return corpusConfig;}
public CorpusConfig getCorpusConfig() {
return corpusConfig;
}

public void setCorpusConfig(CorpusConfig corpusConfig) { this.corpusConfig = corpusConfig; }
public void setCorpusConfig(CorpusConfig corpusConfig) {
this.corpusConfig = corpusConfig;
}

public KeywordInContextState getKeywordInContextState() {
return keywordInContextState;
Expand Down Expand Up @@ -144,11 +155,11 @@ public void setTotalHits(Integer totalHits) {
this.totalHits = totalHits;
}

public int getSearchHitsOfDocument(int documentId){
try{
public int getSearchHitsOfDocument(int documentId) {
try {
var documentIdx = currentDocuments.indexOf(currentDocuments.stream().filter(d -> d.getId() == documentId).findFirst().get());
return currentDocumentHits.get(documentIdx);
} catch (Exception ex){
} catch (Exception ex) {
// This exception should never happen!
return -1;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ public SearchState initSearch() {
if (documents == null) return null;
searchState.setCurrentDocuments(documents);
searchState.setCurrentDocumentHits(documentSearchResult.getDocumentHits());
searchState.setDocumentIdxToSnippet(documentSearchResult.getSearchSnippets());
searchState.setTotalHits(documentSearchResult.getDocumentCount());
searchState.setFoundNamedEntities(documentSearchResult.getFoundNamedEntities());
searchState.setFoundTaxons(documentSearchResult.getFoundTaxons());
Expand Down Expand Up @@ -139,6 +140,7 @@ public SearchState getSearchHitsForPage(int page) {
if (documents == null) return searchState;
searchState.setCurrentDocuments(documents);
searchState.setCurrentDocumentHits(documentSearchResult.getDocumentHits());
searchState.setDocumentIdxToSnippet(documentSearchResult.getSearchSnippets());
return searchState;
}

Expand Down

0 comments on commit 7ad39a8

Please sign in to comment.