Improved search queries

AppliedEnergistics · Feb 9, 2025 · 0e68bae · 0e68bae
1 parent 30d2862
commit 0e68bae
Show file tree

Hide file tree

Showing 6 changed files with 261 additions and 47 deletions.
diff --git a/docs/docs/02-changelog.md b/docs/docs/02-changelog.md
@@ -3,7 +3,14 @@ import Video from '@site/src/components/Video';
 
 # Changelog
 
-## 2.6.0
+## 21.1.0 (Minecraft 1.21.1)
+
+- Switched to the NeoForge versioning scheme. This version is equivalent to version 2.6.0, except for the following changes.
+- Improved query parsing for full-text search. Search will now always apply "incremental" search for the last entered word,
+  assuming the user might not have entered it fully yet. This means searching for "io po" will search for both "io po"
+  and "io po*", although it will score an exact hit for "po" higher than a hit for "port" (for example).
+
+## 2.6.0 (Minecraft 1.21.1)
 
 - Change the default layout of guides to be a centered column, and add a toolbar button to toggle between 
   full-width and centered-column layout.
@@ -15,16 +22,16 @@ import Video from '@site/src/components/Video';
 - Added support for blast furnace recipes
 - Do not show a navigation bar for guides that do not have any navigation items
 
-## 2.5.1
+## 2.5.1 (Minecraft 1.21.1)
 
 - Fix shared recipe types not being collected correctly from the service loader
 
-## 2.5.0
+## 2.5.0 (Minecraft 1.21.1)
 
 - Added an extension point for mods to add support for [custom recipe types](20-integration/recipe-types.md) to all guides
 - Fixed an issue with navigating to the search screen
 
-## 2.4.0
+## 2.4.0 (Minecraft 1.21.1)
 
 - Add missing Markdown node classes to API jar
 - Add structure editing commands that only work in singleplayer:
@@ -36,17 +43,17 @@ import Video from '@site/src/components/Video';
 - Added op command `/guideme give <target> <guide>` to quickly give a guide item to an entity target (i.e. `@s`)
 - Fix guidebook navbar closing when clicking links
 
-## 2.3.1
+## 2.3.1 (Minecraft 1.21.1)
 
 - Fixes a crash with the generic guide item if it has no guide id attached
 
-## 2.3.0
+## 2.3.0 (Minecraft 1.21.1)
 
 - GuideME is now published on Maven Central instead of Modmaven
 - The group id of the Maven artifact has changed from `appeng` to `org.appliedenergistics` 
   to enable publishing on Maven Central
 
-## 2.2.0
+## 2.2.0 (Minecraft 1.21.1)
 
 - Added full-text search based on Apache Lucene, which is enabled for all guides:
   <Video src="guide-search.mp4" />
@@ -65,15 +72,16 @@ import Video from '@site/src/components/Video';
     By default, all custom tags simply add their children to the indexer
   - Added the ability to set borders for `LytBox`
   - Generalized `GuideUiHost` into `DocumentUiHost`
-## 2.1.2
+
+## 2.1.2 (Minecraft 1.21.1)
 
 - Skip fully invisible blocks (without block entities) when calculating the bounding box of a game scene. Fixes inexplicably larger bounds when blocks like `minecraft:light` were included in the exported structure.
 
-## 2.1.1
+## 2.1.1 (Minecraft 1.21.1)
 
 - Fix race-condition crash when local file-system changes were processed before the resource reload was finished.
 
-## 2.1.0
+## 2.1.0 (Minecraft 1.21.1)
 
 - Adds API to open guides for players from both server- and client-side
   - `GuidesCommon.openGuide(Player player, ResourceLocation guideId)` to open the last opened (or start-page if none) page of a guide for the given player.
@@ -86,6 +94,6 @@ import Video from '@site/src/components/Video';
   or `/guideme open @s testmod:guide page.md#anchor` to open a specific page at an anchor.
 - Fix mod version being shown as 0.0.0
 
-## 2.0.1
+## 2.0.1 (Minecraft 1.21.1)
 
 - Removes superfluous log spam when opening the creative menu.
diff --git a/src/main/java/guideme/internal/search/GuideQueryParser.java b/src/main/java/guideme/internal/search/GuideQueryParser.java
@@ -0,0 +1,83 @@
+package guideme.internal.search;
+
+import java.util.List;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.BoostQuery;
+import org.apache.lucene.search.PhraseQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.WildcardQuery;
+
+public class GuideQueryParser {
+    private GuideQueryParser() {
+    }
+
+    /**
+     * This method will create a query of the following form:
+     * <ul>
+     * <li>A query that matches every document where a field contains all terms.</li>
+     * <li>OR A query that matches every document where a field contains any of the terms, but boosted to 0.1.</li>
+     * </ul>
+     */
+    public static Query parse(String queryString) {
+        var tokens = QueryStringSplitter.split(queryString);
+
+        var textField = IndexSchema.getTextField("en");
+        var titleField = IndexSchema.getTitleField("en");
+
+        var builder = new BooleanQuery.Builder();
+
+        // Exact occurrences in the title are scored with 20% boost
+        builder.add(new BoostQuery(buildFieldQuery(titleField, tokens, false, BooleanClause.Occur.SHOULD), 1.2f),
+                BooleanClause.Occur.SHOULD);
+        // Exact occurrences in the body are scored normally
+        builder.add(buildFieldQuery(textField, tokens, false, BooleanClause.Occur.SHOULD), BooleanClause.Occur.SHOULD);
+        // Occurrences in the title, where the last token is expanded to a wildcard are scored at 40%
+        builder.add(new BoostQuery(buildFieldQuery(titleField, tokens, true, BooleanClause.Occur.SHOULD), 0.4f),
+                BooleanClause.Occur.SHOULD);
+        // Occurrences in the body, where the last token is expanded to a wildcard are scored at 20%
+        builder.add(new BoostQuery(buildFieldQuery(textField, tokens, true, BooleanClause.Occur.SHOULD), 0.2f),
+                BooleanClause.Occur.SHOULD);
+
+        return builder.build();
+    }
+
+    private static BooleanQuery buildFieldQuery(String fieldName, List<String> tokens, boolean makeLastTokenWildcard,
+            BooleanClause.Occur clause) {
+        // Prepare a BooleanQuery to combine terms with OR
+        var booleanQueryBuilder = new BooleanQuery.Builder();
+
+        for (int i = 0; i < tokens.size(); i++) {
+            String token = tokens.get(i);
+
+            if (token.contains(" ")) {
+                // Phrase query
+                var splitToken = QueryStringSplitter.split(token);
+                booleanQueryBuilder.add(new PhraseQuery(fieldName, splitToken.toArray(String[]::new)), clause);
+                continue;
+            }
+
+            // Make the last token a wildcard
+            if (makeLastTokenWildcard && i == tokens.size() - 1 && !token.endsWith("*")) {
+                token += "*";
+            }
+
+            Term term = new Term(fieldName, token);
+
+            Query q;
+            if (token.contains("*")) {
+                q = new WildcardQuery(term);
+            } else {
+                q = new TermQuery(term);
+            }
+
+            booleanQueryBuilder.add(q, clause);
+        }
+
+        // Return the constructed BooleanQuery
+        return booleanQueryBuilder.build();
+    }
+
+}
diff --git a/src/main/java/guideme/internal/search/GuideSearch.java b/src/main/java/guideme/internal/search/GuideSearch.java
@@ -15,7 +15,6 @@
 import java.time.Instant;
 import java.util.ArrayList;
 import java.util.List;
-import java.util.Map;
 import java.util.Objects;
 import java.util.concurrent.TimeUnit;
 import net.minecraft.resources.ResourceLocation;
@@ -31,8 +30,6 @@
 import org.apache.lucene.index.IndexWriterConfig;
 import org.apache.lucene.index.StoredFields;
 import org.apache.lucene.index.Term;
-import org.apache.lucene.queryparser.flexible.core.QueryNodeException;
-import org.apache.lucene.queryparser.flexible.standard.StandardQueryParser;
 import org.apache.lucene.search.BooleanClause;
 import org.apache.lucene.search.BooleanQuery;
 import org.apache.lucene.search.IndexSearcher;
@@ -58,16 +55,6 @@
 public class GuideSearch implements AutoCloseable {
     private static final Logger LOG = LoggerFactory.getLogger(GuideSearch.class);
 
-    private static final String FIELD_GUIDE_ID = "guide_id";
-    private static final String FIELD_PAGE_ID = "page_id";
-
-    private static final String FIELD_TEXT = "page_content";
-    private static final String FIELD_TITLE = "page_title";
-
-    // Fields for analyzed text
-    private static final String FIELD_TITLE_EN = "page_title_en";
-    private static final String FIELD_TEXT_EN = "page_content_en";
-
     /**
      * Maximum time spent indexing per tick.
      */
@@ -99,7 +86,7 @@ public GuideSearch() {
 
     public void index(Guide guide) {
         try {
-            indexWriter.deleteDocuments(new PhraseQuery(FIELD_GUIDE_ID, guide.getId().toString()));
+            indexWriter.deleteDocuments(new PhraseQuery(IndexSchema.FIELD_GUIDE_ID, guide.getId().toString()));
         } catch (IOException e) {
             LOG.error("Failed to delete all documents before re-indexing.", e);
         }
@@ -182,27 +169,20 @@ public List<SearchResult> searchGuide(String queryText, @Nullable Guide onlyFrom
 
         var indexSearcher = new IndexSearcher(indexReader);
 
-        var parser = new StandardQueryParser(analyzer);
-        parser.setMultiFields(new String[] {
-                FIELD_TITLE_EN,
-                FIELD_TEXT_EN
-        });
-        parser.setFieldsBoost(Map.of(FIELD_TITLE_EN, 1.2f));
-
         Query query;
         try {
-            query = parser.parse(queryText, null);
-        } catch (QueryNodeException e) {
-            LOG.debug("Failed to parse Lucene query: '{}'", queryText, e);
+            query = GuideQueryParser.parse(queryText);
+        } catch (Exception e) {
+            LOG.debug("Failed to parse search query: '{}'", queryText, e);
             return List.of();
         }
 
         // Filter by guide if given one
         if (onlyFromGuide != null) {
             query = new BooleanQuery.Builder()
                     .add(query, BooleanClause.Occur.MUST)
-                    .add(new TermQuery(new Term(FIELD_GUIDE_ID, onlyFromGuide.getId().toString())),
-                            BooleanClause.Occur.MUST)
+                    .add(new TermQuery(new Term(IndexSchema.FIELD_GUIDE_ID, onlyFromGuide.getId().toString())),
+                            BooleanClause.Occur.FILTER)
                     .build();
         }
 
@@ -224,8 +204,8 @@ public List<SearchResult> searchGuide(String queryText, @Nullable Guide onlyFrom
 
             for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
                 var document = storedFields.document(scoreDoc.doc);
-                var guideId = ResourceLocation.parse(document.get(FIELD_GUIDE_ID));
-                var pageId = ResourceLocation.parse(document.get(FIELD_PAGE_ID));
+                var guideId = ResourceLocation.parse(document.get(IndexSchema.FIELD_GUIDE_ID));
+                var pageId = ResourceLocation.parse(document.get(IndexSchema.FIELD_PAGE_ID));
 
                 var guide = Guides.getById(guideId);
                 if (guide == null) {
@@ -241,14 +221,15 @@ public List<SearchResult> searchGuide(String queryText, @Nullable Guide onlyFrom
 
                 String bestFragment = "";
                 try {
-                    bestFragment = highlighter.getBestFragment(analyzer, FIELD_TEXT_EN, document.get(FIELD_TEXT));
+                    bestFragment = highlighter.getBestFragment(analyzer, IndexSchema.FIELD_TEXT_EN,
+                            document.get(IndexSchema.FIELD_TEXT));
                 } catch (InvalidTokenOffsetsException e) {
                     LOG.error("Cannot determine text to highlight for result", e);
                 }
 
                 // This is kinda shit, but the Lucene highlighter isn't exactly flexible with its return type
                 // it only supports strings.
-                var pageTitle = document.get(FIELD_TITLE);
+                var pageTitle = document.get(IndexSchema.FIELD_TITLE);
 
                 var startOfSegment = 0;
                 LytFlowSpan currentSpan = new LytFlowSpan();
@@ -294,15 +275,15 @@ private Document createPageDocument(Guide guide, ParsedGuidePage page) {
         var pageTitle = getPageTitle(guide, page);
 
         var doc = new Document();
-        doc.add(new StringField(FIELD_GUIDE_ID, guide.getId().toString(), Field.Store.YES));
-        doc.add(new StoredField(FIELD_PAGE_ID, page.getId().toString()));
+        doc.add(new StringField(IndexSchema.FIELD_GUIDE_ID, guide.getId().toString(), Field.Store.YES));
+        doc.add(new StoredField(IndexSchema.FIELD_PAGE_ID, page.getId().toString()));
 
         // Store original text for highlighting and display purposes
-        doc.add(new StoredField(FIELD_TITLE, pageTitle));
-        doc.add(new StoredField(FIELD_TEXT, pageText));
+        doc.add(new StoredField(IndexSchema.FIELD_TITLE, pageTitle));
+        doc.add(new StoredField(IndexSchema.FIELD_TEXT, pageText));
 
-        doc.add(new TextField(FIELD_TITLE_EN, pageTitle, Field.Store.NO));
-        doc.add(new TextField(FIELD_TEXT_EN, pageText, Field.Store.NO));
+        doc.add(new TextField(IndexSchema.FIELD_TITLE_EN, pageTitle, Field.Store.NO));
+        doc.add(new TextField(IndexSchema.FIELD_TEXT_EN, pageText, Field.Store.NO));
         return doc;
     }
 

diff --git a/src/main/java/guideme/internal/search/IndexSchema.java b/src/main/java/guideme/internal/search/IndexSchema.java
@@ -0,0 +1,29 @@
+package guideme.internal.search;
+
+import java.util.Map;
+
+/**
+ * Names of the fields used in the search index, plus lookups for the analyzed per-language fields.
+ */
+final class IndexSchema {
+    static final String FIELD_GUIDE_ID = "guide_id";
+    static final String FIELD_PAGE_ID = "page_id";
+    static final String FIELD_TEXT = "page_content";
+    static final String FIELD_TITLE = "page_title";
+    // Fields for analyzed text
+    static final String FIELD_TITLE_EN = "page_title_en";
+    static final String FIELD_TEXT_EN = "page_content_en";
+
+    // Language code -> analyzed title field
+    private static final Map<String, String> TITLE_FIELD_BY_LANGUAGE = Map.of("en", FIELD_TITLE_EN);
+    // Language code -> analyzed text field
+    private static final Map<String, String> TEXT_FIELD_BY_LANGUAGE = Map.of("en", FIELD_TEXT_EN);
+
+    private IndexSchema() {
+    }
+
+    /**
+     * @param language the language code to look up
+     * @return the analyzed title field for the language, or the English title field if none is registered
+     */
+    public static String getTitleField(String language) {
+        return fieldFor(TITLE_FIELD_BY_LANGUAGE, language, FIELD_TITLE_EN);
+    }
+
+    /**
+     * @param language the language code to look up
+     * @return the analyzed text field for the language, or the English text field if none is registered
+     */
+    public static String getTextField(String language) {
+        return fieldFor(TEXT_FIELD_BY_LANGUAGE, language, FIELD_TEXT_EN);
+    }
+
+    // Resolves the field registered for a language, falling back to the given field name.
+    private static String fieldFor(Map<String, String> fieldsByLanguage, String language, String fallback) {
+        return fieldsByLanguage.getOrDefault(language, fallback);
+    }
+}
diff --git a/src/main/java/guideme/internal/search/QueryStringSplitter.java b/src/main/java/guideme/internal/search/QueryStringSplitter.java
@@ -0,0 +1,56 @@
+package guideme.internal.search;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Splits a user-entered search string into terms and quoted phrases.
+ */
+public final class QueryStringSplitter {
+    private QueryStringSplitter() {
+    }
+
+    /**
+     * Splits the query at whitespace, keeping quoted phrases together as a single term.
+     * <ul>
+     * <li>A double quote always starts a phrase; a single quote only starts one at the beginning of a term, so
+     * apostrophes inside words (as in {@code don't}) are kept as part of the word.</li>
+     * <li>A phrase is closed only by the same quote character that opened it. The quotes themselves are not part of
+     * the returned term, but whitespace inside the phrase is preserved.</li>
+     * <li>A phrase that is never closed extends to the end of the query.</li>
+     * <li>Empty or whitespace-only terms (such as {@code ""}) are never returned.</li>
+     * </ul>
+     *
+     * @param query the raw search string
+     * @return the terms in the order they appear in the query; possibly empty
+     */
+    public static List<String> split(String query) {
+        List<String> terms = new ArrayList<>();
+        StringBuilder currentTerm = new StringBuilder();
+        // The quote character that opened the current phrase, or 0 while outside of a phrase
+        char openQuote = 0;
+
+        for (int i = 0; i < query.length(); i++) {
+            char ch = query.charAt(i);
+
+            if (openQuote != 0) {
+                if (ch == openQuote) {
+                    // Matching closing quote ends the phrase
+                    flushTerm(terms, currentTerm);
+                    openQuote = 0;
+                } else {
+                    // Everything else, including whitespace and the other quote character, belongs to the phrase
+                    currentTerm.append(ch);
+                }
+            } else if (ch == '"' || (ch == '\'' && currentTerm.isEmpty())) {
+                // Opening quote; a double quote directly after a word also ends that word
+                flushTerm(terms, currentTerm);
+                openQuote = ch;
+            } else if (Character.isWhitespace(ch)) {
+                flushTerm(terms, currentTerm);
+            } else {
+                currentTerm.append(ch);
+            }
+        }
+
+        // Add the last term (or unterminated phrase) if there's any remaining text
+        flushTerm(terms, currentTerm);
+
+        return terms;
+    }
+
+    // Adds the accumulated term to the list unless it is blank, then resets the accumulator.
+    private static void flushTerm(List<String> terms, StringBuilder currentTerm) {
+        String term = currentTerm.toString();
+        if (!term.isBlank()) {
+            terms.add(term);
+        }
+        currentTerm.setLength(0);
+    }
+}