-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
6 changed files
with
261 additions
and
47 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
83 changes: 83 additions & 0 deletions
83
src/main/java/guideme/internal/search/GuideQueryParser.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
package guideme.internal.search; | ||
|
||
import java.util.List; | ||
import org.apache.lucene.index.Term; | ||
import org.apache.lucene.search.BooleanClause; | ||
import org.apache.lucene.search.BooleanQuery; | ||
import org.apache.lucene.search.BoostQuery; | ||
import org.apache.lucene.search.PhraseQuery; | ||
import org.apache.lucene.search.Query; | ||
import org.apache.lucene.search.TermQuery; | ||
import org.apache.lucene.search.WildcardQuery; | ||
|
||
public class GuideQueryParser { | ||
private GuideQueryParser() { | ||
} | ||
|
||
/** | ||
* This method will create a query of the following form: | ||
* <ul> | ||
* <li>A query that matches every document where a field contains all terms.</li> | ||
* <li>OR A query that matches every document where a field contains any of the terms, but boosted to 0.1.</li> | ||
* </ul> | ||
*/ | ||
public static Query parse(String queryString) { | ||
var tokens = QueryStringSplitter.split(queryString); | ||
|
||
var textField = IndexSchema.getTextField("en"); | ||
var titleField = IndexSchema.getTitleField("en"); | ||
|
||
var builder = new BooleanQuery.Builder(); | ||
|
||
// Exact occurrences in the title are scored with 20% boost | ||
builder.add(new BoostQuery(buildFieldQuery(titleField, tokens, false, BooleanClause.Occur.SHOULD), 1.2f), | ||
BooleanClause.Occur.SHOULD); | ||
// Exact occurrences in the body are scored normally | ||
builder.add(buildFieldQuery(textField, tokens, false, BooleanClause.Occur.SHOULD), BooleanClause.Occur.SHOULD); | ||
// Occurrences in the title, where the last token is expanded to a wildcard are scored at 40% | ||
builder.add(new BoostQuery(buildFieldQuery(titleField, tokens, true, BooleanClause.Occur.SHOULD), 0.4f), | ||
BooleanClause.Occur.SHOULD); | ||
// Occurrences in the body, where the last token is expanded to a wildcard are scored at 20% | ||
builder.add(new BoostQuery(buildFieldQuery(textField, tokens, true, BooleanClause.Occur.SHOULD), 0.2f), | ||
BooleanClause.Occur.SHOULD); | ||
|
||
return builder.build(); | ||
} | ||
|
||
private static BooleanQuery buildFieldQuery(String fieldName, List<String> tokens, boolean makeLastTokenWildcard, | ||
BooleanClause.Occur clause) { | ||
// Prepare a BooleanQuery to combine terms with OR | ||
var booleanQueryBuilder = new BooleanQuery.Builder(); | ||
|
||
for (int i = 0; i < tokens.size(); i++) { | ||
String token = tokens.get(i); | ||
|
||
if (token.contains(" ")) { | ||
// Phrase query | ||
var splitToken = QueryStringSplitter.split(token); | ||
booleanQueryBuilder.add(new PhraseQuery(fieldName, splitToken.toArray(String[]::new)), clause); | ||
continue; | ||
} | ||
|
||
// Make the last token a wildcard | ||
if (makeLastTokenWildcard && i == tokens.size() - 1 && !token.endsWith("*")) { | ||
token += "*"; | ||
} | ||
|
||
Term term = new Term(fieldName, token); | ||
|
||
Query q; | ||
if (token.contains("*")) { | ||
q = new WildcardQuery(term); | ||
} else { | ||
q = new TermQuery(term); | ||
} | ||
|
||
booleanQueryBuilder.add(q, clause); | ||
} | ||
|
||
// Return the constructed BooleanQuery | ||
return booleanQueryBuilder.build(); | ||
} | ||
|
||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
package guideme.internal.search; | ||
|
||
import java.util.Map; | ||
|
||
/**
 * Names of the fields used in the search index, plus lookups for the language-specific field names.
 */
final class IndexSchema {
    static final String FIELD_GUIDE_ID = "guide_id";
    static final String FIELD_PAGE_ID = "page_id";
    static final String FIELD_TEXT = "page_content";
    static final String FIELD_TITLE = "page_title";
    // Fields for analyzed text
    static final String FIELD_TITLE_EN = "page_title_en";
    static final String FIELD_TEXT_EN = "page_content_en";

    // Language code -> analyzed title field
    private static final Map<String, String> titleFields = Map.of(
            "en", FIELD_TITLE_EN);
    // Language code -> analyzed body text field
    private static final Map<String, String> textFields = Map.of(
            "en", FIELD_TEXT_EN);

    private IndexSchema() {
    }

    /**
     * Returns the name of the analyzed title field for a language.
     *
     * @param language the language code, e.g. {@code "en"}; may be {@code null}
     * @return the title field registered for the language, or {@link #FIELD_TITLE_EN} when the
     *         language is unknown or {@code null}
     */
    public static String getTitleField(String language) {
        return lookup(titleFields, language, FIELD_TITLE_EN);
    }

    /**
     * Returns the name of the analyzed body text field for a language.
     *
     * @param language the language code, e.g. {@code "en"}; may be {@code null}
     * @return the text field registered for the language, or {@link #FIELD_TEXT_EN} when the
     *         language is unknown or {@code null}
     */
    public static String getTextField(String language) {
        return lookup(textFields, language, FIELD_TEXT_EN);
    }

    /**
     * Looks up a language in one of the field tables, falling back to the given default.
     */
    private static String lookup(Map<String, String> fields, String language, String fallback) {
        // Maps created by Map.of throw NullPointerException when queried with a null key,
        // so a missing language has to be handled before the lookup.
        if (language == null) {
            return fallback;
        }
        return fields.getOrDefault(language, fallback);
    }
}
56 changes: 56 additions & 0 deletions
56
src/main/java/guideme/internal/search/QueryStringSplitter.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
package guideme.internal.search; | ||
|
||
import java.util.ArrayList; | ||
import java.util.List; | ||
|
||
/**
 * Splits a raw search string into individual terms and quoted phrases.
 */
public final class QueryStringSplitter {
    private QueryStringSplitter() {
    }

    /**
     * Splits a query on whitespace while keeping quoted phrases together.
     * <ul>
     * <li>Text enclosed in {@code "..."} or {@code '...'} becomes a single term with its inner
     * whitespace preserved. A phrase is closed only by the same quote character that opened it, so
     * {@code "it's here"} yields {@code it's here}.</li>
     * <li>An apostrophe inside a word (for example {@code don't}) is kept as a literal character;
     * it starts a phrase only at the beginning of a term.</li>
     * <li>A phrase that is still open at the end of the input is terminated there.</li>
     * <li>Empty and whitespace-only terms, such as the result of {@code ""}, are dropped.</li>
     * </ul>
     *
     * @param query the raw query text; {@code null} is treated like an empty query
     * @return the terms in input order, without the surrounding quotes; never {@code null}
     */
    public static List<String> split(String query) {
        List<String> terms = new ArrayList<>();
        if (query == null) {
            return terms;
        }

        StringBuilder currentTerm = new StringBuilder();
        // Quote character that opened the phrase we are currently in, or 0 when outside a phrase
        char openQuote = 0;

        for (int i = 0; i < query.length(); i++) {
            char ch = query.charAt(i);

            if (openQuote != 0) {
                if (ch == openQuote) {
                    // Matching closing quote: the phrase is complete
                    flush(terms, currentTerm);
                    openQuote = 0;
                } else {
                    // Everything else, including whitespace and the other quote character, is literal
                    currentTerm.append(ch);
                }
            } else if (ch == '"' || (ch == '\'' && currentTerm.isEmpty())) {
                // An apostrophe in the middle of a word is not a quote, see the final branch
                openQuote = ch;
            } else if (Character.isWhitespace(ch)) {
                flush(terms, currentTerm);
            } else {
                currentTerm.append(ch);
            }
        }

        // Emit whatever is left, including an unterminated phrase
        flush(terms, currentTerm);

        return terms;
    }

    /**
     * Moves the accumulated term into the result list unless it is empty or blank, then resets the buffer.
     */
    private static void flush(List<String> terms, StringBuilder currentTerm) {
        String term = currentTerm.toString();
        if (!term.isBlank()) {
            terms.add(term);
        }
        currentTerm.setLength(0);
    }
}
Oops, something went wrong.