Add support for index_prefix #816
Merged: 7 commits merged on Feb 6, 2025
Changes from 5 commits
10 changes: 10 additions & 0 deletions clientlib/src/main/proto/yelp/nrtsearch/luceneserver.proto
@@ -672,6 +672,16 @@ message Field {
// than ignore_above will not be indexed or stored. This option is useful for protecting against Lucene’s
// term byte-length limit of 32766
optional int32 ignoreAbove = 36;
// Enables indexing of term prefixes to speed up prefix searches
IndexPrefixes indexPrefixes = 37;
}

// Options for including IndexPrefixes for field
message IndexPrefixes {
// The minimum prefix length to index. Must be greater than 0, and defaults to 2.
optional int32 min_chars = 1;
// The maximum prefix length to index. Must be less than 20, and defaults to 5.
optional int32 max_chars = 2;
}

// Vector field element type
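For context, a minimal sketch of how a client could enable the new option when registering a searchable TEXT field, using the generated gRPC builders. Only setSearch and setIndexPrefixes appear in this diff; setName and setType are assumed to follow the existing Field message, and the field name "title" is hypothetical.

import com.yelp.nrtsearch.server.grpc.Field;
import com.yelp.nrtsearch.server.grpc.FieldType;
import com.yelp.nrtsearch.server.grpc.IndexPrefixes;

public class IndexPrefixExample {
  public static void main(String[] args) {
    // Build a searchable TEXT field with prefix indexing enabled.
    // min_chars/max_chars must respect the documented bounds: greater than 0 and less than 20.
    Field titleField =
        Field.newBuilder()
            .setName("title") // hypothetical field name
            .setType(FieldType.TEXT)
            .setSearch(true)
            .setIndexPrefixes(
                IndexPrefixes.newBuilder().setMinChars(2).setMaxChars(5).build())
            .build();
    System.out.println(titleField);
  }
}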
3,413 changes: 1,753 additions & 1,660 deletions grpc-gateway/luceneserver.pb.go

Large diffs are not rendered by default.

18 changes: 18 additions & 0 deletions grpc-gateway/luceneserver.swagger.json
@@ -3338,6 +3338,10 @@
"type": "integer",
"format": "int32",
"title": "For arrays of strings, ignoreAbove will be applied for each array element separately and string elements longer\nthan ignore_above will not be indexed or stored. This option is useful for protecting against Lucene’s\nterm byte-length limit of 32766"
},
"indexPrefixes": {
"$ref": "#/definitions/luceneserverIndexPrefixes",
"title": "IndexPrefixes when set for TEXT field"
}
},
"title": "Definition of a field in an index"
@@ -3961,6 +3965,20 @@
"description": "- DEFAULT: Use field default index options: ATOM=DOCS, TEXT=DOCS_FREQS_POSITIONS\n - DOCS: Index only doc ids\n - DOCS_FREQS: Index doc ids and term frequencies\n - DOCS_FREQS_POSITIONS: Index doc ids, term frequencies and positions\n - DOCS_FREQS_POSITIONS_OFFSETS: Index doc ids, term frequencies, positions and offsets",
"title": "How text tokens should be indexed"
},
"luceneserverIndexPrefixes": {
"type": "object",
"properties": {
"minChars": {
"type": "integer",
"format": "int32"
},
"maxChars": {
"type": "integer",
"format": "int32"
}
},
"title": "Options for including IndexPrefixes for field"
},
"luceneserverIndexSettings": {
"type": "object",
"properties": {
64 changes: 64 additions & 0 deletions src/main/java/com/yelp/nrtsearch/server/analysis/PrefixWrappedAnalyzer.java
@@ -0,0 +1,64 @@
/*
* Copyright 2025 Yelp Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.yelp.nrtsearch.server.analysis;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.AnalyzerWrapper;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;

/**
* An {@link AnalyzerWrapper} that wraps another analyzer and applies an Edge N-Gram token filter to
* the token stream.
*/
public class PrefixWrappedAnalyzer extends AnalyzerWrapper {
private final int minChars;
private final int maxChars;
private final Analyzer delegate;

/**
* Create a new {@link PrefixWrappedAnalyzer} that wraps the given {@link Analyzer} and applies
* an Edge N-Gram token filter to the token stream.
*
* @param delegate the analyzer to wrap
* @param minChars the minimum number of characters for the edge n-grams
* @param maxChars the maximum number of characters for the edge n-grams
*/
public PrefixWrappedAnalyzer(Analyzer delegate, int minChars, int maxChars) {
super(delegate.getReuseStrategy());
this.delegate = delegate;
this.minChars = minChars;
this.maxChars = maxChars;
}

@Override
protected Analyzer getWrappedAnalyzer(String fieldName) {
return delegate;
}

@Override
protected TokenStreamComponents wrapComponents(
String fieldName, TokenStreamComponents components) {
TokenFilter filter =
new EdgeNGramTokenFilter(components.getTokenStream(), minChars, maxChars, false);
return new TokenStreamComponents(components.getSource(), filter);
}

@Override
public String toString() {
return "PrefixWrappedAnalyzer(" + delegate.toString() + ")";
}
}
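As a rough illustration of what the wrapper produces, here is a sketch that tokenizes one value through a PrefixWrappedAnalyzer built over Lucene's StandardAnalyzer. The base analyzer and the "title._index_prefix" field name are assumptions for the example; in the PR both are derived from the registered field.

import com.yelp.nrtsearch.server.analysis.PrefixWrappedAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class PrefixAnalyzerSketch {
  public static void main(String[] args) throws Exception {
    Analyzer base = new StandardAnalyzer(); // assumed base analyzer for the example
    Analyzer prefixAnalyzer = new PrefixWrappedAnalyzer(base, 2, 5);
    try (TokenStream ts = prefixAnalyzer.tokenStream("title._index_prefix", "search")) {
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        // With minChars=2, maxChars=5 and preserveOriginal=false this prints: se, sea, sear, searc
        System.out.println(term.toString());
      }
      ts.end();
    }
  }
}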
17 changes: 12 additions & 5 deletions src/main/java/com/yelp/nrtsearch/server/field/AtomFieldDef.java
@@ -17,9 +17,11 @@

import static com.yelp.nrtsearch.server.analysis.AnalyzerCreator.hasAnalyzer;

import com.yelp.nrtsearch.server.field.properties.PrefixQueryable;
import com.yelp.nrtsearch.server.field.properties.RangeQueryable;
import com.yelp.nrtsearch.server.field.properties.Sortable;
import com.yelp.nrtsearch.server.grpc.Field;
import com.yelp.nrtsearch.server.grpc.PrefixQuery;
import com.yelp.nrtsearch.server.grpc.RangeQuery;
import com.yelp.nrtsearch.server.grpc.SortType;
import java.util.List;
@@ -31,14 +33,12 @@
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.SortField;
import org.apache.lucene.search.SortedSetSortField;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.*;
import org.apache.lucene.util.BytesRef;

/** Field class for 'ATOM' field type. Uses {@link KeywordAnalyzer} for text analysis. */
public class AtomFieldDef extends TextBaseFieldDef implements Sortable, RangeQueryable {
public class AtomFieldDef extends TextBaseFieldDef
implements Sortable, RangeQueryable, PrefixQueryable {
private static final Analyzer keywordAnalyzer = new KeywordAnalyzer();

public AtomFieldDef(
@@ -150,4 +150,11 @@ public Query getRangeQuery(RangeQuery rangeQuery) {
"Only SORTED or SORTED_SET doc values are supported for range queries: " + getName());
}
}

@Override
public Query getPrefixQuery(PrefixQuery prefixQuery, MultiTermQuery.RewriteMethod rewriteMethod) {
verifySearchable("Prefix query");
return new org.apache.lucene.search.PrefixQuery(
new Term(prefixQuery.getField(), prefixQuery.getPrefix()), rewriteMethod);
}
}
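For ATOM fields there is no prefix subfield: the gRPC PrefixQuery message is mapped directly onto Lucene's org.apache.lucene.search.PrefixQuery over the keyword term. A small sketch of the request message the new method consumes; the builder setters are assumed to mirror the getField/getPrefix accessors used above, and the field name "status" is hypothetical.

import com.yelp.nrtsearch.server.grpc.PrefixQuery;

public class AtomPrefixQuerySketch {
  public static void main(String[] args) {
    // gRPC query message; AtomFieldDef.getPrefixQuery turns this into
    // new org.apache.lucene.search.PrefixQuery(new Term("status", "act"), rewriteMethod)
    PrefixQuery grpcQuery =
        PrefixQuery.newBuilder()
            .setField("status") // hypothetical ATOM field
            .setPrefix("act")
            .build();
    System.out.println(grpcQuery);
  }
}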
108 changes: 108 additions & 0 deletions src/main/java/com/yelp/nrtsearch/server/field/PrefixFieldDef.java
@@ -0,0 +1,108 @@
/*
* Copyright 2025 Yelp Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.yelp.nrtsearch.server.field;

import com.yelp.nrtsearch.server.analysis.PrefixWrappedAnalyzer;
import com.yelp.nrtsearch.server.grpc.Field;
import com.yelp.nrtsearch.server.grpc.PrefixQuery;
import java.util.ArrayList;
import java.util.List;
import java.util.Objects;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.*;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.Operations;

public class PrefixFieldDef extends TextBaseFieldDef {
private final int minChars;
private final int maxChars;
private final String parentField;
private static final String INDEX_PREFIX = "._index_prefix";

public PrefixFieldDef(
String parentName, Field requestField, FieldDefCreator.FieldDefCreatorContext context) {
super(parentName + INDEX_PREFIX, requestField, context);
this.minChars = requestField.getIndexPrefixes().getMinChars();
this.maxChars = requestField.getIndexPrefixes().getMaxChars();
this.parentField = parentName;
}

@Override
protected void setSearchProperties(FieldType fieldType, Field requestField) {
fieldType.setOmitNorms(true);
fieldType.setTokenized(true);
if (requestField.getSearch()) {
setIndexOptions(requestField.getIndexOptions(), fieldType, IndexOptions.DOCS);
}
}

@Override
protected Analyzer parseIndexAnalyzer(Field requestField) {
Analyzer baseAnalyzer = super.parseIndexAnalyzer(requestField);
if (baseAnalyzer == null) {
throw new IllegalArgumentException("Could not determine analyzer");
}
return new PrefixWrappedAnalyzer(
baseAnalyzer,
requestField.getIndexPrefixes().getMinChars(),
requestField.getIndexPrefixes().getMaxChars());
}

boolean accept(int length) {
return length >= minChars - 1 && length <= maxChars;
}

public Query getPrefixQuery(PrefixQuery prefixQuery) {
String textValue = prefixQuery.getPrefix();
if (textValue.length() >= minChars) {
return super.getTermQueryFromTextValue(textValue);
}
List<Automaton> automata = new ArrayList<>();
automata.add(Automata.makeString(textValue));
for (int i = textValue.length(); i < minChars; i++) {
automata.add(Automata.makeAnyChar());
}
Automaton automaton = Operations.concatenate(automata);
AutomatonQuery query = new AutomatonQuery(new Term(getName(), textValue + "*"), automaton);

return new BooleanQuery.Builder()
.add(query, BooleanClause.Occur.SHOULD)
.add(new TermQuery(new Term(parentField, textValue)), BooleanClause.Occur.SHOULD)
.build();
}

@Override
public String getType() {
return "PREFIX";
}

public int getMinChars() {
return minChars;
}

public int getMaxChars() {
return maxChars;
}

@Override
public int hashCode() {
return Objects.hash(super.hashCode(), minChars, maxChars, parentField);
}
}
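The interesting case is a search prefix shorter than min_chars, which is never indexed as an edge n-gram. getPrefixQuery then pads an automaton out to min_chars and ORs in a term query against the parent field, so short exact values still match. Below is a standalone sketch of the equivalent Lucene query built for a one-character prefix with min_chars=2; the field names "title" and "title._index_prefix" are hypothetical.

import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.AutomatonQuery;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.Operations;

public class ShortPrefixQuerySketch {
  public static void main(String[] args) {
    String prefix = "s"; // shorter than min_chars = 2
    int minChars = 2;
    String prefixField = "title._index_prefix"; // child field: parent name + "._index_prefix"
    String parentField = "title";

    // Pad the automaton out to min_chars so it matches any indexed prefix term
    // that starts with the given text ("s" followed by one arbitrary character).
    List<Automaton> automata = new ArrayList<>();
    automata.add(Automata.makeString(prefix));
    for (int i = prefix.length(); i < minChars; i++) {
      automata.add(Automata.makeAnyChar());
    }
    Automaton automaton = Operations.concatenate(automata);
    Query onPrefixField = new AutomatonQuery(new Term(prefixField, prefix + "*"), automaton);

    // Also match documents whose parent field contains the exact one-character term,
    // since terms shorter than min_chars produce no edge n-grams in the prefix field.
    Query query =
        new BooleanQuery.Builder()
            .add(onPrefixField, BooleanClause.Occur.SHOULD)
            .add(new TermQuery(new Term(parentField, prefix)), BooleanClause.Occur.SHOULD)
            .build();
    System.out.println(query);
  }
}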
105 changes: 104 additions & 1 deletion src/main/java/com/yelp/nrtsearch/server/field/TextFieldDef.java
@@ -15,15 +15,71 @@
*/
package com.yelp.nrtsearch.server.field;

import com.yelp.nrtsearch.server.field.properties.PrefixQueryable;
import com.yelp.nrtsearch.server.grpc.Field;
import com.yelp.nrtsearch.server.grpc.IndexPrefixes;
import com.yelp.nrtsearch.server.grpc.PrefixQuery;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.ConstantScoreQuery;
import org.apache.lucene.search.MultiTermQuery;
import org.apache.lucene.search.Query;

/** Field class for 'TEXT' field type. */
public class TextFieldDef extends TextBaseFieldDef {
public class TextFieldDef extends TextBaseFieldDef implements PrefixQueryable {
protected PrefixFieldDef prefixFieldDef;
private final Map<String, IndexableFieldDef<?>> childFieldsWithPrefix;
private static final int DEFAULT_MIN_CHARS = 2;
private static final int DEFAULT_MAX_CHARS = 5;

public TextFieldDef(
String name, Field requestField, FieldDefCreator.FieldDefCreatorContext context) {
super(name, requestField, context);
if (requestField.hasIndexPrefixes()) {
verifySearchable("Prefix query");
int minChars =
requestField.getIndexPrefixes().hasMinChars()
? requestField.getIndexPrefixes().getMinChars()
: DEFAULT_MIN_CHARS;
int maxChars =
requestField.getIndexPrefixes().hasMaxChars()
? requestField.getIndexPrefixes().getMaxChars()
: DEFAULT_MAX_CHARS;
validatePrefix(minChars, maxChars);

this.prefixFieldDef =
new PrefixFieldDef(
getName(),
Field.newBuilder()
.setSearch(true)
.setAnalyzer(requestField.getAnalyzer())
.setIndexAnalyzer(requestField.getIndexAnalyzer())
.setIndexPrefixes(
IndexPrefixes.newBuilder()
.setMinChars(minChars)
.setMaxChars(maxChars)
.build())
.build(),
context);

Map<String, IndexableFieldDef<?>> childFieldsMap = new HashMap<>(super.getChildFields());
childFieldsMap.put(prefixFieldDef.getName(), prefixFieldDef);
childFieldsWithPrefix = Collections.unmodifiableMap(childFieldsMap);
} else {
this.prefixFieldDef = null;
childFieldsWithPrefix = super.getChildFields();
}
}

@Override
public Map<String, IndexableFieldDef<?>> getChildFields() {
return childFieldsWithPrefix;
}

@Override
@@ -51,4 +107,51 @@ protected void setSearchProperties(FieldType fieldType, Field requestField) {
fieldType.setTokenized(true);
fieldType.setOmitNorms(requestField.getOmitNorms());
}

public PrefixFieldDef getPrefixFieldDef() {
return prefixFieldDef;
}

public boolean hasPrefix() {
return prefixFieldDef != null;
}

@Override
public void parseDocumentField(
Document document, List<String> fieldValues, List<List<String>> facetHierarchyPaths) {
super.parseDocumentField(document, fieldValues, facetHierarchyPaths);

if (hasPrefix() && !fieldValues.isEmpty()) {
prefixFieldDef.parseDocumentField(document, fieldValues, facetHierarchyPaths);
}
}

@Override
public Query getPrefixQuery(PrefixQuery prefixQuery, MultiTermQuery.RewriteMethod rewriteMethod) {
verifySearchable("Prefix query");
if (hasPrefix() && prefixFieldDef.accept(prefixQuery.getPrefix().length())) {
Query query = prefixFieldDef.getPrefixQuery(prefixQuery);
if (rewriteMethod == null
|| rewriteMethod == MultiTermQuery.CONSTANT_SCORE_REWRITE
|| rewriteMethod == MultiTermQuery.CONSTANT_SCORE_BOOLEAN_REWRITE) {
return new ConstantScoreQuery(query);
}
return query;
}
return new org.apache.lucene.search.PrefixQuery(
new Term(prefixQuery.getField(), prefixQuery.getPrefix()), rewriteMethod);
}

public void validatePrefix(int minChars, int maxChars) {
if (minChars > maxChars) {
throw new IllegalArgumentException(
"min_chars [" + minChars + "] must be less than max_chars [" + maxChars + "]");
}
if (minChars < 1) {
throw new IllegalArgumentException("min_chars [" + minChars + "] must be greater than zero");
}
if (maxChars >= 20) {
throw new IllegalArgumentException("max_chars [" + maxChars + "] must be less than 20");
}
}
}