From cbe34bd5fc33ab58c2dd2123a0ebf2b90e302bfd Mon Sep 17 00:00:00 2001 From: John Mazanec Date: Wed, 26 Feb 2025 07:03:47 -0800 Subject: [PATCH] Refactor derived source nested injector Refactors derived source nested injector to be more readable and reliable. Moves low-level lucene calls into a helper class. Simplifies the iterator and the injector. Along with refactoring, it blocks enabling this feature for fields with more than 2 levels of nesting. This includes object fields as well. The focus will be getting this to work in a very stable fashion before moving on to generalized support. Additionally, it adds a new integ test. Still need to add uTs but the refactor overall for nested should make it a lot easier. Signed-off-by: John Mazanec --- ...AbstractPerFieldDerivedVectorInjector.java | 15 +- .../DerivedSourceLuceneHelper.java | 239 ++++++++++++ .../NestedPerFieldDerivedVectorInjector.java | 349 +++++++++--------- ...tedPerFieldParentToChildDocIdIterator.java | 96 +++++ .../NestedPerFieldParentToDocIdIterator.java | 172 --------- .../RootPerFieldDerivedVectorInjector.java | 2 +- .../index/mapper/FlatVectorFieldMapper.java | 4 +- .../index/mapper/KNNVectorFieldMapper.java | 4 +- .../mapper/KNNVectorFieldMapperUtil.java | 13 + .../knn/index/mapper/LuceneFieldMapper.java | 4 +- .../knn/index/mapper/MethodFieldMapper.java | 4 +- .../knn/index/mapper/ModelFieldMapper.java | 4 +- .../opensearch/knn/integ/DerivedSourceIT.java | 208 ++++++++++- .../org/opensearch/knn/KNNRestTestCase.java | 29 +- 14 files changed, 768 insertions(+), 375 deletions(-) create mode 100644 src/main/java/org/opensearch/knn/index/codec/derivedsource/DerivedSourceLuceneHelper.java create mode 100644 src/main/java/org/opensearch/knn/index/codec/derivedsource/NestedPerFieldParentToChildDocIdIterator.java delete mode 100644 src/main/java/org/opensearch/knn/index/codec/derivedsource/NestedPerFieldParentToDocIdIterator.java diff --git a/src/main/java/org/opensearch/knn/index/codec/derivedsource/AbstractPerFieldDerivedVectorInjector.java b/src/main/java/org/opensearch/knn/index/codec/derivedsource/AbstractPerFieldDerivedVectorInjector.java index bba5e14c19..f0908f75a0 100644 --- a/src/main/java/org/opensearch/knn/index/codec/derivedsource/AbstractPerFieldDerivedVectorInjector.java +++ b/src/main/java/org/opensearch/knn/index/codec/derivedsource/AbstractPerFieldDerivedVectorInjector.java @@ -8,10 +8,10 @@ import lombok.extern.log4j.Log4j2; import org.apache.lucene.index.FieldInfo; import org.apache.lucene.util.BytesRef; +import org.opensearch.common.CheckedSupplier; import org.opensearch.knn.common.FieldInfoExtractor; import org.opensearch.knn.index.VectorDataType; import org.opensearch.knn.index.mapper.KNNVectorFieldMapperUtil; -import org.opensearch.knn.index.vectorvalues.KNNVectorValues; import java.io.IOException; @@ -22,18 +22,23 @@ abstract class AbstractPerFieldDerivedVectorInjector implements PerFieldDerivedV * to the correct position. * * @param fieldInfo fieldinfo for the vector field - * @param vectorValues vector values of the field. getVector or getConditionalVector should return expected vector. + * @param vectorSupplier supplies vector (without clone) + * @param vectorCloneSupplier supplies clone of vector. 
* @return vector formatted based on the vector data type * @throws IOException if unable to deserialize stored vector */ - protected Object formatVector(FieldInfo fieldInfo, KNNVectorValues vectorValues) throws IOException { - Object vectorValue = vectorValues.getVector(); + protected Object formatVector( + FieldInfo fieldInfo, + CheckedSupplier vectorSupplier, + CheckedSupplier vectorCloneSupplier + ) throws IOException { + Object vectorValue = vectorSupplier.get(); // If the vector value is a byte[], we must deserialize if (vectorValue instanceof byte[]) { BytesRef vectorBytesRef = new BytesRef((byte[]) vectorValue); VectorDataType vectorDataType = FieldInfoExtractor.extractVectorDataType(fieldInfo); return KNNVectorFieldMapperUtil.deserializeStoredVector(vectorBytesRef, vectorDataType); } - return vectorValues.conditionalCloneVector(); + return vectorCloneSupplier.get(); } } diff --git a/src/main/java/org/opensearch/knn/index/codec/derivedsource/DerivedSourceLuceneHelper.java b/src/main/java/org/opensearch/knn/index/codec/derivedsource/DerivedSourceLuceneHelper.java new file mode 100644 index 0000000000..0b42adeabb --- /dev/null +++ b/src/main/java/org/opensearch/knn/index/codec/derivedsource/DerivedSourceLuceneHelper.java @@ -0,0 +1,239 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.knn.index.codec.derivedsource; + +import lombok.RequiredArgsConstructor; +import org.apache.lucene.index.DocValuesType; +import org.apache.lucene.index.FieldInfo; +import org.apache.lucene.index.NumericDocValues; +import org.apache.lucene.index.PostingsEnum; +import org.apache.lucene.index.SegmentReadState; +import org.apache.lucene.index.Terms; +import org.apache.lucene.index.TermsEnum; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.util.BytesRef; +import org.opensearch.index.mapper.FieldNamesFieldMapper; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; + +/** + * Utility class used to implement Lucene functionality that can be used to interact with Lucene + */ +@RequiredArgsConstructor +public class DerivedSourceLuceneHelper { + + private final DerivedSourceReaders derivedSourceReaders; + private final SegmentReadState segmentReadState; + + /** + * Return list of documents matching term in range + * + * @param startDocId first eligible document (inclusive) + * @param endDocId last eligible document (inclusive) + * @param termField field to check for term + * @param term term to match + * @return list of docIds that match the term in the given range + * @throws IOException if there is an issue reading + */ + public List termMatchesInRange(int startDocId, int endDocId, String termField, String term) throws IOException { + if (endDocId - startDocId < 0) { + return Collections.emptyList(); + } + + // First, we need to get the current PostingsEnum for the key as term field and term + PostingsEnum postingsEnum = getPostingsEnum(termField, term); + + // Next, get all the docs that match this parent path. 
If none were found, return an empty list + if (postingsEnum == null) { + return Collections.emptyList(); + } + List matchingDocs = new ArrayList<>(); + postingsEnum.advance(startDocId); + while (postingsEnum.docID() != NO_MORE_DOCS && postingsEnum.docID() <= endDocId) { + if (postingsEnum.freq() > 0) { + matchingDocs.add(postingsEnum.docID()); + } + postingsEnum.nextDoc(); + } + + return matchingDocs; + } + + /** + * Check if the docId is a parent for the given field. To do this, it checks if any of the documents in the range + * contain the parent field in the _nested_path + * + * @param offset First doc to check (inclusive) + * @param parentDocId Parent document to be checked if its a parent + * @param parentFieldName path to parent field + * @return true if the docId is a parent, false otherwise + */ + public boolean isNestedParent(int offset, int parentDocId, String parentFieldName) throws IOException { + // Check if the document before the parent has the + if (parentFieldName == null) { + return false; + } + + if (parentDocId <= 0) { + return false; + } + + return termMatchesInRange(offset, parentDocId - 1, "_nested_path", parentFieldName).isEmpty() == false; + } + + /** + * Check if the field exists for the given document. + * + * @param fieldToMatch field to check + * @param docId document to check + * @return true if the field exists for the given document, false otherwise + * @throws IOException if there is an issue reading from the segment + */ + public boolean fieldExists(String fieldToMatch, int docId) throws IOException { + int firstDocId = getFirstDocWhereFieldExists(fieldToMatch, docId); + return firstDocId == docId; + } + + /** + * Get the lowest docId for a field that is greater than (or equal to) the offset. This method is implemented in a + * very similar way as checking if a field exists. + * + * @param fieldToMatch field to find the lowest docId for + * @param offset offset to start searching from (inclusive) + * @return lowest docId for the field that is greater than the offset. Returns {@link DocIdSetIterator#NO_MORE_DOCS} if doc cannot be found + * @throws IOException if there is an issue reading from the formats + */ + public int getFirstDocWhereFieldExists(String fieldToMatch, int offset) throws IOException { + // This method implementation is inspired by the FieldExistsQuery in Lucene and the FieldNamesMapper in + // Opensearch. We first mimic the logic in the FieldExistsQuery in order to identify the docId of the nested + // doc. If that fails, we rely on + // References: + // 1. https://github.com/apache/lucene/blob/main/lucene/core/src/java/org/apache/lucene/search/FieldExistsQuery.java#L170-L218. + // 2. 
+ // https://github.com/opensearch-project/OpenSearch/blob/main/server/src/main/java/org/opensearch/index/mapper/FieldMapper.java#L316-L324 + FieldInfo fieldInfo = segmentReadState.fieldInfos.fieldInfo(fieldToMatch); + + if (fieldInfo == null) { + return NO_MORE_DOCS; + } + + DocIdSetIterator iterator = null; + if (fieldInfo.hasNorms() && derivedSourceReaders.getNormsProducer() != null) { // the field indexes norms + iterator = derivedSourceReaders.getNormsProducer().getNorms(fieldInfo); + } else if (fieldInfo.getVectorDimension() != 0 && derivedSourceReaders.getKnnVectorsReader() != null) { // the field indexes vectors + switch (fieldInfo.getVectorEncoding()) { + case FLOAT32: + iterator = derivedSourceReaders.getKnnVectorsReader().getFloatVectorValues(fieldInfo.name).iterator(); + break; + case BYTE: + iterator = derivedSourceReaders.getKnnVectorsReader().getByteVectorValues(fieldInfo.name).iterator(); + break; + } + } else if (fieldInfo.getDocValuesType() != DocValuesType.NONE && derivedSourceReaders.getDocValuesProducer() != null) { // the field + // indexes + // doc + // values + switch (fieldInfo.getDocValuesType()) { + case NUMERIC: + iterator = derivedSourceReaders.getDocValuesProducer().getNumeric(fieldInfo); + break; + case BINARY: + iterator = derivedSourceReaders.getDocValuesProducer().getBinary(fieldInfo); + break; + case SORTED: + iterator = derivedSourceReaders.getDocValuesProducer().getSorted(fieldInfo); + break; + case SORTED_NUMERIC: + iterator = derivedSourceReaders.getDocValuesProducer().getSortedNumeric(fieldInfo); + break; + case SORTED_SET: + iterator = derivedSourceReaders.getDocValuesProducer().getSortedSet(fieldInfo); + break; + case NONE: + default: + throw new AssertionError(); + } + } + if (iterator != null) { + return iterator.advance(offset); + } + + // Check the field names field type for matches + PostingsEnum postingsEnum = getPostingsEnum(FieldNamesFieldMapper.NAME, fieldInfo.name); + if (postingsEnum == null) { + return NO_MORE_DOCS; + } + return postingsEnum.advance(offset); + } + + /** + * Get the first child of the given parentDoc. This can be used to determine if the document contains any nested + * fields. + * + * @return doc id of last matching doc. {@link DocIdSetIterator#NO_MORE_DOCS} if no children exist. + * @throws IOException + */ + public int getFirstChild(int parentDocId) throws IOException { + // If its the first document id, then there is no change there are parents + if (parentDocId == 0) { + return NO_MORE_DOCS; + } + + // Only root level documents have the "_primary_term" field. So, we iterate through all of the documents in + // order to find out if any have this term. + // TODO: This is expensive and should be optimized. We should start at doc parentDocId - 10000 and work back + // (can we fetch the setting? Maybe) + FieldInfo fieldInfo = segmentReadState.fieldInfos.fieldInfo("_primary_term"); + assert derivedSourceReaders.getDocValuesProducer() != null; + NumericDocValues numericDocValues = derivedSourceReaders.getDocValuesProducer().getNumeric(fieldInfo); + int previousParentDocId = NO_MORE_DOCS; + numericDocValues.advance(0); + while (numericDocValues.docID() != NO_MORE_DOCS) { + if (numericDocValues.docID() >= parentDocId) { + break; + } + previousParentDocId = numericDocValues.docID(); + numericDocValues.nextDoc(); + } + + // If there are no numeric docvalues before the current parent doc, then the parent doc is the first parent. 
So + // its first child must be 0 + if (previousParentDocId == NO_MORE_DOCS) { + return 0; + } + // If the document right before is the previous parent, then there are no children. + if (parentDocId - previousParentDocId <= 1) { + return NO_MORE_DOCS; + } + return previousParentDocId + 1; + } + + private PostingsEnum getPostingsEnum(String termField, String term) throws IOException { + if (derivedSourceReaders.getFieldsProducer() == null) { + return null; + } + Terms terms = derivedSourceReaders.getFieldsProducer().terms(termField); + if (terms == null) { + return null; + } + TermsEnum nestedFieldsTerms = terms.iterator(); + BytesRef childPathRef = new BytesRef(term); + PostingsEnum postingsEnum = null; + while (nestedFieldsTerms.next() != null) { + BytesRef currentTerm = nestedFieldsTerms.term(); + if (currentTerm.bytesEquals(childPathRef)) { + postingsEnum = nestedFieldsTerms.postings(null); + break; + } + } + return postingsEnum; + } +} diff --git a/src/main/java/org/opensearch/knn/index/codec/derivedsource/NestedPerFieldDerivedVectorInjector.java b/src/main/java/org/opensearch/knn/index/codec/derivedsource/NestedPerFieldDerivedVectorInjector.java index dc378ec378..bfec33ad74 100644 --- a/src/main/java/org/opensearch/knn/index/codec/derivedsource/NestedPerFieldDerivedVectorInjector.java +++ b/src/main/java/org/opensearch/knn/index/codec/derivedsource/NestedPerFieldDerivedVectorInjector.java @@ -5,17 +5,10 @@ package org.opensearch.knn.index.codec.derivedsource; -import lombok.AllArgsConstructor; import lombok.extern.log4j.Log4j2; -import org.apache.lucene.index.DocValuesType; import org.apache.lucene.index.FieldInfo; -import org.apache.lucene.index.PostingsEnum; import org.apache.lucene.index.SegmentReadState; -import org.apache.lucene.index.Terms; -import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.search.DocIdSetIterator; -import org.apache.lucene.util.BytesRef; -import org.opensearch.index.mapper.FieldNamesFieldMapper; +import org.opensearch.common.xcontent.support.XContentMapValues; import org.opensearch.knn.index.vectorvalues.KNNVectorValues; import org.opensearch.knn.index.vectorvalues.KNNVectorValuesFactory; @@ -24,40 +17,92 @@ import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.Objects; import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; +/** + * Injector class for nested fields and object fields. The class assumes that there will only be one level of nesting + * and that any vector path will have only one parent (i.e. 
parent.vector is supported but grandparent.parent.vector is + * not) + */ @Log4j2 -@AllArgsConstructor public class NestedPerFieldDerivedVectorInjector extends AbstractPerFieldDerivedVectorInjector { private final FieldInfo childFieldInfo; private final DerivedSourceReaders derivedSourceReaders; + private final DerivedSourceLuceneHelper derivedSourceLuceneHelper; private final SegmentReadState segmentReadState; + /** + * + * @param childFieldInfo FieldInfo of the child field + * @param derivedSourceReaders Readers for access segment info + * @param segmentReadState Segment read stats + */ + public NestedPerFieldDerivedVectorInjector( + FieldInfo childFieldInfo, + DerivedSourceReaders derivedSourceReaders, + SegmentReadState segmentReadState + ) { + this.childFieldInfo = childFieldInfo; + this.derivedSourceReaders = derivedSourceReaders; + this.segmentReadState = segmentReadState; + this.derivedSourceLuceneHelper = new DerivedSourceLuceneHelper(derivedSourceReaders, segmentReadState); + } + @Override - public void inject(int parentDocId, Map sourceAsMap) throws IOException { - // If the parent has the field, then it is just an object field. - int lowestDocIdForFieldWithParentAsOffset = getLowestDocIdForField(childFieldInfo.name, parentDocId); - if (lowestDocIdForFieldWithParentAsOffset == parentDocId) { - injectObject(parentDocId, sourceAsMap); + public void inject(int docId, Map sourceAsMap) throws IOException { + KNNVectorValues vectorValues = KNNVectorValuesFactory.getVectorValues( + childFieldInfo, + derivedSourceReaders.getDocValuesProducer(), + derivedSourceReaders.getKnnVectorsReader() + ); + + // If the doc has the field, then it is just an object field. + if (derivedSourceLuceneHelper.fieldExists(childFieldInfo.name, docId)) { + injectObject(docId, sourceAsMap, vectorValues); return; } - // Setup the iterator. Return if no parent - String childFieldName = ParentChildHelper.getChildField(childFieldInfo.name); - String parentFieldName = ParentChildHelper.getParentField(childFieldInfo.name); - if (parentFieldName == null) { + // If the doc doesnt have the field, either the field is a nested field or it is an object field that is + // just not present for the doc. Regardless, we will treat as nested and do nothing if it is just missing the + // field. + injectNested(docId, sourceAsMap, vectorValues); + } + + private void injectObject(int docId, Map sourceAsMap, KNNVectorValues vectorValues) throws IOException { + // Check if a vector is actually present for this value + if (vectorValues.docId() != docId && vectorValues.advance(docId) != docId) { return; } - NestedPerFieldParentToDocIdIterator nestedPerFieldParentToDocIdIterator = new NestedPerFieldParentToDocIdIterator( - childFieldInfo, - segmentReadState, - derivedSourceReaders, - parentDocId + + // Translate the flat path to a nested map of maps + String[] fields = ParentChildHelper.splitPath(childFieldInfo.name); + Map currentMap = sourceAsMap; + for (int i = 0; i < fields.length - 1; i++) { + String field = fields[i]; + currentMap = (Map) currentMap.computeIfAbsent(field, k -> new HashMap<>()); + } + currentMap.put( + fields[fields.length - 1], + formatVector(childFieldInfo, vectorValues::getVector, vectorValues::conditionalCloneVector) ); + } - if (nestedPerFieldParentToDocIdIterator.numChildren() == 0) { + private void injectNested(int parentDocId, Map sourceAsMap, KNNVectorValues vectorValues) throws IOException { + // The first child represents the first child document of the parent. 
This does not mean that this child is + // a document belonging to the current field that is being injected. Instead, it just means that the + // parent doc has nested fields + int firstChild = derivedSourceLuceneHelper.getFirstChild(parentDocId); + if (firstChild == NO_MORE_DOCS) { + return; + } + + // We need to check if the parent field is a nested field. + String childFieldName = ParentChildHelper.getChildField(childFieldInfo.name); + String parentFieldName = ParentChildHelper.getParentField(childFieldInfo.name); + if (derivedSourceLuceneHelper.isNestedParent(firstChild, parentDocId, parentFieldName) == false) { return; } @@ -70,207 +115,167 @@ public void inject(int parentDocId, Map sourceAsMap) throws IOEx reconstructedSource = (List>) originalParentValue; } - // Contains the docIds of existing objects in the map in order. This is used to help figure out the best play - // to put back the vectors - List positions = mapObjectsToPositionInNestedList( - reconstructedSource, - nestedPerFieldParentToDocIdIterator.firstChild(), - parentDocId - ); + // In order to inject vectors into source for nested documents, we need to be able to map the existing + // maps to document positions. This lets us know what place to put the vector back into. For example: + // Assume we have the following document from the user and we are deriving the value for nested.vector + // { + // "nested": [ + // { + // "text": "text1" + // }, + // { + // "vector": [vec1] + // }, + // { + // "vector": [vec2], + // "text": "text2" + // } + // ] + // } + // + // This would get filtered and serialized as: + // { + // "nested": [ + // { + // "text": "text1" + // }, + // { + // "text": "text2" + // } + // ] + // } + // + // We need to ensure that when we want to inject vec1 back, we create a new map and put it in between + // the existing fields. + // + // To do this, we need to know what docs the existing 2 maps map to. + List docIdsInNestedList = mapObjectsInNestedListToDocIds(reconstructedSource, firstChild, parentDocId); - // Finally, inject children for the document into the source. This code is non-trivial because filtering out - // the vectors during write could mean that children docs disappear from the source. So, to properly put - // everything back, we need to figure out where the existing fields in the original map to - KNNVectorValues vectorValues = KNNVectorValuesFactory.getVectorValues( - childFieldInfo, - derivedSourceReaders.getDocValuesProducer(), - derivedSourceReaders.getKnnVectorsReader() + // Finally, inject children for the document into the source. + NestedPerFieldParentToChildDocIdIterator nestedPerFieldParentToChildDocIdIterator = new NestedPerFieldParentToChildDocIdIterator( + parentDocId, + firstChild, + vectorValues ); int offsetPositionsIndex = 0; - while (nestedPerFieldParentToDocIdIterator.nextChild() != NO_MORE_DOCS) { - // If the child does not have a vector, vectValues advance will advance past child to the next matching - // docId. So, we need to ensure that doing this does not pass the parent docId. 
- if (nestedPerFieldParentToDocIdIterator.childId() > vectorValues.docId()) { - vectorValues.advance(nestedPerFieldParentToDocIdIterator.childId()); - } - if (vectorValues.docId() != nestedPerFieldParentToDocIdIterator.childId()) { - continue; - } - - int docId = nestedPerFieldParentToDocIdIterator.childId(); + while (nestedPerFieldParentToChildDocIdIterator.nextDoc() != NO_MORE_DOCS) { + int docId = nestedPerFieldParentToChildDocIdIterator.docId(); boolean isInsert = true; - int position = positions.size(); // by default we insert it at the end - for (int i = offsetPositionsIndex; i < positions.size(); i++) { - if (docId < positions.get(i)) { + + // Find the position in the nested source list of maps to put it back + int position = docIdsInNestedList.size(); + for (int i = offsetPositionsIndex; i < docIdsInNestedList.size(); i++) { + if (docId < docIdsInNestedList.get(i)) { position = i; break; } - if (docId == positions.get(i)) { + if (docId == docIdsInNestedList.get(i)) { isInsert = false; position = i; break; } } + // If we need to insert a new map, we do so here if (isInsert) { reconstructedSource.add(position, new HashMap<>()); - positions.add(position, docId); + docIdsInNestedList.add(position, docId); } - reconstructedSource.get(position).put(childFieldName, formatVector(childFieldInfo, vectorValues)); + reconstructedSource.get(position) + .put( + childFieldName, + formatVector( + childFieldInfo, + nestedPerFieldParentToChildDocIdIterator::getVector, + nestedPerFieldParentToChildDocIdIterator::getVectorClone + ) + ); offsetPositionsIndex = position + 1; } sourceAsMap.put(parentFieldName, reconstructedSource); } - private void injectObject(int docId, Map sourceAsMap) throws IOException { - KNNVectorValues vectorValues = KNNVectorValuesFactory.getVectorValues( - childFieldInfo, - derivedSourceReaders.getDocValuesProducer(), - derivedSourceReaders.getKnnVectorsReader() - ); - if (vectorValues.docId() != docId && vectorValues.advance(docId) != docId) { - return; - } - String[] fields = ParentChildHelper.splitPath(childFieldInfo.name); - Map currentMap = sourceAsMap; - for (int i = 0; i < fields.length - 1; i++) { - String field = fields[i]; - currentMap = (Map) currentMap.computeIfAbsent(field, k -> new HashMap<>()); - } - currentMap.put(fields[fields.length - 1], formatVector(childFieldInfo, vectorValues)); - } - /** - * Given a list of maps, map each map to a position in the nested list. This is used to help figure out where to put - * the vectors back in the source. + * Given a list of maps, map each map to a doc id. This is used to help figure out where to put + * the vectors back in the source. The assumption is that earlier objects in the list will have lower doc ids than + * later objects in the map * * @param originals list of maps - * @param firstChild first child docId + * @param offset Position to move iterators to identify the positions in the map * @param parent parent docId * @return list of positions in the nested list * @throws IOException if there is an issue reading from the formats */ - private List mapObjectsToPositionInNestedList(List> originals, int firstChild, int parent) - throws IOException { + private List mapObjectsInNestedListToDocIds(List> originals, int offset, int parent) throws IOException { + // Starting at the offset, we iterate over all of maps in the list of maps and figure out what doc id they map + // to. 
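+        // Illustrative walk-through (doc ids assumed for the example, not read from a real segment):
+        // suppose the parent is doc 5 and its nested children are docs 2-4. If the filtered source list
+        // holds two maps, one containing "text1" (written as doc 2) and one containing "text2" (written
+        // as doc 4), then starting at offset 2 this method returns [2, 4]. Doc 3, whose only field was
+        // the vector, has no surviving map, so injectNested will later insert a fresh map for it between
+        // the two existing ones.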
List positions = new ArrayList<>(); - int offset = firstChild; + int currentOffset = offset; for (Map docWithFields : originals) { - int fieldMapping = mapToDocId(docWithFields, offset, parent); - assert fieldMapping != -1; + int fieldMapping = mapToDocId(docWithFields, currentOffset, parent); + assert fieldMapping != NO_MORE_DOCS; positions.add(fieldMapping); - offset = fieldMapping + 1; + currentOffset = fieldMapping + 1; } return positions; } /** - * Given a doc as a map and the offset it has to be, find the ordinal of the first field that is greater than the - * offset. + * Given a doc as a map and the offset it has to be after, return the doc id that it must be * - * @param doc doc to find the ordinal for + * @param doc doc to find the docId for * @param offset offset to start searching from - * @return id of the first field that is greater than the offset + * @return doc id the map must map to * @throws IOException if there is an issue reading from the formats */ private int mapToDocId(Map doc, int offset, int parent) throws IOException { - // For all the fields, we look for the first doc that matches any of the fields. - int position = NO_MORE_DOCS; - for (String key : doc.keySet()) { - position = getLowestDocIdForField(ParentChildHelper.constructSiblingField(childFieldInfo.name, key), offset); - if (position < parent) { - break; - } - } + // First, we identify a field that the doc in question must have + FieldInfo fieldInfoOfDoc = getAnyMatchingFieldInfoForDoc(doc); + assert fieldInfoOfDoc != null; + // Get the first document on/after the offset that has this field. + int firstMatchingDocWithField = derivedSourceLuceneHelper.getFirstDocWhereFieldExists(fieldInfoOfDoc.name, offset); // Advancing past the parent means something went wrong + assert firstMatchingDocWithField < parent; + + // The field in question may be a nested field. In this case, we need to find the next parent on the same level + // as the child to figure out where to put back the vector. + List matches = derivedSourceLuceneHelper.termMatchesInRange( + firstMatchingDocWithField, + parent - 1, + "_nested_path", + ParentChildHelper.getParentField(childFieldInfo.name) + ); + assert matches != null; + assert matches.isEmpty() == false; + int position = matches.getFirst(); assert position < parent; return position; } /** - * Get the lowest docId for a field that is greater than the offset. + * For a given map, return a {@link FieldInfo} that the doc must have. * - * @param fieldToMatch field to find the lowest docId for - * @param offset offset to start searching from - * @return lowest docId for the field that is greater than the offset. Returns {@link DocIdSetIterator#NO_MORE_DOCS} if doc cannot be found - * @throws IOException if there is an issue reading from the formats + * @param doc source of the document + * @return {@link FieldInfo} of any field the document must have; null if none are found */ - private int getLowestDocIdForField(String fieldToMatch, int offset) throws IOException { - // This method implementation is inspired by the FieldExistsQuery in Lucene and the FieldNamesMapper in - // Opensearch. We first mimic the logic in the FieldExistsQuery in order to identify the docId of the nested - // doc. If that fails, we rely on - // References: - // 1. https://github.com/apache/lucene/blob/main/lucene/core/src/java/org/apache/lucene/search/FieldExistsQuery.java#L170-L218. - // 2. 
- // https://github.com/opensearch-project/OpenSearch/blob/main/server/src/main/java/org/opensearch/index/mapper/FieldMapper.java#L316-L324 - FieldInfo fieldInfo = segmentReadState.fieldInfos.fieldInfo(fieldToMatch); - - if (fieldInfo == null) { - return NO_MORE_DOCS; - } - - DocIdSetIterator iterator = null; - if (fieldInfo.hasNorms() && derivedSourceReaders.getNormsProducer() != null) { // the field indexes norms - iterator = derivedSourceReaders.getNormsProducer().getNorms(fieldInfo); - } else if (fieldInfo.getVectorDimension() != 0 && derivedSourceReaders.getKnnVectorsReader() != null) { // the field indexes vectors - switch (fieldInfo.getVectorEncoding()) { - case FLOAT32: - iterator = derivedSourceReaders.getKnnVectorsReader().getFloatVectorValues(fieldInfo.name).iterator(); - break; - case BYTE: - iterator = derivedSourceReaders.getKnnVectorsReader().getByteVectorValues(fieldInfo.name).iterator(); - break; - } - } else if (fieldInfo.getDocValuesType() != DocValuesType.NONE && derivedSourceReaders.getDocValuesProducer() != null) { // the field - // indexes - // doc - // values - switch (fieldInfo.getDocValuesType()) { - case NUMERIC: - iterator = derivedSourceReaders.getDocValuesProducer().getNumeric(fieldInfo); - break; - case BINARY: - iterator = derivedSourceReaders.getDocValuesProducer().getBinary(fieldInfo); - break; - case SORTED: - iterator = derivedSourceReaders.getDocValuesProducer().getSorted(fieldInfo); - break; - case SORTED_NUMERIC: - iterator = derivedSourceReaders.getDocValuesProducer().getSortedNumeric(fieldInfo); - break; - case SORTED_SET: - iterator = derivedSourceReaders.getDocValuesProducer().getSortedSet(fieldInfo); - break; - case NONE: - default: - throw new AssertionError(); + private FieldInfo getAnyMatchingFieldInfoForDoc(Map doc) { + for (FieldInfo fieldInfo : segmentReadState.fieldInfos) { + String extractedFieldName = ParentChildHelper.getChildField(fieldInfo.name); + String parentFieldName = ParentChildHelper.getParentField(fieldInfo.name); + if (extractedFieldName == null || !Objects.equals(parentFieldName, ParentChildHelper.getParentField(childFieldInfo.name))) { + continue; } - } - if (iterator != null) { - return iterator.advance(offset); - } - // Check the field names field type for matches - if (derivedSourceReaders.getFieldsProducer() == null) { - return NO_MORE_DOCS; - } - Terms terms = derivedSourceReaders.getFieldsProducer().terms(FieldNamesFieldMapper.NAME); - if (terms == null) { - return NO_MORE_DOCS; - } - TermsEnum fieldNameFieldsTerms = terms.iterator(); - BytesRef fieldToMatchRef = new BytesRef(fieldInfo.name); - PostingsEnum postingsEnum = null; - while (fieldNameFieldsTerms.next() != null) { - BytesRef currentTerm = fieldNameFieldsTerms.term(); - if (currentTerm.bytesEquals(fieldToMatchRef)) { - postingsEnum = fieldNameFieldsTerms.postings(null); - break; + Object object = XContentMapValues.extractValue(extractedFieldName, doc, NullValue.INSTANCE); + if (object != null) { + return fieldInfo; } } - if (postingsEnum == null) { - return NO_MORE_DOCS; - } - return postingsEnum.advance(offset); + return null; + } + + private static class NullValue { + private static final NullValue INSTANCE = new NullValue(); } } diff --git a/src/main/java/org/opensearch/knn/index/codec/derivedsource/NestedPerFieldParentToChildDocIdIterator.java b/src/main/java/org/opensearch/knn/index/codec/derivedsource/NestedPerFieldParentToChildDocIdIterator.java new file mode 100644 index 0000000000..5deb1734fb --- /dev/null +++ 
b/src/main/java/org/opensearch/knn/index/codec/derivedsource/NestedPerFieldParentToChildDocIdIterator.java @@ -0,0 +1,96 @@ +/* + * Copyright OpenSearch Contributors + * SPDX-License-Identifier: Apache-2.0 + */ + +package org.opensearch.knn.index.codec.derivedsource; + +import lombok.Getter; +import org.opensearch.knn.index.vectorvalues.KNNVectorValues; + +import java.io.IOException; + +import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; + +/** + * Iterator over the children documents of a particular parent + */ +public class NestedPerFieldParentToChildDocIdIterator { + + private final int parentDocId; + @Getter + private int firstChild; + private int currentChild; + private final KNNVectorValues vectorValues; + + /** + * Constructor + * + * @param parentDocId the parent docId + * @param firstChild the first child of this parent doc + * @param vectorValues the vector values + */ + public NestedPerFieldParentToChildDocIdIterator(int parentDocId, int firstChild, KNNVectorValues vectorValues) { + this.parentDocId = parentDocId; + this.vectorValues = vectorValues; + this.currentChild = -1; + this.firstChild = firstChild; + } + + /** + * Get the next child for this parent + * + * @return the next child docId. If there are no more children, return + * NO_MORE_DOCS + */ + public int nextDoc() throws IOException { + if (currentChild == NO_MORE_DOCS) { + return NO_MORE_DOCS; + } + + // On the first call, we advance to the first child and, if it has the vector for the field, return it. + if (currentChild == -1) { + currentChild = vectorValues.advance(firstChild); + } else { + currentChild = vectorValues.nextDoc(); + } + + if (currentChild >= parentDocId) { + currentChild = NO_MORE_DOCS; + if (vectorValues.docId() != NO_MORE_DOCS) { + vectorValues.advance(NO_MORE_DOCS); + } + } + + return currentChild; + } + + /** + * Get the current child for this parent + * + * @return the current child docId. If this has not been set, return -1 + */ + public int docId() { + return currentChild; + } + + /** + * Get the vector value for the current child. + * + * @return the vector for the current child + * @throws IOException if there is an error reading the vector values + */ + public Object getVector() throws IOException { + return vectorValues.getVector(); + } + + /** + * Get a clone of the vector for the current child. 
+ * + * @return a clone of the vector for the current child + * @throws IOException if there is an error reading the vector values + */ + public Object getVectorClone() throws IOException { + return vectorValues.conditionalCloneVector(); + } +} diff --git a/src/main/java/org/opensearch/knn/index/codec/derivedsource/NestedPerFieldParentToDocIdIterator.java b/src/main/java/org/opensearch/knn/index/codec/derivedsource/NestedPerFieldParentToDocIdIterator.java deleted file mode 100644 index d2bc1a32fd..0000000000 --- a/src/main/java/org/opensearch/knn/index/codec/derivedsource/NestedPerFieldParentToDocIdIterator.java +++ /dev/null @@ -1,172 +0,0 @@ -/* - * Copyright OpenSearch Contributors - * SPDX-License-Identifier: Apache-2.0 - */ - -package org.opensearch.knn.index.codec.derivedsource; - -import org.apache.lucene.index.FieldInfo; -import org.apache.lucene.index.NumericDocValues; -import org.apache.lucene.index.PostingsEnum; -import org.apache.lucene.index.SegmentReadState; -import org.apache.lucene.index.Terms; -import org.apache.lucene.index.TermsEnum; -import org.apache.lucene.util.BytesRef; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.Collections; -import java.util.List; - -import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; - -/** - * Iterator over the children documents of a particular parent - */ -public class NestedPerFieldParentToDocIdIterator { - - private final FieldInfo childFieldInfo; - private final SegmentReadState segmentReadState; - private final DerivedSourceReaders derivedSourceReaders; - private final int parentDocId; - private final int previousParentDocId; - private final List children; - private int currentChild; - - /** - * - * @param childFieldInfo FieldInfo for the child field - * @param segmentReadState SegmentReadState for the segment - * @param derivedSourceReaders {@link DerivedSourceReaders} instance - * @param parentDocId Parent docId of the parent - * @throws IOException if there is an error reading the parent docId - */ - public NestedPerFieldParentToDocIdIterator( - FieldInfo childFieldInfo, - SegmentReadState segmentReadState, - DerivedSourceReaders derivedSourceReaders, - int parentDocId - ) throws IOException { - this.childFieldInfo = childFieldInfo; - this.segmentReadState = segmentReadState; - this.derivedSourceReaders = derivedSourceReaders; - this.parentDocId = parentDocId; - this.previousParentDocId = previousParent(); - this.children = getChildren(); - this.currentChild = -1; - } - - /** - * For the given parent get its first child offset - * - * @return the first child offset. If there are no children, just return NO_MORE_DOCS - */ - public int firstChild() { - if (parentDocId - previousParentDocId == 1) { - return NO_MORE_DOCS; - } - return previousParentDocId + 1; - } - - /** - * Get the next child for this parent - * - * @return the next child docId. If this has not been set, return -1. If there are no more children, return - * NO_MORE_DOCS - */ - public int nextChild() { - currentChild++; - if (currentChild >= children.size()) { - return NO_MORE_DOCS; - } - return children.get(currentChild); - } - - /** - * Get the current child for this parent - * - * @return the current child docId. 
If this has not been set, return -1 - */ - public int childId() { - return children.get(currentChild); - } - - /** - * - * @return the number of children for this parent - */ - public int numChildren() { - return children.size(); - } - - /** - * For parentDocId of this class, find the one just before it to be used for matching children. - * - * @return the parent docId just before the parentDocId. -1 if none exist - * @throws IOException if there is an error reading the parent docId - */ - private int previousParent() throws IOException { - // TODO: In the future this needs to be generalized to handle multiple levels of nesting - // For now, for non-nested docs, the primary_term field can be used to identify root level docs. For reference: - // https://github.com/opensearch-project/OpenSearch/blob/2.18.0/server/src/main/java/org/opensearch/search/fetch/subphase/SeqNoPrimaryTermPhase.java#L72 - // https://github.com/opensearch-project/OpenSearch/blob/3032bef54d502836789ea438f464ae0b1ba978b2/server/src/main/java/org/opensearch/index/mapper/SeqNoFieldMapper.java#L206-L230 - // We use it here to identify the previous parent to the current parent to get a range on the children documents - FieldInfo seqTermsFieldInfo = segmentReadState.fieldInfos.fieldInfo("_primary_term"); - NumericDocValues numericDocValues = derivedSourceReaders.getDocValuesProducer().getNumeric(seqTermsFieldInfo); - int previousParentDocId = -1; - while (numericDocValues.nextDoc() != NO_MORE_DOCS) { - if (numericDocValues.docID() >= parentDocId) { - break; - } - previousParentDocId = numericDocValues.docID(); - } - return previousParentDocId; - } - - /** - * Get all the children that match the parent path for the _nested_field - * - * @return list of children that match the parent path - * @throws IOException if there is an error reading the children - */ - private List getChildren() throws IOException { - if (this.parentDocId - this.previousParentDocId <= 1) { - return Collections.emptyList(); - } - - // First, we need to get the currect PostingsEnum for the key as _nested_path and the value the actual parent - // path. - String childField = childFieldInfo.name; - String parentField = ParentChildHelper.getParentField(childField); - - Terms terms = derivedSourceReaders.getFieldsProducer().terms("_nested_path"); - if (terms == null) { - return Collections.emptyList(); - } - TermsEnum nestedFieldsTerms = terms.iterator(); - BytesRef childPathRef = new BytesRef(parentField); - PostingsEnum postingsEnum = null; - while (nestedFieldsTerms.next() != null) { - BytesRef currentTerm = nestedFieldsTerms.term(); - if (currentTerm.bytesEquals(childPathRef)) { - postingsEnum = nestedFieldsTerms.postings(null); - break; - } - } - - // Next, get all the children that match this parent path. 
If none were found, return an empty list - if (postingsEnum == null) { - return Collections.emptyList(); - } - List children = new ArrayList<>(); - postingsEnum.advance(previousParentDocId + 1); - while (postingsEnum.docID() != NO_MORE_DOCS && postingsEnum.docID() < parentDocId) { - if (postingsEnum.freq() > 0) { - children.add(postingsEnum.docID()); - } - postingsEnum.nextDoc(); - } - - return children; - } -} diff --git a/src/main/java/org/opensearch/knn/index/codec/derivedsource/RootPerFieldDerivedVectorInjector.java b/src/main/java/org/opensearch/knn/index/codec/derivedsource/RootPerFieldDerivedVectorInjector.java index e9c4d21a68..b379e7a02b 100644 --- a/src/main/java/org/opensearch/knn/index/codec/derivedsource/RootPerFieldDerivedVectorInjector.java +++ b/src/main/java/org/opensearch/knn/index/codec/derivedsource/RootPerFieldDerivedVectorInjector.java @@ -40,7 +40,7 @@ public RootPerFieldDerivedVectorInjector(FieldInfo fieldInfo, DerivedSourceReade public void inject(int docId, Map sourceAsMap) throws IOException { KNNVectorValues vectorValues = vectorValuesSupplier.get(); if (vectorValues.docId() == docId || vectorValues.advance(docId) == docId) { - sourceAsMap.put(fieldInfo.name, formatVector(fieldInfo, vectorValues)); + sourceAsMap.put(fieldInfo.name, formatVector(fieldInfo, vectorValues::getVector, vectorValues::conditionalCloneVector)); } } } diff --git a/src/main/java/org/opensearch/knn/index/mapper/FlatVectorFieldMapper.java b/src/main/java/org/opensearch/knn/index/mapper/FlatVectorFieldMapper.java index 68ea25a1fc..7c6cd7d9a0 100644 --- a/src/main/java/org/opensearch/knn/index/mapper/FlatVectorFieldMapper.java +++ b/src/main/java/org/opensearch/knn/index/mapper/FlatVectorFieldMapper.java @@ -45,6 +45,7 @@ public static FlatVectorFieldMapper createFieldMapper( knnMethodConfigContext::getDimension ); return new FlatVectorFieldMapper( + fullname, simpleName, mappedFieldType, multiFields, @@ -59,6 +60,7 @@ public static FlatVectorFieldMapper createFieldMapper( } private FlatVectorFieldMapper( + String fullName, String simpleName, KNNVectorFieldType mappedFieldType, MultiFields multiFields, @@ -87,7 +89,7 @@ private FlatVectorFieldMapper( this.perDimensionValidator = selectPerDimensionValidator(vectorDataType); this.fieldType = new FieldType(KNNVectorFieldMapper.Defaults.FIELD_TYPE); this.fieldType.setDocValuesType(DocValuesType.BINARY); - if (isDerivedSourceEnabled) { + if (KNNVectorFieldMapperUtil.isDeriveSourceForFieldEnabled(isDerivedSourceEnabled, fullName)) { this.fieldType.putAttribute(DERIVED_VECTOR_FIELD_ATTRIBUTE_KEY, DERIVED_VECTOR_FIELD_ATTRIBUTE_TRUE_VALUE); } this.fieldType.freeze(); diff --git a/src/main/java/org/opensearch/knn/index/mapper/KNNVectorFieldMapper.java b/src/main/java/org/opensearch/knn/index/mapper/KNNVectorFieldMapper.java index 3fb328fc7e..07e92d6ee5 100644 --- a/src/main/java/org/opensearch/knn/index/mapper/KNNVectorFieldMapper.java +++ b/src/main/java/org/opensearch/knn/index/mapper/KNNVectorFieldMapper.java @@ -249,6 +249,8 @@ public KNNVectorFieldMapper build(BuilderContext context) { final Explicit ignoreMalformed = ignoreMalformed(context); final Map metaValue = meta.getValue(); + String fullName = buildFullName(context); + log.info("Full name: {}", fullName); if (modelId.get() != null) { return ModelFieldMapper.createFieldMapper( buildFullName(context), @@ -273,7 +275,7 @@ public KNNVectorFieldMapper build(BuilderContext context) { // MethodFieldMapper to maintain backwards compatibility if (originalParameters.getResolvedKnnMethodContext() == 
null && indexCreatedVersion.onOrAfter(Version.V_2_17_0)) { return FlatVectorFieldMapper.createFieldMapper( - buildFullName(context), + fullName, name, metaValue, KNNMethodConfigContext.builder() diff --git a/src/main/java/org/opensearch/knn/index/mapper/KNNVectorFieldMapperUtil.java b/src/main/java/org/opensearch/knn/index/mapper/KNNVectorFieldMapperUtil.java index 1701154a93..e3fd1f45fe 100644 --- a/src/main/java/org/opensearch/knn/index/mapper/KNNVectorFieldMapperUtil.java +++ b/src/main/java/org/opensearch/knn/index/mapper/KNNVectorFieldMapperUtil.java @@ -24,6 +24,7 @@ import org.opensearch.knn.index.KnnCircuitBreakerException; import org.opensearch.knn.index.SpaceType; import org.opensearch.knn.index.VectorDataType; +import org.opensearch.knn.index.codec.derivedsource.ParentChildHelper; import org.opensearch.knn.index.codec.util.KNNVectorSerializerFactory; import org.opensearch.knn.index.engine.KNNEngine; import org.opensearch.knn.index.engine.KNNMethodContext; @@ -152,6 +153,18 @@ private static int getEfConstruction(Version indexVersion) { return IndexHyperParametersUtil.getHNSWEFConstructionValue(indexVersion); } + /** + * Determine whether the field should use derived source. We only support derived source for a single + * level of nesting/object. + * + * @param enabledForIndex Whether derived source is enabled for index + * @param fieldName field to check + * @return true if it should be enabled; false otherwise. + */ + static boolean isDeriveSourceForFieldEnabled(boolean enabledForIndex, String fieldName) { + return enabledForIndex && ParentChildHelper.splitPath(fieldName).length <= 2; + } + static KNNMethodContext createKNNMethodContextFromLegacy( Settings indexSettings, Version indexCreatedVersion, diff --git a/src/main/java/org/opensearch/knn/index/mapper/LuceneFieldMapper.java b/src/main/java/org/opensearch/knn/index/mapper/LuceneFieldMapper.java index 49cd02d5b0..030388bfb7 100644 --- a/src/main/java/org/opensearch/knn/index/mapper/LuceneFieldMapper.java +++ b/src/main/java/org/opensearch/knn/index/mapper/LuceneFieldMapper.java @@ -86,6 +86,7 @@ public Version getIndexCreatedVersion() { ); return new LuceneFieldMapper( + fullname, mappedFieldType, createLuceneFieldMapperInput, knnMethodConfigContext, @@ -95,6 +96,7 @@ public Version getIndexCreatedVersion() { } private LuceneFieldMapper( + String fullName, final KNNVectorFieldType mappedFieldType, final CreateLuceneFieldMapperInput input, KNNMethodConfigContext knnMethodConfigContext, @@ -128,7 +130,7 @@ private LuceneFieldMapper( this.vectorFieldType = null; } - if (isDerivedSourceEnabled) { + if (KNNVectorFieldMapperUtil.isDeriveSourceForFieldEnabled(isDerivedSourceEnabled, fullName)) { this.fieldType = new FieldType(this.fieldType); this.fieldType.putAttribute(DERIVED_VECTOR_FIELD_ATTRIBUTE_KEY, DERIVED_VECTOR_FIELD_ATTRIBUTE_TRUE_VALUE); this.fieldType.freeze(); diff --git a/src/main/java/org/opensearch/knn/index/mapper/MethodFieldMapper.java b/src/main/java/org/opensearch/knn/index/mapper/MethodFieldMapper.java index a2635b1953..f96c6cb13f 100644 --- a/src/main/java/org/opensearch/knn/index/mapper/MethodFieldMapper.java +++ b/src/main/java/org/opensearch/knn/index/mapper/MethodFieldMapper.java @@ -99,6 +99,7 @@ public Version getIndexCreatedVersion() { } ); return new MethodFieldMapper( + fullname, simpleName, mappedFieldType, multiFields, @@ -113,6 +114,7 @@ public Version getIndexCreatedVersion() { } private MethodFieldMapper( + String fullName, String simpleName, KNNVectorFieldType mappedFieldType, MultiFields 
multiFields, @@ -157,7 +159,7 @@ private MethodFieldMapper( this.fieldType.putAttribute(VECTOR_DATA_TYPE_FIELD, vectorDataType.getValue()); this.fieldType.putAttribute(KNN_ENGINE, knnEngine.getName()); - if (isDerivedSourceEnabled) { + if (KNNVectorFieldMapperUtil.isDeriveSourceForFieldEnabled(isDerivedSourceEnabled, fullName)) { this.fieldType.putAttribute(DERIVED_VECTOR_FIELD_ATTRIBUTE_KEY, DERIVED_VECTOR_FIELD_ATTRIBUTE_TRUE_VALUE); } try { diff --git a/src/main/java/org/opensearch/knn/index/mapper/ModelFieldMapper.java b/src/main/java/org/opensearch/knn/index/mapper/ModelFieldMapper.java index ae912aa415..741f4d79e8 100644 --- a/src/main/java/org/opensearch/knn/index/mapper/ModelFieldMapper.java +++ b/src/main/java/org/opensearch/knn/index/mapper/ModelFieldMapper.java @@ -128,6 +128,7 @@ private void initFromModelMetadata() { } }); return new ModelFieldMapper( + fullname, simpleName, mappedFieldType, multiFields, @@ -143,6 +144,7 @@ private void initFromModelMetadata() { } private ModelFieldMapper( + String fullName, String simpleName, KNNVectorFieldType mappedFieldType, MultiFields multiFields, @@ -180,7 +182,7 @@ private ModelFieldMapper( this.fieldType = new FieldType(KNNVectorFieldMapper.Defaults.FIELD_TYPE); this.fieldType.putAttribute(MODEL_ID, modelId); - if (isDerivedSourceEnabled) { + if (KNNVectorFieldMapperUtil.isDeriveSourceForFieldEnabled(isDerivedSourceEnabled, fullName)) { this.fieldType.putAttribute(DERIVED_VECTOR_FIELD_ATTRIBUTE_KEY, DERIVED_VECTOR_FIELD_ATTRIBUTE_TRUE_VALUE); } this.useLuceneBasedVectorField = KNNVectorFieldMapperUtil.useLuceneKNNVectorsFormat(this.indexCreatedVersion); diff --git a/src/test/java/org/opensearch/knn/integ/DerivedSourceIT.java b/src/test/java/org/opensearch/knn/integ/DerivedSourceIT.java index 9255e9863e..99d9af252f 100644 --- a/src/test/java/org/opensearch/knn/integ/DerivedSourceIT.java +++ b/src/test/java/org/opensearch/knn/integ/DerivedSourceIT.java @@ -20,6 +20,7 @@ import org.opensearch.index.query.QueryBuilder; import org.opensearch.knn.KNNRestTestCase; import org.opensearch.knn.index.KNNSettings; +import org.opensearch.knn.index.codec.derivedsource.ParentChildHelper; import java.io.IOException; import java.util.List; @@ -632,7 +633,7 @@ public void testNestedSingleDocBasic() { bulkIngestRandomVectorsWithSkipsAndNested( context.indexName, context.vectorFieldNames.get(0), - NESTED_NAME + "." + "text", + "text", context.docCount, context.dimension, 0.1f @@ -653,7 +654,7 @@ public void testNestedSingleDocBasic() { bulkIngestRandomVectorsWithSkipsAndNested( context.indexName, context.vectorFieldNames.get(0), - NESTED_NAME + "." + "text", + "text", context.docCount, context.dimension, 0.1f @@ -775,8 +776,8 @@ public void testNestedMultiDocBasic() { .indexIngestor(context -> { bulkIngestRandomVectorsWithSkipsAndNestedMultiDoc( context.indexName, - context.vectorFieldNames.get(0), - NESTED_NAME + "." + "text", + context.vectorFieldNames, + "text", context.docCount, context.dimension, 0.1f, @@ -797,8 +798,8 @@ public void testNestedMultiDocBasic() { .indexIngestor(context -> { bulkIngestRandomVectorsWithSkipsAndNestedMultiDoc( context.indexName, - context.vectorFieldNames.get(0), - NESTED_NAME + "." + "text", + context.vectorFieldNames, + "text", context.docCount, context.dimension, 0.1f, @@ -857,6 +858,182 @@ public void testNestedMultiDocBasic() { testDerivedSourceE2E(indexConfigContexts); } + /** + * Testing single nested doc per parent doc. 
+ * Test mapping: + * { + * "settings": { + * "index.knn" true, + * "index.knn.derived_source.enabled": true + * }, + * "mappings":{ + * "properties": { + * "test_nested_1" : { + * "type": "nested", + * "properties": { + * "test_vector": { + * "type": "knn_vector", + * "dimension": 128 + * }, + * "text": { + * "type": "text", + * }, + * } + * }, + * "test_nested_2" : { + * "type": "nested", + * "properties": { + * "test_vector": { + * "type": "knn_vector", + * "dimension": 128 + * }, + * "text": { + * "type": "text", + * }, + * } + * } + * } + * } + * } + * Baseline mapping: + * { + * "settings": { + * "index.knn" true, + * "index.knn.derived_source.enabled": false + * }, + * "mappings":{ + * "properties": { + * "test_nested_1" : { + * "type": "nested", + * "properties": { + * "test_vector": { + * "type": "knn_vector", + * "dimension": 128 + * }, + * "text": { + * "type": "text", + * }, + * } + * }, + * "test_nested_2" : { + * "type": "nested", + * "properties": { + * "test_vector": { + * "type": "knn_vector", + * "dimension": 128 + * }, + * "text": { + * "type": "text", + * }, + * } + * } + * } + * } + * } + */ + public void testNestedMultiDocMultiField() { + List nestedFields = List.of( + NESTED_NAME + "1" + "." + FIELD_NAME, + NESTED_NAME + "2" + "." + FIELD_NAME, + NESTED_NAME + "3" + "." + FIELD_NAME, + NESTED_NAME + "4" + "." + FIELD_NAME, + NESTED_NAME + "5" + "." + FIELD_NAME + ); + String nestedMapping = createVectorNestedMappings(TEST_DIMENSION, null, nestedFields); + List indexConfigContexts = List.of( + IndexConfigContext.builder() + .indexName(("original-enable-" + getTestName() + randomAlphaOfLength(6)).toLowerCase(Locale.ROOT)) + .vectorFieldNames(nestedFields) + .dimension(TEST_DIMENSION) + .settings(DERIVED_ENABLED_SETTINGS) + .mapping(nestedMapping) + .isNested(true) + .docCount(DOCS) + .indexIngestor(context -> { + bulkIngestRandomVectorsWithSkipsAndNestedMultiDoc( + context.indexName, + context.vectorFieldNames, + "text", + context.docCount, + context.dimension, + 0.1f, + 5 + ); + refreshAllIndices(); + }) + .updateVectorSupplier((c) -> randomFloatVector(c.dimension)) + .build(), + IndexConfigContext.builder() + .indexName(("original-disable-" + getTestName() + randomAlphaOfLength(6)).toLowerCase(Locale.ROOT)) + .vectorFieldNames(nestedFields) + .dimension(TEST_DIMENSION) + .settings(DERIVED_DISABLED_SETTINGS) + .mapping(nestedMapping) + .isNested(true) + .docCount(DOCS) + .indexIngestor(context -> { + bulkIngestRandomVectorsWithSkipsAndNestedMultiDoc( + context.indexName, + context.vectorFieldNames, + "text", + context.docCount, + context.dimension, + 0.1f, + 5 + ); + refreshAllIndices(); + }) + .updateVectorSupplier((c) -> randomFloatVector(c.dimension)) + .build(), + IndexConfigContext.builder() + .indexName(("e2e-" + getTestName() + randomAlphaOfLength(6)).toLowerCase(Locale.ROOT)) + .vectorFieldNames(nestedFields) + .dimension(TEST_DIMENSION) + .settings(DERIVED_ENABLED_SETTINGS) + .mapping(nestedMapping) + .isNested(true) + .docCount(DOCS) + .indexIngestor(context -> {}) // noop for reindex + .updateVectorSupplier((c) -> randomFloatVector(c.dimension)) + .build(), + IndexConfigContext.builder() + .indexName(("e2d-" + getTestName() + randomAlphaOfLength(6)).toLowerCase(Locale.ROOT)) + .vectorFieldNames(nestedFields) + .dimension(TEST_DIMENSION) + .settings(DERIVED_DISABLED_SETTINGS) + .mapping(nestedMapping) + .isNested(true) + .docCount(DOCS) + .indexIngestor(context -> {}) // noop for reindex + .updateVectorSupplier((c) -> randomFloatVector(c.dimension)) + 
.build(), + IndexConfigContext.builder() + .indexName(("d2e-" + getTestName() + randomAlphaOfLength(6)).toLowerCase(Locale.ROOT)) + .vectorFieldNames(nestedFields) + .dimension(TEST_DIMENSION) + .settings(DERIVED_ENABLED_SETTINGS) + .mapping(nestedMapping) + .isNested(true) + .docCount(DOCS) + .indexIngestor(context -> {}) // noop for reindex + .updateVectorSupplier((c) -> randomFloatVector(c.dimension)) + .build(), + IndexConfigContext.builder() + .indexName(("d2d-" + getTestName() + randomAlphaOfLength(6)).toLowerCase(Locale.ROOT)) + .vectorFieldNames(nestedFields) + .dimension(TEST_DIMENSION) + .settings(DERIVED_DISABLED_SETTINGS) + .mapping(nestedMapping) + .isNested(true) + .docCount(DOCS) + .indexIngestor(context -> {}) // noop for reindex + .updateVectorSupplier((c) -> randomFloatVector(c.dimension)) + .build() + + ); + testDerivedSourceE2E(indexConfigContexts); + } + /** * Test object (non-nested field) * Test @@ -1496,4 +1673,23 @@ private String createVectorNestedMappings(final int dimension, String dataType) builder.endObject().endObject().endObject().endObject().endObject(); return builder.toString(); } + + @SneakyThrows + private String createVectorNestedMappings(final int dimension, String dataType, List nestedFieldNames) { + XContentBuilder builder = XContentFactory.jsonBuilder().startObject().startObject(PROPERTIES_FIELD); + for (String nestedFieldName : nestedFieldNames) { + builder.startObject(ParentChildHelper.getParentField(nestedFieldName)) + .field(TYPE, "nested") + .startObject(PROPERTIES_FIELD) + .startObject(ParentChildHelper.getChildField(nestedFieldName)) + .field(TYPE, TYPE_KNN_VECTOR) + .field(DIMENSION, dimension); + if (dataType != null) { + builder.field(VECTOR_DATA_TYPE_FIELD, dataType); + } + builder.endObject().endObject().endObject(); + } + builder.endObject().endObject(); + return builder.toString(); + } } diff --git a/src/testFixtures/java/org/opensearch/knn/KNNRestTestCase.java b/src/testFixtures/java/org/opensearch/knn/KNNRestTestCase.java index 6996e49934..11ec780096 100644 --- a/src/testFixtures/java/org/opensearch/knn/KNNRestTestCase.java +++ b/src/testFixtures/java/org/opensearch/knn/KNNRestTestCase.java @@ -1569,7 +1569,7 @@ public void bulkIngestRandomVectorsWithSkipsAndNested( ) throws IOException { bulkIngestRandomVectorsWithSkipsAndNestedMultiDoc( indexName, - nestedFieldName, + List.of(nestedFieldName), nestedNumericPath, numVectors, dimension, @@ -1580,7 +1580,7 @@ public void bulkIngestRandomVectorsWithSkipsAndNested( public void bulkIngestRandomVectorsWithSkipsAndNestedMultiDoc( String indexName, - String nestedFieldName, + List nestedFieldNames, String nestedNumericPath, int numDocs, int dimension, @@ -1591,20 +1591,21 @@ public void bulkIngestRandomVectorsWithSkipsAndNestedMultiDoc( random.setSeed(2); float[][] vectors = TestUtils.randomlyGenerateStandardVectors(numDocs * maxDoc, dimension, 1); for (int i = 0; i < numDocs; i++) { - int nestedDocs = random.nextInt(maxDoc) + 1; - XContentBuilder builder = XContentFactory.jsonBuilder() - .startObject() - .startArray(ParentChildHelper.getParentField(nestedFieldName)); - for (int j = 0; j < nestedDocs; j++) { - builder.startObject(); - if (random.nextFloat() > skipProb) { - builder.field(ParentChildHelper.getChildField(nestedFieldName), vectors[i + j]); - } else { - builder.field(ParentChildHelper.getChildField(nestedNumericPath), 1); + XContentBuilder builder = XContentFactory.jsonBuilder().startObject(); + for (String nestedFieldName : nestedFieldNames) { + 
builder.startArray(ParentChildHelper.getParentField(nestedFieldName)); + int nestedDocs = random.nextInt(maxDoc) + 1; + for (int j = 0; j < nestedDocs; j++) { + builder.startObject(); + if (random.nextFloat() > skipProb) { + builder.field(ParentChildHelper.getChildField(nestedFieldName), vectors[i + j]); + } else { + builder.field(nestedNumericPath, 1); + } + builder.endObject(); } - builder.endObject(); + builder.endArray(); } - builder.endArray(); builder.endObject(); addKnnDoc(indexName, String.valueOf(i + 1), builder.toString()); }
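        // Illustrative shape of one generated document (field names assumed from the test fixture
        // constants, e.g. NESTED_NAME resolving to "test_nested" and FIELD_NAME to "test_vector"):
        // {
        //   "test_nested1": [ { "test_vector": [0.1, ...] }, { "text": 1 } ],
        //   "test_nested2": [ { "text": 1 } ]
        // }
        // Each nested field gets its own array with 1..maxDoc child objects; a child either carries the
        // vector or, when the skip roll fires, only the numeric field at nestedNumericPath.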