Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/main' into feature/knn-multi-l…
Browse files Browse the repository at this point in the history
…eaf-coll-ctor
  • Loading branch information
benwtrent committed Sep 30, 2024
2 parents 3ed0a0b + 94d3504 commit cf57ce4
Show file tree
Hide file tree
Showing 112 changed files with 2,240 additions and 2,382 deletions.
2 changes: 1 addition & 1 deletion build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ apply from: file('gradle/globals.gradle')
// Calculate project version:
version = {
// Release manager: update base version here after release:
String baseVersion = '10.0.0'
String baseVersion = '11.0.0'

// On a release explicitly set release version in one go:
// -Dversion.release=x.y.z
Expand Down
9 changes: 8 additions & 1 deletion dev-tools/doap/lucene.rdf
Original file line number Diff line number Diff line change
Expand Up @@ -67,13 +67,20 @@
</maintainer>

<!-- NOTE: please insert releases in numeric order, NOT chronologically. -->
<release>
<Version>
<name>lucene-9.12.0</name>
<created>2024-09-28</created>
<revision>9.12.0</revision>
</Version>
</release>
<release>
<Version>
<name>lucene-9.11.1</name>
<created>2024-06-27</created>
<revision>9.11.1</revision>
</Version>
</release>.
</release>
<release>
<Version>
<name>lucene-9.11.0</name>
Expand Down
64 changes: 64 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,58 @@ Lucene Change Log
For more information on past and future Lucene versions, please see:
http://s.apache.org/luceneversions

======================= Lucene 11.0.0 =======================

API Changes
---------------------
(No changes)

New Features
---------------------
(No changes)

Improvements
---------------------
(No changes)

Optimizations
---------------------
(No changes)

Bug Fixes
---------------------
(No changes)

Other
---------------------
(No changes)

======================= Lucene 10.1.0 =======================

API Changes
---------------------
(No changes)

New Features
---------------------
(No changes)

Improvements
---------------------
(No changes)

Optimizations
---------------------
(No changes)

Bug Fixes
---------------------
(No changes)

Other
---------------------
(No changes)

======================= Lucene 10.0.0 =======================

API Changes
Expand Down Expand Up @@ -123,6 +175,10 @@ API Changes
* GITHUB#13780: Remove `IndexSearcher#search(List<LeafReaderContext>, Weight, Collector)` in favour of the newly
introduced `IndexSearcher#search(LeafReaderContextPartition[], Weight, Collector)`

* GITHUB#13779: First-class random access API for KnnVectorValues
unifies Byte/FloatVectorValues incorporating RandomAccess* API and introduces
DocIndexIterator for iterative access in place of direct inheritance from DISI.

New Features
---------------------

Expand Down Expand Up @@ -292,6 +348,12 @@ Build

======================== Lucene 9.12.0 =======================

Security Fixes
---------------------

* Deserialization of Untrusted Data vulnerability in Apache Lucene Replicator - CVE-2024-45772
(Summ3r from Vidar-Team, Robert Muir, Paul Irwin)

API Changes
---------------------

Expand Down Expand Up @@ -484,6 +546,8 @@ Other
* GITHUB#13720: Add float comparison based on unit of least precision and use it to stop test failures caused by float
summation not being associative in IEEE 754. (Alex Herbert, Stefan Vodita)

* Remove code triggering forbidden-apis regarding Java serialization. (Uwe Schindler, Robert Muir)

======================== Lucene 9.11.1 =======================

Bug Fixes
Expand Down
4 changes: 4 additions & 0 deletions lucene/MIGRATE.md
Original file line number Diff line number Diff line change
Expand Up @@ -888,3 +888,7 @@ additional vectors into the same field with either 4 or 7 bit
quantization (or no quantization), and ensure all older (9.x written)
segments are rewritten either via `IndexWriter.forceMerge` or
`IndexWriter.addIndexes(CodecReader...)`, or reindexing entirely.

### Vector values APIs switched to primarily random-access

`{Byte/Float}VectorValues` no longer inherit from `DocIdSetIterator`. Rather they extend a common class, `KnnVectorValues`, that provides a random access API (previously provided by `RandomAccessVectorValues`, now removed), and an `iterator()` method for retrieving `DocIndexIterator`: an iterator which is a DISI that also provides an `index()` method. Therefore, any iteration over vector values must now be performed using the values' `iterator()`. Random access works as before, but does not require casting to `RandomAccessVectorValues`.
Original file line number Diff line number Diff line change
Expand Up @@ -18,18 +18,18 @@
package org.apache.lucene.analysis.synonym.word2vec;

import java.io.IOException;
import org.apache.lucene.index.FloatVectorValues;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefHash;
import org.apache.lucene.util.TermAndVector;
import org.apache.lucene.util.hnsw.RandomAccessVectorValues;

/**
* Word2VecModel is a class representing the parsed Word2Vec model containing the vectors for each
* word in dictionary
*
* @lucene.experimental
*/
public class Word2VecModel implements RandomAccessVectorValues.Floats {
public class Word2VecModel extends FloatVectorValues {

private final int dictionarySize;
private final int vectorDimension;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@
public class TestCustomAnalyzer extends BaseTokenStreamTestCase {

@SuppressWarnings("deprecation")
private static final Version LUCENE_9_0_0 = Version.LUCENE_9_0_0;
private static final Version LUCENE_10_0_0 = Version.LUCENE_10_0_0;

// Test some examples (TODO: we only check behavior, we may need something like
// TestRandomChains...)
Expand Down Expand Up @@ -111,7 +111,7 @@ public void testWhitespaceWithFolding() throws Exception {
public void testVersionAwareFilter() throws Exception {
CustomAnalyzer a =
CustomAnalyzer.builder()
.withDefaultMatchVersion(LUCENE_9_0_0)
.withDefaultMatchVersion(LUCENE_10_0_0)
.withTokenizer(StandardTokenizerFactory.class)
.addTokenFilter(DummyVersionAwareTokenFilterFactory.class)
.build();
Expand All @@ -128,7 +128,7 @@ public void testVersionAwareFilter() throws Exception {
public void testFactoryHtmlStripClassicFolding() throws Exception {
CustomAnalyzer a =
CustomAnalyzer.builder()
.withDefaultMatchVersion(LUCENE_9_0_0)
.withDefaultMatchVersion(LUCENE_10_0_0)
.addCharFilter(HTMLStripCharFilterFactory.class)
.withTokenizer(ClassicTokenizerFactory.class)
.addTokenFilter(ASCIIFoldingFilterFactory.class, "preserveOriginal", "true")
Expand Down Expand Up @@ -164,7 +164,7 @@ public void testFactoryHtmlStripClassicFolding() throws Exception {
public void testHtmlStripClassicFolding() throws Exception {
CustomAnalyzer a =
CustomAnalyzer.builder()
.withDefaultMatchVersion(LUCENE_9_0_0)
.withDefaultMatchVersion(LUCENE_10_0_0)
.addCharFilter("htmlstrip")
.withTokenizer("classic")
.addTokenFilter("asciifolding", "preserveOriginal", "true")
Expand Down Expand Up @@ -513,7 +513,7 @@ public DummyVersionAwareTokenFilterFactory(Map<String, String> args) {

@Override
public TokenStream create(TokenStream input) {
if (luceneMatchVersion.equals(LUCENE_9_0_0)) {
if (luceneMatchVersion.equals(LUCENE_10_0_0)) {
return input;
}
return new LowerCaseFilter(input);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,10 @@
import java.util.Objects;
import java.util.SplittableRandom;
import java.util.concurrent.TimeUnit;
import org.apache.lucene.index.FloatVectorValues;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.util.InfoStream;
import org.apache.lucene.util.hnsw.NeighborQueue;
import org.apache.lucene.util.hnsw.RandomAccessVectorValues;

/**
* Builder for HNSW graph. See {@link Lucene90OnHeapHnswGraph} for a gloss on the algorithm and the
Expand All @@ -49,7 +49,7 @@ public final class Lucene90HnswGraphBuilder {
private final Lucene90NeighborArray scratch;

private final VectorSimilarityFunction similarityFunction;
private final RandomAccessVectorValues.Floats vectorValues;
private final FloatVectorValues vectorValues;
private final SplittableRandom random;
private final Lucene90BoundsChecker bound;
final Lucene90OnHeapHnswGraph hnsw;
Expand All @@ -58,7 +58,7 @@ public final class Lucene90HnswGraphBuilder {

// we need two sources of vectors in order to perform diversity check comparisons without
// colliding
private final RandomAccessVectorValues.Floats buildVectors;
private final FloatVectorValues buildVectors;

/**
* Reads all the vectors from vector values, builds a graph connecting them by their dense
Expand All @@ -73,7 +73,7 @@ public final class Lucene90HnswGraphBuilder {
* to ensure repeatable construction.
*/
public Lucene90HnswGraphBuilder(
RandomAccessVectorValues.Floats vectors,
FloatVectorValues vectors,
VectorSimilarityFunction similarityFunction,
int maxConn,
int beamWidth,
Expand All @@ -97,14 +97,14 @@ public Lucene90HnswGraphBuilder(
}

/**
* Reads all the vectors from two copies of a {@link RandomAccessVectorValues}. Providing two
* copies enables efficient retrieval without extra data copying, while avoiding collision of the
* Reads all the vectors from two copies of a {@link FloatVectorValues}. Providing two copies
* enables efficient retrieval without extra data copying, while avoiding collision of the
* returned values.
*
* @param vectors the vectors for which to build a nearest neighbors graph. Must be an independent
* accessor for the vectors
*/
public Lucene90OnHeapHnswGraph build(RandomAccessVectorValues.Floats vectors) throws IOException {
public Lucene90OnHeapHnswGraph build(FloatVectorValues vectors) throws IOException {
if (vectors == vectorValues) {
throw new IllegalArgumentException(
"Vectors to build must be independent of the source of vectors provided to HnswGraphBuilder()");
Expand Down Expand Up @@ -230,7 +230,7 @@ private boolean diversityCheck(
float[] candidate,
float score,
Lucene90NeighborArray neighbors,
RandomAccessVectorValues.Floats vectorValues)
FloatVectorValues vectorValues)
throws IOException {
bound.set(score);
for (int i = 0; i < neighbors.size(); i++) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;

import java.io.IOException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.SplittableRandom;
Expand All @@ -34,7 +33,6 @@
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.VectorSimilarityFunction;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.KnnCollector;
import org.apache.lucene.search.VectorScorer;
import org.apache.lucene.store.ChecksumIndexInput;
Expand All @@ -44,7 +42,6 @@
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.hnsw.HnswGraph;
import org.apache.lucene.util.hnsw.NeighborQueue;
import org.apache.lucene.util.hnsw.RandomAccessVectorValues;

/**
* Reads vectors from the index segments along with index data structures supporting KNN search.
Expand Down Expand Up @@ -355,8 +352,7 @@ int size() {
}

/** Read the vector values from the index input. This supports both iterated and random access. */
static class OffHeapFloatVectorValues extends FloatVectorValues
implements RandomAccessVectorValues.Floats {
static class OffHeapFloatVectorValues extends FloatVectorValues {

final int dimension;
final int[] ordToDoc;
Expand All @@ -367,9 +363,6 @@ static class OffHeapFloatVectorValues extends FloatVectorValues
final float[] value;
final VectorSimilarityFunction similarityFunction;

int ord = -1;
int doc = -1;

OffHeapFloatVectorValues(
int dimension,
int[] ordToDoc,
Expand All @@ -394,42 +387,6 @@ public int size() {
return ordToDoc.length;
}

@Override
public float[] vectorValue() throws IOException {
return vectorValue(ord);
}

@Override
public int docID() {
return doc;
}

@Override
public int nextDoc() {
if (++ord >= size()) {
doc = NO_MORE_DOCS;
} else {
doc = ordToDoc[ord];
}
return doc;
}

@Override
public int advance(int target) {
assert docID() < target;
ord = Arrays.binarySearch(ordToDoc, ord + 1, ordToDoc.length, target);
if (ord < 0) {
ord = -(ord + 1);
}
assert ord <= ordToDoc.length;
if (ord == ordToDoc.length) {
doc = NO_MORE_DOCS;
} else {
doc = ordToDoc[ord];
}
return doc;
}

@Override
public OffHeapFloatVectorValues copy() {
return new OffHeapFloatVectorValues(dimension, ordToDoc, similarityFunction, dataIn.clone());
Expand All @@ -446,21 +403,32 @@ public float[] vectorValue(int targetOrd) throws IOException {
return value;
}

@Override
public int ordToDoc(int ord) {
return ordToDoc[ord];
}

@Override
public DocIndexIterator iterator() {
return createSparseIterator();
}

@Override
public VectorScorer scorer(float[] target) {
if (size() == 0) {
return null;
}
OffHeapFloatVectorValues values = this.copy();
DocIndexIterator iterator = values.iterator();
return new VectorScorer() {
@Override
public float score() throws IOException {
return values.similarityFunction.compare(values.vectorValue(), target);
return values.similarityFunction.compare(values.vectorValue(iterator.index()), target);
}

@Override
public DocIdSetIterator iterator() {
return values;
public DocIndexIterator iterator() {
return iterator;
}
};
}
Expand Down
Loading

0 comments on commit cf57ce4

Please sign in to comment.