Merge remote-tracking branch 'upstream/main' into feature/knn-multi-leaf-coll-ctor
apache · Sep 30, 2024 · cf57ce4 · cf57ce4
2 parents 3ed0a0b + 94d3504
commit cf57ce4
Show file tree

Hide file tree

Showing 112 changed files with 2,240 additions and 2,382 deletions.
diff --git a/build.gradle b/build.gradle
@@ -41,7 +41,7 @@ apply from: file('gradle/globals.gradle')
 // Calculate project version:
 version = {
   // Release manager: update base version here after release:
-  String baseVersion = '10.0.0'
+  String baseVersion = '11.0.0'
 
   // On a release explicitly set release version in one go:
   //  -Dversion.release=x.y.z

diff --git a/dev-tools/doap/lucene.rdf b/dev-tools/doap/lucene.rdf
@@ -67,13 +67,20 @@
     </maintainer>
 
     <!-- NOTE: please insert releases in numeric order, NOT chronologically. -->
+    <release>
+       <Version>
+         <name>lucene-9.12.0</name>
+         <created>2024-09-28</created>
+         <revision>9.12.0</revision>
+       </Version>
+    </release>
     <release>
        <Version>
          <name>lucene-9.11.1</name>
          <created>2024-06-27</created>
          <revision>9.11.1</revision>
        </Version>
-    </release>. 
+    </release> 
     <release>
        <Version>
          <name>lucene-9.11.0</name>

diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
@@ -3,6 +3,58 @@ Lucene Change Log
 For more information on past and future Lucene versions, please see:
 http://s.apache.org/luceneversions
 
+======================= Lucene 11.0.0 =======================
+
+API Changes
+---------------------
+(No changes)
+
+New Features
+---------------------
+(No changes)
+
+Improvements
+---------------------
+(No changes)
+
+Optimizations
+---------------------
+(No changes)
+
+Bug Fixes
+---------------------
+(No changes)
+
+Other
+---------------------
+(No changes)
+
+======================= Lucene 10.1.0 =======================
+
+API Changes
+---------------------
+(No changes)
+
+New Features
+---------------------
+(No changes)
+
+Improvements
+---------------------
+(No changes)
+
+Optimizations
+---------------------
+(No changes)
+
+Bug Fixes
+---------------------
+(No changes)
+
+Other
+---------------------
+(No changes)
+
 ======================= Lucene 10.0.0 =======================
 
 API Changes
@@ -123,6 +175,10 @@ API Changes
 * GITHUB#13780: Remove `IndexSearcher#search(List<LeafReaderContext>, Weight, Collector)` in favour of the newly
   introduced `IndexSearcher#search(LeafReaderContextPartition[], Weight, Collector)`
 
+* GITHUB#13779: First-class random access API for KnnVectorValues
+  unifies Byte/FloatVectorValues incorporating RandomAccess* API and introduces
+  DocIndexIterator for iterative access in place of direct inheritance from DISI.
+
 New Features
 ---------------------
 
@@ -292,6 +348,12 @@ Build
 
 ======================== Lucene 9.12.0 =======================
 
+Security Fixes
+---------------------
+
+* Deserialization of Untrusted Data vulnerability in Apache Lucene Replicator - CVE-2024-45772
+ (Summ3r from Vidar-Team, Robert Muir, Paul Irwin)
+
 API Changes
 ---------------------
 
@@ -484,6 +546,8 @@ Other
 * GITHUB#13720: Add float comparison based on unit of least precision and use it to stop test failures caused by float
   summation not being associative in IEEE 754. (Alex Herbert, Stefan Vodita)
 
+* Remove code triggering forbidden-apis regarding Java serialization. (Uwe Schindler, Robert Muir)
+
 ======================== Lucene 9.11.1 =======================
 
 Bug Fixes

diff --git a/lucene/MIGRATE.md b/lucene/MIGRATE.md
@@ -888,3 +888,7 @@ additional vectors into the same field with either 4 or 7 bit
 quantization (or no quantization), and ensure all older (9.x written)
 segments are rewritten either via `IndexWriter.forceMerge` or
 `IndexWriter.addIndexes(CodecReader...)`, or reindexing entirely.
+
+### Vector values APIs switched to primarily random-access
+
+`{Byte/Float}VectorValues` no longer inherit from `DocIdSetIterator`. Instead, they extend a common class, `KnnVectorValues`, which provides a random-access API (previously provided by `RandomAccessVectorValues`, now removed) and an `iterator()` method that returns a `DocIndexIterator`: a `DocIdSetIterator` that also provides an `index()` method. Therefore, any iteration over vector values must now be performed using the values' `iterator()`. Random access works as before, but no longer requires casting to `RandomAccessVectorValues`.
diff --git a/...e/analysis/common/src/java/org/apache/lucene/analysis/synonym/word2vec/Word2VecModel.java b/...e/analysis/common/src/java/org/apache/lucene/analysis/synonym/word2vec/Word2VecModel.java
@@ -18,18 +18,18 @@
 package org.apache.lucene.analysis.synonym.word2vec;
 
 import java.io.IOException;
+import org.apache.lucene.index.FloatVectorValues;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefHash;
 import org.apache.lucene.util.TermAndVector;
-import org.apache.lucene.util.hnsw.RandomAccessVectorValues;
 
 /**
  * Word2VecModel is a class representing the parsed Word2Vec model containing the vectors for each
  * word in dictionary
  *
  * @lucene.experimental
  */
-public class Word2VecModel implements RandomAccessVectorValues.Floats {
+public class Word2VecModel extends FloatVectorValues {
 
   private final int dictionarySize;
   private final int vectorDimension;

diff --git a/lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/TestCustomAnalyzer.java b/lucene/analysis/common/src/test/org/apache/lucene/analysis/custom/TestCustomAnalyzer.java
@@ -49,7 +49,7 @@
 public class TestCustomAnalyzer extends BaseTokenStreamTestCase {
 
   @SuppressWarnings("deprecation")
-  private static final Version LUCENE_9_0_0 = Version.LUCENE_9_0_0;
+  private static final Version LUCENE_10_0_0 = Version.LUCENE_10_0_0;
 
   // Test some examples (TODO: we only check behavior, we may need something like
   // TestRandomChains...)
@@ -111,7 +111,7 @@ public void testWhitespaceWithFolding() throws Exception {
   public void testVersionAwareFilter() throws Exception {
     CustomAnalyzer a =
         CustomAnalyzer.builder()
-            .withDefaultMatchVersion(LUCENE_9_0_0)
+            .withDefaultMatchVersion(LUCENE_10_0_0)
             .withTokenizer(StandardTokenizerFactory.class)
             .addTokenFilter(DummyVersionAwareTokenFilterFactory.class)
             .build();
@@ -128,7 +128,7 @@ public void testVersionAwareFilter() throws Exception {
   public void testFactoryHtmlStripClassicFolding() throws Exception {
     CustomAnalyzer a =
         CustomAnalyzer.builder()
-            .withDefaultMatchVersion(LUCENE_9_0_0)
+            .withDefaultMatchVersion(LUCENE_10_0_0)
             .addCharFilter(HTMLStripCharFilterFactory.class)
             .withTokenizer(ClassicTokenizerFactory.class)
             .addTokenFilter(ASCIIFoldingFilterFactory.class, "preserveOriginal", "true")
@@ -164,7 +164,7 @@ public void testFactoryHtmlStripClassicFolding() throws Exception {
   public void testHtmlStripClassicFolding() throws Exception {
     CustomAnalyzer a =
         CustomAnalyzer.builder()
-            .withDefaultMatchVersion(LUCENE_9_0_0)
+            .withDefaultMatchVersion(LUCENE_10_0_0)
             .addCharFilter("htmlstrip")
             .withTokenizer("classic")
             .addTokenFilter("asciifolding", "preserveOriginal", "true")
@@ -513,7 +513,7 @@ public DummyVersionAwareTokenFilterFactory(Map<String, String> args) {
 
     @Override
     public TokenStream create(TokenStream input) {
-      if (luceneMatchVersion.equals(LUCENE_9_0_0)) {
+      if (luceneMatchVersion.equals(LUCENE_10_0_0)) {
         return input;
       }
       return new LowerCaseFilter(input);

diff --git a/...-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswGraphBuilder.java b/...-codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswGraphBuilder.java
@@ -22,10 +22,10 @@
 import java.util.Objects;
 import java.util.SplittableRandom;
 import java.util.concurrent.TimeUnit;
+import org.apache.lucene.index.FloatVectorValues;
 import org.apache.lucene.index.VectorSimilarityFunction;
 import org.apache.lucene.util.InfoStream;
 import org.apache.lucene.util.hnsw.NeighborQueue;
-import org.apache.lucene.util.hnsw.RandomAccessVectorValues;
 
 /**
  * Builder for HNSW graph. See {@link Lucene90OnHeapHnswGraph} for a gloss on the algorithm and the
@@ -49,7 +49,7 @@ public final class Lucene90HnswGraphBuilder {
   private final Lucene90NeighborArray scratch;
 
   private final VectorSimilarityFunction similarityFunction;
-  private final RandomAccessVectorValues.Floats vectorValues;
+  private final FloatVectorValues vectorValues;
   private final SplittableRandom random;
   private final Lucene90BoundsChecker bound;
   final Lucene90OnHeapHnswGraph hnsw;
@@ -58,7 +58,7 @@ public final class Lucene90HnswGraphBuilder {
 
   // we need two sources of vectors in order to perform diversity check comparisons without
   // colliding
-  private final RandomAccessVectorValues.Floats buildVectors;
+  private final FloatVectorValues buildVectors;
 
   /**
    * Reads all the vectors from vector values, builds a graph connecting them by their dense
@@ -73,7 +73,7 @@ public final class Lucene90HnswGraphBuilder {
    *     to ensure repeatable construction.
    */
   public Lucene90HnswGraphBuilder(
-      RandomAccessVectorValues.Floats vectors,
+      FloatVectorValues vectors,
       VectorSimilarityFunction similarityFunction,
       int maxConn,
       int beamWidth,
@@ -97,14 +97,14 @@ public Lucene90HnswGraphBuilder(
   }
 
   /**
-   * Reads all the vectors from two copies of a {@link RandomAccessVectorValues}. Providing two
-   * copies enables efficient retrieval without extra data copying, while avoiding collision of the
+   * Reads all the vectors from two copies of a {@link FloatVectorValues}. Providing two copies
+   * enables efficient retrieval without extra data copying, while avoiding collision of the
    * returned values.
    *
    * @param vectors the vectors for which to build a nearest neighbors graph. Must be an independent
    *     accessor for the vectors
    */
-  public Lucene90OnHeapHnswGraph build(RandomAccessVectorValues.Floats vectors) throws IOException {
+  public Lucene90OnHeapHnswGraph build(FloatVectorValues vectors) throws IOException {
     if (vectors == vectorValues) {
       throw new IllegalArgumentException(
           "Vectors to build must be independent of the source of vectors provided to HnswGraphBuilder()");
@@ -230,7 +230,7 @@ private boolean diversityCheck(
       float[] candidate,
       float score,
       Lucene90NeighborArray neighbors,
-      RandomAccessVectorValues.Floats vectorValues)
+      FloatVectorValues vectorValues)
       throws IOException {
     bound.set(score);
     for (int i = 0; i < neighbors.size(); i++) {

diff --git a/...codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswVectorsReader.java b/...codecs/src/java/org/apache/lucene/backward_codecs/lucene90/Lucene90HnswVectorsReader.java
@@ -20,7 +20,6 @@
 import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;
 
 import java.io.IOException;
-import java.util.Arrays;
 import java.util.HashMap;
 import java.util.Map;
 import java.util.SplittableRandom;
@@ -34,7 +33,6 @@
 import org.apache.lucene.index.IndexFileNames;
 import org.apache.lucene.index.SegmentReadState;
 import org.apache.lucene.index.VectorSimilarityFunction;
-import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.search.KnnCollector;
 import org.apache.lucene.search.VectorScorer;
 import org.apache.lucene.store.ChecksumIndexInput;
@@ -44,7 +42,6 @@
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.hnsw.HnswGraph;
 import org.apache.lucene.util.hnsw.NeighborQueue;
-import org.apache.lucene.util.hnsw.RandomAccessVectorValues;
 
 /**
  * Reads vectors from the index segments along with index data structures supporting KNN search.
@@ -355,8 +352,7 @@ int size() {
   }
 
   /** Read the vector values from the index input. This supports both iterated and random access. */
-  static class OffHeapFloatVectorValues extends FloatVectorValues
-      implements RandomAccessVectorValues.Floats {
+  static class OffHeapFloatVectorValues extends FloatVectorValues {
 
     final int dimension;
     final int[] ordToDoc;
@@ -367,9 +363,6 @@ static class OffHeapFloatVectorValues extends FloatVectorValues
     final float[] value;
     final VectorSimilarityFunction similarityFunction;
 
-    int ord = -1;
-    int doc = -1;
-
     OffHeapFloatVectorValues(
         int dimension,
         int[] ordToDoc,
@@ -394,42 +387,6 @@ public int size() {
       return ordToDoc.length;
     }
 
-    @Override
-    public float[] vectorValue() throws IOException {
-      return vectorValue(ord);
-    }
-
-    @Override
-    public int docID() {
-      return doc;
-    }
-
-    @Override
-    public int nextDoc() {
-      if (++ord >= size()) {
-        doc = NO_MORE_DOCS;
-      } else {
-        doc = ordToDoc[ord];
-      }
-      return doc;
-    }
-
-    @Override
-    public int advance(int target) {
-      assert docID() < target;
-      ord = Arrays.binarySearch(ordToDoc, ord + 1, ordToDoc.length, target);
-      if (ord < 0) {
-        ord = -(ord + 1);
-      }
-      assert ord <= ordToDoc.length;
-      if (ord == ordToDoc.length) {
-        doc = NO_MORE_DOCS;
-      } else {
-        doc = ordToDoc[ord];
-      }
-      return doc;
-    }
-
     @Override
     public OffHeapFloatVectorValues copy() {
       return new OffHeapFloatVectorValues(dimension, ordToDoc, similarityFunction, dataIn.clone());
@@ -446,21 +403,32 @@ public float[] vectorValue(int targetOrd) throws IOException {
       return value;
     }
 
+    @Override
+    public int ordToDoc(int ord) {
+      return ordToDoc[ord];
+    }
+
+    @Override
+    public DocIndexIterator iterator() {
+      return createSparseIterator();
+    }
+
     @Override
     public VectorScorer scorer(float[] target) {
       if (size() == 0) {
         return null;
       }
       OffHeapFloatVectorValues values = this.copy();
+      DocIndexIterator iterator = values.iterator();
       return new VectorScorer() {
         @Override
         public float score() throws IOException {
-          return values.similarityFunction.compare(values.vectorValue(), target);
+          return values.similarityFunction.compare(values.vectorValue(iterator.index()), target);
         }
 
         @Override
-        public DocIdSetIterator iterator() {
-          return values;
+        public DocIndexIterator iterator() {
+          return iterator;
         }
       };
     }