From 943d08bdad7946b22f56d040756669ee444dd681 Mon Sep 17 00:00:00 2001 From: Junjie Qi Date: Thu, 29 Feb 2024 13:24:50 -0800 Subject: [PATCH 001/116] Prepare for v.1.8.0 release (#3265) Summary: Prepare for v1.8.0 release Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3265 Reviewed By: mdouze Differential Revision: D54232846 Pulled By: junjieqi fbshipit-source-id: f92efc93e340507262321c5033bab7fede7d7f40 --- CHANGELOG.md | 32 ++++++++++++++++++++++++++++++-- CMakeLists.txt | 2 +- faiss/Index.h | 4 ++-- faiss/python/setup.py | 2 +- 4 files changed, 34 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ae418b09b4..e61bd997ca 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,34 @@ We try to indicate most contributions here with the contributor names who are no the Facebook Faiss team. Feel free to add entries here if you submit a PR. ## [Unreleased] -- Support for range search in HNSW and Fast scan IVF. + +## [1.8.0] - 2024-02-27 +### Added +- Added a new conda package faiss-gpu-raft alongside faiss-cpu and faiss-gpu +- Integrated IVF-Flat and IVF-PQ implementations in faiss-gpu-raft from RAFT by Nvidia [thanks Corey Nolet and Tarang Jain] +- Added a context parameter to InvertedLists and InvertedListsIterator +- Added Faiss on Rocksdb demo to showing how inverted lists can be persisted in a key-value store +- Introduced Offline IVF framework powered by Faiss big batch search +- Added SIMD NEON Optimization for QT_FP16 in Scalar Quantizer. [thanks Naveen Tatikonda] +- Generalized ResultHandler and supported range search for HNSW and FastScan +- Introduced avx512 optimization mode and FAISS_OPT_LEVEL env variable [thanks Alexandr Ghuzva] +- Added search parameters for IndexRefine::search() and IndexRefineFlat::search() +- Supported large two-level clustering +- Added support for Python 3.11 and 3.12 +- Added support for CUDA 12 + +### Changed +- Used the benchmark to find Pareto optimal indices. Intentionally limited to IVF(Flat|HNSW),PQ|SQ indices +- Splitted off RQ encoding steps to another file +- Supported better NaN handling +- HNSW speedup + Distance 4 points [thanks Alexandr Ghuzva] + +### Fixed +- Fixed DeviceVector reallocations in Faiss GPU +- Used efSearch from params if provided in HNSW search +- Fixed warp synchronous behavior in Faiss GPU CUDA 12 + + ## [1.7.4] - 2023-04-12 ### Added - Added big batch IVF search for conducting efficient search with big batches of queries @@ -259,7 +286,8 @@ by conda install -c pytorch faiss-gpu cudatoolkit=10.0. - C bindings. - Extended tutorial to GPU indices. -[Unreleased]: https://github.com/facebookresearch/faiss/compare/v1.7.4...HEAD +[Unreleased]: https://github.com/facebookresearch/faiss/compare/v1.8.0...HEAD +[1.8.0]: https://github.com/facebookresearch/faiss/compare/v1.7.4...v1.8.0 [1.7.4]: https://github.com/facebookresearch/faiss/compare/v1.7.3...v1.7.4 [1.7.3]: https://github.com/facebookresearch/faiss/compare/v1.7.2...v1.7.3 [1.7.2]: https://github.com/facebookresearch/faiss/compare/v1.7.1...v1.7.2 diff --git a/CMakeLists.txt b/CMakeLists.txt index 445b39d59e..6cdc37c46f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -40,7 +40,7 @@ rapids_cuda_init_architectures(faiss_c_library) endif() project(faiss - VERSION 1.7.4 + VERSION 1.8.0 DESCRIPTION "A library for efficient similarity search and clustering of dense vectors." 
HOMEPAGE_URL "https://github.com/facebookresearch/faiss" LANGUAGES ${FAISS_LANGUAGES}) diff --git a/faiss/Index.h b/faiss/Index.h index 4b4b302b47..3d1bdb996a 100644 --- a/faiss/Index.h +++ b/faiss/Index.h @@ -17,8 +17,8 @@ #include #define FAISS_VERSION_MAJOR 1 -#define FAISS_VERSION_MINOR 7 -#define FAISS_VERSION_PATCH 4 +#define FAISS_VERSION_MINOR 8 +#define FAISS_VERSION_PATCH 0 /** * @namespace faiss diff --git a/faiss/python/setup.py b/faiss/python/setup.py index 3b4f2e9c83..939aeeffbe 100644 --- a/faiss/python/setup.py +++ b/faiss/python/setup.py @@ -60,7 +60,7 @@ """ setup( name='faiss', - version='1.7.4', + version='1.8.0', description='A library for efficient similarity search and clustering of dense vectors', long_description=long_description, url='https://github.com/facebookresearch/faiss', From 12b92e9fa5d8e8fb3da53c57af9ff007c826b1ee Mon Sep 17 00:00:00 2001 From: John Mazanec Date: Fri, 1 Mar 2024 04:27:49 -0800 Subject: [PATCH 002/116] Skip HNSWPQ sdc init with new io flag (#3250) Summary: ## Description Related issue: https://github.com/facebookresearch/faiss/issues/3246 When reading HNSWPQ from disk, if index ~read only~ new `IO_FLAG_PQ_SKIP_SDC_TABLE` flag is set, skip initializing the sdc_table. In addition, adds cpp test case verifying functionality and build test util header file to share creation of temporary files amongst tests. Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3250 Test Plan: buck test //faiss/tests/:test_disable_pq_sdc_tables Reviewed By: junjieqi Differential Revision: D53844075 Pulled By: mdouze fbshipit-source-id: e9a83c0e5243867edbca8f80e3b1242b38ef6a42 --- faiss/impl/index_read.cpp | 2 +- faiss/index_io.h | 6 +++ tests/CMakeLists.txt | 1 + tests/test_disable_pq_sdc_tables.cpp | 61 ++++++++++++++++++++++++++++ tests/test_merge.cpp | 35 +++------------- tests/test_util.h | 39 ++++++++++++++++++ 6 files changed, 113 insertions(+), 31 deletions(-) create mode 100644 tests/test_disable_pq_sdc_tables.cpp create mode 100644 tests/test_util.h diff --git a/faiss/impl/index_read.cpp b/faiss/impl/index_read.cpp index ac62e0269e..8d80329bf9 100644 --- a/faiss/impl/index_read.cpp +++ b/faiss/impl/index_read.cpp @@ -962,7 +962,7 @@ Index* read_index(IOReader* f, int io_flags) { read_HNSW(&idxhnsw->hnsw, f); idxhnsw->storage = read_index(f, io_flags); idxhnsw->own_fields = true; - if (h == fourcc("IHNp")) { + if (h == fourcc("IHNp") && !(io_flags & IO_FLAG_PQ_SKIP_SDC_TABLE)) { dynamic_cast(idxhnsw->storage)->pq.compute_sdc_table(); } idx = idxhnsw; diff --git a/faiss/index_io.h b/faiss/index_io.h index 8d52ee1afd..f73cd073b7 100644 --- a/faiss/index_io.h +++ b/faiss/index_io.h @@ -52,6 +52,12 @@ const int IO_FLAG_ONDISK_SAME_DIR = 4; const int IO_FLAG_SKIP_IVF_DATA = 8; // don't initialize precomputed table after loading const int IO_FLAG_SKIP_PRECOMPUTE_TABLE = 16; +// don't compute the sdc table for PQ-based indices +// this will prevent distances from being computed +// between elements in the index. 
For indices like HNSWPQ, +// this will prevent graph building because sdc +// computations are required to construct the graph +const int IO_FLAG_PQ_SKIP_SDC_TABLE = 32; // try to memmap data (useful to load an ArrayInvertedLists as an // OnDiskInvertedLists) const int IO_FLAG_MMAP = IO_FLAG_SKIP_IVF_DATA | 0x646f0000; diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 10243b9a9c..9017edc586 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -32,6 +32,7 @@ set(FAISS_TEST_SRC test_hnsw.cpp test_partitioning.cpp test_fastscan_perf.cpp + test_disable_pq_sdc_tables.cpp ) add_executable(faiss_test ${FAISS_TEST_SRC}) diff --git a/tests/test_disable_pq_sdc_tables.cpp b/tests/test_disable_pq_sdc_tables.cpp new file mode 100644 index 0000000000..b211a5c451 --- /dev/null +++ b/tests/test_disable_pq_sdc_tables.cpp @@ -0,0 +1,61 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include + +#include "faiss/Index.h" +#include "faiss/IndexHNSW.h" +#include "faiss/index_factory.h" +#include "faiss/index_io.h" +#include "test_util.h" + +pthread_mutex_t temp_file_mutex = PTHREAD_MUTEX_INITIALIZER; + +TEST(IO, TestReadHNSWPQ_whenSDCDisabledFlagPassed_thenDisableSDCTable) { + Tempfilename index_filename(&temp_file_mutex, "/tmp/faiss_TestReadHNSWPQ"); + int d = 32, n = 256; + std::default_random_engine rng(123); + std::uniform_real_distribution u(0, 100); + std::vector vectors(n * d); + for (size_t i = 0; i < n * d; i++) { + vectors[i] = u(rng); + } + + // Build the index and write it to the temp file + { + std::unique_ptr index_writer( + faiss::index_factory(d, "HNSW8,PQ4np", faiss::METRIC_L2)); + index_writer->train(n, vectors.data()); + index_writer->add(n, vectors.data()); + + faiss::write_index(index_writer.get(), index_filename.c_str()); + } + + // Load index from disk. 
Confirm that the sdc table is equal to 0 when + // disable sdc is set + { + std::unique_ptr index_reader_read_write( + dynamic_cast( + faiss::read_index(index_filename.c_str()))); + std::unique_ptr index_reader_sdc_disabled( + dynamic_cast(faiss::read_index( + index_filename.c_str(), + faiss::IO_FLAG_PQ_SKIP_SDC_TABLE))); + + ASSERT_NE( + dynamic_cast(index_reader_read_write->storage) + ->pq.sdc_table.size(), + 0); + ASSERT_EQ( + dynamic_cast( + index_reader_sdc_disabled->storage) + ->pq.sdc_table.size(), + 0); + } +} diff --git a/tests/test_merge.cpp b/tests/test_merge.cpp index 7e23f15f72..5a1d08cfba 100644 --- a/tests/test_merge.cpp +++ b/tests/test_merge.cpp @@ -6,47 +6,22 @@ */ #include -#include #include -#include - #include #include #include #include -#include #include #include #include -namespace { - -struct Tempfilename { - static pthread_mutex_t mutex; - - std::string filename = "/tmp/faiss_tmp_XXXXXX"; - - Tempfilename() { - pthread_mutex_lock(&mutex); - int fd = mkstemp(&filename[0]); - close(fd); - pthread_mutex_unlock(&mutex); - } - - ~Tempfilename() { - if (access(filename.c_str(), F_OK)) { - unlink(filename.c_str()); - } - } +#include "test_util.h" - const char* c_str() { - return filename.c_str(); - } -}; +namespace { -pthread_mutex_t Tempfilename::mutex = PTHREAD_MUTEX_INITIALIZER; +pthread_mutex_t temp_file_mutex = PTHREAD_MUTEX_INITIALIZER; typedef faiss::idx_t idx_t; @@ -95,7 +70,7 @@ int compare_merged( std::vector refD(k * nq); index_shards->search(nq, cd.queries.data(), k, refD.data(), refI.data()); - Tempfilename filename; + Tempfilename filename(&temp_file_mutex, "/tmp/faiss_tmp_XXXXXX"); std::vector newI(k * nq); std::vector newD(k * nq); @@ -212,7 +187,7 @@ TEST(MERGE, merge_flat_vt) { TEST(MERGE, merge_flat_ondisk) { faiss::IndexShards index_shards(d, false, false); index_shards.own_indices = true; - Tempfilename filename; + Tempfilename filename(&temp_file_mutex, "/tmp/faiss_tmp_XXXXXX"); for (int i = 0; i < nindex; i++) { auto ivf = new faiss::IndexIVFFlat(&cd.quantizer, d, nlist); diff --git a/tests/test_util.h b/tests/test_util.h new file mode 100644 index 0000000000..3be0e35cff --- /dev/null +++ b/tests/test_util.h @@ -0,0 +1,39 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#ifndef FAISS_TEST_UTIL_H +#define FAISS_TEST_UTIL_H + +#include +#include +#include + +struct Tempfilename { + pthread_mutex_t* mutex; + std::string filename; + + Tempfilename(pthread_mutex_t* mutex, std::string filename) { + this->mutex = mutex; + this->filename = filename; + pthread_mutex_lock(mutex); + int fd = mkstemp(&filename[0]); + close(fd); + pthread_mutex_unlock(mutex); + } + + ~Tempfilename() { + if (access(filename.c_str(), F_OK)) { + unlink(filename.c_str()); + } + } + + const char* c_str() { + return filename.c_str(); + } +}; + +#endif // FAISS_TEST_UTIL_H From dafdff110489db7587b169a0afee8470f220d295 Mon Sep 17 00:00:00 2001 From: Junjie Qi Date: Mon, 4 Mar 2024 03:07:49 -0800 Subject: [PATCH 003/116] Change intall.md to reflect faiss 1.8.0 Summary: Same as title Reviewed By: algoriddle Differential Revision: D54399993 fbshipit-source-id: a0b05aabc2a0b70ae64843ca2ef2f4faaa123cdd --- INSTALL.md | 37 ++++++++++++++++++++----------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index ee08d8d2cf..45e2c9341b 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -4,44 +4,47 @@ The supported way to install Faiss is through [conda](https://docs.conda.io). Stable releases are pushed regularly to the pytorch conda channel, as well as pre-release nightly builds. -The CPU-only `faiss-cpu` conda package is currently available on Linux, OSX, and -Windows. The `faiss-gpu`, containing both CPU and GPU indices, is available on -Linux systems, for CUDA 11.4. Packages are built for Python versions 3.8-3.10. +- The CPU-only faiss-cpu conda package is currently available on Linux (x86_64 and arm64), OSX (arm64 only), and Windows (x86_64) +- faiss-gpu, containing both CPU and GPU indices, is available on Linux (x86_64 only) for CUDA 11.4 and 12.1 +- NEW: faiss-gpu-raft containing both CPU and GPU indices provided by NVIDIA RAFT, is available on Linux (x86_64 only) for CUDA 11.8 and 12.1. To install the latest stable release: ``` shell # CPU-only version -$ conda install -c pytorch faiss-cpu=1.7.4 mkl=2021 blas=1.0=mkl +$ conda install -c pytorch faiss-cpu=1.8.0 # GPU(+CPU) version -$ conda install -c pytorch -c nvidia faiss-gpu=1.7.4 mkl=2021 blas=1.0=mkl +$ conda install -c pytorch -c nvidia faiss-gpu=1.8.0 + +# GPU(+CPU) version with NVIDIA RAFT +$ conda install -c pytorch -c nvidia -c rapidsai -c conda-forge faiss-gpu-raft=1.8.0 ``` -For faiss-gpu, the nvidia channel is required for cudatoolkit=11.4, which is not +For faiss-gpu, the nvidia channel is required for CUDA, which is not published in the main anaconda channel. -NOTE: due to a bug in the latest 1.7.4 release, Intel MKL 2021 needs to be installed -separately where applicable. Remove the MKL reference when installing on -non-Intel platforms. +For faiss-gpu-raft, the nvidia, rapidsai and conda-forge channels are required. -Nightly pre-release packages can be installed as follows. 
There is no need to -install MKL separately, the correct package is automatically installed as a -dependency where necessary: +Nightly pre-release packages can be installed as follows: ``` shell # CPU-only version $ conda install -c pytorch/label/nightly faiss-cpu # GPU(+CPU) version -$ conda install -c pytorch/label/nightly -c nvidia faiss-gpu=1.7.4 +$ conda install -c pytorch/label/nightly -c nvidia faiss-gpu=1.8.0 + +# GPU(+CPU) version with NVIDIA RAFT +conda install -c pytorch -c nvidia -c rapidsai -c conda-forge faiss-gpu-raft=1.8.0 pytorch pytorch-cuda numpy ``` +In the above commands, pytorch-cuda=11 or pytorch-cuda=12 would select a specific CUDA version, if it’s required. -A combination of versions that installs GPU Faiss with CUDA 11.4 and Pytorch (as of 2023-06-19): +A combination of versions that installs GPU Faiss with CUDA and Pytorch (as of 2024-03-01): ``` -conda create --name faiss_1.7.4 python=3.10 -conda activate faiss_1.7.4 -conda install faiss-gpu=1.7.4 mkl=2021 pytorch pytorch-cuda numpy -c pytorch -c nvidia +conda create --name faiss_1.8.0 +conda activate faiss_1.8.0 +conda install -c pytorch -c nvidia faiss-gpu=1.8.0 pytorch pytorch-cuda numpy ``` ## Installing from conda-forge From e99ad124cbc40c03f72a37032ec25fcabf1479cf Mon Sep 17 00:00:00 2001 From: ranjitsastra Date: Fri, 15 Mar 2024 05:19:05 -0700 Subject: [PATCH 004/116] AIX compilation fix for io classes (#3275) Summary: in AIX OS ,as fileno is defined as C macro, we get the compilation error during preprocessing step. In file included from /ranjit/Faiss/faiss/faiss/invlists/InvertedListsIOHook.h:10: /ranjit/Faiss/faiss/faiss/impl/io.h:35:17: error: expected member name or ';' after declaration specifiers 35 | virtual int fileno(); | ~~~~~~~~~~~ ^ /usr/include/stdio.h:517:30: note: expanded from macro 'fileno' 517 | #define fileno(__p) ((__p)->_file) Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3275 Reviewed By: algoriddle Differential Revision: D54944388 Pulled By: mdouze fbshipit-source-id: 40c4314de93547778ac274281245ff59e3a18b6d --- faiss/impl/io.cpp | 16 ++++++++++++---- faiss/impl/io.h | 8 ++++---- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/faiss/impl/io.cpp b/faiss/impl/io.cpp index 5d24e58591..3d3af95036 100644 --- a/faiss/impl/io.cpp +++ b/faiss/impl/io.cpp @@ -20,11 +20,11 @@ namespace faiss { * IO functions ***********************************************************************/ -int IOReader::fileno() { +int IOReader::filedescriptor() { FAISS_THROW_MSG("IOReader does not support memory mapping"); } -int IOWriter::fileno() { +int IOWriter::filedescriptor() { FAISS_THROW_MSG("IOWriter does not support memory mapping"); } @@ -85,8 +85,12 @@ size_t FileIOReader::operator()(void* ptr, size_t size, size_t nitems) { return fread(ptr, size, nitems, f); } -int FileIOReader::fileno() { +int FileIOReader::filedescriptor() { +#ifdef _AIX + return fileno(f); +#else return ::fileno(f); +#endif } FileIOWriter::FileIOWriter(FILE* wf) : f(wf) {} @@ -116,8 +120,12 @@ size_t FileIOWriter::operator()(const void* ptr, size_t size, size_t nitems) { return fwrite(ptr, size, nitems, f); } -int FileIOWriter::fileno() { +int FileIOWriter::filedescriptor() { +#ifdef _AIX + return fileno(f); +#else return ::fileno(f); +#endif } /*********************************************************************** diff --git a/faiss/impl/io.h b/faiss/impl/io.h index 8d0605a5a6..59c2e31539 100644 --- a/faiss/impl/io.h +++ b/faiss/impl/io.h @@ -32,7 +32,7 @@ struct IOReader { virtual size_t 
operator()(void* ptr, size_t size, size_t nitems) = 0; // return a file number that can be memory-mapped - virtual int fileno(); + virtual int filedescriptor(); virtual ~IOReader() {} }; @@ -45,7 +45,7 @@ struct IOWriter { virtual size_t operator()(const void* ptr, size_t size, size_t nitems) = 0; // return a file number that can be memory-mapped - virtual int fileno(); + virtual int filedescriptor(); virtual ~IOWriter() noexcept(false) {} }; @@ -73,7 +73,7 @@ struct FileIOReader : IOReader { size_t operator()(void* ptr, size_t size, size_t nitems) override; - int fileno() override; + int filedescriptor() override; }; struct FileIOWriter : IOWriter { @@ -88,7 +88,7 @@ struct FileIOWriter : IOWriter { size_t operator()(const void* ptr, size_t size, size_t nitems) override; - int fileno() override; + int filedescriptor() override; }; /******************************************************* From d5e4c798f3586a7d5e97665698c25f58dc7f0e9d Mon Sep 17 00:00:00 2001 From: Maria Date: Mon, 18 Mar 2024 11:16:56 -0700 Subject: [PATCH 005/116] Removed index_shard_and_quantize OIVFBBS (#3291) Summary: This PR removes the unused method `index_shard_and_quantize` in OIVFBBS. Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3291 Reviewed By: algoriddle, junjieqi Differential Revision: D54901824 Pulled By: mlomeli1 fbshipit-source-id: f723aa386b91417f697b24b620618b864329ef6d --- demos/offline_ivf/offline_ivf.py | 58 -------------------------------- 1 file changed, 58 deletions(-) diff --git a/demos/offline_ivf/offline_ivf.py b/demos/offline_ivf/offline_ivf.py index 5c316178cb..458c1a991c 100644 --- a/demos/offline_ivf/offline_ivf.py +++ b/demos/offline_ivf/offline_ivf.py @@ -227,64 +227,6 @@ def _iterate_transformed(self, ds, start, batch_size, dt): for buffer in ds.iterate(start, batch_size, dt): yield buffer - def index_shard_and_quantize(self): - assert os.path.exists(self.index_template_file) - index = faiss.read_index(self.index_template_file) - index_ivf = faiss.downcast_index(faiss.extract_index_ivf(index)) - assert self.nprobe <= index_ivf.quantizer.ntotal, ( - f"the number of vectors {index_ivf.quantizer.ntotal} is not enough" - f" to retrieve {self.nprobe} neighbours, check." 
- ) - - if is_pretransform_index(index): - d = index.chain.at(0).d_out - else: - d = self.input_d - for i in range(0, self.nshards): - sfn = f"{self.index_shard_prefix}{i}" - cqfn = f"{self.coarse_quantization_prefix}{i}" # fixme - if os.path.exists(sfn) or os.path.exists(cqfn): - logging.info(f"skipping shard: {i}") - continue - try: - with open(cqfn, "xb") as cqf: - index.reset() - start = i * self.shard_size - j = 0 - quantizer = faiss.index_cpu_to_all_gpus( - index_ivf.quantizer - ) - for xb_j in tqdm( - self._iterate_transformed( - self.xb_ds, - start, - EMBEDDINGS_BATCH_SIZE, - np.float32, - ), - file=sys.stdout, - ): - assert xb_j.shape[1] == d - _, I = quantizer.search(xb_j, self.nprobe) - assert np.amin(I) >= 0, f"{I}" - assert np.amax(I) < index_ivf.nlist - cqf.write(I) - self._index_add_core_wrapper( # fixme - index_ivf, - xb_j, - np.arange(start + j, start + j + xb_j.shape[0]), - I[:, 0], - ) - j += xb_j.shape[0] - assert j <= self.shard_size - if j == self.shard_size: - break - logging.info(f"writing {sfn}...") - faiss.write_index(index, sfn) - except FileExistsError: - logging.info(f"skipping shard: {i}") - continue - logging.info("done") - def index_shard(self): assert os.path.exists(self.index_template_file) index = faiss.read_index(self.index_template_file) From 7d21c92fc1db52a5ab7a033d756b55198a950f95 Mon Sep 17 00:00:00 2001 From: Maria Date: Mon, 18 Mar 2024 11:59:21 -0700 Subject: [PATCH 006/116] Dim reduction support in OIVFBBS (#3290) Summary: This PR adds support for dimensionality reduction in OIVFBBS. I tested the code with an index `OPQ64_128,IVF4096,PQ64` using the ssnpp embeddings - this index string is added to the config_ssnpp.yaml to showcase this functionality. Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3290 Reviewed By: junjieqi Differential Revision: D54878345 Pulled By: mlomeli1 fbshipit-source-id: 98ecdeb2224ce0325e37720cc113d82f9c6c75d6 --- demos/offline_ivf/config_ssnpp.yaml | 1 + demos/offline_ivf/offline_ivf.py | 30 +++++++++++++++++++---------- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/demos/offline_ivf/config_ssnpp.yaml b/demos/offline_ivf/config_ssnpp.yaml index 690f0de156..88e0394155 100644 --- a/demos/offline_ivf/config_ssnpp.yaml +++ b/demos/offline_ivf/config_ssnpp.yaml @@ -6,6 +6,7 @@ index: non-prod: - 'IVF16384,PQ128' - 'IVF32768,PQ128' + - 'OPQ64_128,IVF4096,PQ64' nprobe: prod: - 512 diff --git a/demos/offline_ivf/offline_ivf.py b/demos/offline_ivf/offline_ivf.py index 458c1a991c..eccd2b95cb 100644 --- a/demos/offline_ivf/offline_ivf.py +++ b/demos/offline_ivf/offline_ivf.py @@ -178,7 +178,7 @@ def dedupe(self): idxs.append(np.empty((0,), dtype=np.uint32)) bs = 1_000_000 i = 0 - for buffer in tqdm(self.xb_ds.iterate(0, bs, np.float32)): + for buffer in tqdm(self._iterate_transformed(self.xb_ds, 0, bs, np.float32)): for j in range(len(codecs)): codec, codeset, idx = codecs[j], codesets[j], idxs[j] uniq = codeset.insert(codec.sa_encode(buffer)) @@ -267,11 +267,18 @@ def index_shard(self): ), file=sys.stdout, ): - assert xb_j.shape[1] == index.d - index.add_with_ids( - xb_j, - np.arange(start + jj, start + jj + xb_j.shape[0]), - ) + if is_pretransform_index(index): + assert xb_j.shape[1] == index.chain.at(0).d_out + index_ivf.add_with_ids( + xb_j, + np.arange(start + jj, start + jj + xb_j.shape[0]), + ) + else: + assert xb_j.shape[1] == index.d + index.add_with_ids( + xb_j, + np.arange(start + jj, start + jj + xb_j.shape[0]), + ) jj += xb_j.shape[0] logging.info(jj) assert ( @@ -670,10 +677,14 
@@ def search(self): os.remove(Ifn) os.remove(Dfn) - try: # TODO: modify shape for pretransform case + try: + if is_pretransform_index(index): + d = index.chain.at(0).d_out + else: + d = self.input_d with open(Ifn, "xb") as f, open(Dfn, "xb") as g: xq_i = np.empty( - shape=(self.xq_bs, self.input_d), dtype=np.float16 + shape=(self.xq_bs, d), dtype=np.float16 ) q_assign = np.empty( (self.xq_bs, self.nprobe), dtype=np.int32 @@ -835,8 +846,7 @@ def consistency_check(self): for j in range(SMALL_DATA_SAMPLE): assert np.where(I[j] == j + r)[0].size > 0, ( f"I[j]: {I[j]}, j: {j}, i: {i}, shard_size:" - f" {self.shard_size}" - ) + f" {self.shard_size}") logging.info("search results...") index_ivf.nprobe = self.nprobe From f7fe62e801ebcd01e792680ee3ed8328d7e0a786 Mon Sep 17 00:00:00 2001 From: Junjie Qi Date: Tue, 19 Mar 2024 10:46:30 -0700 Subject: [PATCH 007/116] Remove swig version and always rely on the latest version (#3295) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3295 In the past, we had build failure due to the latest swig version in conda-forge so we had to specify the version of swig. In this diff, we are going to change it to be the latest version always because the issue has gone. Reviewed By: algoriddle Differential Revision: D54975271 fbshipit-source-id: 7ca59fb58390edb0cc5ed52f6fd416f633dd7938 --- .circleci/config.yml | 4 ++-- conda/faiss-gpu-raft/meta.yaml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 94aad3b11e..a5429bc1e7 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -224,7 +224,7 @@ jobs: - run: name: Install env using main channel command: | - conda install -y -q python=3.11 cmake make swig=4.0.2 mkl=2023 mkl-devel=2023 numpy scipy pytest gxx_linux-64 sysroot_linux-64 + conda install -y -q python=3.11 cmake make swig mkl=2023 mkl-devel=2023 numpy scipy pytest gxx_linux-64 sysroot_linux-64 -c conda-forge - when: condition: equal: [ "ON", << parameters.raft >> ] @@ -232,7 +232,7 @@ jobs: - run: name: Install env using conda-forge channel command: | - conda install -y -q python=3.11 cmake make swig=4.0.2 mkl=2023 mkl-devel=2023 numpy scipy pytest gxx_linux-64 sysroot_linux-64=2.28 libraft cuda-version=11.8 cuda-toolkit -c rapidsai-nightly -c "nvidia/label/cuda-11.8.0" -c conda-forge + conda install -y -q python=3.11 cmake make swig mkl=2023 mkl-devel=2023 numpy scipy pytest gxx_linux-64 sysroot_linux-64=2.28 libraft cuda-version=11.8 cuda-toolkit -c rapidsai-nightly -c "nvidia/label/cuda-11.8.0" -c conda-forge - when: condition: and: diff --git a/conda/faiss-gpu-raft/meta.yaml b/conda/faiss-gpu-raft/meta.yaml index b365571777..ab605f8dde 100644 --- a/conda/faiss-gpu-raft/meta.yaml +++ b/conda/faiss-gpu-raft/meta.yaml @@ -84,7 +84,7 @@ outputs: build: - {{ compiler('cxx') }} - sysroot_linux-64 =2.17 # [linux64] - - swig =4.0.2 + - swig - cmake >=3.23.1 - make # [not win] host: From cf364ec606322318ff9409ccb0f904890e38e023 Mon Sep 17 00:00:00 2001 From: Junjie Qi Date: Tue, 19 Mar 2024 11:32:29 -0700 Subject: [PATCH 008/116] Remove unused fallthrough (#3296) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3296 same as title Reviewed By: algoriddle Differential Revision: D54973709 fbshipit-source-id: 545118e30773c6a4ea3f544a3a20c5ba8c394f69 --- faiss/utils/hamming_distance/generic-inl.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/faiss/utils/hamming_distance/generic-inl.h b/faiss/utils/hamming_distance/generic-inl.h 
index e0907a1586..3565a97c6b 100644 --- a/faiss/utils/hamming_distance/generic-inl.h +++ b/faiss/utils/hamming_distance/generic-inl.h @@ -275,7 +275,6 @@ struct HammingComputerDefault { len -= 8; accu += popcount64(a64[i] ^ b64[i]); i++; - [[fallthrough]]; case 7: accu += popcount64(a64[i] ^ b64[i]); i++; @@ -309,7 +308,6 @@ struct HammingComputerDefault { const uint8_t* a = a8 + 8 * quotient8; const uint8_t* b = b8 + 8 * quotient8; switch (remainder8) { - [[fallthrough]]; case 7: accu += hamdis_tab_ham_bytes[a[6] ^ b[6]]; [[fallthrough]]; From 0e06a28094b160aee238bddb93c1967517da3af7 Mon Sep 17 00:00:00 2001 From: Jason Sylka Date: Tue, 19 Mar 2024 13:21:00 -0700 Subject: [PATCH 009/116] Revert D54973709: Remove unused fallthrough Differential Revision: D54973709 Original commit changeset: 545118e30773 Original Phabricator Diff: D54973709 fbshipit-source-id: d975b59d071deda5d8eaa2583a8f7c6f1562b9ba --- faiss/utils/hamming_distance/generic-inl.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/faiss/utils/hamming_distance/generic-inl.h b/faiss/utils/hamming_distance/generic-inl.h index 3565a97c6b..e0907a1586 100644 --- a/faiss/utils/hamming_distance/generic-inl.h +++ b/faiss/utils/hamming_distance/generic-inl.h @@ -275,6 +275,7 @@ struct HammingComputerDefault { len -= 8; accu += popcount64(a64[i] ^ b64[i]); i++; + [[fallthrough]]; case 7: accu += popcount64(a64[i] ^ b64[i]); i++; @@ -308,6 +309,7 @@ struct HammingComputerDefault { const uint8_t* a = a8 + 8 * quotient8; const uint8_t* b = b8 + 8 * quotient8; switch (remainder8) { + [[fallthrough]]; case 7: accu += hamdis_tab_ham_bytes[a[6] ^ b[6]]; [[fallthrough]]; From 6f3843e14f5bde264aa3aecc71a79480d8ccdb7e Mon Sep 17 00:00:00 2001 From: Junjie Qi Date: Tue, 19 Mar 2024 16:31:48 -0700 Subject: [PATCH 010/116] Back out "Remove swig version and always rely on the latest version" (#3297) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3297 Original commit changeset: 7ca59fb58390 Original Phabricator Diff: D54975271 Differential Revision: D55102226 fbshipit-source-id: 2a2828b4e74b16ee25b090ae4b844dab4f1d72a6 --- .circleci/config.yml | 4 ++-- conda/faiss-gpu-raft/meta.yaml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index a5429bc1e7..94aad3b11e 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -224,7 +224,7 @@ jobs: - run: name: Install env using main channel command: | - conda install -y -q python=3.11 cmake make swig mkl=2023 mkl-devel=2023 numpy scipy pytest gxx_linux-64 sysroot_linux-64 -c conda-forge + conda install -y -q python=3.11 cmake make swig=4.0.2 mkl=2023 mkl-devel=2023 numpy scipy pytest gxx_linux-64 sysroot_linux-64 - when: condition: equal: [ "ON", << parameters.raft >> ] @@ -232,7 +232,7 @@ jobs: - run: name: Install env using conda-forge channel command: | - conda install -y -q python=3.11 cmake make swig mkl=2023 mkl-devel=2023 numpy scipy pytest gxx_linux-64 sysroot_linux-64=2.28 libraft cuda-version=11.8 cuda-toolkit -c rapidsai-nightly -c "nvidia/label/cuda-11.8.0" -c conda-forge + conda install -y -q python=3.11 cmake make swig=4.0.2 mkl=2023 mkl-devel=2023 numpy scipy pytest gxx_linux-64 sysroot_linux-64=2.28 libraft cuda-version=11.8 cuda-toolkit -c rapidsai-nightly -c "nvidia/label/cuda-11.8.0" -c conda-forge - when: condition: and: diff --git a/conda/faiss-gpu-raft/meta.yaml b/conda/faiss-gpu-raft/meta.yaml index ab605f8dde..b365571777 100644 --- a/conda/faiss-gpu-raft/meta.yaml +++ 
b/conda/faiss-gpu-raft/meta.yaml @@ -84,7 +84,7 @@ outputs: build: - {{ compiler('cxx') }} - sysroot_linux-64 =2.17 # [linux64] - - swig + - swig =4.0.2 - cmake >=3.23.1 - make # [not win] host: From 5483f210d2c7aa858c473e601e2e387ea275f91b Mon Sep 17 00:00:00 2001 From: Yuri Victorovich Date: Wed, 20 Mar 2024 09:52:12 -0700 Subject: [PATCH 011/116] Use cmake's find_package to link to GTest (#3278) Summary: Otherwise the gtest transitive dependency isn't linked properly when GoogleTest is built to have shared libraries. Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3278 Reviewed By: junjieqi Differential Revision: D55134304 Pulled By: algoriddle fbshipit-source-id: 01e7b11f28c27f837afee36350fbf9543e301a31 --- tests/CMakeLists.txt | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 9017edc586..0cb8219096 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -59,18 +59,12 @@ if(FAISS_OPT_LEVEL STREQUAL "avx512") target_link_libraries(faiss_test PRIVATE faiss_avx512) endif() -include(FetchContent) -FetchContent_Declare(googletest - URL "https://github.com/google/googletest/archive/release-1.12.1.tar.gz") -set(BUILD_GMOCK CACHE BOOL OFF) -set(INSTALL_GTEST CACHE BOOL OFF) -FetchContent_MakeAvailable(googletest) - find_package(OpenMP REQUIRED) +find_package(GTest CONFIG REQUIRED) target_link_libraries(faiss_test PRIVATE OpenMP::OpenMP_CXX - gtest_main + GTest::gtest_main $<$:raft::raft> ) From 9c79e3d5b1e0bd81c37e6a006a5b2340139d41b1 Mon Sep 17 00:00:00 2001 From: divyegala Date: Wed, 20 Mar 2024 14:28:42 -0700 Subject: [PATCH 012/116] RAFT 24.04 API changes (#3282) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3282 Reviewed By: junjieqi Differential Revision: D55153617 Pulled By: algoriddle fbshipit-source-id: 7b1ab24a6b0fbe002a0d8358078d014b1556044a --- faiss/gpu/impl/RaftIVFFlat.cu | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/faiss/gpu/impl/RaftIVFFlat.cu b/faiss/gpu/impl/RaftIVFFlat.cu index 1e310723d0..0906a60f46 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cu +++ b/faiss/gpu/impl/RaftIVFFlat.cu @@ -403,7 +403,8 @@ void RaftIVFFlat::copyInvertedListsFrom(const InvertedLists* ivf) { } // Update the pointers and the sizes - raft_knn_index.value().recompute_internal_state(raft_handle); + raft::neighbors::ivf_flat::helpers::recompute_internal_state( + raft_handle, &(raft_knn_index.value())); for (size_t i = 0; i < nlist; ++i) { size_t listSize = ivf->list_size(i); From 8274c38f2737f83ecc80655afb595b942779e0ea Mon Sep 17 00:00:00 2001 From: Ramil Bakhshyiev Date: Thu, 21 Mar 2024 10:30:44 -0700 Subject: [PATCH 013/116] Remove TypedStorage usage when working with torch_utils (#3301) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3301 In `torch_utils.py`, changed `storage()' references to `untyped_storage()`. 
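Editor's note, a brief usage sketch (not part of the patch; the index type and tensor sizes are illustrative assumptions): importing `faiss.contrib.torch_utils` patches the Faiss index methods so they accept PyTorch tensors, and the `swig_ptr_from_*Tensor` helpers changed below are what extract the raw pointers from each tensor's untyped storage.

```python
# Illustrative sketch, not taken from the patch: after importing torch_utils,
# Faiss index methods accept torch tensors directly; internally the
# swig_ptr_from_*Tensor helpers read the pointer via untyped_storage().
import torch
import faiss
import faiss.contrib.torch_utils  # noqa: F401 -- patches add()/search() etc.

d = 64
index = faiss.IndexFlatL2(d)
xb = torch.rand(1000, d)       # float32 by default
xq = torch.rand(10, d)
index.add(xb)                  # torch tensor accepted after the patch
D, I = index.search(xq, 5)     # results come back as torch tensors
```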
Reviewed By: junjieqi Differential Revision: D55167842 fbshipit-source-id: 911eda1c22f10595663fb4416ab992903390d457 --- contrib/torch_utils.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/contrib/torch_utils.py b/contrib/torch_utils.py index 790c295e48..e371932c9f 100644 --- a/contrib/torch_utils.py +++ b/contrib/torch_utils.py @@ -33,7 +33,7 @@ def swig_ptr_from_UInt8Tensor(x): assert x.is_contiguous() assert x.dtype == torch.uint8 return faiss.cast_integer_to_uint8_ptr( - x.storage().data_ptr() + x.storage_offset()) + x.untyped_storage().data_ptr() + x.storage_offset()) def swig_ptr_from_HalfTensor(x): """ gets a Faiss SWIG pointer from a pytorch tensor (on CPU or GPU) """ @@ -41,28 +41,28 @@ def swig_ptr_from_HalfTensor(x): assert x.dtype == torch.float16 # no canonical half type in C/C++ return faiss.cast_integer_to_void_ptr( - x.storage().data_ptr() + x.storage_offset() * 2) + x.untyped_storage().data_ptr() + x.storage_offset() * 2) def swig_ptr_from_FloatTensor(x): """ gets a Faiss SWIG pointer from a pytorch tensor (on CPU or GPU) """ assert x.is_contiguous() assert x.dtype == torch.float32 return faiss.cast_integer_to_float_ptr( - x.storage().data_ptr() + x.storage_offset() * 4) + x.untyped_storage().data_ptr() + x.storage_offset() * 4) def swig_ptr_from_IntTensor(x): """ gets a Faiss SWIG pointer from a pytorch tensor (on CPU or GPU) """ assert x.is_contiguous() assert x.dtype == torch.int32, 'dtype=%s' % x.dtype return faiss.cast_integer_to_int_ptr( - x.storage().data_ptr() + x.storage_offset() * 4) + x.untyped_storage().data_ptr() + x.storage_offset() * 4) def swig_ptr_from_IndicesTensor(x): """ gets a Faiss SWIG pointer from a pytorch tensor (on CPU or GPU) """ assert x.is_contiguous() assert x.dtype == torch.int64, 'dtype=%s' % x.dtype return faiss.cast_integer_to_idx_t_ptr( - x.storage().data_ptr() + x.storage_offset() * 8) + x.untyped_storage().data_ptr() + x.storage_offset() * 8) @contextlib.contextmanager def using_stream(res, pytorch_stream=None): From b77061ff5eb2d5dc3b1fc25b240578c2d686a646 Mon Sep 17 00:00:00 2001 From: Gergely Szilvasy Date: Thu, 21 Mar 2024 11:18:02 -0700 Subject: [PATCH 014/116] move to raft 24.04 (#3302) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3302 Reviewed By: junjieqi Differential Revision: D55173776 fbshipit-source-id: 5de2225638e2d997fbfa4e28b924d5e4633ee27f --- conda/faiss-gpu-raft/meta.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conda/faiss-gpu-raft/meta.yaml b/conda/faiss-gpu-raft/meta.yaml index b365571777..12dfc889b1 100644 --- a/conda/faiss-gpu-raft/meta.yaml +++ b/conda/faiss-gpu-raft/meta.yaml @@ -55,14 +55,14 @@ outputs: host: - mkl =2023 # [x86_64] - openblas # [not x86_64] - - libraft =24.02 + - libraft =24.04 - cuda-version {{ cuda_constraints }} run: - mkl =2023 # [x86_64] - openblas # [not x86_64] - cuda-cudart {{ cuda_constraints }} - libcublas {{ libcublas_constraints }} - - libraft =24.02 + - libraft =24.04 - cuda-version {{ cuda_constraints }} test: requires: From fa1f39ec9fc9a7fd5afa3be79e1e214317cfc21b Mon Sep 17 00:00:00 2001 From: Matthijs Douze Date: Fri, 22 Mar 2024 12:55:30 -0700 Subject: [PATCH 015/116] Fix HNSW stats (#3309) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3309 Make sure that the HNSW search stats work, remove stats for deprecated functionality. Remove code of the link and code paper that is not supported anymore. 
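Editor's note, a minimal sketch of checking the global HNSW search statistics from Python (the dataset sizes and the use of only the `ndis` counter are illustrative assumptions, not taken from this patch; `faiss.cvar.hnsw_stats` and its `reset()`/`ndis` members appear in the benchmark code removed below):

```python
# Illustrative sketch: reset the global HNSW stats, run a search, and
# confirm the distance-computation counter was updated by the search.
import numpy as np
import faiss

d = 32
xb = np.random.rand(1000, d).astype('float32')
xq = np.random.rand(10, d).astype('float32')

index = faiss.IndexHNSWFlat(d, 16)   # HNSW with 16 links per node
index.add(xb)

stats = faiss.cvar.hnsw_stats
stats.reset()
index.search(xq, 5)
assert stats.ndis > 0                # stats are filled in during search
```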
Reviewed By: kuarora, junjieqi Differential Revision: D55247802 fbshipit-source-id: 03f176be092bff6b2db359cc956905d8646ea702 --- benchs/link_and_code/README.md | 137 +-------- benchs/link_and_code/bench_link_and_code.py | 300 -------------------- benchs/link_and_code/datasets.py | 236 --------------- benchs/link_and_code/neighbor_codec.py | 241 ---------------- faiss/IndexHNSW.cpp | 16 +- faiss/gpu/GpuIndex.h | 10 +- faiss/impl/HNSW.cpp | 4 +- faiss/impl/HNSW.h | 20 +- tests/test_graph_based.py | 10 + 9 files changed, 27 insertions(+), 947 deletions(-) delete mode 100755 benchs/link_and_code/bench_link_and_code.py delete mode 100755 benchs/link_and_code/datasets.py delete mode 100755 benchs/link_and_code/neighbor_codec.py diff --git a/benchs/link_and_code/README.md b/benchs/link_and_code/README.md index 697c7bdfc6..0c04cadac5 100644 --- a/benchs/link_and_code/README.md +++ b/benchs/link_and_code/README.md @@ -21,138 +21,5 @@ graph to improve the reconstruction. It is described in ArXiV [here](https://arxiv.org/abs/1804.09996) -Code structure --------------- - -The test runs with 3 files: - -- `bench_link_and_code.py`: driver script - -- `datasets.py`: code to load the datasets. The example code runs on the - deep1b and bigann datasets. See the [toplevel README](../README.md) - on how to download them. They should be put in a directory, edit - datasets.py to set the path. - -- `neighbor_codec.py`: this is where the representation is trained. - -The code runs on top of Faiss. The HNSW index can be extended with a -`ReconstructFromNeighbors` C++ object that refines the distances. The -training is implemented in Python. - -Update: 2023-12-28: the current Faiss dropped support for reconstruction with -this method. - -Reproducing Table 2 in the paper --------------------------------- - -The results of table 2 (accuracy on deep100M) in the paper can be -obtained with: - -```bash -python bench_link_and_code.py \ - --db deep100M \ - --M0 6 \ - --indexkey OPQ36_144,HNSW32_PQ36 \ - --indexfile $bdir/deep100M_PQ36_L6.index \ - --beta_nsq 4 \ - --beta_centroids $bdir/deep100M_PQ36_L6_nsq4.npy \ - --neigh_recons_codes $bdir/deep100M_PQ36_L6_nsq4_codes.npy \ - --k_reorder 0,5 --efSearch 1,1024 -``` - -Set `bdir` to a scratch directory. - -Explanation of the flags: - -- `--db deep1M`: dataset to process - -- `--M0 6`: number of links on the base level (L6) - -- `--indexkey OPQ36_144,HNSW32_PQ36`: Faiss index key to construct the - HNSW structure. It means that vectors are transformed by OPQ and - encoded with PQ 36x8 (with an intermediate size of 144D). The HNSW - level>0 nodes have 32 links (theses ones are "cheap" to store - because there are fewer nodes in the upper levels. - -- `--indexfile $bdir/deep1M_PQ36_M6.index`: name of the index file - (without information for the L&C extension) - -- `--beta_nsq 4`: number of bytes to allocate for the codes (M in the - paper) - -- `--beta_centroids $bdir/deep1M_PQ36_M6_nsq4.npy`: filename to store - the trained beta centroids - -- `--neigh_recons_codes $bdir/deep1M_PQ36_M6_nsq4_codes.npy`: filename - for the encoded weights (beta) of the combination - -- `--k_reorder 0,5`: number of results to reorder. 0 = baseline - without reordering, 5 = value used throughout the paper - -- `--efSearch 1,1024`: number of nodes to visit (T in the paper) - -The script will proceed with the following steps: - -0. load dataset (and possibly compute the ground-truth if the -ground-truth file is not provided) - -1. train the OPQ encoder - -2. build the index and store it - -3. 
compute the residuals and train the beta vocabulary to do the reconstruction - -4. encode the vertices - -5. search and evaluate the search results. - -With option `--exhaustive` the results of the exhaustive column can be -obtained. - -The run above should output: -```bash -... -setting k_reorder=5 -... -efSearch=1024 0.3132 ms per query, R@1: 0.4283 R@10: 0.6337 R@100: 0.6520 ndis 40941919 nreorder 50000 - -``` -which matches the paper's table 2. - -Note that in multi-threaded mode, the building of the HNSW structure -is not deterministic. Therefore, the results across runs may not be exactly the same. - -Reproducing Figure 5 in the paper ---------------------------------- - -Figure 5 just evaluates the combination of HNSW and PQ. For example, -the operating point L6&OPQ40 can be obtained with - -```bash -python bench_link_and_code.py \ - --db deep1M \ - --M0 6 \ - --indexkey OPQ40_160,HNSW32_PQ40 \ - --indexfile $bdir/deep1M_PQ40_M6.index \ - --beta_nsq 1 --beta_k 1 \ - --beta_centroids $bdir/deep1M_PQ40_M6_nsq0.npy \ - --neigh_recons_codes $bdir/deep1M_PQ36_M6_nsq0_codes.npy \ - --k_reorder 0 --efSearch 16,64,256,1024 -``` - -The arguments are similar to the previous table. Note that nsq = 0 is -simulated by setting beta_nsq = 1 and beta_k = 1 (ie a code with a single -reproduction value). - -The output should look like: - -```bash -setting k_reorder=0 -efSearch=16 0.0147 ms per query, R@1: 0.3409 R@10: 0.4388 R@100: 0.4394 ndis 2629735 nreorder 0 -efSearch=64 0.0122 ms per query, R@1: 0.4836 R@10: 0.6490 R@100: 0.6509 ndis 4623221 nreorder 0 -efSearch=256 0.0344 ms per query, R@1: 0.5730 R@10: 0.7915 R@100: 0.7951 ndis 11090176 nreorder 0 -efSearch=1024 0.2656 ms per query, R@1: 0.6212 R@10: 0.8722 R@100: 0.8765 ndis 33501951 nreorder 0 -``` - -The results with k_reorder=5 are not reported in the paper, they -represent the performance of a "free coding" version of the algorithm. +The necessary code for this paper was removed from Faiss in version 1.8.0. +For a functioning verinsion, use Faiss 1.7.4. diff --git a/benchs/link_and_code/bench_link_and_code.py b/benchs/link_and_code/bench_link_and_code.py deleted file mode 100755 index ed8f86d631..0000000000 --- a/benchs/link_and_code/bench_link_and_code.py +++ /dev/null @@ -1,300 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. 
- -from __future__ import print_function -import os -import sys -import time -import numpy as np -import faiss -import argparse -import datasets -from datasets import sanitize -import neighbor_codec - -###################################################### -# Command-line parsing -###################################################### - - -parser = argparse.ArgumentParser() - -def aa(*args, **kwargs): - group.add_argument(*args, **kwargs) - -group = parser.add_argument_group('dataset options') - -aa('--db', default='deep1M', help='dataset') -aa( '--compute_gt', default=False, action='store_true', - help='compute and store the groundtruth') - -group = parser.add_argument_group('index consturction') - -aa('--indexkey', default='HNSW32', help='index_factory type') -aa('--efConstruction', default=200, type=int, - help='HNSW construction factor') -aa('--M0', default=-1, type=int, help='size of base level') -aa('--maxtrain', default=256 * 256, type=int, - help='maximum number of training points') -aa('--indexfile', default='', help='file to read or write index from') -aa('--add_bs', default=-1, type=int, - help='add elements index by batches of this size') -aa('--link_singletons', default=False, action='store_true', - help='do a pass to link in the singletons') - -group = parser.add_argument_group( - 'searching (reconstruct_from_neighbors options)') - -aa('--beta_centroids', default='', - help='file with codebook') -aa('--neigh_recons_codes', default='', - help='file with codes for reconstruction') -aa('--beta_ntrain', default=250000, type=int, help='') -aa('--beta_k', default=256, type=int, help='beta codebook size') -aa('--beta_nsq', default=1, type=int, help='number of beta sub-vectors') -aa('--beta_niter', default=10, type=int, help='') -aa('--k_reorder', default='-1', help='') - -group = parser.add_argument_group('searching') - -aa('--k', default=100, type=int, help='nb of nearest neighbors') -aa('--exhaustive', default=False, action='store_true', - help='report the exhaustive search topline') -aa('--searchthreads', default=-1, type=int, - help='nb of threads to use at search time') -aa('--efSearch', default='', type=str, - help='comma-separated values of efSearch to try') - -args = parser.parse_args() - -print("args:", args) - - -###################################################### -# Load dataset -###################################################### - -xt, xb, xq, gt = datasets.load_data( - dataset=args.db, compute_gt=args.compute_gt) - -nq, d = xq.shape -nb, d = xb.shape - - -###################################################### -# Make index -###################################################### - -if os.path.exists(args.indexfile): - - print("reading", args.indexfile) - index = faiss.read_index(args.indexfile) - - if isinstance(index, faiss.IndexPreTransform): - index_hnsw = faiss.downcast_index(index.index) - vec_transform = index.chain.at(0).apply_py - else: - index_hnsw = index - vec_transform = lambda x:x - - hnsw = index_hnsw.hnsw - hnsw_stats = faiss.cvar.hnsw_stats - -else: - - print("build index, key=", args.indexkey) - - index = faiss.index_factory(d, args.indexkey) - - if isinstance(index, faiss.IndexPreTransform): - index_hnsw = faiss.downcast_index(index.index) - vec_transform = index.chain.at(0).apply_py - else: - index_hnsw = index - vec_transform = lambda x:x - - hnsw = index_hnsw.hnsw - hnsw.efConstruction = args.efConstruction - hnsw_stats = faiss.cvar.hnsw_stats - index.verbose = True - index_hnsw.verbose = True - index_hnsw.storage.verbose = True - - if args.M0 
!= -1: - print("set level 0 nb of neighbors to", args.M0) - hnsw.set_nb_neighbors(0, args.M0) - - xt2 = sanitize(xt[:args.maxtrain]) - assert np.all(np.isfinite(xt2)) - - print("train, size", xt.shape) - t0 = time.time() - index.train(xt2) - print(" train in %.3f s" % (time.time() - t0)) - - print("adding") - t0 = time.time() - if args.add_bs == -1: - index.add(sanitize(xb)) - else: - for i0 in range(0, nb, args.add_bs): - i1 = min(nb, i0 + args.add_bs) - print(" adding %d:%d / %d" % (i0, i1, nb)) - index.add(sanitize(xb[i0:i1])) - - print(" add in %.3f s" % (time.time() - t0)) - print("storing", args.indexfile) - faiss.write_index(index, args.indexfile) - - -###################################################### -# Train beta centroids and encode dataset -###################################################### - -if args.beta_centroids: - print("reordering links") - index_hnsw.reorder_links() - - if os.path.exists(args.beta_centroids): - print("load", args.beta_centroids) - beta_centroids = np.load(args.beta_centroids) - nsq, k, M1 = beta_centroids.shape - assert M1 == hnsw.nb_neighbors(0) + 1 - - rfn = faiss.ReconstructFromNeighbors(index_hnsw, k, nsq) - else: - print("train beta centroids") - rfn = faiss.ReconstructFromNeighbors( - index_hnsw, args.beta_k, args.beta_nsq) - - xb_full = vec_transform(sanitize(xb[:args.beta_ntrain])) - - beta_centroids = neighbor_codec.train_beta_codebook( - rfn, xb_full, niter=args.beta_niter) - - print(" storing", args.beta_centroids) - np.save(args.beta_centroids, beta_centroids) - - - faiss.copy_array_to_vector(beta_centroids.ravel(), - rfn.codebook) - index_hnsw.reconstruct_from_neighbors = rfn - - if rfn.k == 1: - pass # no codes to take care of - elif os.path.exists(args.neigh_recons_codes): - print("loading neigh codes", args.neigh_recons_codes) - codes = np.load(args.neigh_recons_codes) - assert codes.size == rfn.code_size * index.ntotal - faiss.copy_array_to_vector(codes.astype('uint8'), - rfn.codes) - rfn.ntotal = index.ntotal - else: - print("encoding neigh codes") - t0 = time.time() - - bs = 1000000 if args.add_bs == -1 else args.add_bs - - for i0 in range(0, nb, bs): - i1 = min(i0 + bs, nb) - print(" encode %d:%d / %d [%.3f s]\r" % ( - i0, i1, nb, time.time() - t0), end=' ') - sys.stdout.flush() - xbatch = vec_transform(sanitize(xb[i0:i1])) - rfn.add_codes(i1 - i0, faiss.swig_ptr(xbatch)) - print() - - print("storing %s" % args.neigh_recons_codes) - codes = faiss.vector_to_array(rfn.codes) - np.save(args.neigh_recons_codes, codes) - -###################################################### -# Exhaustive evaluation -###################################################### - -if args.exhaustive: - print("exhaustive evaluation") - xq_tr = vec_transform(sanitize(xq)) - index2 = faiss.IndexFlatL2(index_hnsw.d) - accu_recons_error = 0.0 - - if faiss.get_num_gpus() > 0: - print("do eval on GPU") - co = faiss.GpuMultipleClonerOptions() - co.shard = False - index2 = faiss.index_cpu_to_all_gpus(index2, co) - - # process in batches in case the dataset does not fit in RAM - rh = datasets.ResultHeap(xq_tr.shape[0], 100) - t0 = time.time() - bs = 500000 - for i0 in range(0, nb, bs): - i1 = min(nb, i0 + bs) - print(' handling batch %d:%d' % (i0, i1)) - - xb_recons = np.empty( - (i1 - i0, index_hnsw.d), dtype='float32') - rfn.reconstruct_n(i0, i1 - i0, faiss.swig_ptr(xb_recons)) - - accu_recons_error += ( - (vec_transform(sanitize(xb[i0:i1])) - - xb_recons)**2).sum() - - index2.reset() - index2.add(xb_recons) - D, I = index2.search(xq_tr, 100) - 
rh.add_batch_result(D, I, i0) - - rh.finalize() - del index2 - t1 = time.time() - print("done in %.3f s" % (t1 - t0)) - print("total reconstruction error: ", accu_recons_error) - print("eval retrieval:") - datasets.evaluate_DI(rh.D, rh.I, gt) - - -def get_neighbors(hnsw, i, level): - " list the neighbors for node i at level " - assert i < hnsw.levels.size() - assert level < hnsw.levels.at(i) - be = np.empty(2, 'uint64') - hnsw.neighbor_range(i, level, faiss.swig_ptr(be), faiss.swig_ptr(be[1:])) - return [hnsw.neighbors.at(j) for j in range(be[0], be[1])] - - -############################################################# -# Index is ready -############################################################# - -xq = sanitize(xq) - -if args.searchthreads != -1: - print("Setting nb of threads to", args.searchthreads) - faiss.omp_set_num_threads(args.searchthreads) - - -if gt is None: - print("no valid groundtruth -- exit") - sys.exit() - - -k_reorders = [int(x) for x in args.k_reorder.split(',')] -efSearchs = [int(x) for x in args.efSearch.split(',')] - - -for k_reorder in k_reorders: - - if index_hnsw.reconstruct_from_neighbors: - print("setting k_reorder=%d" % k_reorder) - index_hnsw.reconstruct_from_neighbors.k_reorder = k_reorder - - for efSearch in efSearchs: - print("efSearch=%-4d" % efSearch, end=' ') - hnsw.efSearch = efSearch - hnsw_stats.reset() - datasets.evaluate(xq, gt, index, k=args.k, endl=False) - - print("ndis %d nreorder %d" % (hnsw_stats.ndis, hnsw_stats.nreorder)) diff --git a/benchs/link_and_code/datasets.py b/benchs/link_and_code/datasets.py deleted file mode 100755 index a043eb8883..0000000000 --- a/benchs/link_and_code/datasets.py +++ /dev/null @@ -1,236 +0,0 @@ -#! /usr/bin/env python2 - -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -""" -Common functions to load datasets and compute their ground-truth -""" -from __future__ import print_function - -import time -import numpy as np -import faiss -import pdb -import sys - -# set this to the directory that contains the datafiles. 
-# deep1b data should be at simdir + 'deep1b' -# bigann data should be at simdir + 'bigann' -simdir = '/mnt/vol/gfsai-east/ai-group/datasets/simsearch/' - -################################################################# -# Small I/O functions -################################################################# - - -def ivecs_read(fname): - a = np.fromfile(fname, dtype='int32') - d = a[0] - return a.reshape(-1, d + 1)[:, 1:].copy() - - -def fvecs_read(fname): - return ivecs_read(fname).view('float32') - - -def ivecs_mmap(fname): - a = np.memmap(fname, dtype='int32', mode='r') - d = a[0] - return a.reshape(-1, d + 1)[:, 1:] - - -def fvecs_mmap(fname): - return ivecs_mmap(fname).view('float32') - - -def bvecs_mmap(fname): - x = np.memmap(fname, dtype='uint8', mode='r') - d = x[:4].view('int32')[0] - return x.reshape(-1, d + 4)[:, 4:] - - -def ivecs_write(fname, m): - n, d = m.shape - m1 = np.empty((n, d + 1), dtype='int32') - m1[:, 0] = d - m1[:, 1:] = m - m1.tofile(fname) - - -def fvecs_write(fname, m): - m = m.astype('float32') - ivecs_write(fname, m.view('int32')) - - -################################################################# -# Dataset -################################################################# - -def sanitize(x): - return np.ascontiguousarray(x, dtype='float32') - - -class ResultHeap: - """ Combine query results from a sliced dataset """ - - def __init__(self, nq, k): - " nq: number of query vectors, k: number of results per query " - self.I = np.zeros((nq, k), dtype='int64') - self.D = np.zeros((nq, k), dtype='float32') - self.nq, self.k = nq, k - heaps = faiss.float_maxheap_array_t() - heaps.k = k - heaps.nh = nq - heaps.val = faiss.swig_ptr(self.D) - heaps.ids = faiss.swig_ptr(self.I) - heaps.heapify() - self.heaps = heaps - - def add_batch_result(self, D, I, i0): - assert D.shape == (self.nq, self.k) - assert I.shape == (self.nq, self.k) - I += i0 - self.heaps.addn_with_ids( - self.k, faiss.swig_ptr(D), - faiss.swig_ptr(I), self.k) - - def finalize(self): - self.heaps.reorder() - - - -def compute_GT_sliced(xb, xq, k): - print("compute GT") - t0 = time.time() - nb, d = xb.shape - nq, d = xq.shape - rh = ResultHeap(nq, k) - bs = 10 ** 5 - - xqs = sanitize(xq) - - db_gt = faiss.index_cpu_to_all_gpus(faiss.IndexFlatL2(d)) - - # compute ground-truth by blocks of bs, and add to heaps - for i0 in range(0, nb, bs): - i1 = min(nb, i0 + bs) - xsl = sanitize(xb[i0:i1]) - db_gt.add(xsl) - D, I = db_gt.search(xqs, k) - rh.add_batch_result(D, I, i0) - db_gt.reset() - print("\r %d/%d, %.3f s" % (i0, nb, time.time() - t0), end=' ') - sys.stdout.flush() - print() - rh.finalize() - gt_I = rh.I - - print("GT time: %.3f s" % (time.time() - t0)) - return gt_I - - -def do_compute_gt(xb, xq, k): - print("computing GT") - nb, d = xb.shape - index = faiss.index_cpu_to_all_gpus(faiss.IndexFlatL2(d)) - if nb < 100 * 1000: - print(" add") - index.add(np.ascontiguousarray(xb, dtype='float32')) - print(" search") - D, I = index.search(np.ascontiguousarray(xq, dtype='float32'), k) - else: - I = compute_GT_sliced(xb, xq, k) - - return I.astype('int32') - - -def load_data(dataset='deep1M', compute_gt=False): - - print("load data", dataset) - - if dataset == 'sift1M': - basedir = simdir + 'sift1M/' - - xt = fvecs_read(basedir + "sift_learn.fvecs") - xb = fvecs_read(basedir + "sift_base.fvecs") - xq = fvecs_read(basedir + "sift_query.fvecs") - gt = ivecs_read(basedir + "sift_groundtruth.ivecs") - - elif dataset.startswith('bigann'): - basedir = simdir + 'bigann/' - - dbsize = 1000 if dataset == 
"bigann1B" else int(dataset[6:-1]) - xb = bvecs_mmap(basedir + 'bigann_base.bvecs') - xq = bvecs_mmap(basedir + 'bigann_query.bvecs') - xt = bvecs_mmap(basedir + 'bigann_learn.bvecs') - # trim xb to correct size - xb = xb[:dbsize * 1000 * 1000] - gt = ivecs_read(basedir + 'gnd/idx_%dM.ivecs' % dbsize) - - elif dataset.startswith("deep"): - basedir = simdir + 'deep1b/' - szsuf = dataset[4:] - if szsuf[-1] == 'M': - dbsize = 10 ** 6 * int(szsuf[:-1]) - elif szsuf == '1B': - dbsize = 10 ** 9 - elif szsuf[-1] == 'k': - dbsize = 1000 * int(szsuf[:-1]) - else: - assert False, "did not recognize suffix " + szsuf - - xt = fvecs_mmap(basedir + "learn.fvecs") - xb = fvecs_mmap(basedir + "base.fvecs") - xq = fvecs_read(basedir + "deep1B_queries.fvecs") - - xb = xb[:dbsize] - - gt_fname = basedir + "%s_groundtruth.ivecs" % dataset - if compute_gt: - gt = do_compute_gt(xb, xq, 100) - print("store", gt_fname) - ivecs_write(gt_fname, gt) - - gt = ivecs_read(gt_fname) - - else: - assert False - - print("dataset %s sizes: B %s Q %s T %s" % ( - dataset, xb.shape, xq.shape, xt.shape)) - - return xt, xb, xq, gt - -################################################################# -# Evaluation -################################################################# - - -def evaluate_DI(D, I, gt): - nq = gt.shape[0] - k = I.shape[1] - rank = 1 - while rank <= k: - recall = (I[:, :rank] == gt[:, :1]).sum() / float(nq) - print("R@%d: %.4f" % (rank, recall), end=' ') - rank *= 10 - - -def evaluate(xq, gt, index, k=100, endl=True): - t0 = time.time() - D, I = index.search(xq, k) - t1 = time.time() - nq = xq.shape[0] - print("\t %8.4f ms per query, " % ( - (t1 - t0) * 1000.0 / nq), end=' ') - rank = 1 - while rank <= k: - recall = (I[:, :rank] == gt[:, :1]).sum() / float(nq) - print("R@%d: %.4f" % (rank, recall), end=' ') - rank *= 10 - if endl: - print() - return D, I diff --git a/benchs/link_and_code/neighbor_codec.py b/benchs/link_and_code/neighbor_codec.py deleted file mode 100755 index 54cad8168a..0000000000 --- a/benchs/link_and_code/neighbor_codec.py +++ /dev/null @@ -1,241 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -""" -This is the training code for the link and code. Especially the -neighbors_kmeans function implements the EM-algorithm to find the -appropriate weightings and cluster them. 
-""" -from __future__ import print_function - -import time -import numpy as np -import faiss - -#---------------------------------------------------------- -# Utils -#---------------------------------------------------------- - -def sanitize(x): - return np.ascontiguousarray(x, dtype='float32') - - -def train_kmeans(x, k, ngpu, max_points_per_centroid=256): - "Runs kmeans on one or several GPUs" - d = x.shape[1] - clus = faiss.Clustering(d, k) - clus.verbose = True - clus.niter = 20 - clus.max_points_per_centroid = max_points_per_centroid - - if ngpu == 0: - index = faiss.IndexFlatL2(d) - else: - res = [faiss.StandardGpuResources() for i in range(ngpu)] - - flat_config = [] - for i in range(ngpu): - cfg = faiss.GpuIndexFlatConfig() - cfg.useFloat16 = False - cfg.device = i - flat_config.append(cfg) - - if ngpu == 1: - index = faiss.GpuIndexFlatL2(res[0], d, flat_config[0]) - else: - indexes = [faiss.GpuIndexFlatL2(res[i], d, flat_config[i]) - for i in range(ngpu)] - index = faiss.IndexReplicas() - for sub_index in indexes: - index.addIndex(sub_index) - - # perform the training - clus.train(x, index) - centroids = faiss.vector_float_to_array(clus.centroids) - - stats = clus.iteration_stats - stats = [stats.at(i) for i in range(stats.size())] - obj = np.array([st.obj for st in stats]) - print("final objective: %.4g" % obj[-1]) - - return centroids.reshape(k, d) - - -#---------------------------------------------------------- -# Learning the codebook from neighbors -#---------------------------------------------------------- - - -# works with both a full Inn table and dynamically generated neighbors - -def get_Inn_shape(Inn): - if type(Inn) != tuple: - return Inn.shape - return Inn[:2] - -def get_neighbor_table(x_coded, Inn, i): - if type(Inn) != tuple: - return x_coded[Inn[i,:],:] - rfn = x_coded - M, d = rfn.M, rfn.index.d - out = np.zeros((M + 1, d), dtype='float32') - int_i = int(i) - rfn.get_neighbor_table(int_i, faiss.swig_ptr(out)) - _, _, sq = Inn - return out[:, sq * rfn.dsub : (sq + 1) * rfn.dsub] - - -# Function that produces the best regression values from the vector -# and its neighbors -def regress_from_neighbors (x, x_coded, Inn): - (N, knn) = get_Inn_shape(Inn) - betas = np.zeros((N,knn)) - t0 = time.time() - for i in range (N): - xi = x[i,:] - NNi = get_neighbor_table(x_coded, Inn, i) - betas[i,:] = np.linalg.lstsq(NNi.transpose(), xi, rcond=0.01)[0] - if i % (N / 10) == 0: - print ("[%d:%d] %6.3fs" % (i, i + N / 10, time.time() - t0)) - return betas - - - -# find the best beta minimizing ||x-x_coded[Inn,:]*beta||^2 -def regress_opt_beta (x, x_coded, Inn): - (N, knn) = get_Inn_shape(Inn) - d = x.shape[1] - - # construct the linear system to be solved - X = np.zeros ((d*N)) - Y = np.zeros ((d*N, knn)) - for i in range (N): - X[i*d:(i+1)*d] = x[i,:] - neighbor_table = get_neighbor_table(x_coded, Inn, i) - Y[i*d:(i+1)*d, :] = neighbor_table.transpose() - beta_opt = np.linalg.lstsq(Y, X, rcond=0.01)[0] - return beta_opt - - -# Find the best encoding by minimizing the reconstruction error using -# a set of pre-computed beta values -def assign_beta (beta_centroids, x, x_coded, Inn, verbose=True): - if type(Inn) == tuple: - return assign_beta_2(beta_centroids, x, x_coded, Inn) - (N, knn) = Inn.shape - x_ibeta = np.zeros ((N), dtype='int32') - t0= time.time() - for i in range (N): - NNi = x_coded[Inn[i,:]] - # Consider all possible betas for the encoding and compute the - # encoding error - x_reg_all = np.dot (beta_centroids, NNi) - err = ((x_reg_all - x[i,:]) ** 2).sum(axis=1) - 
x_ibeta[i] = err.argmin() - if verbose: - if i % (N / 10) == 0: - print ("[%d:%d] %6.3fs" % (i, i + N / 10, time.time() - t0)) - return x_ibeta - - -# Reconstruct a set of vectors using the beta_centroids, the -# assignment, the encoded neighbors identified by the list Inn (which -# includes the vector itself) -def recons_from_neighbors (beta_centroids, x_ibeta, x_coded, Inn): - (N, knn) = Inn.shape - x_rec = np.zeros(x_coded.shape) - t0= time.time() - for i in range (N): - NNi = x_coded[Inn[i,:]] - x_rec[i, :] = np.dot (beta_centroids[x_ibeta[i]], NNi) - if i % (N / 10) == 0: - print ("[%d:%d] %6.3fs" % (i, i + N / 10, time.time() - t0)) - return x_rec - - -# Compute a EM-like algorithm trying at optimizing the beta such as they -# minimize the reconstruction error from the neighbors -def neighbors_kmeans (x, x_coded, Inn, K, ngpus=1, niter=5): - # First compute centroids using a regular k-means algorithm - betas = regress_from_neighbors (x, x_coded, Inn) - beta_centroids = train_kmeans( - sanitize(betas), K, ngpus, max_points_per_centroid=1000000) - _, knn = get_Inn_shape(Inn) - d = x.shape[1] - - rs = np.random.RandomState() - for iter in range(niter): - print('iter', iter) - idx = assign_beta (beta_centroids, x, x_coded, Inn, verbose=False) - - hist = np.bincount(idx) - for cl0 in np.where(hist == 0)[0]: - print(" cluster %d empty, split" % cl0, end=' ') - cl1 = idx[np.random.randint(idx.size)] - pos = np.nonzero (idx == cl1)[0] - pos = rs.choice(pos, pos.size / 2) - print(" cl %d -> %d + %d" % (cl1, len(pos), hist[cl1] - len(pos))) - idx[pos] = cl0 - hist = np.bincount(idx) - - tot_err = 0 - for k in range (K): - pos = np.nonzero (idx == k)[0] - npos = pos.shape[0] - - X = np.zeros (d*npos) - Y = np.zeros ((d*npos, knn)) - - for i in range(npos): - X[i*d:(i+1)*d] = x[pos[i],:] - neighbor_table = get_neighbor_table(x_coded, Inn, pos[i]) - Y[i*d:(i+1)*d, :] = neighbor_table.transpose() - sol, residuals, _, _ = np.linalg.lstsq(Y, X, rcond=0.01) - if residuals.size > 0: - tot_err += residuals.sum() - beta_centroids[k, :] = sol - print(' err=%g' % tot_err) - return beta_centroids - - -# assign the betas in C++ -def assign_beta_2(beta_centroids, x, rfn, Inn): - _, _, sq = Inn - if rfn.k == 1: - return np.zeros(x.shape[0], dtype=int) - # add dummy dimensions to beta_centroids and x - all_beta_centroids = np.zeros( - (rfn.nsq, rfn.k, rfn.M + 1), dtype='float32') - all_beta_centroids[sq] = beta_centroids - all_x = np.zeros((len(x), rfn.d), dtype='float32') - all_x[:, sq * rfn.dsub : (sq + 1) * rfn.dsub] = x - rfn.codes.clear() - rfn.ntotal = 0 - faiss.copy_array_to_vector( - all_beta_centroids.ravel(), rfn.codebook) - rfn.add_codes(len(x), faiss.swig_ptr(all_x)) - codes = faiss.vector_to_array(rfn.codes) - codes = codes.reshape(-1, rfn.nsq) - return codes[:, sq] - - -####################################################### -# For usage from bench_storages.py - -def train_beta_codebook(rfn, xb_full, niter=10): - beta_centroids = [] - for sq in range(rfn.nsq): - d0, d1 = sq * rfn.dsub, (sq + 1) * rfn.dsub - print("training subquantizer %d/%d on dimensions %d:%d" % ( - sq, rfn.nsq, d0, d1)) - beta_centroids_i = neighbors_kmeans( - xb_full[:, d0:d1], rfn, (xb_full.shape[0], rfn.M + 1, sq), - rfn.k, - ngpus=0, niter=niter) - beta_centroids.append(beta_centroids_i) - rfn.ntotal = 0 - rfn.codes.clear() - rfn.codebook.clear() - return np.stack(beta_centroids) diff --git a/faiss/IndexHNSW.cpp b/faiss/IndexHNSW.cpp index 9a67332d67..3325c8c0e1 100644 --- a/faiss/IndexHNSW.cpp +++ b/faiss/IndexHNSW.cpp @@ 
-307,7 +307,7 @@ void hnsw_search( FAISS_THROW_IF_NOT_MSG(params, "params type invalid"); efSearch = params->efSearch; } - size_t n1 = 0, n2 = 0, n3 = 0, ndis = 0, nreorder = 0; + size_t n1 = 0, n2 = 0, ndis = 0; idx_t check_period = InterruptCallback::get_period_hint( hnsw.max_level * index->d * efSearch); @@ -323,7 +323,7 @@ void hnsw_search( std::unique_ptr dis( storage_distance_computer(index->storage)); -#pragma omp for reduction(+ : n1, n2, n3, ndis, nreorder) schedule(guided) +#pragma omp for reduction(+ : n1, n2, ndis) schedule(guided) for (idx_t i = i0; i < i1; i++) { res.begin(i); dis->set_query(x + i * index->d); @@ -331,16 +331,14 @@ void hnsw_search( HNSWStats stats = hnsw.search(*dis, res, vt, params); n1 += stats.n1; n2 += stats.n2; - n3 += stats.n3; ndis += stats.ndis; - nreorder += stats.nreorder; res.end(); } } InterruptCallback::check(); } - hnsw_stats.combine({n1, n2, n3, ndis, nreorder}); + hnsw_stats.combine({n1, n2, ndis}); } } // anonymous namespace @@ -800,7 +798,7 @@ void IndexHNSW2Level::search( IndexHNSW::search(n, x, k, distances, labels); } else { // "mixed" search - size_t n1 = 0, n2 = 0, n3 = 0, ndis = 0, nreorder = 0; + size_t n1 = 0, n2 = 0, ndis = 0; const IndexIVFPQ* index_ivfpq = dynamic_cast(storage); @@ -832,7 +830,7 @@ void IndexHNSW2Level::search( int candidates_size = hnsw.upper_beam; MinimaxHeap candidates(candidates_size); -#pragma omp for reduction(+ : n1, n2, n3, ndis, nreorder) +#pragma omp for reduction(+ : n1, n2, ndis) for (idx_t i = 0; i < n; i++) { idx_t* idxi = labels + i * k; float* simi = distances + i * k; @@ -877,9 +875,7 @@ void IndexHNSW2Level::search( k); n1 += search_stats.n1; n2 += search_stats.n2; - n3 += search_stats.n3; ndis += search_stats.ndis; - nreorder += search_stats.nreorder; vt.advance(); vt.advance(); @@ -888,7 +884,7 @@ void IndexHNSW2Level::search( } } - hnsw_stats.combine({n1, n2, n3, ndis, nreorder}); + hnsw_stats.combine({n1, n2, ndis}); } } diff --git a/faiss/gpu/GpuIndex.h b/faiss/gpu/GpuIndex.h index 36de98c098..cc10f21589 100644 --- a/faiss/gpu/GpuIndex.h +++ b/faiss/gpu/GpuIndex.h @@ -84,19 +84,14 @@ class GpuIndex : public faiss::Index { /// `x` and `labels` can be resident on the CPU or any GPU; copies are /// performed as needed - void assign( - idx_t n, - const float* x, - idx_t* labels, - // faiss::Index has idx_t for k - idx_t k = 1) const override; + void assign(idx_t n, const float* x, idx_t* labels, idx_t k = 1) + const override; /// `x`, `distances` and `labels` can be resident on the CPU or any /// GPU; copies are performed as needed void search( idx_t n, const float* x, - // faiss::Index has idx_t for k idx_t k, float* distances, idx_t* labels, @@ -107,7 +102,6 @@ class GpuIndex : public faiss::Index { void search_and_reconstruct( idx_t n, const float* x, - // faiss::Index has idx_t for k idx_t k, float* distances, idx_t* labels, diff --git a/faiss/impl/HNSW.cpp b/faiss/impl/HNSW.cpp index a9fb9daf5b..b1324e1211 100644 --- a/faiss/impl/HNSW.cpp +++ b/faiss/impl/HNSW.cpp @@ -664,7 +664,7 @@ int search_from_candidates( if (candidates.size() == 0) { stats.n2++; } - stats.n3 += ndis; + stats.ndis += ndis; } return nres; @@ -793,7 +793,7 @@ std::priority_queue search_from_candidate_unbounded( if (candidates.size() == 0) { ++stats.n2; } - stats.n3 += ndis; + stats.ndis += ndis; return top_candidates; } diff --git a/faiss/impl/HNSW.h b/faiss/impl/HNSW.h index cb6b422c3d..8261423cdd 100644 --- a/faiss/impl/HNSW.h +++ b/faiss/impl/HNSW.h @@ -230,30 +230,20 @@ struct HNSW { }; struct HNSWStats { - size_t 
n1, n2, n3; - size_t ndis; - size_t nreorder; - - HNSWStats( - size_t n1 = 0, - size_t n2 = 0, - size_t n3 = 0, - size_t ndis = 0, - size_t nreorder = 0) - : n1(n1), n2(n2), n3(n3), ndis(ndis), nreorder(nreorder) {} + size_t n1 = 0; /// numbner of vectors searched + size_t n2 = + 0; /// number of queries for which the candidate list is exhasted + size_t ndis = 0; /// number of distances computed void reset() { - n1 = n2 = n3 = 0; + n1 = n2 = 0; ndis = 0; - nreorder = 0; } void combine(const HNSWStats& other) { n1 += other.n1; n2 += other.n2; - n3 += other.n3; ndis += other.ndis; - nreorder += other.nreorder; } }; diff --git a/tests/test_graph_based.py b/tests/test_graph_based.py index 914fac3ff1..dd4212d717 100644 --- a/tests/test_graph_based.py +++ b/tests/test_graph_based.py @@ -123,6 +123,16 @@ def test_hnsw_IP(self): mask = Iref[:, 0] == Ihnsw[:, 0] assert np.allclose(Dref[mask, 0], Dhnsw[mask, 0]) + def test_ndis_stats(self): + d = self.xq.shape[1] + + index = faiss.IndexHNSWFlat(d, 16) + index.add(self.xb) + stats = faiss.cvar.hnsw_stats + stats.reset() + Dhnsw, Ihnsw = index.search(self.xq, 1) + self.assertGreater(stats.ndis, len(self.xq) * index.hnsw.efSearch) + class TestNSG(unittest.TestCase): From 798427c019d1596ff720fc5785d5fcacf31a9b7c Mon Sep 17 00:00:00 2001 From: Kumar Saurabh Arora Date: Fri, 22 Mar 2024 14:58:14 -0700 Subject: [PATCH 016/116] Handling FaissException in few destructors of ResultHandler.h (#3311) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3311 **Context** [Issue 2948](https://github.com/facebookresearch/faiss/issues/2948) highlights potential issue of calling allocation on result handler which may throw exception but it is not handled. **In this diff**, I observed two calls where we may potentially call allocation in ResultHandler.h and handled FaissException. 1/ partial result when finalized in ~SingleResultHandler 2/ partial result when merged in ~RangeSearchBlockResultHandler Reviewed By: junjieqi Differential Revision: D55258213 fbshipit-source-id: 259be472e73619b2fcb0ea480d6d3486affeafdf --- faiss/impl/ResultHandler.h | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/faiss/impl/ResultHandler.h b/faiss/impl/ResultHandler.h index 270de8dcd6..713fe8e49f 100644 --- a/faiss/impl/ResultHandler.h +++ b/faiss/impl/ResultHandler.h @@ -12,8 +12,10 @@ #pragma once #include +#include #include #include +#include namespace faiss { @@ -504,7 +506,15 @@ struct RangeSearchBlockResultHandler : BlockResultHandler { void end() {} ~SingleResultHandler() { - pres.finalize(); + try { + // finalize the partial result + pres.finalize(); + } catch (const faiss::FaissException& e) { + // Do nothing if allocation fails in finalizing partial results. +#ifndef NDEBUG + std::cerr << e.what() << std::endl; +#endif + } } }; @@ -559,8 +569,15 @@ struct RangeSearchBlockResultHandler : BlockResultHandler { } ~RangeSearchBlockResultHandler() { - if (partial_results.size() > 0) { - RangeSearchPartialResult::merge(partial_results); + try { + if (partial_results.size() > 0) { + RangeSearchPartialResult::merge(partial_results); + } + } catch (const faiss::FaissException& e) { + // Do nothing if allocation fails in merge. 
+#ifndef NDEBUG + std::cerr << e.what() << std::endl; +#endif } } }; From af5793cf128168520564a21c7b4dac9e655cee36 Mon Sep 17 00:00:00 2001 From: Kumar Saurabh Arora Date: Fri, 22 Mar 2024 15:04:01 -0700 Subject: [PATCH 017/116] Adding test for IndexBinaryFlat.reconstruct_n() (#3310) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3310 **Context** [Issue 2751](https://github.com/facebookresearch/faiss/issues/2751) is already fixed as class wrappers has replacement definition of reconstruct_n in handle_IndexBinary. **In this diff**, Writing test test_reconstruct for binary index to validate fix for above issue. Reviewed By: junjieqi Differential Revision: D55168600 fbshipit-source-id: b62dc5fa89d65b843c52faa7456f046142e34421 --- tests/test_index_binary.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/test_index_binary.py b/tests/test_index_binary.py index 312530ad46..b505e0ba1c 100644 --- a/tests/test_index_binary.py +++ b/tests/test_index_binary.py @@ -143,6 +143,15 @@ def test_range_search(self): # nb tests is actually low... self.assertTrue(nt1 > 19 and nt2 > 19) + def test_reconstruct(self): + index = faiss.IndexBinaryFlat(64) + input_vector = np.random.randint(0, 255, size=(10, index.code_size)).astype("uint8") + index.add(input_vector) + + reconstructed_vector = index.reconstruct_n(0, 4) + assert reconstructed_vector.shape == (4, index.code_size) + assert np.all(input_vector[:4] == reconstructed_vector) + class TestBinaryIVF(unittest.TestCase): From 0c96b0d7e0e399458e7fb0703015f03dbecb614d Mon Sep 17 00:00:00 2001 From: Gergely Szilvasy Date: Mon, 25 Mar 2024 04:02:23 -0700 Subject: [PATCH 018/116] enable rapidsai-nightly channel for libraft (#3317) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3317 libraft packages are first published in rapidsai-nightly and moved to rapidsai after release, at which point they're removed from rapidsai-nightly In this diff we enable both channels with a preference to rapidsai (since it's before rapidsai-nightly on the command line). 
Reviewed By: mlomeli1 Differential Revision: D55310143 fbshipit-source-id: b85e0fda86a442f435d985ace1d7eb37209c74e1 --- .circleci/config.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 94aad3b11e..289f812526 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -168,7 +168,7 @@ jobs: command: | cd conda conda build faiss-gpu-raft --variants '{ "cudatoolkit": "<>", "c_compiler_version": "<>", "cxx_compiler_version": "<>" }' \ - -c pytorch -c nvidia/label/cuda-<> -c nvidia -c rapidsai -c conda-forge + -c pytorch -c nvidia/label/cuda-<> -c nvidia -c rapidsai -c rapidsai-nightly -c conda-forge - when: condition: and: @@ -182,7 +182,7 @@ jobs: command: | cd conda conda build faiss-gpu-raft --variants '{ "cudatoolkit": "<>", "c_compiler_version": "<>", "cxx_compiler_version": "<>" }' \ - --user pytorch --label <> -c pytorch -c nvidia/label/cuda-<> -c nvidia -c rapidsai -c conda-forge + --user pytorch --label <> -c pytorch -c nvidia/label/cuda-<> -c nvidia -c rapidsai -c rapidsai-nightly -c conda-forge build_cmake: parameters: From 14b8af6e736fdfff636584841e61e0161d8ceadd Mon Sep 17 00:00:00 2001 From: Junjie Qi Date: Mon, 25 Mar 2024 11:19:40 -0700 Subject: [PATCH 019/116] Fix IVFPQFastScan decode function (#3312) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3312 as the [#issue3258](https://github.com/facebookresearch/faiss/issues/3258) mentioned, the IVFPQFastScan should have same decoding result as IVFPQ. However, current result is not as expected. In this PR/Diff, we are going to fix the decoding function Reviewed By: mdouze Differential Revision: D55264781 fbshipit-source-id: dfdae9eabceadfc5a3ebb851930d71ce3c1c654d --- faiss/IndexIVF.h | 8 +++++ faiss/IndexIVFPQFastScan.cpp | 23 +++++++++++-- tests/test_fast_scan_ivf.py | 63 +++++++++++++++++++++++++++--------- 3 files changed, 77 insertions(+), 17 deletions(-) diff --git a/faiss/IndexIVF.h b/faiss/IndexIVF.h index 45c65ef839..185561d086 100644 --- a/faiss/IndexIVF.h +++ b/faiss/IndexIVF.h @@ -433,6 +433,14 @@ struct IndexIVF : Index, IndexIVFInterface { /* The standalone codec interface (except sa_decode that is specific) */ size_t sa_code_size() const override; + + /** encode a set of vectors + * sa_encode will call encode_vector with include_listno=true + * @param n nb of vectors to encode + * @param x the vectors to encode + * @param bytes output array for the codes + * @return nb of bytes written to codes + */ void sa_encode(idx_t n, const float* x, uint8_t* bytes) const override; IndexIVF(); diff --git a/faiss/IndexIVFPQFastScan.cpp b/faiss/IndexIVFPQFastScan.cpp index d069db1354..2844ae4936 100644 --- a/faiss/IndexIVFPQFastScan.cpp +++ b/faiss/IndexIVFPQFastScan.cpp @@ -286,9 +286,28 @@ void IndexIVFPQFastScan::compute_LUT( } } -void IndexIVFPQFastScan::sa_decode(idx_t n, const uint8_t* bytes, float* x) +void IndexIVFPQFastScan::sa_decode(idx_t n, const uint8_t* codes, float* x) const { - pq.decode(bytes, x, n); + size_t coarse_size = coarse_code_size(); + +#pragma omp parallel if (n > 1) + { + std::vector residual(d); + +#pragma omp for + for (idx_t i = 0; i < n; i++) { + const uint8_t* code = codes + i * (code_size + coarse_size); + int64_t list_no = decode_listno(code); + float* xi = x + i * d; + pq.decode(code + coarse_size, xi); + if (by_residual) { + quantizer->reconstruct(list_no, residual.data()); + for (size_t j = 0; j < d; j++) { + xi[j] += residual[j]; + } + } + } + } } } // namespace faiss diff --git 
a/tests/test_fast_scan_ivf.py b/tests/test_fast_scan_ivf.py index d6dad8fec3..f48dd2e47a 100644 --- a/tests/test_fast_scan_ivf.py +++ b/tests/test_fast_scan_ivf.py @@ -84,9 +84,7 @@ def sp(x): b = btab[0] dis_new = self.compute_dis_quant(codes, LUTq, biasq, a, b) - # print(a, b, dis_ref.sum()) avg_realtive_error = np.abs(dis_new - dis_ref).sum() / dis_ref.sum() - # print('a=', a, 'avg_relative_error=', avg_realtive_error) self.assertLess(avg_realtive_error, 0.0005) def test_no_residual_ip(self): @@ -228,8 +226,6 @@ def eval_quant_loss(self, by_residual, metric=faiss.METRIC_L2): m3 = three_metrics(Da, Ia, Db, Ib) - - # print(by_residual, metric, recall_at_1, recall_at_10, intersection_at_10) ref_results = { (True, 1): [0.985, 1.0, 9.872], (True, 0): [ 0.987, 1.0, 9.914], @@ -261,6 +257,7 @@ class TestEquivPQ(unittest.TestCase): def test_equiv_pq(self): ds = datasets.SyntheticDataset(32, 2000, 200, 4) + xq = ds.get_queries() index = faiss.index_factory(32, "IVF1,PQ16x4np") index.by_residual = False @@ -268,7 +265,7 @@ def test_equiv_pq(self): index.quantizer.add(np.zeros((1, 32), dtype='float32')) index.train(ds.get_train()) index.add(ds.get_database()) - Dref, Iref = index.search(ds.get_queries(), 4) + Dref, Iref = index.search(xq, 4) index_pq = faiss.index_factory(32, "PQ16x4np") index_pq.pq = index.pq @@ -276,21 +273,64 @@ def test_equiv_pq(self): index_pq.codes = faiss. downcast_InvertedLists( index.invlists).codes.at(0) index_pq.ntotal = index.ntotal - Dnew, Inew = index_pq.search(ds.get_queries(), 4) + Dnew, Inew = index_pq.search(xq, 4) np.testing.assert_array_equal(Iref, Inew) np.testing.assert_array_equal(Dref, Dnew) index_pq2 = faiss.IndexPQFastScan(index_pq) index_pq2.implem = 12 - Dref, Iref = index_pq2.search(ds.get_queries(), 4) + Dref, Iref = index_pq2.search(xq, 4) index2 = faiss.IndexIVFPQFastScan(index) index2.implem = 12 - Dnew, Inew = index2.search(ds.get_queries(), 4) + Dnew, Inew = index2.search(xq, 4) np.testing.assert_array_equal(Iref, Inew) np.testing.assert_array_equal(Dref, Dnew) + # test encode and decode + + np.testing.assert_array_equal( + index_pq.sa_encode(xq), + index2.sa_encode(xq) + ) + + np.testing.assert_array_equal( + index_pq.sa_decode(index_pq.sa_encode(xq)), + index2.sa_decode(index2.sa_encode(xq)) + ) + + np.testing.assert_array_equal( + ((index_pq.sa_decode(index_pq.sa_encode(xq)) - xq) ** 2).sum(1), + ((index2.sa_decode(index2.sa_encode(xq)) - xq) ** 2).sum(1) + ) + + def test_equiv_pq_encode_decode(self): + ds = datasets.SyntheticDataset(32, 1000, 200, 10) + xq = ds.get_queries() + + index_ivfpq = faiss.index_factory(ds.d, "IVF10,PQ8x4np") + index_ivfpq.train(ds.get_train()) + + index_ivfpqfs = faiss.IndexIVFPQFastScan(index_ivfpq) + + np.testing.assert_array_equal( + index_ivfpq.sa_encode(xq), + index_ivfpqfs.sa_encode(xq) + ) + + np.testing.assert_array_equal( + index_ivfpq.sa_decode(index_ivfpq.sa_encode(xq)), + index_ivfpqfs.sa_decode(index_ivfpqfs.sa_encode(xq)) + ) + + np.testing.assert_array_equal( + ((index_ivfpq.sa_decode(index_ivfpq.sa_encode(xq)) - xq) ** 2) + .sum(1), + ((index_ivfpqfs.sa_decode(index_ivfpqfs.sa_encode(xq)) - xq) ** 2) + .sum(1) + ) + class TestIVFImplem12(unittest.TestCase): @@ -463,7 +503,6 @@ def do_test(self, by_residual=False, metric=faiss.METRIC_L2, d=32, bbs=32): Dnew, Inew = index2.search(ds.get_queries(), 10) m3 = three_metrics(Dref, Iref, Dnew, Inew) - # print((by_residual, metric, d), ":", m3) ref_m3_tab = { (True, 1, 32): (0.995, 1.0, 9.91), (True, 0, 32): (0.99, 1.0, 9.91), @@ -554,7 +593,6 @@ def 
subtest_accuracy(self, aq, st, by_residual, implem, metric_type='L2'): recall_ref = (Iref == gt).sum() / nq recall1 = (I1 == gt).sum() / nq - print(aq, st, by_residual, implem, metric_type, recall_ref, recall1) assert abs(recall_ref - recall1) < 0.051 def xx_test_accuracy(self): @@ -599,7 +637,6 @@ def subtest_rescale_accuracy(self, aq, st, by_residual, implem): recall_ref = (Iref == gt).sum() / nq recall1 = (I1 == gt).sum() / nq - print(aq, st, by_residual, implem, recall_ref, recall1) assert abs(recall_ref - recall1) < 0.05 def xx_test_rescale_accuracy(self): @@ -624,7 +661,6 @@ def subtest_from_ivfaq(self, implem): nq = Iref.shape[0] recall_ref = (Iref == gt).sum() / nq recall1 = (I1 == gt).sum() / nq - print(recall_ref, recall1) assert abs(recall_ref - recall1) < 0.02 def test_from_ivfaq(self): @@ -763,7 +799,6 @@ def subtest_accuracy(self, paq): recall_ref = (Iref == gt).sum() / nq recall1 = (I1 == gt).sum() / nq - print(paq, recall_ref, recall1) assert abs(recall_ref - recall1) < 0.05 def test_accuracy_PLSQ(self): @@ -847,7 +882,6 @@ def do_test(self, metric=faiss.METRIC_L2): # find a reasonable radius D, I = index.search(ds.get_queries(), 10) radius = np.median(D[:, -1]) - # print("radius=", radius) lims1, D1, I1 = index.range_search(ds.get_queries(), radius) index2 = faiss.IndexIVFPQFastScan(index) @@ -860,7 +894,6 @@ def do_test(self, metric=faiss.METRIC_L2): for i in range(ds.nq): ref = set(I1[lims1[i]: lims1[i + 1]]) new = set(I2[lims2[i]: lims2[i + 1]]) - print(ref, new) nmiss += len(ref - new) nextra += len(new - ref) From 55dc880c2f813637c5a35cdc7fcae39f30c2c71e Mon Sep 17 00:00:00 2001 From: Junjie Qi Date: Wed, 27 Mar 2024 10:53:49 -0700 Subject: [PATCH 020/116] Change cmake to build googletest from source (#3319) Summary: In the https://github.com/facebookresearch/faiss/pull/3278, we to find_package to link to GTest. However, it needs to have googletest to build independently. Not everyone builds their googletest locally first. In this diff, we still try to build googletest from source and combine find_package together. Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3319 Test Plan: STEP 1: Install deps ``` conda install -y -q python=3.11 cmake make swig=4.0.2 mkl=2023 mkl-devel=2023 numpy scipy pytest gxx_linux-64 sysroot_linux-64 ``` STEP2: Compile ``` cmake -B build \ -DBUILD_TESTING=ON \ -DBUILD_SHARED_LIBS=ON \ -DFAISS_ENABLE_GPU=OFF \ -DFAISS_ENABLE_RAFT=OFF \ -DFAISS_OPT_LEVEL=avx2 \ -DFAISS_ENABLE_C_API=ON \ -DPYTHON_EXECUTABLE=$(which python) \ -DCMAKE_BUILD_TYPE=Release \ -DBLA_VENDOR=Intel10_64_dyn \ -DCMAKE_CUDA_FLAGS="-gencode arch=compute_75,code=sm_75" \ . 
``` Reviewed By: algoriddle Differential Revision: D55358059 Pulled By: junjieqi fbshipit-source-id: 95ad4a745238b88b438728de64173f99d3d50dbe --- tests/CMakeLists.txt | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 0cb8219096..66ec9f74a5 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -59,6 +59,34 @@ if(FAISS_OPT_LEVEL STREQUAL "avx512") target_link_libraries(faiss_test PRIVATE faiss_avx512) endif() +include(FetchContent) +FetchContent_Declare( + googletest + GIT_REPOSITORY https://github.com/google/googletest.git + GIT_TAG 58d77fa8070e8cec2dc1ed015d66b454c8d78850 # release-1.12.1 + OVERRIDE_FIND_PACKAGE) +set(BUILD_GMOCK CACHE BOOL OFF) +set(INSTALL_GTEST CACHE BOOL OFF) +FetchContent_MakeAvailable(googletest) + +if(NOT EXISTS ${CMAKE_FIND_PACKAGE_REDIRECTS_DIR}/gtest-config.cmake + AND NOT EXISTS ${CMAKE_FIND_PACKAGE_REDIRECTS_DIR}/GTestConfig.cmake) + file( + WRITE ${CMAKE_FIND_PACKAGE_REDIRECTS_DIR}/gtest-config.cmake + [=[ +include(CMakeFindDependencyMacro) +find_dependency(googletest) +if(NOT TARGET GTest::GTest) + add_library(GTest::GTest INTERFACE IMPORTED) + target_link_libraries(GTest::GTest INTERFACE GTest::gtest) +endif() +if(NOT TARGET GTest::Main) + add_library(GTest::Main INTERFACE IMPORTED) + target_link_libraries(GTest::Main INTERFACE GTest::gtest_main) +endif() +]=]) +endif() + find_package(OpenMP REQUIRED) find_package(GTest CONFIG REQUIRED) From 03db694aa799fa5f5fc036743dfcaea5e4cc82bb Mon Sep 17 00:00:00 2001 From: Chip-Kerchner Date: Thu, 28 Mar 2024 03:38:02 -0700 Subject: [PATCH 021/116] Fix problems when using 64-bit integers. (#3322) Summary: Fixes problem when compiling OpenBLAS with INTERFACE64=1 (64-bit integers). Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3322 Reviewed By: algoriddle Differential Revision: D55469397 Pulled By: mdouze fbshipit-source-id: 14d916fb074f6ea0f591e0324bb7b8674a624473 --- faiss/CMakeLists.txt | 5 ++++- faiss/impl/LocalSearchQuantizer.cpp | 16 ++++++++-------- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/faiss/CMakeLists.txt b/faiss/CMakeLists.txt index a890a46f11..33e1849568 100644 --- a/faiss/CMakeLists.txt +++ b/faiss/CMakeLists.txt @@ -290,7 +290,10 @@ if(WIN32) target_compile_definitions(faiss_avx512 PRIVATE FAISS_MAIN_LIB) endif() -target_compile_definitions(faiss PRIVATE FINTEGER=int) +string(FIND "${CMAKE_CXX_FLAGS}" "FINTEGER" finteger_idx) +if (${finteger_idx} EQUAL -1) + target_compile_definitions(faiss PRIVATE FINTEGER=int) +endif() target_compile_definitions(faiss_avx2 PRIVATE FINTEGER=int) target_compile_definitions(faiss_avx512 PRIVATE FINTEGER=int) diff --git a/faiss/impl/LocalSearchQuantizer.cpp b/faiss/impl/LocalSearchQuantizer.cpp index 8da989a9a4..943fe32c9d 100644 --- a/faiss/impl/LocalSearchQuantizer.cpp +++ b/faiss/impl/LocalSearchQuantizer.cpp @@ -104,10 +104,10 @@ int dgemm_( namespace { -void fmat_inverse(float* a, int n) { - int info; - int lwork = n * n; - std::vector ipiv(n); +void fmat_inverse(float* a, FINTEGER n) { + FINTEGER info; + FINTEGER lwork = n * n; + std::vector ipiv(n); std::vector workspace(lwork); sgetrf_(&n, &n, a, &n, ipiv.data(), &info); @@ -123,10 +123,10 @@ void dfvec_add(size_t d, const double* a, const float* b, double* c) { } } -void dmat_inverse(double* a, int n) { - int info; - int lwork = n * n; - std::vector ipiv(n); +void dmat_inverse(double* a, FINTEGER n) { + FINTEGER info; + FINTEGER lwork = n * n; + std::vector ipiv(n); std::vector 
workspace(lwork); dgetrf_(&n, &n, a, &n, ipiv.data(), &info); From d6854136afa3b987defb2c8e8108bad76ebdb949 Mon Sep 17 00:00:00 2001 From: Junjie Qi Date: Thu, 28 Mar 2024 22:54:51 -0700 Subject: [PATCH 022/116] Fix faiss swig build with version > 4.2.x (#3315) Summary: Currently, faiss can't build with swig version > 4.2.x. As the https://github.com/facebookresearch/faiss/issues/3239 mentioned. Swig removed the support for 32bit https://github.com/swig/swig/commit/9fb3a4939e4ec528f050057d8ccd743a066222ac. So SWIGTYPE_p_unsigned_long_long isn't supported any more. In this diff, we are going to remove the unsupported type from Faiss swig. Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3315 Test Plan: STEP 1: create a new conda env ``` conda create --name faiss_swig conda activate faiss_swig ``` STEP 2: install dependecies from conda-forge ``` conda install -y -q python=3.11 cmake make swig mkl=2023 mkl-devel=2023 numpy scipy pytest gxx_linux-64 sysroot_linux-64=2.28 -c conda-forge ``` STEP 3: CMAKE ``` cmake -B build \ -DBUILD_TESTING=ON \ -DBUILD_SHARED_LIBS=ON \ -DFAISS_ENABLE_GPU=OFF \ -DFAISS_ENABLE_RAFT=OFF \ -DFAISS_OPT_LEVEL=avx512 \ -DFAISS_ENABLE_C_API=ON \ -DPYTHON_EXECUTABLE=$(which python) \ -DCMAKE_BUILD_TYPE=Release \ -DBLA_VENDOR=Intel10_64_dyn \ -DCMAKE_CUDA_FLAGS="-gencode arch=compute_75,code=sm_75" \ . ``` STEP 4: build ``` make -C build -j faiss && make -C build -j swigfaiss ``` Screenshot 2024-03-25 at 12 24 16 AM Reviewed By: algoriddle Differential Revision: D55304004 Pulled By: junjieqi fbshipit-source-id: e958009dc637aa33b0e1a574a16a846a4abb1525 --- .circleci/config.yml | 4 ++-- conda/faiss-gpu-raft/meta.yaml | 2 +- faiss/python/swigfaiss.swig | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 289f812526..549e4a2793 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -224,7 +224,7 @@ jobs: - run: name: Install env using main channel command: | - conda install -y -q python=3.11 cmake make swig=4.0.2 mkl=2023 mkl-devel=2023 numpy scipy pytest gxx_linux-64 sysroot_linux-64 + conda install -y -q python=3.11 cmake make swig mkl=2023 mkl-devel=2023 numpy scipy pytest gxx_linux-64=11.2 sysroot_linux-64 - when: condition: equal: [ "ON", << parameters.raft >> ] @@ -232,7 +232,7 @@ jobs: - run: name: Install env using conda-forge channel command: | - conda install -y -q python=3.11 cmake make swig=4.0.2 mkl=2023 mkl-devel=2023 numpy scipy pytest gxx_linux-64 sysroot_linux-64=2.28 libraft cuda-version=11.8 cuda-toolkit -c rapidsai-nightly -c "nvidia/label/cuda-11.8.0" -c conda-forge + conda install -y -q python=3.11 cmake make swig mkl=2023 mkl-devel=2023 numpy scipy pytest gxx_linux-64=11.2 sysroot_linux-64=2.28 libraft cuda-version=11.8 cuda-toolkit -c rapidsai-nightly -c "nvidia/label/cuda-11.8.0" -c conda-forge - when: condition: and: diff --git a/conda/faiss-gpu-raft/meta.yaml b/conda/faiss-gpu-raft/meta.yaml index 12dfc889b1..c43e7656c3 100644 --- a/conda/faiss-gpu-raft/meta.yaml +++ b/conda/faiss-gpu-raft/meta.yaml @@ -84,7 +84,7 @@ outputs: build: - {{ compiler('cxx') }} - sysroot_linux-64 =2.17 # [linux64] - - swig =4.0.2 + - swig - cmake >=3.23.1 - make # [not win] host: diff --git a/faiss/python/swigfaiss.swig b/faiss/python/swigfaiss.swig index fb7f50dd2e..0ea93609e3 100644 --- a/faiss/python/swigfaiss.swig +++ b/faiss/python/swigfaiss.swig @@ -1022,14 +1022,14 @@ PyObject *swig_ptr (PyObject *a) return SWIG_NewPointerObj(data, SWIGTYPE_p_bool, 0); } 
if(PyArray_TYPE(ao) == NPY_UINT64) { -#ifdef SWIGWORDSIZE64 +#if (__SIZEOF_LONG__ == 8) return SWIG_NewPointerObj(data, SWIGTYPE_p_unsigned_long, 0); #else return SWIG_NewPointerObj(data, SWIGTYPE_p_unsigned_long_long, 0); #endif } if(PyArray_TYPE(ao) == NPY_INT64) { -#ifdef SWIGWORDSIZE64 +#if (__SIZEOF_LONG__ == 8) return SWIG_NewPointerObj(data, SWIGTYPE_p_long, 0); #else return SWIG_NewPointerObj(data, SWIGTYPE_p_long_long, 0); From d99f07e91a19bfffee8c482b117ded6699ca82bc Mon Sep 17 00:00:00 2001 From: Alexandr Guzhva Date: Fri, 29 Mar 2024 01:25:24 -0700 Subject: [PATCH 023/116] AVX512 for PQFastScan (#3276) Summary: AVX-512 implementation for PQFastScan for QBS. For local benchmarks on 4th gen Xeon, the QPS is up to 10% higher, mostly for a single query case. But as far as I remember, production cases would show higher performance improvements. * Baseline `benchs/bench_ivf_fastscan_single_query.py` (sift1M): https://gist.github.com/alexanderguzhva/c9cde2cb5e9c7675f429623e6faa9fbf * Candidate `benchs/bench_ivf_fastscan_single_query.py` (sift1M): https://gist.github.com/alexanderguzhva/4e8530073a108f73771d38e55bc45b17 * Baseline `benchs/bench_ivf_fastscan.py` (sift1M): https://gist.github.com/alexanderguzhva/9eb03ed60354d7e76cfa25e676f983ac * Candidate `benchs/bench_ivf_fastscan.py` (sift1M): https://gist.github.com/alexanderguzhva/3cbfeba1364dd445a2bb52455966979e mdouze should I modify `pq4_fast_scan_search_1.cpp` as well? It is somewhat cumbersome to dig through various possible sub-implementations Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3276 Reviewed By: junjieqi Differential Revision: D54943632 Pulled By: mdouze fbshipit-source-id: 3d70066e9779039559b1734c2be99bf439058246 --- faiss/impl/LookupTableScaler.h | 34 ++ faiss/impl/pq4_fast_scan_search_qbs.cpp | 447 ++++++++++++++++++++++++ faiss/impl/simd_result_handlers.h | 2 +- faiss/utils/simdlib.h | 7 +- faiss/utils/simdlib_avx512.h | 296 ++++++++++++++++ 5 files changed, 784 insertions(+), 2 deletions(-) create mode 100644 faiss/utils/simdlib_avx512.h diff --git a/faiss/impl/LookupTableScaler.h b/faiss/impl/LookupTableScaler.h index c553a0f14d..b6438307fb 100644 --- a/faiss/impl/LookupTableScaler.h +++ b/faiss/impl/LookupTableScaler.h @@ -38,6 +38,23 @@ struct DummyScaler { return simd16uint16(0); } +#ifdef __AVX512F__ + inline simd64uint8 lookup(const simd64uint8&, const simd64uint8&) const { + FAISS_THROW_MSG("DummyScaler::lookup should not be called."); + return simd64uint8(0); + } + + inline simd32uint16 scale_lo(const simd64uint8&) const { + FAISS_THROW_MSG("DummyScaler::scale_lo should not be called."); + return simd32uint16(0); + } + + inline simd32uint16 scale_hi(const simd64uint8&) const { + FAISS_THROW_MSG("DummyScaler::scale_hi should not be called."); + return simd32uint16(0); + } +#endif + template inline dist_t scale_one(const dist_t&) const { FAISS_THROW_MSG("DummyScaler::scale_one should not be called."); @@ -67,6 +84,23 @@ struct NormTableScaler { return (simd16uint16(res) >> 8) * scale_simd; } +#ifdef __AVX512F__ + inline simd64uint8 lookup(const simd64uint8& lut, const simd64uint8& c) + const { + return lut.lookup_4_lanes(c); + } + + inline simd32uint16 scale_lo(const simd64uint8& res) const { + auto scale_simd_wide = simd32uint16(scale_simd, scale_simd); + return simd32uint16(res) * scale_simd_wide; + } + + inline simd32uint16 scale_hi(const simd64uint8& res) const { + auto scale_simd_wide = simd32uint16(scale_simd, scale_simd); + return (simd32uint16(res) >> 8) * scale_simd_wide; + 
} +#endif + // for non-SIMD implem 2, 3, 4 template inline dist_t scale_one(const dist_t& x) const { diff --git a/faiss/impl/pq4_fast_scan_search_qbs.cpp b/faiss/impl/pq4_fast_scan_search_qbs.cpp index d69542c309..bf2ccd1f76 100644 --- a/faiss/impl/pq4_fast_scan_search_qbs.cpp +++ b/faiss/impl/pq4_fast_scan_search_qbs.cpp @@ -31,6 +31,8 @@ namespace { * writes results in a ResultHandler */ +#ifndef __AVX512F__ + template void kernel_accumulate_block( int nsq, @@ -111,6 +113,451 @@ void kernel_accumulate_block( } } +#else + +// a special version for NQ=1. +// Despite the function being large in the text form, it compiles to a very +// compact assembler code. +FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN +template +void kernel_accumulate_block_avx512_nq1( + int nsq, + const uint8_t* codes, + const uint8_t* LUT, + ResultHandler& res, + const Scaler& scaler) { + // NQ is kept in order to match the similarity to baseline function + constexpr int NQ = 1; + // distance accumulators. We can accept more for NQ=1 + // layout: accu[q][b]: distance accumulator for vectors 32*b..32*b+15 + simd32uint16 accu[NQ][4]; + // layout: accu[q][b]: distance accumulator for vectors 32*b+16..32*b+31 + simd32uint16 accu1[NQ][4]; + + for (int q = 0; q < NQ; q++) { + for (int b = 0; b < 4; b++) { + accu[q][b].clear(); + accu1[q][b].clear(); + } + } + + // process "nsq - scaler.nscale" part + const int nsq_minus_nscale = nsq - scaler.nscale; + const int nsq_minus_nscale_8 = (nsq_minus_nscale / 8) * 8; + const int nsq_minus_nscale_4 = (nsq_minus_nscale / 4) * 4; + + // process in chunks of 8 + for (int sq = 0; sq < nsq_minus_nscale_8; sq += 8) { + // prefetch + simd64uint8 c(codes); + codes += 64; + + simd64uint8 c1(codes); + codes += 64; + + simd64uint8 mask(0xf); + // shift op does not exist for int8... + simd64uint8 chi = simd64uint8(simd32uint16(c) >> 4) & mask; + simd64uint8 clo = c & mask; + + simd64uint8 c1hi = simd64uint8(simd32uint16(c1) >> 4) & mask; + simd64uint8 c1lo = c1 & mask; + + for (int q = 0; q < NQ; q++) { + // load LUTs for 4 quantizers + simd64uint8 lut(LUT); + LUT += 64; + + { + simd64uint8 res0 = lut.lookup_4_lanes(clo); + simd64uint8 res1 = lut.lookup_4_lanes(chi); + + accu[q][0] += simd32uint16(res0); + accu[q][1] += simd32uint16(res0) >> 8; + + accu[q][2] += simd32uint16(res1); + accu[q][3] += simd32uint16(res1) >> 8; + } + } + + for (int q = 0; q < NQ; q++) { + // load LUTs for 4 quantizers + simd64uint8 lut(LUT); + LUT += 64; + + { + simd64uint8 res0 = lut.lookup_4_lanes(c1lo); + simd64uint8 res1 = lut.lookup_4_lanes(c1hi); + + accu1[q][0] += simd32uint16(res0); + accu1[q][1] += simd32uint16(res0) >> 8; + + accu1[q][2] += simd32uint16(res1); + accu1[q][3] += simd32uint16(res1) >> 8; + } + } + } + + // process leftovers: a single chunk of size 4 + if (nsq_minus_nscale_8 != nsq_minus_nscale_4) { + // prefetch + simd64uint8 c(codes); + codes += 64; + + simd64uint8 mask(0xf); + // shift op does not exist for int8... 
+ simd64uint8 chi = simd64uint8(simd32uint16(c) >> 4) & mask; + simd64uint8 clo = c & mask; + + for (int q = 0; q < NQ; q++) { + // load LUTs for 4 quantizers + simd64uint8 lut(LUT); + LUT += 64; + + simd64uint8 res0 = lut.lookup_4_lanes(clo); + simd64uint8 res1 = lut.lookup_4_lanes(chi); + + accu[q][0] += simd32uint16(res0); + accu[q][1] += simd32uint16(res0) >> 8; + + accu[q][2] += simd32uint16(res1); + accu[q][3] += simd32uint16(res1) >> 8; + } + } + + // process leftovers: a single chunk of size 2 + if (nsq_minus_nscale_4 != nsq_minus_nscale) { + // prefetch + simd32uint8 c(codes); + codes += 32; + + simd32uint8 mask(0xf); + // shift op does not exist for int8... + simd32uint8 chi = simd32uint8(simd16uint16(c) >> 4) & mask; + simd32uint8 clo = c & mask; + + for (int q = 0; q < NQ; q++) { + // load LUTs for 2 quantizers + simd32uint8 lut(LUT); + LUT += 32; + + simd32uint8 res0 = lut.lookup_2_lanes(clo); + simd32uint8 res1 = lut.lookup_2_lanes(chi); + + accu[q][0] += simd32uint16(simd16uint16(res0)); + accu[q][1] += simd32uint16(simd16uint16(res0) >> 8); + + accu[q][2] += simd32uint16(simd16uint16(res1)); + accu[q][3] += simd32uint16(simd16uint16(res1) >> 8); + } + } + + // process "sq" part + const int nscale = scaler.nscale; + const int nscale_8 = (nscale / 8) * 8; + const int nscale_4 = (nscale / 4) * 4; + + // process in chunks of 8 + for (int sq = 0; sq < nscale_8; sq += 8) { + // prefetch + simd64uint8 c(codes); + codes += 64; + + simd64uint8 c1(codes); + codes += 64; + + simd64uint8 mask(0xf); + // shift op does not exist for int8... + simd64uint8 chi = simd64uint8(simd32uint16(c) >> 4) & mask; + simd64uint8 clo = c & mask; + + simd64uint8 c1hi = simd64uint8(simd32uint16(c1) >> 4) & mask; + simd64uint8 c1lo = c1 & mask; + + for (int q = 0; q < NQ; q++) { + // load LUTs for 4 quantizers + simd64uint8 lut(LUT); + LUT += 64; + + { + simd64uint8 res0 = scaler.lookup(lut, clo); + accu[q][0] += scaler.scale_lo(res0); // handle vectors 0..15 + accu[q][1] += scaler.scale_hi(res0); // handle vectors 16..31 + + simd64uint8 res1 = scaler.lookup(lut, chi); + accu[q][2] += scaler.scale_lo(res1); // handle vectors 32..47 + accu[q][3] += scaler.scale_hi(res1); // handle vectors 48..63 + } + } + + for (int q = 0; q < NQ; q++) { + // load LUTs for 4 quantizers + simd64uint8 lut(LUT); + LUT += 64; + + { + simd64uint8 res0 = scaler.lookup(lut, c1lo); + accu1[q][0] += scaler.scale_lo(res0); // handle vectors 0..7 + accu1[q][1] += scaler.scale_hi(res0); // handle vectors 8..15 + + simd64uint8 res1 = scaler.lookup(lut, c1hi); + accu1[q][2] += scaler.scale_lo(res1); // handle vectors 16..23 + accu1[q][3] += scaler.scale_hi(res1); // handle vectors 24..31 + } + } + } + + // process leftovers: a single chunk of size 4 + if (nscale_8 != nscale_4) { + // prefetch + simd64uint8 c(codes); + codes += 64; + + simd64uint8 mask(0xf); + // shift op does not exist for int8... 
+ simd64uint8 chi = simd64uint8(simd32uint16(c) >> 4) & mask; + simd64uint8 clo = c & mask; + + for (int q = 0; q < NQ; q++) { + // load LUTs for 4 quantizers + simd64uint8 lut(LUT); + LUT += 64; + + simd64uint8 res0 = scaler.lookup(lut, clo); + accu[q][0] += scaler.scale_lo(res0); // handle vectors 0..15 + accu[q][1] += scaler.scale_hi(res0); // handle vectors 16..31 + + simd64uint8 res1 = scaler.lookup(lut, chi); + accu[q][2] += scaler.scale_lo(res1); // handle vectors 32..47 + accu[q][3] += scaler.scale_hi(res1); // handle vectors 48..63 + } + } + + // process leftovers: a single chunk of size 2 + if (nscale_4 != nscale) { + // prefetch + simd32uint8 c(codes); + codes += 32; + + simd32uint8 mask(0xf); + // shift op does not exist for int8... + simd32uint8 chi = simd32uint8(simd16uint16(c) >> 4) & mask; + simd32uint8 clo = c & mask; + + for (int q = 0; q < NQ; q++) { + // load LUTs for 2 quantizers + simd32uint8 lut(LUT); + LUT += 32; + + simd32uint8 res0 = scaler.lookup(lut, clo); + accu[q][0] += + simd32uint16(scaler.scale_lo(res0)); // handle vectors 0..7 + accu[q][1] += + simd32uint16(scaler.scale_hi(res0)); // handle vectors 8..15 + + simd32uint8 res1 = scaler.lookup(lut, chi); + accu[q][2] += simd32uint16( + scaler.scale_lo(res1)); // handle vectors 16..23 + accu[q][3] += simd32uint16( + scaler.scale_hi(res1)); // handle vectors 24..31 + } + } + + for (int q = 0; q < NQ; q++) { + for (int b = 0; b < 4; b++) { + accu[q][b] += accu1[q][b]; + } + } + + for (int q = 0; q < NQ; q++) { + accu[q][0] -= accu[q][1] << 8; + simd16uint16 dis0 = combine4x2(accu[q][0], accu[q][1]); + accu[q][2] -= accu[q][3] << 8; + simd16uint16 dis1 = combine4x2(accu[q][2], accu[q][3]); + res.handle(q, 0, dis0, dis1); + } +} + +// general-purpose case +FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN +template +void kernel_accumulate_block_avx512_nqx( + int nsq, + const uint8_t* codes, + const uint8_t* LUT, + ResultHandler& res, + const Scaler& scaler) { + // dummy alloc to keep the windows compiler happy + constexpr int NQA = NQ > 0 ? NQ : 1; + // distance accumulators + // layout: accu[q][b]: distance accumulator for vectors 8*b..8*b+7 + simd32uint16 accu[NQA][4]; + + for (int q = 0; q < NQ; q++) { + for (int b = 0; b < 4; b++) { + accu[q][b].clear(); + } + } + + // process "nsq - scaler.nscale" part + const int nsq_minus_nscale = nsq - scaler.nscale; + const int nsq_minus_nscale_4 = (nsq_minus_nscale / 4) * 4; + + // process in chunks of 8 + for (int sq = 0; sq < nsq_minus_nscale_4; sq += 4) { + // prefetch + simd64uint8 c(codes); + codes += 64; + + simd64uint8 mask(0xf); + // shift op does not exist for int8... + simd64uint8 chi = simd64uint8(simd32uint16(c) >> 4) & mask; + simd64uint8 clo = c & mask; + + for (int q = 0; q < NQ; q++) { + // load LUTs for 4 quantizers + simd32uint8 lut_a(LUT); + simd32uint8 lut_b(LUT + NQ * 32); + + simd64uint8 lut(lut_a, lut_b); + LUT += 32; + + { + simd64uint8 res0 = lut.lookup_4_lanes(clo); + simd64uint8 res1 = lut.lookup_4_lanes(chi); + + accu[q][0] += simd32uint16(res0); + accu[q][1] += simd32uint16(res0) >> 8; + + accu[q][2] += simd32uint16(res1); + accu[q][3] += simd32uint16(res1) >> 8; + } + } + + LUT += NQ * 32; + } + + // process leftovers: a single chunk of size 2 + if (nsq_minus_nscale_4 != nsq_minus_nscale) { + // prefetch + simd32uint8 c(codes); + codes += 32; + + simd32uint8 mask(0xf); + // shift op does not exist for int8... 
+ simd32uint8 chi = simd32uint8(simd16uint16(c) >> 4) & mask; + simd32uint8 clo = c & mask; + + for (int q = 0; q < NQ; q++) { + // load LUTs for 2 quantizers + simd32uint8 lut(LUT); + LUT += 32; + + simd32uint8 res0 = lut.lookup_2_lanes(clo); + simd32uint8 res1 = lut.lookup_2_lanes(chi); + + accu[q][0] += simd32uint16(simd16uint16(res0)); + accu[q][1] += simd32uint16(simd16uint16(res0) >> 8); + + accu[q][2] += simd32uint16(simd16uint16(res1)); + accu[q][3] += simd32uint16(simd16uint16(res1) >> 8); + } + } + + // process "sq" part + const int nscale = scaler.nscale; + const int nscale_4 = (nscale / 4) * 4; + + // process in chunks of 4 + for (int sq = 0; sq < nscale_4; sq += 4) { + // prefetch + simd64uint8 c(codes); + codes += 64; + + simd64uint8 mask(0xf); + // shift op does not exist for int8... + simd64uint8 chi = simd64uint8(simd32uint16(c) >> 4) & mask; + simd64uint8 clo = c & mask; + + for (int q = 0; q < NQ; q++) { + // load LUTs for 4 quantizers + simd32uint8 lut_a(LUT); + simd32uint8 lut_b(LUT + NQ * 32); + + simd64uint8 lut(lut_a, lut_b); + LUT += 32; + + { + simd64uint8 res0 = scaler.lookup(lut, clo); + accu[q][0] += scaler.scale_lo(res0); // handle vectors 0..7 + accu[q][1] += scaler.scale_hi(res0); // handle vectors 8..15 + + simd64uint8 res1 = scaler.lookup(lut, chi); + accu[q][2] += scaler.scale_lo(res1); // handle vectors 16..23 + accu[q][3] += scaler.scale_hi(res1); // handle vectors 24..31 + } + } + + LUT += NQ * 32; + } + + // process leftovers: a single chunk of size 2 + if (nscale_4 != nscale) { + // prefetch + simd32uint8 c(codes); + codes += 32; + + simd32uint8 mask(0xf); + // shift op does not exist for int8... + simd32uint8 chi = simd32uint8(simd16uint16(c) >> 4) & mask; + simd32uint8 clo = c & mask; + + for (int q = 0; q < NQ; q++) { + // load LUTs for 2 quantizers + simd32uint8 lut(LUT); + LUT += 32; + + simd32uint8 res0 = scaler.lookup(lut, clo); + accu[q][0] += + simd32uint16(scaler.scale_lo(res0)); // handle vectors 0..7 + accu[q][1] += + simd32uint16(scaler.scale_hi(res0)); // handle vectors 8..15 + + simd32uint8 res1 = scaler.lookup(lut, chi); + accu[q][2] += simd32uint16( + scaler.scale_lo(res1)); // handle vectors 16..23 + accu[q][3] += simd32uint16( + scaler.scale_hi(res1)); // handle vectors 24..31 + } + } + + for (int q = 0; q < NQ; q++) { + accu[q][0] -= accu[q][1] << 8; + simd16uint16 dis0 = combine4x2(accu[q][0], accu[q][1]); + accu[q][2] -= accu[q][3] << 8; + simd16uint16 dis1 = combine4x2(accu[q][2], accu[q][3]); + res.handle(q, 0, dis0, dis1); + } +} + +template +void kernel_accumulate_block( + int nsq, + const uint8_t* codes, + const uint8_t* LUT, + ResultHandler& res, + const Scaler& scaler) { + if constexpr (NQ == 1) { + kernel_accumulate_block_avx512_nq1( + nsq, codes, LUT, res, scaler); + } else { + kernel_accumulate_block_avx512_nqx( + nsq, codes, LUT, res, scaler); + } +} + +#endif + // handle at most 4 blocks of queries template void accumulate_q_4step( diff --git a/faiss/impl/simd_result_handlers.h b/faiss/impl/simd_result_handlers.h index 2d8e5388d9..633d480990 100644 --- a/faiss/impl/simd_result_handlers.h +++ b/faiss/impl/simd_result_handlers.h @@ -505,7 +505,7 @@ struct RangeHandler : ResultHandlerCompare { n_per_query.resize(nq + 1); } - virtual void begin(const float* norms) { + virtual void begin(const float* norms) override { normalizers = norms; for (int q = 0; q < nq; ++q) { thresholds[q] = diff --git a/faiss/utils/simdlib.h b/faiss/utils/simdlib.h index 27e9cc59f5..beeec2374e 100644 --- a/faiss/utils/simdlib.h +++ 
b/faiss/utils/simdlib.h @@ -14,7 +14,12 @@ * functions. */ -#ifdef __AVX2__ +#if defined(__AVX512F__) + +#include +#include + +#elif defined(__AVX2__) #include diff --git a/faiss/utils/simdlib_avx512.h b/faiss/utils/simdlib_avx512.h new file mode 100644 index 0000000000..9ce0965895 --- /dev/null +++ b/faiss/utils/simdlib_avx512.h @@ -0,0 +1,296 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include + +#include + +#include + +#include + +namespace faiss { + +/** Simple wrapper around the AVX 512-bit registers + * + * The objective is to separate the different interpretations of the same + * registers (as a vector of uint8, uint16 or uint32), to provide printing + * functions, and to give more readable names to the AVX intrinsics. It does not + * pretend to be exhausitve, functions are added as needed. + */ + +/// 512-bit representation without interpretation as a vector +struct simd512bit { + union { + __m512i i; + __m512 f; + }; + + simd512bit() {} + + explicit simd512bit(__m512i i) : i(i) {} + + explicit simd512bit(__m512 f) : f(f) {} + + explicit simd512bit(const void* x) + : i(_mm512_loadu_si512((__m512i const*)x)) {} + + // sets up a lower half of the register while keeping upper one as zero + explicit simd512bit(simd256bit lo) + : simd512bit(_mm512_inserti32x8( + _mm512_castsi256_si512(lo.i), + _mm256_setzero_si256(), + 1)) {} + + // constructs from lower and upper halves + explicit simd512bit(simd256bit lo, simd256bit hi) + : simd512bit(_mm512_inserti32x8( + _mm512_castsi256_si512(lo.i), + hi.i, + 1)) {} + + void clear() { + i = _mm512_setzero_si512(); + } + + void storeu(void* ptr) const { + _mm512_storeu_si512((__m512i*)ptr, i); + } + + void loadu(const void* ptr) { + i = _mm512_loadu_si512((__m512i*)ptr); + } + + void store(void* ptr) const { + _mm512_storeu_si512((__m512i*)ptr, i); + } + + void bin(char bits[513]) const { + char bytes[64]; + storeu((void*)bytes); + for (int i = 0; i < 512; i++) { + bits[i] = '0' + ((bytes[i / 8] >> (i % 8)) & 1); + } + bits[512] = 0; + } + + std::string bin() const { + char bits[257]; + bin(bits); + return std::string(bits); + } +}; + +/// vector of 32 elements in uint16 +struct simd32uint16 : simd512bit { + simd32uint16() {} + + explicit simd32uint16(__m512i i) : simd512bit(i) {} + + explicit simd32uint16(int x) : simd512bit(_mm512_set1_epi16(x)) {} + + explicit simd32uint16(uint16_t x) : simd512bit(_mm512_set1_epi16(x)) {} + + explicit simd32uint16(simd512bit x) : simd512bit(x) {} + + explicit simd32uint16(const uint16_t* x) : simd512bit((const void*)x) {} + + // sets up a lower half of the register + explicit simd32uint16(simd256bit lo) : simd512bit(lo) {} + + // constructs from lower and upper halves + explicit simd32uint16(simd256bit lo, simd256bit hi) : simd512bit(lo, hi) {} + + std::string elements_to_string(const char* fmt) const { + uint16_t bytes[32]; + storeu((void*)bytes); + char res[2000]; + char* ptr = res; + for (int i = 0; i < 32; i++) { + ptr += sprintf(ptr, fmt, bytes[i]); + } + // strip last , + ptr[-1] = 0; + return std::string(res); + } + + std::string hex() const { + return elements_to_string("%02x,"); + } + + std::string dec() const { + return elements_to_string("%3d,"); + } + + void set1(uint16_t x) { + i = _mm512_set1_epi16((short)x); + } + + simd32uint16 operator*(const simd32uint16& other) const { + return simd32uint16(_mm512_mullo_epi16(i, 
other.i)); + } + + // shift must be known at compile time + simd32uint16 operator>>(const int shift) const { + return simd32uint16(_mm512_srli_epi16(i, shift)); + } + + // shift must be known at compile time + simd32uint16 operator<<(const int shift) const { + return simd32uint16(_mm512_slli_epi16(i, shift)); + } + + simd32uint16 operator+=(simd32uint16 other) { + i = _mm512_add_epi16(i, other.i); + return *this; + } + + simd32uint16 operator-=(simd32uint16 other) { + i = _mm512_sub_epi16(i, other.i); + return *this; + } + + simd32uint16 operator+(simd32uint16 other) const { + return simd32uint16(_mm512_add_epi16(i, other.i)); + } + + simd32uint16 operator-(simd32uint16 other) const { + return simd32uint16(_mm512_sub_epi16(i, other.i)); + } + + simd32uint16 operator&(simd512bit other) const { + return simd32uint16(_mm512_and_si512(i, other.i)); + } + + simd32uint16 operator|(simd512bit other) const { + return simd32uint16(_mm512_or_si512(i, other.i)); + } + + simd32uint16 operator^(simd512bit other) const { + return simd32uint16(_mm512_xor_si512(i, other.i)); + } + + simd32uint16 operator~() const { + return simd32uint16(_mm512_xor_si512(i, _mm512_set1_epi32(-1))); + } + + simd16uint16 low() const { + return simd16uint16(_mm512_castsi512_si256(i)); + } + + simd16uint16 high() const { + return simd16uint16(_mm512_extracti32x8_epi32(i, 1)); + } + + // for debugging only + uint16_t operator[](int i) const { + ALIGNED(64) uint16_t tab[32]; + store(tab); + return tab[i]; + } + + void accu_min(simd32uint16 incoming) { + i = _mm512_min_epu16(i, incoming.i); + } + + void accu_max(simd32uint16 incoming) { + i = _mm512_max_epu16(i, incoming.i); + } +}; + +// decompose in 128-lanes: a = (a0, a1, a2, a3), b = (b0, b1, b2, b3) +// return (a0 + a1 + a2 + a3, b0 + b1 + b2 + b3) +inline simd16uint16 combine4x2(simd32uint16 a, simd32uint16 b) { + return combine2x2(a.low(), b.low()) + combine2x2(a.high(), b.high()); +} + +// vector of 32 unsigned 8-bit integers +struct simd64uint8 : simd512bit { + simd64uint8() {} + + explicit simd64uint8(__m512i i) : simd512bit(i) {} + + explicit simd64uint8(int x) : simd512bit(_mm512_set1_epi8(x)) {} + + explicit simd64uint8(uint8_t x) : simd512bit(_mm512_set1_epi8(x)) {} + + // sets up a lower half of the register + explicit simd64uint8(simd256bit lo) : simd512bit(lo) {} + + // constructs from lower and upper halves + explicit simd64uint8(simd256bit lo, simd256bit hi) : simd512bit(lo, hi) {} + + explicit simd64uint8(simd512bit x) : simd512bit(x) {} + + explicit simd64uint8(const uint8_t* x) : simd512bit((const void*)x) {} + + std::string elements_to_string(const char* fmt) const { + uint8_t bytes[64]; + storeu((void*)bytes); + char res[2000]; + char* ptr = res; + for (int i = 0; i < 64; i++) { + ptr += sprintf(ptr, fmt, bytes[i]); + } + // strip last , + ptr[-1] = 0; + return std::string(res); + } + + std::string hex() const { + return elements_to_string("%02x,"); + } + + std::string dec() const { + return elements_to_string("%3d,"); + } + + void set1(uint8_t x) { + i = _mm512_set1_epi8((char)x); + } + + simd64uint8 operator&(simd512bit other) const { + return simd64uint8(_mm512_and_si512(i, other.i)); + } + + simd64uint8 operator+(simd64uint8 other) const { + return simd64uint8(_mm512_add_epi8(i, other.i)); + } + + simd64uint8 lookup_4_lanes(simd64uint8 idx) const { + return simd64uint8(_mm512_shuffle_epi8(i, idx.i)); + } + + // extract + 0-extend lane + // this operation is slow (3 cycles) + simd32uint16 lane0_as_uint16() const { + __m256i x = 
_mm512_extracti32x8_epi32(i, 0); + return simd32uint16(_mm512_cvtepu8_epi16(x)); + } + + simd32uint16 lane1_as_uint16() const { + __m256i x = _mm512_extracti32x8_epi32(i, 1); + return simd32uint16(_mm512_cvtepu8_epi16(x)); + } + + simd64uint8 operator+=(simd64uint8 other) { + i = _mm512_add_epi8(i, other.i); + return *this; + } + + // for debugging only + uint8_t operator[](int i) const { + ALIGNED(64) uint8_t tab[64]; + store(tab); + return tab[i]; + } +}; + +} // namespace faiss From 4e6b6f8a12c93de33415467d1be649832e05afa6 Mon Sep 17 00:00:00 2001 From: Aalekh Patel Date: Fri, 29 Mar 2024 02:37:19 -0700 Subject: [PATCH 024/116] Add the ability to clone and read binary indexes to the C API. (#3318) Summary: I noticed we have a pretty decent C API for binary indexes and please correct me if I'm wrong but we seem to be missing a couple of functions, like the ability to clone and read binary indexes. This PR provides those functions. Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3318 Reviewed By: algoriddle Differential Revision: D55469615 Pulled By: mdouze fbshipit-source-id: 42e6f827d8b5ad6bc3efe989e47ede3aa06c1810 --- c_api/clone_index_c.cpp | 12 ++++++++++++ c_api/clone_index_c.h | 4 ++++ c_api/index_factory_c.cpp | 16 +++++++++++++++- c_api/index_factory_c.h | 11 ++++++++++- 4 files changed, 41 insertions(+), 2 deletions(-) diff --git a/c_api/clone_index_c.cpp b/c_api/clone_index_c.cpp index 8211156aaa..606e5f9b0a 100644 --- a/c_api/clone_index_c.cpp +++ b/c_api/clone_index_c.cpp @@ -14,6 +14,7 @@ #include "macros_impl.h" using faiss::Index; +using faiss::IndexBinary; int faiss_clone_index(const FaissIndex* idx, FaissIndex** p_out) { try { @@ -22,3 +23,14 @@ int faiss_clone_index(const FaissIndex* idx, FaissIndex** p_out) { } CATCH_AND_HANDLE } + +int faiss_clone_index_binary( + const FaissIndexBinary* idx, + FaissIndexBinary** p_out) { + try { + auto out = faiss::clone_binary_index( + reinterpret_cast(idx)); + *p_out = reinterpret_cast(out); + } + CATCH_AND_HANDLE +} \ No newline at end of file diff --git a/c_api/clone_index_c.h b/c_api/clone_index_c.h index 3d0bd6745f..d2da35b82f 100644 --- a/c_api/clone_index_c.h +++ b/c_api/clone_index_c.h @@ -13,6 +13,7 @@ #define FAISS_CLONE_INDEX_C_H #include +#include "IndexBinary_c.h" #include "Index_c.h" #include "faiss_c.h" @@ -25,6 +26,9 @@ extern "C" { /** Clone an index. This is equivalent to `faiss::clone_index` */ int faiss_clone_index(const FaissIndex*, FaissIndex** p_out); +/** Clone a binary index. This is equivalent to `faiss::clone_index_binary` */ +int faiss_clone_index_binary(const FaissIndexBinary*, FaissIndexBinary** p_out); + #ifdef __cplusplus } #endif diff --git a/c_api/index_factory_c.cpp b/c_api/index_factory_c.cpp index e9abf141f8..3a1ab9bab9 100644 --- a/c_api/index_factory_c.cpp +++ b/c_api/index_factory_c.cpp @@ -15,7 +15,7 @@ using faiss::Index; -/** Build and index with the sequence of processing steps described in +/** Build an index with the sequence of processing steps described in * the string. */ int faiss_index_factory( @@ -29,3 +29,17 @@ int faiss_index_factory( } CATCH_AND_HANDLE } + +/** Build an index with the sequence of processing steps described in + * the string. 
+ */ +int faiss_index_binary_factory( + FaissIndexBinary** p_index, + int d, + const char* description) { + try { + *p_index = reinterpret_cast( + faiss::index_binary_factory(d, description)); + } + CATCH_AND_HANDLE +} \ No newline at end of file diff --git a/c_api/index_factory_c.h b/c_api/index_factory_c.h index 11fb0faa16..ccd58ac778 100644 --- a/c_api/index_factory_c.h +++ b/c_api/index_factory_c.h @@ -11,6 +11,7 @@ #ifndef FAISS_INDEX_FACTORY_C_H #define FAISS_INDEX_FACTORY_C_H +#include "IndexBinary_c.h" #include "Index_c.h" #include "faiss_c.h" @@ -18,7 +19,7 @@ extern "C" { #endif -/** Build and index with the sequence of processing steps described in +/** Build an index with the sequence of processing steps described in * the string. */ int faiss_index_factory( @@ -27,6 +28,14 @@ int faiss_index_factory( const char* description, FaissMetricType metric); +/** Build a binary index with the sequence of processing steps described in + * the string. + */ +int faiss_index_binary_factory( + FaissIndexBinary** p_index, + int d, + const char* description); + #ifdef __cplusplus } #endif From 77e2e79cd0a680adc343b9840dd865da724c579e Mon Sep 17 00:00:00 2001 From: Ramil Bakhshyiev Date: Fri, 29 Mar 2024 11:43:33 -0700 Subject: [PATCH 025/116] Throw when attempting to move IndexPQ to GPU (#3328) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3328 Reviewed By: junjieqi Differential Revision: D55476917 fbshipit-source-id: e7f64adefa07650fda32ad2300a1b933cedc9c79 --- faiss/gpu/GpuCloner.cpp | 2 ++ faiss/gpu/test/test_index_cpu_to_gpu.py | 29 +++++++++++++++++++++++++ faiss/impl/FaissAssert.h | 7 ++++++ 3 files changed, 38 insertions(+) create mode 100644 faiss/gpu/test/test_index_cpu_to_gpu.py diff --git a/faiss/gpu/GpuCloner.cpp b/faiss/gpu/GpuCloner.cpp index 20583720f3..06ad082272 100644 --- a/faiss/gpu/GpuCloner.cpp +++ b/faiss/gpu/GpuCloner.cpp @@ -224,6 +224,8 @@ faiss::Index* index_cpu_to_gpu( int device, const faiss::Index* index, const GpuClonerOptions* options) { + auto index_pq = dynamic_cast(index); + FAISS_THROW_IF_MSG(index_pq, "This index type is not implemented on GPU."); GpuClonerOptions defaults; ToGpuCloner cl(provider, device, options ? 
*options : defaults); return cl.clone_Index(index); diff --git a/faiss/gpu/test/test_index_cpu_to_gpu.py b/faiss/gpu/test/test_index_cpu_to_gpu.py new file mode 100644 index 0000000000..84c35e2af7 --- /dev/null +++ b/faiss/gpu/test/test_index_cpu_to_gpu.py @@ -0,0 +1,29 @@ +import numpy as np +import unittest +import faiss + + +class TestMoveToGpu(unittest.TestCase): + def test_index_cpu_to_gpu(self): + dimension = 128 + n = 2500 + db_vectors = np.random.random((n, dimension)).astype('float32') + code_size = 16 + res = faiss.StandardGpuResources() + index_pq = faiss.IndexPQ(dimension, code_size, 6) + index_pq.train(db_vectors) + index_pq.add(db_vectors) + self.assertRaisesRegex(Exception, ".*not implemented.*", + faiss.index_cpu_to_gpu, res, 0, index_pq) + + def test_index_cpu_to_gpu_does_not_throw_with_index_flat(self): + dimension = 128 + n = 100 + db_vectors = np.random.random((n, dimension)).astype('float32') + res = faiss.StandardGpuResources() + index_flat = faiss.IndexFlatL2(dimension) + index_flat.add(db_vectors) + try: + faiss.index_cpu_to_gpu(res, 0, index_flat) + except Exception: + self.fail("index_cpu_to_gpu() threw an unexpected exception.") diff --git a/faiss/impl/FaissAssert.h b/faiss/impl/FaissAssert.h index 6f666f684c..2aea23e6a8 100644 --- a/faiss/impl/FaissAssert.h +++ b/faiss/impl/FaissAssert.h @@ -94,6 +94,13 @@ } \ } while (false) +#define FAISS_THROW_IF_MSG(X, MSG) \ + do { \ + if (X) { \ + FAISS_THROW_FMT("Error: '%s' failed: " MSG, #X); \ + } \ + } while (false) + #define FAISS_THROW_IF_NOT_MSG(X, MSG) \ do { \ if (!(X)) { \ From c9c86f0daafc2d0ccbfa40c2d46779b26102a349 Mon Sep 17 00:00:00 2001 From: Warmchay <1282046785@qq.com> Date: Tue, 2 Apr 2024 06:11:53 -0700 Subject: [PATCH 026/116] Fix missing overload variable in Rocksdb ivf demo (#3326) Summary: **Bugs:** When following rocksdb_ivf demo to build executable file, its output as: ```bash faiss/demos/rocksdb_ivf/RocksDBInvertedLists.h:52:35: error: 'faiss::InvertedListsIterator* faiss_rocksdb::RocksDBInvertedLists::get_iterator(size_t) const' marked 'override', but does not override 52 | faiss::InvertedListsIterator* get_iterator(size_t list_no) const override; | ^~~~~~~~~~~~ make[2]: *** [CMakeFiles/demo_rocksdb_ivf.dir/build.make:90: CMakeFiles/demo_rocksdb_ivf.dir/RocksDBInvertedLists.cpp.o] Error 1 ``` **Solution:** Add relevant variable `void* inverted_list_contex` corresponding `get_iterator`'s base virtual function. 
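For context, a minimal sketch of the signature mismatch the compiler is reporting (the base-class declaration is paraphrased here and should be read as an assumption of this note; only the two-argument override itself is taken from the diff below):

```cpp
// faiss::InvertedLists declares the iterator hook with a context argument
// (paraphrased; the exact declaration lives in faiss/invlists/InvertedLists.h):
//     virtual InvertedListsIterator* get_iterator(
//             size_t list_no, void* inverted_list_context) const;
//
// A derived declaration marked `override` must repeat that parameter list,
// otherwise there is no base virtual to override and compilation fails:
faiss::InvertedListsIterator* get_iterator(
        size_t list_no,
        void* inverted_list_context) const override;
```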
Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3326 Reviewed By: mlomeli1, mdouze Differential Revision: D55629580 Pulled By: algoriddle fbshipit-source-id: a12fcacb483e0dd576411ad91a3dd1e0de94abec --- demos/rocksdb_ivf/RocksDBInvertedLists.cpp | 3 ++- demos/rocksdb_ivf/RocksDBInvertedLists.h | 4 +++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/demos/rocksdb_ivf/RocksDBInvertedLists.cpp b/demos/rocksdb_ivf/RocksDBInvertedLists.cpp index 99c51c1456..8d692f0b54 100644 --- a/demos/rocksdb_ivf/RocksDBInvertedLists.cpp +++ b/demos/rocksdb_ivf/RocksDBInvertedLists.cpp @@ -101,7 +101,8 @@ void RocksDBInvertedLists::resize(size_t /*list_no*/, size_t /*new_size*/) { } InvertedListsIterator* RocksDBInvertedLists::get_iterator( - size_t list_no) const { + size_t list_no, + void* inverted_list_context) const { return new RocksDBInvertedListsIterator(db_.get(), list_no, code_size); } diff --git a/demos/rocksdb_ivf/RocksDBInvertedLists.h b/demos/rocksdb_ivf/RocksDBInvertedLists.h index fdc83d1d27..f9d70a4f97 100644 --- a/demos/rocksdb_ivf/RocksDBInvertedLists.h +++ b/demos/rocksdb_ivf/RocksDBInvertedLists.h @@ -49,7 +49,9 @@ struct RocksDBInvertedLists : faiss::InvertedLists { void resize(size_t list_no, size_t new_size) override; - faiss::InvertedListsIterator* get_iterator(size_t list_no) const override; + faiss::InvertedListsIterator* get_iterator( + size_t list_no, + void* inverted_list_context) const override; private: std::unique_ptr db_; From da9f292a4b1c3382431e06996732e4ea10081b8a Mon Sep 17 00:00:00 2001 From: Kumar Saurabh Arora Date: Wed, 3 Apr 2024 10:36:56 -0700 Subject: [PATCH 027/116] Support of skip_ids in merge_from_multiple function of OnDiskInvertedLists (#3327) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3327 **Context** 1. [Issue 2621](https://github.com/facebookresearch/faiss/issues/2621) discuss inconsistency between OnDiskInvertedList and InvertedList. OnDiskInvertedList is supposed to handle disk based multiple Index Shards. Thus, we should name it differently when merging invls from index shard. 2. [Issue 2876](https://github.com/facebookresearch/faiss/issues/2876) provides usecase of shifting ids when merging invls from different shards. **In this diff**, 1. To address #1 above, I renamed the merge_from function to merge_from_multiple without touching merge_from base class. why so? To continue to allow merge invl from one index to ondiskinvl from other index. 2. To address #2 above, I have added support of shift_ids in merge_from_multiple to shift ids from different shards. This can be used when each shard has same set of ids but different data. This is not recommended if id is already unique across shards. 
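A rough usage sketch of the new call (not part of this diff; the helper name `merge_shards` and the file name `"merged.ivfdata"` are placeholders, while the `OnDiskInvertedLists` constructor and `merge_from_multiple` signature follow the changes below):

```cpp
#include <faiss/IndexIVF.h>
#include <faiss/invlists/OnDiskInvertedLists.h>

#include <vector>

// Merge the inverted lists of several shards into an empty, trained IVF
// index backed by an on-disk inverted-list file.
size_t merge_shards(
        faiss::IndexIVF& output,
        std::vector<const faiss::InvertedLists*>& shards) {
    auto* ondisk = new faiss::OnDiskInvertedLists(
            output.nlist, output.code_size, "merged.ivfdata");
    // shift_ids=true offsets each shard's ids by a cumulative shard-size
    // offset, so shards that reuse the same local ids end up with distinct
    // ids after the merge.
    size_t ntotal = ondisk->merge_from_multiple(
            shards.data(), (int)shards.size(), /*shift_ids=*/true);
    output.replace_invlists(ondisk, /*own=*/true);
    output.ntotal = ntotal;
    return ntotal;
}
```

With the default `shift_ids=false` the ids are copied verbatim, which matches the previous `merge_from` behaviour.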
Reviewed By: mdouze Differential Revision: D55482518 fbshipit-source-id: 95470c7449160488d2b45b024d134cbc037a2083 --- contrib/ondisk.py | 4 +- faiss/invlists/OnDiskInvertedLists.cpp | 23 ++++++-- faiss/invlists/OnDiskInvertedLists.h | 3 +- tests/test_contrib.py | 73 +++++++++++++++++++++++--- tests/test_merge.cpp | 28 +++++++++- 5 files changed, 115 insertions(+), 16 deletions(-) diff --git a/contrib/ondisk.py b/contrib/ondisk.py index 26a95f44f5..81ec71941c 100644 --- a/contrib/ondisk.py +++ b/contrib/ondisk.py @@ -11,7 +11,7 @@ def merge_ondisk( - trained_index: faiss.Index, shard_fnames: List[str], ivfdata_fname: str + trained_index: faiss.Index, shard_fnames: List[str], ivfdata_fname: str, shift_ids=False ) -> None: """Add the contents of the indexes stored in shard_fnames into the index trained_index. The on-disk data is stored in ivfdata_fname""" @@ -51,7 +51,7 @@ def merge_ondisk( ivf_vector.push_back(ivf) LOG.info("merge %d inverted lists " % ivf_vector.size()) - ntotal = invlists.merge_from(ivf_vector.data(), ivf_vector.size()) + ntotal = invlists.merge_from_multiple(ivf_vector.data(), ivf_vector.size(), shift_ids) # now replace the inverted lists in the output index index.ntotal = index_ivf.ntotal = ntotal diff --git a/faiss/invlists/OnDiskInvertedLists.cpp b/faiss/invlists/OnDiskInvertedLists.cpp index 3017d164c6..dc17fe67f6 100644 --- a/faiss/invlists/OnDiskInvertedLists.cpp +++ b/faiss/invlists/OnDiskInvertedLists.cpp @@ -565,15 +565,16 @@ void OnDiskInvertedLists::free_slot(size_t offset, size_t capacity) { /***************************************** * Compact form *****************************************/ - -size_t OnDiskInvertedLists::merge_from( +size_t OnDiskInvertedLists::merge_from_multiple( const InvertedLists** ils, int n_il, + bool shift_ids, bool verbose) { FAISS_THROW_IF_NOT_MSG( totsize == 0, "works only on an empty InvertedLists"); std::vector sizes(nlist); + std::vector shift_id_offsets(n_il); for (int i = 0; i < n_il; i++) { const InvertedLists* il = ils[i]; FAISS_THROW_IF_NOT(il->nlist == nlist && il->code_size == code_size); @@ -581,6 +582,10 @@ size_t OnDiskInvertedLists::merge_from( for (size_t j = 0; j < nlist; j++) { sizes[j] += il->list_size(j); } + + size_t il_totsize = il->compute_ntotal(); + shift_id_offsets[i] = + (shift_ids && i > 0) ? 
shift_id_offsets[i - 1] + il_totsize : 0; } size_t cums = 0; @@ -605,11 +610,21 @@ size_t OnDiskInvertedLists::merge_from( const InvertedLists* il = ils[i]; size_t n_entry = il->list_size(j); l.size += n_entry; + ScopedIds scope_ids(il, j); + const idx_t* scope_ids_data = scope_ids.get(); + std::vector new_ids; + if (shift_ids) { + new_ids.resize(n_entry); + for (size_t k = 0; k < n_entry; k++) { + new_ids[k] = scope_ids[k] + shift_id_offsets[i]; + } + scope_ids_data = new_ids.data(); + } update_entries( j, l.size - n_entry, n_entry, - ScopedIds(il, j).get(), + scope_ids_data, ScopedCodes(il, j).get()); } assert(l.size == l.capacity); @@ -638,7 +653,7 @@ size_t OnDiskInvertedLists::merge_from( size_t OnDiskInvertedLists::merge_from_1( const InvertedLists* ils, bool verbose) { - return merge_from(&ils, 1, verbose); + return merge_from_multiple(&ils, 1, verbose); } void OnDiskInvertedLists::crop_invlists(size_t l0, size_t l1) { diff --git a/faiss/invlists/OnDiskInvertedLists.h b/faiss/invlists/OnDiskInvertedLists.h index 98cb653a7a..01c7f3481e 100644 --- a/faiss/invlists/OnDiskInvertedLists.h +++ b/faiss/invlists/OnDiskInvertedLists.h @@ -101,9 +101,10 @@ struct OnDiskInvertedLists : InvertedLists { // copy all inverted lists into *this, in compact form (without // allocating slots) - size_t merge_from( + size_t merge_from_multiple( const InvertedLists** ils, int n_il, + bool shift_ids = false, bool verbose = false); /// same as merge_from for a single invlist diff --git a/tests/test_contrib.py b/tests/test_contrib.py index 84b90a4e5f..0e7cbbfb03 100644 --- a/tests/test_contrib.py +++ b/tests/test_contrib.py @@ -9,6 +9,7 @@ import platform import os import random +import shutil import tempfile from faiss.contrib import datasets @@ -17,15 +18,13 @@ from faiss.contrib import ivf_tools from faiss.contrib import clustering from faiss.contrib import big_batch_search +from faiss.contrib.ondisk import merge_ondisk from common_faiss_tests import get_dataset_2 -try: - from faiss.contrib.exhaustive_search import \ - knn_ground_truth, knn, range_ground_truth, \ - range_search_max_results, exponential_query_iterator -except: - pass # Submodule import broken in python 2. - +from faiss.contrib.exhaustive_search import \ + knn_ground_truth, knn, range_ground_truth, \ + range_search_max_results, exponential_query_iterator +from contextlib import contextmanager @unittest.skipIf(platform.python_version_tuple()[0] < '3', 'Submodule import broken in python 2.') @@ -674,3 +673,63 @@ def test_code_set(self): np.testing.assert_equal( np.sort(np.unique(codes, axis=0), axis=None), np.sort(codes[inserted], axis=None)) + + +@unittest.skipIf(platform.system() == 'Windows', + 'OnDiskInvertedLists is unsupported on Windows.') +class TestMerge(unittest.TestCase): + @contextmanager + def temp_directory(self): + temp_dir = tempfile.mkdtemp() + try: + yield temp_dir + finally: + shutil.rmtree(temp_dir) + + def do_test_ondisk_merge(self, shift_ids=False): + with self.temp_directory() as tmpdir: + # only train and add index to disk without adding elements. + # this will create empty inverted lists. 
+ ds = datasets.SyntheticDataset(32, 2000, 200, 20) + index = faiss.index_factory(ds.d, "IVF32,Flat") + index.train(ds.get_train()) + faiss.write_index(index, tmpdir + "/trained.index") + + # create 4 shards and add elements to them + ns = 4 # number of shards + + for bno in range(ns): + index = faiss.read_index(tmpdir + "/trained.index") + i0, i1 = int(bno * ds.nb / ns), int((bno + 1) * ds.nb / ns) + if shift_ids: + index.add_with_ids(ds.xb[i0:i1], np.arange(0, ds.nb / ns)) + else: + index.add_with_ids(ds.xb[i0:i1], np.arange(i0, i1)) + faiss.write_index(index, tmpdir + "/block_%d.index" % bno) + + # construct the output index and merge them on disk + index = faiss.read_index(tmpdir + "/trained.index") + block_fnames = [tmpdir + "/block_%d.index" % bno for bno in range(4)] + + merge_ondisk( + index, block_fnames, tmpdir + "/merged_index.ivfdata", shift_ids + ) + faiss.write_index(index, tmpdir + "/populated.index") + + # perform a search from index on disk + index = faiss.read_index(tmpdir + "/populated.index") + index.nprobe = 5 + D, I = index.search(ds.xq, 5) + + # ground-truth + gtI = ds.get_groundtruth(5) + + recall_at_1 = (I[:, :1] == gtI[:, :1]).sum() / float(ds.xq.shape[0]) + self.assertGreaterEqual(recall_at_1, 0.5) + + def test_ondisk_merge(self): + self.do_test_ondisk_merge() + + def test_ondisk_merge_with_shift_ids(self): + # verified that recall is same for test_ondisk_merge and + self.do_test_ondisk_merge(True) diff --git a/tests/test_merge.cpp b/tests/test_merge.cpp index 5a1d08cfba..edbe2a03a6 100644 --- a/tests/test_merge.cpp +++ b/tests/test_merge.cpp @@ -32,6 +32,7 @@ size_t nq = 100; int nindex = 4; int k = 10; int nlist = 40; +int shard_size = nb / nindex; struct CommonData { std::vector database; @@ -100,7 +101,7 @@ int compare_merged( auto il = new faiss::OnDiskInvertedLists( index0->nlist, index0->code_size, filename.c_str()); - il->merge_from(lists.data(), lists.size()); + il->merge_from_multiple(lists.data(), lists.size(), shift_ids); index0->replace_invlists(il, true); index0->ntotal = ntotal; @@ -110,11 +111,14 @@ int compare_merged( nq, cd.queries.data(), k, newD.data(), newI.data()); size_t ndiff = 0; + bool adjust_ids = shift_ids && !standard_merge; for (size_t i = 0; i < k * nq; i++) { - if (refI[i] != newI[i]) { + idx_t new_id = adjust_ids ? 
refI[i] % shard_size : refI[i]; + if (refI[i] != new_id) { ndiff++; } } + return ndiff; } @@ -220,3 +224,23 @@ TEST(MERGE, merge_flat_ondisk_2) { int ndiff = compare_merged(&index_shards, false, false); EXPECT_GE(0, ndiff); } + +// now use ondisk specific merge and use shift ids +TEST(MERGE, merge_flat_ondisk_3) { + faiss::IndexShards index_shards(d, false, false); + index_shards.own_indices = true; + + std::vector ids; + for (int i = 0; i < nb; ++i) { + int id = i % shard_size; + ids.push_back(id); + } + for (int i = 0; i < nindex; i++) { + index_shards.add_shard( + new faiss::IndexIVFFlat(&cd.quantizer, d, nlist)); + } + EXPECT_TRUE(index_shards.is_trained); + index_shards.add_with_ids(nb, cd.database.data(), ids.data()); + int ndiff = compare_merged(&index_shards, true, false); + EXPECT_GE(0, ndiff); +} From cfc7fe513b92bbe540d7d02664deb55cf2f6238b Mon Sep 17 00:00:00 2001 From: Junjie Qi Date: Fri, 5 Apr 2024 14:08:35 -0700 Subject: [PATCH 028/116] Implement reconstruct_n for GPU IVFFlat indexes (#3338) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3338 add reconstruct_n for GPU IVFFlat Reviewed By: mdouze Differential Revision: D55577561 fbshipit-source-id: 47f8b939611e2df7dbcd087129538145f627293c --- faiss/gpu/GpuIndexIVFFlat.cu | 22 ++++++++ faiss/gpu/GpuIndexIVFFlat.h | 2 + faiss/gpu/impl/IVFBase.cu | 4 ++ faiss/gpu/impl/IVFBase.cuh | 13 ++++- faiss/gpu/impl/IVFFlat.cu | 47 +++++++++++++++++ faiss/gpu/impl/IVFFlat.cuh | 2 + faiss/gpu/test/TestGpuIndexIVFFlat.cpp | 65 ++++++++++++++++++++++++ faiss/gpu/test/test_gpu_basics.py | 1 + faiss/gpu/test/test_gpu_index_ivfflat.py | 25 +++++++++ faiss/gpu/test/torch_test_contrib_gpu.py | 36 ++++++++++++- faiss/gpu/utils/DeviceVector.cuh | 2 + 11 files changed, 216 insertions(+), 3 deletions(-) create mode 100644 faiss/gpu/test/test_gpu_index_ivfflat.py diff --git a/faiss/gpu/GpuIndexIVFFlat.cu b/faiss/gpu/GpuIndexIVFFlat.cu index 440b449a50..884b5b0fc0 100644 --- a/faiss/gpu/GpuIndexIVFFlat.cu +++ b/faiss/gpu/GpuIndexIVFFlat.cu @@ -356,5 +356,27 @@ void GpuIndexIVFFlat::setIndex_( } } +void GpuIndexIVFFlat::reconstruct_n(idx_t i0, idx_t ni, float* out) const { + FAISS_ASSERT(index_); + + if (ni == 0) { + // nothing to do + return; + } + + FAISS_THROW_IF_NOT_FMT( + i0 < this->ntotal, + "start index (%zu) out of bounds (ntotal %zu)", + i0, + this->ntotal); + FAISS_THROW_IF_NOT_FMT( + i0 + ni - 1 < this->ntotal, + "max index requested (%zu) out of bounds (ntotal %zu)", + i0 + ni - 1, + this->ntotal); + + index_->reconstruct_n(i0, ni, out); +} + } // namespace gpu } // namespace faiss diff --git a/faiss/gpu/GpuIndexIVFFlat.h b/faiss/gpu/GpuIndexIVFFlat.h index 678bf8e7f4..1401e2b291 100644 --- a/faiss/gpu/GpuIndexIVFFlat.h +++ b/faiss/gpu/GpuIndexIVFFlat.h @@ -87,6 +87,8 @@ class GpuIndexIVFFlat : public GpuIndexIVF { /// Trains the coarse quantizer based on the given vector data void train(idx_t n, const float* x) override; + void reconstruct_n(idx_t i0, idx_t n, float* out) const override; + protected: /// Initialize appropriate index void setIndex_( diff --git a/faiss/gpu/impl/IVFBase.cu b/faiss/gpu/impl/IVFBase.cu index 890d489440..3b373b8280 100644 --- a/faiss/gpu/impl/IVFBase.cu +++ b/faiss/gpu/impl/IVFBase.cu @@ -340,6 +340,10 @@ void IVFBase::copyInvertedListsTo(InvertedLists* ivf) { } } +void IVFBase::reconstruct_n(idx_t i0, idx_t n, float* out) { + FAISS_THROW_MSG("not implemented"); +} + void IVFBase::addEncodedVectorsToList_( idx_t listId, const void* codes, diff --git a/faiss/gpu/impl/IVFBase.cuh 
b/faiss/gpu/impl/IVFBase.cuh index 6b1f2ac394..04af9a906e 100644 --- a/faiss/gpu/impl/IVFBase.cuh +++ b/faiss/gpu/impl/IVFBase.cuh @@ -109,9 +109,18 @@ class IVFBase { Tensor& outIndices, bool storePairs) = 0; + /* It is used to reconstruct a given number of vectors in an Inverted File + * (IVF) index + * @param i0 index of the first vector to reconstruct + * @param n number of vectors to reconstruct + * @param out This is a pointer to a buffer where the reconstructed + * vectors will be stored. + */ + virtual void reconstruct_n(idx_t i0, idx_t n, float* out); + protected: - /// Adds a set of codes and indices to a list, with the representation - /// coming from the CPU equivalent + /// Adds a set of codes and indices to a list, with the + /// representation coming from the CPU equivalent virtual void addEncodedVectorsToList_( idx_t listId, // resident on the host diff --git a/faiss/gpu/impl/IVFFlat.cu b/faiss/gpu/impl/IVFFlat.cu index 4607e49870..e0ecfd82cf 100644 --- a/faiss/gpu/impl/IVFFlat.cu +++ b/faiss/gpu/impl/IVFFlat.cu @@ -283,6 +283,53 @@ void IVFFlat::searchPreassigned( storePairs); } +void IVFFlat::reconstruct_n(idx_t i0, idx_t ni, float* out) { + if (ni == 0) { + // nothing to do + return; + } + + auto stream = resources_->getDefaultStreamCurrentDevice(); + + for (idx_t list_no = 0; list_no < numLists_; list_no++) { + size_t list_size = deviceListData_[list_no]->numVecs; + + auto idlist = getListIndices(list_no); + + for (idx_t offset = 0; offset < list_size; offset++) { + idx_t id = idlist[offset]; + if (!(id >= i0 && id < i0 + ni)) { + continue; + } + + // vector data in the non-interleaved format is laid out like: + // v0d0 v0d1 ... v0d(dim-1) v1d0 v1d1 ... v1d(dim-1) + + // vector data in the interleaved format is laid out like: + // (v0d0 v1d0 ... v31d0) (v0d1 v1d1 ... v31d1) + // (v0d(dim - 1) ... v31d(dim-1)) + // (v32d0 v33d0 ... v63d0) (... v63d(dim-1)) (v64d0 ...) + + // where vectors are chunked into groups of 32, and each dimension + // for each of the 32 vectors is contiguous + + auto vectorChunk = offset / 32; + auto vectorWithinChunk = offset % 32; + + auto listDataPtr = (float*)deviceListData_[list_no]->data.data(); + listDataPtr += vectorChunk * 32 * dim_ + vectorWithinChunk; + + for (int d = 0; d < dim_; ++d) { + fromDevice( + listDataPtr + 32 * d, + out + (id - i0) * dim_ + d, + 1, + stream); + } + } + } +} + void IVFFlat::searchImpl_( Tensor& queries, Tensor& coarseDistances, diff --git a/faiss/gpu/impl/IVFFlat.cuh b/faiss/gpu/impl/IVFFlat.cuh index 246fc18b16..889b510795 100644 --- a/faiss/gpu/impl/IVFFlat.cuh +++ b/faiss/gpu/impl/IVFFlat.cuh @@ -51,6 +51,8 @@ class IVFFlat : public IVFBase { Tensor& outIndices, bool storePairs) override; + void reconstruct_n(idx_t i0, idx_t n, float* out) override; + protected: /// Returns the number of bytes in which an IVF list containing numVecs /// vectors is encoded on the device. 
Note that due to padding this is not diff --git a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp index 6e423e582e..28eefec308 100644 --- a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +++ b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp @@ -842,6 +842,71 @@ TEST(TestGpuIndexIVFFlat, LongIVFList) { #endif } +TEST(TestGpuIndexIVFFlat, Reconstruct_n) { + Options opt; + + std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); + std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); + + faiss::IndexFlatL2 cpuQuantizer(opt.dim); + faiss::IndexIVFFlat cpuIndex( + &cpuQuantizer, opt.dim, opt.numCentroids, faiss::METRIC_L2); + cpuIndex.nprobe = opt.nprobe; + cpuIndex.train(opt.numTrain, trainVecs.data()); + cpuIndex.add(opt.numAdd, addVecs.data()); + + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); + + faiss::gpu::GpuIndexIVFFlatConfig config; + config.device = opt.device; + config.indicesOptions = faiss::gpu::INDICES_64_BIT; + config.use_raft = false; + + faiss::gpu::GpuIndexIVFFlat gpuIndex( + &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); + gpuIndex.nprobe = opt.nprobe; + + gpuIndex.train(opt.numTrain, trainVecs.data()); + gpuIndex.add(opt.numAdd, addVecs.data()); + + std::vector gpuVals(opt.numAdd * opt.dim); + + gpuIndex.reconstruct_n(0, gpuIndex.ntotal, gpuVals.data()); + + std::vector cpuVals(opt.numAdd * opt.dim); + + cpuIndex.reconstruct_n(0, cpuIndex.ntotal, cpuVals.data()); + + EXPECT_EQ(gpuVals, cpuVals); + + config.indicesOptions = faiss::gpu::INDICES_32_BIT; + + faiss::gpu::GpuIndexIVFFlat gpuIndex1( + &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); + gpuIndex1.nprobe = opt.nprobe; + + gpuIndex1.train(opt.numTrain, trainVecs.data()); + gpuIndex1.add(opt.numAdd, addVecs.data()); + + gpuIndex1.reconstruct_n(0, gpuIndex1.ntotal, gpuVals.data()); + + EXPECT_EQ(gpuVals, cpuVals); + + config.indicesOptions = faiss::gpu::INDICES_CPU; + + faiss::gpu::GpuIndexIVFFlat gpuIndex2( + &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); + gpuIndex2.nprobe = opt.nprobe; + + gpuIndex2.train(opt.numTrain, trainVecs.data()); + gpuIndex2.add(opt.numAdd, addVecs.data()); + + gpuIndex2.reconstruct_n(0, gpuIndex2.ntotal, gpuVals.data()); + + EXPECT_EQ(gpuVals, cpuVals); +} + int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); diff --git a/faiss/gpu/test/test_gpu_basics.py b/faiss/gpu/test/test_gpu_basics.py index f3f0a525d4..4b4024d236 100755 --- a/faiss/gpu/test/test_gpu_basics.py +++ b/faiss/gpu/test/test_gpu_basics.py @@ -11,6 +11,7 @@ import random from common_faiss_tests import get_dataset_2 + class ReferencedObject(unittest.TestCase): d = 16 diff --git a/faiss/gpu/test/test_gpu_index_ivfflat.py b/faiss/gpu/test/test_gpu_index_ivfflat.py new file mode 100644 index 0000000000..099615aff5 --- /dev/null +++ b/faiss/gpu/test/test_gpu_index_ivfflat.py @@ -0,0 +1,25 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import unittest + +import faiss +import numpy as np + + +class TestGpuIndexIvfflat(unittest.TestCase): + def test_reconstruct_n(self): + index = faiss.index_factory(4, "IVF10,Flat") + x = np.random.RandomState(123).rand(10, 4).astype('float32') + index.train(x) + index.add(x) + res = faiss.StandardGpuResources() + res.noTempMemory() + config = faiss.GpuIndexIVFFlatConfig() + config.use_raft = False + index2 = faiss.GpuIndexIVFFlat(res, index, config) + recons = index2.reconstruct_n(0, 10) + + np.testing.assert_array_equal(recons, x) diff --git a/faiss/gpu/test/torch_test_contrib_gpu.py b/faiss/gpu/test/torch_test_contrib_gpu.py index 1510b10f1d..0c949c29f2 100644 --- a/faiss/gpu/test/torch_test_contrib_gpu.py +++ b/faiss/gpu/test/torch_test_contrib_gpu.py @@ -108,7 +108,7 @@ def test_train_add_with_ids(self): self.assertTrue(np.array_equal(I.reshape(10), ids_np[10:20])) # tests reconstruct, reconstruct_n - def test_reconstruct(self): + def test_flat_reconstruct(self): d = 32 res = faiss.StandardGpuResources() res.noTempMemory() @@ -157,6 +157,40 @@ def test_reconstruct(self): index.reconstruct_n(50, 10, y) self.assertTrue(torch.equal(xb[50:60], y)) + def test_ivfflat_reconstruct(self): + d = 32 + nlist = 5 + res = faiss.StandardGpuResources() + res.noTempMemory() + config = faiss.GpuIndexIVFFlatConfig() + config.use_raft = False + + index = faiss.GpuIndexIVFFlat(res, d, nlist, faiss.METRIC_L2, config) + + xb = torch.rand(100, d, device=torch.device('cuda', 0), dtype=torch.float32) + index.train(xb) + index.add(xb) + + # Test reconstruct_n with torch gpu (native return) + y = index.reconstruct_n(10, 10) + self.assertTrue(y.is_cuda) + self.assertTrue(torch.equal(xb[10:20], y)) + + # Test reconstruct with numpy output provided + y = np.empty((10, d), dtype='float32') + index.reconstruct_n(20, 10, y) + self.assertTrue(np.array_equal(xb.cpu().numpy()[20:30], y)) + + # Test reconstruct_n with torch cpu output provided + y = torch.empty(10, d, dtype=torch.float32) + index.reconstruct_n(40, 10, y) + self.assertTrue(torch.equal(xb[40:50].cpu(), y)) + + # Test reconstruct_n with torch gpu output provided + y = torch.empty(10, d, device=torch.device('cuda', 0), dtype=torch.float32) + index.reconstruct_n(50, 10, y) + self.assertTrue(torch.equal(xb[50:60], y)) + # tests assign def test_assign(self): d = 32 diff --git a/faiss/gpu/utils/DeviceVector.cuh b/faiss/gpu/utils/DeviceVector.cuh index 0517d06c32..51cb7c8d37 100644 --- a/faiss/gpu/utils/DeviceVector.cuh +++ b/faiss/gpu/utils/DeviceVector.cuh @@ -169,6 +169,8 @@ class DeviceVector { T out; CUDA_VERIFY(cudaMemcpyAsync( &out, data() + idx, sizeof(T), cudaMemcpyDeviceToHost, stream)); + + return out; } // Clean up after oversized allocations, while leaving some space to From f34588aae79f558a6b590f7464a98ba8a2cb1e28 Mon Sep 17 00:00:00 2001 From: Kumar Saurabh Arora Date: Fri, 5 Apr 2024 15:54:23 -0700 Subject: [PATCH 029/116] Support for Remove ids from IVFPQFastScan index (#3349) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3349 **Context** [Issue 3128](https://github.com/facebookresearch/faiss/issues/3128) is an enhancement request to support remove_ids for IVFPQFastScan. Existing mechanism use direct map and iterate over items in selector and use scopecodes and scopeIds to replace item to be removed. Given that codes are packed, it is hard to return single code how it is packed in CodePackerPQ4. Thus, we need a custom implementation to removed_ids. **In this diff**, 1. 
We have added custom implementation of remove_ids from BlockInvertedLists which unpack code as it iterate and repack in new position. DirectMap use this remove_id function in BlockInvertedLists for type NoMap in DirectMap. 2. Also, we are throwing exception for other map type in DirectMap i.e. HashTable Reviewed By: mdouze Differential Revision: D55723390 fbshipit-source-id: 0017b556bd790765251e778ac48ed37ff3a29a45 --- faiss/invlists/BlockInvertedLists.cpp | 34 ++++++++++++++---- faiss/invlists/BlockInvertedLists.h | 3 ++ faiss/invlists/DirectMap.cpp | 10 +++++- tests/test_merge_index.py | 50 ++++++++++++++++++++------- 4 files changed, 77 insertions(+), 20 deletions(-) diff --git a/faiss/invlists/BlockInvertedLists.cpp b/faiss/invlists/BlockInvertedLists.cpp index 6370d11871..dbdb0302dc 100644 --- a/faiss/invlists/BlockInvertedLists.cpp +++ b/faiss/invlists/BlockInvertedLists.cpp @@ -9,6 +9,7 @@ #include #include +#include #include #include @@ -54,7 +55,9 @@ size_t BlockInvertedLists::add_entries( codes[list_no].resize(n_block * block_size); if (o % block_size == 0) { // copy whole blocks - memcpy(&codes[list_no][o * code_size], code, n_block * block_size); + memcpy(&codes[list_no][o * packer->code_size], + code, + n_block * block_size); } else { FAISS_THROW_IF_NOT_MSG(packer, "missing code packer"); std::vector buffer(packer->code_size); @@ -76,6 +79,29 @@ const uint8_t* BlockInvertedLists::get_codes(size_t list_no) const { return codes[list_no].get(); } +size_t BlockInvertedLists::remove_ids(const IDSelector& sel) { + idx_t nremove = 0; +#pragma omp parallel for + for (idx_t i = 0; i < nlist; i++) { + std::vector buffer(packer->code_size); + idx_t l = ids[i].size(), j = 0; + while (j < l) { + if (sel.is_member(ids[i][j])) { + l--; + ids[i][j] = ids[i][l]; + packer->unpack_1(codes[i].data(), l, buffer.data()); + packer->pack_1(buffer.data(), j, codes[i].data()); + } else { + j++; + } + } + resize(i, l); + nremove += ids[i].size() - l; + } + + return nremove; +} + const idx_t* BlockInvertedLists::get_ids(size_t list_no) const { assert(list_no < nlist); return ids[list_no].data(); @@ -102,12 +128,6 @@ void BlockInvertedLists::update_entries( const idx_t*, const uint8_t*) { FAISS_THROW_MSG("not impemented"); - /* - assert (list_no < nlist); - assert (n_entry + offset <= ids[list_no].size()); - memcpy (&ids[list_no][offset], ids_in, sizeof(ids_in[0]) * n_entry); - memcpy (&codes[list_no][offset * code_size], codes_in, code_size * n_entry); - */ } BlockInvertedLists::~BlockInvertedLists() { diff --git a/faiss/invlists/BlockInvertedLists.h b/faiss/invlists/BlockInvertedLists.h index 8d8df720bf..2b9cbba455 100644 --- a/faiss/invlists/BlockInvertedLists.h +++ b/faiss/invlists/BlockInvertedLists.h @@ -15,6 +15,7 @@ namespace faiss { struct CodePacker; +struct IDSelector; /** Inverted Lists that are organized by blocks. 
* @@ -47,6 +48,8 @@ struct BlockInvertedLists : InvertedLists { size_t list_size(size_t list_no) const override; const uint8_t* get_codes(size_t list_no) const override; const idx_t* get_ids(size_t list_no) const override; + /// remove ids from the InvertedLists + size_t remove_ids(const IDSelector& sel); // works only on empty BlockInvertedLists // the codes should be of size ceil(n_entry / n_per_block) * block_size diff --git a/faiss/invlists/DirectMap.cpp b/faiss/invlists/DirectMap.cpp index 2b272922d5..dc2b92aa1c 100644 --- a/faiss/invlists/DirectMap.cpp +++ b/faiss/invlists/DirectMap.cpp @@ -15,6 +15,7 @@ #include #include #include +#include namespace faiss { @@ -148,8 +149,12 @@ size_t DirectMap::remove_ids(const IDSelector& sel, InvertedLists* invlists) { std::vector toremove(nlist); size_t nremove = 0; - + BlockInvertedLists* block_invlists = + dynamic_cast(invlists); if (type == NoMap) { + if (block_invlists != nullptr) { + return block_invlists->remove_ids(sel); + } // exhaustive scan of IVF #pragma omp parallel for for (idx_t i = 0; i < nlist; i++) { @@ -178,6 +183,9 @@ size_t DirectMap::remove_ids(const IDSelector& sel, InvertedLists* invlists) { } } } else if (type == Hashtable) { + FAISS_THROW_IF_MSG( + block_invlists, + "remove with hashtable is not supported with BlockInvertedLists"); const IDSelectorArray* sela = dynamic_cast(&sel); FAISS_THROW_IF_NOT_MSG( diff --git a/tests/test_merge_index.py b/tests/test_merge_index.py index 8c4c1f0912..4417f57fe7 100644 --- a/tests/test_merge_index.py +++ b/tests/test_merge_index.py @@ -246,19 +246,45 @@ def test_merge_IDMap2(self): class TestRemoveFastScan(unittest.TestCase): - def do_fast_scan_test(self, factory_key, size1): + def do_fast_scan_test(self, + factory_key, + with_ids=False, + direct_map_type=faiss.DirectMap.NoMap): ds = SyntheticDataset(110, 1000, 1000, 100) - index1 = faiss.index_factory(ds.d, factory_key) - index1.train(ds.get_train()) - index1.reset() + index = faiss.index_factory(ds.d, factory_key) + index.train(ds.get_train()) + + index.reset() tokeep = [i % 3 == 0 for i in range(ds.nb)] - index1.add(ds.get_database()[tokeep]) - _, Iref = index1.search(ds.get_queries(), 5) - index1.reset() - index1.add(ds.get_database()) - index1.remove_ids(np.where(np.logical_not(tokeep))[0]) - _, Inew = index1.search(ds.get_queries(), 5) + if with_ids: + index.add_with_ids(ds.get_database()[tokeep], np.arange(ds.nb)[tokeep]) + faiss.extract_index_ivf(index).nprobe = 5 + else: + index.add(ds.get_database()[tokeep]) + _, Iref = index.search(ds.get_queries(), 5) + + index.reset() + if with_ids: + index.add_with_ids(ds.get_database(), np.arange(ds.nb)) + index.set_direct_map_type(direct_map_type) + faiss.extract_index_ivf(index).nprobe = 5 + else: + index.add(ds.get_database()) + index.remove_ids(np.where(np.logical_not(tokeep))[0]) + _, Inew = index.search(ds.get_queries(), 5) np.testing.assert_array_equal(Inew, Iref) - def test_remove(self): - self.do_fast_scan_test("PQ5x4fs", 320) + def test_remove_PQFastScan(self): + # with_ids is not support for this type of index + self.do_fast_scan_test("PQ5x4fs", False) + + def test_remove_IVFPQFastScan(self): + self.do_fast_scan_test("IVF20,PQ5x4fs", True) + + def test_remove_IVFPQFastScan_2(self): + self.assertRaisesRegex(Exception, + ".*not supported.*", + self.do_fast_scan_test, + "IVF20,PQ5x4fs", + True, + faiss.DirectMap.Hashtable) From 7657e812c45f21cb4da78b110b6a21c67f522a4e Mon Sep 17 00:00:00 2001 From: Ramil Bakhshyiev Date: Fri, 5 Apr 2024 18:13:03 -0700 Subject: [PATCH 030/116] 
Change index_cpu_to_gpu to throw for indices not implemented on GPU (#3336) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3336 Issues: https://github.com/facebookresearch/faiss/issues/3269 https://github.com/facebookresearch/faiss/issues/3024 List of implemented GPU indices: https://github.com/facebookresearch/faiss/wiki/Faiss-on-the-GPU#implemented-indexes Reviewed By: mdouze Differential Revision: D55577576 fbshipit-source-id: 49f490cfba6784661e378acf4de3cce4195bb43b --- CHANGELOG.md | 2 + faiss/gpu/GpuCloner.cpp | 13 +++- faiss/gpu/GpuClonerOptions.h | 6 ++ faiss/gpu/GpuIndexIVF.cu | 28 ++++++- faiss/gpu/GpuIndexIVF.h | 6 ++ faiss/gpu/test/test_gpu_index.py | 5 +- faiss/gpu/test/test_index_cpu_to_gpu.py | 98 ++++++++++++++++++++----- faiss/impl/FaissAssert.h | 7 -- 8 files changed, 130 insertions(+), 35 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e61bd997ca..8d289ec2f6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,8 @@ We try to indicate most contributions here with the contributor names who are no the Facebook Faiss team. Feel free to add entries here if you submit a PR. ## [Unreleased] +### Changed +- Previously, when moving indices to GPU with coarse quantizers that were not implemented on GPU, the cloner would silently fallback to CPU. This version will now throw an exception instead and the calling code would need to explicitly allow fallback to CPU by setting a flag in cloner config. ## [1.8.0] - 2024-02-27 ### Added diff --git a/faiss/gpu/GpuCloner.cpp b/faiss/gpu/GpuCloner.cpp index 06ad082272..8f895ac9c7 100644 --- a/faiss/gpu/GpuCloner.cpp +++ b/faiss/gpu/GpuCloner.cpp @@ -153,6 +153,7 @@ Index* ToGpuCloner::clone_Index(const Index* index) { config.indicesOptions = indicesOptions; config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; config.use_raft = use_raft; + config.allowCpuCoarseQuantizer = allowCpuCoarseQuantizer; GpuIndexIVFFlat* res = new GpuIndexIVFFlat( provider, ifl->d, ifl->nlist, ifl->metric_type, config); @@ -205,6 +206,7 @@ Index* ToGpuCloner::clone_Index(const Index* index) { config.usePrecomputedTables = usePrecomputed; config.use_raft = use_raft; config.interleavedLayout = use_raft; + config.allowCpuCoarseQuantizer = allowCpuCoarseQuantizer; GpuIndexIVFPQ* res = new GpuIndexIVFPQ(provider, ipq, config); @@ -214,8 +216,13 @@ Index* ToGpuCloner::clone_Index(const Index* index) { return res; } else { - // default: use CPU cloner - return Cloner::clone_Index(index); + // use CPU cloner for IDMap and PreTransform + auto index_idmap = dynamic_cast(index); + auto index_pt = dynamic_cast(index); + if (index_idmap || index_pt) { + return Cloner::clone_Index(index); + } + FAISS_THROW_MSG("This index type is not implemented on GPU."); } } @@ -224,8 +231,6 @@ faiss::Index* index_cpu_to_gpu( int device, const faiss::Index* index, const GpuClonerOptions* options) { - auto index_pq = dynamic_cast(index); - FAISS_THROW_IF_MSG(index_pq, "This index type is not implemented on GPU."); GpuClonerOptions defaults; ToGpuCloner cl(provider, device, options ? *options : defaults); return cl.clone_Index(index); diff --git a/faiss/gpu/GpuClonerOptions.h b/faiss/gpu/GpuClonerOptions.h index 197e09dc88..e643e848fb 100644 --- a/faiss/gpu/GpuClonerOptions.h +++ b/faiss/gpu/GpuClonerOptions.h @@ -43,6 +43,12 @@ struct GpuClonerOptions { #else bool use_raft = false; #endif + + /// This flag controls the CPU fallback logic for coarse quantizer + /// component of the index. 
When set to false (default), the cloner will + /// throw an exception for indices not implemented on GPU. When set to + /// true, it will fallback to a CPU implementation. + bool allowCpuCoarseQuantizer = false; }; struct GpuMultipleClonerOptions : public GpuClonerOptions { diff --git a/faiss/gpu/GpuIndexIVF.cu b/faiss/gpu/GpuIndexIVF.cu index 0c5b8db686..40129a54c5 100644 --- a/faiss/gpu/GpuIndexIVF.cu +++ b/faiss/gpu/GpuIndexIVF.cu @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -172,10 +173,29 @@ void GpuIndexIVF::copyFrom(const faiss::IndexIVF* index) { // over to the GPU, on the same device that we are on. GpuResourcesProviderFromInstance pfi(getResources()); - GpuClonerOptions options; - auto cloner = ToGpuCloner(&pfi, getDevice(), options); - - quantizer = cloner.clone_Index(index->quantizer); + // Attempt to clone the index to GPU. If it fails because the coarse + // quantizer is not implemented on GPU and the flag to allow CPU + // fallback is set, retry it with CPU cloner and re-throw errors. + try { + GpuClonerOptions options; + auto cloner = ToGpuCloner(&pfi, getDevice(), options); + quantizer = cloner.clone_Index(index->quantizer); + } catch (const std::exception& e) { + if (strstr(e.what(), "not implemented on GPU")) { + if (ivfConfig_.allowCpuCoarseQuantizer) { + Cloner cpuCloner; + quantizer = cpuCloner.clone_Index(index->quantizer); + } else { + FAISS_THROW_MSG( + "This index type is not implemented on " + "GPU and allowCpuCoarseQuantizer is set to false. " + "Please set the flag to true to allow the CPU " + "fallback in cloning."); + } + } else { + throw; + } + } own_fields = true; } else { // Otherwise, this is a GPU coarse quantizer index instance found in a diff --git a/faiss/gpu/GpuIndexIVF.h b/faiss/gpu/GpuIndexIVF.h index a9f092d35b..65a27aa94e 100644 --- a/faiss/gpu/GpuIndexIVF.h +++ b/faiss/gpu/GpuIndexIVF.h @@ -26,6 +26,12 @@ struct GpuIndexIVFConfig : public GpuIndexConfig { /// Configuration for the coarse quantizer object GpuIndexFlatConfig flatConfig; + + /// This flag controls the CPU fallback logic for coarse quantizer + /// component of the index. When set to false (default), the cloner will + /// throw an exception for indices not implemented on GPU. When set to + /// true, it will fallback to a CPU implementation. + bool allowCpuCoarseQuantizer = false; }; /// Base class of all GPU IVF index types. 
This (for now) deliberately does not diff --git a/faiss/gpu/test/test_gpu_index.py b/faiss/gpu/test/test_gpu_index.py index 620bfea198..28572ebcb4 100755 --- a/faiss/gpu/test/test_gpu_index.py +++ b/faiss/gpu/test/test_gpu_index.py @@ -589,7 +589,10 @@ class TestGpuAutoTune(unittest.TestCase): def test_params(self): index = faiss.index_factory(32, "IVF65536_HNSW,PQ16") - index = faiss.index_cpu_to_gpu(faiss.StandardGpuResources(), 0, index) + res = faiss.StandardGpuResources() + options = faiss.GpuClonerOptions() + options.allowCpuCoarseQuantizer = True + index = faiss.index_cpu_to_gpu(res, 0, index, options) ps = faiss.GpuParameterSpace() ps.initialize(index) for i in range(ps.parameter_ranges.size()): diff --git a/faiss/gpu/test/test_index_cpu_to_gpu.py b/faiss/gpu/test/test_index_cpu_to_gpu.py index 84c35e2af7..088ea2bf74 100644 --- a/faiss/gpu/test/test_index_cpu_to_gpu.py +++ b/faiss/gpu/test/test_index_cpu_to_gpu.py @@ -4,26 +4,86 @@ class TestMoveToGpu(unittest.TestCase): - def test_index_cpu_to_gpu(self): + + @classmethod + def setUpClass(cls): + cls.res = faiss.StandardGpuResources() + + def create_index(self, factory_string): dimension = 128 n = 2500 db_vectors = np.random.random((n, dimension)).astype('float32') - code_size = 16 - res = faiss.StandardGpuResources() - index_pq = faiss.IndexPQ(dimension, code_size, 6) - index_pq.train(db_vectors) - index_pq.add(db_vectors) - self.assertRaisesRegex(Exception, ".*not implemented.*", - faiss.index_cpu_to_gpu, res, 0, index_pq) - - def test_index_cpu_to_gpu_does_not_throw_with_index_flat(self): - dimension = 128 - n = 100 - db_vectors = np.random.random((n, dimension)).astype('float32') - res = faiss.StandardGpuResources() - index_flat = faiss.IndexFlatL2(dimension) - index_flat.add(db_vectors) + index = faiss.index_factory(dimension, factory_string) + index.train(db_vectors) + if factory_string.startswith("IDMap"): + index.add_with_ids(db_vectors, np.arange(n)) + else: + index.add(db_vectors) + return index + + def create_and_clone(self, factory_string, + allowCpuCoarseQuantizer=None, + use_raft=None): + idx = self.create_index(factory_string) + config = faiss.GpuClonerOptions() + if allowCpuCoarseQuantizer is not None: + config.allowCpuCoarseQuantizer = allowCpuCoarseQuantizer + if use_raft is not None: + config.use_raft = use_raft + faiss.index_cpu_to_gpu(self.res, 0, idx, config) + + def verify_throws_not_implemented_exception(self, factory_string): + try: + self.create_and_clone(factory_string) + except Exception as e: + if "not implemented" not in str(e): + self.fail("Expected an exception but no exception was " + "thrown for factory_string: %s." % factory_string) + + def verify_clones_successfully(self, factory_string, + allowCpuCoarseQuantizer=None, + use_raft=None): + try: + self.create_and_clone( + factory_string, + allowCpuCoarseQuantizer=allowCpuCoarseQuantizer, + use_raft=use_raft) + except Exception as e: + self.fail("Unexpected exception thrown factory_string: " + "%s; error message: %s." 
% (factory_string, str(e))) + + def test_not_implemented_indices(self): + self.verify_throws_not_implemented_exception("PQ16") + self.verify_throws_not_implemented_exception("LSHrt") + self.verify_throws_not_implemented_exception("HNSW") + self.verify_throws_not_implemented_exception("HNSW,PQ16") + self.verify_throws_not_implemented_exception("IDMap,PQ16") + self.verify_throws_not_implemented_exception("IVF256,ITQ64,SH1.2") + + def test_implemented_indices(self): + self.verify_clones_successfully("Flat") + self.verify_clones_successfully("IVF1,Flat") + self.verify_clones_successfully("IVF32,PQ8") + self.verify_clones_successfully("IDMap,Flat") + self.verify_clones_successfully("PCA12,IVF32,Flat") + self.verify_clones_successfully("PCA32,IVF32,PQ8") + self.verify_clones_successfully("PCA32,IVF32,PQ8np") + + # set use_raft to false, these index types are not supported on RAFT + self.verify_clones_successfully("IVF32,SQ8", use_raft=False) + self.verify_clones_successfully( + "PCA32,IVF32,SQ8", use_raft=False) + + def test_with_flag(self): + self.verify_clones_successfully("IVF32_HNSW,Flat", + allowCpuCoarseQuantizer=True) + self.verify_clones_successfully("IVF256(PQ2x4fs),Flat", + allowCpuCoarseQuantizer=True) + + def test_with_flag_set_to_false(self): try: - faiss.index_cpu_to_gpu(res, 0, index_flat) - except Exception: - self.fail("index_cpu_to_gpu() threw an unexpected exception.") + self.verify_clones_successfully("IVF32_HNSW,Flat", + allowCpuCoarseQuantizer=False) + except Exception as e: + if "set the flag to true to allow the CPU fallback" not in str(e): + self.fail("Unexepected error message thrown: %s." % str(e)) diff --git a/faiss/impl/FaissAssert.h b/faiss/impl/FaissAssert.h index 2aea23e6a8..6f666f684c 100644 --- a/faiss/impl/FaissAssert.h +++ b/faiss/impl/FaissAssert.h @@ -94,13 +94,6 @@ } \ } while (false) -#define FAISS_THROW_IF_MSG(X, MSG) \ - do { \ - if (X) { \ - FAISS_THROW_FMT("Error: '%s' failed: " MSG, #X); \ - } \ - } while (false) - #define FAISS_THROW_IF_NOT_MSG(X, MSG) \ do { \ if (!(X)) { \ From 366a8146aa8744277c02328987bec95acf364ba7 Mon Sep 17 00:00:00 2001 From: Gufan Yin Date: Fri, 5 Apr 2024 19:11:34 -0700 Subject: [PATCH 031/116] Revert D55723390: Support for Remove ids from IVFPQFastScan index Differential Revision: D55723390 Original commit changeset: 0017b556bd79 Original Phabricator Diff: D55723390 fbshipit-source-id: 58d61467b30dd11d27398f9f825162f598896845 --- faiss/invlists/BlockInvertedLists.cpp | 34 ++++-------------- faiss/invlists/BlockInvertedLists.h | 3 -- faiss/invlists/DirectMap.cpp | 10 +----- tests/test_merge_index.py | 50 +++++++-------------------- 4 files changed, 20 insertions(+), 77 deletions(-) diff --git a/faiss/invlists/BlockInvertedLists.cpp b/faiss/invlists/BlockInvertedLists.cpp index dbdb0302dc..6370d11871 100644 --- a/faiss/invlists/BlockInvertedLists.cpp +++ b/faiss/invlists/BlockInvertedLists.cpp @@ -9,7 +9,6 @@ #include #include -#include #include #include @@ -55,9 +54,7 @@ size_t BlockInvertedLists::add_entries( codes[list_no].resize(n_block * block_size); if (o % block_size == 0) { // copy whole blocks - memcpy(&codes[list_no][o * packer->code_size], - code, - n_block * block_size); + memcpy(&codes[list_no][o * code_size], code, n_block * block_size); } else { FAISS_THROW_IF_NOT_MSG(packer, "missing code packer"); std::vector buffer(packer->code_size); @@ -79,29 +76,6 @@ const uint8_t* BlockInvertedLists::get_codes(size_t list_no) const { return codes[list_no].get(); } -size_t BlockInvertedLists::remove_ids(const IDSelector& 
sel) { - idx_t nremove = 0; -#pragma omp parallel for - for (idx_t i = 0; i < nlist; i++) { - std::vector buffer(packer->code_size); - idx_t l = ids[i].size(), j = 0; - while (j < l) { - if (sel.is_member(ids[i][j])) { - l--; - ids[i][j] = ids[i][l]; - packer->unpack_1(codes[i].data(), l, buffer.data()); - packer->pack_1(buffer.data(), j, codes[i].data()); - } else { - j++; - } - } - resize(i, l); - nremove += ids[i].size() - l; - } - - return nremove; -} - const idx_t* BlockInvertedLists::get_ids(size_t list_no) const { assert(list_no < nlist); return ids[list_no].data(); @@ -128,6 +102,12 @@ void BlockInvertedLists::update_entries( const idx_t*, const uint8_t*) { FAISS_THROW_MSG("not impemented"); + /* + assert (list_no < nlist); + assert (n_entry + offset <= ids[list_no].size()); + memcpy (&ids[list_no][offset], ids_in, sizeof(ids_in[0]) * n_entry); + memcpy (&codes[list_no][offset * code_size], codes_in, code_size * n_entry); + */ } BlockInvertedLists::~BlockInvertedLists() { diff --git a/faiss/invlists/BlockInvertedLists.h b/faiss/invlists/BlockInvertedLists.h index 2b9cbba455..8d8df720bf 100644 --- a/faiss/invlists/BlockInvertedLists.h +++ b/faiss/invlists/BlockInvertedLists.h @@ -15,7 +15,6 @@ namespace faiss { struct CodePacker; -struct IDSelector; /** Inverted Lists that are organized by blocks. * @@ -48,8 +47,6 @@ struct BlockInvertedLists : InvertedLists { size_t list_size(size_t list_no) const override; const uint8_t* get_codes(size_t list_no) const override; const idx_t* get_ids(size_t list_no) const override; - /// remove ids from the InvertedLists - size_t remove_ids(const IDSelector& sel); // works only on empty BlockInvertedLists // the codes should be of size ceil(n_entry / n_per_block) * block_size diff --git a/faiss/invlists/DirectMap.cpp b/faiss/invlists/DirectMap.cpp index dc2b92aa1c..2b272922d5 100644 --- a/faiss/invlists/DirectMap.cpp +++ b/faiss/invlists/DirectMap.cpp @@ -15,7 +15,6 @@ #include #include #include -#include namespace faiss { @@ -149,12 +148,8 @@ size_t DirectMap::remove_ids(const IDSelector& sel, InvertedLists* invlists) { std::vector toremove(nlist); size_t nremove = 0; - BlockInvertedLists* block_invlists = - dynamic_cast(invlists); + if (type == NoMap) { - if (block_invlists != nullptr) { - return block_invlists->remove_ids(sel); - } // exhaustive scan of IVF #pragma omp parallel for for (idx_t i = 0; i < nlist; i++) { @@ -183,9 +178,6 @@ size_t DirectMap::remove_ids(const IDSelector& sel, InvertedLists* invlists) { } } } else if (type == Hashtable) { - FAISS_THROW_IF_MSG( - block_invlists, - "remove with hashtable is not supported with BlockInvertedLists"); const IDSelectorArray* sela = dynamic_cast(&sel); FAISS_THROW_IF_NOT_MSG( diff --git a/tests/test_merge_index.py b/tests/test_merge_index.py index 4417f57fe7..8c4c1f0912 100644 --- a/tests/test_merge_index.py +++ b/tests/test_merge_index.py @@ -246,45 +246,19 @@ def test_merge_IDMap2(self): class TestRemoveFastScan(unittest.TestCase): - def do_fast_scan_test(self, - factory_key, - with_ids=False, - direct_map_type=faiss.DirectMap.NoMap): + def do_fast_scan_test(self, factory_key, size1): ds = SyntheticDataset(110, 1000, 1000, 100) - index = faiss.index_factory(ds.d, factory_key) - index.train(ds.get_train()) - - index.reset() + index1 = faiss.index_factory(ds.d, factory_key) + index1.train(ds.get_train()) + index1.reset() tokeep = [i % 3 == 0 for i in range(ds.nb)] - if with_ids: - index.add_with_ids(ds.get_database()[tokeep], np.arange(ds.nb)[tokeep]) - faiss.extract_index_ivf(index).nprobe = 
5 - else: - index.add(ds.get_database()[tokeep]) - _, Iref = index.search(ds.get_queries(), 5) - - index.reset() - if with_ids: - index.add_with_ids(ds.get_database(), np.arange(ds.nb)) - index.set_direct_map_type(direct_map_type) - faiss.extract_index_ivf(index).nprobe = 5 - else: - index.add(ds.get_database()) - index.remove_ids(np.where(np.logical_not(tokeep))[0]) - _, Inew = index.search(ds.get_queries(), 5) + index1.add(ds.get_database()[tokeep]) + _, Iref = index1.search(ds.get_queries(), 5) + index1.reset() + index1.add(ds.get_database()) + index1.remove_ids(np.where(np.logical_not(tokeep))[0]) + _, Inew = index1.search(ds.get_queries(), 5) np.testing.assert_array_equal(Inew, Iref) - def test_remove_PQFastScan(self): - # with_ids is not support for this type of index - self.do_fast_scan_test("PQ5x4fs", False) - - def test_remove_IVFPQFastScan(self): - self.do_fast_scan_test("IVF20,PQ5x4fs", True) - - def test_remove_IVFPQFastScan_2(self): - self.assertRaisesRegex(Exception, - ".*not supported.*", - self.do_fast_scan_test, - "IVF20,PQ5x4fs", - True, - faiss.DirectMap.Hashtable) + def test_remove(self): + self.do_fast_scan_test("PQ5x4fs", 320) From 252ae16ea371ca861663db3fcae02cbe40deef05 Mon Sep 17 00:00:00 2001 From: Kumar Saurabh Arora Date: Tue, 9 Apr 2024 09:36:22 -0700 Subject: [PATCH 032/116] Support for Remove ids from IVFPQFastScan index (#3354) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3354 **Change was previously reverted because of build failure as change D55577576 removed the definition of FAISS_THROW_IF_MSG** **Context** [Issue 3128](https://github.com/facebookresearch/faiss/issues/3128) is an enhancement request to support remove_ids for IVFPQFastScan. Existing mechanism use direct map and iterate over items in selector and use scopecodes and scopeIds to replace item to be removed. Given that codes are packed, it is hard to return single code how it is packed in CodePackerPQ4. Thus, we need a custom implementation to removed_ids. **In this diff**, 1. We have added custom implementation of remove_ids from BlockInvertedLists which unpack code as it iterate and repack in new position. DirectMap use this remove_id function in BlockInvertedLists for type NoMap in DirectMap. 2. Also, we are throwing exception for other map type in DirectMap i.e. HashTable Reviewed By: ramilbakhshyiev Differential Revision: D55858959 fbshipit-source-id: c8a0631495380b7dead36720e4507f4d1900d39f --- faiss/impl/FaissAssert.h | 6 ++-- faiss/invlists/BlockInvertedLists.cpp | 34 ++++++++++++++---- faiss/invlists/BlockInvertedLists.h | 3 ++ faiss/invlists/DirectMap.cpp | 10 +++++- tests/test_merge_index.py | 50 ++++++++++++++++++++------- 5 files changed, 81 insertions(+), 22 deletions(-) diff --git a/faiss/impl/FaissAssert.h b/faiss/impl/FaissAssert.h index 6f666f684c..9d357823d0 100644 --- a/faiss/impl/FaissAssert.h +++ b/faiss/impl/FaissAssert.h @@ -94,13 +94,15 @@ } \ } while (false) -#define FAISS_THROW_IF_NOT_MSG(X, MSG) \ +#define FAISS_THROW_IF_MSG(X, MSG) \ do { \ - if (!(X)) { \ + if (X) { \ FAISS_THROW_FMT("Error: '%s' failed: " MSG, #X); \ } \ } while (false) +#define FAISS_THROW_IF_NOT_MSG(X, MSG) FAISS_THROW_IF_MSG(!(X), MSG) + #define FAISS_THROW_IF_NOT_FMT(X, FMT, ...) 
\ do { \ if (!(X)) { \ diff --git a/faiss/invlists/BlockInvertedLists.cpp b/faiss/invlists/BlockInvertedLists.cpp index 6370d11871..dbdb0302dc 100644 --- a/faiss/invlists/BlockInvertedLists.cpp +++ b/faiss/invlists/BlockInvertedLists.cpp @@ -9,6 +9,7 @@ #include #include +#include #include #include @@ -54,7 +55,9 @@ size_t BlockInvertedLists::add_entries( codes[list_no].resize(n_block * block_size); if (o % block_size == 0) { // copy whole blocks - memcpy(&codes[list_no][o * code_size], code, n_block * block_size); + memcpy(&codes[list_no][o * packer->code_size], + code, + n_block * block_size); } else { FAISS_THROW_IF_NOT_MSG(packer, "missing code packer"); std::vector buffer(packer->code_size); @@ -76,6 +79,29 @@ const uint8_t* BlockInvertedLists::get_codes(size_t list_no) const { return codes[list_no].get(); } +size_t BlockInvertedLists::remove_ids(const IDSelector& sel) { + idx_t nremove = 0; +#pragma omp parallel for + for (idx_t i = 0; i < nlist; i++) { + std::vector buffer(packer->code_size); + idx_t l = ids[i].size(), j = 0; + while (j < l) { + if (sel.is_member(ids[i][j])) { + l--; + ids[i][j] = ids[i][l]; + packer->unpack_1(codes[i].data(), l, buffer.data()); + packer->pack_1(buffer.data(), j, codes[i].data()); + } else { + j++; + } + } + resize(i, l); + nremove += ids[i].size() - l; + } + + return nremove; +} + const idx_t* BlockInvertedLists::get_ids(size_t list_no) const { assert(list_no < nlist); return ids[list_no].data(); @@ -102,12 +128,6 @@ void BlockInvertedLists::update_entries( const idx_t*, const uint8_t*) { FAISS_THROW_MSG("not impemented"); - /* - assert (list_no < nlist); - assert (n_entry + offset <= ids[list_no].size()); - memcpy (&ids[list_no][offset], ids_in, sizeof(ids_in[0]) * n_entry); - memcpy (&codes[list_no][offset * code_size], codes_in, code_size * n_entry); - */ } BlockInvertedLists::~BlockInvertedLists() { diff --git a/faiss/invlists/BlockInvertedLists.h b/faiss/invlists/BlockInvertedLists.h index 8d8df720bf..2b9cbba455 100644 --- a/faiss/invlists/BlockInvertedLists.h +++ b/faiss/invlists/BlockInvertedLists.h @@ -15,6 +15,7 @@ namespace faiss { struct CodePacker; +struct IDSelector; /** Inverted Lists that are organized by blocks. 
* @@ -47,6 +48,8 @@ struct BlockInvertedLists : InvertedLists { size_t list_size(size_t list_no) const override; const uint8_t* get_codes(size_t list_no) const override; const idx_t* get_ids(size_t list_no) const override; + /// remove ids from the InvertedLists + size_t remove_ids(const IDSelector& sel); // works only on empty BlockInvertedLists // the codes should be of size ceil(n_entry / n_per_block) * block_size diff --git a/faiss/invlists/DirectMap.cpp b/faiss/invlists/DirectMap.cpp index 2b272922d5..dc2b92aa1c 100644 --- a/faiss/invlists/DirectMap.cpp +++ b/faiss/invlists/DirectMap.cpp @@ -15,6 +15,7 @@ #include #include #include +#include namespace faiss { @@ -148,8 +149,12 @@ size_t DirectMap::remove_ids(const IDSelector& sel, InvertedLists* invlists) { std::vector toremove(nlist); size_t nremove = 0; - + BlockInvertedLists* block_invlists = + dynamic_cast(invlists); if (type == NoMap) { + if (block_invlists != nullptr) { + return block_invlists->remove_ids(sel); + } // exhaustive scan of IVF #pragma omp parallel for for (idx_t i = 0; i < nlist; i++) { @@ -178,6 +183,9 @@ size_t DirectMap::remove_ids(const IDSelector& sel, InvertedLists* invlists) { } } } else if (type == Hashtable) { + FAISS_THROW_IF_MSG( + block_invlists, + "remove with hashtable is not supported with BlockInvertedLists"); const IDSelectorArray* sela = dynamic_cast(&sel); FAISS_THROW_IF_NOT_MSG( diff --git a/tests/test_merge_index.py b/tests/test_merge_index.py index 8c4c1f0912..4417f57fe7 100644 --- a/tests/test_merge_index.py +++ b/tests/test_merge_index.py @@ -246,19 +246,45 @@ def test_merge_IDMap2(self): class TestRemoveFastScan(unittest.TestCase): - def do_fast_scan_test(self, factory_key, size1): + def do_fast_scan_test(self, + factory_key, + with_ids=False, + direct_map_type=faiss.DirectMap.NoMap): ds = SyntheticDataset(110, 1000, 1000, 100) - index1 = faiss.index_factory(ds.d, factory_key) - index1.train(ds.get_train()) - index1.reset() + index = faiss.index_factory(ds.d, factory_key) + index.train(ds.get_train()) + + index.reset() tokeep = [i % 3 == 0 for i in range(ds.nb)] - index1.add(ds.get_database()[tokeep]) - _, Iref = index1.search(ds.get_queries(), 5) - index1.reset() - index1.add(ds.get_database()) - index1.remove_ids(np.where(np.logical_not(tokeep))[0]) - _, Inew = index1.search(ds.get_queries(), 5) + if with_ids: + index.add_with_ids(ds.get_database()[tokeep], np.arange(ds.nb)[tokeep]) + faiss.extract_index_ivf(index).nprobe = 5 + else: + index.add(ds.get_database()[tokeep]) + _, Iref = index.search(ds.get_queries(), 5) + + index.reset() + if with_ids: + index.add_with_ids(ds.get_database(), np.arange(ds.nb)) + index.set_direct_map_type(direct_map_type) + faiss.extract_index_ivf(index).nprobe = 5 + else: + index.add(ds.get_database()) + index.remove_ids(np.where(np.logical_not(tokeep))[0]) + _, Inew = index.search(ds.get_queries(), 5) np.testing.assert_array_equal(Inew, Iref) - def test_remove(self): - self.do_fast_scan_test("PQ5x4fs", 320) + def test_remove_PQFastScan(self): + # with_ids is not support for this type of index + self.do_fast_scan_test("PQ5x4fs", False) + + def test_remove_IVFPQFastScan(self): + self.do_fast_scan_test("IVF20,PQ5x4fs", True) + + def test_remove_IVFPQFastScan_2(self): + self.assertRaisesRegex(Exception, + ".*not supported.*", + self.do_fast_scan_test, + "IVF20,PQ5x4fs", + True, + faiss.DirectMap.Hashtable) From 17fbeb8d7e3eb12dbc1a6ba7fc58be316761d842 Mon Sep 17 00:00:00 2001 From: Alexandr Guzhva Date: Thu, 11 Apr 2024 14:23:46 -0700 Subject: [PATCH 033/116] 
Improve filtering & search parameters propagation (#3304) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3304 Reviewed By: junjieqi Differential Revision: D55823369 Pulled By: mdouze fbshipit-source-id: c0e9f4b85d979758f02e9953f3706b63a846bf22 --- faiss/IVFlib.cpp | 5 +- faiss/IndexFastScan.cpp | 10 +- faiss/IndexIVFFastScan.cpp | 189 +++++++++++++++++++++--------- faiss/IndexIVFFastScan.h | 21 ++-- faiss/impl/simd_result_handlers.h | 180 ++++++++++++++++++++-------- 5 files changed, 289 insertions(+), 116 deletions(-) diff --git a/faiss/IVFlib.cpp b/faiss/IVFlib.cpp index 91aa7af7f3..f2c975f4de 100644 --- a/faiss/IVFlib.cpp +++ b/faiss/IVFlib.cpp @@ -352,7 +352,10 @@ void search_with_parameters( const IndexIVF* index_ivf = dynamic_cast(index); FAISS_THROW_IF_NOT(index_ivf); - index_ivf->quantizer->search(n, x, params->nprobe, Dq.data(), Iq.data()); + SearchParameters* quantizer_params = + (params) ? params->quantizer_params : nullptr; + index_ivf->quantizer->search( + n, x, params->nprobe, Dq.data(), Iq.data(), quantizer_params); if (nb_dis_ptr) { *nb_dis_ptr = count_ndis(index_ivf, n * params->nprobe, Iq.data()); diff --git a/faiss/IndexFastScan.cpp b/faiss/IndexFastScan.cpp index 2dfb2f55fd..529465da3e 100644 --- a/faiss/IndexFastScan.cpp +++ b/faiss/IndexFastScan.cpp @@ -189,6 +189,7 @@ void estimators_from_tables_generic( dt += index.ksub; } } + if (C::cmp(heap_dis[0], dis)) { heap_pop(k, heap_dis, heap_ids); heap_push(k, heap_dis, heap_ids, dis, j); @@ -203,17 +204,18 @@ ResultHandlerCompare* make_knn_handler( idx_t k, size_t ntotal, float* distances, - idx_t* labels) { + idx_t* labels, + const IDSelector* sel = nullptr) { using HeapHC = HeapHandler; using ReservoirHC = ReservoirHandler; using SingleResultHC = SingleResultHandler; if (k == 1) { - return new SingleResultHC(n, ntotal, distances, labels); + return new SingleResultHC(n, ntotal, distances, labels, sel); } else if (impl % 2 == 0) { - return new HeapHC(n, ntotal, k, distances, labels); + return new HeapHC(n, ntotal, k, distances, labels, sel); } else /* if (impl % 2 == 1) */ { - return new ReservoirHC(n, ntotal, k, 2 * k, distances, labels); + return new ReservoirHC(n, ntotal, k, 2 * k, distances, labels, sel); } } diff --git a/faiss/IndexIVFFastScan.cpp b/faiss/IndexIVFFastScan.cpp index 00bc6c823e..19828753d2 100644 --- a/faiss/IndexIVFFastScan.cpp +++ b/faiss/IndexIVFFastScan.cpp @@ -211,7 +211,7 @@ void estimators_from_tables_generic( int64_t* heap_ids, const NormTableScaler* scaler) { using accu_t = typename C::T; - int nscale = scaler ? scaler->nscale : 0; + size_t nscale = scaler ? 
scaler->nscale : 0; for (size_t j = 0; j < ncodes; ++j) { BitstringReader bsr(codes + j * index.code_size, index.code_size); accu_t dis = bias; @@ -270,6 +270,7 @@ void IndexIVFFastScan::compute_LUT_uint8( biases.resize(n * nprobe); } + // OMP for MSVC requires i to have signed integral type #pragma omp parallel for if (n > 100) for (int64_t i = 0; i < n; i++) { const float* t_in = dis_tables_float.get() + i * dim123; @@ -306,11 +307,16 @@ void IndexIVFFastScan::search( idx_t k, float* distances, idx_t* labels, - const SearchParameters* params) const { - auto paramsi = dynamic_cast(params); - FAISS_THROW_IF_NOT_MSG(!params || paramsi, "need IVFSearchParameters"); + const SearchParameters* params_in) const { + const IVFSearchParameters* params = nullptr; + if (params_in) { + params = dynamic_cast(params_in); + FAISS_THROW_IF_NOT_MSG( + params, "IndexIVFFastScan params have incorrect type"); + } + search_preassigned( - n, x, k, nullptr, nullptr, distances, labels, false, paramsi); + n, x, k, nullptr, nullptr, distances, labels, false, params); } void IndexIVFFastScan::search_preassigned( @@ -326,18 +332,17 @@ void IndexIVFFastScan::search_preassigned( IndexIVFStats* stats) const { size_t nprobe = this->nprobe; if (params) { - FAISS_THROW_IF_NOT_MSG( - !params->quantizer_params, "quantizer params not supported"); FAISS_THROW_IF_NOT(params->max_codes == 0); nprobe = params->nprobe; } + FAISS_THROW_IF_NOT_MSG( !store_pairs, "store_pairs not supported for this index"); FAISS_THROW_IF_NOT_MSG(!stats, "stats not supported for this index"); FAISS_THROW_IF_NOT(k > 0); const CoarseQuantized cq = {nprobe, centroid_dis, assign}; - search_dispatch_implem(n, x, k, distances, labels, cq, nullptr); + search_dispatch_implem(n, x, k, distances, labels, cq, nullptr, params); } void IndexIVFFastScan::range_search( @@ -345,10 +350,18 @@ void IndexIVFFastScan::range_search( const float* x, float radius, RangeSearchResult* result, - const SearchParameters* params) const { - FAISS_THROW_IF_NOT(!params); + const SearchParameters* params_in) const { + size_t nprobe = this->nprobe; + const IVFSearchParameters* params = nullptr; + if (params_in) { + params = dynamic_cast(params_in); + FAISS_THROW_IF_NOT_MSG( + params, "IndexIVFFastScan params have incorrect type"); + nprobe = params->nprobe; + } + const CoarseQuantized cq = {nprobe, nullptr, nullptr}; - range_search_dispatch_implem(n, x, radius, *result, cq, nullptr); + range_search_dispatch_implem(n, x, radius, *result, cq, nullptr, params); } namespace { @@ -359,17 +372,18 @@ ResultHandlerCompare* make_knn_handler_fixC( idx_t n, idx_t k, float* distances, - idx_t* labels) { + idx_t* labels, + const IDSelector* sel) { using HeapHC = HeapHandler; using ReservoirHC = ReservoirHandler; using SingleResultHC = SingleResultHandler; if (k == 1) { - return new SingleResultHC(n, 0, distances, labels); + return new SingleResultHC(n, 0, distances, labels, sel); } else if (impl % 2 == 0) { - return new HeapHC(n, 0, k, distances, labels); + return new HeapHC(n, 0, k, distances, labels, sel); } else /* if (impl % 2 == 1) */ { - return new ReservoirHC(n, 0, k, 2 * k, distances, labels); + return new ReservoirHC(n, 0, k, 2 * k, distances, labels, sel); } } @@ -379,13 +393,14 @@ SIMDResultHandlerToFloat* make_knn_handler( idx_t n, idx_t k, float* distances, - idx_t* labels) { + idx_t* labels, + const IDSelector* sel) { if (is_max) { return make_knn_handler_fixC>( - impl, n, k, distances, labels); + impl, n, k, distances, labels, sel); } else { return make_knn_handler_fixC>( - impl, 
n, k, distances, labels); + impl, n, k, distances, labels, sel); } } @@ -402,10 +417,20 @@ struct CoarseQuantizedWithBuffer : CoarseQuantized { std::vector ids_buffer; std::vector dis_buffer; - void quantize(const Index* quantizer, idx_t n, const float* x) { + void quantize( + const Index* quantizer, + idx_t n, + const float* x, + const SearchParameters* quantizer_params) { dis_buffer.resize(nprobe * n); ids_buffer.resize(nprobe * n); - quantizer->search(n, x, nprobe, dis_buffer.data(), ids_buffer.data()); + quantizer->search( + n, + x, + nprobe, + dis_buffer.data(), + ids_buffer.data(), + quantizer_params); dis = dis_buffer.data(); ids = ids_buffer.data(); } @@ -421,8 +446,11 @@ struct CoarseQuantizedSlice : CoarseQuantizedWithBuffer { } } - void quantize_slice(const Index* quantizer, const float* x) { - quantize(quantizer, i1 - i0, x + quantizer->d * i0); + void quantize_slice( + const Index* quantizer, + const float* x, + const SearchParameters* quantizer_params) { + quantize(quantizer, i1 - i0, x + quantizer->d * i0, quantizer_params); } }; @@ -459,7 +487,13 @@ void IndexIVFFastScan::search_dispatch_implem( float* distances, idx_t* labels, const CoarseQuantized& cq_in, - const NormTableScaler* scaler) const { + const NormTableScaler* scaler, + const IVFSearchParameters* params) const { + const idx_t nprobe = params ? params->nprobe : this->nprobe; + const IDSelector* sel = (params) ? params->sel : nullptr; + const SearchParameters* quantizer_params = + params ? params->quantizer_params : nullptr; + bool is_max = !is_similarity_metric(metric_type); using RH = SIMDResultHandlerToFloat; @@ -489,52 +523,70 @@ void IndexIVFFastScan::search_dispatch_implem( } CoarseQuantizedWithBuffer cq(cq_in); + cq.nprobe = nprobe; if (!cq.done() && !multiple_threads) { // we do the coarse quantization here execpt when search is // sliced over threads (then it is more efficient to have each thread do // its own coarse quantization) - cq.quantize(quantizer, n, x); + cq.quantize(quantizer, n, x, quantizer_params); + invlists->prefetch_lists(cq.ids, n * cq.nprobe); } if (impl == 1) { if (is_max) { search_implem_1>( - n, x, k, distances, labels, cq, scaler); + n, x, k, distances, labels, cq, scaler, params); } else { search_implem_1>( - n, x, k, distances, labels, cq, scaler); + n, x, k, distances, labels, cq, scaler, params); } } else if (impl == 2) { if (is_max) { search_implem_2>( - n, x, k, distances, labels, cq, scaler); + n, x, k, distances, labels, cq, scaler, params); } else { search_implem_2>( - n, x, k, distances, labels, cq, scaler); + n, x, k, distances, labels, cq, scaler, params); } - } else if (impl >= 10 && impl <= 15) { size_t ndis = 0, nlist_visited = 0; if (!multiple_threads) { // clang-format off if (impl == 12 || impl == 13) { - std::unique_ptr handler(make_knn_handler(is_max, impl, n, k, distances, labels)); + std::unique_ptr handler( + make_knn_handler( + is_max, + impl, + n, + k, + distances, + labels, sel + ) + ); search_implem_12( n, x, *handler.get(), - cq, &ndis, &nlist_visited, scaler); - + cq, &ndis, &nlist_visited, scaler, params); } else if (impl == 14 || impl == 15) { - search_implem_14( n, x, k, distances, labels, - cq, impl, scaler); + cq, impl, scaler, params); } else { - std::unique_ptr handler(make_knn_handler(is_max, impl, n, k, distances, labels)); + std::unique_ptr handler( + make_knn_handler( + is_max, + impl, + n, + k, + distances, + labels, + sel + ) + ); search_implem_10( n, x, *handler.get(), cq, - &ndis, &nlist_visited, scaler); + &ndis, &nlist_visited, scaler, 
params); } // clang-format on } else { @@ -543,7 +595,8 @@ void IndexIVFFastScan::search_dispatch_implem( if (impl == 14 || impl == 15) { // this might require slicing if there are too // many queries (for now we keep this simple) - search_implem_14(n, x, k, distances, labels, cq, impl, scaler); + search_implem_14( + n, x, k, distances, labels, cq, impl, scaler, params); } else { #pragma omp parallel for reduction(+ : ndis, nlist_visited) for (int slice = 0; slice < nslice; slice++) { @@ -553,19 +606,19 @@ void IndexIVFFastScan::search_dispatch_implem( idx_t* lab_i = labels + i0 * k; CoarseQuantizedSlice cq_i(cq, i0, i1); if (!cq_i.done()) { - cq_i.quantize_slice(quantizer, x); + cq_i.quantize_slice(quantizer, x, quantizer_params); } std::unique_ptr handler(make_knn_handler( - is_max, impl, i1 - i0, k, dis_i, lab_i)); + is_max, impl, i1 - i0, k, dis_i, lab_i, sel)); // clang-format off if (impl == 12 || impl == 13) { search_implem_12( i1 - i0, x + i0 * d, *handler.get(), - cq_i, &ndis, &nlist_visited, scaler); + cq_i, &ndis, &nlist_visited, scaler, params); } else { search_implem_10( i1 - i0, x + i0 * d, *handler.get(), - cq_i, &ndis, &nlist_visited, scaler); + cq_i, &ndis, &nlist_visited, scaler, params); } // clang-format on } @@ -585,7 +638,13 @@ void IndexIVFFastScan::range_search_dispatch_implem( float radius, RangeSearchResult& rres, const CoarseQuantized& cq_in, - const NormTableScaler* scaler) const { + const NormTableScaler* scaler, + const IVFSearchParameters* params) const { + // const idx_t nprobe = params ? params->nprobe : this->nprobe; + const IDSelector* sel = (params) ? params->sel : nullptr; + const SearchParameters* quantizer_params = + params ? params->quantizer_params : nullptr; + bool is_max = !is_similarity_metric(metric_type); if (n == 0) { @@ -613,7 +672,8 @@ void IndexIVFFastScan::range_search_dispatch_implem( } if (!multiple_threads && !cq.done()) { - cq.quantize(quantizer, n, x); + cq.quantize(quantizer, n, x, quantizer_params); + invlists->prefetch_lists(cq.ids, n * cq.nprobe); } size_t ndis = 0, nlist_visited = 0; @@ -622,10 +682,10 @@ void IndexIVFFastScan::range_search_dispatch_implem( std::unique_ptr handler; if (is_max) { handler.reset(new RangeHandler, true>( - rres, radius, 0)); + rres, radius, 0, sel)); } else { handler.reset(new RangeHandler, true>( - rres, radius, 0)); + rres, radius, 0, sel)); } if (impl == 12) { search_implem_12( @@ -649,17 +709,17 @@ void IndexIVFFastScan::range_search_dispatch_implem( idx_t i1 = n * (slice + 1) / nslice; CoarseQuantizedSlice cq_i(cq, i0, i1); if (!cq_i.done()) { - cq_i.quantize_slice(quantizer, x); + cq_i.quantize_slice(quantizer, x, quantizer_params); } std::unique_ptr handler; if (is_max) { handler.reset(new PartialRangeHandler< CMax, - true>(pres, radius, 0, i0, i1)); + true>(pres, radius, 0, i0, i1, sel)); } else { handler.reset(new PartialRangeHandler< CMin, - true>(pres, radius, 0, i0, i1)); + true>(pres, radius, 0, i0, i1, sel)); } if (impl == 12 || impl == 13) { @@ -670,7 +730,8 @@ void IndexIVFFastScan::range_search_dispatch_implem( cq_i, &ndis, &nlist_visited, - scaler); + scaler, + params); } else { search_implem_10( i1 - i0, @@ -679,7 +740,8 @@ void IndexIVFFastScan::range_search_dispatch_implem( cq_i, &ndis, &nlist_visited, - scaler); + scaler, + params); } } pres.finalize(); @@ -699,7 +761,8 @@ void IndexIVFFastScan::search_implem_1( float* distances, idx_t* labels, const CoarseQuantized& cq, - const NormTableScaler* scaler) const { + const NormTableScaler* scaler, + const IVFSearchParameters* 
params) const { FAISS_THROW_IF_NOT(orig_invlists); size_t dim12 = ksub * M; @@ -766,7 +829,8 @@ void IndexIVFFastScan::search_implem_2( float* distances, idx_t* labels, const CoarseQuantized& cq, - const NormTableScaler* scaler) const { + const NormTableScaler* scaler, + const IVFSearchParameters* params) const { FAISS_THROW_IF_NOT(orig_invlists); size_t dim12 = ksub * M2; @@ -848,7 +912,12 @@ void IndexIVFFastScan::search_implem_10( const CoarseQuantized& cq, size_t* ndis_out, size_t* nlist_out, - const NormTableScaler* scaler) const { + const NormTableScaler* scaler, + const IVFSearchParameters* params) const { + const size_t max_codes = params ? params->max_codes : this->max_codes; + const SearchParameters* quantizer_params = + params ? params->quantizer_params : nullptr; + size_t dim12 = ksub * M2; AlignedTable dis_tables; AlignedTable biases; @@ -909,6 +978,7 @@ void IndexIVFFastScan::search_implem_10( ndis++; } } + handler.end(); *ndis_out = ndis; *nlist_out = nlist; @@ -921,7 +991,8 @@ void IndexIVFFastScan::search_implem_12( const CoarseQuantized& cq, size_t* ndis_out, size_t* nlist_out, - const NormTableScaler* scaler) const { + const NormTableScaler* scaler, + const IVFSearchParameters* params) const { if (n == 0) { // does not work well with reservoir return; } @@ -933,6 +1004,7 @@ void IndexIVFFastScan::search_implem_12( std::unique_ptr normalizers(new float[2 * n]); compute_LUT_uint8(n, x, cq, dis_tables, biases, normalizers.get()); + handler.begin(skip & 16 ? nullptr : normalizers.get()); struct QC { @@ -958,6 +1030,7 @@ void IndexIVFFastScan::search_implem_12( return a.list_no < b.list_no; }); } + // prepare the result handlers int qbs2 = this->qbs2 ? this->qbs2 : 11; @@ -1049,12 +1122,15 @@ void IndexIVFFastScan::search_implem_14( idx_t* labels, const CoarseQuantized& cq, int impl, - const NormTableScaler* scaler) const { + const NormTableScaler* scaler, + const IVFSearchParameters* params) const { if (n == 0) { // does not work well with reservoir return; } FAISS_THROW_IF_NOT(bbs == 32); + const IDSelector* sel = params ? params->sel : nullptr; + size_t dim12 = ksub * M2; AlignedTable dis_tables; AlignedTable biases; @@ -1157,7 +1233,7 @@ void IndexIVFFastScan::search_implem_14( // prepare the result handlers std::unique_ptr handler(make_knn_handler( - is_max, impl, n, k, local_dis.data(), local_idx.data())); + is_max, impl, n, k, local_dis.data(), local_idx.data(), sel)); handler->begin(normalizers.get()); int qbs2 = this->qbs2 ? 
this->qbs2 : 11; @@ -1167,6 +1243,7 @@ void IndexIVFFastScan::search_implem_14( tmp_bias.resize(qbs2); handler->dbias = tmp_bias.data(); } + std::set q_set; uint64_t t_copy_pack = 0, t_scan = 0; #pragma omp for schedule(dynamic) diff --git a/faiss/IndexIVFFastScan.h b/faiss/IndexIVFFastScan.h index 159a3a7098..9d4c4910d3 100644 --- a/faiss/IndexIVFFastScan.h +++ b/faiss/IndexIVFFastScan.h @@ -148,7 +148,8 @@ struct IndexIVFFastScan : IndexIVF { float* distances, idx_t* labels, const CoarseQuantized& cq, - const NormTableScaler* scaler) const; + const NormTableScaler* scaler, + const IVFSearchParameters* params = nullptr) const; void range_search_dispatch_implem( idx_t n, @@ -156,7 +157,8 @@ struct IndexIVFFastScan : IndexIVF { float radius, RangeSearchResult& rres, const CoarseQuantized& cq_in, - const NormTableScaler* scaler) const; + const NormTableScaler* scaler, + const IVFSearchParameters* params = nullptr) const; // impl 1 and 2 are just for verification template @@ -167,7 +169,8 @@ struct IndexIVFFastScan : IndexIVF { float* distances, idx_t* labels, const CoarseQuantized& cq, - const NormTableScaler* scaler) const; + const NormTableScaler* scaler, + const IVFSearchParameters* params = nullptr) const; template void search_implem_2( @@ -177,7 +180,8 @@ struct IndexIVFFastScan : IndexIVF { float* distances, idx_t* labels, const CoarseQuantized& cq, - const NormTableScaler* scaler) const; + const NormTableScaler* scaler, + const IVFSearchParameters* params = nullptr) const; // implem 10 and 12 are not multithreaded internally, so // export search stats @@ -188,7 +192,8 @@ struct IndexIVFFastScan : IndexIVF { const CoarseQuantized& cq, size_t* ndis_out, size_t* nlist_out, - const NormTableScaler* scaler) const; + const NormTableScaler* scaler, + const IVFSearchParameters* params = nullptr) const; void search_implem_12( idx_t n, @@ -197,7 +202,8 @@ struct IndexIVFFastScan : IndexIVF { const CoarseQuantized& cq, size_t* ndis_out, size_t* nlist_out, - const NormTableScaler* scaler) const; + const NormTableScaler* scaler, + const IVFSearchParameters* params = nullptr) const; // implem 14 is multithreaded internally across nprobes and queries void search_implem_14( @@ -208,7 +214,8 @@ struct IndexIVFFastScan : IndexIVF { idx_t* labels, const CoarseQuantized& cq, int impl, - const NormTableScaler* scaler) const; + const NormTableScaler* scaler, + const IVFSearchParameters* params = nullptr) const; // reconstruct vectors from packed invlists void reconstruct_from_offset(int64_t list_no, int64_t offset, float* recons) diff --git a/faiss/impl/simd_result_handlers.h b/faiss/impl/simd_result_handlers.h index 633d480990..2fa18fa340 100644 --- a/faiss/impl/simd_result_handlers.h +++ b/faiss/impl/simd_result_handlers.h @@ -15,6 +15,7 @@ #include #include +#include #include #include #include @@ -137,6 +138,7 @@ struct FixedStorageHandler : SIMDResultHandler { } } } + virtual ~FixedStorageHandler() {} }; @@ -150,8 +152,10 @@ struct ResultHandlerCompare : SIMDResultHandlerToFloat { int64_t i0 = 0; // query origin int64_t j0 = 0; // db origin - ResultHandlerCompare(size_t nq, size_t ntotal) - : SIMDResultHandlerToFloat(nq, ntotal) { + const IDSelector* sel; + + ResultHandlerCompare(size_t nq, size_t ntotal, const IDSelector* sel_in) + : SIMDResultHandlerToFloat(nq, ntotal), sel{sel_in} { this->is_CMax = C::is_max; this->sizeof_ids = sizeof(typename C::TI); this->with_fields = with_id_map; @@ -232,9 +236,14 @@ struct SingleResultHandler : ResultHandlerCompare { float* dis; int64_t* ids; - 
SingleResultHandler(size_t nq, size_t ntotal, float* dis, int64_t* ids) - : RHC(nq, ntotal), idis(nq), dis(dis), ids(ids) { - for (int i = 0; i < nq; i++) { + SingleResultHandler( + size_t nq, + size_t ntotal, + float* dis, + int64_t* ids, + const IDSelector* sel_in) + : RHC(nq, ntotal, sel_in), idis(nq), dis(dis), ids(ids) { + for (size_t i = 0; i < nq; i++) { ids[i] = -1; idis[i] = C::neutral(); } @@ -256,20 +265,36 @@ struct SingleResultHandler : ResultHandlerCompare { d0.store(d32tab); d1.store(d32tab + 16); - while (lt_mask) { - // find first non-zero - int j = __builtin_ctz(lt_mask); - lt_mask -= 1 << j; - T d = d32tab[j]; - if (C::cmp(idis[q], d)) { - idis[q] = d; - ids[q] = this->adjust_id(b, j); + if (this->sel != nullptr) { + while (lt_mask) { + // find first non-zero + int j = __builtin_ctz(lt_mask); + auto real_idx = this->adjust_id(b, j); + lt_mask -= 1 << j; + if (this->sel->is_member(real_idx)) { + T d = d32tab[j]; + if (C::cmp(idis[q], d)) { + idis[q] = d; + ids[q] = real_idx; + } + } + } + } else { + while (lt_mask) { + // find first non-zero + int j = __builtin_ctz(lt_mask); + lt_mask -= 1 << j; + T d = d32tab[j]; + if (C::cmp(idis[q], d)) { + idis[q] = d; + ids[q] = this->adjust_id(b, j); + } } } } void end() { - for (int q = 0; q < this->nq; q++) { + for (size_t q = 0; q < this->nq; q++) { if (!normalizers) { dis[q] = idis[q]; } else { @@ -296,8 +321,14 @@ struct HeapHandler : ResultHandlerCompare { int64_t k; // number of results to keep - HeapHandler(size_t nq, size_t ntotal, int64_t k, float* dis, int64_t* ids) - : RHC(nq, ntotal), + HeapHandler( + size_t nq, + size_t ntotal, + int64_t k, + float* dis, + int64_t* ids, + const IDSelector* sel_in) + : RHC(nq, ntotal, sel_in), idis(nq * k), iids(nq * k), dis(dis), @@ -330,21 +361,36 @@ struct HeapHandler : ResultHandlerCompare { d0.store(d32tab); d1.store(d32tab + 16); - while (lt_mask) { - // find first non-zero - int j = __builtin_ctz(lt_mask); - lt_mask -= 1 << j; - T dis = d32tab[j]; - if (C::cmp(heap_dis[0], dis)) { - int64_t idx = this->adjust_id(b, j); - heap_pop(k, heap_dis, heap_ids); - heap_push(k, heap_dis, heap_ids, dis, idx); + if (this->sel != nullptr) { + while (lt_mask) { + // find first non-zero + int j = __builtin_ctz(lt_mask); + auto real_idx = this->adjust_id(b, j); + lt_mask -= 1 << j; + if (this->sel->is_member(real_idx)) { + T dis = d32tab[j]; + if (C::cmp(heap_dis[0], dis)) { + heap_replace_top( + k, heap_dis, heap_ids, dis, real_idx); + } + } + } + } else { + while (lt_mask) { + // find first non-zero + int j = __builtin_ctz(lt_mask); + lt_mask -= 1 << j; + T dis = d32tab[j]; + if (C::cmp(heap_dis[0], dis)) { + int64_t idx = this->adjust_id(b, j); + heap_replace_top(k, heap_dis, heap_ids, dis, idx); + } } } } void end() override { - for (int q = 0; q < this->nq; q++) { + for (size_t q = 0; q < this->nq; q++) { T* heap_dis_in = idis.data() + q * k; TI* heap_ids_in = iids.data() + q * k; heap_reorder(k, heap_dis_in, heap_ids_in); @@ -393,8 +439,12 @@ struct ReservoirHandler : ResultHandlerCompare { size_t k, size_t cap, float* dis, - int64_t* ids) - : RHC(nq, ntotal), capacity((cap + 15) & ~15), dis(dis), ids(ids) { + int64_t* ids, + const IDSelector* sel_in) + : RHC(nq, ntotal, sel_in), + capacity((cap + 15) & ~15), + dis(dis), + ids(ids) { assert(capacity % 16 == 0); all_ids.resize(nq * capacity); all_vals.resize(nq * capacity); @@ -423,12 +473,25 @@ struct ReservoirHandler : ResultHandlerCompare { d0.store(d32tab); d1.store(d32tab + 16); - while (lt_mask) { - // find first non-zero - int j = 
__builtin_ctz(lt_mask); - lt_mask -= 1 << j; - T dis = d32tab[j]; - res.add(dis, this->adjust_id(b, j)); + if (this->sel != nullptr) { + while (lt_mask) { + // find first non-zero + int j = __builtin_ctz(lt_mask); + auto real_idx = this->adjust_id(b, j); + lt_mask -= 1 << j; + if (this->sel->is_member(real_idx)) { + T dis = d32tab[j]; + res.add(dis, real_idx); + } + } + } else { + while (lt_mask) { + // find first non-zero + int j = __builtin_ctz(lt_mask); + lt_mask -= 1 << j; + T dis = d32tab[j]; + res.add(dis, this->adjust_id(b, j)); + } } } @@ -439,7 +502,7 @@ struct ReservoirHandler : ResultHandlerCompare { CMin>::type; std::vector perm(reservoirs[0].n); - for (int q = 0; q < reservoirs.size(); q++) { + for (size_t q = 0; q < reservoirs.size(); q++) { ReservoirTopN& res = reservoirs[q]; size_t n = res.n; @@ -454,14 +517,14 @@ struct ReservoirHandler : ResultHandlerCompare { one_a = 1 / normalizers[2 * q]; b = normalizers[2 * q + 1]; } - for (int i = 0; i < res.i; i++) { + for (size_t i = 0; i < res.i; i++) { perm[i] = i; } // indirect sort of result arrays std::sort(perm.begin(), perm.begin() + res.i, [&res](int i, int j) { return C::cmp(res.vals[j], res.vals[i]); }); - for (int i = 0; i < res.i; i++) { + for (size_t i = 0; i < res.i; i++) { heap_dis[i] = res.vals[perm[i]] * one_a + b; heap_ids[i] = res.ids[perm[i]]; } @@ -499,8 +562,12 @@ struct RangeHandler : ResultHandlerCompare { }; std::vector triplets; - RangeHandler(RangeSearchResult& rres, float radius, size_t ntotal) - : RHC(rres.nq, ntotal), rres(rres), radius(radius) { + RangeHandler( + RangeSearchResult& rres, + float radius, + size_t ntotal, + const IDSelector* sel_in) + : RHC(rres.nq, ntotal, sel_in), rres(rres), radius(radius) { thresholds.resize(nq); n_per_query.resize(nq + 1); } @@ -528,13 +595,28 @@ struct RangeHandler : ResultHandlerCompare { d0.store(d32tab); d1.store(d32tab + 16); - while (lt_mask) { - // find first non-zero - int j = __builtin_ctz(lt_mask); - lt_mask -= 1 << j; - T dis = d32tab[j]; - n_per_query[q]++; - triplets.push_back({idx_t(q + q0), this->adjust_id(b, j), dis}); + if (this->sel != nullptr) { + while (lt_mask) { + // find first non-zero + int j = __builtin_ctz(lt_mask); + lt_mask -= 1 << j; + + auto real_idx = this->adjust_id(b, j); + if (this->sel->is_member(real_idx)) { + T dis = d32tab[j]; + n_per_query[q]++; + triplets.push_back({idx_t(q + q0), real_idx, dis}); + } + } + } else { + while (lt_mask) { + // find first non-zero + int j = __builtin_ctz(lt_mask); + lt_mask -= 1 << j; + T dis = d32tab[j]; + n_per_query[q]++; + triplets.push_back({idx_t(q + q0), this->adjust_id(b, j), dis}); + } } } @@ -578,8 +660,9 @@ struct PartialRangeHandler : RangeHandler { float radius, size_t ntotal, size_t q0, - size_t q1) - : RangeHandler(*pres.res, radius, ntotal), + size_t q1, + const IDSelector* sel_in) + : RangeHandler(*pres.res, radius, ntotal, sel_in), pres(pres) { nq = q1 - q0; this->q0 = q0; @@ -698,6 +781,7 @@ void dispatch_SIMDResultHanlder( FAISS_THROW_FMT("Unknown id size %d", res.sizeof_ids); } } + } // namespace simd_result_handlers } // namespace faiss From 40e86433368095dd5e257474439195ced551017c Mon Sep 17 00:00:00 2001 From: Matthijs Douze Date: Thu, 11 Apr 2024 14:23:46 -0700 Subject: [PATCH 034/116] selector parameter for FastScan (#3362) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3362 Add test to Alex' PR Reviewed By: junjieqi Differential Revision: D56003946 fbshipit-source-id: 5a8a881d450bc97ae0777d73ce0ce8607ec6b686 --- 
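As exercised by the test additions that follow, the selector and search-parameter propagation added in patches 033/034 can be driven from Python roughly as follows. This is a minimal sketch, not part of the patch itself: it assumes the faiss Python bindings expose `SearchParametersIVF`, `IDSelectorBatch` and the `params=` keyword of `search()` as in recent releases, and the dataset sizes are arbitrary.

```python
import numpy as np
import faiss

d = 32
rng = np.random.default_rng(123)
xb = rng.random((2000, d), dtype='float32')   # database vectors
xq = rng.random((5, d), dtype='float32')      # query vectors

# IVF index with 4-bit PQ fast-scan codes, as in the tests below
index = faiss.index_factory(d, "IVF32,PQ4x4fs")
index.train(xb)
index.add(xb)

# only every third database vector is allowed in the results
keep = np.arange(0, xb.shape[0], 3)
sel = faiss.IDSelectorBatch(keep)

# the selector and nprobe travel through the search parameters
params = faiss.SearchParametersIVF(sel=sel, nprobe=5)
D, I = index.search(xq, 10, params=params)    # I contains only ids from keep (or -1)
```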
tests/test_search_params.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/tests/test_search_params.py b/tests/test_search_params.py index 954d39cd00..22b845c2ea 100644 --- a/tests/test_search_params.py +++ b/tests/test_search_params.py @@ -22,7 +22,7 @@ class TestSelector(unittest.TestCase): combinations as possible. """ - def do_test_id_selector(self, index_key, id_selector_type="batch", mt=faiss.METRIC_L2): + def do_test_id_selector(self, index_key, id_selector_type="batch", mt=faiss.METRIC_L2, k=10): """ Verify that the id selector returns the subset of results that are members according to the IDSelector. Supports id_selector_type="batch", "bitmap", "range", "range_sorted", "and", "or", "xor" @@ -30,7 +30,6 @@ def do_test_id_selector(self, index_key, id_selector_type="batch", mt=faiss.METR ds = datasets.SyntheticDataset(32, 1000, 100, 20) index = faiss.index_factory(ds.d, index_key, mt) index.train(ds.get_train()) - k = 10 # reference result if "range" in id_selector_type: @@ -145,6 +144,16 @@ def test_IVFFlat_range_sorted(self): def test_IVFPQ(self): self.do_test_id_selector("IVF32,PQ4x4np") + def test_IVFPQfs(self): + self.do_test_id_selector("IVF32,PQ4x4fs") + + def test_IVFPQfs_k1(self): + self.do_test_id_selector("IVF32,PQ4x4fs", k=1) + + def test_IVFPQfs_k40(self): + # test reservoir codepath + self.do_test_id_selector("IVF32,PQ4x4fs", k=40) + def test_IVFSQ(self): self.do_test_id_selector("IVF32,SQ8") From acd06d62119bad16a3e0ed3aee9fa63837c2e58c Mon Sep 17 00:00:00 2001 From: Junjie Qi Date: Fri, 12 Apr 2024 10:52:31 -0700 Subject: [PATCH 035/116] Switch sprintf to snprintf (#3363) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3363 'sprintf' is deprecated: This function is provided for compatibility reasons only. Due to security concerns inherent in the design of sprintf(3), it is highly recommended that you use snprintf(3) instead {F1484071654} Reviewed By: kuarora Differential Revision: D56009251 fbshipit-source-id: ec222cf589ff98b016979058d59fc20cccec8f43 --- faiss/impl/io.cpp | 2 +- faiss/utils/simdlib_neon.h | 9 ++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/faiss/impl/io.cpp b/faiss/impl/io.cpp index 3d3af95036..5f5b2d5ebd 100644 --- a/faiss/impl/io.cpp +++ b/faiss/impl/io.cpp @@ -267,7 +267,7 @@ std::string fourcc_inv_printable(uint32_t x) { str += c; } else { char buf[10]; - sprintf(buf, "\\x%02x", c); + snprintf(buf, sizeof(buf), "\\x%02x", c); str += buf; } } diff --git a/faiss/utils/simdlib_neon.h b/faiss/utils/simdlib_neon.h index 656a561217..439a5210bc 100644 --- a/faiss/utils/simdlib_neon.h +++ b/faiss/utils/simdlib_neon.h @@ -168,9 +168,16 @@ static inline std::string elements_to_string(const char* fmt, const S& simd) { simd.store(bytes); char res[1000], *ptr = res; for (size_t i = 0; i < N; ++i) { - ptr += sprintf(ptr, fmt, bytes[i]); + int bytesWritten = + snprintf(ptr, sizeof(res) - (ptr - res), fmt, bytes[i]); + if (bytesWritten >= 0) { + ptr += bytesWritten; + } else { + break; + } } // strip last , + ptr[-1] = 0; return std::string(res); } From a35eb0ac11908f1ab1a6056dca15e74fb06cd4e7 Mon Sep 17 00:00:00 2001 From: Richard Barnes Date: Fri, 12 Apr 2024 13:03:17 -0700 Subject: [PATCH 036/116] Remove unused variables in faiss/IndexIVF.cpp Summary: LLVM-15 has a warning `-Wunused-but-set-variable` which we treat as an error because it's so often diagnostic of a code issue. Unused variables can compromise readability or, worse, performance. 
This diff either (a) removes an unused variable and, possibly, its associated code, or (b) qualifies the variable with `[[maybe_unused]]`, mostly in cases where the variable _is_ used, but, e.g., in an `assert` statement that isn't present in production code. - If you approve of this diff, please use the "Accept & Ship" button :-) Reviewed By: dmm-fb Differential Revision: D56065763 fbshipit-source-id: b0541b8a759c4b6ca0e8753fc24b8c227047bd3d --- demos/demo_imi_pq.cpp | 1 - faiss/IndexIVF.cpp | 4 ++-- faiss/utils/distances.cpp | 4 ++-- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/demos/demo_imi_pq.cpp b/demos/demo_imi_pq.cpp index a2af65e792..4fab0778d8 100644 --- a/demos/demo_imi_pq.cpp +++ b/demos/demo_imi_pq.cpp @@ -77,7 +77,6 @@ int main() { // the coarse quantizer should not be dealloced before the index // 4 = nb of bytes per code (d must be a multiple of this) // 8 = nb of bits per sub-code (almost always 8) - faiss::MetricType metric = faiss::METRIC_L2; // can be METRIC_INNER_PRODUCT faiss::IndexIVFPQ index( &coarse_quantizer, d, ncentroids, bytes_per_code, 8); index.quantizer_trains_alone = true; diff --git a/faiss/IndexIVF.cpp b/faiss/IndexIVF.cpp index 95d3bc9e68..548aaa4cc7 100644 --- a/faiss/IndexIVF.cpp +++ b/faiss/IndexIVF.cpp @@ -444,7 +444,7 @@ void IndexIVF::search_preassigned( max_codes = unlimited_list_size; } - bool do_parallel = omp_get_max_threads() >= 2 && + [[maybe_unused]] bool do_parallel = omp_get_max_threads() >= 2 && (pmode == 0 ? false : pmode == 3 ? n > 1 : pmode == 1 ? nprobe > 1 @@ -784,7 +784,7 @@ void IndexIVF::range_search_preassigned( int pmode = this->parallel_mode & ~PARALLEL_MODE_NO_HEAP_INIT; // don't start parallel section if single query - bool do_parallel = omp_get_max_threads() >= 2 && + [[maybe_unused]] bool do_parallel = omp_get_max_threads() >= 2 && (pmode == 3 ? false : pmode == 0 ? nx > 1 : pmode == 1 ? nprobe > 1 diff --git a/faiss/utils/distances.cpp b/faiss/utils/distances.cpp index 82bc164ae1..74b56bcc87 100644 --- a/faiss/utils/distances.cpp +++ b/faiss/utils/distances.cpp @@ -141,7 +141,7 @@ void exhaustive_inner_product_seq( const IDSelector* sel = nullptr) { using SingleResultHandler = typename BlockResultHandler::SingleResultHandler; - int nt = std::min(int(nx), omp_get_max_threads()); + [[maybe_unused]] int nt = std::min(int(nx), omp_get_max_threads()); FAISS_ASSERT(use_sel == (sel != nullptr)); @@ -178,7 +178,7 @@ void exhaustive_L2sqr_seq( const IDSelector* sel = nullptr) { using SingleResultHandler = typename BlockResultHandler::SingleResultHandler; - int nt = std::min(int(nx), omp_get_max_threads()); + [[maybe_unused]] int nt = std::min(int(nx), omp_get_max_threads()); FAISS_ASSERT(use_sel == (sel != nullptr)); From ab2b7f50936b3bafd4e076f798413da2399482bd Mon Sep 17 00:00:00 2001 From: Andres Suarez Date: Sun, 14 Apr 2024 11:28:32 -0700 Subject: [PATCH 037/116] Apply clang-format 18 Summary: Previously this code conformed to clang-format 12.
Reviewed By: igorsugak Differential Revision: D56065247 fbshipit-source-id: f5a985dd8f8b84f2f9e1818b3719b43c5a1b05b3 --- benchs/bench_cppcontrib_sa_decode.cpp | 151 ++++++++++--------------- faiss/IndexBinaryIVF.cpp | 2 +- faiss/gpu/utils/Tensor.cuh | 7 +- faiss/impl/HNSW.cpp | 4 +- faiss/invlists/OnDiskInvertedLists.cpp | 4 +- 5 files changed, 67 insertions(+), 101 deletions(-) diff --git a/benchs/bench_cppcontrib_sa_decode.cpp b/benchs/bench_cppcontrib_sa_decode.cpp index c5c6b0bf18..f0266172a8 100644 --- a/benchs/bench_cppcontrib_sa_decode.cpp +++ b/benchs/bench_cppcontrib_sa_decode.cpp @@ -214,9 +214,8 @@ static void verifyIndex2LevelDecoder( double error = getError(n, d, outputFaiss, outputKernel1); std::cout << description << "\t" << n << "\t" << d << "\t" - << "store_seq" - << "\t" << nIterations << "\t" << timeFaiss << "\t" - << timeKernel << "\t" << error << std::endl; + << "store_seq" << "\t" << nIterations << "\t" << timeFaiss + << "\t" << timeKernel << "\t" << error << std::endl; } ////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -264,9 +263,8 @@ static void verifyIndex2LevelDecoder( const double error = getError(n, d, outputFaiss, outputKernel1); std::cout << description << "\t" << n << "\t" << d << "\t" - << "store_rnd" - << "\t" << nIterations << "\t" << timeFaiss << "\t" - << timeKernel << "\t" << error << std::endl; + << "store_rnd" << "\t" << nIterations << "\t" << timeFaiss + << "\t" << timeKernel << "\t" << error << std::endl; } ////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -327,9 +325,8 @@ static void verifyIndex2LevelDecoder( const double error1 = getError(n, d, outputFaiss, outputKernel1); std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum_rnd" - << "\t" << nIterations << "\t" << timeFaiss << "\t" - << timeKernel1 << "\t" << error1 << std::endl; + << "accum_rnd" << "\t" << nIterations << "\t" << timeFaiss + << "\t" << timeKernel1 << "\t" << error1 << std::endl; // kernels: accum 2 points, shared centroids StopWatch swKernel2; @@ -357,9 +354,8 @@ static void verifyIndex2LevelDecoder( const double error2 = getError(n, d, outputFaiss, outputKernel2); std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum2_rnd" - << "\t" << nIterations << "\t" << timeFaiss << "\t" - << timeKernel2 << "\t" << error2 << std::endl; + << "accum2_rnd" << "\t" << nIterations << "\t" << timeFaiss + << "\t" << timeKernel2 << "\t" << error2 << std::endl; // kernels: accum 2 points, unique centroids StopWatch swKernel2u; @@ -389,9 +385,8 @@ static void verifyIndex2LevelDecoder( const double error2u = getError(n, d, outputFaiss, outputKernel2u); std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum2u_rnd" - << "\t" << nIterations << "\t" << timeFaiss << "\t" - << timeKernel2u << "\t" << error2u << std::endl; + << "accum2u_rnd" << "\t" << nIterations << "\t" << timeFaiss + << "\t" << timeKernel2u << "\t" << error2u << std::endl; // kernels: accum 3 points, shared centroids StopWatch swKernel3; @@ -424,9 +419,8 @@ static void verifyIndex2LevelDecoder( const double error3 = getError(n, d, outputFaiss, outputKernel3); std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum3_rnd" - << "\t" << nIterations << "\t" << timeFaiss << "\t" - << timeKernel3 << "\t" << error3 << std::endl; + << "accum3_rnd" << "\t" << nIterations << "\t" << timeFaiss + << "\t" << timeKernel3 << "\t" << error3 << std::endl; // kernels: 
accum 3 points, unique centroids StopWatch swKernel3u; @@ -463,9 +457,8 @@ static void verifyIndex2LevelDecoder( const double error3u = getError(n, d, outputFaiss, outputKernel3u); std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum3u_rnd" - << "\t" << nIterations << "\t" << timeFaiss << "\t" - << timeKernel3u << "\t" << error3u << std::endl; + << "accum3u_rnd" << "\t" << nIterations << "\t" << timeFaiss + << "\t" << timeKernel3u << "\t" << error3u << std::endl; } } @@ -532,9 +525,8 @@ static void verifyMinMaxIndex2LevelDecoder( double error = getError(n, d, outputFaiss, outputKernel1); std::cout << description << "\t" << n << "\t" << d << "\t" - << "store_seq" - << "\t" << nIterations << "\t" << timeFaiss << "\t" - << timeKernel << "\t" << error << std::endl; + << "store_seq" << "\t" << nIterations << "\t" << timeFaiss + << "\t" << timeKernel << "\t" << error << std::endl; } ////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -582,9 +574,8 @@ static void verifyMinMaxIndex2LevelDecoder( const double error = getError(n, d, outputFaiss, outputKernel1); std::cout << description << "\t" << n << "\t" << d << "\t" - << "store_rnd" - << "\t" << nIterations << "\t" << timeFaiss << "\t" - << timeKernel << "\t" << error << std::endl; + << "store_rnd" << "\t" << nIterations << "\t" << timeFaiss + << "\t" << timeKernel << "\t" << error << std::endl; } ////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -651,9 +642,8 @@ static void verifyMinMaxIndex2LevelDecoder( const double error1 = getError(n, d, outputFaiss, outputKernel1); std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum_rnd" - << "\t" << nIterations << "\t" << timeFaiss << "\t" - << timeKernel1 << "\t" << error1 << std::endl; + << "accum_rnd" << "\t" << nIterations << "\t" << timeFaiss + << "\t" << timeKernel1 << "\t" << error1 << std::endl; // kernels: accum 2 points, shared centroids StopWatch swKernel2; @@ -686,9 +676,8 @@ static void verifyMinMaxIndex2LevelDecoder( const double error2 = getError(n, d, outputFaiss, outputKernel2); std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum2_rnd" - << "\t" << nIterations << "\t" << timeFaiss << "\t" - << timeKernel2 << "\t" << error2 << std::endl; + << "accum2_rnd" << "\t" << nIterations << "\t" << timeFaiss + << "\t" << timeKernel2 << "\t" << error2 << std::endl; // kernels: accum 2 points, unique centroids StopWatch swKernel2u; @@ -723,9 +712,8 @@ static void verifyMinMaxIndex2LevelDecoder( const double error2u = getError(n, d, outputFaiss, outputKernel2u); std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum2u_rnd" - << "\t" << nIterations << "\t" << timeFaiss << "\t" - << timeKernel2u << "\t" << error2u << std::endl; + << "accum2u_rnd" << "\t" << nIterations << "\t" << timeFaiss + << "\t" << timeKernel2u << "\t" << error2u << std::endl; // kernels: accum 3 points, shared centroids StopWatch swKernel3; @@ -763,9 +751,8 @@ static void verifyMinMaxIndex2LevelDecoder( const double error3 = getError(n, d, outputFaiss, outputKernel3); std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum3_rnd" - << "\t" << nIterations << "\t" << timeFaiss << "\t" - << timeKernel3 << "\t" << error3 << std::endl; + << "accum3_rnd" << "\t" << nIterations << "\t" << timeFaiss + << "\t" << timeKernel3 << "\t" << error3 << std::endl; // kernels: accum 3 points, unique centroids StopWatch swKernel3u; @@ -807,9 +794,8 @@ 
static void verifyMinMaxIndex2LevelDecoder( const double error3u = getError(n, d, outputFaiss, outputKernel3u); std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum3u_rnd" - << "\t" << nIterations << "\t" << timeFaiss << "\t" - << timeKernel3u << "\t" << error3u << std::endl; + << "accum3u_rnd" << "\t" << nIterations << "\t" << timeFaiss + << "\t" << timeKernel3u << "\t" << error3u << std::endl; } } @@ -866,9 +852,8 @@ static void verifyIndexPQDecoder( double error = getError(n, d, outputFaiss, outputKernel1); std::cout << description << "\t" << n << "\t" << d << "\t" - << "store_seq" - << "\t" << nIterations << "\t" << timeFaiss << "\t" - << timeKernel << "\t" << error << std::endl; + << "store_seq" << "\t" << nIterations << "\t" << timeFaiss + << "\t" << timeKernel << "\t" << error << std::endl; } ////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -915,9 +900,8 @@ static void verifyIndexPQDecoder( const double error = getError(n, d, outputFaiss, outputKernel1); std::cout << description << "\t" << n << "\t" << d << "\t" - << "store_rnd" - << "\t" << nIterations << "\t" << timeFaiss << "\t" - << timeKernel << "\t" << error << std::endl; + << "store_rnd" << "\t" << nIterations << "\t" << timeFaiss + << "\t" << timeKernel << "\t" << error << std::endl; } ////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -978,9 +962,8 @@ static void verifyIndexPQDecoder( const double error1 = getError(n, d, outputFaiss, outputKernel1); std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum_rnd" - << "\t" << nIterations << "\t" << timeFaiss << "\t" - << timeKernel1 << "\t" << error1 << std::endl; + << "accum_rnd" << "\t" << nIterations << "\t" << timeFaiss + << "\t" << timeKernel1 << "\t" << error1 << std::endl; // kernels: accum 2 points, shared centroids StopWatch swKernel2; @@ -1007,9 +990,8 @@ static void verifyIndexPQDecoder( const double error2 = getError(n, d, outputFaiss, outputKernel2); std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum2_rnd" - << "\t" << nIterations << "\t" << timeFaiss << "\t" - << timeKernel2 << "\t" << error2 << std::endl; + << "accum2_rnd" << "\t" << nIterations << "\t" << timeFaiss + << "\t" << timeKernel2 << "\t" << error2 << std::endl; // kernels: accum 2 points, unique centroids StopWatch swKernel2u; @@ -1037,9 +1019,8 @@ static void verifyIndexPQDecoder( const double error2u = getError(n, d, outputFaiss, outputKernel2u); std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum2u_rnd" - << "\t" << nIterations << "\t" << timeFaiss << "\t" - << timeKernel2u << "\t" << error2u << std::endl; + << "accum2u_rnd" << "\t" << nIterations << "\t" << timeFaiss + << "\t" << timeKernel2u << "\t" << error2u << std::endl; // kernels: accum 3 points, shared centroids StopWatch swKernel3; @@ -1071,9 +1052,8 @@ static void verifyIndexPQDecoder( const double error3 = getError(n, d, outputFaiss, outputKernel3); std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum3_rnd" - << "\t" << nIterations << "\t" << timeFaiss << "\t" - << timeKernel3 << "\t" << error3 << std::endl; + << "accum3_rnd" << "\t" << nIterations << "\t" << timeFaiss + << "\t" << timeKernel3 << "\t" << error3 << std::endl; // kernels: accum 3 points, unique centroids StopWatch swKernel3u; @@ -1107,9 +1087,8 @@ static void verifyIndexPQDecoder( const double error3u = getError(n, d, outputFaiss, outputKernel3u); std::cout << 
description << "\t" << n << "\t" << d << "\t" - << "accum3u_rnd" - << "\t" << nIterations << "\t" << timeFaiss << "\t" - << timeKernel3u << "\t" << error3u << std::endl; + << "accum3u_rnd" << "\t" << nIterations << "\t" << timeFaiss + << "\t" << timeKernel3u << "\t" << error3u << std::endl; } } @@ -1171,9 +1150,8 @@ static void verifyMinMaxIndexPQDecoder( double error = getError(n, d, outputFaiss, outputKernel1); std::cout << description << "\t" << n << "\t" << d << "\t" - << "store_seq" - << "\t" << nIterations << "\t" << timeFaiss << "\t" - << timeKernel << "\t" << error << std::endl; + << "store_seq" << "\t" << nIterations << "\t" << timeFaiss + << "\t" << timeKernel << "\t" << error << std::endl; } ////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -1220,9 +1198,8 @@ static void verifyMinMaxIndexPQDecoder( const double error = getError(n, d, outputFaiss, outputKernel1); std::cout << description << "\t" << n << "\t" << d << "\t" - << "store_rnd" - << "\t" << nIterations << "\t" << timeFaiss << "\t" - << timeKernel << "\t" << error << std::endl; + << "store_rnd" << "\t" << nIterations << "\t" << timeFaiss + << "\t" << timeKernel << "\t" << error << std::endl; } ////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -1288,9 +1265,8 @@ static void verifyMinMaxIndexPQDecoder( const double error1 = getError(n, d, outputFaiss, outputKernel1); std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum_rnd" - << "\t" << nIterations << "\t" << timeFaiss << "\t" - << timeKernel1 << "\t" << error1 << std::endl; + << "accum_rnd" << "\t" << nIterations << "\t" << timeFaiss + << "\t" << timeKernel1 << "\t" << error1 << std::endl; // kernels: accum 2 points, shared centroids StopWatch swKernel2; @@ -1322,9 +1298,8 @@ static void verifyMinMaxIndexPQDecoder( const double error2 = getError(n, d, outputFaiss, outputKernel2); std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum2_rnd" - << "\t" << nIterations << "\t" << timeFaiss << "\t" - << timeKernel2 << "\t" << error2 << std::endl; + << "accum2_rnd" << "\t" << nIterations << "\t" << timeFaiss + << "\t" << timeKernel2 << "\t" << error2 << std::endl; // kernels: accum 2 points, unique centroids StopWatch swKernel2u; @@ -1357,9 +1332,8 @@ static void verifyMinMaxIndexPQDecoder( const double error2u = getError(n, d, outputFaiss, outputKernel2u); std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum2u_rnd" - << "\t" << nIterations << "\t" << timeFaiss << "\t" - << timeKernel2u << "\t" << error2u << std::endl; + << "accum2u_rnd" << "\t" << nIterations << "\t" << timeFaiss + << "\t" << timeKernel2u << "\t" << error2u << std::endl; // kernels: accum 3 points, shared centroids StopWatch swKernel3; @@ -1396,9 +1370,8 @@ static void verifyMinMaxIndexPQDecoder( const double error3 = getError(n, d, outputFaiss, outputKernel3); std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum3_rnd" - << "\t" << nIterations << "\t" << timeFaiss << "\t" - << timeKernel3 << "\t" << error3 << std::endl; + << "accum3_rnd" << "\t" << nIterations << "\t" << timeFaiss + << "\t" << timeKernel3 << "\t" << error3 << std::endl; // kernels: accum 3 points, unique centroids StopWatch swKernel3u; @@ -1437,9 +1410,8 @@ static void verifyMinMaxIndexPQDecoder( const double error3u = getError(n, d, outputFaiss, outputKernel3u); std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum3u_rnd" - << "\t" << 
nIterations << "\t" << timeFaiss << "\t" - << timeKernel3u << "\t" << error3u << std::endl; + << "accum3u_rnd" << "\t" << nIterations << "\t" << timeFaiss + << "\t" << timeKernel3u << "\t" << error3u << std::endl; } } @@ -1512,14 +1484,9 @@ int main(int argc, char** argv) { (N_ITERATIONS % 6) == 0, "Number of iterations should be 6*x"); // print the header - std::cout << "Codec\t" - << "n\t" - << "d\t" - << "Experiment\t" - << "Iterations\t" - << "Faiss time\t" - << "SADecodeKernel time\t" - << "Error" << std::endl; + std::cout << "Codec\t" << "n\t" << "d\t" << "Experiment\t" << "Iterations\t" + << "Faiss time\t" << "SADecodeKernel time\t" << "Error" + << std::endl; // The following experiment types are available: // * store_seq - decode a contiguous block of codes into vectors, one by one diff --git a/faiss/IndexBinaryIVF.cpp b/faiss/IndexBinaryIVF.cpp index 686785a987..ab1b9fd89a 100644 --- a/faiss/IndexBinaryIVF.cpp +++ b/faiss/IndexBinaryIVF.cpp @@ -456,7 +456,7 @@ void search_knn_hamming_heap( } } // parallel for - } // parallel + } // parallel indexIVF_stats.nq += n; indexIVF_stats.nlist += nlistv; diff --git a/faiss/gpu/utils/Tensor.cuh b/faiss/gpu/utils/Tensor.cuh index b13d0e1496..0fbb2417b3 100644 --- a/faiss/gpu/utils/Tensor.cuh +++ b/faiss/gpu/utils/Tensor.cuh @@ -232,13 +232,12 @@ class Tensor { } /// Returns a read/write view of a portion of our tensor. - __host__ __device__ inline detail::SubTensor - operator[](IndexT); + __host__ __device__ inline detail:: + SubTensor operator[](IndexT); /// Returns a read/write view of a portion of our tensor (const). __host__ __device__ inline const detail:: - SubTensor - operator[](IndexT) const; + SubTensor operator[](IndexT) const; /// Returns the size of a given dimension, `[0, Dim - 1]`. No bounds /// checking. diff --git a/faiss/impl/HNSW.cpp b/faiss/impl/HNSW.cpp index b1324e1211..d8c8225968 100644 --- a/faiss/impl/HNSW.cpp +++ b/faiss/impl/HNSW.cpp @@ -110,8 +110,8 @@ void HNSW::print_neighbor_stats(int level) const { level, nb_neighbors(level)); size_t tot_neigh = 0, tot_common = 0, tot_reciprocal = 0, n_node = 0; -#pragma omp parallel for reduction(+: tot_neigh) reduction(+: tot_common) \ - reduction(+: tot_reciprocal) reduction(+: n_node) +#pragma omp parallel for reduction(+ : tot_neigh) reduction(+ : tot_common) \ + reduction(+ : tot_reciprocal) reduction(+ : n_node) for (int i = 0; i < levels.size(); i++) { if (levels[i] > level) { n_node++; diff --git a/faiss/invlists/OnDiskInvertedLists.cpp b/faiss/invlists/OnDiskInvertedLists.cpp index dc17fe67f6..8565572a9b 100644 --- a/faiss/invlists/OnDiskInvertedLists.cpp +++ b/faiss/invlists/OnDiskInvertedLists.cpp @@ -394,8 +394,8 @@ const idx_t* OnDiskInvertedLists::get_ids(size_t list_no) const { return nullptr; } - return ( - const idx_t*)(ptr + lists[list_no].offset + code_size * lists[list_no].capacity); + return (const idx_t*)(ptr + lists[list_no].offset + + code_size * lists[list_no].capacity); } void OnDiskInvertedLists::update_entries( From 0169f29f375db27e86151320b60932a6b46716d5 Mon Sep 17 00:00:00 2001 From: iotamudelta Date: Wed, 17 Apr 2024 03:43:57 -0700 Subject: [PATCH 038/116] Update required cmake version to 3.24. 
(#3305) Summary: The CMakeLists.txt in faiss/gpu uses the $=3.23.1 + - cmake >=3.24.0 - make # [not win] - mkl-devel =2023 # [x86_64] - cuda-toolkit {{ cudatoolkit }} @@ -85,7 +85,7 @@ outputs: - {{ compiler('cxx') }} - sysroot_linux-64 =2.17 # [linux64] - swig - - cmake >=3.23.1 + - cmake >=3.24.0 - make # [not win] host: - python {{ python }} diff --git a/conda/faiss-gpu/meta.yaml b/conda/faiss-gpu/meta.yaml index b0df707181..7ac24e785d 100644 --- a/conda/faiss-gpu/meta.yaml +++ b/conda/faiss-gpu/meta.yaml @@ -48,7 +48,7 @@ outputs: - {{ compiler('cxx') }} - sysroot_linux-64 # [linux64] - llvm-openmp # [osx] - - cmake >=3.23.1 + - cmake >=3.24.0 - make # [not win] - mkl-devel =2023 # [x86_64] - cuda-toolkit {{ cudatoolkit }} @@ -81,7 +81,7 @@ outputs: - {{ compiler('cxx') }} - sysroot_linux-64 =2.17 # [linux64] - swig - - cmake >=3.23.1 + - cmake >=3.24.0 - make # [not win] host: - python {{ python }} diff --git a/conda/faiss/meta.yaml b/conda/faiss/meta.yaml index c4d66ca0d3..79e7be953e 100644 --- a/conda/faiss/meta.yaml +++ b/conda/faiss/meta.yaml @@ -39,7 +39,7 @@ outputs: - {{ compiler('cxx') }} - sysroot_linux-64 # [linux64] - llvm-openmp # [osx] - - cmake >=3.23.1 + - cmake >=3.24.0 - make # [not win] - mkl-devel =2023 # [x86_64] host: @@ -69,7 +69,7 @@ outputs: - {{ compiler('cxx') }} - sysroot_linux-64 =2.17 # [linux64] - swig - - cmake >=3.23.1 + - cmake >=3.24.0 - make # [not win] host: - python {{ python }} From 3677ab502135c44da76b0e6b0f3755b0f407b337 Mon Sep 17 00:00:00 2001 From: Junjie Qi Date: Fri, 19 Apr 2024 13:55:15 -0700 Subject: [PATCH 039/116] Switch clang-format-11 to clang-format-18 (#3372) Summary: In this commit https://github.com/facebookresearch/faiss/commit/ab2b7f50936b3bafd4e076f798413da2399482bd, they changed format based on clang-format-18. However, we still use clang-format-11 in our circle ci job which caused the failure. In this PR, we are going to switch to clang-format-18 Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3372 Reviewed By: kuarora Differential Revision: D56280363 Pulled By: junjieqi fbshipit-source-id: f832ab2112f762e6000b55a155e3e43fe99071d7 --- .circleci/config.yml | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 549e4a2793..7e8bd8170a 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -38,12 +38,17 @@ jobs: - run: name: Install clang-format command: | - apt-get update - apt-get install -y git-core clang-format-11 + apt-get update -y + apt-get install -y wget + apt install -y lsb-release wget software-properties-common gnupg + wget https://apt.llvm.org/llvm.sh + chmod u+x llvm.sh + ./llvm.sh 18 + apt-get install -y git-core clang-format-18 - run: name: Verify clang-format command: | - git ls-files | grep -E '\.(cpp|h|cu|cuh)$' | xargs clang-format-11 -i + git ls-files | grep -E '\.(cpp|h|cu|cuh)$' | xargs clang-format-18 -i if git diff --quiet; then echo "Formatting OK!" else From 5893ab77daee3c84ecc74a2c84c18d7cd486fcea Mon Sep 17 00:00:00 2001 From: Junjie Qi Date: Mon, 22 Apr 2024 09:30:05 -0700 Subject: [PATCH 040/116] remove unused code (#3371) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3371 This will never happen because N is fixed at compile time and the buffer is large enough. It is misleading to add error handling code for a case that will never happen. 
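(As a rough check of that claim, judging from the surrounding simdlib code: the helper formats at most 32 lane values into a local `char res[1000]` buffer using formats such as `"%02x,"` or `"%3d,"`, a handful of characters per element, so the output stays on the order of one to two hundred bytes, far below the buffer size.)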
Reviewed By: kuarora Differential Revision: D56274458 fbshipit-source-id: ca706f1223dbc97e69d5ac9750b277afa4df80a7 --- faiss/utils/simdlib_neon.h | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/faiss/utils/simdlib_neon.h b/faiss/utils/simdlib_neon.h index 439a5210bc..1bdf0ed01e 100644 --- a/faiss/utils/simdlib_neon.h +++ b/faiss/utils/simdlib_neon.h @@ -170,14 +170,10 @@ static inline std::string elements_to_string(const char* fmt, const S& simd) { for (size_t i = 0; i < N; ++i) { int bytesWritten = snprintf(ptr, sizeof(res) - (ptr - res), fmt, bytes[i]); - if (bytesWritten >= 0) { - ptr += bytesWritten; - } else { - break; - } + ptr += bytesWritten; } - // strip last , - + // The format usually contains a ',' separator so this is to remove the last + // separator. ptr[-1] = 0; return std::string(res); } From b2e91f685e10dc05269b2e436deaf75e242d7394 Mon Sep 17 00:00:00 2001 From: Carl Love Date: Wed, 24 Apr 2024 02:05:48 -0700 Subject: [PATCH 041/116] Unroll loop in lookup_2_lanes (#3364) Summary: The current loop goes from 0 to 31. It has an if statement to do an assignment for j < 16 and a different assignment for j >= 16. By unrolling the loop to do the j < 16 and the j >= 16 iterations in parallel, the if j < 16 is eliminated and the number of loop iterations is reduced by half. The loop for the j < 16 and the j >= 16 halves is then unrolled to a depth of 2. This change results in approximately a 55% reduction in the execution time for the bench_ivf_fastscan.py workload on Power 10 when compiled with CMAKE_INSTALL_CONFIG_NAME=Release. The removal of the if (j < 16) statement and the unrolling of the loop removes branch cycle stalls and register dependencies on instruction issue. The result is that the unrolled code is able to issue instructions earlier, thus reducing the total number of cycles required to execute the function. Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3364 Reviewed By: kuarora Differential Revision: D56455690 Pulled By: mdouze fbshipit-source-id: 490a17a40d9d4439b1a8ea22e991e706d68fb2fa --- faiss/utils/simdlib.h | 4 + faiss/utils/simdlib_ppc64.h | 1084 +++++++++++++++++++++++++++++++++++ 2 files changed, 1088 insertions(+) create mode 100644 faiss/utils/simdlib_ppc64.h diff --git a/faiss/utils/simdlib.h b/faiss/utils/simdlib.h index beeec2374e..ea5020d719 100644 --- a/faiss/utils/simdlib.h +++ b/faiss/utils/simdlib.h @@ -27,6 +27,10 @@ #include +#elif defined(__PPC64__) + +#include + #else // emulated = all operations are implemented as scalars diff --git a/faiss/utils/simdlib_ppc64.h b/faiss/utils/simdlib_ppc64.h new file mode 100644 index 0000000000..94b3e42dc7 --- /dev/null +++ b/faiss/utils/simdlib_ppc64.h @@ -0,0 +1,1084 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree.
+ */ + +#pragma once + +#include +#include +#include +#include + +namespace faiss { + +struct simd256bit { + union { + uint8_t u8[32]; + uint16_t u16[16]; + uint32_t u32[8]; + float f32[8]; + }; + + simd256bit() {} + + explicit simd256bit(const void* x) { + memcpy(u8, x, 32); + } + + void clear() { + memset(u8, 0, 32); + } + + void storeu(void* ptr) const { + memcpy(ptr, u8, 32); + } + + void loadu(const void* ptr) { + memcpy(u8, ptr, 32); + } + + void store(void* ptr) const { + storeu(ptr); + } + + void bin(char bits[257]) const { + const char* bytes = (char*)this->u8; + for (int i = 0; i < 256; i++) { + bits[i] = '0' + ((bytes[i / 8] >> (i % 8)) & 1); + } + bits[256] = 0; + } + + std::string bin() const { + char bits[257]; + bin(bits); + return std::string(bits); + } + + // Checks whether the other holds exactly the same bytes. + bool is_same_as(simd256bit other) const { + for (size_t i = 0; i < 8; i++) { + if (u32[i] != other.u32[i]) { + return false; + } + } + + return true; + } +}; + +/// vector of 16 elements in uint16 +struct simd16uint16 : simd256bit { + simd16uint16() {} + + explicit simd16uint16(int x) { + set1(x); + } + + explicit simd16uint16(uint16_t x) { + set1(x); + } + + explicit simd16uint16(const simd256bit& x) : simd256bit(x) {} + + explicit simd16uint16(const uint16_t* x) : simd256bit((const void*)x) {} + + explicit simd16uint16( + uint16_t u0, + uint16_t u1, + uint16_t u2, + uint16_t u3, + uint16_t u4, + uint16_t u5, + uint16_t u6, + uint16_t u7, + uint16_t u8, + uint16_t u9, + uint16_t u10, + uint16_t u11, + uint16_t u12, + uint16_t u13, + uint16_t u14, + uint16_t u15) { + this->u16[0] = u0; + this->u16[1] = u1; + this->u16[2] = u2; + this->u16[3] = u3; + this->u16[4] = u4; + this->u16[5] = u5; + this->u16[6] = u6; + this->u16[7] = u7; + this->u16[8] = u8; + this->u16[9] = u9; + this->u16[10] = u10; + this->u16[11] = u11; + this->u16[12] = u12; + this->u16[13] = u13; + this->u16[14] = u14; + this->u16[15] = u15; + } + + std::string elements_to_string(const char* fmt) const { + char res[1000], *ptr = res; + for (int i = 0; i < 16; i++) { + ptr += sprintf(ptr, fmt, u16[i]); + } + // strip last , + ptr[-1] = 0; + return std::string(res); + } + + std::string hex() const { + return elements_to_string("%02x,"); + } + + std::string dec() const { + return elements_to_string("%3d,"); + } + + template + static simd16uint16 unary_func(const simd16uint16& a, F&& f) { + simd16uint16 c; + for (int j = 0; j < 16; j++) { + c.u16[j] = f(a.u16[j]); + } + return c; + } + + template + static simd16uint16 binary_func( + const simd16uint16& a, + const simd16uint16& b, + F&& f) { + simd16uint16 c; + for (int j = 0; j < 16; j++) { + c.u16[j] = f(a.u16[j], b.u16[j]); + } + return c; + } + + void set1(uint16_t x) { + for (int i = 0; i < 16; i++) { + u16[i] = x; + } + } + + simd16uint16 operator*(const simd16uint16& other) const { + return binary_func( + *this, other, [](uint16_t a, uint16_t b) { return a * b; }); + } + + // shift must be known at compile time + simd16uint16 operator>>(const int shift) const { + return unary_func(*this, [shift](uint16_t a) { return a >> shift; }); + } + + // shift must be known at compile time + simd16uint16 operator<<(const int shift) const { + return unary_func(*this, [shift](uint16_t a) { return a << shift; }); + } + + simd16uint16 operator+=(const simd16uint16& other) { + *this = *this + other; + return *this; + } + + simd16uint16 operator-=(const simd16uint16& other) { + *this = *this - other; + return *this; + } + + simd16uint16 operator+(const 
simd16uint16& other) const { + return binary_func( + *this, other, [](uint16_t a, uint16_t b) { return a + b; }); + } + + simd16uint16 operator-(const simd16uint16& other) const { + return binary_func( + *this, other, [](uint16_t a, uint16_t b) { return a - b; }); + } + + simd16uint16 operator&(const simd256bit& other) const { + return binary_func( + *this, simd16uint16(other), [](uint16_t a, uint16_t b) { + return a & b; + }); + } + + simd16uint16 operator|(const simd256bit& other) const { + return binary_func( + *this, simd16uint16(other), [](uint16_t a, uint16_t b) { + return a | b; + }); + } + + simd16uint16 operator^(const simd256bit& other) const { + return binary_func( + *this, simd16uint16(other), [](uint16_t a, uint16_t b) { + return a ^ b; + }); + } + + // returns binary masks + simd16uint16 operator==(const simd16uint16& other) const { + return binary_func(*this, other, [](uint16_t a, uint16_t b) { + return a == b ? 0xffff : 0; + }); + } + + simd16uint16 operator~() const { + return unary_func(*this, [](uint16_t a) { return ~a; }); + } + + // get scalar at index 0 + uint16_t get_scalar_0() const { + return u16[0]; + } + + // mask of elements where this >= thresh + // 2 bit per component: 16 * 2 = 32 bit + uint32_t ge_mask(const simd16uint16& thresh) const { + uint32_t gem = 0; + for (int j = 0; j < 16; j++) { + if (u16[j] >= thresh.u16[j]) { + gem |= 3 << (j * 2); + } + } + return gem; + } + + uint32_t le_mask(const simd16uint16& thresh) const { + return thresh.ge_mask(*this); + } + + uint32_t gt_mask(const simd16uint16& thresh) const { + return ~le_mask(thresh); + } + + bool all_gt(const simd16uint16& thresh) const { + return le_mask(thresh) == 0; + } + + // for debugging only + uint16_t operator[](int i) const { + return u16[i]; + } + + void accu_min(const simd16uint16& incoming) { + for (int j = 0; j < 16; j++) { + if (incoming.u16[j] < u16[j]) { + u16[j] = incoming.u16[j]; + } + } + } + + void accu_max(const simd16uint16& incoming) { + for (int j = 0; j < 16; j++) { + if (incoming.u16[j] > u16[j]) { + u16[j] = incoming.u16[j]; + } + } + } +}; + +// not really a std::min because it returns an elementwise min +inline simd16uint16 min(const simd16uint16& av, const simd16uint16& bv) { + return simd16uint16::binary_func( + av, bv, [](uint16_t a, uint16_t b) { return std::min(a, b); }); +} + +inline simd16uint16 max(const simd16uint16& av, const simd16uint16& bv) { + return simd16uint16::binary_func( + av, bv, [](uint16_t a, uint16_t b) { return std::max(a, b); }); +} + +// decompose in 128-lanes: a = (a0, a1), b = (b0, b1) +// return (a0 + a1, b0 + b1) +// TODO find a better name +inline simd16uint16 combine2x2(const simd16uint16& a, const simd16uint16& b) { + simd16uint16 c; + for (int j = 0; j < 8; j++) { + c.u16[j] = a.u16[j] + a.u16[j + 8]; + c.u16[j + 8] = b.u16[j] + b.u16[j + 8]; + } + return c; +} + +// compare d0 and d1 to thr, return 32 bits corresponding to the concatenation +// of d0 and d1 with thr +inline uint32_t cmp_ge32( + const simd16uint16& d0, + const simd16uint16& d1, + const simd16uint16& thr) { + uint32_t gem = 0; + for (int j = 0; j < 16; j++) { + if (d0.u16[j] >= thr.u16[j]) { + gem |= 1 << j; + } + if (d1.u16[j] >= thr.u16[j]) { + gem |= 1 << (j + 16); + } + } + return gem; +} + +inline uint32_t cmp_le32( + const simd16uint16& d0, + const simd16uint16& d1, + const simd16uint16& thr) { + uint32_t gem = 0; + for (int j = 0; j < 16; j++) { + if (d0.u16[j] <= thr.u16[j]) { + gem |= 1 << j; + } + if (d1.u16[j] <= thr.u16[j]) { + gem |= 1 << (j + 16); + } + } + 
return gem; +} + +// hadd does not cross lanes +inline simd16uint16 hadd(const simd16uint16& a, const simd16uint16& b) { + simd16uint16 c; + c.u16[0] = a.u16[0] + a.u16[1]; + c.u16[1] = a.u16[2] + a.u16[3]; + c.u16[2] = a.u16[4] + a.u16[5]; + c.u16[3] = a.u16[6] + a.u16[7]; + c.u16[4] = b.u16[0] + b.u16[1]; + c.u16[5] = b.u16[2] + b.u16[3]; + c.u16[6] = b.u16[4] + b.u16[5]; + c.u16[7] = b.u16[6] + b.u16[7]; + + c.u16[8] = a.u16[8] + a.u16[9]; + c.u16[9] = a.u16[10] + a.u16[11]; + c.u16[10] = a.u16[12] + a.u16[13]; + c.u16[11] = a.u16[14] + a.u16[15]; + c.u16[12] = b.u16[8] + b.u16[9]; + c.u16[13] = b.u16[10] + b.u16[11]; + c.u16[14] = b.u16[12] + b.u16[13]; + c.u16[15] = b.u16[14] + b.u16[15]; + + return c; +} + +// Vectorized version of the following code: +// for (size_t i = 0; i < n; i++) { +// bool flag = (candidateValues[i] < currentValues[i]); +// minValues[i] = flag ? candidateValues[i] : currentValues[i]; +// minIndices[i] = flag ? candidateIndices[i] : currentIndices[i]; +// maxValues[i] = !flag ? candidateValues[i] : currentValues[i]; +// maxIndices[i] = !flag ? candidateIndices[i] : currentIndices[i]; +// } +// Max indices evaluation is inaccurate in case of equal values (the index of +// the last equal value is saved instead of the first one), but this behavior +// saves instructions. +inline void cmplt_min_max_fast( + const simd16uint16 candidateValues, + const simd16uint16 candidateIndices, + const simd16uint16 currentValues, + const simd16uint16 currentIndices, + simd16uint16& minValues, + simd16uint16& minIndices, + simd16uint16& maxValues, + simd16uint16& maxIndices) { + for (size_t i = 0; i < 16; i++) { + bool flag = (candidateValues.u16[i] < currentValues.u16[i]); + minValues.u16[i] = flag ? candidateValues.u16[i] : currentValues.u16[i]; + minIndices.u16[i] = + flag ? candidateIndices.u16[i] : currentIndices.u16[i]; + maxValues.u16[i] = + !flag ? candidateValues.u16[i] : currentValues.u16[i]; + maxIndices.u16[i] = + !flag ? 
candidateIndices.u16[i] : currentIndices.u16[i]; + } +} + +// vector of 32 unsigned 8-bit integers +struct simd32uint8 : simd256bit { + simd32uint8() {} + + explicit simd32uint8(int x) { + set1(x); + } + + explicit simd32uint8(uint8_t x) { + set1(x); + } + template < + uint8_t _0, + uint8_t _1, + uint8_t _2, + uint8_t _3, + uint8_t _4, + uint8_t _5, + uint8_t _6, + uint8_t _7, + uint8_t _8, + uint8_t _9, + uint8_t _10, + uint8_t _11, + uint8_t _12, + uint8_t _13, + uint8_t _14, + uint8_t _15, + uint8_t _16, + uint8_t _17, + uint8_t _18, + uint8_t _19, + uint8_t _20, + uint8_t _21, + uint8_t _22, + uint8_t _23, + uint8_t _24, + uint8_t _25, + uint8_t _26, + uint8_t _27, + uint8_t _28, + uint8_t _29, + uint8_t _30, + uint8_t _31> + static simd32uint8 create() { + simd32uint8 ret; + ret.u8[0] = _0; + ret.u8[1] = _1; + ret.u8[2] = _2; + ret.u8[3] = _3; + ret.u8[4] = _4; + ret.u8[5] = _5; + ret.u8[6] = _6; + ret.u8[7] = _7; + ret.u8[8] = _8; + ret.u8[9] = _9; + ret.u8[10] = _10; + ret.u8[11] = _11; + ret.u8[12] = _12; + ret.u8[13] = _13; + ret.u8[14] = _14; + ret.u8[15] = _15; + ret.u8[16] = _16; + ret.u8[17] = _17; + ret.u8[18] = _18; + ret.u8[19] = _19; + ret.u8[20] = _20; + ret.u8[21] = _21; + ret.u8[22] = _22; + ret.u8[23] = _23; + ret.u8[24] = _24; + ret.u8[25] = _25; + ret.u8[26] = _26; + ret.u8[27] = _27; + ret.u8[28] = _28; + ret.u8[29] = _29; + ret.u8[30] = _30; + ret.u8[31] = _31; + return ret; + } + + explicit simd32uint8(const simd256bit& x) : simd256bit(x) {} + + explicit simd32uint8(const uint8_t* x) : simd256bit((const void*)x) {} + + std::string elements_to_string(const char* fmt) const { + char res[1000], *ptr = res; + for (int i = 0; i < 32; i++) { + ptr += sprintf(ptr, fmt, u8[i]); + } + // strip last , + ptr[-1] = 0; + return std::string(res); + } + + std::string hex() const { + return elements_to_string("%02x,"); + } + + std::string dec() const { + return elements_to_string("%3d,"); + } + + void set1(uint8_t x) { + for (int j = 0; j < 32; j++) { + u8[j] = x; + } + } + + template + static simd32uint8 binary_func( + const simd32uint8& a, + const simd32uint8& b, + F&& f) { + simd32uint8 c; + for (int j = 0; j < 32; j++) { + c.u8[j] = f(a.u8[j], b.u8[j]); + } + return c; + } + + simd32uint8 operator&(const simd256bit& other) const { + return binary_func(*this, simd32uint8(other), [](uint8_t a, uint8_t b) { + return a & b; + }); + } + + simd32uint8 operator+(const simd32uint8& other) const { + return binary_func( + *this, other, [](uint8_t a, uint8_t b) { return a + b; }); + } + + // The very important operation that everything relies on + simd32uint8 lookup_2_lanes(const simd32uint8& idx) const { + simd32uint8 c; + // The original for loop: + // for (int j = 0; j < 32; j++) { + // if (idx.u8[j] & 0x80) { + // c.u8[j] = 0; + // } else { + // uint8_t i = idx.u8[j] & 15; + // if (j < 16) { + // c.u8[j] = u8[i]; + // } else { + // c.u8[j] = u8[16 + i]; + // } + // } + + // The following function was re-written for Power 10 + // The loop was unrolled to remove the if (j < 16) statement by doing + // the j and j + 16 iterations in parallel. The additional unrolling + // for j + 1 and j + 17, reduces the execution time on Power 10 by + // about 50% as the instruction scheduling allows on average 2X more + // instructions to be issued per cycle. 
+ + for (int j = 0; j < 16; j = j + 2) { + // j < 16, unrolled to depth of 2 + if (idx.u8[j] & 0x80) { + c.u8[j] = 0; + } else { + uint8_t i = idx.u8[j] & 15; + c.u8[j] = u8[i]; + } + + if (idx.u8[j + 1] & 0x80) { + c.u8[j + 1] = 0; + } else { + uint8_t i = idx.u8[j + 1] & 15; + c.u8[j + 1] = u8[i]; + } + + // j >= 16, unrolled to depth of 2 + if (idx.u8[j + 16] & 0x80) { + c.u8[j + 16] = 0; + } else { + uint8_t i = idx.u8[j + 16] & 15; + c.u8[j + 16] = u8[i + 16]; + } + + if (idx.u8[j + 17] & 0x80) { + c.u8[j + 17] = 0; + } else { + uint8_t i = idx.u8[j + 17] & 15; + c.u8[j + 17] = u8[i + 16]; + } + } + return c; + } + + // extract + 0-extend lane + // this operation is slow (3 cycles) + + simd32uint8 operator+=(const simd32uint8& other) { + *this = *this + other; + return *this; + } + + // for debugging only + uint8_t operator[](int i) const { + return u8[i]; + } +}; + +// convert with saturation +// careful: this does not cross lanes, so the order is weird +inline simd32uint8 uint16_to_uint8_saturate( + const simd16uint16& a, + const simd16uint16& b) { + simd32uint8 c; + + auto saturate_16_to_8 = [](uint16_t x) { return x >= 256 ? 0xff : x; }; + + for (int i = 0; i < 8; i++) { + c.u8[i] = saturate_16_to_8(a.u16[i]); + c.u8[8 + i] = saturate_16_to_8(b.u16[i]); + c.u8[16 + i] = saturate_16_to_8(a.u16[8 + i]); + c.u8[24 + i] = saturate_16_to_8(b.u16[8 + i]); + } + return c; +} + +/// get most significant bit of each byte +inline uint32_t get_MSBs(const simd32uint8& a) { + uint32_t res = 0; + for (int i = 0; i < 32; i++) { + if (a.u8[i] & 0x80) { + res |= 1 << i; + } + } + return res; +} + +/// use MSB of each byte of mask to select a byte between a and b +inline simd32uint8 blendv( + const simd32uint8& a, + const simd32uint8& b, + const simd32uint8& mask) { + simd32uint8 c; + for (int i = 0; i < 32; i++) { + if (mask.u8[i] & 0x80) { + c.u8[i] = b.u8[i]; + } else { + c.u8[i] = a.u8[i]; + } + } + return c; +} + +/// vector of 8 unsigned 32-bit integers +struct simd8uint32 : simd256bit { + simd8uint32() {} + + explicit simd8uint32(uint32_t x) { + set1(x); + } + + explicit simd8uint32(const simd256bit& x) : simd256bit(x) {} + + explicit simd8uint32(const uint32_t* x) : simd256bit((const void*)x) {} + + explicit simd8uint32( + uint32_t u0, + uint32_t u1, + uint32_t u2, + uint32_t u3, + uint32_t u4, + uint32_t u5, + uint32_t u6, + uint32_t u7) { + u32[0] = u0; + u32[1] = u1; + u32[2] = u2; + u32[3] = u3; + u32[4] = u4; + u32[5] = u5; + u32[6] = u6; + u32[7] = u7; + } + + simd8uint32 operator+(simd8uint32 other) const { + simd8uint32 result; + for (int i = 0; i < 8; i++) { + result.u32[i] = u32[i] + other.u32[i]; + } + return result; + } + + simd8uint32 operator-(simd8uint32 other) const { + simd8uint32 result; + for (int i = 0; i < 8; i++) { + result.u32[i] = u32[i] - other.u32[i]; + } + return result; + } + + simd8uint32& operator+=(const simd8uint32& other) { + for (int i = 0; i < 8; i++) { + u32[i] += other.u32[i]; + } + return *this; + } + + bool operator==(simd8uint32 other) const { + for (size_t i = 0; i < 8; i++) { + if (u32[i] != other.u32[i]) { + return false; + } + } + + return true; + } + + bool operator!=(simd8uint32 other) const { + return !(*this == other); + } + + std::string elements_to_string(const char* fmt) const { + char res[1000], *ptr = res; + for (int i = 0; i < 8; i++) { + ptr += sprintf(ptr, fmt, u32[i]); + } + // strip last , + ptr[-1] = 0; + return std::string(res); + } + + std::string hex() const { + return elements_to_string("%08x,"); + } + + std::string dec() const 
{ + return elements_to_string("%10d,"); + } + + void set1(uint32_t x) { + for (int i = 0; i < 8; i++) { + u32[i] = x; + } + } + + simd8uint32 unzip() const { + const uint32_t ret[] = { + u32[0], u32[2], u32[4], u32[6], u32[1], u32[3], u32[5], u32[7]}; + return simd8uint32{ret}; + } +}; + +// Vectorized version of the following code: +// for (size_t i = 0; i < n; i++) { +// bool flag = (candidateValues[i] < currentValues[i]); +// minValues[i] = flag ? candidateValues[i] : currentValues[i]; +// minIndices[i] = flag ? candidateIndices[i] : currentIndices[i]; +// maxValues[i] = !flag ? candidateValues[i] : currentValues[i]; +// maxIndices[i] = !flag ? candidateIndices[i] : currentIndices[i]; +// } +// Max indices evaluation is inaccurate in case of equal values (the index of +// the last equal value is saved instead of the first one), but this behavior +// saves instructions. +inline void cmplt_min_max_fast( + const simd8uint32 candidateValues, + const simd8uint32 candidateIndices, + const simd8uint32 currentValues, + const simd8uint32 currentIndices, + simd8uint32& minValues, + simd8uint32& minIndices, + simd8uint32& maxValues, + simd8uint32& maxIndices) { + for (size_t i = 0; i < 8; i++) { + bool flag = (candidateValues.u32[i] < currentValues.u32[i]); + minValues.u32[i] = flag ? candidateValues.u32[i] : currentValues.u32[i]; + minIndices.u32[i] = + flag ? candidateIndices.u32[i] : currentIndices.u32[i]; + maxValues.u32[i] = + !flag ? candidateValues.u32[i] : currentValues.u32[i]; + maxIndices.u32[i] = + !flag ? candidateIndices.u32[i] : currentIndices.u32[i]; + } +} + +struct simd8float32 : simd256bit { + simd8float32() {} + + explicit simd8float32(const simd256bit& x) : simd256bit(x) {} + + explicit simd8float32(float x) { + set1(x); + } + + explicit simd8float32(const float* x) { + loadu((void*)x); + } + + void set1(float x) { + for (int i = 0; i < 8; i++) { + f32[i] = x; + } + } + + explicit simd8float32( + float f0, + float f1, + float f2, + float f3, + float f4, + float f5, + float f6, + float f7) { + f32[0] = f0; + f32[1] = f1; + f32[2] = f2; + f32[3] = f3; + f32[4] = f4; + f32[5] = f5; + f32[6] = f6; + f32[7] = f7; + } + + template + static simd8float32 binary_func( + const simd8float32& a, + const simd8float32& b, + F&& f) { + simd8float32 c; + for (int j = 0; j < 8; j++) { + c.f32[j] = f(a.f32[j], b.f32[j]); + } + return c; + } + + simd8float32 operator*(const simd8float32& other) const { + return binary_func( + *this, other, [](float a, float b) { return a * b; }); + } + + simd8float32 operator+(const simd8float32& other) const { + return binary_func( + *this, other, [](float a, float b) { return a + b; }); + } + + simd8float32 operator-(const simd8float32& other) const { + return binary_func( + *this, other, [](float a, float b) { return a - b; }); + } + + simd8float32& operator+=(const simd8float32& other) { + for (size_t i = 0; i < 8; i++) { + f32[i] += other.f32[i]; + } + + return *this; + } + + bool operator==(simd8float32 other) const { + for (size_t i = 0; i < 8; i++) { + if (f32[i] != other.f32[i]) { + return false; + } + } + + return true; + } + + bool operator!=(simd8float32 other) const { + return !(*this == other); + } + + std::string tostring() const { + char res[1000], *ptr = res; + for (int i = 0; i < 8; i++) { + ptr += sprintf(ptr, "%g,", f32[i]); + } + // strip last , + ptr[-1] = 0; + return std::string(res); + } +}; + +// hadd does not cross lanes +inline simd8float32 hadd(const simd8float32& a, const simd8float32& b) { + simd8float32 c; + c.f32[0] = a.f32[0] + 
a.f32[1]; + c.f32[1] = a.f32[2] + a.f32[3]; + c.f32[2] = b.f32[0] + b.f32[1]; + c.f32[3] = b.f32[2] + b.f32[3]; + + c.f32[4] = a.f32[4] + a.f32[5]; + c.f32[5] = a.f32[6] + a.f32[7]; + c.f32[6] = b.f32[4] + b.f32[5]; + c.f32[7] = b.f32[6] + b.f32[7]; + + return c; +} + +inline simd8float32 unpacklo(const simd8float32& a, const simd8float32& b) { + simd8float32 c; + c.f32[0] = a.f32[0]; + c.f32[1] = b.f32[0]; + c.f32[2] = a.f32[1]; + c.f32[3] = b.f32[1]; + + c.f32[4] = a.f32[4]; + c.f32[5] = b.f32[4]; + c.f32[6] = a.f32[5]; + c.f32[7] = b.f32[5]; + + return c; +} + +inline simd8float32 unpackhi(const simd8float32& a, const simd8float32& b) { + simd8float32 c; + c.f32[0] = a.f32[2]; + c.f32[1] = b.f32[2]; + c.f32[2] = a.f32[3]; + c.f32[3] = b.f32[3]; + + c.f32[4] = a.f32[6]; + c.f32[5] = b.f32[6]; + c.f32[6] = a.f32[7]; + c.f32[7] = b.f32[7]; + + return c; +} + +// compute a * b + c +inline simd8float32 fmadd( + const simd8float32& a, + const simd8float32& b, + const simd8float32& c) { + simd8float32 res; + for (int i = 0; i < 8; i++) { + res.f32[i] = a.f32[i] * b.f32[i] + c.f32[i]; + } + return res; +} + +namespace { + +// get even float32's of a and b, interleaved +simd8float32 geteven(const simd8float32& a, const simd8float32& b) { + simd8float32 c; + + c.f32[0] = a.f32[0]; + c.f32[1] = a.f32[2]; + c.f32[2] = b.f32[0]; + c.f32[3] = b.f32[2]; + + c.f32[4] = a.f32[4]; + c.f32[5] = a.f32[6]; + c.f32[6] = b.f32[4]; + c.f32[7] = b.f32[6]; + + return c; +} + +// get odd float32's of a and b, interleaved +simd8float32 getodd(const simd8float32& a, const simd8float32& b) { + simd8float32 c; + + c.f32[0] = a.f32[1]; + c.f32[1] = a.f32[3]; + c.f32[2] = b.f32[1]; + c.f32[3] = b.f32[3]; + + c.f32[4] = a.f32[5]; + c.f32[5] = a.f32[7]; + c.f32[6] = b.f32[5]; + c.f32[7] = b.f32[7]; + + return c; +} + +// 3 cycles +// if the lanes are a = [a0 a1] and b = [b0 b1], return [a0 b0] +simd8float32 getlow128(const simd8float32& a, const simd8float32& b) { + simd8float32 c; + + c.f32[0] = a.f32[0]; + c.f32[1] = a.f32[1]; + c.f32[2] = a.f32[2]; + c.f32[3] = a.f32[3]; + + c.f32[4] = b.f32[0]; + c.f32[5] = b.f32[1]; + c.f32[6] = b.f32[2]; + c.f32[7] = b.f32[3]; + + return c; +} + +simd8float32 gethigh128(const simd8float32& a, const simd8float32& b) { + simd8float32 c; + + c.f32[0] = a.f32[4]; + c.f32[1] = a.f32[5]; + c.f32[2] = a.f32[6]; + c.f32[3] = a.f32[7]; + + c.f32[4] = b.f32[4]; + c.f32[5] = b.f32[5]; + c.f32[6] = b.f32[6]; + c.f32[7] = b.f32[7]; + + return c; +} + +// The following primitive is a vectorized version of the following code +// snippet: +// float lowestValue = HUGE_VAL; +// uint lowestIndex = 0; +// for (size_t i = 0; i < n; i++) { +// if (values[i] < lowestValue) { +// lowestValue = values[i]; +// lowestIndex = i; +// } +// } +// Vectorized version can be implemented via two operations: cmp and blend +// with something like this: +// lowestValues = [HUGE_VAL; 8]; +// lowestIndices = {0, 1, 2, 3, 4, 5, 6, 7}; +// for (size_t i = 0; i < n; i += 8) { +// auto comparison = cmp(values + i, lowestValues); +// lowestValues = blend( +// comparison, +// values + i, +// lowestValues); +// lowestIndices = blend( +// comparison, +// i + {0, 1, 2, 3, 4, 5, 6, 7}, +// lowestIndices); +// lowestIndices += {8, 8, 8, 8, 8, 8, 8, 8}; +// } +// The problem is that blend primitive needs very different instruction +// order for AVX and ARM. +// So, let's introduce a combination of these two in order to avoid +// confusion for ppl who write in low-level SIMD instructions. 
Additionally, +// these two ops (cmp and blend) are very often used together. +inline void cmplt_and_blend_inplace( + const simd8float32 candidateValues, + const simd8uint32 candidateIndices, + simd8float32& lowestValues, + simd8uint32& lowestIndices) { + for (size_t j = 0; j < 8; j++) { + bool comparison = (candidateValues.f32[j] < lowestValues.f32[j]); + if (comparison) { + lowestValues.f32[j] = candidateValues.f32[j]; + lowestIndices.u32[j] = candidateIndices.u32[j]; + } + } +} + +// Vectorized version of the following code: +// for (size_t i = 0; i < n; i++) { +// bool flag = (candidateValues[i] < currentValues[i]); +// minValues[i] = flag ? candidateValues[i] : currentValues[i]; +// minIndices[i] = flag ? candidateIndices[i] : currentIndices[i]; +// maxValues[i] = !flag ? candidateValues[i] : currentValues[i]; +// maxIndices[i] = !flag ? candidateIndices[i] : currentIndices[i]; +// } +// Max indices evaluation is inaccurate in case of equal values (the index of +// the last equal value is saved instead of the first one), but this behavior +// saves instructions. +inline void cmplt_min_max_fast( + const simd8float32 candidateValues, + const simd8uint32 candidateIndices, + const simd8float32 currentValues, + const simd8uint32 currentIndices, + simd8float32& minValues, + simd8uint32& minIndices, + simd8float32& maxValues, + simd8uint32& maxIndices) { + for (size_t i = 0; i < 8; i++) { + bool flag = (candidateValues.f32[i] < currentValues.f32[i]); + minValues.f32[i] = flag ? candidateValues.f32[i] : currentValues.f32[i]; + minIndices.u32[i] = + flag ? candidateIndices.u32[i] : currentIndices.u32[i]; + maxValues.f32[i] = + !flag ? candidateValues.f32[i] : currentValues.f32[i]; + maxIndices.u32[i] = + !flag ? candidateIndices.u32[i] : currentIndices.u32[i]; + } +} + +} // namespace + +} // namespace faiss From 67574aabbcb76893498f3116509e90434db76379 Mon Sep 17 00:00:00 2001 From: Aditya Vidyadhar Kamath Date: Wed, 24 Apr 2024 05:40:49 -0700 Subject: [PATCH 042/116] Fix the endianness issue in AIX while running the benchmark. (#3345) Summary: This pull request is for issue https://github.com/facebookresearch/faiss/issues/3330. This patch makes sure that packed code arrays are in big endian format. Kindly let us know if we need any changes or if we can have a better approach. Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3345 Reviewed By: junjieqi Differential Revision: D55957630 Pulled By: mdouze fbshipit-source-id: f728f9563f6b942af9d8899b54662d7ceb811206 --- contrib/vecs_io.py | 3 + faiss/cppcontrib/detail/UintReader.h | 112 +++++++++++++++++++---- faiss/cppcontrib/sa_decode/Level2-inl.h | 113 +++++++++++++++++------- faiss/impl/platform_macros.h | 14 +++ faiss/impl/pq4_fast_scan.cpp | 11 +++ faiss/python/CMakeLists.txt | 13 +++ 6 files changed, 219 insertions(+), 47 deletions(-) diff --git a/contrib/vecs_io.py b/contrib/vecs_io.py index ea75d5f94d..cd16a2b73d 100644 --- a/contrib/vecs_io.py +++ b/contrib/vecs_io.py @@ -3,6 +3,7 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
+import sys import numpy as np """ @@ -13,6 +14,8 @@ def ivecs_read(fname): a = np.fromfile(fname, dtype='int32') + if sys.byteorder == 'big': + a.byteswap(inplace=True) d = a[0] return a.reshape(-1, d + 1)[:, 1:].copy() diff --git a/faiss/cppcontrib/detail/UintReader.h b/faiss/cppcontrib/detail/UintReader.h index 81e600f410..4a64a1a254 100644 --- a/faiss/cppcontrib/detail/UintReader.h +++ b/faiss/cppcontrib/detail/UintReader.h @@ -7,6 +7,7 @@ #pragma once +#include #include namespace faiss { @@ -31,7 +32,11 @@ struct Uint8Reader { if (N_ELEMENTS > CPOS + 3) { const uint32_t code32 = *reinterpret_cast( codes + ELEMENT_TO_READ * 4); +#ifdef FAISS_BIG_ENDIAN + return (code32) >> 24; +#else return (code32 & 0x000000FF); +#endif } else { return codes[CPOS]; } @@ -40,7 +45,11 @@ struct Uint8Reader { if (N_ELEMENTS > CPOS + 2) { const uint32_t code32 = *reinterpret_cast( codes + ELEMENT_TO_READ * 4); +#ifdef FAISS_BIG_ENDIAN + return (code32 & 0x00FF0000) >> 16; +#else return (code32 & 0x0000FF00) >> 8; +#endif } else { return codes[CPOS]; } @@ -49,7 +58,11 @@ struct Uint8Reader { if (N_ELEMENTS > CPOS + 1) { const uint32_t code32 = *reinterpret_cast( codes + ELEMENT_TO_READ * 4); +#ifdef FAISS_BIG_ENDIAN + return (code32 & 0x0000FF00) >> 8; +#else return (code32 & 0x00FF0000) >> 16; +#endif } else { return codes[CPOS]; } @@ -58,7 +71,11 @@ struct Uint8Reader { if (N_ELEMENTS > CPOS) { const uint32_t code32 = *reinterpret_cast( codes + ELEMENT_TO_READ * 4); +#ifdef FAISS_BIG_ENDIAN + return (code32 & 0x000000FF); +#else return (code32) >> 24; +#endif } else { return codes[CPOS]; } @@ -87,40 +104,61 @@ struct Uint10Reader { switch (SUB_ELEMENT) { case 0: { if (N_ELEMENTS > CPOS + 2) { - const uint32_t code32 = *reinterpret_cast( + uint32_t code32 = *reinterpret_cast( codes + ELEMENT_TO_READ * 5); +#ifdef FAISS_BIG_ENDIAN + code32 = Swap4Bytes(code32); +#endif return (code32 & 0b0000001111111111); } else { - const uint16_t code16 = *reinterpret_cast( + uint16_t code16 = *reinterpret_cast( codes + ELEMENT_TO_READ * 5 + 0); +#ifdef FAISS_BIG_ENDIAN + code16 = Swap2Bytes(code16); +#endif return (code16 & 0b0000001111111111); } } case 1: { if (N_ELEMENTS > CPOS + 1) { - const uint32_t code32 = *reinterpret_cast( + uint32_t code32 = *reinterpret_cast( codes + ELEMENT_TO_READ * 5); +#ifdef FAISS_BIG_ENDIAN + code32 = Swap4Bytes(code32); +#endif return (code32 & 0b000011111111110000000000) >> 10; } else { - const uint16_t code16 = *reinterpret_cast( + uint16_t code16 = *reinterpret_cast( codes + ELEMENT_TO_READ * 5 + 1); +#ifdef FAISS_BIG_ENDIAN + code16 = Swap2Bytes(code16); +#endif return (code16 & 0b0000111111111100) >> 2; } } case 2: { if (N_ELEMENTS > CPOS) { - const uint32_t code32 = *reinterpret_cast( + uint32_t code32 = *reinterpret_cast( codes + ELEMENT_TO_READ * 5); +#ifdef FAISS_BIG_ENDIAN + code32 = Swap4Bytes(code32); +#endif return (code32 & 0b00111111111100000000000000000000) >> 20; } else { - const uint16_t code16 = *reinterpret_cast( + uint16_t code16 = *reinterpret_cast( codes + ELEMENT_TO_READ * 5 + 2); +#ifdef FAISS_BIG_ENDIAN + code16 = Swap2Bytes(code16); +#endif return (code16 & 0b0011111111110000) >> 4; } } case 3: { - const uint16_t code16 = *reinterpret_cast( + uint16_t code16 = *reinterpret_cast( codes + ELEMENT_TO_READ * 5 + 3); +#ifdef FAISS_BIG_ENDIAN + code16 = Swap2Bytes(code16); +#endif return (code16 & 0b1111111111000000) >> 6; } } @@ -147,45 +185,69 @@ struct Uint12Reader { switch (SUB_ELEMENT) { case 0: { if (N_ELEMENTS > CPOS + 2) { - const uint32_t code32 = 
*reinterpret_cast( + uint32_t code32 = *reinterpret_cast( codes + ELEMENT_TO_READ * 6); +#ifdef FAISS_BIG_ENDIAN + code32 = Swap4Bytes(code32); +#endif return (code32 & 0b0000111111111111); } else { - const uint16_t code16 = *reinterpret_cast( + uint16_t code16 = *reinterpret_cast( codes + ELEMENT_TO_READ * 6 + 0); +#ifdef FAISS_BIG_ENDIAN + code16 = Swap2Bytes(code16); +#endif return (code16 & 0b0000111111111111); } } case 1: { if (N_ELEMENTS > CPOS + 1) { - const uint32_t code32 = *reinterpret_cast( + uint32_t code32 = *reinterpret_cast( codes + ELEMENT_TO_READ * 6); +#ifdef FAISS_BIG_ENDIAN + code32 = Swap4Bytes(code32); +#endif return (code32 & 0b111111111111000000000000) >> 12; } else { - const uint16_t code16 = *reinterpret_cast( + uint16_t code16 = *reinterpret_cast( codes + ELEMENT_TO_READ * 6 + 1); +#ifdef FAISS_BIG_ENDIAN + code16 = Swap2Bytes(code16); +#endif return (code16 & 0b1111111111110000) >> 4; } } case 2: { if (N_ELEMENTS > CPOS + 1) { - const uint32_t code32 = *reinterpret_cast( + uint32_t code32 = *reinterpret_cast( codes + ELEMENT_TO_READ * 6 + 2); +#ifdef FAISS_BIG_ENDIAN + code32 = Swap4Bytes(code32); +#endif return (code32 & 0b000011111111111100000000) >> 8; } else { - const uint16_t code16 = *reinterpret_cast( + uint16_t code16 = *reinterpret_cast( codes + ELEMENT_TO_READ * 6 + 3); +#ifdef FAISS_BIG_ENDIAN + code16 = Swap2Bytes(code16); +#endif return (code16 & 0b0000111111111111); } } case 3: { if (N_ELEMENTS > CPOS) { - const uint32_t code32 = *reinterpret_cast( + uint32_t code32 = *reinterpret_cast( codes + ELEMENT_TO_READ * 6 + 2); +#ifdef FAISS_BIG_ENDIAN + code32 = Swap4Bytes(code32); +#endif return (code32 & 0b11111111111100000000000000000000) >> 20; } else { - const uint16_t code16 = *reinterpret_cast( + uint16_t code16 = *reinterpret_cast( codes + ELEMENT_TO_READ * 6 + 4); +#ifdef FAISS_BIG_ENDIAN + code16 = Swap2Bytes(code16); +#endif return (code16 & 0b1111111111110000) >> 4; } } @@ -208,23 +270,39 @@ struct Uint16Reader { switch (SUB_ELEMENT) { case 0: { if (N_ELEMENTS > CPOS + 1) { - const uint32_t code32 = *reinterpret_cast( + uint32_t code32 = *reinterpret_cast( codes + ELEMENT_TO_READ * 4); +#ifdef FAISS_BIG_ENDIAN + code32 = Swap4Bytes(code32); +#endif return (code32 & 0x0000FFFF); } else { const uint16_t* const __restrict codesFp16 = reinterpret_cast(codes); +#ifdef FAISS_BIG_ENDIAN + uint16_t rt = codesFp16[CPOS]; + rt = Swap2Bytes(rt); + return rt; +#endif return codesFp16[CPOS]; } } case 1: { if (N_ELEMENTS > CPOS) { - const uint32_t code32 = *reinterpret_cast( + uint32_t code32 = *reinterpret_cast( codes + ELEMENT_TO_READ * 4); +#ifdef FAISS_BIG_ENDIAN + code32 = Swap4Bytes(code32); +#endif return code32 >> 16; } else { const uint16_t* const __restrict codesFp16 = reinterpret_cast(codes); +#ifdef FAISS_BIG_ENDIAN + uint16_t rt = codesFp16[CPOS]; + rt = Swap2Bytes(rt); + return rt; +#endif return codesFp16[CPOS]; } } diff --git a/faiss/cppcontrib/sa_decode/Level2-inl.h b/faiss/cppcontrib/sa_decode/Level2-inl.h index 36355af001..1eb7767ba8 100644 --- a/faiss/cppcontrib/sa_decode/Level2-inl.h +++ b/faiss/cppcontrib/sa_decode/Level2-inl.h @@ -12,10 +12,19 @@ #include #include +#include namespace faiss { namespace cppcontrib { +bool isBigEndian() { +#ifdef FAISS_BIG_ENDIAN + return true; +#else + return false; +#endif +} + //////////////////////////////////////////////////////////////////////////////////// /// Index2LevelDecoder //////////////////////////////////////////////////////////////////////////////////// @@ -72,9 +81,14 @@ struct 
Index2LevelDecoder { const intptr_t coarseCentroidOffset = i % COARSE_SIZE; const intptr_t fineCentroidIdx = i / FINE_SIZE; const intptr_t fineCentroidOffset = i % FINE_SIZE; - - const intptr_t coarseCode = coarse[coarseCentroidIdx]; - const intptr_t fineCode = fine[fineCentroidIdx]; + intptr_t coarseCode, fineCode; + if (isBigEndian() && sizeof(coarse_storage_type) == 2) { + coarseCode = Swap2Bytes(coarse[coarseCentroidIdx]); + fineCode = Swap2Bytes(fine[fineCentroidIdx]); + } else { + coarseCode = coarse[coarseCentroidIdx]; + fineCode = fine[fineCentroidIdx]; + } const float* const __restrict coarsePtr = pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode) * @@ -112,9 +126,14 @@ struct Index2LevelDecoder { const intptr_t fineCentroidIdx = i / FINE_SIZE; const intptr_t fineCentroidOffset = i % FINE_SIZE; - const intptr_t coarseCode = coarse[coarseCentroidIdx]; - const intptr_t fineCode = fine[fineCentroidIdx]; - + intptr_t coarseCode, fineCode; + if (isBigEndian() && sizeof(coarse_storage_type) == 2) { + coarseCode = Swap2Bytes(coarse[coarseCentroidIdx]); + fineCode = Swap2Bytes(fine[fineCentroidIdx]); + } else { + coarseCode = coarse[coarseCentroidIdx]; + fineCode = fine[fineCentroidIdx]; + } const float* const __restrict coarsePtr = pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode) * COARSE_SIZE + @@ -162,11 +181,18 @@ struct Index2LevelDecoder { const intptr_t coarseCentroidOffset = i % COARSE_SIZE; const intptr_t fineCentroidIdx = i / FINE_SIZE; const intptr_t fineCentroidOffset = i % FINE_SIZE; - - const intptr_t coarseCode0 = coarse0[coarseCentroidIdx]; - const intptr_t fineCode0 = fine0[fineCentroidIdx]; - const intptr_t coarseCode1 = coarse1[coarseCentroidIdx]; - const intptr_t fineCode1 = fine1[fineCentroidIdx]; + intptr_t coarseCode0, coarseCode1, fineCode0, fineCode1; + if (isBigEndian() && sizeof(coarse_storage_type) == 2) { + coarseCode0 = Swap2Bytes(coarse0[coarseCentroidIdx]); + fineCode0 = Swap2Bytes(fine0[fineCentroidIdx]); + coarseCode1 = Swap2Bytes(coarse1[coarseCentroidIdx]); + fineCode1 = Swap2Bytes(fine1[fineCentroidIdx]); + } else { + coarseCode0 = coarse0[coarseCentroidIdx]; + fineCode0 = fine0[fineCentroidIdx]; + coarseCode1 = coarse1[coarseCentroidIdx]; + fineCode1 = fine1[fineCentroidIdx]; + } const float* const __restrict coarsePtr0 = pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * @@ -222,11 +248,18 @@ struct Index2LevelDecoder { const intptr_t coarseCentroidOffset = i % COARSE_SIZE; const intptr_t fineCentroidIdx = i / FINE_SIZE; const intptr_t fineCentroidOffset = i % FINE_SIZE; - - const intptr_t coarseCode0 = coarse0[coarseCentroidIdx]; - const intptr_t fineCode0 = fine0[fineCentroidIdx]; - const intptr_t coarseCode1 = coarse1[coarseCentroidIdx]; - const intptr_t fineCode1 = fine1[fineCentroidIdx]; + intptr_t coarseCode0, coarseCode1, fineCode0, fineCode1; + if (isBigEndian() && sizeof(coarse_storage_type) == 2) { + coarseCode0 = Swap2Bytes(coarse0[coarseCentroidIdx]); + fineCode0 = Swap2Bytes(fine0[fineCentroidIdx]); + coarseCode1 = Swap2Bytes(coarse1[coarseCentroidIdx]); + fineCode1 = Swap2Bytes(fine1[fineCentroidIdx]); + } else { + coarseCode0 = coarse0[coarseCentroidIdx]; + fineCode0 = fine0[fineCentroidIdx]; + coarseCode1 = coarse1[coarseCentroidIdx]; + fineCode1 = fine1[fineCentroidIdx]; + } const float* const __restrict coarsePtr0 = pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * @@ -292,13 +325,23 @@ struct Index2LevelDecoder { const intptr_t 
coarseCentroidOffset = i % COARSE_SIZE; const intptr_t fineCentroidIdx = i / FINE_SIZE; const intptr_t fineCentroidOffset = i % FINE_SIZE; - - const intptr_t coarseCode0 = coarse0[coarseCentroidIdx]; - const intptr_t fineCode0 = fine0[fineCentroidIdx]; - const intptr_t coarseCode1 = coarse1[coarseCentroidIdx]; - const intptr_t fineCode1 = fine1[fineCentroidIdx]; - const intptr_t coarseCode2 = coarse2[coarseCentroidIdx]; - const intptr_t fineCode2 = fine2[fineCentroidIdx]; + intptr_t coarseCode0, coarseCode1, fineCode0, fineCode1; + intptr_t coarseCode2, fineCode2; + if (isBigEndian() && sizeof(coarse_storage_type) == 2) { + coarseCode0 = Swap2Bytes(coarse0[coarseCentroidIdx]); + fineCode0 = Swap2Bytes(fine0[fineCentroidIdx]); + coarseCode1 = Swap2Bytes(coarse1[coarseCentroidIdx]); + fineCode1 = Swap2Bytes(fine1[fineCentroidIdx]); + coarseCode2 = Swap2Bytes(coarse2[coarseCentroidIdx]); + fineCode2 = Swap2Bytes(fine2[fineCentroidIdx]); + } else { + coarseCode0 = coarse0[coarseCentroidIdx]; + fineCode0 = fine0[fineCentroidIdx]; + coarseCode1 = coarse1[coarseCentroidIdx]; + fineCode1 = fine1[fineCentroidIdx]; + coarseCode2 = coarse2[coarseCentroidIdx]; + fineCode2 = fine2[fineCentroidIdx]; + } const float* const __restrict coarsePtr0 = pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * @@ -369,13 +412,23 @@ struct Index2LevelDecoder { const intptr_t coarseCentroidOffset = i % COARSE_SIZE; const intptr_t fineCentroidIdx = i / FINE_SIZE; const intptr_t fineCentroidOffset = i % FINE_SIZE; - - const intptr_t coarseCode0 = coarse0[coarseCentroidIdx]; - const intptr_t fineCode0 = fine0[fineCentroidIdx]; - const intptr_t coarseCode1 = coarse1[coarseCentroidIdx]; - const intptr_t fineCode1 = fine1[fineCentroidIdx]; - const intptr_t coarseCode2 = coarse2[coarseCentroidIdx]; - const intptr_t fineCode2 = fine2[fineCentroidIdx]; + intptr_t coarseCode0, fineCode0, coarseCode1, fineCode1; + intptr_t coarseCode2, fineCode2; + if (isBigEndian() && sizeof(coarse_storage_type) == 2) { + coarseCode0 = Swap2Bytes(coarse0[coarseCentroidIdx]); + fineCode0 = Swap2Bytes(fine0[fineCentroidIdx]); + coarseCode1 = Swap2Bytes(coarse1[coarseCentroidIdx]); + fineCode1 = Swap2Bytes(fine1[fineCentroidIdx]); + coarseCode2 = Swap2Bytes(coarse2[coarseCentroidIdx]); + fineCode2 = Swap2Bytes(fine2[fineCentroidIdx]); + } else { + coarseCode0 = coarse0[coarseCentroidIdx]; + fineCode0 = fine0[fineCentroidIdx]; + coarseCode1 = coarse1[coarseCentroidIdx]; + fineCode1 = fine1[fineCentroidIdx]; + coarseCode2 = coarse2[coarseCentroidIdx]; + fineCode2 = fine2[fineCentroidIdx]; + } const float* const __restrict coarsePtr0 = pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * diff --git a/faiss/impl/platform_macros.h b/faiss/impl/platform_macros.h index 2aecc51222..a0faea7cba 100644 --- a/faiss/impl/platform_macros.h +++ b/faiss/impl/platform_macros.h @@ -165,3 +165,17 @@ inline int __builtin_clzll(uint64_t x) { #endif // clang-format on + +/******************************************************* + * BIGENDIAN specific macros + *******************************************************/ +#if !defined(_MSC_VER) && \ + (defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)) +#define FAISS_BIG_ENDIAN +#endif + +#define Swap2Bytes(val) ((((val) >> 8) & 0x00FF) | (((val) << 8) & 0xFF00)) + +#define Swap4Bytes(val) \ + ((((val) >> 24) & 0x000000FF) | (((val) >> 8) & 0x0000FF00) | \ + (((val) << 8) & 0x00FF0000) | (((val) << 24) & 0xFF000000)) diff --git a/faiss/impl/pq4_fast_scan.cpp 
b/faiss/impl/pq4_fast_scan.cpp index 6173ecef47..127646e0eb 100644 --- a/faiss/impl/pq4_fast_scan.cpp +++ b/faiss/impl/pq4_fast_scan.cpp @@ -6,6 +6,7 @@ */ #include +#include #include #include @@ -58,8 +59,13 @@ void pq4_pack_codes( return; } memset(blocks, 0, nb * nsq / 2); +#ifdef FAISS_BIG_ENDIAN + const uint8_t perm0[16] = { + 8, 0, 9, 1, 10, 2, 11, 3, 12, 4, 13, 5, 14, 6, 15, 7}; +#else const uint8_t perm0[16] = { 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15}; +#endif uint8_t* codes2 = blocks; for (size_t i0 = 0; i0 < nb; i0 += bbs) { @@ -93,8 +99,13 @@ void pq4_pack_codes_range( size_t bbs, size_t nsq, uint8_t* blocks) { +#ifdef FAISS_BIG_ENDIAN + const uint8_t perm0[16] = { + 8, 0, 9, 1, 10, 2, 11, 3, 12, 4, 13, 5, 14, 6, 15, 7}; +#else const uint8_t perm0[16] = { 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15}; +#endif // range of affected blocks size_t block0 = i0 / bbs; diff --git a/faiss/python/CMakeLists.txt b/faiss/python/CMakeLists.txt index 8bca710f5f..dee8c7762e 100644 --- a/faiss/python/CMakeLists.txt +++ b/faiss/python/CMakeLists.txt @@ -67,11 +67,20 @@ else() find_package(faiss REQUIRED) endif() +if (${CMAKE_SYSTEM_NAME} MATCHES "AIX") +swig_add_library(swigfaiss + TYPE MODULE + LANGUAGE python + SOURCES swigfaiss.swig +) +else () swig_add_library(swigfaiss TYPE SHARED LANGUAGE python SOURCES swigfaiss.swig ) +endif() + set_property(TARGET swigfaiss PROPERTY SWIG_COMPILE_OPTIONS -doxygen) set_property(SOURCE swigfaiss_avx2.swig @@ -160,6 +169,10 @@ set_property(TARGET faiss_python_callbacks PROPERTY POSITION_INDEPENDENT_CODE ON ) +if (${CMAKE_SYSTEM_NAME} MATCHES "AIX") +target_link_libraries(faiss_python_callbacks PRIVATE faiss) +endif() + # Hack so that python_callbacks.h can be included as # `#include `. target_include_directories(faiss_python_callbacks PRIVATE ${PROJECT_SOURCE_DIR}/../..) From 783e044a2dfb1f3cdd2c6e65362e496ed07b5e2c Mon Sep 17 00:00:00 2001 From: Matthijs Douze Date: Wed, 24 Apr 2024 05:40:49 -0700 Subject: [PATCH 043/116] support big-endian machines (#3361) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3361 Fix a few issues in the PR. 
Normally all tests should pass on a little-endian machine Reviewed By: junjieqi Differential Revision: D56003181 fbshipit-source-id: 405dec8c71898494f5ddcd2718c35708a1abf9cb --- contrib/vecs_io.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/contrib/vecs_io.py b/contrib/vecs_io.py index cd16a2b73d..5d18c0b162 100644 --- a/contrib/vecs_io.py +++ b/contrib/vecs_io.py @@ -14,8 +14,8 @@ def ivecs_read(fname): a = np.fromfile(fname, dtype='int32') - if sys.byteorder == 'big': - a.byteswap(inplace=True) + if sys.big_endian: + a.byteswap(inplace=True) d = a[0] return a.reshape(-1, d + 1)[:, 1:].copy() @@ -25,6 +25,7 @@ def fvecs_read(fname): def ivecs_mmap(fname): + assert not sys.big_endian a = np.memmap(fname, dtype='int32', mode='r') d = a[0] return a.reshape(-1, d + 1)[:, 1:] @@ -36,7 +37,11 @@ def fvecs_mmap(fname): def bvecs_mmap(fname): x = np.memmap(fname, dtype='uint8', mode='r') - d = x[:4].view('int32')[0] + if sys.big_endian: + da = x[:4][::-1].copy() + d = da.view('int32')[0] + else: + d = x[:4].view('int32')[0] return x.reshape(-1, d + 4)[:, 4:] @@ -45,6 +50,8 @@ def ivecs_write(fname, m): m1 = np.empty((n, d + 1), dtype='int32') m1[:, 0] = d m1[:, 1:] = m + if sys.big_endian: + m1.byteswap(inplace=True) m1.tofile(fname) From 2379b45f827047182d424b76e7cb454a1929fa7a Mon Sep 17 00:00:00 2001 From: Kumar Saurabh Arora Date: Wed, 24 Apr 2024 09:42:05 -0700 Subject: [PATCH 044/116] Few fixes in bench_fw to enable IndexFromCodec (#3383) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3383 In this diff, I am fixing minor issues in bench_fw where certain fields are not accessible when the index is built from a codec. It also requires the index to be discovered through a codec alias, since an index factory string is not always available. A subsequent diff, internal to Meta, will add a test case that exercises this path.
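For illustration only (not part of this diff), a descriptor that points the benchmark at a pre-built codec could look roughly like the sketch below; the import path, codec location, alias, and search parameters are assumed values, not taken from this change:

    # hypothetical usage sketch of the new codec_alias field
    from bench_fw.descriptors import IndexDescriptor  # assuming benchs/ is on the Python path

    desc = IndexDescriptor(
        path="/tmp/ivf1024_pq8.codec",   # assumed location of a pre-trained codec
        codec_alias="IVF1024,PQ8np",     # assumed alias used to discover the index
        search_params={"nprobe": 16},
    )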
Reviewed By: algoriddle Differential Revision: D56444641 fbshipit-source-id: b7af7e7bb47b20bbb5515a66f41dd24f42459d52 --- benchs/bench_fw/benchmark.py | 9 ++++++--- benchs/bench_fw/descriptors.py | 1 + benchs/bench_fw/index.py | 10 ++++++++-- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/benchs/bench_fw/benchmark.py b/benchs/bench_fw/benchmark.py index 1053f99388..8ca68c4cd8 100644 --- a/benchs/bench_fw/benchmark.py +++ b/benchs/bench_fw/benchmark.py @@ -208,9 +208,11 @@ def set_io(self, benchmark_io): self.io.distance_metric = self.distance_metric self.io.distance_metric_type = self.distance_metric_type - def get_index_desc(self, factory: str) -> Optional[IndexDescriptor]: + def get_index_desc(self, factory_or_codec: str) -> Optional[IndexDescriptor]: for desc in self.index_descs: - if desc.factory == factory: + if desc.factory == factory_or_codec: + return desc + if desc.codec_alias == factory_or_codec: return desc return None @@ -232,7 +234,7 @@ def range_search_reference(self, index, parameters, range_metric): parameters, radius=m_radius, ) - flat = index.factory == "Flat" + flat = index.is_flat_index() ( gt_radius, range_search_metric_function, @@ -650,6 +652,7 @@ def benchmark( f"Range index {index_desc.factory} has no radius_score" ) results["metrics"] = {} + self.build_index_wrapper(index_desc) for metric_key, range_metric in index_desc.range_metrics.items(): ( gt_radius, diff --git a/benchs/bench_fw/descriptors.py b/benchs/bench_fw/descriptors.py index f1dd7354c2..173b07ce16 100644 --- a/benchs/bench_fw/descriptors.py +++ b/benchs/bench_fw/descriptors.py @@ -20,6 +20,7 @@ class IndexDescriptor: # but not both at the same time. path: Optional[str] = None factory: Optional[str] = None + codec_alias: Optional[str] = None construction_params: Optional[List[Dict[str, int]]] = None search_params: Optional[Dict[str, int]] = None # range metric definitions diff --git a/benchs/bench_fw/index.py b/benchs/bench_fw/index.py index 14f2158e64..3deaa4afcf 100644 --- a/benchs/bench_fw/index.py +++ b/benchs/bench_fw/index.py @@ -495,7 +495,7 @@ def range_search( radius: Optional[float] = None, ): logger.info("range_search: begin") - if search_parameters is not None and search_parameters["snap"] == 1: + if search_parameters is not None and search_parameters.get("snap") == 1: query_vectors = self.snap(query_vectors) filename = ( self.get_range_search_name( @@ -776,6 +776,9 @@ def add_range_or_val(name, range): ) return op + def is_flat_index(self): + return self.get_index_name().startswith("Flat") + # IndexFromCodec, IndexFromQuantizer and IndexFromPreTransform # are used to wrap pre-trained Faiss indices (codecs) @@ -807,6 +810,9 @@ def get_codec_name(self): name += Index.param_dict_list_to_name(self.construction_params) return name + def fetch_meta(self, dry_run=False): + return None, None + def fetch_codec(self): codec = self.io.read_index( os.path.basename(self.path), @@ -911,7 +917,7 @@ def fetch_codec(self, dry_run=False): assert codec_size is not None meta = { "training_time": training_time, - "training_size": self.training_vectors.num_vectors, + "training_size": self.training_vectors.num_vectors if self.training_vectors else 0, "codec_size": codec_size, "sa_code_size": self.get_sa_code_size(codec), "code_size": self.get_code_size(codec), From 03750f51419e9a2c6b5c533c2e6fbc837bdb646a Mon Sep 17 00:00:00 2001 From: Amir Sadoughi Date: Wed, 24 Apr 2024 14:11:02 -0700 Subject: [PATCH 045/116] Fix IndexBinary.assign Python method Summary: Fixes #3343 Reviewed By: kuarora, 
junjieqi Differential Revision: D56526842 fbshipit-source-id: b7c4377495db4e68283cf4ce2b7c8fae008cd404 --- faiss/python/class_wrappers.py | 34 ++++++++++++++++++++++++++++++++++ tests/test_index_binary.py | 3 +++ 2 files changed, 37 insertions(+) diff --git a/faiss/python/class_wrappers.py b/faiss/python/class_wrappers.py index 4a6808d286..4af2345009 100644 --- a/faiss/python/class_wrappers.py +++ b/faiss/python/class_wrappers.py @@ -956,10 +956,44 @@ def replacement_remove_ids(self, x): sel = IDSelectorBatch(x.size, swig_ptr(x)) return self.remove_ids_c(sel) + def replacement_assign(self, x, k, labels=None): + """Find the k nearest neighbors of the set of vectors x in the index. + This is the same as the `search` method, but discards the distances. + + Parameters + ---------- + x : array_like + Query vectors, shape (n, d) where d is appropriate for the index. + `dtype` must be uint8. + k : int + Number of nearest neighbors. + labels : array_like, optional + Labels array to store the results. + + Returns + ------- + labels: array_like + Labels of the nearest neighbors, shape (n, k). + When not enough results are found, the label is set to -1 + """ + n, d = x.shape + x = _check_dtype_uint8(x) + assert d == self.code_size + assert k > 0 + + if labels is None: + labels = np.empty((n, k), dtype=np.int64) + else: + assert labels.shape == (n, k) + + self.assign_c(n, swig_ptr(x), swig_ptr(labels), k) + return labels + replace_method(the_class, 'add', replacement_add) replace_method(the_class, 'add_with_ids', replacement_add_with_ids) replace_method(the_class, 'train', replacement_train) replace_method(the_class, 'search', replacement_search) + replace_method(the_class, 'assign', replacement_assign) replace_method(the_class, 'range_search', replacement_range_search) replace_method(the_class, 'reconstruct', replacement_reconstruct) replace_method(the_class, 'reconstruct_n', replacement_reconstruct_n) diff --git a/tests/test_index_binary.py b/tests/test_index_binary.py index b505e0ba1c..3acf622fd4 100644 --- a/tests/test_index_binary.py +++ b/tests/test_index_binary.py @@ -100,6 +100,9 @@ def test_flat(self): index.add(self.xb) D, I = index.search(self.xq, 3) + I2 = index.assign(x=self.xq, k=3, labels=None) + assert np.all(I == I2) + for i in range(nq): for j, dj in zip(I[i], D[i]): ref_dis = binary_dis(self.xq[i], self.xb[j]) From bd22c936e108d4551648435764b126aef40b4530 Mon Sep 17 00:00:00 2001 From: Junjie Qi Date: Thu, 25 Apr 2024 02:51:55 -0700 Subject: [PATCH 046/116] Fix swig osx (#3357) Summary: The osx failed https://app.circleci.com/pipelines/github/facebookresearch/faiss/5698/workflows/4e029c32-8d8b-4db7-99e2-8e802aad6653/jobs/32701 Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3357 Reviewed By: kuarora Differential Revision: D56039739 Pulled By: junjieqi fbshipit-source-id: dd434a8817148364797eae39c09e0e1e9edbe858 --- faiss/python/swigfaiss.swig | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/faiss/python/swigfaiss.swig b/faiss/python/swigfaiss.swig index 0ea93609e3..5c9a7b3fa7 100644 --- a/faiss/python/swigfaiss.swig +++ b/faiss/python/swigfaiss.swig @@ -1022,14 +1022,17 @@ PyObject *swig_ptr (PyObject *a) return SWIG_NewPointerObj(data, SWIGTYPE_p_bool, 0); } if(PyArray_TYPE(ao) == NPY_UINT64) { -#if (__SIZEOF_LONG__ == 8) + // Convert npy64 either long or long long and it depends on how compiler define int64_t. + // In the 64bit machine, typically the int64_t should be long but it is not hold for Apple osx. 
+ // In this case, we want to convert npy64 to long_Long in osx +#if __SIZEOF_LONG__ == 8 && !defined(__APPLE__) return SWIG_NewPointerObj(data, SWIGTYPE_p_unsigned_long, 0); #else return SWIG_NewPointerObj(data, SWIGTYPE_p_unsigned_long_long, 0); #endif } if(PyArray_TYPE(ao) == NPY_INT64) { -#if (__SIZEOF_LONG__ == 8) +#if __SIZEOF_LONG__ == 8 && !defined(__APPLE__) return SWIG_NewPointerObj(data, SWIGTYPE_p_long, 0); #else return SWIG_NewPointerObj(data, SWIGTYPE_p_long_long, 0); @@ -1054,11 +1057,8 @@ struct PythonInterruptCallback: faiss::InterruptCallback { } }; - - %} - %init %{ /* needed, else crash at runtime */ import_array(); @@ -1121,15 +1121,8 @@ int * cast_integer_to_int_ptr (int64_t x) { void * cast_integer_to_void_ptr (int64_t x) { return (void*)x; } - %} - - - - - - %inline %{ void wait() { // in gdb, use return to get out of this function From 5cbff67c7b8e0086d17466e1fecca1c3799c6430 Mon Sep 17 00:00:00 2001 From: Gergely Szilvasy Date: Fri, 26 Apr 2024 06:32:34 -0700 Subject: [PATCH 047/116] fix raft log spew Summary: Remove debugging log lines Reviewed By: mlomeli1 Differential Revision: D56626636 fbshipit-source-id: 2721b84e4e1359d1372df2b2c95cc668c6a75c3f --- faiss/gpu/GpuDistance.cu | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/faiss/gpu/GpuDistance.cu b/faiss/gpu/GpuDistance.cu index a235404b14..38a62f03bb 100644 --- a/faiss/gpu/GpuDistance.cu +++ b/faiss/gpu/GpuDistance.cu @@ -327,7 +327,7 @@ void bfKnn(GpuResourcesProvider* prov, const GpuDistanceParams& args) { int64_t, raft::col_major>> index_vec = {index.view()}; - RAFT_LOG_INFO("Invoking flat bfknn"); + brute_force::knn( handle, index_vec, @@ -354,10 +354,7 @@ void bfKnn(GpuResourcesProvider* prov, const GpuDistanceParams& args) { [] __device__(const float& a) { return powf(a, 2); }); } - RAFT_LOG_INFO("Done."); - handle.sync_stream(); - RAFT_LOG_INFO("All synced."); } else #else if (should_use_raft(args)) { From a233bc93e3815b8803fc4361a1ce128c79063e4a Mon Sep 17 00:00:00 2001 From: Matthijs Douze Date: Fri, 26 Apr 2024 09:52:23 -0700 Subject: [PATCH 048/116] Demo on how to address mulitple index contents Summary: This demonstrates how to query several independent IVF indexes with a trained index in common. This avoids to duplicate the coarse quantizer and metadata in memory. On the Faiss side, it also implements a InvertedListIterator on top of the flat inverted lists, which can prove useful. Reviewed By: junjieqi Differential Revision: D56575887 fbshipit-source-id: cc3b26e952ee21f24b10169b5b614066600cf4b8 --- faiss/invlists/InvertedLists.cpp | 72 ++++++++++--- faiss/invlists/InvertedLists.h | 27 +++-- tests/CMakeLists.txt | 1 + tests/test_common_ivf_empty_index.cpp | 144 ++++++++++++++++++++++++++ 4 files changed, 218 insertions(+), 26 deletions(-) create mode 100644 tests/test_common_ivf_empty_index.cpp diff --git a/faiss/invlists/InvertedLists.cpp b/faiss/invlists/InvertedLists.cpp index cc337d004b..c2bfa2cabc 100644 --- a/faiss/invlists/InvertedLists.cpp +++ b/faiss/invlists/InvertedLists.cpp @@ -5,8 +5,6 @@ * LICENSE file in the root directory of this source tree. 
*/ -// -*- c++ -*- - #include #include @@ -24,18 +22,10 @@ InvertedListsIterator::~InvertedListsIterator() {} ******************************************/ InvertedLists::InvertedLists(size_t nlist, size_t code_size) - : nlist(nlist), code_size(code_size), use_iterator(false) {} + : nlist(nlist), code_size(code_size) {} InvertedLists::~InvertedLists() {} -bool InvertedLists::is_empty(size_t list_no, void* inverted_list_context) - const { - return use_iterator ? !std::unique_ptr( - get_iterator(list_no, inverted_list_context)) - ->is_available() - : list_size(list_no) == 0; -} - idx_t InvertedLists::get_single_id(size_t list_no, size_t offset) const { assert(offset < list_size(list_no)); const idx_t* ids = get_ids(list_no); @@ -78,12 +68,6 @@ void InvertedLists::reset() { } } -InvertedListsIterator* InvertedLists::get_iterator( - size_t /*list_no*/, - void* /*inverted_list_context*/) const { - FAISS_THROW_MSG("get_iterator is not supported"); -} - void InvertedLists::merge_from(InvertedLists* oivf, size_t add_id) { #pragma omp parallel for for (idx_t i = 0; i < nlist; i++) { @@ -233,6 +217,54 @@ size_t InvertedLists::compute_ntotal() const { return tot; } +bool InvertedLists::is_empty(size_t list_no, void* inverted_list_context) + const { + if (use_iterator) { + return !std::unique_ptr( + get_iterator(list_no, inverted_list_context)) + ->is_available(); + } else { + FAISS_THROW_IF_NOT(inverted_list_context == nullptr); + return list_size(list_no) == 0; + } +} + +// implemnent iterator on top of get_codes / get_ids +namespace { + +struct CodeArrayIterator : InvertedListsIterator { + size_t list_size; + size_t code_size; + InvertedLists::ScopedCodes codes; + InvertedLists::ScopedIds ids; + size_t idx = 0; + + CodeArrayIterator(const InvertedLists* il, size_t list_no) + : list_size(il->list_size(list_no)), + code_size(il->code_size), + codes(il, list_no), + ids(il, list_no) {} + + bool is_available() const override { + return idx < list_size; + } + void next() override { + idx++; + } + std::pair get_id_and_codes() override { + return {ids[idx], codes.get() + code_size * idx}; + } +}; + +} // namespace + +InvertedListsIterator* InvertedLists::get_iterator( + size_t list_no, + void* inverted_list_context) const { + FAISS_THROW_IF_NOT(inverted_list_context == nullptr); + return new CodeArrayIterator(this, list_no); +} + /***************************************** * ArrayInvertedLists implementation ******************************************/ @@ -264,6 +296,12 @@ size_t ArrayInvertedLists::list_size(size_t list_no) const { return ids[list_no].size(); } +bool ArrayInvertedLists::is_empty(size_t list_no, void* inverted_list_context) + const { + FAISS_THROW_IF_NOT(inverted_list_context == nullptr); + return ids[list_no].size() == 0; +} + const uint8_t* ArrayInvertedLists::get_codes(size_t list_no) const { assert(list_no < nlist); return codes[list_no].data(); diff --git a/faiss/invlists/InvertedLists.h b/faiss/invlists/InvertedLists.h index 90a9d65411..b24700fad1 100644 --- a/faiss/invlists/InvertedLists.h +++ b/faiss/invlists/InvertedLists.h @@ -37,7 +37,9 @@ struct InvertedListsIterator { struct InvertedLists { size_t nlist; ///< number of possible key values size_t code_size; ///< code size per vector in bytes - bool use_iterator; + + /// request to use iterator rather than get_codes / get_ids + bool use_iterator = false; InvertedLists(size_t nlist, size_t code_size); @@ -50,17 +52,9 @@ struct InvertedLists { /************************* * Read only functions */ - // check if the list is empty - bool 
is_empty(size_t list_no, void* inverted_list_context) const; - /// get the size of a list virtual size_t list_size(size_t list_no) const = 0; - /// get iterable for lists that use_iterator - virtual InvertedListsIterator* get_iterator( - size_t list_no, - void* inverted_list_context) const; - /** get the codes for an inverted list * must be released by release_codes * @@ -92,6 +86,18 @@ struct InvertedLists { /// a list can be -1 hence the signed long virtual void prefetch_lists(const idx_t* list_nos, int nlist) const; + /***************************************** + * Iterator interface (with context) */ + + /// check if the list is empty + virtual bool is_empty(size_t list_no, void* inverted_list_context = nullptr) + const; + + /// get iterable for lists that use_iterator + virtual InvertedListsIterator* get_iterator( + size_t list_no, + void* inverted_list_context = nullptr) const; + /************************* * writing functions */ @@ -262,6 +268,9 @@ struct ArrayInvertedLists : InvertedLists { /// permute the inverted lists, map maps new_id to old_id void permute_invlists(const idx_t* map); + bool is_empty(size_t list_no, void* inverted_list_context = nullptr) + const override; + ~ArrayInvertedLists() override; }; diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 66ec9f74a5..443195eecb 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -33,6 +33,7 @@ set(FAISS_TEST_SRC test_partitioning.cpp test_fastscan_perf.cpp test_disable_pq_sdc_tables.cpp + test_common_ivf_empty_index.cpp ) add_executable(faiss_test ${FAISS_TEST_SRC}) diff --git a/tests/test_common_ivf_empty_index.cpp b/tests/test_common_ivf_empty_index.cpp new file mode 100644 index 0000000000..1a99b77141 --- /dev/null +++ b/tests/test_common_ivf_empty_index.cpp @@ -0,0 +1,144 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. + +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +/* This demonstrates how to query several independent IVF indexes with a trained + *index in common. This avoids to duplicate the coarse quantizer and metadata + *in memory. 
+ **/ + +namespace { + +int d = 64; + +}; // namespace + +std::vector get_random_vectors(size_t n, int seed) { + std::vector x(n * d); + faiss::rand_smooth_vectors(n, d, x.data(), seed); + seed++; + return x; +} + +/** InvetedLists implementation that dispatches the search to an InvertedList + * object that is passed in at query time */ + +struct DispatchingInvertedLists : faiss::ReadOnlyInvertedLists { + DispatchingInvertedLists(size_t nlist, size_t code_size) + : faiss::ReadOnlyInvertedLists(nlist, code_size) { + use_iterator = true; + } + + faiss::InvertedListsIterator* get_iterator( + size_t list_no, + void* inverted_list_context = nullptr) const override { + assert(inverted_list_context); + auto il = + static_cast(inverted_list_context); + return il->get_iterator(list_no); + } + + using idx_t = faiss::idx_t; + + size_t list_size(size_t list_no) const override { + FAISS_THROW_MSG("use iterator interface"); + } + const uint8_t* get_codes(size_t list_no) const override { + FAISS_THROW_MSG("use iterator interface"); + } + const idx_t* get_ids(size_t list_no) const override { + FAISS_THROW_MSG("use iterator interface"); + } +}; + +TEST(COMMON, test_common_trained_index) { + int N = 3; // number of independent indexes + int nt = 500; // training vectors + int nb = 200; // nb database vectors per index + int nq = 10; // nb queries performed on each index + int k = 4; // restults requested per query + + // construct and build an "empty index": a trained index that does not + // itself hold any data + std::unique_ptr empty_index(dynamic_cast( + faiss::index_factory(d, "IVF32,PQ8np"))); + auto xt = get_random_vectors(nt, 123); + empty_index->train(nt, xt.data()); + empty_index->nprobe = 4; + + // reference run: build one index for each set of db / queries and record + // results + std::vector> ref_I(N); + + for (int i = 0; i < N; i++) { + // clone the empty index + std::unique_ptr index( + faiss::clone_index(empty_index.get())); + auto xb = get_random_vectors(nb, 1234 + i); + auto xq = get_random_vectors(nq, 12345 + i); + // add vectors and perform a search + index->add(nb, xb.data()); + std::vector D(k * nq); + std::vector I(k * nq); + index->search(nq, xq.data(), k, D.data(), I.data()); + // record result as reference + ref_I[i] = I; + } + + // build a set of inverted lists for each independent index + std::vector sub_invlists; + + for (int i = 0; i < N; i++) { + // swap in other inverted lists + sub_invlists.emplace_back(empty_index->nlist, empty_index->code_size); + faiss::InvertedLists* invlists = &sub_invlists.back(); + + // replace_invlists swaps in a new InvertedLists for an existing index + empty_index->replace_invlists(invlists, false); + empty_index->reset(); // reset id counter to 0 + // populate inverted lists + auto xb = get_random_vectors(nb, 1234 + i); + empty_index->add(nb, xb.data()); + } + + // perform search dispatching to the sub-invlists. 
At search time, we don't + // use replace_invlists because that would wreak havoc in a multithreaded + // context + DispatchingInvertedLists di(empty_index->nlist, empty_index->code_size); + empty_index->replace_invlists(&di, false); + + std::vector> new_I(N); + + // run searches in the independent indexes but with a common empty_index +#pragma omp parallel for + for (int i = 0; i < N; i++) { + auto xq = get_random_vectors(nq, 12345 + i); + std::vector D(k * nq); + std::vector I(k * nq); + + // here we set to what sub-index the queries should be directed + faiss::SearchParametersIVF params; + params.nprobe = empty_index->nprobe; + params.inverted_list_context = &sub_invlists[i]; + + empty_index->search(nq, xq.data(), k, D.data(), I.data(), ¶ms); + new_I[i] = I; + } + + // compare with reference reslt + for (int i = 0; i < N; i++) { + ASSERT_EQ(ref_I[i], new_I[i]); + } +} From c5599a06849b9c0ff0060915373e0b04bd360867 Mon Sep 17 00:00:00 2001 From: Richard Barnes Date: Sat, 27 Apr 2024 09:37:26 -0700 Subject: [PATCH 049/116] Fix deprecated use of 0/NULL in faiss/python/python_callbacks.cpp + 1 Summary: `nullptr` is typesafe. `0` and `NULL` are not. In the future, only `nullptr` will be allowed. This diff helps us embrace the future _now_ in service of enabling `-Wzero-as-null-pointer-constant`. Reviewed By: palmje Differential Revision: D56650318 fbshipit-source-id: 803ae62114c39143b65946f6f0387715eaf7f534 --- faiss/python/python_callbacks.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/faiss/python/python_callbacks.cpp b/faiss/python/python_callbacks.cpp index bfcf883aec..06b5c18cfc 100644 --- a/faiss/python/python_callbacks.cpp +++ b/faiss/python/python_callbacks.cpp @@ -46,7 +46,7 @@ size_t PyCallbackIOWriter::operator()( size_t wi = ws > bs ? bs : ws; PyObject* result = PyObject_CallFunction( callback, "(N)", PyBytes_FromStringAndSize(ptr, wi)); - if (result == NULL) { + if (result == nullptr) { FAISS_THROW_MSG("py err"); } // TODO check nb of bytes written @@ -77,7 +77,7 @@ size_t PyCallbackIOReader::operator()(void* ptrv, size_t size, size_t nitems) { while (rs > 0) { size_t ri = rs > bs ? bs : rs; PyObject* result = PyObject_CallFunction(callback, "(n)", ri); - if (result == NULL) { + if (result == nullptr) { FAISS_THROW_MSG("propagate py error"); } if (!PyBytes_Check(result)) { @@ -122,7 +122,7 @@ bool PyCallbackIDSelector::is_member(faiss::idx_t id) const { FAISS_THROW_IF_NOT((id >> 32) == 0); PyThreadLock gil; PyObject* result = PyObject_CallFunction(callback, "(n)", int(id)); - if (result == NULL) { + if (result == nullptr) { FAISS_THROW_MSG("propagate py error"); } bool b = PyObject_IsTrue(result); From 7e1d2b1f55efd53f2322570cf1d3bc74430da265 Mon Sep 17 00:00:00 2001 From: Ramil Bakhshyiev Date: Mon, 29 Apr 2024 10:48:52 -0700 Subject: [PATCH 050/116] Initial config and linux-x86_64-cmake build job only Summary: This commit is the first in a series in an attempt to incrementally enable all jobs currenlty performed by CircleCI. It includes the main configuration files provided by GitHub team + 1 build. 
Original PR: https://github.com/facebookresearch/faiss/pull/3325 Reviewed By: junjieqi Differential Revision: D56671582 fbshipit-source-id: c8a21cd69aabaf86134eb86753e90b1facf51bc3 --- .github/actions/build_cmake/action.yml | 103 +++++++++++++++++++++++++ .github/actions/build_conda/action.yml | 98 +++++++++++++++++++++++ .github/workflows/build.yml | 19 +++++ 3 files changed, 220 insertions(+) create mode 100644 .github/actions/build_cmake/action.yml create mode 100644 .github/actions/build_conda/action.yml create mode 100644 .github/workflows/build.yml diff --git a/.github/actions/build_cmake/action.yml b/.github/actions/build_cmake/action.yml new file mode 100644 index 0000000000..5892b24dd2 --- /dev/null +++ b/.github/actions/build_cmake/action.yml @@ -0,0 +1,103 @@ +name: Build cmake +inputs: + opt_level: + description: 'The optimization level' + required: false + default: generic + gpu: + description: 'The GPU to use' + required: false + default: OFF + raft: + description: 'The raft to use' + required: false + default: OFF +runs: + using: composite + steps: + - name: Setup miniconda + uses: conda-incubator/setup-miniconda@v3.0.3 + with: + python-version: '3.11' + miniconda-version: latest + - name: Set up environment + shell: bash + run: | + conda config --set solver libmamba + conda update -y -q conda + - name: Install env using main channel + if: inputs.raft == 'OFF' + shell: bash + run: | + conda install -y -q python=3.11 cmake make swig mkl=2023 mkl-devel=2023 numpy scipy pytest gxx_linux-64=11.2 sysroot_linux-64 + - name: Install env using conda-forge channel + if: inputs.raft == 'ON' + shell: bash + run: | + conda install -y -q python=3.11 cmake make swig mkl=2023 mkl-devel=2023 numpy scipy pytest gxx_linux-64=11.2 sysroot_linux-64=2.28 libraft cuda-version=11.8 cuda-toolkit -c rapidsai-nightly -c "nvidia/label/cuda-11.8.0" -c conda-forge + - name: Install CUDA + if: inputs.gpu == 'ON' && inputs.raft == 'OFF' + shell: bash + run: | + conda install -y -q cuda-toolkit -c "nvidia/label/cuda-11.8.0" + - name: Build all targets + shell: bash + run: | + eval "$(conda shell.bash hook)" + conda activate + cmake -B build \ + -DBUILD_TESTING=ON \ + -DBUILD_SHARED_LIBS=ON \ + -DFAISS_ENABLE_GPU=${{ inputs.gpu }} \ + -DFAISS_ENABLE_RAFT=${{ inputs.raft }} \ + -DFAISS_OPT_LEVEL=${{ inputs.opt_level }} \ + -DFAISS_ENABLE_C_API=ON \ + -DPYTHON_EXECUTABLE=$CONDA/bin/python \ + -DCMAKE_BUILD_TYPE=Release \ + -DBLA_VENDOR=Intel10_64_dyn \ + -DCMAKE_CUDA_FLAGS="-gencode arch=compute_75,code=sm_75" \ + . 
+ make -k -C build -j$(nproc) + - name: C++ tests + shell: bash + run: | + export GTEST_OUTPUT="xml:$(realpath .)/test-results/googletest/" + make -C build test + - name: Install Python extension + shell: bash + working-directory: build/faiss/python + run: | + $CONDA/bin/python setup.py install + - name: Install pytest + shell: bash + run: | + conda install -y pytest + echo "$CONDA/bin" >> $GITHUB_PATH + - name: Python tests (CPU only) + if: inputs.gpu == 'OFF' + shell: bash + run: | + conda install -y -q pytorch -c pytorch + pytest --junitxml=test-results/pytest/results.xml tests/test_*.py + pytest --junitxml=test-results/pytest/results-torch.xml tests/torch_*.py + - name: Python tests (CPU + GPU) + if: inputs.gpu == 'ON' + shell: bash + run: | + conda install -y -q pytorch pytorch-cuda=11.8 -c pytorch -c nvidia/label/cuda-11.8.0 + pytest --junitxml=test-results/pytest/results.xml tests/test_*.py + pytest --junitxml=test-results/pytest/results-torch.xml tests/torch_*.py + cp tests/common_faiss_tests.py faiss/gpu/test + pytest --junitxml=test-results/pytest/results-gpu.xml faiss/gpu/test/test_*.py + pytest --junitxml=test-results/pytest/results-gpu-torch.xml faiss/gpu/test/torch_*.py + - name: Test avx2 loading + if: inputs.opt_level == 'avx2' + shell: bash + run: | + FAISS_DISABLE_CPU_FEATURES=AVX2 LD_DEBUG=libs $CONDA/bin/python -c "import faiss" 2>&1 | grep faiss.so + LD_DEBUG=libs $CONDA/bin/python -c "import faiss" 2>&1 | grep faiss_avx2.so + - name: Upload test results + uses: actions/upload-artifact@v4.3.1 + with: + name: test-results-${{ inputs.opt_level }}-${{ inputs.gpu }}-${{ inputs.raft }} + path: test-results diff --git a/.github/actions/build_conda/action.yml b/.github/actions/build_conda/action.yml new file mode 100644 index 0000000000..7e4510b4b2 --- /dev/null +++ b/.github/actions/build_conda/action.yml @@ -0,0 +1,98 @@ +name: Build conda +description: Build conda +inputs: + label: + description: "Label" + default: "" + required: false + cuda: + description: "cuda" + default: "" + required: false + raft: + description: "raft" + default: "" + required: false + compiler_version: + description: "compiler_version" + default: "" + required: false +runs: + using: composite + steps: + - name: Choose shell + shell: bash + id: choose_shell + run: | + # if runner.os != 'Windows' use bash, else use pwsh + if [ "${{ runner.os }}" != "Windows" ]; then + echo "shell=bash" >> "$GITHUB_OUTPUT" + else + echo "shell=pwsh" >> "$GITHUB_OUTPUT" + fi + - name: Setup miniconda + uses: conda-incubator/setup-miniconda@v3.0.3 + with: + python-version: '3.11' + miniconda-version: latest + - name: Install conda build tools + shell: ${{ steps.choose_shell.outputs.shell }} + run: | + # conda config --set solver libmamba + # conda config --set verbosity 3 + conda update -y -q conda + conda install -y -q conda-build + - name: Enable anaconda uploads + if: inputs.label != '' + shell: ${{ steps.choose_shell.outputs.shell }} + env: + PACKAGE_TYPE: inputs.label + run: | + conda install -y -q anaconda-client + conda config --set anaconda_upload yes + - name: Conda build (CPU) + if: inputs.label == '' && inputs.cuda == '' + shell: ${{ steps.choose_shell.outputs.shell }} + working-directory: conda + run: | + conda build faiss --python 3.11 -c pytorch + - name: Conda build (CPU) w/ anaconda upload + if: inputs.label != '' && inputs.cuda == '' + shell: ${{ steps.choose_shell.outputs.shell }} + working-directory: conda + env: + PACKAGE_TYPE: inputs.label + run: | + conda build faiss --user pytorch --label ${{ 
inputs.label }} -c pytorch + - name: Conda build (GPU) + if: inputs.label == '' && inputs.cuda != '' && inputs.raft == '' + shell: ${{ steps.choose_shell.outputs.shell }} + working-directory: conda + run: | + conda build faiss-gpu --variants '{ "cudatoolkit": "${{ inputs.cuda }}", "c_compiler_version": "${{ inputs.compiler_version }}", "cxx_compiler_version": "${{ inputs.compiler_version }}" }' \ + -c pytorch -c nvidia/label/cuda-${{ inputs.cuda }} -c nvidia + - name: Conda build (GPU) w/ anaconda upload + if: inputs.label != '' && inputs.cuda != '' && inputs.raft == '' + shell: ${{ steps.choose_shell.outputs.shell }} + working-directory: conda + env: + PACKAGE_TYPE: inputs.label + run: | + conda build faiss-gpu --variants '{ "cudatoolkit": "${{ inputs.cuda }}", "c_compiler_version": "${{ inputs.compiler_version }}", "cxx_compiler_version": "${{ inputs.compiler_version }}" }' \ + --user pytorch --label ${{ inputs.label }} -c pytorch -c nvidia/label/cuda-${{ inputs.cuda }} -c nvidia + - name: Conda build (GPU w/ RAFT) + if: inputs.label == '' && inputs.cuda != '' && inputs.raft != '' + shell: ${{ steps.choose_shell.outputs.shell }} + working-directory: conda + run: | + conda build faiss-gpu-raft --variants '{ "cudatoolkit": "${{ inputs.cuda }}", "c_compiler_version": "${{ inputs.compiler_version }}", "cxx_compiler_version": "${{ inputs.compiler_version }}" }' \ + -c pytorch -c nvidia/label/cuda-${{ inputs.cuda }} -c nvidia -c rapidsai -c rapidsai-nightly -c conda-forge + - name: Conda build (GPU w/ RAFT) w/ anaconda upload + if: inputs.label != '' && inputs.cuda != '' && inputs.raft != '' + shell: ${{ steps.choose_shell.outputs.shell }} + working-directory: conda + env: + PACKAGE_TYPE: inputs.label + run: | + conda build faiss-gpu-raft --variants '{ "cudatoolkit": "${{ inputs.cuda }}", "c_compiler_version": "${{ inputs.compiler_version }}", "cxx_compiler_version": "${{ inputs.compiler_version }}" }' \ + --user pytorch --label ${{ inputs.label }} -c pytorch -c nvidia/label/cuda-${{ inputs.cuda }} -c nvidia -c rapidsai -c rapidsai-nightly -c conda-forge diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml new file mode 100644 index 0000000000..dc469b6694 --- /dev/null +++ b/.github/workflows/build.yml @@ -0,0 +1,19 @@ +name: Build +on: + workflow_dispatch: + pull_request: + branches: + - main + push: + tags: + - 'v*' +env: + OMP_NUM_THREADS: '10' + MKL_THREADING_LAYER: GNU +jobs: + linux-x86_64-cmake: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4.1.1 + - uses: ./.github/actions/build_cmake From 825cbac467ed47f5fbf0841b8d6a7df05a5bb41c Mon Sep 17 00:00:00 2001 From: Ramil Bakhshyiev Date: Tue, 30 Apr 2024 08:31:45 -0700 Subject: [PATCH 051/116] Add linux-x86_64-AVX2-cmake build Summary: GitHub checks Reviewed By: junjieqi Differential Revision: D56733297 fbshipit-source-id: fe5a2ca7c67f36a4fe986af78fb6dc8f4f843150 --- .github/workflows/build.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index dc469b6694..67130f252d 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -17,3 +17,11 @@ jobs: - name: Checkout uses: actions/checkout@v4.1.1 - uses: ./.github/actions/build_cmake + linux-x86_64-AVX2-cmake: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4.1.1 + - uses: ./.github/actions/build_cmake + with: + opt_level: avx2 From 3121fc6175b6c183712da90ab6cead0ac736446a Mon Sep 17 00:00:00 2001 From: Jayjeet Chakraborty 
Date: Tue, 30 Apr 2024 12:23:58 -0700 Subject: [PATCH 052/116] Fix #3379: Add tutorial for HNSW index (#3381) Summary: Fixes https://github.com/facebookresearch/faiss/issues/3379 Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3381 Reviewed By: junjieqi Differential Revision: D56570120 Pulled By: kuarora fbshipit-source-id: 758ea4ab866609d6dd5621e6e6ffda583ba52503 --- tutorial/cpp/6-HNSW.cpp | 76 +++++++++++++++++++++++++++++++++++++ tutorial/cpp/CMakeLists.txt | 3 ++ 2 files changed, 79 insertions(+) create mode 100644 tutorial/cpp/6-HNSW.cpp diff --git a/tutorial/cpp/6-HNSW.cpp b/tutorial/cpp/6-HNSW.cpp new file mode 100644 index 0000000000..d7c615328b --- /dev/null +++ b/tutorial/cpp/6-HNSW.cpp @@ -0,0 +1,76 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include + +#include + +using idx_t = faiss::idx_t; + +int main() { + int d = 64; // dimension + int nb = 100000; // database size + int nq = 10000; // nb of queries + + std::mt19937 rng; + std::uniform_real_distribution<> distrib; + + float* xb = new float[d * nb]; + float* xq = new float[d * nq]; + + for (int i = 0; i < nb; i++) { + for (int j = 0; j < d; j++) + xb[d * i + j] = distrib(rng); + xb[d * i] += i / 1000.; + } + + for (int i = 0; i < nq; i++) { + for (int j = 0; j < d; j++) + xq[d * i + j] = distrib(rng); + xq[d * i] += i / 1000.; + } + + int nlist = 100; + int k = 4; + + faiss::IndexHNSWFlat index(d, 32); + index.add(nb, xb); + + { // search xq + idx_t* I = new idx_t[k * nq]; + float* D = new float[k * nq]; + + index.search(nq, xq, k, D, I); + + printf("I=\n"); + for (int i = nq - 5; i < nq; i++) { + for (int j = 0; j < k; j++) + printf("%5zd ", I[i * k + j]); + printf("\n"); + } + + index.search(nq, xq, k, D, I); + + printf("I=\n"); + for (int i = nq - 5; i < nq; i++) { + for (int j = 0; j < k; j++) + printf("%5zd ", I[i * k + j]); + printf("\n"); + } + + delete[] I; + delete[] D; + } + + delete[] xb; + delete[] xq; + + return 0; +} diff --git a/tutorial/cpp/CMakeLists.txt b/tutorial/cpp/CMakeLists.txt index 7361b33a03..abcb253826 100644 --- a/tutorial/cpp/CMakeLists.txt +++ b/tutorial/cpp/CMakeLists.txt @@ -18,3 +18,6 @@ target_link_libraries(4-GPU PRIVATE faiss) add_executable(5-Multiple-GPUs EXCLUDE_FROM_ALL 5-Multiple-GPUs.cpp) target_link_libraries(5-Multiple-GPUs PRIVATE faiss) + +add_executable(6-HNSW EXCLUDE_FROM_ALL 6-HNSW.cpp) +target_link_libraries(6-HNSW PRIVATE faiss) From c92b4809ff4fb14d8157287ad4ef581b1b21f260 Mon Sep 17 00:00:00 2001 From: Ramil Bakhshyiev Date: Tue, 30 Apr 2024 14:34:15 -0700 Subject: [PATCH 053/116] Add format check Summary: Migration to GitHub actions Reviewed By: junjieqi Differential Revision: D56745520 fbshipit-source-id: 5311a549842f19672ae574edaa8be3ea5a580dbc --- .github/workflows/build.yml | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 67130f252d..91a8acb6fe 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -11,6 +11,31 @@ env: OMP_NUM_THREADS: '10' MKL_THREADING_LAYER: GNU jobs: + format: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4.1.1 + - name: Install clang-format + run: | + sudo apt-get update -y + sudo apt-get install -y wget + sudo apt install -y lsb-release wget software-properties-common gnupg + wget 
https://apt.llvm.org/llvm.sh + chmod u+x llvm.sh + sudo ./llvm.sh 18 + sudo apt-get install -y git-core clang-format-18 + - name: Verify clang-format + run: | + git ls-files | grep -E '\.(cpp|h|cu|cuh)$' | xargs clang-format-18 -i + if git diff --quiet; then + echo "Formatting OK!" + else + echo "Formatting not OK!" + echo "------------------" + git --no-pager diff --color + exit 1 + fi linux-x86_64-cmake: runs-on: ubuntu-latest steps: From 5fd8b810a493d7144b491d50b5cd82cb3e4fcf62 Mon Sep 17 00:00:00 2001 From: Ramil Bakhshyiev Date: Wed, 1 May 2024 14:39:09 -0700 Subject: [PATCH 054/116] Enable linux-x86_64-conda build via GitHub Actions (#3405) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3405 Migration to GitHub Actions Reviewed By: junjieqi Differential Revision: D56843276 fbshipit-source-id: 3d5c7ee9a36a783407dfdcc3574c377da5f9db78 --- .github/workflows/build.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 91a8acb6fe..36f7220c3d 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -50,3 +50,12 @@ jobs: - uses: ./.github/actions/build_cmake with: opt_level: avx2 + linux-x86_64-conda: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4.1.1 + with: + fetch-depth: 0 + fetch-tags: true + - uses: ./.github/actions/build_conda From 74562b2a39bb424a7b7e0132271710d04660ecb1 Mon Sep 17 00:00:00 2001 From: Ramil Bakhshyiev Date: Wed, 1 May 2024 15:58:17 -0700 Subject: [PATCH 055/116] Enable windows-x86_64-conda build via GitHub Actions (#3406) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3406 Migration to GitHub Actions Reviewed By: junjieqi Differential Revision: D56848895 fbshipit-source-id: 5a351534d9151369a9104314fee203657ac40043 --- .github/workflows/build.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 36f7220c3d..b00d3b25eb 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -59,3 +59,12 @@ jobs: fetch-depth: 0 fetch-tags: true - uses: ./.github/actions/build_conda + windows-x86_64-conda: + runs-on: windows-2019 + steps: + - name: Checkout + uses: actions/checkout@v4.1.1 + with: + fetch-depth: 0 + fetch-tags: true + - uses: ./.github/actions/build_conda From 96b88ac361058a8bcbfe9e745f24941b122c924d Mon Sep 17 00:00:00 2001 From: Ramil Bakhshyiev Date: Wed, 1 May 2024 17:25:17 -0700 Subject: [PATCH 056/116] Enable linux-arm64-conda check via GitHub Actions (#3407) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3407 Migration to GitHub Actions Reviewed By: junjieqi Differential Revision: D56856565 fbshipit-source-id: d7400eb9cb7bd68e93a712af81c6cbb7e76e2400 --- .github/workflows/build.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index b00d3b25eb..bc45474b23 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -68,3 +68,12 @@ jobs: fetch-depth: 0 fetch-tags: true - uses: ./.github/actions/build_conda + linux-arm64-conda: + runs-on: 2-core-ubuntu-arm + steps: + - name: Checkout + uses: actions/checkout@v4.1.1 + with: + fetch-depth: 0 + fetch-tags: true + - uses: ./.github/actions/build_conda From 7b8b98131846847fb60a48f8ee3d135af1c3ee41 Mon Sep 17 00:00:00 2001 From: Ramil Bakhshyiev Date: Thu, 2 May 2024 20:49:59 -0700 Subject: [PATCH 057/116] Enable packages 
builds on main for windows, linux-arm64, linux-x86_64 via GitHub Actions (#3409) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3409 Migration to GitHub Actions Reviewed By: junjieqi Differential Revision: D56917083 fbshipit-source-id: 93a2358ce5697b26aa40ced505f42c584fa8c46c --- .github/workflows/build.yml | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index bc45474b23..16ffe51e18 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -77,3 +77,39 @@ jobs: fetch-depth: 0 fetch-tags: true - uses: ./.github/actions/build_conda + linux-x86_64-packages: + if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4.1.1 + with: + fetch-depth: 0 + fetch-tags: true + - uses: ./.github/actions/build_conda + with: + label: main + windows-x86_64-packages: + if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') + runs-on: windows-2019 + steps: + - name: Checkout + uses: actions/checkout@v4.1.1 + with: + fetch-depth: 0 + fetch-tags: true + - uses: ./.github/actions/build_conda + with: + label: main + linux-arm64-packages: + if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') + runs-on: 4-core-arm + steps: + - name: Checkout + uses: actions/checkout@v4.1.1 + with: + fetch-depth: 0 + fetch-tags: true + - uses: ./.github/actions/build_conda + with: + label: main From 1b1a403bb6ed210b7056dbbc6e743d1cf72f678c Mon Sep 17 00:00:00 2001 From: Ramil Bakhshyiev Date: Fri, 3 May 2024 00:28:19 -0700 Subject: [PATCH 058/116] Change linux-arm64-packages build to use 2-core-ubuntu-arm for better availability (#3410) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3410 Migration to GitHub Actions Reviewed By: junjieqi Differential Revision: D56921925 fbshipit-source-id: 64e7a636b47d828110a6d413c8645e5343b578cb --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 16ffe51e18..bd9805b9d9 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -103,7 +103,7 @@ jobs: label: main linux-arm64-packages: if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') - runs-on: 4-core-arm + runs-on: 2-core-ubuntu-arm steps: - name: Checkout uses: actions/checkout@v4.1.1 From 0cc0e19f9e40d3e8eaf8ad00ea644051d3c5ee7a Mon Sep 17 00:00:00 2001 From: Ramil Bakhshyiev Date: Fri, 3 May 2024 13:16:51 -0700 Subject: [PATCH 059/116] Enable osx-arm64-packages build via GitHub Actions (#3411) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3411 Migration to GitHub Reviewed By: kuarora Differential Revision: D56923116 fbshipit-source-id: 1e2b493b0dd81ce850f2970e6d28c713f6a9927b --- .github/workflows/build.yml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index bd9805b9d9..e05645f3d3 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -101,6 +101,18 @@ jobs: - uses: ./.github/actions/build_conda with: label: main + osx-arm64-packages: + if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') + runs-on: macos-14 + steps: + - name: Checkout + uses: actions/checkout@v4.1.1 + with: + fetch-depth: 0 + fetch-tags: true + - uses: 
./.github/actions/build_conda + with: + label: main linux-arm64-packages: if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') runs-on: 2-core-ubuntu-arm From b3e3c2d38db4457209d3c3456baeccdcee8282b5 Mon Sep 17 00:00:00 2001 From: Amir Sadoughi Date: Wed, 8 May 2024 21:52:46 -0700 Subject: [PATCH 060/116] TimeoutCallback C++ and Python (#3417) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3417 https://github.com/facebookresearch/faiss/issues/3351 Reviewed By: junjieqi Differential Revision: D57120422 fbshipit-source-id: e2e446642e7be8647f5115f90916fad242e31286 --- faiss/gpu/perf/PerfClustering.cpp | 6 +++++ faiss/impl/AuxIndexStructures.cpp | 25 +++++++++++++++++++++ faiss/impl/AuxIndexStructures.h | 8 +++++++ faiss/python/__init__.py | 11 +++++++++ faiss/python/swigfaiss.swig | 9 ++++++-- tests/CMakeLists.txt | 1 + tests/test_callback.cpp | 37 +++++++++++++++++++++++++++++++ tests/test_callback_py.py | 32 ++++++++++++++++++++++++++ 8 files changed, 127 insertions(+), 2 deletions(-) create mode 100644 tests/test_callback.cpp create mode 100644 tests/test_callback_py.py diff --git a/faiss/gpu/perf/PerfClustering.cpp b/faiss/gpu/perf/PerfClustering.cpp index 0322f0e490..532557fe20 100644 --- a/faiss/gpu/perf/PerfClustering.cpp +++ b/faiss/gpu/perf/PerfClustering.cpp @@ -17,6 +17,7 @@ #include #include +#include DEFINE_int32(num, 10000, "# of vecs"); DEFINE_int32(k, 100, "# of clusters"); @@ -34,6 +35,7 @@ DEFINE_int64( "minimum size to use CPU -> GPU paged copies"); DEFINE_int64(pinned_mem, -1, "pinned memory allocation to use"); DEFINE_int32(max_points, -1, "max points per centroid"); +DEFINE_double(timeout, 0, "timeout in seconds"); using namespace faiss::gpu; @@ -99,10 +101,14 @@ int main(int argc, char** argv) { cp.max_points_per_centroid = FLAGS_max_points; } + auto tc = new faiss::TimeoutCallback(); + faiss::InterruptCallback::instance.reset(tc); + faiss::Clustering kmeans(FLAGS_dim, FLAGS_k, cp); // Time k-means { + tc->set_timeout(FLAGS_timeout); CpuTimer timer; kmeans.train(FLAGS_num, vecs.data(), *(gpuIndex.getIndex())); diff --git a/faiss/impl/AuxIndexStructures.cpp b/faiss/impl/AuxIndexStructures.cpp index cebe8a1e23..e2b2791e55 100644 --- a/faiss/impl/AuxIndexStructures.cpp +++ b/faiss/impl/AuxIndexStructures.cpp @@ -236,4 +236,29 @@ size_t InterruptCallback::get_period_hint(size_t flops) { return std::max((size_t)10 * 10 * 1000 * 1000 / (flops + 1), (size_t)1); } +void TimeoutCallback::set_timeout(double timeout_in_seconds) { + timeout = timeout_in_seconds; + start = std::chrono::steady_clock::now(); +} + +bool TimeoutCallback::want_interrupt() { + if (timeout == 0) { + return false; + } + auto end = std::chrono::steady_clock::now(); + std::chrono::duration duration = end - start; + float elapsed_in_seconds = duration.count() / 1000.0; + if (elapsed_in_seconds > timeout) { + timeout = 0; + return true; + } + return false; +} + +void TimeoutCallback::reset(double timeout_in_seconds) { + auto tc(new faiss::TimeoutCallback()); + faiss::InterruptCallback::instance.reset(tc); + tc->set_timeout(timeout_in_seconds); +} + } // namespace faiss diff --git a/faiss/impl/AuxIndexStructures.h b/faiss/impl/AuxIndexStructures.h index f8b5cca842..7e12a1a3af 100644 --- a/faiss/impl/AuxIndexStructures.h +++ b/faiss/impl/AuxIndexStructures.h @@ -161,6 +161,14 @@ struct FAISS_API InterruptCallback { static size_t get_period_hint(size_t flops); }; +struct TimeoutCallback : InterruptCallback { + std::chrono::time_point start; + double timeout; + 
bool want_interrupt() override; + void set_timeout(double timeout_in_seconds); + static void reset(double timeout_in_seconds); +}; + /// set implementation optimized for fast access. struct VisitedTable { std::vector visited; diff --git a/faiss/python/__init__.py b/faiss/python/__init__.py index 95be4254dc..0562d1dd89 100644 --- a/faiss/python/__init__.py +++ b/faiss/python/__init__.py @@ -316,3 +316,14 @@ def deserialize_index_binary(data): reader = VectorIOReader() copy_array_to_vector(data, reader.data) return read_index_binary(reader) + + +class TimeoutGuard: + def __init__(self, timeout_in_seconds: float): + self.timeout = timeout_in_seconds + + def __enter__(self): + TimeoutCallback.reset(self.timeout) + + def __exit__(self, exc_type, exc_value, traceback): + PythonInterruptCallback.reset() diff --git a/faiss/python/swigfaiss.swig b/faiss/python/swigfaiss.swig index 5c9a7b3fa7..85e04d322c 100644 --- a/faiss/python/swigfaiss.swig +++ b/faiss/python/swigfaiss.swig @@ -1041,7 +1041,9 @@ PyObject *swig_ptr (PyObject *a) PyErr_SetString(PyExc_ValueError, "did not recognize array type"); return NULL; } +%} +%inline %{ struct PythonInterruptCallback: faiss::InterruptCallback { @@ -1056,15 +1058,18 @@ struct PythonInterruptCallback: faiss::InterruptCallback { return err == -1; } + static void reset() { + faiss::InterruptCallback::instance.reset(new PythonInterruptCallback()); + } }; + %} %init %{ /* needed, else crash at runtime */ import_array(); - faiss::InterruptCallback::instance.reset(new PythonInterruptCallback()); - + PythonInterruptCallback::reset(); %} // return a pointer usable as input for functions that expect pointers diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 443195eecb..3980d7dd7c 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -34,6 +34,7 @@ set(FAISS_TEST_SRC test_fastscan_perf.cpp test_disable_pq_sdc_tables.cpp test_common_ivf_empty_index.cpp + test_callback.cpp ) add_executable(faiss_test ${FAISS_TEST_SRC}) diff --git a/tests/test_callback.cpp b/tests/test_callback.cpp new file mode 100644 index 0000000000..cdfadf1d39 --- /dev/null +++ b/tests/test_callback.cpp @@ -0,0 +1,37 @@ +/** + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include +#include +#include +#include +#include + +TEST(TestCallback, timeout) { + int n = 1000; + int k = 100; + int d = 128; + int niter = 1000000000; + int seed = 42; + + std::vector vecs(n * d); + faiss::float_rand(vecs.data(), vecs.size(), seed); + + auto index(new faiss::IndexFlat(d)); + + faiss::ClusteringParameters cp; + cp.niter = niter; + cp.verbose = false; + + faiss::Clustering kmeans(d, k, cp); + + faiss::TimeoutCallback::reset(0.010); + EXPECT_THROW(kmeans.train(n, vecs.data(), *index), faiss::FaissException); + delete index; +} diff --git a/tests/test_callback_py.py b/tests/test_callback_py.py new file mode 100644 index 0000000000..0ec176dd86 --- /dev/null +++ b/tests/test_callback_py.py @@ -0,0 +1,32 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import unittest +import numpy as np +import faiss + + +class TestCallbackPy(unittest.TestCase): + def setUp(self) -> None: + super().setUp() + + def test_timeout(self) -> None: + n = 1000 + k = 100 + d = 128 + niter = 1_000_000_000 + + x = np.random.rand(n, d).astype('float32') + index = faiss.IndexFlat(d) + + cp = faiss.ClusteringParameters() + cp.niter = niter + cp.verbose = False + + kmeans = faiss.Clustering(d, k, cp) + + with self.assertRaises(RuntimeError): + with faiss.TimeoutGuard(0.010): + kmeans.train(x, index) From 34fa2aeb72ff5c40b62410f121a6438ff15ccd76 Mon Sep 17 00:00:00 2001 From: Ramil Bakhshyiev Date: Thu, 9 May 2024 10:49:48 -0700 Subject: [PATCH 061/116] Enable linux-x86_64-GPU-w-RAFT-cmake build via GitHub Actions (#3418) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3418 Migration to GitHub Actions Reviewed By: junjieqi Differential Revision: D57133934 fbshipit-source-id: 255b7afbbb90cc966916cd900174833416b0bc51 --- .github/workflows/build.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index e05645f3d3..888306d1d3 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -50,6 +50,15 @@ jobs: - uses: ./.github/actions/build_cmake with: opt_level: avx2 + linux-x86_64-GPU-w-RAFT-cmake: + runs-on: 4-core-ubuntu-gpu-t4 + steps: + - name: Checkout + uses: actions/checkout@v4.1.1 + - uses: ./.github/actions/build_cmake + with: + gpu: ON + raft: ON linux-x86_64-conda: runs-on: ubuntu-latest steps: From e1e4ad00831174d25bba5a157d26b428ae54b39d Mon Sep 17 00:00:00 2001 From: Carl Love Date: Fri, 10 May 2024 09:27:26 -0700 Subject: [PATCH 062/116] PowerPC, improve code generation for function fvec_L2sqr (#3416) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Summary: The code generated for function fvec_L2sqr generated by OpenXL do not perform as good as the codes generated by gcc on Power. The macros to enable imprecise floating point operation don’t cover Power with OpenXL. This patch adds the OpenXL compiler options for the PowerPC macros to achieve better performance. Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3416 Reviewed By: asadoughi Differential Revision: D57210015 Pulled By: mdouze fbshipit-source-id: 6b838a2fa4d4996fe52c9f1105827004626fe720 --- faiss/impl/platform_macros.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/faiss/impl/platform_macros.h b/faiss/impl/platform_macros.h index a0faea7cba..3fc328535b 100644 --- a/faiss/impl/platform_macros.h +++ b/faiss/impl/platform_macros.h @@ -127,6 +127,13 @@ inline int __builtin_clzll(uint64_t x) { __pragma(float_control(precise, off, push)) #define FAISS_PRAGMA_IMPRECISE_FUNCTION_END __pragma(float_control(pop)) #elif defined(__clang__) +#if defined(__PPC__) +#define FAISS_PRAGMA_IMPRECISE_LOOP \ + _Pragma("clang loop vectorize_width(4) interleave_count(8)") +#define FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN \ + _Pragma("float_control(precise, off, push)") +#define FAISS_PRAGMA_IMPRECISE_FUNCTION_END _Pragma("float_control(pop)") +#else #define FAISS_PRAGMA_IMPRECISE_LOOP \ _Pragma("clang loop vectorize(enable) interleave(enable)") @@ -144,6 +151,7 @@ inline int __builtin_clzll(uint64_t x) { #define FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN #define FAISS_PRAGMA_IMPRECISE_FUNCTION_END #endif +#endif #elif defined(__GNUC__) // Unfortunately, GCC does not provide a pragma for detecting it. 
// So, we have to stick to GNUC, which is defined by MANY compilers. From b487c62a1e164767ac4f73b2b2264a68570602c6 Mon Sep 17 00:00:00 2001 From: Ramil Bakhshyiev Date: Mon, 13 May 2024 09:54:41 -0700 Subject: [PATCH 063/116] Update system dependencies to enable CUDA builds on v6 kernel and newer libc (#3426) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3426 GitHub Actions only supports Ubuntu 22 and newer and this change is necessary to enable CUDA builds to complete the migration. Reviewed By: algoriddle Differential Revision: D57261685 fbshipit-source-id: 34467f57426864ffa8b32f6018ccdc7bb4424b57 --- .github/actions/build_cmake/action.yml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/actions/build_cmake/action.yml b/.github/actions/build_cmake/action.yml index 5892b24dd2..510d4c9dc3 100644 --- a/.github/actions/build_cmake/action.yml +++ b/.github/actions/build_cmake/action.yml @@ -25,21 +25,21 @@ runs: run: | conda config --set solver libmamba conda update -y -q conda - - name: Install env using main channel - if: inputs.raft == 'OFF' + - name: Configure conda environment shell: bash run: | - conda install -y -q python=3.11 cmake make swig mkl=2023 mkl-devel=2023 numpy scipy pytest gxx_linux-64=11.2 sysroot_linux-64 - - name: Install env using conda-forge channel - if: inputs.raft == 'ON' - shell: bash - run: | - conda install -y -q python=3.11 cmake make swig mkl=2023 mkl-devel=2023 numpy scipy pytest gxx_linux-64=11.2 sysroot_linux-64=2.28 libraft cuda-version=11.8 cuda-toolkit -c rapidsai-nightly -c "nvidia/label/cuda-11.8.0" -c conda-forge + conda install -y -q -c conda-forge gxx_linux-64=11.2 sysroot_linux-64=2.28 + conda install -y -q python=3.11 cmake make swig mkl=2023 mkl-devel=2023 numpy scipy pytest - name: Install CUDA if: inputs.gpu == 'ON' && inputs.raft == 'OFF' shell: bash run: | conda install -y -q cuda-toolkit -c "nvidia/label/cuda-11.8.0" + - name: Install RAFT + if: inputs.raft == 'ON' + shell: bash + run: | + conda install -y -q libraft cuda-version=11.8 cuda-toolkit -c rapidsai-nightly -c "nvidia/label/cuda-11.8.0" -c conda-forge - name: Build all targets shell: bash run: | From 2e04533fe17658a0e424e25cd5c85ff3259ba71e Mon Sep 17 00:00:00 2001 From: Ramil Bakhshyiev Date: Mon, 13 May 2024 09:54:41 -0700 Subject: [PATCH 064/116] Enable linux-x86_64-GPU-cmake build on GitHub Actions (#3427) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3427 Migration to GitHub Actions Reviewed By: algoriddle Differential Revision: D57261696 fbshipit-source-id: d7b8c26259fd3de1cf59fc460f6af20185ceacfe --- .github/workflows/build.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 888306d1d3..5ee3c67d6e 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -50,6 +50,15 @@ jobs: - uses: ./.github/actions/build_cmake with: opt_level: avx2 + linux-x86_64-GPU-cmake: + needs: linux-x86_64-AVX2-cmake + runs-on: 4-core-ubuntu-gpu-t4 + steps: + - name: Checkout + uses: actions/checkout@v4.1.1 + - uses: ./.github/actions/build_cmake + with: + gpu: ON linux-x86_64-GPU-w-RAFT-cmake: runs-on: 4-core-ubuntu-gpu-t4 steps: From 4d06d7069fca2359e5b56d33c762ab91d015ee9d Mon Sep 17 00:00:00 2001 From: Ramil Bakhshyiev Date: Mon, 13 May 2024 09:54:41 -0700 Subject: [PATCH 065/116] Add disabled linux-x86_64-AVX512-cmake build on GitHub Actions (#3428) Summary: Pull Request resolved: 
https://github.com/facebookresearch/faiss/pull/3428 GitHub Actions currently does not support runners with AVX-512 but committed to add this support in early 2025. We will be running these on CircleCI until then. This placeholder build configuration will allow us to enable it with a 1-liner when the hosts are available. Reviewed By: algoriddle Differential Revision: D57261783 fbshipit-source-id: 1fb985a0c3dbb11851af63c95bde6494d25d0ac2 --- .github/workflows/build.yml | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 5ee3c67d6e..dc7f73c147 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -50,6 +50,15 @@ jobs: - uses: ./.github/actions/build_cmake with: opt_level: avx2 + linux-x86_64-AVX512-cmake: + if: false # TODO: enable when GitHub Actions adds AVX-512 hosts + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4.1.1 + - uses: ./.github/actions/build_cmake + with: + opt_level: avx512 linux-x86_64-GPU-cmake: needs: linux-x86_64-AVX2-cmake runs-on: 4-core-ubuntu-gpu-t4 From 83df64cd7f3d34bad29051cf637c56b952b839c3 Mon Sep 17 00:00:00 2001 From: Alexandr Guzhva Date: Wed, 15 May 2024 01:48:49 -0700 Subject: [PATCH 066/116] Get rid of redundant instructions in ScalarQuantizer (#3430) Summary: This PR removes unneeded ARM NEON SIMD instructions for ScalarQuantizer. The removed instructions are completely redundant, and I believe that it is a funky way of converting two `float32x4_t` variables (which hold 4 float values in a single SIMD register) into a single `float32x4x2_t` variable (two SIMD registers packed together). Clang compiler is capable of eliminating these instructions. The only GCC that can eliminate these unneeded instructions is GCC 14, which was released very recently (Apr-May 2024). 
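For illustration only (not part of this patch), a minimal standalone sketch of the pattern in question, assuming an AArch64 target and the NEON intrinsics from arm_neon.h. Since vuzpq_f32 exactly undoes vzipq_f32, the zip/unzip round trip is equivalent to returning the two registers directly:

    #include <arm_neon.h>

    // Round-trip version: interleave the two registers, then de-interleave
    // them again. The result is just {a, b}, but only newer compilers
    // (clang, and GCC starting with GCC 14) remove the redundant instructions.
    float32x4x2_t pack_roundtrip(float32x4_t a, float32x4_t b) {
        float32x4x2_t zipped = vzipq_f32(a, b);
        return vuzpq_f32(zipped.val[0], zipped.val[1]);
    }

    // Direct version, matching the style used after this change
    // (e.g. `return {res1, res2};` in ScalarQuantizer.cpp).
    float32x4x2_t pack_direct(float32x4_t a, float32x4_t b) {
        return {a, b};
    }
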
mdouze Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3430 Reviewed By: mlomeli1 Differential Revision: D57369849 Pulled By: mdouze fbshipit-source-id: 09d7cf16e113df3eb9ddbfa54d074b58b452ba7f --- faiss/impl/ScalarQuantizer.cpp | 58 +++++++++++++--------------------- 1 file changed, 22 insertions(+), 36 deletions(-) diff --git a/faiss/impl/ScalarQuantizer.cpp b/faiss/impl/ScalarQuantizer.cpp index 07d77d5622..e3b29e621d 100644 --- a/faiss/impl/ScalarQuantizer.cpp +++ b/faiss/impl/ScalarQuantizer.cpp @@ -101,8 +101,7 @@ struct Codec8bit { } float32x4_t res1 = vld1q_f32(result); float32x4_t res2 = vld1q_f32(result + 4); - float32x4x2_t res = vzipq_f32(res1, res2); - return vuzpq_f32(res.val[0], res.val[1]); + return {res1, res2}; } #endif }; @@ -153,8 +152,7 @@ struct Codec4bit { } float32x4_t res1 = vld1q_f32(result); float32x4_t res2 = vld1q_f32(result + 4); - float32x4x2_t res = vzipq_f32(res1, res2); - return vuzpq_f32(res.val[0], res.val[1]); + return {res1, res2}; } #endif }; @@ -266,8 +264,7 @@ struct Codec6bit { } float32x4_t res1 = vld1q_f32(result); float32x4_t res2 = vld1q_f32(result + 4); - float32x4x2_t res = vzipq_f32(res1, res2); - return vuzpq_f32(res.val[0], res.val[1]); + return {res1, res2}; } #endif }; @@ -345,16 +342,14 @@ struct QuantizerTemplate : QuantizerTemplate { FAISS_ALWAYS_INLINE float32x4x2_t reconstruct_8_components(const uint8_t* code, int i) const { float32x4x2_t xi = Codec::decode_8_components(code, i); - float32x4x2_t res = vzipq_f32( - vfmaq_f32( + return {vfmaq_f32( vdupq_n_f32(this->vmin), xi.val[0], vdupq_n_f32(this->vdiff)), vfmaq_f32( vdupq_n_f32(this->vmin), xi.val[1], - vdupq_n_f32(this->vdiff))); - return vuzpq_f32(res.val[0], res.val[1]); + vdupq_n_f32(this->vdiff))}; } }; @@ -431,10 +426,8 @@ struct QuantizerTemplate : QuantizerTemplate { float32x4x2_t vmin_8 = vld1q_f32_x2(this->vmin + i); float32x4x2_t vdiff_8 = vld1q_f32_x2(this->vdiff + i); - float32x4x2_t res = vzipq_f32( - vfmaq_f32(vmin_8.val[0], xi.val[0], vdiff_8.val[0]), - vfmaq_f32(vmin_8.val[1], xi.val[1], vdiff_8.val[1])); - return vuzpq_f32(res.val[0], res.val[1]); + return {vfmaq_f32(vmin_8.val[0], xi.val[0], vdiff_8.val[0]), + vfmaq_f32(vmin_8.val[1], xi.val[1], vdiff_8.val[1])}; } }; @@ -496,10 +489,9 @@ struct QuantizerFP16<8> : QuantizerFP16<1> { FAISS_ALWAYS_INLINE float32x4x2_t reconstruct_8_components(const uint8_t* code, int i) const { - uint16x4x2_t codei = vld2_u16((const uint16_t*)(code + 2 * i)); - return vzipq_f32( - vcvt_f32_f16(vreinterpret_f16_u16(codei.val[0])), - vcvt_f32_f16(vreinterpret_f16_u16(codei.val[1]))); + uint16x4x2_t codei = vld1_u16_x2((const uint16_t*)(code + 2 * i)); + return {vcvt_f32_f16(vreinterpret_f16_u16(codei.val[0])), + vcvt_f32_f16(vreinterpret_f16_u16(codei.val[1]))}; } }; #endif @@ -568,8 +560,7 @@ struct Quantizer8bitDirect<8> : Quantizer8bitDirect<1> { } float32x4_t res1 = vld1q_f32(result); float32x4_t res2 = vld1q_f32(result + 4); - float32x4x2_t res = vzipq_f32(res1, res2); - return vuzpq_f32(res.val[0], res.val[1]); + return {res1, res2}; } }; @@ -868,7 +859,7 @@ struct SimilarityL2<8> { float32x4x2_t accu8; FAISS_ALWAYS_INLINE void begin_8() { - accu8 = vzipq_f32(vdupq_n_f32(0.0f), vdupq_n_f32(0.0f)); + accu8 = {vdupq_n_f32(0.0f), vdupq_n_f32(0.0f)}; yi = y; } @@ -882,8 +873,7 @@ struct SimilarityL2<8> { float32x4_t accu8_0 = vfmaq_f32(accu8.val[0], sub0, sub0); float32x4_t accu8_1 = vfmaq_f32(accu8.val[1], sub1, sub1); - float32x4x2_t accu8_temp = vzipq_f32(accu8_0, accu8_1); - accu8 = 
vuzpq_f32(accu8_temp.val[0], accu8_temp.val[1]); + accu8 = {accu8_0, accu8_1}; } FAISS_ALWAYS_INLINE void add_8_components_2( @@ -895,8 +885,7 @@ struct SimilarityL2<8> { float32x4_t accu8_0 = vfmaq_f32(accu8.val[0], sub0, sub0); float32x4_t accu8_1 = vfmaq_f32(accu8.val[1], sub1, sub1); - float32x4x2_t accu8_temp = vzipq_f32(accu8_0, accu8_1); - accu8 = vuzpq_f32(accu8_temp.val[0], accu8_temp.val[1]); + accu8 = {accu8_0, accu8_1}; } FAISS_ALWAYS_INLINE float result_8() { @@ -996,7 +985,7 @@ struct SimilarityIP<8> { float32x4x2_t accu8; FAISS_ALWAYS_INLINE void begin_8() { - accu8 = vzipq_f32(vdupq_n_f32(0.0f), vdupq_n_f32(0.0f)); + accu8 = {vdupq_n_f32(0.0f), vdupq_n_f32(0.0f)}; yi = y; } @@ -1006,8 +995,7 @@ struct SimilarityIP<8> { float32x4_t accu8_0 = vfmaq_f32(accu8.val[0], yiv.val[0], x.val[0]); float32x4_t accu8_1 = vfmaq_f32(accu8.val[1], yiv.val[1], x.val[1]); - float32x4x2_t accu8_temp = vzipq_f32(accu8_0, accu8_1); - accu8 = vuzpq_f32(accu8_temp.val[0], accu8_temp.val[1]); + accu8 = {accu8_0, accu8_1}; } FAISS_ALWAYS_INLINE void add_8_components_2( @@ -1015,19 +1003,17 @@ struct SimilarityIP<8> { float32x4x2_t x2) { float32x4_t accu8_0 = vfmaq_f32(accu8.val[0], x1.val[0], x2.val[0]); float32x4_t accu8_1 = vfmaq_f32(accu8.val[1], x1.val[1], x2.val[1]); - float32x4x2_t accu8_temp = vzipq_f32(accu8_0, accu8_1); - accu8 = vuzpq_f32(accu8_temp.val[0], accu8_temp.val[1]); + accu8 = {accu8_0, accu8_1}; } FAISS_ALWAYS_INLINE float result_8() { - float32x4x2_t sum_tmp = vzipq_f32( + float32x4x2_t sum = { vpaddq_f32(accu8.val[0], accu8.val[0]), - vpaddq_f32(accu8.val[1], accu8.val[1])); - float32x4x2_t sum = vuzpq_f32(sum_tmp.val[0], sum_tmp.val[1]); - float32x4x2_t sum2_tmp = vzipq_f32( + vpaddq_f32(accu8.val[1], accu8.val[1])}; + + float32x4x2_t sum2 = { vpaddq_f32(sum.val[0], sum.val[0]), - vpaddq_f32(sum.val[1], sum.val[1])); - float32x4x2_t sum2 = vuzpq_f32(sum2_tmp.val[0], sum2_tmp.val[1]); + vpaddq_f32(sum.val[1], sum.val[1])}; return vgetq_lane_f32(sum2.val[0], 0) + vgetq_lane_f32(sum2.val[1], 0); } }; From 509f4c1320c5990096a202e210f5107a1ca8dd71 Mon Sep 17 00:00:00 2001 From: Matthijs Douze Date: Wed, 15 May 2024 06:24:23 -0700 Subject: [PATCH 067/116] fix install instructions (#3442) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3442 fix install instruction for GPU + pytorch Reviewed By: mlomeli1 Differential Revision: D57376959 fbshipit-source-id: 74caff960be7dbf8102e7593ce1485452a18de6e --- INSTALL.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/INSTALL.md b/INSTALL.md index 45e2c9341b..5bd4f6d448 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -40,11 +40,11 @@ conda install -c pytorch -c nvidia -c rapidsai -c conda-forge faiss-gpu-raft=1.8 ``` In the above commands, pytorch-cuda=11 or pytorch-cuda=12 would select a specific CUDA version, if it’s required. 
-A combination of versions that installs GPU Faiss with CUDA and Pytorch (as of 2024-03-01): +A combination of versions that installs GPU Faiss with CUDA and Pytorch (as of 2024-05-15): ``` conda create --name faiss_1.8.0 conda activate faiss_1.8.0 -conda install -c pytorch -c nvidia faiss-gpu=1.8.0 pytorch pytorch-cuda numpy +conda install -c pytorch -c nvidia faiss-gpu=1.8.0 pytorch=*=*cuda* pytorch-cuda=11 numpy ``` ## Installing from conda-forge From 558a7c3fbc37e1bbf02dec99d47b60f75d824d76 Mon Sep 17 00:00:00 2001 From: Mengdi Lin Date: Wed, 15 May 2024 09:17:10 -0700 Subject: [PATCH 068/116] interrupt for NNDescent (#3432) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3432 Addresses the issue in https://github.com/facebookresearch/faiss/issues/3173 for `IndexNNDescent`, I see that there is already interrupt implemented for it's [search](https://fburl.com/code/iwn3tqic) API, so I looked into it's `add` API. For a given dataset nb = 10 mil, iter = 10, K = 32, d = 32 on a CPU only machine reveals that bulk of the cost comes from [nndescent](https://fburl.com/code/5rdb1p5o). For every iteration of `nndescent` takes around ~12 seconds, ~70-80% of the time is spent on `join` method (~10 seconds per iteration) and ~20-30% spent on `update` (~2 second per iteration). Adding the interrupt on the `join` should suffice on quickly terminating the program when users hit ctrl+C (happy to move the interrupt elsewhere if we think otherwise) Reviewed By: junjieqi, mdouze Differential Revision: D57300514 fbshipit-source-id: d343e0a292c35027ffdb8cbd0131e945b9881d63 --- faiss/impl/NNDescent.cpp | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/faiss/impl/NNDescent.cpp b/faiss/impl/NNDescent.cpp index b609aba390..5afcdaf5b7 100644 --- a/faiss/impl/NNDescent.cpp +++ b/faiss/impl/NNDescent.cpp @@ -154,15 +154,20 @@ NNDescent::NNDescent(const int d, const int K) : K(K), d(d) { NNDescent::~NNDescent() {} void NNDescent::join(DistanceComputer& qdis) { + idx_t check_period = InterruptCallback::get_period_hint(d * search_L); + for (idx_t i0 = 0; i0 < (idx_t)ntotal; i0 += check_period) { + idx_t i1 = std::min(i0 + check_period, (idx_t)ntotal); #pragma omp parallel for default(shared) schedule(dynamic, 100) - for (int n = 0; n < ntotal; n++) { - graph[n].join([&](int i, int j) { - if (i != j) { - float dist = qdis.symmetric_dis(i, j); - graph[i].insert(j, dist); - graph[j].insert(i, dist); - } - }); + for (idx_t n = i0; n < i1; n++) { + graph[n].join([&](int i, int j) { + if (i != j) { + float dist = qdis.symmetric_dis(i, j); + graph[i].insert(j, dist); + graph[j].insert(i, dist); + } + }); + } + InterruptCallback::check(); } } From b8e4489b98b5b8f9a84e61cbc84812280954514e Mon Sep 17 00:00:00 2001 From: Richard Barnes Date: Wed, 15 May 2024 11:45:48 -0700 Subject: [PATCH 069/116] Remove unused variables in faiss/IndexIVFFastScan.cpp (#3439) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3439 LLVM-15 has a warning `-Wunused-but-set-variable` which we treat as an error because it's so often diagnostic of a code issue. Unused variables can compromise readability or, worse, performance. This diff either (a) removes an unused variable and, possibly, it's associated code, or (b) qualifies the variable with `[[maybe_unused]]`, mostly in cases where the variable _is_ used, but, eg, in an `assert` statement that isn't present in production code. 
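As a minimal sketch of case (b), not taken from this diff: a value that is read only by an assert() that compiles away in production builds (NDEBUG), with [[maybe_unused]] marking it as intentionally unused there:

    #include <cassert>

    int checked_sum(int a, int b) {
        // Read only inside assert(), which expands to nothing when NDEBUG is
        // defined; [[maybe_unused]] documents that this is intentional.
        [[maybe_unused]] const int expected = a + b;
        const int result = a + b;
        assert(result == expected);
        return result;
    }
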
- If you approve of this diff, please use the "Accept & Ship" button :-) Reviewed By: palmje, junjieqi Differential Revision: D57344013 fbshipit-source-id: adf410139d2e6ca69a26ccdbff8511c9b7620489 --- faiss/IndexIVFFastScan.cpp | 4 ---- tutorial/cpp/6-HNSW.cpp | 1 - 2 files changed, 5 deletions(-) diff --git a/faiss/IndexIVFFastScan.cpp b/faiss/IndexIVFFastScan.cpp index 19828753d2..3e40f7a3da 100644 --- a/faiss/IndexIVFFastScan.cpp +++ b/faiss/IndexIVFFastScan.cpp @@ -914,10 +914,6 @@ void IndexIVFFastScan::search_implem_10( size_t* nlist_out, const NormTableScaler* scaler, const IVFSearchParameters* params) const { - const size_t max_codes = params ? params->max_codes : this->max_codes; - const SearchParameters* quantizer_params = - params ? params->quantizer_params : nullptr; - size_t dim12 = ksub * M2; AlignedTable dis_tables; AlignedTable biases; diff --git a/tutorial/cpp/6-HNSW.cpp b/tutorial/cpp/6-HNSW.cpp index d7c615328b..1b3434a433 100644 --- a/tutorial/cpp/6-HNSW.cpp +++ b/tutorial/cpp/6-HNSW.cpp @@ -37,7 +37,6 @@ int main() { xq[d * i] += i / 1000.; } - int nlist = 100; int k = 4; faiss::IndexHNSWFlat index(d, 32); From 2050a0309043469c14546532f18e49222e6c73c9 Mon Sep 17 00:00:00 2001 From: Ramil Bakhshyiev Date: Wed, 15 May 2024 13:06:00 -0700 Subject: [PATCH 070/116] Add cuda-toolkit package dependency to faiss-gpu and faiss-gpu-raft conda build recipes (#3440) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3440 This change is required to unblock the migration to GitHub Actions. `cuda-toolkit` was only specified in the `libfaiss` package and it was not available in `faiss-gpu` or `faiss-gpu-raft`. This currently works on CircleCI because the runner image has CUDA toolkit of the needed version installed on the system and the build logic falls back to that but breaks on GitHub Actions because their runner images do not come with CUDA toolkit pre-installed. 
Reviewed By: junjieqi Differential Revision: D57371597 fbshipit-source-id: 8bededd53e2528f033fac797b296d74b47f9403e --- conda/faiss-gpu-raft/meta.yaml | 1 + conda/faiss-gpu/meta.yaml | 1 + 2 files changed, 2 insertions(+) diff --git a/conda/faiss-gpu-raft/meta.yaml b/conda/faiss-gpu-raft/meta.yaml index 3eebc9876b..23e4835032 100644 --- a/conda/faiss-gpu-raft/meta.yaml +++ b/conda/faiss-gpu-raft/meta.yaml @@ -87,6 +87,7 @@ outputs: - swig - cmake >=3.24.0 - make # [not win] + - cuda-toolkit {{ cudatoolkit }} host: - python {{ python }} - numpy >=1.19,<2 diff --git a/conda/faiss-gpu/meta.yaml b/conda/faiss-gpu/meta.yaml index 7ac24e785d..3d614df1bf 100644 --- a/conda/faiss-gpu/meta.yaml +++ b/conda/faiss-gpu/meta.yaml @@ -83,6 +83,7 @@ outputs: - swig - cmake >=3.24.0 - make # [not win] + - cuda-toolkit {{ cudatoolkit }} host: - python {{ python }} - numpy >=1.19,<2 From 745bca88e3053a2e53bb6fe2b98b0bbb98470b2c Mon Sep 17 00:00:00 2001 From: Mengdi Lin Date: Wed, 15 May 2024 13:39:18 -0700 Subject: [PATCH 071/116] stabilize formatting for bench_cppcontrib_sa_decode.cpp (#3443) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3443 Stabilize this file for clang-formatting versions 18.1.3 (VSCode) and 18.1.5 (our Github CI) Reviewed By: junjieqi Differential Revision: D57393650 fbshipit-source-id: 15170436bbd03194dbeaac1ef1130e20adc8c23e --- benchs/bench_cppcontrib_sa_decode.cpp | 175 +++++++++++++------------- 1 file changed, 88 insertions(+), 87 deletions(-) diff --git a/benchs/bench_cppcontrib_sa_decode.cpp b/benchs/bench_cppcontrib_sa_decode.cpp index f0266172a8..b960fb7c6a 100644 --- a/benchs/bench_cppcontrib_sa_decode.cpp +++ b/benchs/bench_cppcontrib_sa_decode.cpp @@ -213,9 +213,9 @@ static void verifyIndex2LevelDecoder( // evaluate the error double error = getError(n, d, outputFaiss, outputKernel1); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "store_seq" << "\t" << nIterations << "\t" << timeFaiss - << "\t" << timeKernel << "\t" << error << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\tstore_seq\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel + << "\t" << error << std::endl; } ////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -261,10 +261,9 @@ static void verifyIndex2LevelDecoder( // evaluate the error const double error = getError(n, d, outputFaiss, outputKernel1); - - std::cout << description << "\t" << n << "\t" << d << "\t" - << "store_rnd" << "\t" << nIterations << "\t" << timeFaiss - << "\t" << timeKernel << "\t" << error << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\tstore_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel + << "\t" << error << std::endl; } ////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -324,9 +323,9 @@ static void verifyIndex2LevelDecoder( // evaluate the error const double error1 = getError(n, d, outputFaiss, outputKernel1); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum_rnd" << "\t" << nIterations << "\t" << timeFaiss - << "\t" << timeKernel1 << "\t" << error1 << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\taccum_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel1 + << "\t" << error1 << std::endl; // kernels: accum 2 points, shared centroids StopWatch swKernel2; @@ -353,9 +352,9 @@ static void verifyIndex2LevelDecoder( // evaluate the error 
const double error2 = getError(n, d, outputFaiss, outputKernel2); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum2_rnd" << "\t" << nIterations << "\t" << timeFaiss - << "\t" << timeKernel2 << "\t" << error2 << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\taccum2_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel2 + << "\t" << error2 << std::endl; // kernels: accum 2 points, unique centroids StopWatch swKernel2u; @@ -384,9 +383,9 @@ static void verifyIndex2LevelDecoder( // evaluate the error const double error2u = getError(n, d, outputFaiss, outputKernel2u); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum2u_rnd" << "\t" << nIterations << "\t" << timeFaiss - << "\t" << timeKernel2u << "\t" << error2u << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\taccum2u_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel2u + << "\t" << error2u << std::endl; // kernels: accum 3 points, shared centroids StopWatch swKernel3; @@ -418,9 +417,9 @@ static void verifyIndex2LevelDecoder( // evaluate the error const double error3 = getError(n, d, outputFaiss, outputKernel3); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum3_rnd" << "\t" << nIterations << "\t" << timeFaiss - << "\t" << timeKernel3 << "\t" << error3 << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\taccum3_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel3 + << "\t" << error3 << std::endl; // kernels: accum 3 points, unique centroids StopWatch swKernel3u; @@ -456,9 +455,9 @@ static void verifyIndex2LevelDecoder( // evaluate the error const double error3u = getError(n, d, outputFaiss, outputKernel3u); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum3u_rnd" << "\t" << nIterations << "\t" << timeFaiss - << "\t" << timeKernel3u << "\t" << error3u << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\taccum3u_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel3u + << "\t" << error3u << std::endl; } } @@ -524,9 +523,9 @@ static void verifyMinMaxIndex2LevelDecoder( // evaluate the error double error = getError(n, d, outputFaiss, outputKernel1); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "store_seq" << "\t" << nIterations << "\t" << timeFaiss - << "\t" << timeKernel << "\t" << error << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\tstore_seq\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel + << "\t" << error << std::endl; } ////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -573,9 +572,9 @@ static void verifyMinMaxIndex2LevelDecoder( // evaluate the error const double error = getError(n, d, outputFaiss, outputKernel1); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "store_rnd" << "\t" << nIterations << "\t" << timeFaiss - << "\t" << timeKernel << "\t" << error << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\tstore_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel + << "\t" << error << std::endl; } ////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -641,9 +640,9 @@ static void verifyMinMaxIndex2LevelDecoder( // evaluate the error const double error1 = getError(n, d, outputFaiss, outputKernel1); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum_rnd" << "\t" << 
nIterations << "\t" << timeFaiss - << "\t" << timeKernel1 << "\t" << error1 << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\taccum_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel1 + << "\t" << error1 << std::endl; // kernels: accum 2 points, shared centroids StopWatch swKernel2; @@ -675,9 +674,9 @@ static void verifyMinMaxIndex2LevelDecoder( // evaluate the error const double error2 = getError(n, d, outputFaiss, outputKernel2); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum2_rnd" << "\t" << nIterations << "\t" << timeFaiss - << "\t" << timeKernel2 << "\t" << error2 << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\taccum2_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel2 + << "\t" << error2 << std::endl; // kernels: accum 2 points, unique centroids StopWatch swKernel2u; @@ -711,9 +710,9 @@ static void verifyMinMaxIndex2LevelDecoder( // evaluate the error const double error2u = getError(n, d, outputFaiss, outputKernel2u); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum2u_rnd" << "\t" << nIterations << "\t" << timeFaiss - << "\t" << timeKernel2u << "\t" << error2u << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\taccum2u_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel2u + << "\t" << error2u << std::endl; // kernels: accum 3 points, shared centroids StopWatch swKernel3; @@ -750,9 +749,9 @@ static void verifyMinMaxIndex2LevelDecoder( // evaluate the error const double error3 = getError(n, d, outputFaiss, outputKernel3); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum3_rnd" << "\t" << nIterations << "\t" << timeFaiss - << "\t" << timeKernel3 << "\t" << error3 << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\taccum3_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel3 + << "\t" << error3 << std::endl; // kernels: accum 3 points, unique centroids StopWatch swKernel3u; @@ -793,9 +792,9 @@ static void verifyMinMaxIndex2LevelDecoder( // evaluate the error const double error3u = getError(n, d, outputFaiss, outputKernel3u); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum3u_rnd" << "\t" << nIterations << "\t" << timeFaiss - << "\t" << timeKernel3u << "\t" << error3u << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\taccum3u_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel3u + << "\t" << error3u << std::endl; } } @@ -851,9 +850,9 @@ static void verifyIndexPQDecoder( // evaluate the error double error = getError(n, d, outputFaiss, outputKernel1); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "store_seq" << "\t" << nIterations << "\t" << timeFaiss - << "\t" << timeKernel << "\t" << error << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\tstore_seq\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel + << "\t" << error << std::endl; } ////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -899,9 +898,9 @@ static void verifyIndexPQDecoder( // evaluate the error const double error = getError(n, d, outputFaiss, outputKernel1); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "store_rnd" << "\t" << nIterations << "\t" << timeFaiss - << "\t" << timeKernel << "\t" << error << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\tstore_rnd\t" + << nIterations << "\t" << 
timeFaiss << "\t" << timeKernel + << "\t" << error << std::endl; } ////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -961,9 +960,9 @@ static void verifyIndexPQDecoder( // evaluate the error const double error1 = getError(n, d, outputFaiss, outputKernel1); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum_rnd" << "\t" << nIterations << "\t" << timeFaiss - << "\t" << timeKernel1 << "\t" << error1 << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\taccum_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel1 + << "\t" << error1 << std::endl; // kernels: accum 2 points, shared centroids StopWatch swKernel2; @@ -989,9 +988,9 @@ static void verifyIndexPQDecoder( // evaluate the error const double error2 = getError(n, d, outputFaiss, outputKernel2); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum2_rnd" << "\t" << nIterations << "\t" << timeFaiss - << "\t" << timeKernel2 << "\t" << error2 << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\taccum2_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel2 + << "\t" << error2 << std::endl; // kernels: accum 2 points, unique centroids StopWatch swKernel2u; @@ -1018,9 +1017,9 @@ static void verifyIndexPQDecoder( // evaluate the error const double error2u = getError(n, d, outputFaiss, outputKernel2u); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum2u_rnd" << "\t" << nIterations << "\t" << timeFaiss - << "\t" << timeKernel2u << "\t" << error2u << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\taccum2u_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel2u + << "\t" << error2u << std::endl; // kernels: accum 3 points, shared centroids StopWatch swKernel3; @@ -1051,9 +1050,9 @@ static void verifyIndexPQDecoder( // evaluate the error const double error3 = getError(n, d, outputFaiss, outputKernel3); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum3_rnd" << "\t" << nIterations << "\t" << timeFaiss - << "\t" << timeKernel3 << "\t" << error3 << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\taccum3_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel3 + << "\t" << error3 << std::endl; // kernels: accum 3 points, unique centroids StopWatch swKernel3u; @@ -1086,9 +1085,9 @@ static void verifyIndexPQDecoder( // evaluate the error const double error3u = getError(n, d, outputFaiss, outputKernel3u); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum3u_rnd" << "\t" << nIterations << "\t" << timeFaiss - << "\t" << timeKernel3u << "\t" << error3u << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\taccum3u_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel3u + << "\t" << error3u << std::endl; } } @@ -1149,9 +1148,9 @@ static void verifyMinMaxIndexPQDecoder( // evaluate the error double error = getError(n, d, outputFaiss, outputKernel1); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "store_seq" << "\t" << nIterations << "\t" << timeFaiss - << "\t" << timeKernel << "\t" << error << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\tstore_seq\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel + << "\t" << error << std::endl; } ////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -1197,9 +1196,9 @@ static void 
verifyMinMaxIndexPQDecoder( // evaluate the error const double error = getError(n, d, outputFaiss, outputKernel1); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "store_rnd" << "\t" << nIterations << "\t" << timeFaiss - << "\t" << timeKernel << "\t" << error << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\tstore_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel + << "\t" << error << std::endl; } ////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -1264,9 +1263,9 @@ static void verifyMinMaxIndexPQDecoder( // evaluate the error const double error1 = getError(n, d, outputFaiss, outputKernel1); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum_rnd" << "\t" << nIterations << "\t" << timeFaiss - << "\t" << timeKernel1 << "\t" << error1 << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\taccum_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel1 + << "\t" << error1 << std::endl; // kernels: accum 2 points, shared centroids StopWatch swKernel2; @@ -1297,9 +1296,9 @@ static void verifyMinMaxIndexPQDecoder( // evaluate the error const double error2 = getError(n, d, outputFaiss, outputKernel2); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum2_rnd" << "\t" << nIterations << "\t" << timeFaiss - << "\t" << timeKernel2 << "\t" << error2 << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\taccum2_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel2 + << "\t" << error2 << std::endl; // kernels: accum 2 points, unique centroids StopWatch swKernel2u; @@ -1331,9 +1330,9 @@ static void verifyMinMaxIndexPQDecoder( // evaluate the error const double error2u = getError(n, d, outputFaiss, outputKernel2u); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum2u_rnd" << "\t" << nIterations << "\t" << timeFaiss - << "\t" << timeKernel2u << "\t" << error2u << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\taccum2u_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel2u + << "\t" << error2u << std::endl; // kernels: accum 3 points, shared centroids StopWatch swKernel3; @@ -1369,9 +1368,9 @@ static void verifyMinMaxIndexPQDecoder( // evaluate the error const double error3 = getError(n, d, outputFaiss, outputKernel3); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum3_rnd" << "\t" << nIterations << "\t" << timeFaiss - << "\t" << timeKernel3 << "\t" << error3 << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\taccum3_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel3 + << "\t" << error3 << std::endl; // kernels: accum 3 points, unique centroids StopWatch swKernel3u; @@ -1409,9 +1408,9 @@ static void verifyMinMaxIndexPQDecoder( // evaluate the error const double error3u = getError(n, d, outputFaiss, outputKernel3u); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum3u_rnd" << "\t" << nIterations << "\t" << timeFaiss - << "\t" << timeKernel3u << "\t" << error3u << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\taccum3u_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel3u + << "\t" << error3u << std::endl; } } @@ -1484,8 +1483,10 @@ int main(int argc, char** argv) { (N_ITERATIONS % 6) == 0, "Number of iterations should be 6*x"); // print the header - std::cout << "Codec\t" << "n\t" << "d\t" << "Experiment\t" << 
"Iterations\t" - << "Faiss time\t" << "SADecodeKernel time\t" << "Error" + auto delim = "\t"; + std::cout << "Codec" << delim << "n" << delim << "d" << delim + << "Experiment" << delim << "Iterations" << delim << "Faiss time" + << delim << "SADecodeKernel time" << delim << "Error" << std::endl; // The following experiment types are available: From 72571c767d5f263bf92b5b20930f9403b6f88d0d Mon Sep 17 00:00:00 2001 From: Ramil Bakhshyiev Date: Wed, 15 May 2024 13:42:27 -0700 Subject: [PATCH 072/116] Enable both RAFT package builds and CUDA 12.1.1 GPU package build (#3441) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3441 Migration to GitHub Actions Reviewed By: junjieqi Differential Revision: D57372738 fbshipit-source-id: 745b3c3f43c49045f8f5035e2af302ffa30d7755 --- .github/workflows/build.yml | 50 +++++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index dc7f73c147..6a9114628c 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -116,6 +116,56 @@ jobs: - uses: ./.github/actions/build_conda with: label: main + linux-x86_64-GPU-RAFT-packages-CUDA11-8-0: + if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') + runs-on: 4-core-ubuntu-gpu-t4 + env: + CUDA_ARCHS: "70-real;72-real;75-real;80;86-real" + steps: + - name: Checkout + uses: actions/checkout@v4.1.1 + with: + fetch-depth: 0 + fetch-tags: true + - uses: ./.github/actions/build_conda + with: + label: main + raft: "ON" + cuda: "11.8.0" + compiler_version: "11.2" + linux-x86_64-GPU-packages-CUDA-12-1-1: + if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') + runs-on: 4-core-ubuntu-gpu-t4 + env: + CUDA_ARCHS: "70-real;72-real;75-real;80;86-real" + steps: + - name: Checkout + uses: actions/checkout@v4.1.1 + with: + fetch-depth: 0 + fetch-tags: true + - uses: ./.github/actions/build_conda + with: + label: main + cuda: "12.1.1" + compiler_version: "11.2" + linux-x86_64-GPU-RAFT-packages-CUDA12-1-1: + if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') + runs-on: 4-core-ubuntu-gpu-t4 + env: + CUDA_ARCHS: "70-real;72-real;75-real;80;86-real" + steps: + - name: Checkout + uses: actions/checkout@v4.1.1 + with: + fetch-depth: 0 + fetch-tags: true + - uses: ./.github/actions/build_conda + with: + label: main + raft: "ON" + cuda: "12.1.1" + compiler_version: "11.2" windows-x86_64-packages: if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') runs-on: windows-2019 From 1876925e84be6c75cb5c42fc9b2c50a39782d821 Mon Sep 17 00:00:00 2001 From: Amir Sadoughi Date: Wed, 15 May 2024 17:53:30 -0700 Subject: [PATCH 073/116] Implement METRIC.NaNEuclidean (#3414) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3414 https://github.com/facebookresearch/faiss/issues/3355 A couple open questions: - Given L2 was squared, I figured I would leave this one as squared as well? - Also, wasn't sure if we wanted to return nan when present == 0 or -1? 
Reviewed By: mdouze Differential Revision: D57017608 fbshipit-source-id: ba14458b92c8b055f3bf2a871565175935c8333a --- faiss/MetricType.h | 1 + faiss/utils/extra_distances-inl.h | 20 ++++++++++++++++++++ faiss/utils/extra_distances.cpp | 3 +++ tests/test_extra_distances.py | 20 ++++++++++++++++++++ 4 files changed, 44 insertions(+) diff --git a/faiss/MetricType.h b/faiss/MetricType.h index 538b0a8e72..4689d4d018 100644 --- a/faiss/MetricType.h +++ b/faiss/MetricType.h @@ -33,6 +33,7 @@ enum MetricType { METRIC_JensenShannon, METRIC_Jaccard, ///< defined as: sum_i(min(a_i, b_i)) / sum_i(max(a_i, b_i)) ///< where a_i, b_i > 0 + METRIC_NaNEuclidean, }; /// all vector indices are this type diff --git a/faiss/utils/extra_distances-inl.h b/faiss/utils/extra_distances-inl.h index d3768df668..5b21482d18 100644 --- a/faiss/utils/extra_distances-inl.h +++ b/faiss/utils/extra_distances-inl.h @@ -10,6 +10,7 @@ #include #include +#include #include namespace faiss { @@ -130,4 +131,23 @@ inline float VectorDistance::operator()( return accu_num / accu_den; } +template <> +inline float VectorDistance::operator()( + const float* x, + const float* y) const { + // https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.nan_euclidean_distances.html + float accu = 0; + size_t present = 0; + for (size_t i = 0; i < d; i++) { + if (!std::isnan(x[i]) && !std::isnan(y[i])) { + float diff = x[i] - y[i]; + accu += diff * diff; + present++; + } + } + if (present == 0) { + return NAN; + } + return float(d) / float(present) * accu; +} } // namespace faiss diff --git a/faiss/utils/extra_distances.cpp b/faiss/utils/extra_distances.cpp index 8c0699880d..fb225e7c9e 100644 --- a/faiss/utils/extra_distances.cpp +++ b/faiss/utils/extra_distances.cpp @@ -164,6 +164,7 @@ void pairwise_extra_distances( HANDLE_VAR(JensenShannon); HANDLE_VAR(Lp); HANDLE_VAR(Jaccard); + HANDLE_VAR(NaNEuclidean); #undef HANDLE_VAR default: FAISS_THROW_MSG("metric type not implemented"); @@ -195,6 +196,7 @@ void knn_extra_metrics( HANDLE_VAR(JensenShannon); HANDLE_VAR(Lp); HANDLE_VAR(Jaccard); + HANDLE_VAR(NaNEuclidean); #undef HANDLE_VAR default: FAISS_THROW_MSG("metric type not implemented"); @@ -242,6 +244,7 @@ FlatCodesDistanceComputer* get_extra_distance_computer( HANDLE_VAR(JensenShannon); HANDLE_VAR(Lp); HANDLE_VAR(Jaccard); + HANDLE_VAR(NaNEuclidean); #undef HANDLE_VAR default: FAISS_THROW_MSG("metric type not implemented"); diff --git a/tests/test_extra_distances.py b/tests/test_extra_distances.py index a474dd6ba7..66318f76c5 100644 --- a/tests/test_extra_distances.py +++ b/tests/test_extra_distances.py @@ -94,6 +94,26 @@ def test_jaccard(self): new_dis = faiss.pairwise_distances(xq, yb, faiss.METRIC_Jaccard) self.assertTrue(np.allclose(ref_dis, new_dis)) + def test_nan_euclidean(self): + xq, yb = self.make_example() + ref_dis = np.array([ + [scipy.spatial.distance.sqeuclidean(x, y) for y in yb] + for x in xq + ]) + new_dis = faiss.pairwise_distances(xq, yb, faiss.METRIC_NaNEuclidean) + self.assertTrue(np.allclose(ref_dis, new_dis)) + + x = [[3, np.nan, np.nan, 6]] + q = [[1, np.nan, np.nan, 5]] + dis = [(4 / 2 * ((3 - 1)**2 + (6 - 5)**2))] + new_dis = faiss.pairwise_distances(x, q, faiss.METRIC_NaNEuclidean) + self.assertTrue(np.allclose(new_dis, dis)) + + x = [[np.nan] * 4] + q = [[np.nan] * 4] + new_dis = faiss.pairwise_distances(x, q, faiss.METRIC_NaNEuclidean) + self.assertTrue(np.isnan(new_dis[0])) + class TestKNN(unittest.TestCase): """ test that the knn search gives the same as distance matrix + argmin """ From 
4972abd36ca78ff66aba171cbfaa5a001e2ca090 Mon Sep 17 00:00:00 2001 From: Xiao Fu Date: Thu, 16 May 2024 14:22:08 -0700 Subject: [PATCH 074/116] Improve testing code step 1 (#3451) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3451 This is a first step to clean up the faiss codebase following T187322081 Reviewed By: junjieqi Differential Revision: D57448335 fbshipit-source-id: c9760d01479d3352b786bbcf2015251e7a7168d6 --- tests/test_index_binary.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/tests/test_index_binary.py b/tests/test_index_binary.py index 3acf622fd4..7820cb6627 100644 --- a/tests/test_index_binary.py +++ b/tests/test_index_binary.py @@ -142,7 +142,6 @@ def test_range_search(self): self.assertTrue(set(range_res) <= set(I[i])) nt2 += 1 # in case of equality we have a problem with ties - print('nb tests', nt1, nt2) # nb tests is actually low... self.assertTrue(nt1 > 19 and nt2 > 19) @@ -287,8 +286,6 @@ def test_ivf_nprobe(self): ref_index.add(xb) ref_D, ref_I = ref_index.search(xq, k) - print(D[0], ref_D[0]) - print(I[0], ref_I[0]) assert np.all(D == ref_D) # assert np.all(I == ref_I) # id may be different From bf8bd6b689f25a1a13c09f4363d8ea9bf6bf7a3a Mon Sep 17 00:00:00 2001 From: Xiao Fu Date: Thu, 16 May 2024 19:51:07 -0700 Subject: [PATCH 075/116] Delete all remaining print (#3452) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3452 Delete all remaining print within the Tests to improve the readability and effectiveness of the codebase. Reviewed By: junjieqi Differential Revision: D57466393 fbshipit-source-id: 6ebd66ae2e769894d810d4ba7a5f69fc865b797d --- tests/common_faiss_tests.py | 1 - tests/test_RCQ_cropping.cpp | 2 -- tests/test_clustering.py | 3 --- tests/test_contrib.py | 2 -- tests/test_contrib_with_scipy.py | 2 -- tests/test_fast_scan.py | 3 --- tests/test_index_composite.py | 3 --- tests/test_local_search_quantizer.py | 6 ------ tests/test_partition.py | 6 ------ tests/test_product_quantizer.py | 1 - tests/test_residual_quantizer.py | 14 -------------- tests/test_rowwise_minmax.py | 1 - tests/test_standalone_codec.py | 9 --------- 13 files changed, 53 deletions(-) diff --git a/tests/common_faiss_tests.py b/tests/common_faiss_tests.py index 8dc25edec0..a8afe344e4 100644 --- a/tests/common_faiss_tests.py +++ b/tests/common_faiss_tests.py @@ -49,7 +49,6 @@ def evalres(self, DI): for rank in 1, 10, 100: e[rank] = ((I[:, :rank] == self.gt.reshape(-1, 1)).sum() / float(self.nq)) - # print("1-recalls: %s" % e) return e diff --git a/tests/test_RCQ_cropping.cpp b/tests/test_RCQ_cropping.cpp index 4dd3470885..4463c256ed 100644 --- a/tests/test_RCQ_cropping.cpp +++ b/tests/test_RCQ_cropping.cpp @@ -28,7 +28,6 @@ TEST(RCQCropping, test_cropping) { faiss::ResidualCoarseQuantizer rcq(d, nbits); rcq.train(nt, xt); - // fprintf(stderr, "nb centroids: %zd\n", rcq.ntotal); // the test below works only for beam size == nprobe rcq.set_beam_factor(1.0); @@ -44,7 +43,6 @@ TEST(RCQCropping, test_cropping) { nbits.pop_back(); faiss::ResidualCoarseQuantizer rcq_cropped(d, nbits); rcq_cropped.initialize_from(rcq); - // fprintf(stderr, "cropped nb centroids: %zd\n", rcq_cropped.ntotal); EXPECT_EQ(rcq_cropped.ntotal, rcq.ntotal >> last_nbits); diff --git a/tests/test_clustering.py b/tests/test_clustering.py index 2b81fc3e35..b1afc8523f 100644 --- a/tests/test_clustering.py +++ b/tests/test_clustering.py @@ -110,9 +110,6 @@ def test_weighted(self): cdis2_first = cdis2[:5].sum() cdis2_last = cdis2[5:].sum() - 
print(cdis1_first, cdis1_last) - print(cdis2_first, cdis2_last) - # with the new clustering, the last should be much (*2) closer # to their centroids self.assertGreater(cdis1_last, cdis1_first * 2) diff --git a/tests/test_contrib.py b/tests/test_contrib.py index 0e7cbbfb03..05a2c4ac8b 100644 --- a/tests/test_contrib.py +++ b/tests/test_contrib.py @@ -147,7 +147,6 @@ def test_query_iterator(self, metric=faiss.METRIC_L2): xb = ds.get_database() D, I = faiss.knn(xq, xb, 10, metric=metric) threshold = float(D[:, -1].mean()) - print(threshold) index = faiss.IndexFlat(32, metric) index.add(xb) @@ -251,7 +250,6 @@ def test_precision_recall(self): Inew = np.hstack(Inew) precision, recall = evaluation.range_PR(lims_ref, Iref, lims_new, Inew) - print(precision, recall) self.assertEqual(precision, 0.6) self.assertEqual(recall, 0.6) diff --git a/tests/test_contrib_with_scipy.py b/tests/test_contrib_with_scipy.py index cb81bb623c..4f89e2fc1b 100644 --- a/tests/test_contrib_with_scipy.py +++ b/tests/test_contrib_with_scipy.py @@ -44,7 +44,6 @@ def test_sparse_routines(self): faiss.normalize_L2(xt) mask = np.abs(xt) > 0.045 - # print("fraction:", mask.sum() / mask.size) # around 10% non-zeros xt[np.logical_not(mask)] = 0 centroids = ds.get_queries() @@ -72,7 +71,6 @@ def test_sparse_kmeans(self): faiss.normalize_L2(xt) mask = np.abs(xt) > 0.045 - # print("fraction:", mask.sum() / mask.size) # around 10% non-zeros xt[np.logical_not(mask)] = 0 km = faiss.Kmeans(ds.d, 50) diff --git a/tests/test_fast_scan.py b/tests/test_fast_scan.py index b061ee3af0..cfe9636fee 100644 --- a/tests/test_fast_scan.py +++ b/tests/test_fast_scan.py @@ -34,7 +34,6 @@ def test_PQ4_accuracy(self): nq = Iref.shape[0] recall_at_1 = (Iref[:, 0] == Ia[:, 0]).sum() / nq assert recall_at_1 > 0.6 - # print(f'recall@1 = {recall_at_1:.3f}') # This is an experiment to see if we can catch performance @@ -498,7 +497,6 @@ def subtest_accuracy(self, aq, st, implem, metric_type='L2'): recall_ref = (Iref == gt).sum() / nq recall = (Ia == gt).sum() / nq - print(aq, st, implem, metric_type, recall_ref, recall) assert abs(recall_ref - recall) < 0.05 def xx_test_accuracy(self): @@ -531,7 +529,6 @@ def subtest_from_idxaq(self, implem, metric): nq = Iref.shape[0] recall_ref = (Iref == gt).sum() / nq recall1 = (I1 == gt).sum() / nq - print(recall_ref, recall1) assert abs(recall_ref - recall1) < 0.05 def xx_test_from_idxaq(self): diff --git a/tests/test_index_composite.py b/tests/test_index_composite.py index a760c0cf09..8d9b441adc 100644 --- a/tests/test_index_composite.py +++ b/tests/test_index_composite.py @@ -168,8 +168,6 @@ def test_remove_id_map_2(self): index.remove_ids(remove_set) index.add_with_ids(X[5:, :], idx[5:]) - print (index.search(X, 1)) - for i in range(10): _, searchres = index.search(X[i:i + 1, :], 1) if idx[i] in remove_set: @@ -954,7 +952,6 @@ def do_test(self, factory_string): index.nprobe = 10 Dref, Iref = index.search(ds.get_queries(), 10) - #print(index.search_and_return_codes) D, I, codes = index.search_and_return_codes( ds.get_queries(), 10, include_listnos=True) diff --git a/tests/test_local_search_quantizer.py b/tests/test_local_search_quantizer.py index 01fec70ccf..7975929811 100644 --- a/tests/test_local_search_quantizer.py +++ b/tests/test_local_search_quantizer.py @@ -196,7 +196,6 @@ def test_update_codebooks_with_double(self): err_float = eval_codec(lsq, xb) # 6533.377 vs 25457.99 - print(err_double, err_float) self.assertLess(err_double, err_float) def test_compute_binary_terms(self): @@ -348,7 +347,6 @@ def 
test_training(self): pq.train(xt) err_pq = eval_codec(pq, xb) - print(err_lsq, err_pq) self.assertLess(err_lsq, err_pq) @@ -463,7 +461,6 @@ def eval_index_accuracy(self, factory_key): index.nprobe = nprobe D, I = index.search(ds.get_queries(), 10) inter = faiss.eval_intersection(I, ds.get_groundtruth(10)) - # print("nprobe=", nprobe, "inter=", inter) inters.append(inter) inters = np.array(inters) @@ -528,7 +525,6 @@ def test_codec(self): pq.train(xt) err_pq = eval_codec(pq, xb) - print(err_plsq, err_pq) self.assertLess(err_plsq, err_pq) def test_with_lsq(self): @@ -549,7 +545,6 @@ def test_with_lsq(self): lsq.train(xt) err_lsq = eval_codec(lsq, xb) - print(err_plsq, err_lsq) self.assertEqual(err_plsq, err_lsq) def test_lut(self): @@ -664,7 +659,6 @@ def test_index_accuracy2(self): """check that the error is in the same ballpark as LSQ.""" inter1 = self.eval_index_accuracy("IVF32,PLSQ2x2x5_Nqint8") inter2 = self.eval_index_accuracy("IVF32,LSQ4x5_Nqint8") - # print(inter1, inter2) # 381 vs 374 self.assertGreaterEqual(inter1 * 1.1, inter2) def test_factory(self): diff --git a/tests/test_partition.py b/tests/test_partition.py index 02de7e8c2c..fd41eabe1f 100644 --- a/tests/test_partition.py +++ b/tests/test_partition.py @@ -49,7 +49,6 @@ def do_partition(self, n, q, maxval=None, seed=None): if seed is None: for i in range(50): self.do_partition(n, q, maxval, i + 1234) - # print("seed=", seed) rs = np.random.RandomState(seed) if maxval is None: vals = rs.rand(n).astype('float32') @@ -95,7 +94,6 @@ def do_partition(self, n, q, maxval=None, seed=None): if seed is None: for i in range(50): self.do_partition(n, q, maxval, i + 1234) - # print("seed=", seed) rs = np.random.RandomState(seed) if maxval is None: vals = rs.rand(n).astype('float32') @@ -148,7 +146,6 @@ def do_partition(self, n, q, maxval=65536, seed=None): for i in range(50): self.do_partition(n, q, maxval, i + 1234) - # print("seed=", seed) rs = np.random.RandomState(seed) vals = rs.randint(maxval, size=n).astype('uint16') ids = (rs.permutation(n) + 12345).astype('int64') @@ -160,7 +157,6 @@ def do_partition(self, n, q, maxval=65536, seed=None): tab_a = faiss.AlignedTableUint16() faiss.copy_array_to_AlignedTable(vals, tab_a) - # print("tab a type", tab_a.get()) if type(q) == int: faiss.CMax_uint16_partition_fuzzy( tab_a.get(), sp(ids), n, q, q, None) @@ -196,7 +192,6 @@ def do_partition(self, n, q, maxval=65536, seed=None): if seed is None: for i in range(50): self.do_partition(n, q, maxval, i + 1234) - # print("seed=", seed) rs = np.random.RandomState(seed) vals = rs.randint(maxval, size=n).astype('uint16') ids = (rs.permutation(n) + 12345).astype('int64') @@ -209,7 +204,6 @@ def do_partition(self, n, q, maxval=65536, seed=None): vals_inv = (65535 - vals).astype('uint16') faiss.copy_array_to_AlignedTable(vals_inv, tab_a) - # print("tab a type", tab_a.get()) if type(q) == int: faiss.CMin_uint16_partition_fuzzy( tab_a.get(), sp(ids), n, q, q, None) diff --git a/tests/test_product_quantizer.py b/tests/test_product_quantizer.py index 1cdee7f144..e05426b129 100644 --- a/tests/test_product_quantizer.py +++ b/tests/test_product_quantizer.py @@ -26,7 +26,6 @@ def test_pq(self): x2 = pq.decode(codes) diff = ((x - x2)**2).sum() - # print("diff=", diff) # diff= 4418.0562 self.assertGreater(5000, diff) diff --git a/tests/test_residual_quantizer.py b/tests/test_residual_quantizer.py index e37ee3efe2..f87e7650d9 100644 --- a/tests/test_residual_quantizer.py +++ b/tests/test_residual_quantizer.py @@ -211,7 +211,6 @@ def test_training(self): # in 
practice RQ is often better than PQ but it does not the case here, so just check # that we are within some factor. - # print(err_pq, err_rq) self.assertLess(err_rq, err_pq * 1.2) def test_beam_size(self): @@ -321,10 +320,8 @@ def retrain_AQ_codebook(index, xt): x_decoded = index.sa_decode(codes_packed) MSE = ((xt - x_decoded) ** 2).sum() / n - # print(f"Initial MSE on training set: {MSE:g}") codes = unpack_codes(index.rq, codes_packed) - # print("ref codes", codes[0]) codebook_offsets = faiss.vector_to_array(rq.codebook_offsets) # build sparse code matrix (represented as a dense matrix) @@ -343,7 +340,6 @@ def retrain_AQ_codebook(index, xt): B, residuals, rank, singvals = scipy.linalg.lstsq(C, xt, ) MSE = ((C @ B - xt) ** 2).sum() / n - # print(f"MSE after retrainining: {MSE:g}") # replace codebook # faiss.copy_array_to_vector(B.astype('float32').ravel(), index.rq.codebooks) @@ -503,7 +499,6 @@ def test_reestimate_codebook_2(self): xt_decoded = ir.sa_decode(ir.sa_encode(xt)) err_after_refined = ((xt - xt_decoded) ** 2).sum() - # print(err_before, err_after_refined) # ref run 7474.98 / 7006.1777 self.assertGreater(err_before, err_after_refined * 1.06) @@ -781,7 +776,6 @@ def test_search_L2(self): else: inter_2 = faiss.eval_intersection(I2, gt) self.assertGreaterEqual(inter_ref, inter_2) - # print(st, inter_ref, inter_2) ########################################################### @@ -814,7 +808,6 @@ def do_test_accuracy(self, by_residual, st): index.nprobe = nprobe D, I = index.search(ds.get_queries(), 10) inter = faiss.eval_intersection(I, ds.get_groundtruth(10)) - # print(st, "nprobe=", nprobe, "inter=", inter) inters.append(inter) # do a little I/O test @@ -909,18 +902,13 @@ def do_test_accuracy_IP(self, by_residual): D, I = index.search(ds.get_queries(), 10) index.rq.search_type = faiss.AdditiveQuantizer.ST_LUT_nonorm D2, I2 = index.search(ds.get_queries(), 10) - # print(D[:5] - D2[:5]) - # print(I[:5]) np.testing.assert_array_almost_equal(D, D2, decimal=5) # there are many ties because the codes are so short self.assertLess((I != I2).sum(), I.size * 0.1) # D2, I2 = index2.search(ds.get_queries(), 10) - # print(D[:5]) - # print(D2[:5]) inter = faiss.eval_intersection(I, ds.get_groundtruth(10)) - # print("nprobe=", nprobe, "inter=", inter) inters.append(inter) self.assertTrue(np.all(inters[1:4] >= inters[:3])) @@ -1166,7 +1154,6 @@ def test_codec(self): pq.train(xt) err_pq = eval_codec(pq, xb) - # print(err_prq, err_pq) self.assertLess(err_prq, err_pq) def test_with_rq(self): @@ -1271,7 +1258,6 @@ def test_index_accuracy2(self): """check that the error is in the same ballpark as RQ.""" inter1 = self.eval_index_accuracy("IVF100,PRQ2x2x5_Nqint8") inter2 = self.eval_index_accuracy("IVF100,RQ4x5_Nqint8") - # print(inter1, inter2) # 392 vs 374 self.assertGreaterEqual(inter1 * 1.1, inter2) def test_factory(self): diff --git a/tests/test_rowwise_minmax.py b/tests/test_rowwise_minmax.py index dbd14de388..53e6c00b15 100644 --- a/tests/test_rowwise_minmax.py +++ b/tests/test_rowwise_minmax.py @@ -45,7 +45,6 @@ def compare_train_vs_train_inplace(self, factory_key): # make sure that the reconstruction error is not crazy reconstruction_err = ((x - decoded) ** 2).sum() - print(reconstruction_err) self.assertLess(reconstruction_err, 0.6) diff --git a/tests/test_standalone_codec.py b/tests/test_standalone_codec.py index 7fdcf6849f..2176a12e99 100644 --- a/tests/test_standalone_codec.py +++ b/tests/test_standalone_codec.py @@ -151,7 +151,6 @@ def compare_accuracy(self, lowac, highac, max_errs=(1e10, 
1e10)): err = ((x - x2) ** 2).sum() errs.append(err) - print(errs) self.assertGreater(errs[0], errs[1]) self.assertGreater(max_errs[0], errs[0]) @@ -214,7 +213,6 @@ def test_repeats(self): code = repeats.encode(swig_ptr(vec)) vec2 = np.zeros(dim, dtype='float32') repeats.decode(code, swig_ptr(vec2)) - # print(vec2) assert np.all(vec == vec2) def test_ZnSphereCodec_encode_centroid(self): @@ -222,7 +220,6 @@ def test_ZnSphereCodec_encode_centroid(self): r2 = 5 ref_codec = faiss.ZnSphereCodec(dim, r2) codec = faiss.ZnSphereCodecRec(dim, r2) - # print(ref_codec.nv, codec.nv) assert ref_codec.nv == codec.nv s = set() for i in range(ref_codec.nv): @@ -237,7 +234,6 @@ def test_ZnSphereCodecRec(self): dim = 16 r2 = 6 codec = faiss.ZnSphereCodecRec(dim, r2) - # print("nv=", codec.nv) for i in range(codec.nv): c = np.zeros(dim, dtype='float32') codec.decode(i, swig_ptr(c)) @@ -300,15 +296,10 @@ def test_rw(self): for i in range(nbyte): self.assertTrue(((bignum >> (i * 8)) & 255) == bs[i]) - #for i in range(nbyte): - # print(bin(bs[i] + 256)[3:], end=' ') - # print() - br = faiss.BitstringReader(swig_ptr(bs), nbyte) for nbit, xref in ctrl: xnew = br.read(nbit) - # print('nbit %d xref %x xnew %x' % (nbit, xref, xnew)) self.assertTrue(xnew == xref) def test_arrays(self): From e822a8c152374c318ca1ae43f06a49af456f18c3 Mon Sep 17 00:00:00 2001 From: Ramil Bakhshyiev Date: Fri, 17 May 2024 12:02:55 -0700 Subject: [PATCH 076/116] GitHub Actions files cleanup (#3454) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3454 Removing commented out lines and adding proper descriptions and comments where appropriate. Reviewed By: junjieqi Differential Revision: D57501602 fbshipit-source-id: 0202ff73b7a83158808affba9b98b96dff569457 --- .github/actions/build_cmake/action.yml | 10 +++++----- .github/actions/build_conda/action.yml | 16 +++++++--------- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/.github/actions/build_cmake/action.yml b/.github/actions/build_cmake/action.yml index 510d4c9dc3..a46462c8d9 100644 --- a/.github/actions/build_cmake/action.yml +++ b/.github/actions/build_cmake/action.yml @@ -1,15 +1,15 @@ name: Build cmake inputs: opt_level: - description: 'The optimization level' + description: 'Compile options / optimization level.' required: false default: generic gpu: - description: 'The GPU to use' + description: 'Enable GPU support.' required: false default: OFF raft: - description: 'The raft to use' + description: 'Enable RAFT support.' required: false default: OFF runs: @@ -20,12 +20,12 @@ runs: with: python-version: '3.11' miniconda-version: latest - - name: Set up environment + - name: Initialize Conda environment shell: bash run: | conda config --set solver libmamba conda update -y -q conda - - name: Configure conda environment + - name: Configure Conda environment shell: bash run: | conda install -y -q -c conda-forge gxx_linux-64=11.2 sysroot_linux-64=2.28 diff --git a/.github/actions/build_conda/action.yml b/.github/actions/build_conda/action.yml index 7e4510b4b2..ec407e6569 100644 --- a/.github/actions/build_conda/action.yml +++ b/.github/actions/build_conda/action.yml @@ -1,21 +1,21 @@ -name: Build conda -description: Build conda +name: Conda build +description: Builds FAISS inside a Conda environment and uploads to repository when label is provided. inputs: label: - description: "Label" + description: "The label to be used for uploads to Conda." 
default: "" required: false cuda: - description: "cuda" + description: "CUDA toolkit version to use." default: "" required: false raft: - description: "raft" + description: "Enable RAFT support." default: "" required: false compiler_version: description: "compiler_version" - default: "" + default: "Compiler version for C/C++/CUDA." required: false runs: using: composite @@ -24,7 +24,7 @@ runs: shell: bash id: choose_shell run: | - # if runner.os != 'Windows' use bash, else use pwsh + # Use pwsh on Windows; bash everywhere else if [ "${{ runner.os }}" != "Windows" ]; then echo "shell=bash" >> "$GITHUB_OUTPUT" else @@ -38,8 +38,6 @@ runs: - name: Install conda build tools shell: ${{ steps.choose_shell.outputs.shell }} run: | - # conda config --set solver libmamba - # conda config --set verbosity 3 conda update -y -q conda conda install -y -q conda-build - name: Enable anaconda uploads From 5e452ed52a976226e33eb9c90c34b3ede0193f3a Mon Sep 17 00:00:00 2001 From: Xiao Fu Date: Fri, 17 May 2024 16:59:36 -0700 Subject: [PATCH 077/116] Cleaning up more unnecessary print (#3455) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3455 Code quality control by reducing the number of prints Reviewed By: junjieqi Differential Revision: D57502194 fbshipit-source-id: a6cd65ed4cc49590ce73d2978d41b640b5259c17 --- tests/test_binary_hashindex.py | 10 ---------- tests/test_build_blocks.py | 15 --------------- tests/test_graph_based.py | 7 ------- tests/test_index.py | 5 ----- tests/test_index_accuracy.py | 24 ------------------------ tests/test_io.py | 1 - tests/test_ivflib.py | 1 - tests/test_lowlevel_ivf.cpp | 13 ------------- tests/test_merge_index.py | 1 - tests/test_meta_index.py | 7 ------- tests/test_product_quantizer.py | 1 - tests/test_residual_quantizer.py | 3 --- tests/test_search_params.py | 1 - tests/test_sliding_ivf.cpp | 16 ---------------- 14 files changed, 105 deletions(-) diff --git a/tests/test_binary_hashindex.py b/tests/test_binary_hashindex.py index 2d33050571..e9a6eaca49 100644 --- a/tests/test_binary_hashindex.py +++ b/tests/test_binary_hashindex.py @@ -58,8 +58,6 @@ def test_hash(self): Lref, Dref, Iref = index_ref.range_search(xq, radius) - print("nb res: ", Lref[-1]) - index = faiss.IndexBinaryHash(d, 10) index.add(xb) # index.display() @@ -80,8 +78,6 @@ def test_hash(self): self.assertTrue(snew <= set(ref)) nfound.append(Lnew[-1]) ndis.append(stats.ndis) - print('nfound=', nfound) - print('ndis=', ndis) nfound = np.array(nfound) self.assertTrue(nfound[-1] == Lref[-1]) self.assertTrue(np.all(nfound[1:] >= nfound[:-1])) @@ -100,8 +96,6 @@ def test_multihash(self): Lref, Dref, Iref = index_ref.range_search(xq, radius) - print("nb res: ", Lref[-1]) - nfound = [] ndis = [] @@ -123,8 +117,6 @@ def test_multihash(self): self.assertTrue(snew <= set(ref)) nfound.append(Lnew[-1]) ndis.append(stats.ndis) - print('nfound=', nfound) - print('ndis=', ndis) nfound = np.array(nfound) # self.assertTrue(nfound[-1] == Lref[-1]) self.assertTrue(np.all(nfound[1:] >= nfound[:-1])) @@ -163,7 +155,6 @@ def test_hash_and_multihash(self): # no duplicates self.assertTrue(len(new) == len(snew)) nf += len(set(ref) & snew) - print('nfound', nh, nbit, nf) nfound[(nh, nbit)] = nf self.assertGreater(nfound[(nh, 4)], nfound[(nh, 7)]) @@ -175,7 +166,6 @@ def test_hash_and_multihash(self): np.testing.assert_array_equal(Inew, I2) np.testing.assert_array_equal(Dnew, D2) - print('nfound=', nfound) self.assertGreater(3, abs(nfound[(0, 7)] - nfound[(1, 7)])) self.assertGreater(nfound[(3, 7)], 
nfound[(1, 7)]) self.assertGreater(nfound[(5, 7)], nfound[(3, 7)]) diff --git a/tests/test_build_blocks.py b/tests/test_build_blocks.py index 0a97e63185..fdf9ad8bd7 100644 --- a/tests/test_build_blocks.py +++ b/tests/test_build_blocks.py @@ -189,7 +189,6 @@ def test_l2(self): for d in 1, 2, 4, 8, 12, 16: x = rs.rand(d).astype('float32') for ny in 128, 129, 130: - print("d=%d ny=%d" % (d, ny)) y = rs.rand(ny, d).astype('float32') ref = ((x - y) ** 2).sum(1) new = np.zeros(ny, dtype='float32') @@ -204,7 +203,6 @@ def test_IP(self): for d in 1, 2, 4, 8, 12, 16: x = rs.rand(d).astype('float32') for ny in 128, 129, 130: - print("d=%d ny=%d" % (d, ny)) y = rs.rand(ny, d).astype('float32') ref = (x * y).sum(1) new = np.zeros(ny, dtype='float32') @@ -220,7 +218,6 @@ def test_0s(self): m = rs.rand(40, 20).astype('float32') m[5:10] = 0 comments = faiss.MatrixStats(m).comments - print(comments) assert 'has 5 copies' in comments assert '5 null vectors' in comments @@ -229,7 +226,6 @@ def test_copies(self): m = rs.rand(40, 20).astype('float32') m[::2] = m[1::2] comments = faiss.MatrixStats(m).comments - print(comments) assert '20 vectors are distinct' in comments def test_dead_dims(self): @@ -237,7 +233,6 @@ def test_dead_dims(self): m = rs.rand(40, 20).astype('float32') m[:, 5:10] = 0 comments = faiss.MatrixStats(m).comments - print(comments) assert '5 dimensions are constant' in comments def test_rogue_means(self): @@ -245,7 +240,6 @@ def test_rogue_means(self): m = rs.rand(40, 20).astype('float32') m[:, 5:10] += 12345 comments = faiss.MatrixStats(m).comments - print(comments) assert '5 dimensions are too large wrt. their variance' in comments def test_normalized(self): @@ -253,7 +247,6 @@ def test_normalized(self): m = rs.rand(40, 20).astype('float32') faiss.normalize_L2(m) comments = faiss.MatrixStats(m).comments - print(comments) assert 'vectors are normalized' in comments def test_hash(self): @@ -300,7 +293,6 @@ def test_8bit_equiv(self): D, I = index.search(x[3:], 1) # assert D[0, 0] == Dref[0, 0] - # print(D[0, 0], ((x[3] - x[2]) ** 2).sum()) assert D[0, 0] == ((x[3] - x[2]) ** 2).sum() def test_6bit_equiv(self): @@ -314,8 +306,6 @@ def test_6bit_equiv(self): d, faiss.ScalarQuantizer.QT_6bit) index.train(trainset) - print('cs=', index.code_size) - x = rs.randint(64, size=(100, d)).astype('float32') # verify encoder / decoder @@ -330,7 +320,6 @@ def test_6bit_equiv(self): for i in range(20): for j in range(10): dis = ((y[i] - x2[I[i, j]]) ** 2).sum() - # print(dis, D[i, j]) assert abs(D[i, j] - dis) / dis < 1e-5 def test_reconstruct(self): @@ -371,7 +360,6 @@ def test_randint(self): x = faiss.randint(20000, vmax=100) assert np.all(x >= 0) and np.all(x < 100) c = np.bincount(x, minlength=100) - print(c) assert c.max() - c.min() < 50 * 2 def test_rand_vector(self): @@ -473,7 +461,6 @@ def do_test_array_type(self, dtype): """ tests swig_ptr and rev_swig_ptr for this type of array """ a = np.arange(12).astype(dtype) ptr = faiss.swig_ptr(a) - print(ptr) a2 = faiss.rev_swig_ptr(ptr, 12) np.testing.assert_array_equal(a, a2) @@ -547,7 +534,6 @@ def subtest(self, d, K, metric): recalls += 1 break recall = 1.0 * recalls / (nb * K) - print('Metric: {}, knng accuracy: {}'.format(metric_names[metric], recall)) assert recall > 0.99 def test_small_nndescent(self): @@ -656,7 +642,6 @@ def do_test_bucket_sort_inplace( rows, _ = np.where(tab == b) rows.sort() tab2[lims[b]:lims[b + 1]].sort() - # print(rows, tab2[lims[b] : lims[b + 1]]) rows = set(rows) self.assertEqual(rows, set(tab2[lims[b]:lims[b + 1]])) diff 
--git a/tests/test_graph_based.py b/tests/test_graph_based.py index dd4212d717..d5ddbeec37 100644 --- a/tests/test_graph_based.py +++ b/tests/test_graph_based.py @@ -209,7 +209,6 @@ def subtest_add(self, build_type, thresh, metric=faiss.METRIC_L2): Dnsg, Insg = index.search(self.xq, 1) recalls = (Iref == Insg).sum() - print('metric: {}, nb equal: {}'.format(metrics[metric], recalls)) self.assertGreaterEqual(recalls, thresh) self.subtest_connectivity(index, self.xb.shape[0]) self.subtest_io_and_clone(index, Dnsg, Insg) @@ -230,7 +229,6 @@ def subtest_build(self, knn_graph, thresh, metric=faiss.METRIC_L2): Dnsg, Insg = index.search(self.xq, 1) recalls = (Iref == Insg).sum() - print('metric: {}, nb equal: {}'.format(metrics[metric], recalls)) self.assertGreaterEqual(recalls, thresh) self.subtest_connectivity(index, self.xb.shape[0]) @@ -286,7 +284,6 @@ def test_reset(self): index.add(self.xb) Dnsg, Insg = index.search(self.xq, 1) recalls = (Iref == Insg).sum() - print('metric: {}, nb equal: {}'.format(metrics[metric], recalls)) self.assertGreaterEqual(recalls, 475) self.subtest_connectivity(index, self.xb.shape[0]) @@ -294,7 +291,6 @@ def test_reset(self): index.add(self.xb) Dnsg, Insg = index.search(self.xq, 1) recalls = (Iref == Insg).sum() - print('metric: {}, nb equal: {}'.format(metrics[metric], recalls)) self.assertGreaterEqual(recalls, 475) self.subtest_connectivity(index, self.xb.shape[0]) @@ -335,7 +331,6 @@ def test_nsg_pq(self): # test accuracy recalls = (Iref == I).sum() - print("IndexNSGPQ", recalls) self.assertGreaterEqual(recalls, 190) # 193 # test I/O @@ -361,7 +356,6 @@ def test_nsg_sq(self): # test accuracy recalls = (Iref == I).sum() - print("IndexNSGSQ", recalls) self.assertGreaterEqual(recalls, 405) # 411 # test I/O @@ -395,7 +389,6 @@ def test_nndescentflat(self): # test accuracy recalls = (Iref == I).sum() - print("IndexNNDescentFlat", recalls) self.assertGreaterEqual(recalls, 450) # 462 # do some IO tests diff --git a/tests/test_index.py b/tests/test_index.py index f46c6a94bf..b9f3dbd46b 100644 --- a/tests/test_index.py +++ b/tests/test_index.py @@ -338,7 +338,6 @@ def test_4variants_ivf(self): D, I = index.search(xq, 10) nok[qname] = (I[:, 0] == I_ref[:, 0]).sum() - print(nok, nq) self.assertGreaterEqual(nok['flat'], nq * 0.6) # The tests below are a bit fragile, it happens that the @@ -373,8 +372,6 @@ def test_4variants(self): D, I = index.search(xq, 10) nok[qname] = (I[:, 0] == I_ref[:, 0]).sum() - print(nok, nq) - self.assertGreaterEqual(nok['QT_8bit'], nq * 0.9) self.assertGreaterEqual(nok['QT_8bit'], nok['QT_4bit']) self.assertGreaterEqual(nok['QT_8bit'], nok['QT_8bit_uniform']) @@ -442,7 +439,6 @@ def norm1(x): recons_err = np.mean(norm1(R_flat - xb[I_flat])) - print('Reconstruction error = %.3f' % recons_err) if eps is not None: self.assertLessEqual(recons_err, eps) @@ -638,7 +634,6 @@ def test_reconstuct_after_add(self): # should not raise an exception index.reconstruct(5) - print(index.ntotal) index.reconstruct(150) diff --git a/tests/test_index_accuracy.py b/tests/test_index_accuracy.py index 3f7bfbd303..8d8b4a28f6 100644 --- a/tests/test_index_accuracy.py +++ b/tests/test_index_accuracy.py @@ -56,7 +56,6 @@ def test_ivf_kmeans(self): Dref, Iref = ivfk.search(ev.xq, 100) ivfk.parallel_mode = 1 Dnew, Inew = ivfk.search(ev.xq, 100) - print((Iref != Inew).sum(), Iref.size) assert (Iref != Inew).sum() < Iref.size / 5000.0 assert np.all(Dref == Dnew) @@ -136,8 +135,6 @@ def test_polysemous(self): res = ev.launch("Polysemous ht=%d" % index.polysemous_ht, index) 
e_polysemous = ev.evalres(res) - print(e_baseline, e_polysemous, index.polysemous_ht) - print(stats.n_hamming_pass, stats.ncode) # The randu dataset is difficult, so we are not too picky on # the results. Here we assert that we have < 10 % loss when # computing full PQ on fewer than 20% of the data. @@ -248,7 +245,6 @@ def subtest(self, mt): index.nprobe = 4 # hopefully more robust than 1 D, I = index.search(xq, 10) ninter = faiss.eval_intersection(I, gt_I) - print("(%d, %s): %d, " % (mt, repr(qname), ninter)) assert abs(ninter - self.ref_results[(mt, qname)]) <= 10 if qname == "6bit": @@ -264,7 +260,6 @@ def subtest(self, mt): radius = float(D[:, -1].max()) else: radius = float(D[:, -1].min()) - # print("radius", radius) lims, D3, I3 = index.range_search(xq, radius) ntot = ndiff = 0 @@ -278,14 +273,11 @@ def subtest(self, mt): Iref = set(I2[i, mask]) ndiff += len(Inew ^ Iref) ntot += len(Iref) - # print("ndiff %d / %d" % (ndiff, ntot)) assert ndiff < ntot * 0.01 for pm in 1, 2: - # print("parallel_mode=%d" % pm) index.parallel_mode = pm lims4, D4, I4 = index.range_search(xq, radius) - # print("sizes", lims4[1:] - lims4[:-1]) for qno in range(len(lims) - 1): Iref = I3[lims[qno]: lims[qno + 1]] Inew = I4[lims4[qno]: lims4[qno + 1]] @@ -485,7 +477,6 @@ def subtest(self, mt): D, I = index.search(xq, 10) ninter = faiss.eval_intersection(I, gt_I) - print("(%d, %s): %d, " % (mt, by_residual, ninter)) assert abs(ninter - self.ref_results[mt, by_residual]) <= 3 @@ -499,10 +490,6 @@ def subtest(self, mt): index.polysemous_ht = 20 D, I = index.search(xq, 10) ninter = faiss.eval_intersection(I, gt_I) - print( - "(%d, %s, %d): %d, " - % (mt, by_residual, index.polysemous_ht, ninter) - ) # polysemous behaves bizarrely on ARM assert ( @@ -516,7 +503,6 @@ def subtest(self, mt): radius = float(D[:, -1].max()) else: radius = float(D[:, -1].min()) - print("radius", radius) lims, D3, I3 = index.range_search(xq, radius) ntot = ndiff = 0 @@ -530,7 +516,6 @@ def subtest(self, mt): Iref = set(I2[i, mask]) ndiff += len(Inew ^ Iref) ntot += len(Iref) - print("ndiff %d / %d" % (ndiff, ntot)) assert ndiff < ntot * 0.02 def test_IVFPQ_non8bit(self): @@ -555,7 +540,6 @@ def test_IVFPQ_non8bit(self): D, I = index.search(xq, 10) ninter[v] = faiss.eval_intersection(I, gt_I) - print("ninter=", ninter) # this should be the case but we don't observe # that... 
Probavly too few test points # assert ninter['2x8'] > ninter['8x2'] @@ -623,9 +607,6 @@ def test_OPQ(self): res = ev.launch("OPQ", index) e_opq = ev.evalres(res) - print("e_pq=%s" % e_pq) - print("e_opq=%s" % e_opq) - # verify that OPQ better than PQ for r in 1, 10, 100: assert e_opq[r] > e_pq[r] @@ -656,7 +637,6 @@ def test_OIVFPQ(self): # verify same on OIVFPQ for r in 1, 10, 100: - print(e_oivfpq[r], e_ivfpq[r]) assert e_oivfpq[r] >= e_ivfpq[r] @@ -758,9 +738,6 @@ def test_sh(self): ninter = faiss.eval_intersection(I, gt_I) key = (nbit, tt, period) - print("(%d, %s, %g): %d, " % (nbit, repr(tt), period, - ninter)) - print(abs(ninter - self.ref_results[key])) assert abs(ninter - self.ref_results[key]) <= 14 @@ -799,7 +776,6 @@ def do_test(self, metric): # check that with refinement, the recall@10 is the same as # the original recall@100 recall2 = (I2 == Iref[:, :1]).sum() - # print("recalls", recall1, recall2) self.assertEqual(recall1, recall2) def test_IP(self): diff --git a/tests/test_io.py b/tests/test_io.py index dc8ac3dcfb..99dfe60847 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -102,7 +102,6 @@ def test_buf_read(self): reader = faiss.BufferedIOReader(reader, bsz) y = np.zeros_like(x) - print('nbytes=', y.nbytes) reader(faiss.swig_ptr(y), y.nbytes, 1) np.testing.assert_array_equal(x, y) diff --git a/tests/test_ivflib.py b/tests/test_ivflib.py index f19c3da45b..0a3fb8c87e 100644 --- a/tests/test_ivflib.py +++ b/tests/test_ivflib.py @@ -125,7 +125,6 @@ def test_range_search_with_parameters(self): Dpre, _ = index.search(xq, 15) radius = float(np.median(Dpre[:, -1])) - print("Radius=", radius) stats = faiss.cvar.indexIVF_stats stats.reset() Lref, Dref, Iref = index.range_search(xq, radius) diff --git a/tests/test_lowlevel_ivf.cpp b/tests/test_lowlevel_ivf.cpp index e28e2a946f..7ce90a1d2d 100644 --- a/tests/test_lowlevel_ivf.cpp +++ b/tests/test_lowlevel_ivf.cpp @@ -364,22 +364,9 @@ void test_lowlevel_access_binary(const char* index_key) { } } - printf("new before reroder: ["); - for (int j = 0; j < k; j++) - printf("%" PRId64 ",%d ", I[j], D[j]); - printf("]\n"); - // re-order heap heap_reorder>(k, D.data(), I.data()); - printf("ref: ["); - for (int j = 0; j < k; j++) - printf("%" PRId64 ",%d ", I_ref[j], D_ref[j]); - printf("]\nnew: ["); - for (int j = 0; j < k; j++) - printf("%" PRId64 ",%d ", I[j], D[j]); - printf("]\n"); - // check that we have the same results as the reference search for (int j = 0; j < k; j++) { // here the order is not guaranteed to be the same diff --git a/tests/test_merge_index.py b/tests/test_merge_index.py index 4417f57fe7..bdcc813f1c 100644 --- a/tests/test_merge_index.py +++ b/tests/test_merge_index.py @@ -72,7 +72,6 @@ def do_test_merge(self, index_type): index.merge_from(indexes[i], index.ntotal) _D, I = index.search(xq, k) - print(I[:5, :6]) ndiff = (I != Iref).sum() print('%d / %d differences' % (ndiff, nq * k)) diff --git a/tests/test_meta_index.py b/tests/test_meta_index.py index d53cad48f7..d0896e8ba2 100644 --- a/tests/test_meta_index.py +++ b/tests/test_meta_index.py @@ -82,10 +82,8 @@ def test_shards(self): k = 32 ref_index = faiss.IndexFlatL2(d) - print('ref search') ref_index.add(xb) _Dref, Iref = ref_index.search(xq, k) - print(Iref[:5, :6]) shard_index = faiss.IndexShards(d) shard_index_2 = faiss.IndexShards(d, True, False) @@ -109,7 +107,6 @@ def test_shards(self): for test_no in range(3): with_threads = test_no == 1 - print('shard search test_no = %d' % test_no) if with_threads: remember_nt = faiss.omp_get_max_threads() 
faiss.omp_set_num_threads(1) @@ -122,14 +119,10 @@ def test_shards(self): else: _D, I = shard_index_2.search(xq, k) - print(I[:5, :6]) - if with_threads: faiss.omp_set_num_threads(remember_nt) ndiff = (I != Iref).sum() - - print('%d / %d differences' % (ndiff, nq * k)) assert (ndiff < nq * k / 1000.) def test_shards_ivf(self): diff --git a/tests/test_product_quantizer.py b/tests/test_product_quantizer.py index e05426b129..f531cab2a1 100644 --- a/tests/test_product_quantizer.py +++ b/tests/test_product_quantizer.py @@ -70,7 +70,6 @@ def do_test_codec(self, nbit): def test_codec(self): for i in range(16): - print("Testing nbits=%d" % (i + 1)) self.do_test_codec(i + 1) diff --git a/tests/test_residual_quantizer.py b/tests/test_residual_quantizer.py index f87e7650d9..f4381607e1 100644 --- a/tests/test_residual_quantizer.py +++ b/tests/test_residual_quantizer.py @@ -967,8 +967,6 @@ def beam_search_encode_step_tab(codes, L, distances, codebook_cross_prods_i, for b in range(beam_size): dotprods[i, b, :] += cb[codes[i, b, j]] - # print("dps", dotprods[:3, :2, :4]) - new_distances += 2 * dotprods cent_distances = new_distances @@ -1174,7 +1172,6 @@ def test_with_rq(self): rq.train(xt) err_rq = eval_codec(rq, xb) - # print(err_prq, err_rq) self.assertEqual(err_prq, err_rq) diff --git a/tests/test_search_params.py b/tests/test_search_params.py index 22b845c2ea..886ffc0c62 100644 --- a/tests/test_search_params.py +++ b/tests/test_search_params.py @@ -465,7 +465,6 @@ def test_12_92(self): sp = faiss.swig_ptr selr.find_sorted_ids_bounds( len(ids), sp(ids), sp(j01[:1]), sp(j01[1:])) - print(j01) assert j01[0] >= j01[1] diff --git a/tests/test_sliding_ivf.cpp b/tests/test_sliding_ivf.cpp index ea9e53d6b5..0214dd72e8 100644 --- a/tests/test_sliding_ivf.cpp +++ b/tests/test_sliding_ivf.cpp @@ -74,8 +74,6 @@ void make_index_slices( for (int i = 0; i < total_size; i++) { sub_indexes.emplace_back(clone_index(trained_index)); - printf("preparing sub-index # %d\n", i); - Index* index = sub_indexes.back().get(); auto xb = make_data(nb * d); @@ -122,13 +120,10 @@ int test_sliding_window(const char* index_key) { auto xq = make_data(nq * d); for (int i = 0; i < total_size + window_size; i++) { - printf("doing step %d / %d\n", i, total_size + window_size); - // update the index window.step( i < total_size ? 
sub_indexes[i].get() : nullptr, i >= window_size); - printf(" current n_slice = %d\n", window.n_slice); auto new_res = search_index(index.get(), xq.data()); @@ -159,8 +154,6 @@ int test_sliding_invlists(const char* index_key) { auto xq = make_data(nq * d); for (int i = 0; i < total_size + window_size; i++) { - printf("doing step %d / %d\n", i, total_size + window_size); - // update the index std::vector ils; for (int j = i - window_size + 1; j <= i; j++) { @@ -178,8 +171,6 @@ int test_sliding_invlists(const char* index_key) { // will be deleted by the index index_ivf->replace_invlists(ci, true); - printf(" nb invlists = %zd\n", ils.size()); - auto new_res = search_index(index.get(), xq.data()); std::unique_ptr merged_index( @@ -188,13 +179,6 @@ int test_sliding_invlists(const char* index_key) { auto ref_res = search_index(merged_index.get(), xq.data()); EXPECT_EQ(ref_res.size(), new_res.size()); - - size_t ndiff = 0; - for (size_t j = 0; j < ref_res.size(); j++) { - if (ref_res[j] != new_res[j]) - ndiff++; - } - printf(" nb differences: %zd / %zd\n", ndiff, ref_res.size()); EXPECT_EQ(ref_res, new_res); } return 0; From 0c983f361ba0af23b7a1c434dcc042ac0c24b454 Mon Sep 17 00:00:00 2001 From: Ramil Bakhshyiev Date: Mon, 20 May 2024 12:50:08 -0700 Subject: [PATCH 078/116] Workaround for CUDA 11.4.4 build in Conda on Ubuntu 22 / v6 kernel (#3459) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3459 When building with CUDA 11.4.4, CMake does not properly include files under Conda environment. This workaround flattens the include sub-directories in to the include root. It will unblock us for now while we are looking for a fix through CMakeLists files or figure out if it's a CMake bug and it gets fixed. Reviewed By: junjieqi Differential Revision: D57545169 fbshipit-source-id: 9cbdd0866e00e899cc889930a59448da55d873c2 --- conda/faiss-gpu/build-lib.sh | 6 ++++++ conda/faiss-gpu/meta.yaml | 3 +++ 2 files changed, 9 insertions(+) diff --git a/conda/faiss-gpu/build-lib.sh b/conda/faiss-gpu/build-lib.sh index 2d25e9c5e6..41f4f02bbc 100755 --- a/conda/faiss-gpu/build-lib.sh +++ b/conda/faiss-gpu/build-lib.sh @@ -6,6 +6,12 @@ set -e +# Workaround for CUDA 11.4.4 builds. Moves all necessary headers to include root. 
+if [[ -n "$FAISS_FLATTEN_CONDA_INCLUDES" && "$FAISS_FLATTEN_CONDA_INCLUDES" == "1" ]]; then + cp -r -n $CONDA_PREFIX/x86_64-conda-linux-gnu/sysroot/usr/include/* $CONDA_PREFIX/include/ + cp -r -n $CONDA_PREFIX/x86_64-conda-linux-gnu/include/c++/11.2.0/* $CONDA_PREFIX/include/ + cp -r -n $CONDA_PREFIX/x86_64-conda-linux-gnu/include/c++/11.2.0/x86_64-conda-linux-gnu/* $CONDA_PREFIX/include/ +fi # Build libfaiss.so/libfaiss_avx2.so/libfaiss_avx512.so cmake -B _build \ diff --git a/conda/faiss-gpu/meta.yaml b/conda/faiss-gpu/meta.yaml index 3d614df1bf..db6b4e243f 100644 --- a/conda/faiss-gpu/meta.yaml +++ b/conda/faiss-gpu/meta.yaml @@ -43,6 +43,9 @@ outputs: - {{ pin_compatible('libfaiss', exact=True) }} script_env: - CUDA_ARCHS + {% if cudatoolkit == '11.4.4' %} + - FAISS_FLATTEN_CONDA_INCLUDES=1 + {% endif %} requirements: build: - {{ compiler('cxx') }} From 86bf74dc0c7e9d9d3b7a3f56fa8c4ef9b740b4cd Mon Sep 17 00:00:00 2001 From: Ramil Bakhshyiev Date: Mon, 20 May 2024 12:50:08 -0700 Subject: [PATCH 079/116] Enable linux-x86_64-GPU-packages-CUDA-11-4-4 build via GitHub Actions (#3460) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3460 Migration to GitHub Actions Reviewed By: junjieqi Differential Revision: D57545637 fbshipit-source-id: 8ee970e5642ae9354455d60d84019d4217884d3a --- .github/workflows/build.yml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 6a9114628c..3ba83c7735 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -116,6 +116,22 @@ jobs: - uses: ./.github/actions/build_conda with: label: main + linux-x86_64-GPU-packages-CUDA-11-4-4: + if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') + runs-on: 4-core-ubuntu-gpu-t4 + env: + CUDA_ARCHS: "60-real;61-real;62-real;70-real;72-real;75-real;80;86-real" + steps: + - name: Checkout + uses: actions/checkout@v4.1.1 + with: + fetch-depth: 0 + fetch-tags: true + - uses: ./.github/actions/build_conda + with: + label: main + cuda: "11.4.4" + compiler_version: "11.2" linux-x86_64-GPU-RAFT-packages-CUDA11-8-0: if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') runs-on: 4-core-ubuntu-gpu-t4 From 7fc81841a2730b24e799ff299f6080f489e272cf Mon Sep 17 00:00:00 2001 From: Ramil Bakhshyiev Date: Mon, 20 May 2024 12:50:08 -0700 Subject: [PATCH 080/116] Relax version requirements for action steps (#3461) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3461 Migration to GitHub Actions Reviewed By: junjieqi Differential Revision: D57545814 fbshipit-source-id: a3d08f8cf516ce26f8f16892eceef9e36bfe9f05 --- .github/actions/build_cmake/action.yml | 4 +-- .github/actions/build_conda/action.yml | 2 +- .github/workflows/build.yml | 34 +++++++++++++------------- 3 files changed, 20 insertions(+), 20 deletions(-) diff --git a/.github/actions/build_cmake/action.yml b/.github/actions/build_cmake/action.yml index a46462c8d9..cd023aaca7 100644 --- a/.github/actions/build_cmake/action.yml +++ b/.github/actions/build_cmake/action.yml @@ -16,7 +16,7 @@ runs: using: composite steps: - name: Setup miniconda - uses: conda-incubator/setup-miniconda@v3.0.3 + uses: conda-incubator/setup-miniconda@v3 with: python-version: '3.11' miniconda-version: latest @@ -97,7 +97,7 @@ runs: FAISS_DISABLE_CPU_FEATURES=AVX2 LD_DEBUG=libs $CONDA/bin/python -c "import faiss" 2>&1 | grep faiss.so LD_DEBUG=libs $CONDA/bin/python -c "import faiss" 2>&1 | grep faiss_avx2.so - name: 
Upload test results - uses: actions/upload-artifact@v4.3.1 + uses: actions/upload-artifact@v4 with: name: test-results-${{ inputs.opt_level }}-${{ inputs.gpu }}-${{ inputs.raft }} path: test-results diff --git a/.github/actions/build_conda/action.yml b/.github/actions/build_conda/action.yml index ec407e6569..4658f13a8f 100644 --- a/.github/actions/build_conda/action.yml +++ b/.github/actions/build_conda/action.yml @@ -31,7 +31,7 @@ runs: echo "shell=pwsh" >> "$GITHUB_OUTPUT" fi - name: Setup miniconda - uses: conda-incubator/setup-miniconda@v3.0.3 + uses: conda-incubator/setup-miniconda@v3 with: python-version: '3.11' miniconda-version: latest diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 3ba83c7735..3f31ae3b96 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -15,7 +15,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout - uses: actions/checkout@v4.1.1 + uses: actions/checkout@v4 - name: Install clang-format run: | sudo apt-get update -y @@ -40,13 +40,13 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout - uses: actions/checkout@v4.1.1 + uses: actions/checkout@v4 - uses: ./.github/actions/build_cmake linux-x86_64-AVX2-cmake: runs-on: ubuntu-latest steps: - name: Checkout - uses: actions/checkout@v4.1.1 + uses: actions/checkout@v4 - uses: ./.github/actions/build_cmake with: opt_level: avx2 @@ -55,7 +55,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout - uses: actions/checkout@v4.1.1 + uses: actions/checkout@v4 - uses: ./.github/actions/build_cmake with: opt_level: avx512 @@ -64,7 +64,7 @@ jobs: runs-on: 4-core-ubuntu-gpu-t4 steps: - name: Checkout - uses: actions/checkout@v4.1.1 + uses: actions/checkout@v4 - uses: ./.github/actions/build_cmake with: gpu: ON @@ -72,7 +72,7 @@ jobs: runs-on: 4-core-ubuntu-gpu-t4 steps: - name: Checkout - uses: actions/checkout@v4.1.1 + uses: actions/checkout@v4 - uses: ./.github/actions/build_cmake with: gpu: ON @@ -81,7 +81,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout - uses: actions/checkout@v4.1.1 + uses: actions/checkout@v4 with: fetch-depth: 0 fetch-tags: true @@ -90,7 +90,7 @@ jobs: runs-on: windows-2019 steps: - name: Checkout - uses: actions/checkout@v4.1.1 + uses: actions/checkout@v4 with: fetch-depth: 0 fetch-tags: true @@ -99,7 +99,7 @@ jobs: runs-on: 2-core-ubuntu-arm steps: - name: Checkout - uses: actions/checkout@v4.1.1 + uses: actions/checkout@v4 with: fetch-depth: 0 fetch-tags: true @@ -109,7 +109,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout - uses: actions/checkout@v4.1.1 + uses: actions/checkout@v4 with: fetch-depth: 0 fetch-tags: true @@ -123,7 +123,7 @@ jobs: CUDA_ARCHS: "60-real;61-real;62-real;70-real;72-real;75-real;80;86-real" steps: - name: Checkout - uses: actions/checkout@v4.1.1 + uses: actions/checkout@v4 with: fetch-depth: 0 fetch-tags: true @@ -139,7 +139,7 @@ jobs: CUDA_ARCHS: "70-real;72-real;75-real;80;86-real" steps: - name: Checkout - uses: actions/checkout@v4.1.1 + uses: actions/checkout@v4 with: fetch-depth: 0 fetch-tags: true @@ -156,7 +156,7 @@ jobs: CUDA_ARCHS: "70-real;72-real;75-real;80;86-real" steps: - name: Checkout - uses: actions/checkout@v4.1.1 + uses: actions/checkout@v4 with: fetch-depth: 0 fetch-tags: true @@ -172,7 +172,7 @@ jobs: CUDA_ARCHS: "70-real;72-real;75-real;80;86-real" steps: - name: Checkout - uses: actions/checkout@v4.1.1 + uses: actions/checkout@v4 with: fetch-depth: 0 fetch-tags: true @@ -187,7 +187,7 @@ jobs: runs-on: windows-2019 steps: - name: Checkout - uses: actions/checkout@v4.1.1 + 
uses: actions/checkout@v4 with: fetch-depth: 0 fetch-tags: true @@ -199,7 +199,7 @@ jobs: runs-on: macos-14 steps: - name: Checkout - uses: actions/checkout@v4.1.1 + uses: actions/checkout@v4 with: fetch-depth: 0 fetch-tags: true @@ -211,7 +211,7 @@ jobs: runs-on: 2-core-ubuntu-arm steps: - name: Checkout - uses: actions/checkout@v4.1.1 + uses: actions/checkout@v4 with: fetch-depth: 0 fetch-tags: true From 8c95c6943bd21af530342e9d540f12dadc6fd28d Mon Sep 17 00:00:00 2001 From: Ramil Bakhshyiev Date: Mon, 20 May 2024 14:30:36 -0700 Subject: [PATCH 081/116] Fix linter warnings in faiss-gpu Conda build script (#3463) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3463 Satisfying linter warnings from a previous change Reviewed By: junjieqi Differential Revision: D57581364 fbshipit-source-id: 9e9b7f963a27d2da54d0e85390cce2f9f773c502 --- conda/faiss-gpu/build-lib.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/conda/faiss-gpu/build-lib.sh b/conda/faiss-gpu/build-lib.sh index 41f4f02bbc..9957be96ea 100755 --- a/conda/faiss-gpu/build-lib.sh +++ b/conda/faiss-gpu/build-lib.sh @@ -7,10 +7,10 @@ set -e # Workaround for CUDA 11.4.4 builds. Moves all necessary headers to include root. -if [[ -n "$FAISS_FLATTEN_CONDA_INCLUDES" && "$FAISS_FLATTEN_CONDA_INCLUDES" == "1" ]]; then - cp -r -n $CONDA_PREFIX/x86_64-conda-linux-gnu/sysroot/usr/include/* $CONDA_PREFIX/include/ - cp -r -n $CONDA_PREFIX/x86_64-conda-linux-gnu/include/c++/11.2.0/* $CONDA_PREFIX/include/ - cp -r -n $CONDA_PREFIX/x86_64-conda-linux-gnu/include/c++/11.2.0/x86_64-conda-linux-gnu/* $CONDA_PREFIX/include/ +if [ -n "$FAISS_FLATTEN_CONDA_INCLUDES" ] && [ "$FAISS_FLATTEN_CONDA_INCLUDES" = "1" ]; then + cp -r -n "$CONDA_PREFIX/x86_64-conda-linux-gnu/sysroot/usr/include/"* "$CONDA_PREFIX/include/" + cp -r -n "$CONDA_PREFIX/x86_64-conda-linux-gnu/include/c++/11.2.0/"* "$CONDA_PREFIX/include/" + cp -r -n "$CONDA_PREFIX/x86_64-conda-linux-gnu/include/c++/11.2.0/x86_64-conda-linux-gnu/"* "$CONDA_PREFIX/include/" fi # Build libfaiss.so/libfaiss_avx2.so/libfaiss_avx512.so From 0698ac72eff28406c40e0f1738dac39951cba74b Mon Sep 17 00:00:00 2001 From: Ramil Bakhshyiev Date: Tue, 21 May 2024 09:07:47 -0700 Subject: [PATCH 082/116] Properly pass the label for conda upload steps (#3464) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3464 Migration to GitHub Reviewed By: algoriddle Differential Revision: D57593494 fbshipit-source-id: 726159b553d5544efcdfa064f5a82fd51ed793e9 --- .github/actions/build_conda/action.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/actions/build_conda/action.yml b/.github/actions/build_conda/action.yml index 4658f13a8f..982430c351 100644 --- a/.github/actions/build_conda/action.yml +++ b/.github/actions/build_conda/action.yml @@ -44,7 +44,7 @@ runs: if: inputs.label != '' shell: ${{ steps.choose_shell.outputs.shell }} env: - PACKAGE_TYPE: inputs.label + PACKAGE_TYPE: ${{ inputs.label }} run: | conda install -y -q anaconda-client conda config --set anaconda_upload yes @@ -59,7 +59,7 @@ runs: shell: ${{ steps.choose_shell.outputs.shell }} working-directory: conda env: - PACKAGE_TYPE: inputs.label + PACKAGE_TYPE: ${{ inputs.label }} run: | conda build faiss --user pytorch --label ${{ inputs.label }} -c pytorch - name: Conda build (GPU) @@ -74,7 +74,7 @@ runs: shell: ${{ steps.choose_shell.outputs.shell }} working-directory: conda env: - PACKAGE_TYPE: inputs.label + PACKAGE_TYPE: ${{ inputs.label }} run: 
| conda build faiss-gpu --variants '{ "cudatoolkit": "${{ inputs.cuda }}", "c_compiler_version": "${{ inputs.compiler_version }}", "cxx_compiler_version": "${{ inputs.compiler_version }}" }' \ --user pytorch --label ${{ inputs.label }} -c pytorch -c nvidia/label/cuda-${{ inputs.cuda }} -c nvidia @@ -90,7 +90,7 @@ runs: shell: ${{ steps.choose_shell.outputs.shell }} working-directory: conda env: - PACKAGE_TYPE: inputs.label + PACKAGE_TYPE: ${{ inputs.label }} run: | conda build faiss-gpu-raft --variants '{ "cudatoolkit": "${{ inputs.cuda }}", "c_compiler_version": "${{ inputs.compiler_version }}", "cxx_compiler_version": "${{ inputs.compiler_version }}" }' \ --user pytorch --label ${{ inputs.label }} -c pytorch -c nvidia/label/cuda-${{ inputs.cuda }} -c nvidia -c rapidsai -c rapidsai-nightly -c conda-forge From a60a9e56c66ec4370ec6e32597927ede441ad611 Mon Sep 17 00:00:00 2001 From: Ramil Bakhshyiev Date: Tue, 21 May 2024 09:07:47 -0700 Subject: [PATCH 083/116] Fix CUDA 11.4.4 builds under CircleCI (#3466) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3466 Flattening Conda include directories breaks CUDA 11.4.4 build on Ubuntu 20 / v5 kernel. This change updates the logic to only flatten includes on Ubuntu 22 / v6 kernel (aka as running on GitHub Actions runners). Reviewed By: algoriddle Differential Revision: D57602154 fbshipit-source-id: 00c14ca7c64644b8b86483ac6b4d40c6d8f12372 --- .github/workflows/build.yml | 1 + conda/faiss-gpu/meta.yaml | 4 +--- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 3f31ae3b96..39f865bc4a 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -121,6 +121,7 @@ jobs: runs-on: 4-core-ubuntu-gpu-t4 env: CUDA_ARCHS: "60-real;61-real;62-real;70-real;72-real;75-real;80;86-real" + FAISS_FLATTEN_CONDA_INCLUDES: "1" steps: - name: Checkout uses: actions/checkout@v4 diff --git a/conda/faiss-gpu/meta.yaml b/conda/faiss-gpu/meta.yaml index db6b4e243f..05f7b59008 100644 --- a/conda/faiss-gpu/meta.yaml +++ b/conda/faiss-gpu/meta.yaml @@ -43,9 +43,7 @@ outputs: - {{ pin_compatible('libfaiss', exact=True) }} script_env: - CUDA_ARCHS - {% if cudatoolkit == '11.4.4' %} - - FAISS_FLATTEN_CONDA_INCLUDES=1 - {% endif %} + - FAISS_FLATTEN_CONDA_INCLUDES requirements: build: - {{ compiler('cxx') }} From c1528b55b70099e79f6fb5a59004ce5761eabe5a Mon Sep 17 00:00:00 2001 From: Ramil Bakhshyiev Date: Tue, 21 May 2024 09:07:47 -0700 Subject: [PATCH 084/116] Enable nightly builds via GitHub Actions (#3467) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3467 1. Cron is scheduled to execute at 1:10am UTC. This is per GitHub's recommendation to avoid hotspots. The docs mention that when GH backend gets overloaded, they can drop scheduled jobs which we want to avoid so we scheduled off hour, off midnight. 2. The plan is to let these nightlies run once and, if successful, then disable them in GitHub UI to perform validation. Also disable if things break and need to be fixed of course. 
Reviewed By: algoriddle Differential Revision: D57602833 fbshipit-source-id: 4f4d9abbaa5ed3d1edb024ea4dd3f87aa78dd9b5 --- .github/workflows/nightly.yml | 138 ++++++++++++++++++++++++++++++++++ 1 file changed, 138 insertions(+) create mode 100644 .github/workflows/nightly.yml diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml new file mode 100644 index 0000000000..9b0d2d0d31 --- /dev/null +++ b/.github/workflows/nightly.yml @@ -0,0 +1,138 @@ +name: Nightly +on: + schedule: + - cron: '1 10 * * *' +env: + OMP_NUM_THREADS: '10' + MKL_THREADING_LAYER: GNU +jobs: + linux-x86_64-nightly: + name: Linux x86_64 nightlies + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + fetch-tags: true + - uses: ./.github/actions/build_conda + env: + ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }} + with: + label: nightly + linux-x86_64-GPU-CUDA-11-4-4-nightly: + name: Linux x86_64 GPU nightlies (CUDA 11.4.4) + runs-on: 4-core-ubuntu-gpu-t4 + env: + CUDA_ARCHS: "60-real;61-real;62-real;70-real;72-real;75-real;80;86-real" + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + fetch-tags: true + - uses: ./.github/actions/build_conda + env: + ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }} + with: + label: nightly + cuda: "11.4.4" + compiler_version: "11.2" + linux-x86_64-GPU-RAFT-CUDA11-8-0-nightly: + name: Linux x86_64 GPU w/ RAFT nightlies (CUDA 11.8.0) + runs-on: 4-core-ubuntu-gpu-t4 + env: + CUDA_ARCHS: "70-real;72-real;75-real;80;86-real" + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + fetch-tags: true + - uses: ./.github/actions/build_conda + env: + ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }} + with: + label: nightly + raft: "ON" + cuda: "11.8.0" + compiler_version: "11.2" + linux-x86_64-GPU-CUDA-12-1-1-nightly: + name: Linux x86_64 GPU nightlies (CUDA 12.1.1) + runs-on: 4-core-ubuntu-gpu-t4 + env: + CUDA_ARCHS: "70-real;72-real;75-real;80;86-real" + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + fetch-tags: true + - uses: ./.github/actions/build_conda + env: + ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }} + with: + label: nightly + cuda: "12.1.1" + compiler_version: "11.2" + linux-x86_64-GPU-RAFT-CUDA12-1-1-nightly: + name: Linux x86_64 GPU w/ RAFT nightlies (CUDA 12.1.1) + runs-on: 4-core-ubuntu-gpu-t4 + env: + CUDA_ARCHS: "70-real;72-real;75-real;80;86-real" + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + fetch-tags: true + - uses: ./.github/actions/build_conda + env: + ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }} + with: + label: nightly + raft: "ON" + cuda: "12.1.1" + compiler_version: "11.2" + windows-x86_64-nightly: + name: Windows x86_64 nightlies + runs-on: windows-2019 + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + fetch-tags: true + - uses: ./.github/actions/build_conda + env: + ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }} + with: + label: nightly + osx-arm64-nightly: + name: OSX arm64 nightlies + runs-on: macos-14 + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + fetch-tags: true + - uses: ./.github/actions/build_conda + env: + ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }} + with: + label: nightly + linux-arm64-nightly: + name: Linux arm64 nightlies + runs-on: 2-core-ubuntu-arm + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + 
fetch-tags: true + - uses: ./.github/actions/build_conda + env: + ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }} + with: + label: nightly From 4489773fd43165dc8b053c408ca1033eee9fa2b0 Mon Sep 17 00:00:00 2001 From: Xiao Fu Date: Tue, 21 May 2024 09:50:41 -0700 Subject: [PATCH 085/116] Add tutorial for FastScan (#3465) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3465 This commit include python version of tutorial for FastScan. It includes all the parameters enabled within PQFastScan. Reviewed By: junjieqi Differential Revision: D57594044 fbshipit-source-id: cb12679b6fc241a654b9545c5bc7bd0517aa1813 --- tutorial/python/7-PQFastScan.py | 35 +++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 tutorial/python/7-PQFastScan.py diff --git a/tutorial/python/7-PQFastScan.py b/tutorial/python/7-PQFastScan.py new file mode 100644 index 0000000000..34d7a34ac1 --- /dev/null +++ b/tutorial/python/7-PQFastScan.py @@ -0,0 +1,35 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import faiss +import numpy as np + +d = 64 # dimension +nb = 100000 # database size +nq = 10000 # nb of queries +np.random.seed(1234) # make reproducible +xb = np.random.random((nb, d)).astype('float32') # 64-dim *nb queries +xb[:, 0] += np.arange(nb) / 1000. +xq = np.random.random((nq, d)).astype('float32') +xq[:, 0] += np.arange(nq) / 1000. + +m = 8 # 8 specifies that the number of sub-vector is 8 +k = 4 # number of dimension in etracted vector +n_bit = 4 # 4 specifies that each sub-vector is encoded as 4 bits +bbs = 32 # build block size ( bbs % 32 == 0 ) for PQ +index = faiss.IndexPQFastScan(d, m, n_bit, faiss.METRIC_L2, bbs) +# construct FastScan Index + +assert not index.is_trained +index.train(xb) # Train vectors data index within mockup database +assert index.is_trained + +index.add(xb) +D, I = index.search(xb[:5], k) # sanity check +print(I) +print(D) +index.nprobe = 10 # make comparable with experiment above +D, I = index.search(xq, k) # search +print(I[-5:]) # neighbors of the 5 last queries From 59e3ee1e30bd6586ba7bde9e3196b7ed676160e7 Mon Sep 17 00:00:00 2001 From: Saarth Deshpande Date: Tue, 21 May 2024 13:02:25 -0700 Subject: [PATCH 086/116] Missed printing 'D' (#3433) Summary: 'I' was printed twice and 'D' (distance vector) was not printed. Fixed. 
Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3433 Reviewed By: fxdawnn Differential Revision: D57451544 Pulled By: junjieqi fbshipit-source-id: fc17b3b467f8b2c4ad7d80b44866456d9146e530 --- tutorial/cpp/1-Flat.cpp | 4 ++-- tutorial/cpp/2-IVFFlat.cpp | 7 ++----- tutorial/cpp/6-HNSW.cpp | 6 ++---- 3 files changed, 6 insertions(+), 11 deletions(-) diff --git a/tutorial/cpp/1-Flat.cpp b/tutorial/cpp/1-Flat.cpp index 819e419573..147fa89bc0 100644 --- a/tutorial/cpp/1-Flat.cpp +++ b/tutorial/cpp/1-Flat.cpp @@ -83,10 +83,10 @@ int main() { printf("\n"); } - printf("I (5 last results)=\n"); + printf("D (5 last results)=\n"); for (int i = nq - 5; i < nq; i++) { for (int j = 0; j < k; j++) - printf("%5zd ", I[i * k + j]); + printf("%5f ", D[i * k + j]); printf("\n"); } diff --git a/tutorial/cpp/2-IVFFlat.cpp b/tutorial/cpp/2-IVFFlat.cpp index febd5be049..86530ae985 100644 --- a/tutorial/cpp/2-IVFFlat.cpp +++ b/tutorial/cpp/2-IVFFlat.cpp @@ -61,13 +61,10 @@ int main() { printf("\n"); } - index.nprobe = 10; - index.search(nq, xq, k, D, I); - - printf("I=\n"); + printf("D=\n"); for (int i = nq - 5; i < nq; i++) { for (int j = 0; j < k; j++) - printf("%5zd ", I[i * k + j]); + printf("%5f ", D[i * k + j]); printf("\n"); } diff --git a/tutorial/cpp/6-HNSW.cpp b/tutorial/cpp/6-HNSW.cpp index 1b3434a433..9bd8cd3faa 100644 --- a/tutorial/cpp/6-HNSW.cpp +++ b/tutorial/cpp/6-HNSW.cpp @@ -55,12 +55,10 @@ int main() { printf("\n"); } - index.search(nq, xq, k, D, I); - - printf("I=\n"); + printf("D=\n"); for (int i = nq - 5; i < nq; i++) { for (int j = 0; j < k; j++) - printf("%5zd ", I[i * k + j]); + printf("%5f ", D[i * k + j]); printf("\n"); } From f38e52c1e2eed726dd435f10757280c9e8fe5b17 Mon Sep 17 00:00:00 2001 From: Xiao Fu Date: Tue, 21 May 2024 16:42:15 -0700 Subject: [PATCH 087/116] Add tutorial on PQFastScan for cpp (#3468) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3468 This commit includes the tutorial for PQFastScan in the cpp environment. Reviewed By: junjieqi Differential Revision: D57631441 fbshipit-source-id: f5e17eee2a584ebfc9ff63868d741d0da6b3b413 --- tutorial/cpp/7-PQFastScan.cpp | 75 +++++++++++++++++++++++++++++++++++ tutorial/cpp/CMakeLists.txt | 3 ++ 2 files changed, 78 insertions(+) create mode 100644 tutorial/cpp/7-PQFastScan.cpp diff --git a/tutorial/cpp/7-PQFastScan.cpp b/tutorial/cpp/7-PQFastScan.cpp new file mode 100644 index 0000000000..4cdfea052e --- /dev/null +++ b/tutorial/cpp/7-PQFastScan.cpp @@ -0,0 +1,75 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include + +#include + +using idx_t = faiss::idx_t; + +int main() { + int d = 64; // dimension + int nb = 100000; // database size + int nq = 10000; // nb of queries + + std::mt19937 rng; + std::uniform_real_distribution<> distrib; + + float* xb = new float[(int)(d * nb)]; + float* xq = new float[(int)(d * nq)]; + + for (int i = 0; i < nb; i++) { + for (int j = 0; j < d; j++) { + xb[d * i + j] = distrib(rng); + } + xb[d * i] += i / 1000.; + } + + for (int i = 0; i < nq; i++) { + for (int j = 0; j < d; j++) { + xq[d * i + j] = distrib(rng); + } + xq[d * i] += i / 1000.; + } + + int m = 8; + int n_bit = 4; + + faiss::IndexPQFastScan index(d, m, n_bit); + printf("Index is trained? %s\n", index.is_trained ? "true" : "false"); + index.train(nb, xb); + printf("Index is trained? 
%s\n", index.is_trained ? "true" : "false"); + index.add(nb, xb); + + int k = 4; + + { // search xq + idx_t* I = new idx_t[(int)(k * nq)]; + float* D = new float[(int)(k * nq)]; + + index.search(nq, xq, k, D, I); + + printf("I=\n"); + for (int i = nq - 5; i < nq; i++) { + for (int j = 0; j < k; j++) { + printf("%5zd ", I[i * k + j]); + } + printf("\n"); + } + + delete[] I; + delete[] D; + } + + delete[] xb; + delete[] xq; + + return 0; +} // namespace facebook::detail diff --git a/tutorial/cpp/CMakeLists.txt b/tutorial/cpp/CMakeLists.txt index abcb253826..894fb4168e 100644 --- a/tutorial/cpp/CMakeLists.txt +++ b/tutorial/cpp/CMakeLists.txt @@ -21,3 +21,6 @@ target_link_libraries(5-Multiple-GPUs PRIVATE faiss) add_executable(6-HNSW EXCLUDE_FROM_ALL 6-HNSW.cpp) target_link_libraries(6-HNSW PRIVATE faiss) + +add_executable(7-PQFastScan EXCLUDE_FROM_ALL 7-PQFastScan.cpp) +target_link_libraries(7-PQFastScan PRIVATE faiss) From 7d7fef0b5869b19e81a24c875808c0fa7bddacf7 Mon Sep 17 00:00:00 2001 From: Xiao Fu Date: Wed, 22 May 2024 09:30:18 -0700 Subject: [PATCH 088/116] Add FastScan refinement tutorial for python (#3469) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3469 Reviewed By: junjieqi Differential Revision: D57650807 fbshipit-source-id: 5e642a8140455e4a3f1f21afe2f06771462e61f4 --- tutorial/python/8-PQFastScanRefine.py | 38 +++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 tutorial/python/8-PQFastScanRefine.py diff --git a/tutorial/python/8-PQFastScanRefine.py b/tutorial/python/8-PQFastScanRefine.py new file mode 100644 index 0000000000..115a036fa7 --- /dev/null +++ b/tutorial/python/8-PQFastScanRefine.py @@ -0,0 +1,38 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import faiss +import numpy as np + +d = 64 # dimension +nb = 100000 # database size +nq = 10000 # nb of queries +np.random.seed(1234) # make reproducible +xb = np.random.random((nb, d)).astype('float32') # 64-dim *nb queries +xb[:, 0] += np.arange(nb) / 1000. +xq = np.random.random((nq, d)).astype('float32') +xq[:, 0] += np.arange(nq) / 1000. + +m = 8 # 8 specifies that the number of sub-vector is 8 +k = 4 # number of dimension in etracted vector +n_bit = 4 # 4 specifies that each sub-vector is encoded as 4 bits +bbs = 32 # build block size ( bbs % 32 == 0 ) for PQ + +index = faiss.IndexPQFastScan(d, m, n_bit, faiss.METRIC_L2) +index_refine = faiss.IndexRefineFlat(index) +# construct FastScan and run index refinement + +assert not index_refine.is_trained +index_refine.train(xb) # Train vectors data index within mockup database +assert index_refine.is_trained + +index_refine.add(xb) +params = faiss.IndexRefineSearchParameters(k_factor=3) +D, I = index_refine.search(xq[:5], 10, params=params) +print(I) +print(D) +index.nprobe = 10 # make comparable with experiment above +D, I = index.search(xq[:5], k) # search +print(I[-5:]) From f352168c64c922bcccf7d70d8b04e84c10d4a7f8 Mon Sep 17 00:00:00 2001 From: Ramil Bakhshyiev Date: Wed, 22 May 2024 10:45:41 -0700 Subject: [PATCH 089/116] Fix cron schedule for nightlies via GitHub Actions (#3470) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3470 Hour and minute values are swapped, the goal is to run is at 1:10am UTC. 
Reviewed By: algoriddle Differential Revision: D57654059 fbshipit-source-id: 23bcb42e5c95f731cd4713ad4691d0f475ed8ad2 --- .github/workflows/nightly.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index 9b0d2d0d31..fcec4ba3c6 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -1,7 +1,7 @@ name: Nightly on: schedule: - - cron: '1 10 * * *' + - cron: '10 1 * * *' env: OMP_NUM_THREADS: '10' MKL_THREADING_LAYER: GNU From b39dd4dd57b08426eb3a0ea27ceba3063d823bcb Mon Sep 17 00:00:00 2001 From: Ramil Bakhshyiev Date: Wed, 22 May 2024 11:50:25 -0700 Subject: [PATCH 090/116] Fix CUDA 11.4.4 nightly in GitHub Actions (#3473) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3473 Previous diff (D57602154) fixed the CircleCI version and the PR build version of GHA but not the nightly one. Reviewed By: junjieqi Differential Revision: D57680576 fbshipit-source-id: 39f49c20df824c915f536b1ed3ffc35db2907988 --- .github/workflows/nightly.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index fcec4ba3c6..eabee07744 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -25,6 +25,7 @@ jobs: runs-on: 4-core-ubuntu-gpu-t4 env: CUDA_ARCHS: "60-real;61-real;62-real;70-real;72-real;75-real;80;86-real" + FAISS_FLATTEN_CONDA_INCLUDES: "1" steps: - name: Checkout uses: actions/checkout@v4 From 414fd1e3b796a045271c545d071fbb8826ca23f0 Mon Sep 17 00:00:00 2001 From: Xiao Fu Date: Wed, 22 May 2024 13:01:37 -0700 Subject: [PATCH 091/116] Add tutorial for FastScan with refinement for cpp (#3474) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3474 This commit focus on the cpp version of PQfastscan tutorial with index refinement by defining the k factor. Reviewed By: junjieqi Differential Revision: D57680905 fbshipit-source-id: 980c2990172f24ec9a4f870685e706195883408f --- tutorial/cpp/8-PQFastScanRefine.cpp | 84 +++++++++++++++++++++++++++++ tutorial/cpp/CMakeLists.txt | 3 ++ 2 files changed, 87 insertions(+) create mode 100644 tutorial/cpp/8-PQFastScanRefine.cpp diff --git a/tutorial/cpp/8-PQFastScanRefine.cpp b/tutorial/cpp/8-PQFastScanRefine.cpp new file mode 100644 index 0000000000..2435d94d2c --- /dev/null +++ b/tutorial/cpp/8-PQFastScanRefine.cpp @@ -0,0 +1,84 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include + +#include +#include + +using idx_t = faiss::idx_t; + +int main() { + int d = 64; // dimension + int nb = 100000; // database size + int nq = 10000; // nb of queries + + std::mt19937 rng; + std::uniform_real_distribution<> distrib; + + float* xb = new float[(int)(d * nb)]; + float* xq = new float[(int)(d * nq)]; + + for (int i = 0; i < nb; i++) { + for (int j = 0; j < d; j++) { + xb[d * i + j] = distrib(rng); + } + xb[d * i] += i / 1000.; + } + + for (int i = 0; i < nq; i++) { + for (int j = 0; j < d; j++) { + xq[d * i + j] = distrib(rng); + } + xq[d * i] += i / 1000.; + } + + int m = 8; + int n_bit = 4; + + faiss::IndexPQFastScan index(d, m, n_bit); + faiss::IndexRefineFlat index_refine(&index); + // refine index after PQFastScan + + printf("Index is trained? %s\n", + index_refine.is_trained ? 
"true" : "false"); + index_refine.train(nb, xb); + printf("Index is trained? %s\n", + index_refine.is_trained ? "true" : "false"); + index_refine.add(nb, xb); + + int k = 4; + { // search xq + idx_t* I = new idx_t[(int)(k * nq)]; + float* D = new float[(int)(k * nq)]; + float k_factor = 3; + faiss::IndexRefineSearchParameters* params = + new faiss::IndexRefineSearchParameters(); + params->k_factor = k_factor; + index_refine.search(nq, xq, k, D, I, params); + + printf("I=\n"); + for (int i = nq - 5; i < nq; i++) { + for (int j = 0; j < k; j++) { + printf("%5zd ", I[i * k + j]); + } + printf("\n"); + } + + delete[] I; + delete[] D; + delete params; + } + + delete[] xb; + delete[] xq; + + return 0; +} diff --git a/tutorial/cpp/CMakeLists.txt b/tutorial/cpp/CMakeLists.txt index 894fb4168e..ad152c499d 100644 --- a/tutorial/cpp/CMakeLists.txt +++ b/tutorial/cpp/CMakeLists.txt @@ -24,3 +24,6 @@ target_link_libraries(6-HNSW PRIVATE faiss) add_executable(7-PQFastScan EXCLUDE_FROM_ALL 7-PQFastScan.cpp) target_link_libraries(7-PQFastScan PRIVATE faiss) + +add_executable(8-PQFastScanRefine EXCLUDE_FROM_ALL 8-PQFastScanRefine.cpp) +target_link_libraries(8-PQFastScanRefine PRIVATE faiss) From 6a94c67a2fa87af1f108fe5fa1d307f44509d729 Mon Sep 17 00:00:00 2001 From: Alexandr Guzhva Date: Thu, 23 May 2024 02:59:15 -0700 Subject: [PATCH 092/116] QT_bf16 for scalar quantizer for bfloat16 (#3444) Summary: mdouze Please let me know if any additional unit tests are needed Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3444 Reviewed By: algoriddle Differential Revision: D57665641 Pulled By: mdouze fbshipit-source-id: 9bec91306a1c31ea4f1f1d726c9d60ac6415fdfc --- benchs/bench_fw/optimize.py | 1 + c_api/IndexScalarQuantizer_c.h | 1 + contrib/factory_tools.py | 3 ++ faiss/CMakeLists.txt | 1 + faiss/IndexScalarQuantizer.cpp | 3 +- faiss/impl/ScalarQuantizer.cpp | 83 ++++++++++++++++++++++++++++++++++ faiss/impl/ScalarQuantizer.h | 1 + faiss/index_factory.cpp | 3 +- faiss/utils/bf16.h | 36 +++++++++++++++ tests/test_index.py | 6 ++- tests/test_standalone_codec.py | 3 ++ 11 files changed, 137 insertions(+), 4 deletions(-) create mode 100644 faiss/utils/bf16.h diff --git a/benchs/bench_fw/optimize.py b/benchs/bench_fw/optimize.py index 473436ea68..a2653b7144 100644 --- a/benchs/bench_fw/optimize.py +++ b/benchs/bench_fw/optimize.py @@ -226,6 +226,7 @@ def optimize_codec( [ (None, "Flat"), (None, "SQfp16"), + (None, "SQbf16"), (None, "SQ8"), ] + [ (f"OPQ{M}_{M * dim}", f"PQ{M}x{b}") diff --git a/c_api/IndexScalarQuantizer_c.h b/c_api/IndexScalarQuantizer_c.h index 2c5e3f2942..87fe6d3415 100644 --- a/c_api/IndexScalarQuantizer_c.h +++ b/c_api/IndexScalarQuantizer_c.h @@ -26,6 +26,7 @@ typedef enum FaissQuantizerType { QT_fp16, QT_8bit_direct, ///< fast indexing of uint8s QT_6bit, ///< 6 bits per component + QT_bf16, } FaissQuantizerType; // forward declaration diff --git a/contrib/factory_tools.py b/contrib/factory_tools.py index 745dc7f7ff..cfad7c7b5c 100644 --- a/contrib/factory_tools.py +++ b/contrib/factory_tools.py @@ -56,6 +56,8 @@ def get_code_size(d, indexkey): return (d * 6 + 7) // 8 elif indexkey == 'SQfp16': return d * 2 + elif indexkey == 'SQbf16': + return d * 2 mo = re.match('PCAR?(\\d+),(.*)$', indexkey) if mo: @@ -140,6 +142,7 @@ def reverse_index_factory(index): faiss.ScalarQuantizer.QT_4bit: "4", faiss.ScalarQuantizer.QT_6bit: "6", faiss.ScalarQuantizer.QT_fp16: "fp16", + faiss.ScalarQuantizer.QT_bf16: "bf16", } return f"SQ{sqtypes[index.sq.qtype]}" diff --git a/faiss/CMakeLists.txt 
b/faiss/CMakeLists.txt index 33e1849568..1b0860f3fb 100644 --- a/faiss/CMakeLists.txt +++ b/faiss/CMakeLists.txt @@ -183,6 +183,7 @@ set(FAISS_HEADERS invlists/InvertedLists.h invlists/InvertedListsIOHook.h utils/AlignedTable.h + utils/bf16.h utils/Heap.h utils/WorkerThread.h utils/distances.h diff --git a/faiss/IndexScalarQuantizer.cpp b/faiss/IndexScalarQuantizer.cpp index 9203a98932..7ce838db5e 100644 --- a/faiss/IndexScalarQuantizer.cpp +++ b/faiss/IndexScalarQuantizer.cpp @@ -32,7 +32,8 @@ IndexScalarQuantizer::IndexScalarQuantizer( MetricType metric) : IndexFlatCodes(0, d, metric), sq(d, qtype) { is_trained = qtype == ScalarQuantizer::QT_fp16 || - qtype == ScalarQuantizer::QT_8bit_direct; + qtype == ScalarQuantizer::QT_8bit_direct || + qtype == ScalarQuantizer::QT_bf16; code_size = sq.code_size; } diff --git a/faiss/impl/ScalarQuantizer.cpp b/faiss/impl/ScalarQuantizer.cpp index e3b29e621d..7ad50189e4 100644 --- a/faiss/impl/ScalarQuantizer.cpp +++ b/faiss/impl/ScalarQuantizer.cpp @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -496,6 +497,72 @@ struct QuantizerFP16<8> : QuantizerFP16<1> { }; #endif +/******************************************************************* + * BF16 quantizer + *******************************************************************/ + +template +struct QuantizerBF16 {}; + +template <> +struct QuantizerBF16<1> : ScalarQuantizer::SQuantizer { + const size_t d; + + QuantizerBF16(size_t d, const std::vector& /* unused */) : d(d) {} + + void encode_vector(const float* x, uint8_t* code) const final { + for (size_t i = 0; i < d; i++) { + ((uint16_t*)code)[i] = encode_bf16(x[i]); + } + } + + void decode_vector(const uint8_t* code, float* x) const final { + for (size_t i = 0; i < d; i++) { + x[i] = decode_bf16(((uint16_t*)code)[i]); + } + } + + FAISS_ALWAYS_INLINE float reconstruct_component(const uint8_t* code, int i) + const { + return decode_bf16(((uint16_t*)code)[i]); + } +}; + +#ifdef __AVX2__ + +template <> +struct QuantizerBF16<8> : QuantizerBF16<1> { + QuantizerBF16(size_t d, const std::vector& trained) + : QuantizerBF16<1>(d, trained) {} + + FAISS_ALWAYS_INLINE __m256 + reconstruct_8_components(const uint8_t* code, int i) const { + __m128i code_128i = _mm_loadu_si128((const __m128i*)(code + 2 * i)); + __m256i code_256i = _mm256_cvtepu16_epi32(code_128i); + code_256i = _mm256_slli_epi32(code_256i, 16); + return _mm256_castsi256_ps(code_256i); + } +}; + +#endif + +#ifdef __aarch64__ + +template <> +struct QuantizerBF16<8> : QuantizerBF16<1> { + QuantizerBF16(size_t d, const std::vector& trained) + : QuantizerBF16<1>(d, trained) {} + + FAISS_ALWAYS_INLINE float32x4x2_t + reconstruct_8_components(const uint8_t* code, int i) const { + uint16x4x2_t codei = vld1_u16_x2((const uint16_t*)(code + 2 * i)); + return {vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(codei.val[0]), 16)), + vreinterpretq_f32_u32( + vshlq_n_u32(vmovl_u16(codei.val[1]), 16))}; + } +}; +#endif + /******************************************************************* * 8bit_direct quantizer *******************************************************************/ @@ -589,6 +656,8 @@ ScalarQuantizer::SQuantizer* select_quantizer_1( d, trained); case ScalarQuantizer::QT_fp16: return new QuantizerFP16(d, trained); + case ScalarQuantizer::QT_bf16: + return new QuantizerBF16(d, trained); case ScalarQuantizer::QT_8bit_direct: return new Quantizer8bitDirect(d, trained); } @@ -1378,6 +1447,10 @@ SQDistanceComputer* select_distance_computer( return new DCTemplate, Sim, SIMDWIDTH>( d, 
trained); + case ScalarQuantizer::QT_bf16: + return new DCTemplate, Sim, SIMDWIDTH>( + d, trained); + case ScalarQuantizer::QT_8bit_direct: if (d % 16 == 0) { return new DistanceComputerByte(d, trained); @@ -1426,6 +1499,10 @@ void ScalarQuantizer::set_derived_sizes() { code_size = d * 2; bits = 16; break; + case QT_bf16: + code_size = d * 2; + bits = 16; + break; } } @@ -1462,6 +1539,7 @@ void ScalarQuantizer::train(size_t n, const float* x) { break; case QT_fp16: case QT_8bit_direct: + case QT_bf16: // no training necessary break; } @@ -1791,6 +1869,11 @@ InvertedListScanner* sel1_InvertedListScanner( QuantizerFP16, Similarity, SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); + case ScalarQuantizer::QT_bf16: + return sel2_InvertedListScanner, + Similarity, + SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); case ScalarQuantizer::QT_8bit_direct: if (sq->d % 16 == 0) { return sel2_InvertedListScanner< diff --git a/faiss/impl/ScalarQuantizer.h b/faiss/impl/ScalarQuantizer.h index 550a979092..49fd42cc31 100644 --- a/faiss/impl/ScalarQuantizer.h +++ b/faiss/impl/ScalarQuantizer.h @@ -32,6 +32,7 @@ struct ScalarQuantizer : Quantizer { QT_fp16, QT_8bit_direct, ///< fast indexing of uint8s QT_6bit, ///< 6 bits per component + QT_bf16, }; QuantizerType qtype = QT_8bit; diff --git a/faiss/index_factory.cpp b/faiss/index_factory.cpp index 0d61b73ecd..d88fe7b393 100644 --- a/faiss/index_factory.cpp +++ b/faiss/index_factory.cpp @@ -140,8 +140,9 @@ std::map sq_types = { {"SQ4", ScalarQuantizer::QT_4bit}, {"SQ6", ScalarQuantizer::QT_6bit}, {"SQfp16", ScalarQuantizer::QT_fp16}, + {"SQbf16", ScalarQuantizer::QT_bf16}, }; -const std::string sq_pattern = "(SQ4|SQ8|SQ6|SQfp16)"; +const std::string sq_pattern = "(SQ4|SQ8|SQ6|SQfp16|SQbf16)"; std::map aq_search_type = { {"_Nfloat", AdditiveQuantizer::ST_norm_float}, diff --git a/faiss/utils/bf16.h b/faiss/utils/bf16.h new file mode 100644 index 0000000000..ff0fbe898b --- /dev/null +++ b/faiss/utils/bf16.h @@ -0,0 +1,36 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include + +namespace faiss { + +namespace { + +union fp32_bits { + uint32_t as_u32; + float as_f32; +}; + +} // namespace + +inline uint16_t encode_bf16(const float f) { + // Round off + fp32_bits fp; + fp.as_f32 = f; + return static_cast((fp.as_u32 + 0x8000) >> 16); +} + +inline float decode_bf16(const uint16_t v) { + fp32_bits fp; + fp.as_u32 = (uint32_t(v) << 16); + return fp.as_f32; +} + +} // namespace faiss diff --git a/tests/test_index.py b/tests/test_index.py index b9f3dbd46b..43db906e47 100644 --- a/tests/test_index.py +++ b/tests/test_index.py @@ -327,7 +327,7 @@ def test_4variants_ivf(self): D, I = index.search(xq, 10) nok['flat'] = (I[:, 0] == I_ref[:, 0]).sum() - for qname in "QT_4bit QT_4bit_uniform QT_8bit QT_8bit_uniform QT_fp16".split(): + for qname in "QT_4bit QT_4bit_uniform QT_8bit QT_8bit_uniform QT_fp16 QT_bf16".split(): qtype = getattr(faiss.ScalarQuantizer, qname) index = faiss.IndexIVFScalarQuantizer(quantizer, d, ncent, qtype, faiss.METRIC_L2) @@ -349,6 +349,7 @@ def test_4variants_ivf(self): self.assertGreaterEqual(nok['QT_8bit'], nok['QT_8bit_uniform']) self.assertGreaterEqual(nok['QT_4bit'], nok['QT_4bit_uniform']) self.assertGreaterEqual(nok['QT_fp16'], nok['QT_8bit']) + self.assertGreaterEqual(nok['QT_bf16'], nok['QT_8bit']) def test_4variants(self): d = 32 @@ -364,7 +365,7 @@ def test_4variants(self): nok = {} - for qname in "QT_4bit QT_4bit_uniform QT_8bit QT_8bit_uniform QT_fp16".split(): + for qname in "QT_4bit QT_4bit_uniform QT_8bit QT_8bit_uniform QT_fp16 QT_bf16".split(): qtype = getattr(faiss.ScalarQuantizer, qname) index = faiss.IndexScalarQuantizer(d, qtype, faiss.METRIC_L2) index.train(xt) @@ -377,6 +378,7 @@ def test_4variants(self): self.assertGreaterEqual(nok['QT_8bit'], nok['QT_8bit_uniform']) self.assertGreaterEqual(nok['QT_4bit'], nok['QT_4bit_uniform']) self.assertGreaterEqual(nok['QT_fp16'], nok['QT_8bit']) + self.assertGreaterEqual(nok['QT_bf16'], nq * 0.9) class TestRangeSearch(unittest.TestCase): diff --git a/tests/test_standalone_codec.py b/tests/test_standalone_codec.py index 2176a12e99..391b88b9dd 100644 --- a/tests/test_standalone_codec.py +++ b/tests/test_standalone_codec.py @@ -173,6 +173,9 @@ def test_SQ2(self): def test_SQ3(self): self.compare_accuracy('SQ8', 'SQfp16') + def test_SQ4(self): + self.compare_accuracy('SQ8', 'SQbf16') + def test_PQ(self): self.compare_accuracy('PQ6x8np', 'PQ8x8np') From ee7ce21acd00ee9d4f84091647de376a80074df2 Mon Sep 17 00:00:00 2001 From: Ramil Bakhshyiev Date: Thu, 23 May 2024 06:42:48 -0700 Subject: [PATCH 093/116] Add display names to all PR build jobs on GitHub Actions (#3475) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3475 Migration to GitHub Reviewed By: algoriddle Differential Revision: D57707064 fbshipit-source-id: 17f0a97028007f3664faa5b6b2c269f50bcdf39e --- .github/workflows/build.yml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 39f865bc4a..b16dee9f2b 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -12,6 +12,7 @@ env: MKL_THREADING_LAYER: GNU jobs: format: + name: Format runs-on: ubuntu-latest steps: - name: Checkout @@ -37,12 +38,14 @@ jobs: exit 1 fi linux-x86_64-cmake: + name: Linux x86_64 (cmake) runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 - uses: ./.github/actions/build_cmake linux-x86_64-AVX2-cmake: + name: Linux x86_64 AVX2 (cmake) runs-on: ubuntu-latest steps: - name: 
Checkout @@ -51,6 +54,7 @@ jobs: with: opt_level: avx2 linux-x86_64-AVX512-cmake: + name: Linux x86_64 AVX512 (cmake) if: false # TODO: enable when GitHub Actions adds AVX-512 hosts runs-on: ubuntu-latest steps: @@ -60,6 +64,7 @@ jobs: with: opt_level: avx512 linux-x86_64-GPU-cmake: + name: Linux x86_64 GPU (cmake) needs: linux-x86_64-AVX2-cmake runs-on: 4-core-ubuntu-gpu-t4 steps: @@ -69,6 +74,7 @@ jobs: with: gpu: ON linux-x86_64-GPU-w-RAFT-cmake: + name: Linux x86_64 GPU w/ RAFT (cmake) runs-on: 4-core-ubuntu-gpu-t4 steps: - name: Checkout @@ -78,6 +84,7 @@ jobs: gpu: ON raft: ON linux-x86_64-conda: + name: Linux x86_64 (conda) runs-on: ubuntu-latest steps: - name: Checkout @@ -87,6 +94,7 @@ jobs: fetch-tags: true - uses: ./.github/actions/build_conda windows-x86_64-conda: + name: Windows x86_64 (conda) runs-on: windows-2019 steps: - name: Checkout @@ -96,6 +104,7 @@ jobs: fetch-tags: true - uses: ./.github/actions/build_conda linux-arm64-conda: + name: Linux arm64 (conda) runs-on: 2-core-ubuntu-arm steps: - name: Checkout @@ -105,6 +114,7 @@ jobs: fetch-tags: true - uses: ./.github/actions/build_conda linux-x86_64-packages: + name: Linux x86_64 packages if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') runs-on: ubuntu-latest steps: @@ -117,6 +127,7 @@ jobs: with: label: main linux-x86_64-GPU-packages-CUDA-11-4-4: + name: Linux x86_64 GPU packages (CUDA 11.4.4) if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') runs-on: 4-core-ubuntu-gpu-t4 env: @@ -134,6 +145,7 @@ jobs: cuda: "11.4.4" compiler_version: "11.2" linux-x86_64-GPU-RAFT-packages-CUDA11-8-0: + name: Linux x86_64 GPU w/ RAFT packages (CUDA 11.8.0) if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') runs-on: 4-core-ubuntu-gpu-t4 env: @@ -151,6 +163,7 @@ jobs: cuda: "11.8.0" compiler_version: "11.2" linux-x86_64-GPU-packages-CUDA-12-1-1: + name: Linux x86_64 GPU packages (CUDA 12.1.1) if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') runs-on: 4-core-ubuntu-gpu-t4 env: @@ -167,6 +180,7 @@ jobs: cuda: "12.1.1" compiler_version: "11.2" linux-x86_64-GPU-RAFT-packages-CUDA12-1-1: + name: Linux x86_64 GPU w/ RAFT packages (CUDA 12.1.1) if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') runs-on: 4-core-ubuntu-gpu-t4 env: @@ -184,6 +198,7 @@ jobs: cuda: "12.1.1" compiler_version: "11.2" windows-x86_64-packages: + name: Windows x86_64 packages if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') runs-on: windows-2019 steps: @@ -196,6 +211,7 @@ jobs: with: label: main osx-arm64-packages: + name: OSX arm64 packages if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') runs-on: macos-14 steps: @@ -208,6 +224,7 @@ jobs: with: label: main linux-arm64-packages: + name: Linux arm64 packages if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') runs-on: 2-core-ubuntu-arm steps: From 93bc9b6470f3576f798e9a25e7879debb9676fe3 Mon Sep 17 00:00:00 2001 From: Ramil Bakhshyiev Date: Thu, 23 May 2024 06:42:48 -0700 Subject: [PATCH 094/116] Gate all PR builds behind linux-x86_64-cmake in GitHub Actions (#3476) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3476 The long tail will still be the RAFT build but we can save on cost if the build has errors by incurring ~10m penalty added by waiting for the basic cmake build to complete. Both GPU and RAFT builds will start together so this will take less time overall to complete. 
Reviewed By: algoriddle Differential Revision: D57707298 fbshipit-source-id: 3589842e9bda9ebca9b25e089e6177fe96b6a0f5 --- .github/workflows/build.yml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index b16dee9f2b..bd415dfce8 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -46,6 +46,7 @@ jobs: - uses: ./.github/actions/build_cmake linux-x86_64-AVX2-cmake: name: Linux x86_64 AVX2 (cmake) + needs: linux-x86_64-cmake runs-on: ubuntu-latest steps: - name: Checkout @@ -56,6 +57,7 @@ jobs: linux-x86_64-AVX512-cmake: name: Linux x86_64 AVX512 (cmake) if: false # TODO: enable when GitHub Actions adds AVX-512 hosts + needs: linux-x86_64-cmake runs-on: ubuntu-latest steps: - name: Checkout @@ -65,7 +67,7 @@ jobs: opt_level: avx512 linux-x86_64-GPU-cmake: name: Linux x86_64 GPU (cmake) - needs: linux-x86_64-AVX2-cmake + needs: linux-x86_64-cmake runs-on: 4-core-ubuntu-gpu-t4 steps: - name: Checkout @@ -75,6 +77,7 @@ jobs: gpu: ON linux-x86_64-GPU-w-RAFT-cmake: name: Linux x86_64 GPU w/ RAFT (cmake) + needs: linux-x86_64-cmake runs-on: 4-core-ubuntu-gpu-t4 steps: - name: Checkout @@ -85,6 +88,7 @@ jobs: raft: ON linux-x86_64-conda: name: Linux x86_64 (conda) + needs: linux-x86_64-cmake runs-on: ubuntu-latest steps: - name: Checkout @@ -95,6 +99,7 @@ jobs: - uses: ./.github/actions/build_conda windows-x86_64-conda: name: Windows x86_64 (conda) + needs: linux-x86_64-cmake runs-on: windows-2019 steps: - name: Checkout @@ -105,6 +110,7 @@ jobs: - uses: ./.github/actions/build_conda linux-arm64-conda: name: Linux arm64 (conda) + needs: linux-x86_64-cmake runs-on: 2-core-ubuntu-arm steps: - name: Checkout From eec4cba0253da49eee08d42d7181964c7d5ebe41 Mon Sep 17 00:00:00 2001 From: Ramil Bakhshyiev Date: Thu, 23 May 2024 06:42:48 -0700 Subject: [PATCH 095/116] Disable CircleCI builds (#3477) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3477 AVX-512 must remain on CircleCI until GitHub provides runners with AVX-512 support (ETA: Q1 2025). 
Reviewed By: algoriddle Differential Revision: D57707621 fbshipit-source-id: e8a0885f8363cf8f20854cccca3ec0adc946362b --- .circleci/config.yml | 433 ++----------------------------------------- 1 file changed, 12 insertions(+), 421 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 7e8bd8170a..0330939153 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -5,190 +5,8 @@ executors: docker: - image: continuumio/miniconda3 resource_class: large - linux-x86_64-gpu: - environment: - CONDA_ARCH: Linux-x86_64 - machine: - image: linux-cuda-12:default - resource_class: gpu.nvidia.medium - linux-arm64-cpu: - environment: - CONDA_ARCH: Linux-aarch64 - machine: - image: ubuntu-2204:current - resource_class: arm.medium - macosx-arm64-cpu: - environment: - CONDA_ARCH: MacOSX-arm64 - macos: - xcode: 14.2.0 # minimum supported for M1 - resource_class: macos.m1.large.gen1 - windows-x86_64-cpu: - machine: - image: windows-server-2019-vs2019:2023.04.1 - shell: bash.exe - resource_class: windows.medium jobs: - format: - docker: - - image: ubuntu:22.04 - steps: - - checkout - - run: - name: Install clang-format - command: | - apt-get update -y - apt-get install -y wget - apt install -y lsb-release wget software-properties-common gnupg - wget https://apt.llvm.org/llvm.sh - chmod u+x llvm.sh - ./llvm.sh 18 - apt-get install -y git-core clang-format-18 - - run: - name: Verify clang-format - command: | - git ls-files | grep -E '\.(cpp|h|cu|cuh)$' | xargs clang-format-18 -i - if git diff --quiet; then - echo "Formatting OK!" - else - echo "Formatting not OK!" - echo "------------------" - git --no-pager diff --color - exit 1 - fi - - build_conda: - parameters: - label: - type: string - default: "" - cuda: - type: string - default: "" - raft: - type: string - default: "" - cuda_archs: - type: string - default: "" - compiler_version: - type: string - default: "" - exec: - type: executor - executor: << parameters.exec >> - environment: - OMP_NUM_THREADS: 10 - PACKAGE_TYPE: <> - CUDA_ARCHS: <> - steps: - - checkout - - run: - name: Install conda - command: | - if [ -n "${CONDA_ARCH}" ] - then - curl https://repo.anaconda.com/miniconda/Miniconda3-latest-${CONDA_ARCH}.sh --output miniconda.sh - bash miniconda.sh -b -p $HOME/miniconda - ~/miniconda/bin/conda init - fi - - run: - name: Install conda build tools - command: | - # conda config --set solver libmamba - # conda config --set verbosity 3 - conda update -y -q conda - conda install -y -q conda-build - - when: - condition: << parameters.label >> - steps: - - run: - name: Enable anaconda uploads - command: | - conda install -y -q anaconda-client - conda config --set anaconda_upload yes - - when: - condition: - and: - - not: << parameters.label >> - - not: << parameters.cuda >> - steps: - - run: - name: Conda build (CPU) - no_output_timeout: 30m - command: | - cd conda - conda build faiss --python 3.11 -c pytorch - - when: - condition: - and: - - << parameters.label >> - - not: << parameters.cuda >> - steps: - - run: - name: Conda build (CPU) w/ anaconda upload - no_output_timeout: 30m - command: | - cd conda - conda build faiss --user pytorch --label <> -c pytorch - - when: - condition: - and: - - not: << parameters.label >> - - << parameters.cuda >> - - not: << parameters.raft >> - steps: - - run: - name: Conda build (GPU) - no_output_timeout: 60m - command: | - cd conda - conda build faiss-gpu --variants '{ "cudatoolkit": "<>", "c_compiler_version": "<>", "cxx_compiler_version": "<>" }' \ - -c pytorch -c nvidia/label/cuda-<> -c 
nvidia - - when: - condition: - and: - - << parameters.label >> - - << parameters.cuda >> - - not: << parameters.raft >> - steps: - - run: - name: Conda build (GPU) w/ anaconda upload - no_output_timeout: 60m - command: | - cd conda - conda build faiss-gpu --variants '{ "cudatoolkit": "<>", "c_compiler_version": "<>", "cxx_compiler_version": "<>" }' \ - --user pytorch --label <> -c pytorch -c nvidia/label/cuda-<> -c nvidia - - when: - condition: - and: - - not: << parameters.label >> - - << parameters.cuda >> - - << parameters.raft >> - steps: - - run: - name: Conda build (GPU w/ RAFT) - no_output_timeout: 60m - command: | - cd conda - conda build faiss-gpu-raft --variants '{ "cudatoolkit": "<>", "c_compiler_version": "<>", "cxx_compiler_version": "<>" }' \ - -c pytorch -c nvidia/label/cuda-<> -c nvidia -c rapidsai -c rapidsai-nightly -c conda-forge - - when: - condition: - and: - - << parameters.label >> - - << parameters.cuda >> - - << parameters.raft >> - steps: - - run: - name: Conda build (GPU w/ RAFT) w/ anaconda upload - no_output_timeout: 60m - command: | - cd conda - conda build faiss-gpu-raft --variants '{ "cudatoolkit": "<>", "c_compiler_version": "<>", "cxx_compiler_version": "<>" }' \ - --user pytorch --label <> -c pytorch -c nvidia/label/cuda-<> -c nvidia -c rapidsai -c rapidsai-nightly -c conda-forge - build_cmake: parameters: exec: @@ -196,12 +14,6 @@ jobs: opt_level: type: string default: generic - gpu: - type: string - default: "OFF" - raft: - type: string - default: "OFF" executor: << parameters.exec >> environment: OMP_NUM_THREADS: 10 @@ -222,32 +34,10 @@ jobs: command: | conda config --set solver libmamba conda update -y -q conda - - when: - condition: - equal: [ "OFF", << parameters.raft >> ] - steps: - - run: - name: Install env using main channel - command: | - conda install -y -q python=3.11 cmake make swig mkl=2023 mkl-devel=2023 numpy scipy pytest gxx_linux-64=11.2 sysroot_linux-64 - - when: - condition: - equal: [ "ON", << parameters.raft >> ] - steps: - - run: - name: Install env using conda-forge channel - command: | - conda install -y -q python=3.11 cmake make swig mkl=2023 mkl-devel=2023 numpy scipy pytest gxx_linux-64=11.2 sysroot_linux-64=2.28 libraft cuda-version=11.8 cuda-toolkit -c rapidsai-nightly -c "nvidia/label/cuda-11.8.0" -c conda-forge - - when: - condition: - and: - - equal: [ "ON", << parameters.gpu >> ] - - equal: [ "OFF", << parameters.raft >> ] - steps: - - run: - name: Install CUDA - command: | - conda install -y -q cuda-toolkit -c "nvidia/label/cuda-11.8.0" + - run: + name: Install env using main channel + command: | + conda install -y -q python=3.11 cmake make swig mkl=2023 mkl-devel=2023 numpy scipy pytest gxx_linux-64=11.2 sysroot_linux-64 - run: name: Build all targets no_output_timeout: 30m @@ -257,8 +47,8 @@ jobs: cmake -B build \ -DBUILD_TESTING=ON \ -DBUILD_SHARED_LIBS=ON \ - -DFAISS_ENABLE_GPU=<< parameters.gpu >> \ - -DFAISS_ENABLE_RAFT=<< parameters.raft >> \ + -DFAISS_ENABLE_GPU=OFF \ + -DFAISS_ENABLE_RAFT=OFF \ -DFAISS_OPT_LEVEL=<< parameters.opt_level >> \ -DFAISS_ENABLE_C_API=ON \ -DPYTHON_EXECUTABLE=$(which python) \ @@ -277,38 +67,12 @@ jobs: command: | cd build/faiss/python python setup.py install - - when: - condition: - equal: [ "OFF", << parameters.gpu >> ] - steps: - - run: - name: Python tests (CPU only) - command: | - conda install -y -q pytorch -c pytorch - pytest --junitxml=test-results/pytest/results.xml tests/test_*.py - pytest --junitxml=test-results/pytest/results-torch.xml tests/torch_*.py - - when: - 
condition: - equal: [ "ON", << parameters.gpu >> ] - steps: - - run: - name: Python tests (CPU + GPU) - command: | - conda install -y -q pytorch pytorch-cuda=11.8 -c pytorch -c nvidia/label/cuda-11.8.0 - pytest --junitxml=test-results/pytest/results.xml tests/test_*.py - pytest --junitxml=test-results/pytest/results-torch.xml tests/torch_*.py - cp tests/common_faiss_tests.py faiss/gpu/test - pytest --junitxml=test-results/pytest/results-gpu.xml faiss/gpu/test/test_*.py - pytest --junitxml=test-results/pytest/results-gpu-torch.xml faiss/gpu/test/torch_*.py - - when: - condition: - equal: [ "avx2", << parameters.opt_level >> ] - steps: - - run: - name: Test avx2 loading - command: | - FAISS_DISABLE_CPU_FEATURES=AVX2 LD_DEBUG=libs python -c "import faiss" 2>&1 | grep faiss.so - LD_DEBUG=libs python -c "import faiss" 2>&1 | grep faiss_avx2.so + - run: + name: Python tests (CPU only) + command: | + conda install -y -q pytorch -c pytorch + pytest --junitxml=test-results/pytest/results.xml tests/test_*.py + pytest --junitxml=test-results/pytest/results-torch.xml tests/torch_*.py - store_test_results: path: test-results @@ -316,180 +80,7 @@ workflows: version: 2 build: jobs: - - format: - name: Format - - build_cmake: - name: Linux x86_64 (cmake) - exec: linux-x86_64-cpu - - build_cmake: - name: Linux x86_64 AVX2 (cmake) - exec: linux-x86_64-cpu - opt_level: "avx2" - build_cmake: name: Linux x86_64 AVX512 (cmake) exec: linux-x86_64-cpu opt_level: "avx512" - - build_cmake: - name: Linux x86_64 GPU (cmake) - exec: linux-x86_64-gpu - gpu: "ON" - requires: - - Linux x86_64 AVX2 (cmake) - - build_cmake: - name: Linux x86_64 GPU w/ RAFT (cmake) - exec: linux-x86_64-gpu - gpu: "ON" - raft: "ON" - requires: - - Linux x86_64 GPU (cmake) - - build_conda: - name: Linux x86_64 (conda) - exec: linux-x86_64-cpu - - build_conda: - name: Windows x86_64 (conda) - exec: windows-x86_64-cpu - - build_conda: - name: Linux arm64 (conda) - exec: linux-arm64-cpu - - build_conda: - name: Linux x86_64 packages - exec: linux-x86_64-cpu - label: main - filters: - tags: - only: /^v.*/ - branches: - ignore: /.*/ - - build_conda: - name: Linux x86_64 GPU packages (CUDA 11.4.4) - exec: linux-x86_64-gpu - label: main - cuda: "11.4.4" - cuda_archs: "60-real;61-real;62-real;70-real;72-real;75-real;80;86-real" - compiler_version: "11.2" - filters: - tags: - only: /^v.*/ - branches: - ignore: /.*/ - - build_conda: - name: Linux x86_64 GPU w/ RAFT packages (CUDA 11.8.0) - exec: linux-x86_64-gpu - label: main - raft: "ON" - cuda: "11.8.0" - cuda_archs: "70-real;72-real;75-real;80;86-real" - compiler_version: "11.2" - filters: - tags: - only: /^v.*/ - branches: - ignore: /.*/ - - build_conda: - name: Linux x86_64 GPU packages (CUDA 12.1.1) - exec: linux-x86_64-gpu - label: main - cuda: "12.1.1" - cuda_archs: "70-real;72-real;75-real;80;86-real" - compiler_version: "11.2" - filters: - tags: - only: /^v.*/ - branches: - ignore: /.*/ - - build_conda: - name: Linux x86_64 GPU w/ RAFT packages (CUDA 12.1.1) - exec: linux-x86_64-gpu - label: main - raft: "ON" - cuda: "12.1.1" - cuda_archs: "70-real;72-real;75-real;80;86-real" - compiler_version: "11.2" - filters: - tags: - only: /^v.*/ - branches: - ignore: /.*/ - - build_conda: - name: Windows x86_64 packages - exec: windows-x86_64-cpu - label: main - filters: - tags: - only: /^v.*/ - branches: - ignore: /.*/ - - build_conda: - name: OSX arm64 packages - exec: macosx-arm64-cpu - label: main - filters: - tags: - only: /^v.*/ - branches: - ignore: /.*/ - - build_conda: - name: Linux arm64 
packages - exec: linux-arm64-cpu - label: main - filters: - tags: - only: /^v.*/ - branches: - ignore: /.*/ - - nightly: - triggers: - - schedule: - cron: "0 0 * * *" - filters: - branches: - only: - - main - jobs: - - build_conda: - name: Linux x86_64 nightlies - exec: linux-x86_64-cpu - label: nightly - - build_conda: - name: Linux x86_64 GPU nightlies (CUDA 11.4.4) - exec: linux-x86_64-gpu - label: nightly - cuda: "11.4.4" - cuda_archs: "60-real;61-real;62-real;70-real;72-real;75-real;80;86-real" - compiler_version: "11.2" - - build_conda: - name: Linux x86_64 GPU w/ RAFT nightlies (CUDA 11.8.0) - exec: linux-x86_64-gpu - label: nightly - raft: "ON" - cuda: "11.8.0" - cuda_archs: "70-real;72-real;75-real;80;86-real" - compiler_version: "11.2" - - build_conda: - name: Linux x86_64 GPU nightlies (CUDA 12.1.1) - exec: linux-x86_64-gpu - label: nightly - cuda: "12.1.1" - cuda_archs: "70-real;72-real;75-real;80;86-real" - compiler_version: "11.2" - - build_conda: - name: Linux x86_64 GPU w/ RAFT nightlies (CUDA 12.1.1) - exec: linux-x86_64-gpu - label: nightly - raft: "ON" - cuda: "12.1.1" - cuda_archs: "70-real;72-real;75-real;80;86-real" - compiler_version: "11.2" - - build_conda: - name: Windows x86_64 nightlies - exec: windows-x86_64-cpu - label: nightly - - build_conda: - name: OSX arm64 nightlies - exec: macosx-arm64-cpu - label: nightly - - build_conda: - name: Linux arm64 nightlies - exec: linux-arm64-cpu - label: nightly From 729a66f0044f1a893bcb47c0ab429b4f0b52600d Mon Sep 17 00:00:00 2001 From: Richard Barnes Date: Thu, 23 May 2024 07:47:33 -0700 Subject: [PATCH 096/116] Remove extra semi colon from deprecated/libmccpp/ThreadSafeClientPool.h (#3479) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3479 `-Wextra-semi` or `-Wextra-semi-stmt` If the code compiles, this is safe to land. Reviewed By: palmje Differential Revision: D57632759 fbshipit-source-id: 48bc23e87b3f518182085124c4c8e68ddbb3ca8f --- tests/test_common_ivf_empty_index.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_common_ivf_empty_index.cpp b/tests/test_common_ivf_empty_index.cpp index 1a99b77141..a3e33031bd 100644 --- a/tests/test_common_ivf_empty_index.cpp +++ b/tests/test_common_ivf_empty_index.cpp @@ -23,7 +23,7 @@ namespace { int d = 64; -}; // namespace +} // namespace std::vector get_random_vectors(size_t n, int seed) { std::vector x(n * d); From eb284811e093f8ddd18b0379a1be8fafc0cb7847 Mon Sep 17 00:00:00 2001 From: Alexandr Guzhva Date: Thu, 23 May 2024 09:19:24 -0700 Subject: [PATCH 097/116] Remove duplicate NegativeDistanceComputer instances (#3450) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3450 Reviewed By: mdouze Differential Revision: D57708412 Pulled By: junjieqi fbshipit-source-id: 9540b7e60d8b2b39e0ca92423d2a305fab2a17e6 --- faiss/IndexHNSW.cpp | 46 ----------------------------------- faiss/IndexNNDescent.cpp | 29 ---------------------- faiss/impl/DistanceComputer.h | 46 +++++++++++++++++++++++++++++++++++ faiss/impl/NSG.cpp | 29 ---------------------- 4 files changed, 46 insertions(+), 104 deletions(-) diff --git a/faiss/IndexHNSW.cpp b/faiss/IndexHNSW.cpp index 3325c8c0e1..0686917211 100644 --- a/faiss/IndexHNSW.cpp +++ b/faiss/IndexHNSW.cpp @@ -68,52 +68,6 @@ HNSWStats hnsw_stats; namespace { -/* Wrap the distance computer into one that negates the - distances. 
This makes supporting INNER_PRODUCE search easier */ - -struct NegativeDistanceComputer : DistanceComputer { - /// owned by this - DistanceComputer* basedis; - - explicit NegativeDistanceComputer(DistanceComputer* basedis) - : basedis(basedis) {} - - void set_query(const float* x) override { - basedis->set_query(x); - } - - /// compute distance of vector i to current query - float operator()(idx_t i) override { - return -(*basedis)(i); - } - - void distances_batch_4( - const idx_t idx0, - const idx_t idx1, - const idx_t idx2, - const idx_t idx3, - float& dis0, - float& dis1, - float& dis2, - float& dis3) override { - basedis->distances_batch_4( - idx0, idx1, idx2, idx3, dis0, dis1, dis2, dis3); - dis0 = -dis0; - dis1 = -dis1; - dis2 = -dis2; - dis3 = -dis3; - } - - /// compute distance between two stored vectors - float symmetric_dis(idx_t i, idx_t j) override { - return -basedis->symmetric_dis(i, j); - } - - virtual ~NegativeDistanceComputer() { - delete basedis; - } -}; - DistanceComputer* storage_distance_computer(const Index* storage) { if (is_similarity_metric(storage->metric_type)) { return new NegativeDistanceComputer(storage->get_distance_computer()); diff --git a/faiss/IndexNNDescent.cpp b/faiss/IndexNNDescent.cpp index 27bd6e33ee..382e9c41c6 100644 --- a/faiss/IndexNNDescent.cpp +++ b/faiss/IndexNNDescent.cpp @@ -58,35 +58,6 @@ using storage_idx_t = NNDescent::storage_idx_t; namespace { -/* Wrap the distance computer into one that negates the - distances. This makes supporting INNER_PRODUCE search easier */ - -struct NegativeDistanceComputer : DistanceComputer { - /// owned by this - DistanceComputer* basedis; - - explicit NegativeDistanceComputer(DistanceComputer* basedis) - : basedis(basedis) {} - - void set_query(const float* x) override { - basedis->set_query(x); - } - - /// compute distance of vector i to current query - float operator()(idx_t i) override { - return -(*basedis)(i); - } - - /// compute distance between two stored vectors - float symmetric_dis(idx_t i, idx_t j) override { - return -basedis->symmetric_dis(i, j); - } - - ~NegativeDistanceComputer() override { - delete basedis; - } -}; - DistanceComputer* storage_distance_computer(const Index* storage) { if (is_similarity_metric(storage->metric_type)) { return new NegativeDistanceComputer(storage->get_distance_computer()); diff --git a/faiss/impl/DistanceComputer.h b/faiss/impl/DistanceComputer.h index dc46d113fb..5ac3a702c9 100644 --- a/faiss/impl/DistanceComputer.h +++ b/faiss/impl/DistanceComputer.h @@ -59,6 +59,52 @@ struct DistanceComputer { virtual ~DistanceComputer() {} }; +/* Wrap the distance computer into one that negates the + distances. 
This makes supporting INNER_PRODUCE search easier */ + +struct NegativeDistanceComputer : DistanceComputer { + /// owned by this + DistanceComputer* basedis; + + explicit NegativeDistanceComputer(DistanceComputer* basedis) + : basedis(basedis) {} + + void set_query(const float* x) override { + basedis->set_query(x); + } + + /// compute distance of vector i to current query + float operator()(idx_t i) override { + return -(*basedis)(i); + } + + void distances_batch_4( + const idx_t idx0, + const idx_t idx1, + const idx_t idx2, + const idx_t idx3, + float& dis0, + float& dis1, + float& dis2, + float& dis3) override { + basedis->distances_batch_4( + idx0, idx1, idx2, idx3, dis0, dis1, dis2, dis3); + dis0 = -dis0; + dis1 = -dis1; + dis2 = -dis2; + dis3 = -dis3; + } + + /// compute distance between two stored vectors + float symmetric_dis(idx_t i, idx_t j) override { + return -basedis->symmetric_dis(i, j); + } + + virtual ~NegativeDistanceComputer() { + delete basedis; + } +}; + /************************************************************* * Specialized version of the DistanceComputer when we know that codes are * laid out in a flat index. diff --git a/faiss/impl/NSG.cpp b/faiss/impl/NSG.cpp index 1f30b576b9..c974943343 100644 --- a/faiss/impl/NSG.cpp +++ b/faiss/impl/NSG.cpp @@ -25,35 +25,6 @@ namespace { // It needs to be smaller than 0 constexpr int EMPTY_ID = -1; -/* Wrap the distance computer into one that negates the - distances. This makes supporting INNER_PRODUCE search easier */ - -struct NegativeDistanceComputer : DistanceComputer { - /// owned by this - DistanceComputer* basedis; - - explicit NegativeDistanceComputer(DistanceComputer* basedis) - : basedis(basedis) {} - - void set_query(const float* x) override { - basedis->set_query(x); - } - - /// compute distance of vector i to current query - float operator()(idx_t i) override { - return -(*basedis)(i); - } - - /// compute distance between two stored vectors - float symmetric_dis(idx_t i, idx_t j) override { - return -basedis->symmetric_dis(i, j); - } - - ~NegativeDistanceComputer() override { - delete basedis; - } -}; - } // namespace DistanceComputer* storage_distance_computer(const Index* storage) { From 6580156e9647ea8a6cdef1b49e6dd431bf3d0096 Mon Sep 17 00:00:00 2001 From: Tarang Jain Date: Thu, 23 May 2024 15:33:23 -0700 Subject: [PATCH 098/116] Delete Raft Handle (#3435) Summary: Small Raft related modification to StandardGpuResources: if the stream for a particular device is modified by a user, delete the Raft handle for that device. On any subsequent call to `getRaftHandle(device)`, a new raft handle with the updated stream will be created. 
Closes https://github.com/facebookresearch/faiss/issues/3424 Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3435 Reviewed By: ramilbakhshyiev Differential Revision: D57640976 Pulled By: junjieqi fbshipit-source-id: 41e2898a39250b7e52e920b71e819fc21ca9fc85 --- faiss/gpu/StandardGpuResources.cpp | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/faiss/gpu/StandardGpuResources.cpp b/faiss/gpu/StandardGpuResources.cpp index 004f80a27e..78336b4994 100644 --- a/faiss/gpu/StandardGpuResources.cpp +++ b/faiss/gpu/StandardGpuResources.cpp @@ -257,6 +257,14 @@ void StandardGpuResourcesImpl::setDefaultStream( if (prevStream != stream) { streamWait({stream}, {prevStream}); } +#if defined USE_NVIDIA_RAFT + // delete the raft handle for this device, which will be initialized + // with the updated stream during any subsequent calls to getRaftHandle + auto it2 = raftHandles_.find(device); + if (it2 != raftHandles_.end()) { + raftHandles_.erase(it2); + } +#endif } userDefaultStreams_[device] = stream; @@ -275,6 +283,14 @@ void StandardGpuResourcesImpl::revertDefaultStream(int device) { streamWait({newStream}, {prevStream}); } +#if defined USE_NVIDIA_RAFT + // delete the raft handle for this device, which will be initialized + // with the updated stream during any subsequent calls to getRaftHandle + auto it2 = raftHandles_.find(device); + if (it2 != raftHandles_.end()) { + raftHandles_.erase(it2); + } +#endif } userDefaultStreams_.erase(device); From 6e423cc649168e61ec7614e838da9fbce21c0b15 Mon Sep 17 00:00:00 2001 From: Xiao Fu Date: Mon, 27 May 2024 11:55:06 -0700 Subject: [PATCH 099/116] Add python tutorial on different indexs refinement and respect accuracy measurement (#3480) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3480 This tutorial summarize the methods to construct different indexs for PQFastScan refinement. It shows how the choice can impact on accuracy. Reviewed By: junjieqi Differential Revision: D57799598 fbshipit-source-id: a75c52c60a5217366f3361676da8f03f0c4a9feb --- tutorial/python/9-RefineComparison.py | 42 +++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 tutorial/python/9-RefineComparison.py diff --git a/tutorial/python/9-RefineComparison.py b/tutorial/python/9-RefineComparison.py new file mode 100644 index 0000000000..6fa69f33d9 --- /dev/null +++ b/tutorial/python/9-RefineComparison.py @@ -0,0 +1,42 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import faiss + +from faiss.contrib.evaluation import knn_intersection_measure +from faiss.contrib import datasets + +# 64-dim vectors, 50000 vectors in the training, 100000 in database, +# 10000 in queries, dtype ('float32') +ds = datasets.SyntheticDataset(64, 50000, 100000, 10000) +d = 64 # dimension + +# Constructing the refine PQ index with SQfp16 with index factory +index_fp16 = faiss.index_factory(d, 'PQ32x4fs,Refine(SQfp16)') +index_fp16.train(ds.get_train()) +index_fp16.add(ds.get_database()) + +# Constructing the refine PQ index with SQ8 +index_sq8 = faiss.index_factory(d, 'PQ32x4fs,Refine(SQ8)') +index_sq8.train(ds.get_train()) +index_sq8.add(ds.get_database()) + +# Parameterization on k factor while doing search for index refinement +k_factor = 3.0 +params = faiss.IndexRefineSearchParameters(k_factor=k_factor) + +# Perform index search using different index refinement +D_fp16, I_fp16 = index_fp16.search(ds.get_queries(), 100, params=params) +D_sq8, I_sq8 = index_sq8.search(ds.get_queries(), 100, params=params) + +# Calculating knn intersection measure for different index types on refinement +KIM_fp16 = knn_intersection_measure(I_fp16, ds.get_groundtruth()) +KIM_sq8 = knn_intersection_measure(I_sq8, ds.get_groundtruth()) + +# KNN intersection measure accuracy shows that choosing SQ8 impacts accuracy +assert (KIM_fp16 > KIM_sq8) + +print(I_sq8[:5]) +print(I_fp16[:5]) From db6ff2e0953e07aaba0780637f38467652695c3b Mon Sep 17 00:00:00 2001 From: Jim Borden Date: Mon, 27 May 2024 17:44:09 -0700 Subject: [PATCH 100/116] Workaround for missing intrinsic on gcc < 9 (#3481) Summary: Rebased branch for https://github.com/facebookresearch/faiss/issues/3420 Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3481 Reviewed By: mdouze Differential Revision: D57830230 Pulled By: junjieqi fbshipit-source-id: a93fb3cc53f11245faec891a9590b5e849dbf3b9 --- faiss/impl/code_distance/code_distance-avx2.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/faiss/impl/code_distance/code_distance-avx2.h b/faiss/impl/code_distance/code_distance-avx2.h index 0aa1535b28..d37b022441 100644 --- a/faiss/impl/code_distance/code_distance-avx2.h +++ b/faiss/impl/code_distance/code_distance-avx2.h @@ -16,6 +16,11 @@ #include #include +// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=78782 +#if defined(__GNUC__) && __GNUC__ < 9 +#define _mm_loadu_si64(x) (_mm_loadl_epi64((__m128i_u*)x)) +#endif + namespace { inline float horizontal_sum(const __m128 v) { From 6e7d9e040f9be9734277c3f27b2cb364a67f442d Mon Sep 17 00:00:00 2001 From: simshi Date: Tue, 28 May 2024 11:47:04 -0700 Subject: [PATCH 101/116] fix algorithm of spreading vectors over shards (#3374) Summary: simple math: | **input n** | **input nshards** | shard_size | idx | i0 | ni | | -- |-- |-- |-- |-- |-- | | 19 | 6 | 4 | 5 | 20 | **-1** | | 1000 | 37 | 28 | 36 | 1008 | -8 | | 1000 | 64 | 16 | 63 | 1008 | -8 | root cause: integer cause precision loss, `idx * shard_size` overflows, because `(n + nshards - 1) / nshards` is roundup my solution: each shard takes at least `base_shard_size = n / nshards`, then `remain = n % nshards`, we know `0 <= remain < nshards`, next, assign those remain vectors to first `remain` shards, i.e. first `remain` shards take one more vector each. 
```c++ auto i0 = idx * base_shard_size; if (i0 < remain) { // if current idx is one of the first `remain` shards i0 += idx; } else { i0 += remain; } ``` simplify above code: `i0 = idx * base_shard_size + std::min(size_t(idx), n % nshards);` Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3374 Reviewed By: fxdawnn Differential Revision: D57867910 Pulled By: junjieqi fbshipit-source-id: 7e72ea5cd197af4f3446fb7a3fd34ad08901dbb2 --- faiss/gpu/GpuIcmEncoder.cu | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/faiss/gpu/GpuIcmEncoder.cu b/faiss/gpu/GpuIcmEncoder.cu index 434fae9e36..8bd60f91b8 100644 --- a/faiss/gpu/GpuIcmEncoder.cu +++ b/faiss/gpu/GpuIcmEncoder.cu @@ -82,7 +82,7 @@ void GpuIcmEncoder::encode( size_t n, size_t ils_iters) const { size_t nshards = shards->size(); - size_t shard_size = (n + nshards - 1) / nshards; + size_t base_shard_size = n / nshards; auto codebooks = lsq->codebooks.data(); auto M = lsq->M; @@ -94,8 +94,14 @@ void GpuIcmEncoder::encode( // split input data auto fn = [=](int idx, IcmEncoderImpl* encoder) { - size_t i0 = idx * shard_size; - size_t ni = std::min(shard_size, n - i0); + size_t i0 = idx * base_shard_size + std::min(size_t(idx), n % nshards); + size_t ni = base_shard_size; + if (ni < n % nshards) { + ++ni; + } + if (ni <= 0) { // only if n < nshards + return; + } auto xi = x + i0 * d; auto ci = codes + i0 * M; std::mt19937 geni(idx + seed); // different seed for each shard From 0beecb4c85d0b0c49483f7f0a3100b28ba44b793 Mon Sep 17 00:00:00 2001 From: Kumar Saurabh Arora Date: Thu, 30 May 2024 09:27:55 -0700 Subject: [PATCH 102/116] sys.big_endian to sys.byteorder (#3422) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3422 Found vec_io failing when running some benchmarking. There is no such field named big_endian in sys. So, reverting it to original field byteorder Reviewed By: algoriddle Differential Revision: D56718607 fbshipit-source-id: 553f1d2d6bc967581142a92282e534f3f164e8f9 --- contrib/vecs_io.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/contrib/vecs_io.py b/contrib/vecs_io.py index 5d18c0b162..9ef9e0ab64 100644 --- a/contrib/vecs_io.py +++ b/contrib/vecs_io.py @@ -14,7 +14,7 @@ def ivecs_read(fname): a = np.fromfile(fname, dtype='int32') - if sys.big_endian: + if sys.byteorder == 'big': a.byteswap(inplace=True) d = a[0] return a.reshape(-1, d + 1)[:, 1:].copy() @@ -25,7 +25,7 @@ def fvecs_read(fname): def ivecs_mmap(fname): - assert not sys.big_endian + assert sys.byteorder != 'big' a = np.memmap(fname, dtype='int32', mode='r') d = a[0] return a.reshape(-1, d + 1)[:, 1:] @@ -37,7 +37,7 @@ def fvecs_mmap(fname): def bvecs_mmap(fname): x = np.memmap(fname, dtype='uint8', mode='r') - if sys.big_endian: + if sys.byteorder == 'big': da = x[:4][::-1].copy() d = da.view('int32')[0] else: @@ -50,7 +50,7 @@ def ivecs_write(fname, m): m1 = np.empty((n, d + 1), dtype='int32') m1[:, 0] = d m1[:, 1:] = m - if sys.big_endian: + if sys.byteorder == 'big': m1.byteswap(inplace=True) m1.tofile(fname) From 22304340d22edae38ddb9e13874688ae18eb121d Mon Sep 17 00:00:00 2001 From: Kumar Saurabh Arora Date: Fri, 31 May 2024 14:30:39 -0700 Subject: [PATCH 103/116] Adding buck target for experiment bench_fw_ivf (#3423) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3423 Adding small fixes to run experiments from fbcode. 1. Added buck target 2. Full import path of faiss bench_fw modules 3. 
new dataset path to run tests locally as we can't use an existing directory ./data in fbcode. Reviewed By: algoriddle, junjieqi Differential Revision: D57235092 fbshipit-source-id: f78a23199e619b640a19ca37f8b52ff0abdd8298 --- benchs/bench_fw_ivf.py | 13 +++++++++---- contrib/datasets.py | 6 +++++- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/benchs/bench_fw_ivf.py b/benchs/bench_fw_ivf.py index 8c84743e27..e9e144c569 100644 --- a/benchs/bench_fw_ivf.py +++ b/benchs/bench_fw_ivf.py @@ -3,16 +3,20 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. -import logging import argparse +import logging import os -from bench_fw.benchmark import Benchmark -from bench_fw.benchmark_io import BenchmarkIO -from bench_fw.descriptors import DatasetDescriptor, IndexDescriptor +from faiss.benchs.bench_fw.benchmark import Benchmark +from faiss.benchs.bench_fw.benchmark_io import BenchmarkIO +from faiss.benchs.bench_fw.descriptors import ( + DatasetDescriptor, + IndexDescriptor, +) logging.basicConfig(level=logging.INFO) + def sift1M(bio): benchmark = Benchmark( num_threads=32, @@ -37,6 +41,7 @@ def sift1M(bio): benchmark.set_io(bio) benchmark.benchmark(result_file="result.json", local=False, train=True, reconstruct=False, knn=True, range=False) + def bigann(bio): for scale in [1, 2, 5, 10, 20, 50]: benchmark = Benchmark( diff --git a/contrib/datasets.py b/contrib/datasets.py index f37a2fb6e4..281f16e2fa 100644 --- a/contrib/datasets.py +++ b/contrib/datasets.py @@ -6,6 +6,8 @@ import os import numpy as np import faiss +import getpass + from .vecs_io import fvecs_read, ivecs_read, bvecs_mmap, fvecs_mmap from .exhaustive_search import knn @@ -115,10 +117,12 @@ def get_groundtruth(self, k=100): # that directory is ############################################################################ +username = getpass.getuser() for dataset_basedir in ( '/datasets01/simsearch/041218/', - '/mnt/vol/gfsai-flash3-east/ai-group/datasets/simsearch/'): + '/mnt/vol/gfsai-flash3-east/ai-group/datasets/simsearch/', + f'/home/{username}/simsearch/data/'): if os.path.exists(dataset_basedir): break else: From bf73e38d10ae6818d7e5d7250a55bb0c9944a9ef Mon Sep 17 00:00:00 2001 From: Matthijs Douze Date: Fri, 31 May 2024 14:48:13 -0700 Subject: [PATCH 104/116] add skip_storage flag to HNSW (#3487) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3487 Sometimes it is not useful to serialize the storage index along with a HNSW index. This diff adds a flag that supports skipping the storage of the index. Searchign and adding to the index is not possible until a storage index is added back in. Reviewed By: junjieqi Differential Revision: D57911060 fbshipit-source-id: 5a4ceee4a8f53f6f746df59af3942b813a99c14f --- faiss/IndexHNSW.cpp | 5 ++--- faiss/impl/index_read.cpp | 10 ++++++---- faiss/impl/index_write.cpp | 25 ++++++++++++++++--------- faiss/index_io.h | 11 ++++++----- faiss/python/__init__.py | 4 ++-- tests/test_graph_based.py | 36 ++++++++++++++++++++++++++++++++++++ 6 files changed, 68 insertions(+), 23 deletions(-) diff --git a/faiss/IndexHNSW.cpp b/faiss/IndexHNSW.cpp index 0686917211..94798c1b4a 100644 --- a/faiss/IndexHNSW.cpp +++ b/faiss/IndexHNSW.cpp @@ -5,8 +5,6 @@ * LICENSE file in the root directory of this source tree. 
*/ -// -*- c++ -*- - #include #include @@ -251,7 +249,8 @@ void hnsw_search( const SearchParameters* params_in) { FAISS_THROW_IF_NOT_MSG( index->storage, - "Please use IndexHNSWFlat (or variants) instead of IndexHNSW directly"); + "No storage index, please use IndexHNSWFlat (or variants) " + "instead of IndexHNSW directly"); const SearchParametersHNSW* params = nullptr; const HNSW& hnsw = index->hnsw; diff --git a/faiss/impl/index_read.cpp b/faiss/impl/index_read.cpp index 8d80329bf9..ce4b1e76b1 100644 --- a/faiss/impl/index_read.cpp +++ b/faiss/impl/index_read.cpp @@ -5,8 +5,6 @@ * LICENSE file in the root directory of this source tree. */ -// -*- c++ -*- - #include #include @@ -531,7 +529,11 @@ Index* read_index(IOReader* f, int io_flags) { Index* idx = nullptr; uint32_t h; READ1(h); - if (h == fourcc("IxFI") || h == fourcc("IxF2") || h == fourcc("IxFl")) { + if (h == fourcc("null")) { + // denotes a missing index, useful for some cases + return nullptr; + } else if ( + h == fourcc("IxFI") || h == fourcc("IxF2") || h == fourcc("IxFl")) { IndexFlat* idxf; if (h == fourcc("IxFI")) { idxf = new IndexFlatIP(); @@ -961,7 +963,7 @@ Index* read_index(IOReader* f, int io_flags) { read_index_header(idxhnsw, f); read_HNSW(&idxhnsw->hnsw, f); idxhnsw->storage = read_index(f, io_flags); - idxhnsw->own_fields = true; + idxhnsw->own_fields = idxhnsw->storage != nullptr; if (h == fourcc("IHNp") && !(io_flags & IO_FLAG_PQ_SKIP_SDC_TABLE)) { dynamic_cast(idxhnsw->storage)->pq.compute_sdc_table(); } diff --git a/faiss/impl/index_write.cpp b/faiss/impl/index_write.cpp index b2808d7170..01e5ae7257 100644 --- a/faiss/impl/index_write.cpp +++ b/faiss/impl/index_write.cpp @@ -5,8 +5,6 @@ * LICENSE file in the root directory of this source tree. */ -// -*- c++ -*- - #include #include @@ -390,8 +388,12 @@ static void write_ivf_header(const IndexIVF* ivf, IOWriter* f) { write_direct_map(&ivf->direct_map, f); } -void write_index(const Index* idx, IOWriter* f) { - if (const IndexFlat* idxf = dynamic_cast(idx)) { +void write_index(const Index* idx, IOWriter* f, int io_flags) { + if (idx == nullptr) { + // eg. for a storage component of HNSW that is set to nullptr + uint32_t h = fourcc("null"); + WRITE1(h); + } else if (const IndexFlat* idxf = dynamic_cast(idx)) { uint32_t h = fourcc(idxf->metric_type == METRIC_INNER_PRODUCT ? "IxFI" : idxf->metric_type == METRIC_L2 ? "IxF2" @@ -765,7 +767,12 @@ void write_index(const Index* idx, IOWriter* f) { WRITE1(h); write_index_header(idxhnsw, f); write_HNSW(&idxhnsw->hnsw, f); - write_index(idxhnsw->storage, f); + if (io_flags & IO_FLAG_SKIP_STORAGE) { + uint32_t n4 = fourcc("null"); + WRITE1(n4); + } else { + write_index(idxhnsw->storage, f); + } } else if (const IndexNSG* idxnsg = dynamic_cast(idx)) { uint32_t h = dynamic_cast(idx) ? fourcc("INSf") : dynamic_cast(idx) ? 
fourcc("INSp") @@ -841,14 +848,14 @@ void write_index(const Index* idx, IOWriter* f) { } } -void write_index(const Index* idx, FILE* f) { +void write_index(const Index* idx, FILE* f, int io_flags) { FileIOWriter writer(f); - write_index(idx, &writer); + write_index(idx, &writer, io_flags); } -void write_index(const Index* idx, const char* fname) { +void write_index(const Index* idx, const char* fname, int io_flags) { FileIOWriter writer(fname); - write_index(idx, &writer); + write_index(idx, &writer, io_flags); } void write_VectorTransform(const VectorTransform* vt, const char* fname) { diff --git a/faiss/index_io.h b/faiss/index_io.h index f73cd073b7..3e77d0227c 100644 --- a/faiss/index_io.h +++ b/faiss/index_io.h @@ -5,8 +5,6 @@ * LICENSE file in the root directory of this source tree. */ -// -*- c++ -*- - // I/O code for indexes #ifndef FAISS_INDEX_IO_H @@ -35,9 +33,12 @@ struct IOReader; struct IOWriter; struct InvertedLists; -void write_index(const Index* idx, const char* fname); -void write_index(const Index* idx, FILE* f); -void write_index(const Index* idx, IOWriter* writer); +/// skip the storage for graph-based indexes +const int IO_FLAG_SKIP_STORAGE = 1; + +void write_index(const Index* idx, const char* fname, int io_flags = 0); +void write_index(const Index* idx, FILE* f, int io_flags = 0); +void write_index(const Index* idx, IOWriter* writer, int io_flags = 0); void write_index_binary(const IndexBinary* idx, const char* fname); void write_index_binary(const IndexBinary* idx, FILE* f); diff --git a/faiss/python/__init__.py b/faiss/python/__init__.py index 0562d1dd89..ce4b42c618 100644 --- a/faiss/python/__init__.py +++ b/faiss/python/__init__.py @@ -292,10 +292,10 @@ def range_search_with_parameters(index, x, radius, params=None, output_stats=Fal ########################################### -def serialize_index(index): +def serialize_index(index, io_flags=0): """ convert an index to a numpy uint8 array """ writer = VectorIOWriter() - write_index(index, writer) + write_index(index, writer, io_flags) return vector_to_array(writer.data) diff --git a/tests/test_graph_based.py b/tests/test_graph_based.py index d5ddbeec37..95925d7ae9 100644 --- a/tests/test_graph_based.py +++ b/tests/test_graph_based.py @@ -133,6 +133,42 @@ def test_ndis_stats(self): Dhnsw, Ihnsw = index.search(self.xq, 1) self.assertGreater(stats.ndis, len(self.xq) * index.hnsw.efSearch) + def test_io_no_storage(self): + d = self.xq.shape[1] + index = faiss.IndexHNSWFlat(d, 16) + index.add(self.xb) + + Dref, Iref = index.search(self.xq, 5) + + # test writing without storage + index2 = faiss.deserialize_index( + faiss.serialize_index(index, faiss.IO_FLAG_SKIP_STORAGE) + ) + self.assertEquals(index2.storage, None) + self.assertRaises( + RuntimeError, + index2.search, self.xb, 1) + + # make sure we can store an index with empty storage + index4 = faiss.deserialize_index( + faiss.serialize_index(index2)) + + # add storage afterwards + index.storage = faiss.clone_index(index.storage) + index.own_fields = True + + Dnew, Inew = index.search(self.xq, 5) + np.testing.assert_array_equal(Dnew, Dref) + np.testing.assert_array_equal(Inew, Iref) + + if False: + # test reading without storage + # not implemented because it is hard to skip over an index + index3 = faiss.deserialize_index( + faiss.serialize_index(index), faiss.IO_FLAG_SKIP_STORAGE + ) + self.assertEquals(index3.storage, None) + class TestNSG(unittest.TestCase): From a900cfa9f1d2842b4504b0eedc646414b377616b Mon Sep 17 00:00:00 2001 From: Xiao Fu Date: Tue, 4 Jun 2024 
19:44:27 -0700 Subject: [PATCH 105/116] Add cpp tutorial for index factory refine index construction (#3494) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3494 This tasks focus on the refine index construction tutorial with different index refinement on fp16/sq8 quantization. The python version was added a while ago. Reviewed By: junjieqi Differential Revision: D58161983 fbshipit-source-id: 1c598fe612b5dee3952c5f7398e6802e117f141d --- tutorial/cpp/9-RefineComparison.cpp | 104 ++++++++++++++++++++++++++++ tutorial/cpp/CMakeLists.txt | 3 + 2 files changed, 107 insertions(+) create mode 100644 tutorial/cpp/9-RefineComparison.cpp diff --git a/tutorial/cpp/9-RefineComparison.cpp b/tutorial/cpp/9-RefineComparison.cpp new file mode 100644 index 0000000000..d7fbc90aec --- /dev/null +++ b/tutorial/cpp/9-RefineComparison.cpp @@ -0,0 +1,104 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include + +#include +#include +#include +using idx_t = faiss::idx_t; + +int main() { + int d = 64; // dimension + int nb = 100000; // database size + int nq = 10000; // nb of queries + + std::mt19937 rng; + std::uniform_real_distribution<> distrib; + + float* xb = new float[(int)(d * nb)]; + float* xq = new float[(int)(d * nq)]; + + for (int i = 0; i < nb; i++) { + for (int j = 0; j < d; j++) { + xb[d * i + j] = distrib(rng); + } + xb[d * i] += i / 1000.; + } + + for (int i = 0; i < nq; i++) { + for (int j = 0; j < d; j++) { + xq[d * i + j] = distrib(rng); + } + xq[d * i] += i / 1000.; + } + + // Constructing the refine PQ index with SQfp16 with index factory + faiss::Index* index_fp16; + index_fp16 = faiss::index_factory( + d, "PQ32x4fs,Refine(SQfp16)", faiss::METRIC_L2); + index_fp16->train(nb, xb); + index_fp16->add(nb, xb); + + // Constructing the refine PQ index with SQ8 + faiss::Index* index_sq8; + index_sq8 = + faiss::index_factory(d, "PQ32x4fs,Refine(SQ8)", faiss::METRIC_L2); + index_sq8->train(nb, xb); + index_sq8->add(nb, xb); + + int k = 10; + { // search xq + idx_t* I_fp16 = new idx_t[(int)(k * nq)]; + float* D_fp16 = new float[(int)(k * nq)]; + idx_t* I_sq8 = new idx_t[(int)(k * nq)]; + float* D_sq8 = new float[(int)(k * nq)]; + + // Parameterization on k factor while doing search for index refinement + float k_factor = 3; + faiss::IndexRefineSearchParameters* params = + new faiss::IndexRefineSearchParameters(); + params->k_factor = k_factor; + + // Perform index search using different index refinement + index_fp16->search(nq, xq, k, D_fp16, I_fp16, params); + index_sq8->search(nq, xq, k, D_sq8, I_sq8, params); + + printf("I_fp16=\n"); + for (int i = nq - 5; i < nq; i++) { + for (int j = 0; j < k; j++) { + printf("%5zd ", I_fp16[i * k + j]); + } + printf("\n"); + } + + printf("I_sq8=\n"); + for (int i = nq - 5; i < nq; i++) { + for (int j = 0; j < k; j++) { + printf("%5zd ", I_sq8[i * k + j]); + } + printf("\n"); + } + + delete[] I_fp16; + delete[] D_fp16; + delete[] I_sq8; + delete[] D_sq8; + delete params; + + delete index_fp16; + delete index_sq8; + } + + delete[] xb; + delete[] xq; + + return 0; +} diff --git a/tutorial/cpp/CMakeLists.txt b/tutorial/cpp/CMakeLists.txt index ad152c499d..f964b3dda9 100644 --- a/tutorial/cpp/CMakeLists.txt +++ b/tutorial/cpp/CMakeLists.txt @@ -27,3 +27,6 @@ target_link_libraries(7-PQFastScan PRIVATE faiss) add_executable(8-PQFastScanRefine EXCLUDE_FROM_ALL 
8-PQFastScanRefine.cpp) target_link_libraries(8-PQFastScanRefine PRIVATE faiss) + +add_executable(9-RefineComparison EXCLUDE_FROM_ALL 9-RefineComparison.cpp) +target_link_libraries(9-RefineComparison PRIVATE faiss) From ec67ac159476d96c16912699584b0809074b5f0c Mon Sep 17 00:00:00 2001 From: Abhiram Vadlapatla Date: Tue, 4 Jun 2024 21:59:57 -0700 Subject: [PATCH 106/116] Update .gitignore (#3492) Summary: Adding build folder to gitignore, so that they don't show up in the commit tree while building from source Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3492 Reviewed By: junjieqi Differential Revision: D58171359 Pulled By: asadoughi fbshipit-source-id: b0efed348769328a3bdbcc13098dcb84cadb6c4f --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index caab1304c8..d6df432fa5 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,7 @@ *.dylib *.pyc *~ +/build/ /config.* /aclocal.m4 /autom4te.cache/ From df0dea6c6d8951056763dc03528b3973c6ba26e2 Mon Sep 17 00:00:00 2001 From: divyegala Date: Tue, 11 Jun 2024 08:14:48 -0700 Subject: [PATCH 107/116] Interop between CAGRA and HNSW (#3252) Summary: Depends on https://github.com/facebookresearch/faiss/pull/3084 Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3252 Reviewed By: junjieqi Differential Revision: D57971948 Pulled By: mdouze fbshipit-source-id: 4371f4d136eeceb59568593f98a6ae9163a768ba --- CMakeLists.txt | 2 + faiss/IndexHNSW.cpp | 124 +++++++- faiss/IndexHNSW.h | 44 ++- faiss/gpu/CMakeLists.txt | 8 +- faiss/gpu/GpuCloner.cpp | 29 +- faiss/gpu/GpuIndexCagra.cu | 274 ++++++++++++++++ faiss/gpu/GpuIndexCagra.h | 282 +++++++++++++++++ faiss/gpu/impl/RaftCagra.cu | 371 ++++++++++++++++++++++ faiss/gpu/impl/RaftCagra.cuh | 132 ++++++++ faiss/gpu/test/CMakeLists.txt | 4 +- faiss/gpu/test/TestGpuIndexCagra.cu | 474 ++++++++++++++++++++++++++++ faiss/gpu/test/test_cagra.py | 71 +++++ faiss/impl/HNSW.cpp | 72 ++++- faiss/impl/HNSW.h | 12 +- faiss/impl/index_read.cpp | 10 +- faiss/impl/index_write.cpp | 7 + faiss/python/CMakeLists.txt | 5 + faiss/python/swigfaiss.swig | 7 + 18 files changed, 1895 insertions(+), 33 deletions(-) create mode 100644 faiss/gpu/GpuIndexCagra.cu create mode 100644 faiss/gpu/GpuIndexCagra.h create mode 100644 faiss/gpu/impl/RaftCagra.cu create mode 100644 faiss/gpu/impl/RaftCagra.cuh create mode 100644 faiss/gpu/test/TestGpuIndexCagra.cu create mode 100644 faiss/gpu/test/test_cagra.py diff --git a/CMakeLists.txt b/CMakeLists.txt index cedee9c456..1a468fb247 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -46,6 +46,8 @@ project(faiss LANGUAGES ${FAISS_LANGUAGES}) include(GNUInstallDirs) +set(CMAKE_INSTALL_PREFIX "$ENV{CONDA_PREFIX}") + set(CMAKE_CXX_STANDARD 17) list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake") diff --git a/faiss/IndexHNSW.cpp b/faiss/IndexHNSW.cpp index 94798c1b4a..c04642d218 100644 --- a/faiss/IndexHNSW.cpp +++ b/faiss/IndexHNSW.cpp @@ -15,12 +15,16 @@ #include #include +#include +#include #include +#include #include #include #include #include +#include "impl/HNSW.h" #include #include @@ -144,7 +148,9 @@ void hnsw_add_vertices( int i1 = n; - for (int pt_level = hist.size() - 1; pt_level >= 0; pt_level--) { + for (int pt_level = hist.size() - 1; + pt_level >= !index_hnsw.init_level0; + pt_level--) { int i0 = i1 - hist[pt_level]; if (verbose) { @@ -180,7 +186,13 @@ void hnsw_add_vertices( continue; } - hnsw.add_with_locks(*dis, pt_level, pt_id, locks, vt); + hnsw.add_with_locks( + *dis, + pt_level, + pt_id, + locks, 
+ vt, + index_hnsw.keep_max_size_level0 && (pt_level == 0)); if (prev_display >= 0 && i - i0 > prev_display + 10000) { prev_display = i - i0; @@ -200,7 +212,11 @@ void hnsw_add_vertices( } i1 = i0; } - FAISS_ASSERT(i1 == 0); + if (index_hnsw.init_level0) { + FAISS_ASSERT(i1 == 0); + } else { + FAISS_ASSERT((i1 - hist[0]) == 0); + } } if (verbose) { printf("Done in %.3f ms\n", getmillisecs() - t0); @@ -404,10 +420,18 @@ void IndexHNSW::search_level_0( float* distances, idx_t* labels, int nprobe, - int search_type) const { + int search_type, + const SearchParameters* params_in) const { FAISS_THROW_IF_NOT(k > 0); FAISS_THROW_IF_NOT(nprobe > 0); + const SearchParametersHNSW* params = nullptr; + + if (params_in) { + params = dynamic_cast(params_in); + FAISS_THROW_IF_NOT_MSG(params, "params type invalid"); + } + storage_idx_t ntotal = hnsw.levels.size(); using RH = HeapBlockResultHandler; @@ -434,13 +458,21 @@ void IndexHNSW::search_level_0( nearest_d + i * nprobe, search_type, search_stats, - vt); + vt, + params); res.end(); vt.advance(); } #pragma omp critical { hnsw_stats.combine(search_stats); } } + if (is_similarity_metric(this->metric_type)) { +// we need to revert the negated distances +#pragma omp parallel for + for (size_t i = 0; i < k * n; i++) { + distances[i] = -distances[i]; + } + } } void IndexHNSW::init_level_0_from_knngraph( @@ -863,4 +895,86 @@ void IndexHNSW2Level::flip_to_ivf() { delete storage2l; } +/************************************************************** + * IndexHNSWCagra implementation + **************************************************************/ + +IndexHNSWCagra::IndexHNSWCagra() { + is_trained = true; +} + +IndexHNSWCagra::IndexHNSWCagra(int d, int M, MetricType metric) + : IndexHNSW( + (metric == METRIC_L2) + ? static_cast(new IndexFlatL2(d)) + : static_cast(new IndexFlatIP(d)), + M) { + FAISS_THROW_IF_NOT_MSG( + ((metric == METRIC_L2) || (metric == METRIC_INNER_PRODUCT)), + "unsupported metric type for IndexHNSWCagra"); + own_fields = true; + is_trained = true; + init_level0 = true; + keep_max_size_level0 = true; +} + +void IndexHNSWCagra::add(idx_t n, const float* x) { + FAISS_THROW_IF_NOT_MSG( + !base_level_only, + "Cannot add vectors when base_level_only is set to True"); + + IndexHNSW::add(n, x); +} + +void IndexHNSWCagra::search( + idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels, + const SearchParameters* params) const { + if (!base_level_only) { + IndexHNSW::search(n, x, k, distances, labels, params); + } else { + std::vector nearest(n); + std::vector nearest_d(n); + +#pragma omp for + for (idx_t i = 0; i < n; i++) { + std::unique_ptr dis( + storage_distance_computer(this->storage)); + dis->set_query(x + i * d); + nearest[i] = -1; + nearest_d[i] = std::numeric_limits::max(); + + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution distrib(0, this->ntotal); + + for (idx_t j = 0; j < num_base_level_search_entrypoints; j++) { + auto idx = distrib(gen); + auto distance = (*dis)(idx); + if (distance < nearest_d[i]) { + nearest[i] = idx; + nearest_d[i] = distance; + } + } + FAISS_THROW_IF_NOT_MSG( + nearest[i] >= 0, "Could not find a valid entrypoint."); + } + + search_level_0( + n, + x, + k, + nearest.data(), + nearest_d.data(), + distances, + labels, + 1, // n_probes + 1, // search_type + params); + } +} + } // namespace faiss diff --git a/faiss/IndexHNSW.h b/faiss/IndexHNSW.h index e0b65fca9d..71807c6537 100644 --- a/faiss/IndexHNSW.h +++ b/faiss/IndexHNSW.h @@ -34,6 +34,18 @@ struct IndexHNSW : Index 
{ bool own_fields = false; Index* storage = nullptr; + // When set to false, level 0 in the knn graph is not initialized. + // This option is used by GpuIndexCagra::copyTo(IndexHNSWCagra*) + // as level 0 knn graph is copied over from the index built by + // GpuIndexCagra. + bool init_level0 = true; + + // When set to true, all neighbors in level 0 are filled up + // to the maximum size allowed (2 * M). This option is used by + // IndexHHNSWCagra to create a full base layer graph that is + // used when GpuIndexCagra::copyFrom(IndexHNSWCagra*) is invoked. + bool keep_max_size_level0 = false; + explicit IndexHNSW(int d = 0, int M = 32, MetricType metric = METRIC_L2); explicit IndexHNSW(Index* storage, int M = 32); @@ -81,7 +93,8 @@ struct IndexHNSW : Index { float* distances, idx_t* labels, int nprobe = 1, - int search_type = 1) const; + int search_type = 1, + const SearchParameters* params = nullptr) const; /// alternative graph building void init_level_0_from_knngraph(int k, const float* D, const idx_t* I); @@ -148,4 +161,33 @@ struct IndexHNSW2Level : IndexHNSW { const SearchParameters* params = nullptr) const override; }; +struct IndexHNSWCagra : IndexHNSW { + IndexHNSWCagra(); + IndexHNSWCagra(int d, int M, MetricType metric = METRIC_L2); + + /// When set to true, the index is immutable. + /// This option is used to copy the knn graph from GpuIndexCagra + /// to the base level of IndexHNSWCagra without adding upper levels. + /// Doing so enables to search the HNSW index, but removes the + /// ability to add vectors. + bool base_level_only = false; + + /// When `base_level_only` is set to `True`, the search function + /// searches only the base level knn graph of the HNSW index. + /// This parameter selects the entry point by randomly selecting + /// some points and using the best one. 
+ int num_base_level_search_entrypoints = 32; + + void add(idx_t n, const float* x) override; + + /// entry point for search + void search( + idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels, + const SearchParameters* params = nullptr) const override; +}; + } // namespace faiss diff --git a/faiss/gpu/CMakeLists.txt b/faiss/gpu/CMakeLists.txt index 126cbe5044..d20f3b7f8e 100644 --- a/faiss/gpu/CMakeLists.txt +++ b/faiss/gpu/CMakeLists.txt @@ -238,11 +238,15 @@ generate_ivf_interleaved_code() if(FAISS_ENABLE_RAFT) list(APPEND FAISS_GPU_HEADERS + GpuIndexCagra.h + impl/RaftCagra.cuh impl/RaftFlatIndex.cuh impl/RaftIVFFlat.cuh impl/RaftIVFPQ.cuh utils/RaftUtils.h) list(APPEND FAISS_GPU_SRC + GpuIndexCagra.cu + impl/RaftCagra.cu impl/RaftFlatIndex.cu impl/RaftIVFFlat.cu impl/RaftIVFPQ.cu @@ -316,5 +320,5 @@ __nv_relfatbin : { *(__nv_relfatbin) } target_link_options(faiss_gpu PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld") find_package(CUDAToolkit REQUIRED) -target_link_libraries(faiss_gpu PRIVATE CUDA::cudart CUDA::cublas $<$:raft::raft> $<$:raft::compiled> $<$:nvidia::cutlass::cutlass>) -target_compile_options(faiss_gpu PRIVATE $<$:-Xfatbin=-compress-all --expt-extended-lambda --expt-relaxed-constexpr>) +target_link_libraries(faiss_gpu PRIVATE CUDA::cudart CUDA::cublas $<$:raft::raft> $<$:raft::compiled> $<$:nvidia::cutlass::cutlass> $<$:OpenMP::OpenMP_CXX>) +target_compile_options(faiss_gpu PRIVATE $<$:-Xfatbin=-compress-all --expt-extended-lambda --expt-relaxed-constexpr $<$:-Xcompiler=${OpenMP_CXX_FLAGS}>>) diff --git a/faiss/gpu/GpuCloner.cpp b/faiss/gpu/GpuCloner.cpp index 8f895ac9c7..b6d55a47aa 100644 --- a/faiss/gpu/GpuCloner.cpp +++ b/faiss/gpu/GpuCloner.cpp @@ -14,6 +14,9 @@ #include #include +#if defined USE_NVIDIA_RAFT +#include +#endif #include #include #include @@ -24,6 +27,9 @@ #include #include #include +#if defined USE_NVIDIA_RAFT +#include +#endif #include #include #include @@ -85,7 +91,15 @@ Index* ToCPUCloner::clone_Index(const Index* index) { // objective is to make a single component out of them // (inverse op of ToGpuClonerMultiple) - } else if (auto ish = dynamic_cast(index)) { + } +#if defined USE_NVIDIA_RAFT + else if (auto icg = dynamic_cast(index)) { + IndexHNSWCagra* res = new IndexHNSWCagra(); + icg->copyTo(res); + return res; + } +#endif + else if (auto ish = dynamic_cast(index)) { int nshard = ish->count(); FAISS_ASSERT(nshard > 0); Index* res = clone_Index(ish->at(0)); @@ -215,7 +229,18 @@ Index* ToGpuCloner::clone_Index(const Index* index) { } return res; - } else { + } +#if defined USE_NVIDIA_RAFT + else if (auto icg = dynamic_cast(index)) { + GpuIndexCagraConfig config; + config.device = device; + GpuIndexCagra* res = + new GpuIndexCagra(provider, icg->d, icg->metric_type, config); + res->copyFrom(icg); + return res; + } +#endif + else { // use CPU cloner for IDMap and PreTransform auto index_idmap = dynamic_cast(index); auto index_pt = dynamic_cast(index); diff --git a/faiss/gpu/GpuIndexCagra.cu b/faiss/gpu/GpuIndexCagra.cu new file mode 100644 index 0000000000..4ae56df10d --- /dev/null +++ b/faiss/gpu/GpuIndexCagra.cu @@ -0,0 +1,274 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +namespace faiss { +namespace gpu { + +GpuIndexCagra::GpuIndexCagra( + GpuResourcesProvider* provider, + int dims, + faiss::MetricType metric, + GpuIndexCagraConfig config) + : GpuIndex(provider->getResources(), dims, metric, 0.0f, config), + cagraConfig_(config) { + this->is_trained = false; +} + +void GpuIndexCagra::train(idx_t n, const float* x) { + if (this->is_trained) { + FAISS_ASSERT(index_); + return; + } + + FAISS_ASSERT(!index_); + + std::optional ivf_pq_params = + std::nullopt; + std::optional ivf_pq_search_params = + std::nullopt; + if (cagraConfig_.ivf_pq_params != nullptr) { + ivf_pq_params = + std::make_optional(); + ivf_pq_params->n_lists = cagraConfig_.ivf_pq_params->n_lists; + ivf_pq_params->kmeans_n_iters = + cagraConfig_.ivf_pq_params->kmeans_n_iters; + ivf_pq_params->kmeans_trainset_fraction = + cagraConfig_.ivf_pq_params->kmeans_trainset_fraction; + ivf_pq_params->pq_bits = cagraConfig_.ivf_pq_params->pq_bits; + ivf_pq_params->pq_dim = cagraConfig_.ivf_pq_params->pq_dim; + ivf_pq_params->codebook_kind = + static_cast( + cagraConfig_.ivf_pq_params->codebook_kind); + ivf_pq_params->force_random_rotation = + cagraConfig_.ivf_pq_params->force_random_rotation; + ivf_pq_params->conservative_memory_allocation = + cagraConfig_.ivf_pq_params->conservative_memory_allocation; + } + if (cagraConfig_.ivf_pq_search_params != nullptr) { + ivf_pq_search_params = + std::make_optional(); + ivf_pq_search_params->n_probes = + cagraConfig_.ivf_pq_search_params->n_probes; + ivf_pq_search_params->lut_dtype = + cagraConfig_.ivf_pq_search_params->lut_dtype; + ivf_pq_search_params->preferred_shmem_carveout = + cagraConfig_.ivf_pq_search_params->preferred_shmem_carveout; + } + index_ = std::make_shared( + this->resources_.get(), + this->d, + cagraConfig_.intermediate_graph_degree, + cagraConfig_.graph_degree, + static_cast(cagraConfig_.build_algo), + cagraConfig_.nn_descent_niter, + this->metric_type, + this->metric_arg, + INDICES_64_BIT, + ivf_pq_params, + ivf_pq_search_params); + + index_->train(n, x); + + this->is_trained = true; + this->ntotal = n; +} + +bool GpuIndexCagra::addImplRequiresIDs_() const { + return false; +}; + +void GpuIndexCagra::addImpl_(idx_t n, const float* x, const idx_t* ids) { + FAISS_THROW_MSG("adding vectors is not supported by GpuIndexCagra."); +}; + +void GpuIndexCagra::searchImpl_( + idx_t n, + const float* x, + int k, + float* distances, + idx_t* labels, + const SearchParameters* search_params) const { + FAISS_ASSERT(this->is_trained && index_); + FAISS_ASSERT(n > 0); + + Tensor queries(const_cast(x), {n, this->d}); + Tensor outDistances(distances, {n, k}); + Tensor outLabels(const_cast(labels), {n, k}); + + SearchParametersCagra* params; + if (search_params) { + params = dynamic_cast( + const_cast(search_params)); + } else { + params = new SearchParametersCagra{}; + } + + index_->search( + queries, + k, + outDistances, + outLabels, + params->max_queries, + params->itopk_size, + params->max_iterations, + static_cast(params->algo), + params->team_size, + params->search_width, + 
params->min_iterations, + params->thread_block_size, + static_cast(params->hashmap_mode), + params->hashmap_min_bitlen, + params->hashmap_max_fill_rate, + params->num_random_samplings, + params->seed); + + if (not search_params) { + delete params; + } +} + +void GpuIndexCagra::copyFrom(const faiss::IndexHNSWCagra* index) { + FAISS_ASSERT(index); + + DeviceScope scope(config_.device); + + GpuIndex::copyFrom(index); + + auto base_index = dynamic_cast(index->storage); + FAISS_ASSERT(base_index); + auto distances = base_index->get_xb(); + + auto hnsw = index->hnsw; + // copy level 0 to a dense knn graph matrix + std::vector knn_graph; + knn_graph.reserve(index->ntotal * hnsw.nb_neighbors(0)); + +#pragma omp parallel for + for (size_t i = 0; i < index->ntotal; ++i) { + size_t begin, end; + hnsw.neighbor_range(i, 0, &begin, &end); + for (size_t j = begin; j < end; j++) { + // knn_graph.push_back(hnsw.neighbors[j]); + knn_graph[i * hnsw.nb_neighbors(0) + (j - begin)] = + hnsw.neighbors[j]; + } + } + + index_ = std::make_shared( + this->resources_.get(), + this->d, + index->ntotal, + hnsw.nb_neighbors(0), + distances, + knn_graph.data(), + this->metric_type, + this->metric_arg, + INDICES_64_BIT); + + this->is_trained = true; +} + +void GpuIndexCagra::copyTo(faiss::IndexHNSWCagra* index) const { + FAISS_ASSERT(index_ && this->is_trained && index); + + DeviceScope scope(config_.device); + + // + // Index information + // + GpuIndex::copyTo(index); + // This needs to be zeroed out as this implementation adds vectors to the + // cpuIndex instead of copying fields + index->ntotal = 0; + + auto graph_degree = index_->get_knngraph_degree(); + auto M = graph_degree / 2; + if (index->storage and index->own_fields) { + delete index->storage; + } + + if (this->metric_type == METRIC_L2) { + index->storage = new IndexFlatL2(index->d); + } else if (this->metric_type == METRIC_INNER_PRODUCT) { + index->storage = new IndexFlatIP(index->d); + } + index->own_fields = true; + index->keep_max_size_level0 = true; + index->hnsw.reset(); + index->hnsw.assign_probas.clear(); + index->hnsw.cum_nneighbor_per_level.clear(); + index->hnsw.set_default_probas(M, 1.0 / log(M)); + + auto n_train = this->ntotal; + auto train_dataset = index_->get_training_dataset(); + + // turn off as level 0 is copied from CAGRA graph + index->init_level0 = false; + if (!index->base_level_only) { + index->add(n_train, train_dataset.data()); + } else { + index->hnsw.prepare_level_tab(n_train, false); + index->storage->add(n_train, train_dataset.data()); + index->ntotal = n_train; + } + + auto graph = get_knngraph(); + +#pragma omp parallel for + for (idx_t i = 0; i < n_train; i++) { + size_t begin, end; + index->hnsw.neighbor_range(i, 0, &begin, &end); + for (size_t j = begin; j < end; j++) { + index->hnsw.neighbors[j] = graph[i * graph_degree + (j - begin)]; + } + } + + // turn back on to allow new vectors to be added to level 0 + index->init_level0 = true; +} + +void GpuIndexCagra::reset() { + DeviceScope scope(config_.device); + + if (index_) { + index_->reset(); + this->ntotal = 0; + this->is_trained = false; + } else { + FAISS_ASSERT(this->ntotal == 0); + } +} + +std::vector GpuIndexCagra::get_knngraph() const { + FAISS_ASSERT(index_ && this->is_trained); + + return index_->get_knngraph(); +} + +} // namespace gpu +} // namespace faiss diff --git a/faiss/gpu/GpuIndexCagra.h b/faiss/gpu/GpuIndexCagra.h new file mode 100644 index 0000000000..6ecee3ae03 --- /dev/null +++ b/faiss/gpu/GpuIndexCagra.h @@ -0,0 +1,282 @@ +/** + * Copyright (c) 
Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include + +namespace faiss { +struct IndexHNSWCagra; +} + +namespace faiss { +namespace gpu { + +class RaftCagra; + +enum class graph_build_algo { + /// Use IVF-PQ to build all-neighbors knn graph + IVF_PQ, + /// Experimental, use NN-Descent to build all-neighbors knn graph + NN_DESCENT +}; + +/// A type for specifying how PQ codebooks are created. +enum class codebook_gen { // NOLINT + PER_SUBSPACE = 0, // NOLINT + PER_CLUSTER = 1, // NOLINT +}; + +struct IVFPQBuildCagraConfig { + /// + /// The number of inverted lists (clusters) + /// + /// Hint: the number of vectors per cluster (`n_rows/n_lists`) should be + /// approximately 1,000 to 10,000. + + uint32_t n_lists = 1024; + /// The number of iterations searching for kmeans centers (index building). + uint32_t kmeans_n_iters = 20; + /// The fraction of data to use during iterative kmeans building. + double kmeans_trainset_fraction = 0.5; + /// + /// The bit length of the vector element after compression by PQ. + /// + /// Possible values: [4, 5, 6, 7, 8]. + /// + /// Hint: the smaller the 'pq_bits', the smaller the index size and the + /// better the search performance, but the lower the recall. + + uint32_t pq_bits = 8; + /// + /// The dimensionality of the vector after compression by PQ. When zero, an + /// optimal value is selected using a heuristic. + /// + /// NB: `pq_dim /// pq_bits` must be a multiple of 8. + /// + /// Hint: a smaller 'pq_dim' results in a smaller index size and better + /// search performance, but lower recall. If 'pq_bits' is 8, 'pq_dim' can be + /// set to any number, but multiple of 8 are desirable for good performance. + /// If 'pq_bits' is not 8, 'pq_dim' should be a multiple of 8. For good + /// performance, it is desirable that 'pq_dim' is a multiple of 32. Ideally, + /// 'pq_dim' should be also a divisor of the dataset dim. + + uint32_t pq_dim = 0; + /// How PQ codebooks are created. + codebook_gen codebook_kind = codebook_gen::PER_SUBSPACE; + /// + /// Apply a random rotation matrix on the input data and queries even if + /// `dim % pq_dim == 0`. + /// + /// Note: if `dim` is not multiple of `pq_dim`, a random rotation is always + /// applied to the input data and queries to transform the working space + /// from `dim` to `rot_dim`, which may be slightly larger than the original + /// space and and is a multiple of `pq_dim` (`rot_dim % pq_dim == 0`). + /// However, this transform is not necessary when `dim` is multiple of + /// `pq_dim` + /// (`dim == rot_dim`, hence no need in adding "extra" data columns / + /// features). + /// + /// By default, if `dim == rot_dim`, the rotation transform is initialized + /// with the identity matrix. 
When `force_random_rotation == true`, a random + /// orthogonal transform matrix is generated regardless of the values of + /// `dim` and `pq_dim`. + + bool force_random_rotation = false; + /// + /// By default, the algorithm allocates more space than necessary for + /// individual clusters + /// (`list_data`). This allows to amortize the cost of memory allocation and + /// reduce the number of data copies during repeated calls to `extend` + /// (extending the database). + /// + /// The alternative is the conservative allocation behavior; when enabled, + /// the algorithm always allocates the minimum amount of memory required to + /// store the given number of records. Set this flag to `true` if you prefer + /// to use as little GPU memory for the database as possible. + + bool conservative_memory_allocation = false; +}; + +struct IVFPQSearchCagraConfig { + /// The number of clusters to search. + uint32_t n_probes = 20; + /// + /// Data type of look up table to be created dynamically at search time. + /// + /// Possible values: [CUDA_R_32F, CUDA_R_16F, CUDA_R_8U] + /// + /// The use of low-precision types reduces the amount of shared memory + /// required at search time, so fast shared memory kernels can be used even + /// for datasets with large dimansionality. Note that the recall is slightly + /// degraded when low-precision type is selected. + + cudaDataType_t lut_dtype = CUDA_R_32F; + /// + /// Storage data type for distance/similarity computed at search time. + /// + /// Possible values: [CUDA_R_16F, CUDA_R_32F] + /// + /// If the performance limiter at search time is device memory access, + /// selecting FP16 will improve performance slightly. + + cudaDataType_t internal_distance_dtype = CUDA_R_32F; + /// + /// Preferred fraction of SM's unified memory / L1 cache to be used as + /// shared memory. + /// + /// Possible values: [0.0 - 1.0] as a fraction of the + /// `sharedMemPerMultiprocessor`. + /// + /// One wants to increase the carveout to make sure a good GPU occupancy for + /// the main search kernel, but not to keep it too high to leave some memory + /// to be used as L1 cache. Note, this value is interpreted only as a hint. + /// Moreover, a GPU usually allows only a fixed set of cache configurations, + /// so the provided value is rounded up to the nearest configuration. Refer + /// to the NVIDIA tuning guide for the target GPU architecture. + /// + /// Note, this is a low-level tuning parameter that can have drastic + /// negative effects on the search performance if tweaked incorrectly. + + double preferred_shmem_carveout = 1.0; +}; + +struct GpuIndexCagraConfig : public GpuIndexConfig { + /// Degree of input graph for pruning. + size_t intermediate_graph_degree = 128; + /// Degree of output graph. + size_t graph_degree = 64; + /// ANN algorithm to build knn graph. + graph_build_algo build_algo = graph_build_algo::IVF_PQ; + /// Number of Iterations to run if building with NN_DESCENT + size_t nn_descent_niter = 20; + + IVFPQBuildCagraConfig* ivf_pq_params = nullptr; + IVFPQSearchCagraConfig* ivf_pq_search_params = nullptr; +}; + +enum class search_algo { + /// For large batch sizes. + SINGLE_CTA, + /// For small batch sizes. + MULTI_CTA, + MULTI_KERNEL, + AUTO +}; + +enum class hash_mode { HASH, SMALL, AUTO }; + +struct SearchParametersCagra : SearchParameters { + /// Maximum number of queries to search at the same time (batch size). Auto + /// select when 0. + size_t max_queries = 0; + + /// Number of intermediate search results retained during the search. 
+ /// + /// This is the main knob to adjust trade off between accuracy and search + /// speed. Higher values improve the search accuracy. + + size_t itopk_size = 64; + + /// Upper limit of search iterations. Auto select when 0. + size_t max_iterations = 0; + + // In the following we list additional search parameters for fine tuning. + // Reasonable default values are automatically chosen. + + /// Which search implementation to use. + search_algo algo = search_algo::AUTO; + + /// Number of threads used to calculate a single distance. 4, 8, 16, or 32. + + size_t team_size = 0; + + /// Number of graph nodes to select as the starting point for the search in + /// each iteration. aka search width? + size_t search_width = 1; + /// Lower limit of search iterations. + size_t min_iterations = 0; + + /// Thread block size. 0, 64, 128, 256, 512, 1024. Auto selection when 0. + size_t thread_block_size = 0; + /// Hashmap type. Auto selection when AUTO. + hash_mode hashmap_mode = hash_mode::AUTO; + /// Lower limit of hashmap bit length. More than 8. + size_t hashmap_min_bitlen = 0; + /// Upper limit of hashmap fill rate. More than 0.1, less than 0.9. + float hashmap_max_fill_rate = 0.5; + + /// Number of iterations of initial random seed node selection. 1 or more. + + uint32_t num_random_samplings = 1; + /// Bit mask used for initial random seed node selection. + uint64_t seed = 0x128394; +}; + +struct GpuIndexCagra : public GpuIndex { + public: + GpuIndexCagra( + GpuResourcesProvider* provider, + int dims, + faiss::MetricType metric = faiss::METRIC_L2, + GpuIndexCagraConfig config = GpuIndexCagraConfig()); + + /// Trains CAGRA based on the given vector data + void train(idx_t n, const float* x) override; + + /// Initialize ourselves from the given CPU index; will overwrite + /// all data in ourselves + void copyFrom(const faiss::IndexHNSWCagra* index); + + /// Copy ourselves to the given CPU index; will overwrite all data + /// in the index instance + void copyTo(faiss::IndexHNSWCagra* index) const; + + void reset() override; + + std::vector get_knngraph() const; + + protected: + bool addImplRequiresIDs_() const override; + + void addImpl_(idx_t n, const float* x, const idx_t* ids) override; + + /// Called from GpuIndex for search + void searchImpl_( + idx_t n, + const float* x, + int k, + float* distances, + idx_t* labels, + const SearchParameters* search_params) const override; + + /// Our configuration options + const GpuIndexCagraConfig cagraConfig_; + + /// Instance that we own; contains the inverted lists + std::shared_ptr index_; +}; + +} // namespace gpu +} // namespace faiss diff --git a/faiss/gpu/impl/RaftCagra.cu b/faiss/gpu/impl/RaftCagra.cu new file mode 100644 index 0000000000..292079321d --- /dev/null +++ b/faiss/gpu/impl/RaftCagra.cu @@ -0,0 +1,371 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace faiss { +namespace gpu { + +RaftCagra::RaftCagra( + GpuResources* resources, + int dim, + idx_t intermediate_graph_degree, + idx_t graph_degree, + faiss::cagra_build_algo graph_build_algo, + size_t nn_descent_niter, + faiss::MetricType metric, + float metricArg, + IndicesOptions indicesOptions, + std::optional ivf_pq_params, + std::optional + ivf_pq_search_params) + : resources_(resources), + dim_(dim), + metric_(metric), + metricArg_(metricArg), + index_params_(), + ivf_pq_params_(ivf_pq_params), + ivf_pq_search_params_(ivf_pq_search_params) { + FAISS_THROW_IF_NOT_MSG( + metric == faiss::METRIC_L2 || metric == faiss::METRIC_INNER_PRODUCT, + "CAGRA currently only supports L2 or Inner Product metric."); + FAISS_THROW_IF_NOT_MSG( + indicesOptions == faiss::gpu::INDICES_64_BIT, + "only INDICES_64_BIT is supported for RAFT CAGRA index"); + + index_params_.intermediate_graph_degree = intermediate_graph_degree; + index_params_.graph_degree = graph_degree; + index_params_.build_algo = + static_cast( + graph_build_algo); + index_params_.nn_descent_niter = nn_descent_niter; + + if (!ivf_pq_params_) { + ivf_pq_params_ = + std::make_optional(); + } + if (!ivf_pq_search_params_) { + ivf_pq_search_params_ = + std::make_optional(); + } + index_params_.metric = metric_ == faiss::METRIC_L2 + ? raft::distance::DistanceType::L2Expanded + : raft::distance::DistanceType::InnerProduct; + ivf_pq_params_->metric = metric_ == faiss::METRIC_L2 + ? raft::distance::DistanceType::L2Expanded + : raft::distance::DistanceType::InnerProduct; + + reset(); +} + +RaftCagra::RaftCagra( + GpuResources* resources, + int dim, + idx_t n, + int graph_degree, + const float* distances, + const idx_t* knn_graph, + faiss::MetricType metric, + float metricArg, + IndicesOptions indicesOptions) + : resources_(resources), + dim_(dim), + metric_(metric), + metricArg_(metricArg) { + FAISS_THROW_IF_NOT_MSG( + metric == faiss::METRIC_L2 || metric == faiss::METRIC_INNER_PRODUCT, + "CAGRA currently only supports L2 or Inner Product metric."); + FAISS_THROW_IF_NOT_MSG( + indicesOptions == faiss::gpu::INDICES_64_BIT, + "only INDICES_64_BIT is supported for RAFT CAGRA index"); + + auto distances_on_gpu = getDeviceForAddress(distances) >= 0; + auto knn_graph_on_gpu = getDeviceForAddress(knn_graph) >= 0; + + FAISS_ASSERT(distances_on_gpu == knn_graph_on_gpu); + + const raft::device_resources& raft_handle = + resources_->getRaftHandleCurrentDevice(); + + if (distances_on_gpu && knn_graph_on_gpu) { + raft_handle.sync_stream(); + // Copying to host so that raft::neighbors::cagra::index + // creates an owning copy of the knn graph on device + auto knn_graph_copy = + raft::make_host_matrix(n, graph_degree); + thrust::copy( + thrust::device_ptr(knn_graph), + thrust::device_ptr(knn_graph + (n * graph_degree)), + knn_graph_copy.data_handle()); + + auto distances_mds = + raft::make_device_matrix_view( + distances, n, dim); + + raft_knn_index = raft::neighbors::cagra::index( + raft_handle, + metric_ == faiss::METRIC_L2 + ? 
raft::distance::DistanceType::L2Expanded + : raft::distance::DistanceType::InnerProduct, + distances_mds, + raft::make_const_mdspan(knn_graph_copy.view())); + } else if (!distances_on_gpu && !knn_graph_on_gpu) { + // copy idx_t (int64_t) host knn_graph to uint32_t host knn_graph + auto knn_graph_copy = + raft::make_host_matrix(n, graph_degree); + std::copy( + knn_graph, + knn_graph + (n * graph_degree), + knn_graph_copy.data_handle()); + + auto distances_mds = raft::make_host_matrix_view( + distances, n, dim); + + raft_knn_index = raft::neighbors::cagra::index( + raft_handle, + metric_ == faiss::METRIC_L2 + ? raft::distance::DistanceType::L2Expanded + : raft::distance::DistanceType::InnerProduct, + distances_mds, + raft::make_const_mdspan(knn_graph_copy.view())); + } else { + FAISS_THROW_MSG( + "distances and knn_graph must both be in device or host memory"); + } +} + +void RaftCagra::train(idx_t n, const float* x) { + const raft::device_resources& raft_handle = + resources_->getRaftHandleCurrentDevice(); + if (index_params_.build_algo == + raft::neighbors::cagra::graph_build_algo::IVF_PQ) { + std::optional> knn_graph( + raft::make_host_matrix( + n, index_params_.intermediate_graph_degree)); + if (getDeviceForAddress(x) >= 0) { + auto dataset_d = + raft::make_device_matrix_view( + x, n, dim_); + raft::neighbors::cagra::build_knn_graph( + raft_handle, + dataset_d, + knn_graph->view(), + 1.0f, + ivf_pq_params_, + ivf_pq_search_params_); + } else { + auto dataset_h = raft::make_host_matrix_view( + x, n, dim_); + raft::neighbors::cagra::build_knn_graph( + raft_handle, + dataset_h, + knn_graph->view(), + 1.0f, + ivf_pq_params_, + ivf_pq_search_params_); + } + auto cagra_graph = raft::make_host_matrix( + n, index_params_.graph_degree); + + raft::neighbors::cagra::optimize( + raft_handle, knn_graph->view(), cagra_graph.view()); + + // free intermediate graph before trying to create the index + knn_graph.reset(); + + if (getDeviceForAddress(x) >= 0) { + auto dataset_d = + raft::make_device_matrix_view( + x, n, dim_); + raft_knn_index = raft::neighbors::cagra::index( + raft_handle, + metric_ == faiss::METRIC_L2 + ? raft::distance::DistanceType::L2Expanded + : raft::distance::DistanceType::InnerProduct, + dataset_d, + raft::make_const_mdspan(cagra_graph.view())); + } else { + auto dataset_h = raft::make_host_matrix_view( + x, n, dim_); + raft_knn_index = raft::neighbors::cagra::index( + raft_handle, + metric_ == faiss::METRIC_L2 + ? 
raft::distance::DistanceType::L2Expanded + : raft::distance::DistanceType::InnerProduct, + dataset_h, + raft::make_const_mdspan(cagra_graph.view())); + } + + } else { + if (getDeviceForAddress(x) >= 0) { + raft_knn_index = raft::runtime::neighbors::cagra::build( + raft_handle, + index_params_, + raft::make_device_matrix_view( + x, n, dim_)); + } else { + raft_knn_index = raft::runtime::neighbors::cagra::build( + raft_handle, + index_params_, + raft::make_host_matrix_view( + x, n, dim_)); + } + } +} + +void RaftCagra::search( + Tensor& queries, + int k, + Tensor& outDistances, + Tensor& outIndices, + idx_t max_queries, + idx_t itopk_size, + idx_t max_iterations, + faiss::cagra_search_algo graph_search_algo, + idx_t team_size, + idx_t search_width, + idx_t min_iterations, + idx_t thread_block_size, + faiss::cagra_hash_mode hash_mode, + idx_t hashmap_min_bitlen, + float hashmap_max_fill_rate, + idx_t num_random_samplings, + idx_t rand_xor_mask) { + const raft::device_resources& raft_handle = + resources_->getRaftHandleCurrentDevice(); + idx_t numQueries = queries.getSize(0); + idx_t cols = queries.getSize(1); + idx_t k_ = k; + + FAISS_ASSERT(raft_knn_index.has_value()); + FAISS_ASSERT(numQueries > 0); + FAISS_ASSERT(cols == dim_); + + auto queries_view = raft::make_device_matrix_view( + queries.data(), numQueries, cols); + auto distances_view = raft::make_device_matrix_view( + outDistances.data(), numQueries, k_); + auto indices_view = raft::make_device_matrix_view( + outIndices.data(), numQueries, k_); + + raft::neighbors::cagra::search_params search_pams; + search_pams.max_queries = max_queries; + search_pams.itopk_size = itopk_size; + search_pams.max_iterations = max_iterations; + search_pams.algo = + static_cast(graph_search_algo); + search_pams.team_size = team_size; + search_pams.search_width = search_width; + search_pams.min_iterations = min_iterations; + search_pams.thread_block_size = thread_block_size; + search_pams.hashmap_mode = + static_cast(hash_mode); + search_pams.hashmap_min_bitlen = hashmap_min_bitlen; + search_pams.hashmap_max_fill_rate = hashmap_max_fill_rate; + search_pams.num_random_samplings = num_random_samplings; + search_pams.rand_xor_mask = rand_xor_mask; + + auto indices_copy = raft::make_device_matrix( + raft_handle, numQueries, k_); + + raft::runtime::neighbors::cagra::search( + raft_handle, + search_pams, + raft_knn_index.value(), + queries_view, + indices_copy.view(), + distances_view); + thrust::copy( + raft::resource::get_thrust_policy(raft_handle), + indices_copy.data_handle(), + indices_copy.data_handle() + indices_copy.size(), + indices_view.data_handle()); +} + +void RaftCagra::reset() { + raft_knn_index.reset(); +} + +idx_t RaftCagra::get_knngraph_degree() const { + FAISS_ASSERT(raft_knn_index.has_value()); + return static_cast(raft_knn_index.value().graph_degree()); +} + +std::vector RaftCagra::get_knngraph() const { + FAISS_ASSERT(raft_knn_index.has_value()); + const raft::device_resources& raft_handle = + resources_->getRaftHandleCurrentDevice(); + auto stream = raft_handle.get_stream(); + + auto device_graph = raft_knn_index.value().graph(); + + std::vector host_graph( + device_graph.extent(0) * device_graph.extent(1)); + + raft_handle.sync_stream(); + + thrust::copy( + thrust::device_ptr(device_graph.data_handle()), + thrust::device_ptr( + device_graph.data_handle() + device_graph.size()), + host_graph.data()); + + return host_graph; +} + +std::vector RaftCagra::get_training_dataset() const { + FAISS_ASSERT(raft_knn_index.has_value()); + const 
raft::device_resources& raft_handle = + resources_->getRaftHandleCurrentDevice(); + auto stream = raft_handle.get_stream(); + + auto device_dataset = raft_knn_index.value().dataset(); + + std::vector host_dataset( + device_dataset.extent(0) * device_dataset.extent(1)); + + RAFT_CUDA_TRY(cudaMemcpy2DAsync( + host_dataset.data(), + sizeof(float) * dim_, + device_dataset.data_handle(), + sizeof(float) * device_dataset.stride(0), + sizeof(float) * dim_, + device_dataset.extent(0), + cudaMemcpyDefault, + raft_handle.get_stream())); + raft_handle.sync_stream(); + + return host_dataset; +} + +} // namespace gpu +} // namespace faiss diff --git a/faiss/gpu/impl/RaftCagra.cuh b/faiss/gpu/impl/RaftCagra.cuh new file mode 100644 index 0000000000..95f6c03fca --- /dev/null +++ b/faiss/gpu/impl/RaftCagra.cuh @@ -0,0 +1,132 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include + +#include + +#include +#include + +namespace faiss { + +/// Algorithm used to build underlying CAGRA graph +enum class cagra_build_algo { IVF_PQ, NN_DESCENT }; + +enum class cagra_search_algo { SINGLE_CTA, MULTI_CTA }; + +enum class cagra_hash_mode { HASH, SMALL, AUTO }; + +namespace gpu { + +class RaftCagra { + public: + RaftCagra( + GpuResources* resources, + int dim, + idx_t intermediate_graph_degree, + idx_t graph_degree, + faiss::cagra_build_algo graph_build_algo, + size_t nn_descent_niter, + faiss::MetricType metric, + float metricArg, + IndicesOptions indicesOptions, + std::optional ivf_pq_params = + std::nullopt, + std::optional + ivf_pq_search_params = std::nullopt); + + RaftCagra( + GpuResources* resources, + int dim, + idx_t n, + int graph_degree, + const float* distances, + const idx_t* knn_graph, + faiss::MetricType metric, + float metricArg, + IndicesOptions indicesOptions); + + ~RaftCagra() = default; + + void train(idx_t n, const float* x); + + void search( + Tensor& queries, + int k, + Tensor& outDistances, + Tensor& outIndices, + idx_t max_queries, + idx_t itopk_size, + idx_t max_iterations, + faiss::cagra_search_algo graph_search_algo, + idx_t team_size, + idx_t search_width, + idx_t min_iterations, + idx_t thread_block_size, + faiss::cagra_hash_mode hash_mode, + idx_t hashmap_min_bitlen, + float hashmap_max_fill_rate, + idx_t num_random_samplings, + idx_t rand_xor_mask); + + void reset(); + + idx_t get_knngraph_degree() const; + + std::vector get_knngraph() const; + + std::vector get_training_dataset() const; + + private: + /// Collection of GPU resources that we use + GpuResources* resources_; + + /// Expected dimensionality of the vectors + const int dim_; + + /// Metric type of the index + faiss::MetricType metric_; + + /// Metric arg + float metricArg_; + + /// Parameters to build RAFT CAGRA 
index + raft::neighbors::cagra::index_params index_params_; + + /// Parameters to build CAGRA graph using IVF PQ + std::optional ivf_pq_params_; + std::optional ivf_pq_search_params_; + + /// Instance of trained RAFT CAGRA index + std::optional> + raft_knn_index{std::nullopt}; +}; + +} // namespace gpu +} // namespace faiss diff --git a/faiss/gpu/test/CMakeLists.txt b/faiss/gpu/test/CMakeLists.txt index 9300deead9..60f78ef74f 100644 --- a/faiss/gpu/test/CMakeLists.txt +++ b/faiss/gpu/test/CMakeLists.txt @@ -21,7 +21,6 @@ find_package(CUDAToolkit REQUIRED) # Defines `gtest_discover_tests()`. include(GoogleTest) - add_library(faiss_gpu_test_helper TestUtils.cpp) target_link_libraries(faiss_gpu_test_helper PUBLIC faiss gtest CUDA::cudart $<$:raft::raft> $<$:raft::compiled>) @@ -42,6 +41,9 @@ faiss_gpu_test(TestGpuIndexIVFPQ.cpp) faiss_gpu_test(TestGpuIndexIVFScalarQuantizer.cpp) faiss_gpu_test(TestGpuDistance.cu) faiss_gpu_test(TestGpuSelect.cu) +if(FAISS_ENABLE_RAFT) + faiss_gpu_test(TestGpuIndexCagra.cu) +endif() add_executable(demo_ivfpq_indexing_gpu EXCLUDE_FROM_ALL demo_ivfpq_indexing_gpu.cpp) diff --git a/faiss/gpu/test/TestGpuIndexCagra.cu b/faiss/gpu/test/TestGpuIndexCagra.cu new file mode 100644 index 0000000000..8d330a81cb --- /dev/null +++ b/faiss/gpu/test/TestGpuIndexCagra.cu @@ -0,0 +1,474 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +struct Options { + Options() { + numTrain = 2 * faiss::gpu::randVal(2000, 5000); + dim = faiss::gpu::randVal(4, 10); + numAdd = faiss::gpu::randVal(1000, 3000); + + graphDegree = faiss::gpu::randSelect({32, 64}); + intermediateGraphDegree = faiss::gpu::randSelect({64, 98}); + buildAlgo = faiss::gpu::randSelect( + {faiss::gpu::graph_build_algo::IVF_PQ, + faiss::gpu::graph_build_algo::NN_DESCENT}); + + numQuery = faiss::gpu::randVal(32, 100); + k = faiss::gpu::randVal(10, 30); + + device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); + } + + std::string toString() const { + std::stringstream str; + str << "CAGRA device " << device << " numVecs " << numTrain << " dim " + << dim << " graphDegree " << graphDegree + << " intermediateGraphDegree " << intermediateGraphDegree + << "buildAlgo " << static_cast(buildAlgo) << " numQuery " + << numQuery << " k " << k; + + return str.str(); + } + + int numTrain; + int numAdd; + int dim; + size_t graphDegree; + size_t intermediateGraphDegree; + faiss::gpu::graph_build_algo buildAlgo; + int numQuery; + int k; + int device; +}; + +void queryTest(faiss::MetricType metric, double expected_recall) { + for (int tries = 0; tries < 5; ++tries) { + Options opt; + if (opt.buildAlgo == faiss::gpu::graph_build_algo::NN_DESCENT && + metric == faiss::METRIC_INNER_PRODUCT) { + continue; + } + + std::vector trainVecs = + faiss::gpu::randVecs(opt.numTrain, opt.dim); + if (metric == faiss::METRIC_INNER_PRODUCT) { + faiss::fvec_renorm_L2(opt.numTrain, opt.dim, trainVecs.data()); + } + + // train cpu index + faiss::IndexHNSWFlat cpuIndex(opt.dim, opt.graphDegree / 2, metric); + cpuIndex.hnsw.efConstruction = opt.k * 2; + cpuIndex.add(opt.numTrain, trainVecs.data()); + + // train gpu index + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); + + faiss::gpu::GpuIndexCagraConfig config; + config.device = opt.device; + config.graph_degree = opt.graphDegree; + config.intermediate_graph_degree = opt.intermediateGraphDegree; + config.build_algo = opt.buildAlgo; + + faiss::gpu::GpuIndexCagra gpuIndex(&res, cpuIndex.d, metric, config); + gpuIndex.train(opt.numTrain, trainVecs.data()); + + // query + auto queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); + if (metric == faiss::METRIC_INNER_PRODUCT) { + faiss::fvec_renorm_L2(opt.numQuery, opt.dim, queryVecs.data()); + } + + std::vector refDistance(opt.numQuery * opt.k, 0); + std::vector refIndices(opt.numQuery * opt.k, -1); + faiss::SearchParametersHNSW cpuSearchParams; + cpuSearchParams.efSearch = opt.k * 2; + cpuIndex.search( + opt.numQuery, + queryVecs.data(), + opt.k, + refDistance.data(), + refIndices.data(), + &cpuSearchParams); + + // test quality of searches + auto gpuRes = res.getResources(); + auto devAlloc = faiss::gpu::makeDevAlloc( + faiss::gpu::AllocType::FlatData, + gpuRes->getDefaultStreamCurrentDevice()); + faiss::gpu::DeviceTensor testDistance( + gpuRes.get(), devAlloc, {opt.numQuery, opt.k}); + faiss::gpu::DeviceTensor testIndices( + gpuRes.get(), devAlloc, {opt.numQuery, opt.k}); + gpuIndex.search( + opt.numQuery, + queryVecs.data(), + opt.k, + testDistance.data(), + testIndices.data()); + + auto refDistanceDev = faiss::gpu::toDeviceTemporary( + gpuRes.get(), + refDistance, + gpuRes->getDefaultStreamCurrentDevice()); + auto refIndicesDev = faiss::gpu::toDeviceTemporary( + gpuRes.get(), + refIndices, + 
gpuRes->getDefaultStreamCurrentDevice()); + + auto raft_handle = gpuRes->getRaftHandleCurrentDevice(); + + auto ref_dis_mds = raft::make_device_matrix_view( + refDistanceDev.data(), opt.numQuery, opt.k); + auto ref_dis_mds_opt = + std::optional>( + ref_dis_mds); + auto ref_ind_mds = + raft::make_device_matrix_view( + refIndicesDev.data(), opt.numQuery, opt.k); + + auto test_dis_mds = raft::make_device_matrix_view( + testDistance.data(), opt.numQuery, opt.k); + auto test_dis_mds_opt = + std::optional>( + test_dis_mds); + + auto test_ind_mds = + raft::make_device_matrix_view( + testIndices.data(), opt.numQuery, opt.k); + + double scalar_init = 0; + auto recall_score = raft::make_host_scalar(scalar_init); + + raft::stats::neighborhood_recall( + raft_handle, + test_ind_mds, + ref_ind_mds, + recall_score.view(), + test_dis_mds_opt, + ref_dis_mds_opt); + ASSERT_TRUE(*recall_score.data_handle() > expected_recall); + } +} + +TEST(TestGpuIndexCagra, Float32_Query_L2) { + queryTest(faiss::METRIC_L2, 0.98); +} + +TEST(TestGpuIndexCagra, Float32_Query_IP) { + queryTest(faiss::METRIC_INNER_PRODUCT, 0.98); +} + +void copyToTest( + faiss::MetricType metric, + double expected_recall, + bool base_level_only) { + for (int tries = 0; tries < 5; ++tries) { + Options opt; + if (opt.buildAlgo == faiss::gpu::graph_build_algo::NN_DESCENT && + metric == faiss::METRIC_INNER_PRODUCT) { + continue; + } + + std::vector trainVecs = + faiss::gpu::randVecs(opt.numTrain, opt.dim); + if (metric == faiss::METRIC_INNER_PRODUCT) { + faiss::fvec_renorm_L2(opt.numTrain, opt.dim, trainVecs.data()); + } + std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); + if (metric == faiss::METRIC_INNER_PRODUCT) { + faiss::fvec_renorm_L2(opt.numAdd, opt.dim, addVecs.data()); + } + + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); + + // train gpu index and copy to cpu index + faiss::gpu::GpuIndexCagraConfig config; + config.device = opt.device; + config.graph_degree = opt.graphDegree; + config.intermediate_graph_degree = opt.intermediateGraphDegree; + config.build_algo = opt.buildAlgo; + + faiss::gpu::GpuIndexCagra gpuIndex(&res, opt.dim, metric, config); + gpuIndex.train(opt.numTrain, trainVecs.data()); + + faiss::IndexHNSWCagra copiedCpuIndex( + opt.dim, opt.graphDegree / 2, metric); + copiedCpuIndex.base_level_only = base_level_only; + gpuIndex.copyTo(&copiedCpuIndex); + copiedCpuIndex.hnsw.efConstruction = opt.k * 2; + + // add more vecs to copied cpu index + if (!base_level_only) { + copiedCpuIndex.add(opt.numAdd, addVecs.data()); + } + + // train cpu index + faiss::IndexHNSWFlat cpuIndex(opt.dim, opt.graphDegree / 2, metric); + cpuIndex.hnsw.efConstruction = opt.k * 2; + cpuIndex.add(opt.numTrain, trainVecs.data()); + + // add more vecs to cpu index + if (!base_level_only) { + cpuIndex.add(opt.numAdd, addVecs.data()); + } + + // query indexes + auto queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); + if (metric == faiss::METRIC_INNER_PRODUCT) { + faiss::fvec_renorm_L2(opt.numQuery, opt.dim, queryVecs.data()); + } + + std::vector refDistance(opt.numQuery * opt.k, 0); + std::vector refIndices(opt.numQuery * opt.k, -1); + faiss::SearchParametersHNSW cpuSearchParams; + cpuSearchParams.efSearch = opt.k * 2; + cpuIndex.search( + opt.numQuery, + queryVecs.data(), + opt.k, + refDistance.data(), + refIndices.data(), + &cpuSearchParams); + + std::vector copyRefDistance(opt.numQuery * opt.k, 0); + std::vector copyRefIndices(opt.numQuery * opt.k, -1); + faiss::SearchParametersHNSW cpuSearchParamstwo; + 
cpuSearchParamstwo.efSearch = opt.k * 2; + copiedCpuIndex.search( + opt.numQuery, + queryVecs.data(), + opt.k, + copyRefDistance.data(), + copyRefIndices.data(), + &cpuSearchParamstwo); + + // test quality of search + auto gpuRes = res.getResources(); + + auto refDistanceDev = faiss::gpu::toDeviceTemporary( + gpuRes.get(), + refDistance, + gpuRes->getDefaultStreamCurrentDevice()); + auto refIndicesDev = faiss::gpu::toDeviceTemporary( + gpuRes.get(), + refIndices, + gpuRes->getDefaultStreamCurrentDevice()); + + auto copyRefDistanceDev = faiss::gpu::toDeviceTemporary( + gpuRes.get(), + copyRefDistance, + gpuRes->getDefaultStreamCurrentDevice()); + auto copyRefIndicesDev = faiss::gpu::toDeviceTemporary( + gpuRes.get(), + copyRefIndices, + gpuRes->getDefaultStreamCurrentDevice()); + + auto raft_handle = gpuRes->getRaftHandleCurrentDevice(); + + auto ref_dis_mds = raft::make_device_matrix_view( + refDistanceDev.data(), opt.numQuery, opt.k); + auto ref_dis_mds_opt = + std::optional>( + ref_dis_mds); + auto ref_ind_mds = + raft::make_device_matrix_view( + refIndicesDev.data(), opt.numQuery, opt.k); + + auto copy_ref_dis_mds = raft::make_device_matrix_view( + copyRefDistanceDev.data(), opt.numQuery, opt.k); + auto copy_ref_dis_mds_opt = + std::optional>( + copy_ref_dis_mds); + auto copy_ref_ind_mds = + raft::make_device_matrix_view( + copyRefIndicesDev.data(), opt.numQuery, opt.k); + + double scalar_init = 0; + auto recall_score = raft::make_host_scalar(scalar_init); + + raft::stats::neighborhood_recall( + raft_handle, + copy_ref_ind_mds, + ref_ind_mds, + recall_score.view(), + copy_ref_dis_mds_opt, + ref_dis_mds_opt); + ASSERT_TRUE(*recall_score.data_handle() > expected_recall); + } +} + +TEST(TestGpuIndexCagra, Float32_CopyTo_L2) { + copyToTest(faiss::METRIC_L2, 0.98, false); +} + +TEST(TestGpuIndexCagra, Float32_CopyTo_L2_BaseLevelOnly) { + copyToTest(faiss::METRIC_L2, 0.98, true); +} + +TEST(TestGpuIndexCagra, Float32_CopyTo_IP) { + copyToTest(faiss::METRIC_INNER_PRODUCT, 0.98, false); +} + +TEST(TestGpuIndexCagra, Float32_CopyTo_IP_BaseLevelOnly) { + copyToTest(faiss::METRIC_INNER_PRODUCT, 0.98, true); +} + +void copyFromTest(faiss::MetricType metric, double expected_recall) { + for (int tries = 0; tries < 5; ++tries) { + Options opt; + if (opt.buildAlgo == faiss::gpu::graph_build_algo::NN_DESCENT && + metric == faiss::METRIC_INNER_PRODUCT) { + continue; + } + + std::vector trainVecs = + faiss::gpu::randVecs(opt.numTrain, opt.dim); + if (metric == faiss::METRIC_INNER_PRODUCT) { + faiss::fvec_renorm_L2(opt.numTrain, opt.dim, trainVecs.data()); + } + + // train cpu index + faiss::IndexHNSWCagra cpuIndex(opt.dim, opt.graphDegree / 2, metric); + cpuIndex.hnsw.efConstruction = opt.k * 2; + cpuIndex.add(opt.numTrain, trainVecs.data()); + + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); + + // convert to gpu index + faiss::gpu::GpuIndexCagra copiedGpuIndex(&res, cpuIndex.d, metric); + copiedGpuIndex.copyFrom(&cpuIndex); + + // train gpu index + faiss::gpu::GpuIndexCagraConfig config; + config.device = opt.device; + config.graph_degree = opt.graphDegree; + config.intermediate_graph_degree = opt.intermediateGraphDegree; + config.build_algo = opt.buildAlgo; + + faiss::gpu::GpuIndexCagra gpuIndex(&res, opt.dim, metric, config); + gpuIndex.train(opt.numTrain, trainVecs.data()); + + // query + auto queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); + if (metric == faiss::METRIC_INNER_PRODUCT) { + faiss::fvec_renorm_L2(opt.numQuery, opt.dim, queryVecs.data()); + } + + auto gpuRes = 
res.getResources(); + auto devAlloc = faiss::gpu::makeDevAlloc( + faiss::gpu::AllocType::FlatData, + gpuRes->getDefaultStreamCurrentDevice()); + faiss::gpu::DeviceTensor copyTestDistance( + gpuRes.get(), devAlloc, {opt.numQuery, opt.k}); + faiss::gpu::DeviceTensor copyTestIndices( + gpuRes.get(), devAlloc, {opt.numQuery, opt.k}); + copiedGpuIndex.search( + opt.numQuery, + queryVecs.data(), + opt.k, + copyTestDistance.data(), + copyTestIndices.data()); + + faiss::gpu::DeviceTensor testDistance( + gpuRes.get(), devAlloc, {opt.numQuery, opt.k}); + faiss::gpu::DeviceTensor testIndices( + gpuRes.get(), devAlloc, {opt.numQuery, opt.k}); + gpuIndex.search( + opt.numQuery, + queryVecs.data(), + opt.k, + testDistance.data(), + testIndices.data()); + + // test quality of searches + auto raft_handle = gpuRes->getRaftHandleCurrentDevice(); + + auto test_dis_mds = raft::make_device_matrix_view( + testDistance.data(), opt.numQuery, opt.k); + auto test_dis_mds_opt = + std::optional>( + test_dis_mds); + + auto test_ind_mds = + raft::make_device_matrix_view( + testIndices.data(), opt.numQuery, opt.k); + + auto copy_test_dis_mds = + raft::make_device_matrix_view( + copyTestDistance.data(), opt.numQuery, opt.k); + auto copy_test_dis_mds_opt = + std::optional>( + copy_test_dis_mds); + + auto copy_test_ind_mds = + raft::make_device_matrix_view( + copyTestIndices.data(), opt.numQuery, opt.k); + + double scalar_init = 0; + auto recall_score = raft::make_host_scalar(scalar_init); + + raft::stats::neighborhood_recall( + raft_handle, + copy_test_ind_mds, + test_ind_mds, + recall_score.view(), + copy_test_dis_mds_opt, + test_dis_mds_opt); + ASSERT_TRUE(*recall_score.data_handle() > expected_recall); + } +} + +TEST(TestGpuIndexCagra, Float32_CopyFrom_L2) { + copyFromTest(faiss::METRIC_L2, 0.98); +} + +TEST(TestGpuIndexCagra, Float32_CopyFrom_IP) { + copyFromTest(faiss::METRIC_INNER_PRODUCT, 0.98); +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + + // just run with a fixed test seed + faiss::gpu::setTestSeed(100); + + return RUN_ALL_TESTS(); +} diff --git a/faiss/gpu/test/test_cagra.py b/faiss/gpu/test/test_cagra.py new file mode 100644 index 0000000000..dd7d09f2de --- /dev/null +++ b/faiss/gpu/test/test_cagra.py @@ -0,0 +1,71 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. 
+ +import unittest + +import faiss +import numpy as np + +from faiss.contrib import datasets, evaluation + + +@unittest.skipIf( + "RAFT" not in faiss.get_compile_options(), + "only if RAFT is compiled in") +class TestComputeGT(unittest.TestCase): + + def do_compute_GT(self, metric): + d = 64 + k = 12 + ds = datasets.SyntheticDataset(d, 0, 10000, 100) + Dref, Iref = faiss.knn(ds.get_queries(), ds.get_database(), k, metric) + + res = faiss.StandardGpuResources() + + index = faiss.GpuIndexCagra(res, d, metric) + index.train(ds.get_database()) + Dnew, Inew = index.search(ds.get_queries(), k) + + evaluation.check_ref_knn_with_draws(Dref, Iref, Dnew, Inew, k) + + def test_compute_GT_L2(self): + self.do_compute_GT(faiss.METRIC_L2) + + def test_compute_GT_IP(self): + self.do_compute_GT(faiss.METRIC_INNER_PRODUCT) + +@unittest.skipIf( + "RAFT" not in faiss.get_compile_options(), + "only if RAFT is compiled in") +class TestInterop(unittest.TestCase): + + def do_interop(self, metric): + d = 64 + k = 12 + ds = datasets.SyntheticDataset(d, 0, 10000, 100) + + res = faiss.StandardGpuResources() + + index = faiss.GpuIndexCagra(res, d, metric) + index.train(ds.get_database()) + Dnew, Inew = index.search(ds.get_queries(), k) + + cpu_index = faiss.index_gpu_to_cpu(index) + Dref, Iref = cpu_index.search(ds.get_queries(), k) + + evaluation.check_ref_knn_with_draws(Dref, Iref, Dnew, Inew, k) + + faiss.write_index(cpu_index, "index_hnsw_cagra.index") + deserialized_index = faiss.read_index("index_hnsw_cagra.index") + gpu_index = faiss.index_cpu_to_gpu(res, 0, deserialized_index) + Dnew2, Inew2 = gpu_index.search(ds.get_queries(), k) + + evaluation.check_ref_knn_with_draws(Dnew2, Inew2, Dnew, Inew, k) + + def test_interop_L2(self): + self.do_interop(faiss.METRIC_L2) + + def test_interop_IP(self): + self.do_interop(faiss.METRIC_INNER_PRODUCT) diff --git a/faiss/impl/HNSW.cpp b/faiss/impl/HNSW.cpp index d8c8225968..3ba5f72f68 100644 --- a/faiss/impl/HNSW.cpp +++ b/faiss/impl/HNSW.cpp @@ -7,6 +7,7 @@ #include +#include #include #include @@ -215,8 +216,8 @@ int HNSW::prepare_level_tab(size_t n, bool preset_levels) { if (pt_level > max_level) max_level = pt_level; offsets.push_back(offsets.back() + cum_nb_neighbors(pt_level + 1)); - neighbors.resize(offsets.back(), -1); } + neighbors.resize(offsets.back(), -1); return max_level; } @@ -229,7 +230,14 @@ void HNSW::shrink_neighbor_list( DistanceComputer& qdis, std::priority_queue& input, std::vector& output, - int max_size) { + int max_size, + bool keep_max_size_level0) { + // This prevents number of neighbors at + // level 0 from being shrunk to less than 2 * M. 
+ // This is essential in making sure + // `faiss::gpu::GpuIndexCagra::copyFrom(IndexHNSWCagra*)` is functional + std::vector outsiders; + while (input.size() > 0) { NodeDistFarther v1 = input.top(); input.pop(); @@ -250,8 +258,15 @@ void HNSW::shrink_neighbor_list( if (output.size() >= max_size) { return; } + } else if (keep_max_size_level0) { + outsiders.push_back(v1); } } + size_t idx = 0; + while (keep_max_size_level0 && (output.size() < max_size) && + (idx < outsiders.size())) { + output.push_back(outsiders[idx++]); + } } namespace { @@ -268,7 +283,8 @@ using NodeDistFarther = HNSW::NodeDistFarther; void shrink_neighbor_list( DistanceComputer& qdis, std::priority_queue& resultSet1, - int max_size) { + int max_size, + bool keep_max_size_level0 = false) { if (resultSet1.size() < max_size) { return; } @@ -280,7 +296,8 @@ void shrink_neighbor_list( resultSet1.pop(); } - HNSW::shrink_neighbor_list(qdis, resultSet, returnlist, max_size); + HNSW::shrink_neighbor_list( + qdis, resultSet, returnlist, max_size, keep_max_size_level0); for (NodeDistFarther curen2 : returnlist) { resultSet1.emplace(curen2.d, curen2.id); @@ -294,7 +311,8 @@ void add_link( DistanceComputer& qdis, storage_idx_t src, storage_idx_t dest, - int level) { + int level, + bool keep_max_size_level0 = false) { size_t begin, end; hnsw.neighbor_range(src, level, &begin, &end); if (hnsw.neighbors[end - 1] == -1) { @@ -319,7 +337,7 @@ void add_link( resultSet.emplace(qdis.symmetric_dis(src, neigh), neigh); } - shrink_neighbor_list(qdis, resultSet, end - begin); + shrink_neighbor_list(qdis, resultSet, end - begin, keep_max_size_level0); // ...and back size_t i = begin; @@ -429,7 +447,8 @@ void HNSW::add_links_starting_from( float d_nearest, int level, omp_lock_t* locks, - VisitedTable& vt) { + VisitedTable& vt, + bool keep_max_size_level0) { std::priority_queue link_targets; search_neighbors_to_add( @@ -438,13 +457,13 @@ void HNSW::add_links_starting_from( // but we can afford only this many neighbors int M = nb_neighbors(level); - ::faiss::shrink_neighbor_list(ptdis, link_targets, M); + ::faiss::shrink_neighbor_list(ptdis, link_targets, M, keep_max_size_level0); std::vector neighbors; neighbors.reserve(link_targets.size()); while (!link_targets.empty()) { storage_idx_t other_id = link_targets.top().id; - add_link(*this, ptdis, pt_id, other_id, level); + add_link(*this, ptdis, pt_id, other_id, level, keep_max_size_level0); neighbors.push_back(other_id); link_targets.pop(); } @@ -452,7 +471,7 @@ void HNSW::add_links_starting_from( omp_unset_lock(&locks[pt_id]); for (storage_idx_t other_id : neighbors) { omp_set_lock(&locks[other_id]); - add_link(*this, ptdis, other_id, pt_id, level); + add_link(*this, ptdis, other_id, pt_id, level, keep_max_size_level0); omp_unset_lock(&locks[other_id]); } omp_set_lock(&locks[pt_id]); @@ -467,7 +486,8 @@ void HNSW::add_with_locks( int pt_level, int pt_id, std::vector& locks, - VisitedTable& vt) { + VisitedTable& vt, + bool keep_max_size_level0) { // greedy search on upper levels storage_idx_t nearest; @@ -496,7 +516,14 @@ void HNSW::add_with_locks( for (; level >= 0; level--) { add_links_starting_from( - ptdis, pt_id, nearest, d_nearest, level, locks.data(), vt); + ptdis, + pt_id, + nearest, + d_nearest, + level, + locks.data(), + vt, + keep_max_size_level0); } omp_unset_lock(&locks[pt_id]); @@ -910,9 +937,12 @@ void HNSW::search_level_0( const float* nearest_d, int search_type, HNSWStats& search_stats, - VisitedTable& vt) const { + VisitedTable& vt, + const SearchParametersHNSW* params) const { 
const HNSW& hnsw = *this; + auto efSearch = params ? params->efSearch : hnsw.efSearch; int k = extract_k_from_ResultHandler(res); + if (search_type == 1) { int nres = 0; @@ -925,16 +955,24 @@ void HNSW::search_level_0( if (vt.get(cj)) continue; - int candidates_size = std::max(hnsw.efSearch, k); + int candidates_size = std::max(efSearch, k); MinimaxHeap candidates(candidates_size); candidates.push(cj, nearest_d[j]); nres = search_from_candidates( - hnsw, qdis, res, candidates, vt, search_stats, 0, nres); + hnsw, + qdis, + res, + candidates, + vt, + search_stats, + 0, + nres, + params); } } else if (search_type == 2) { - int candidates_size = std::max(hnsw.efSearch, int(k)); + int candidates_size = std::max(efSearch, int(k)); candidates_size = std::max(candidates_size, int(nprobe)); MinimaxHeap candidates(candidates_size); @@ -947,7 +985,7 @@ void HNSW::search_level_0( } search_from_candidates( - hnsw, qdis, res, candidates, vt, search_stats, 0); + hnsw, qdis, res, candidates, vt, search_stats, 0, 0, params); } } diff --git a/faiss/impl/HNSW.h b/faiss/impl/HNSW.h index 8261423cdd..f3aacf8a5b 100644 --- a/faiss/impl/HNSW.h +++ b/faiss/impl/HNSW.h @@ -184,7 +184,8 @@ struct HNSW { float d_nearest, int level, omp_lock_t* locks, - VisitedTable& vt); + VisitedTable& vt, + bool keep_max_size_level0 = false); /** add point pt_id on all levels <= pt_level and build the link * structure for them. */ @@ -193,7 +194,8 @@ struct HNSW { int pt_level, int pt_id, std::vector& locks, - VisitedTable& vt); + VisitedTable& vt, + bool keep_max_size_level0 = false); /// search interface for 1 point, single thread HNSWStats search( @@ -211,7 +213,8 @@ struct HNSW { const float* nearest_d, int search_type, HNSWStats& search_stats, - VisitedTable& vt) const; + VisitedTable& vt, + const SearchParametersHNSW* params = nullptr) const; void reset(); @@ -224,7 +227,8 @@ struct HNSW { DistanceComputer& qdis, std::priority_queue& input, std::vector& output, - int max_size); + int max_size, + bool keep_max_size_level0 = false); void permute_entries(const idx_t* map); }; diff --git a/faiss/impl/index_read.cpp b/faiss/impl/index_read.cpp index ce4b1e76b1..aa041c0fac 100644 --- a/faiss/impl/index_read.cpp +++ b/faiss/impl/index_read.cpp @@ -950,7 +950,7 @@ Index* read_index(IOReader* f, int io_flags) { idx = idxp; } else if ( h == fourcc("IHNf") || h == fourcc("IHNp") || h == fourcc("IHNs") || - h == fourcc("IHN2")) { + h == fourcc("IHN2") || h == fourcc("IHNc")) { IndexHNSW* idxhnsw = nullptr; if (h == fourcc("IHNf")) idxhnsw = new IndexHNSWFlat(); @@ -960,7 +960,15 @@ Index* read_index(IOReader* f, int io_flags) { idxhnsw = new IndexHNSWSQ(); if (h == fourcc("IHN2")) idxhnsw = new IndexHNSW2Level(); + if (h == fourcc("IHNc")) + idxhnsw = new IndexHNSWCagra(); read_index_header(idxhnsw, f); + if (h == fourcc("IHNc")) { + READ1(idxhnsw->keep_max_size_level0); + auto idx_hnsw_cagra = dynamic_cast(idxhnsw); + READ1(idx_hnsw_cagra->base_level_only); + READ1(idx_hnsw_cagra->num_base_level_search_entrypoints); + } read_HNSW(&idxhnsw->hnsw, f); idxhnsw->storage = read_index(f, io_flags); idxhnsw->own_fields = idxhnsw->storage != nullptr; diff --git a/faiss/impl/index_write.cpp b/faiss/impl/index_write.cpp index 01e5ae7257..0a924d0225 100644 --- a/faiss/impl/index_write.cpp +++ b/faiss/impl/index_write.cpp @@ -762,10 +762,17 @@ void write_index(const Index* idx, IOWriter* f, int io_flags) { : dynamic_cast(idx) ? fourcc("IHNp") : dynamic_cast(idx) ? fourcc("IHNs") : dynamic_cast(idx) ? fourcc("IHN2") + : dynamic_cast(idx) ? 
fourcc("IHNc") : 0; FAISS_THROW_IF_NOT(h != 0); WRITE1(h); write_index_header(idxhnsw, f); + if (h == fourcc("IHNc")) { + WRITE1(idxhnsw->keep_max_size_level0); + auto idx_hnsw_cagra = dynamic_cast(idxhnsw); + WRITE1(idx_hnsw_cagra->base_level_only); + WRITE1(idx_hnsw_cagra->num_base_level_search_entrypoints); + } write_HNSW(&idxhnsw->hnsw, f); if (io_flags & IO_FLAG_SKIP_STORAGE) { uint32_t n4 = fourcc("null"); diff --git a/faiss/python/CMakeLists.txt b/faiss/python/CMakeLists.txt index dee8c7762e..0073c20e04 100644 --- a/faiss/python/CMakeLists.txt +++ b/faiss/python/CMakeLists.txt @@ -38,6 +38,11 @@ macro(configure_swigfaiss source) set_source_files_properties(${source} PROPERTIES COMPILE_DEFINITIONS GPU_WRAPPER ) + if (FAISS_ENABLE_RAFT) + set_property(SOURCE ${source} APPEND PROPERTY + COMPILE_DEFINITIONS FAISS_ENABLE_RAFT + ) + endif() endif() endmacro() diff --git a/faiss/python/swigfaiss.swig b/faiss/python/swigfaiss.swig index 85e04d322c..74a371f6cd 100644 --- a/faiss/python/swigfaiss.swig +++ b/faiss/python/swigfaiss.swig @@ -304,6 +304,7 @@ void gpu_sync_all_devices(); #include #include #include +#include #include #include #include @@ -557,6 +558,9 @@ struct faiss::simd16uint16 {}; %include %include %include +#ifdef FAISS_ENABLE_RAFT +%include +#endif %include %include %include @@ -673,6 +677,9 @@ struct faiss::simd16uint16 {}; DOWNCAST ( IndexRowwiseMinMax ) DOWNCAST ( IndexRowwiseMinMaxFP16 ) #ifdef GPU_WRAPPER +#ifdef FAISS_ENABLE_RAFT + DOWNCAST_GPU ( GpuIndexCagra ) +#endif DOWNCAST_GPU ( GpuIndexIVFPQ ) DOWNCAST_GPU ( GpuIndexIVFFlat ) DOWNCAST_GPU ( GpuIndexIVFScalarQuantizer ) From f71d5b9aa2c37d24c63a5ef4035294f2eeed1e25 Mon Sep 17 00:00:00 2001 From: Matthijs Douze Date: Tue, 11 Jun 2024 08:14:48 -0700 Subject: [PATCH 108/116] fix spurious include to land the cagra diff (#3502) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3502 include probably added by vscode Reviewed By: mengdilin Differential Revision: D58411537 fbshipit-source-id: 3035f690d26decc937fb492c54ffa2f974ee2db8 --- CMakeLists.txt | 2 -- faiss/IndexHNSW.cpp | 1 - faiss/gpu/test/test_cagra.py | 6 +++--- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1a468fb247..cedee9c456 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -46,8 +46,6 @@ project(faiss LANGUAGES ${FAISS_LANGUAGES}) include(GNUInstallDirs) -set(CMAKE_INSTALL_PREFIX "$ENV{CONDA_PREFIX}") - set(CMAKE_CXX_STANDARD 17) list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake") diff --git a/faiss/IndexHNSW.cpp b/faiss/IndexHNSW.cpp index c04642d218..fd80b87df7 100644 --- a/faiss/IndexHNSW.cpp +++ b/faiss/IndexHNSW.cpp @@ -24,7 +24,6 @@ #include #include #include -#include "impl/HNSW.h" #include #include diff --git a/faiss/gpu/test/test_cagra.py b/faiss/gpu/test/test_cagra.py index dd7d09f2de..4c7e532c2b 100644 --- a/faiss/gpu/test/test_cagra.py +++ b/faiss/gpu/test/test_cagra.py @@ -6,7 +6,6 @@ import unittest import faiss -import numpy as np from faiss.contrib import datasets, evaluation @@ -57,8 +56,9 @@ def do_interop(self, metric): evaluation.check_ref_knn_with_draws(Dref, Iref, Dnew, Inew, k) - faiss.write_index(cpu_index, "index_hnsw_cagra.index") - deserialized_index = faiss.read_index("index_hnsw_cagra.index") + deserialized_index = faiss.deserialize_index( + faiss.serialize_index(cpu_index)) + gpu_index = faiss.index_cpu_to_gpu(res, 0, deserialized_index) Dnew2, Inew2 = gpu_index.search(ds.get_queries(), k) From 3d32330e3d00f6f89d3680f44c86cc4ec602a105 
Mon Sep 17 00:00:00 2001 From: Gergely Szilvasy Date: Wed, 12 Jun 2024 19:19:23 -0700 Subject: [PATCH 109/116] add use_raft to knn_gpu (torch) (#3509) Summary: Add support for `use_raft` in the torch version of `knn_gpu`. The numpy version already has this support, see https://github.com/facebookresearch/faiss/blob/main/faiss/python/gpu_wrappers.py#L59 Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3509 Reviewed By: mlomeli1, junjieqi Differential Revision: D58489851 Pulled By: algoriddle fbshipit-source-id: cfad722fefd4809b135b765d0d43587cfd782d0e --- contrib/torch_utils.py | 4 +++- faiss/gpu/test/torch_test_contrib_gpu.py | 20 +++++++++++++------- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/contrib/torch_utils.py b/contrib/torch_utils.py index e371932c9f..18f136e914 100644 --- a/contrib/torch_utils.py +++ b/contrib/torch_utils.py @@ -492,8 +492,9 @@ def torch_replacement_sa_decode(self, codes, x=None): if issubclass(the_class, faiss.Index): handle_torch_Index(the_class) + # allows torch tensor usage with bfKnn -def torch_replacement_knn_gpu(res, xq, xb, k, D=None, I=None, metric=faiss.METRIC_L2, device=-1): +def torch_replacement_knn_gpu(res, xq, xb, k, D=None, I=None, metric=faiss.METRIC_L2, device=-1, use_raft=False): if type(xb) is np.ndarray: # Forward to faiss __init__.py base method return faiss.knn_gpu_numpy(res, xq, xb, k, D, I, metric, device) @@ -574,6 +575,7 @@ def torch_replacement_knn_gpu(res, xq, xb, k, D=None, I=None, metric=faiss.METRI args.outIndices = I_ptr args.outIndicesType = I_type args.device = device + args.use_raft = use_raft with using_stream(res): faiss.bfKnn(res, args) diff --git a/faiss/gpu/test/torch_test_contrib_gpu.py b/faiss/gpu/test/torch_test_contrib_gpu.py index 0c949c29f2..f7444337f1 100644 --- a/faiss/gpu/test/torch_test_contrib_gpu.py +++ b/faiss/gpu/test/torch_test_contrib_gpu.py @@ -249,7 +249,7 @@ def test_sa_encode_decode(self): return class TestTorchUtilsKnnGpu(unittest.TestCase): - def test_knn_gpu(self): + def test_knn_gpu(self, use_raft=False): torch.manual_seed(10) d = 32 nb = 1024 @@ -286,7 +286,7 @@ def test_knn_gpu(self): else: xb_c = xb_np - D, I = faiss.knn_gpu(res, xq_c, xb_c, k) + D, I = faiss.knn_gpu(res, xq_c, xb_c, k, use_raft=use_raft) self.assertTrue(torch.equal(torch.from_numpy(I), gt_I)) self.assertLess((torch.from_numpy(D) - gt_D).abs().max(), 1e-4) @@ -312,7 +312,7 @@ def test_knn_gpu(self): xb_c = to_column_major_torch(xb) assert not xb_c.is_contiguous() - D, I = faiss.knn_gpu(res, xq_c, xb_c, k) + D, I = faiss.knn_gpu(res, xq_c, xb_c, k, use_raft=use_raft) self.assertTrue(torch.equal(I.cpu(), gt_I)) self.assertLess((D.cpu() - gt_D).abs().max(), 1e-4) @@ -320,7 +320,7 @@ def test_knn_gpu(self): # test on subset try: # This internally uses the current pytorch stream - D, I = faiss.knn_gpu(res, xq_c[6:8], xb_c, k) + D, I = faiss.knn_gpu(res, xq_c[6:8], xb_c, k, use_raft=use_raft) except TypeError: if not xq_row_major: # then it is expected @@ -331,7 +331,13 @@ def test_knn_gpu(self): self.assertTrue(torch.equal(I.cpu(), gt_I[6:8])) self.assertLess((D.cpu() - gt_D[6:8]).abs().max(), 1e-4) - def test_knn_gpu_datatypes(self): + @unittest.skipUnless( + "RAFT" in faiss.get_compile_options(), + "only if RAFT is compiled in") + def test_knn_gpu_raft(self): + self.test_knn_gpu(use_raft=True) + + def test_knn_gpu_datatypes(self, use_raft=False): torch.manual_seed(10) d = 10 nb = 1024 @@ -354,7 +360,7 @@ def test_knn_gpu_datatypes(self): D = torch.zeros(nq, k, device=xb_c.device, 
dtype=torch.float32) I = torch.zeros(nq, k, device=xb_c.device, dtype=torch.int32) - faiss.knn_gpu(res, xq_c, xb_c, k, D, I) + faiss.knn_gpu(res, xq_c, xb_c, k, D, I, use_raft=use_raft) self.assertTrue(torch.equal(I.long().cpu(), gt_I)) self.assertLess((D.float().cpu() - gt_D).abs().max(), 1.5e-3) @@ -366,7 +372,7 @@ def test_knn_gpu_datatypes(self): xb_c = xb.half().numpy() xq_c = xq.half().numpy() - faiss.knn_gpu(res, xq_c, xb_c, k, D, I) + faiss.knn_gpu(res, xq_c, xb_c, k, D, I, use_raft=use_raft) self.assertTrue(torch.equal(torch.from_numpy(I).long(), gt_I)) self.assertLess((torch.from_numpy(D) - gt_D).abs().max(), 1.5e-3) From d45f78b1e14867086faebeb1d3b7bedbc79c644b Mon Sep 17 00:00:00 2001 From: Ramil Bakhshyiev Date: Thu, 13 Jun 2024 08:14:38 -0700 Subject: [PATCH 110/116] Add conda bin to path early in the cmake GitHub action (#3512) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3512 Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3510 GitHub hosted runners some with the build-essentials package pre-installed, self-hosted runners on AWS do not have this package. This made it all steps other than the `all targets` one fall back to the system executables which unintentially worked on GitHub hosted runners but not on the self-hosted ones. This diff fixes it by pulling the line that adds conda bin to path early in the cmake build action. Reviewed By: asadoughi Differential Revision: D58513853 fbshipit-source-id: 23e95459e0031c96bd142515db07d1b700d713cf --- .github/actions/build_cmake/action.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/actions/build_cmake/action.yml b/.github/actions/build_cmake/action.yml index cd023aaca7..6e21f785ea 100644 --- a/.github/actions/build_cmake/action.yml +++ b/.github/actions/build_cmake/action.yml @@ -30,6 +30,7 @@ runs: run: | conda install -y -q -c conda-forge gxx_linux-64=11.2 sysroot_linux-64=2.28 conda install -y -q python=3.11 cmake make swig mkl=2023 mkl-devel=2023 numpy scipy pytest + echo "$CONDA/bin" >> $GITHUB_PATH - name: Install CUDA if: inputs.gpu == 'ON' && inputs.raft == 'OFF' shell: bash @@ -72,7 +73,6 @@ runs: shell: bash run: | conda install -y pytest - echo "$CONDA/bin" >> $GITHUB_PATH - name: Python tests (CPU only) if: inputs.gpu == 'OFF' shell: bash From 34feae48d47ca0e8d097d3672168675d903584c3 Mon Sep 17 00:00:00 2001 From: Gergely Szilvasy Date: Thu, 13 Jun 2024 13:31:34 -0700 Subject: [PATCH 111/116] typo in test_io_no_storage (#3515) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3515 Fix typo `test_io_no_storage` Reviewed By: kuarora, asadoughi Differential Revision: D58540190 fbshipit-source-id: b8b9cacd7ea6005c0edb94014de74188450318c1 --- tests/test_graph_based.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_graph_based.py b/tests/test_graph_based.py index 95925d7ae9..c769e03ade 100644 --- a/tests/test_graph_based.py +++ b/tests/test_graph_based.py @@ -144,7 +144,7 @@ def test_io_no_storage(self): index2 = faiss.deserialize_index( faiss.serialize_index(index, faiss.IO_FLAG_SKIP_STORAGE) ) - self.assertEquals(index2.storage, None) + self.assertEqual(index2.storage, None) self.assertRaises( RuntimeError, index2.search, self.xb, 1) From 44d21eedb623fa0ef244e86b6fa38fb27b771aa5 Mon Sep 17 00:00:00 2001 From: Ramil Bakhshyiev Date: Thu, 13 Jun 2024 22:30:28 -0700 Subject: [PATCH 112/116] Consolidate build environment configuration steps in cmake builds (#3516) Summary: Pull 
Request resolved: https://github.com/facebookresearch/faiss/pull/3516 This diff seeks to simplify the steps that install conda packages and environment configuration into a single step at the start of the cmake build action. Reviewed By: mnorris11 Differential Revision: D58560454 fbshipit-source-id: ee2c6b36865809f31eb335cfb3c2fffdccaa318d --- .github/actions/build_cmake/action.yml | 44 ++++++++++++++------------ 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/.github/actions/build_cmake/action.yml b/.github/actions/build_cmake/action.yml index 6e21f785ea..2bc476add5 100644 --- a/.github/actions/build_cmake/action.yml +++ b/.github/actions/build_cmake/action.yml @@ -20,27 +20,35 @@ runs: with: python-version: '3.11' miniconda-version: latest - - name: Initialize Conda environment + - name: Configure build environment shell: bash run: | + # initialize Conda conda config --set solver libmamba conda update -y -q conda - - name: Configure Conda environment - shell: bash - run: | + echo "$CONDA/bin" >> $GITHUB_PATH + + # install base packages conda install -y -q -c conda-forge gxx_linux-64=11.2 sysroot_linux-64=2.28 conda install -y -q python=3.11 cmake make swig mkl=2023 mkl-devel=2023 numpy scipy pytest - echo "$CONDA/bin" >> $GITHUB_PATH - - name: Install CUDA - if: inputs.gpu == 'ON' && inputs.raft == 'OFF' - shell: bash - run: | - conda install -y -q cuda-toolkit -c "nvidia/label/cuda-11.8.0" - - name: Install RAFT - if: inputs.raft == 'ON' - shell: bash - run: | - conda install -y -q libraft cuda-version=11.8 cuda-toolkit -c rapidsai-nightly -c "nvidia/label/cuda-11.8.0" -c conda-forge + + # install CUDA packages + if [ "${{ inputs.gpu }}" = "ON" ] && [ "${{ inputs.raft }}" = "OFF" ]; then + conda install -y -q cuda-toolkit -c "nvidia/label/cuda-11.8.0" + fi + + # install RAFT packages + if [ "${{ inputs.raft }}" = "ON" ]; then + conda install -y -q libraft cuda-version=11.8 cuda-toolkit -c rapidsai-nightly -c "nvidia/label/cuda-11.8.0" -c conda-forge + fi + + # install test packages + conda install -y pytest + if [ "${{ inputs.gpu }}" = "ON" ]; then + conda install -y -q pytorch pytorch-cuda=11.8 -c pytorch -c nvidia/label/cuda-11.8.0 + else + conda install -y -q pytorch -c pytorch + fi - name: Build all targets shell: bash run: | @@ -69,22 +77,16 @@ runs: working-directory: build/faiss/python run: | $CONDA/bin/python setup.py install - - name: Install pytest - shell: bash - run: | - conda install -y pytest - name: Python tests (CPU only) if: inputs.gpu == 'OFF' shell: bash run: | - conda install -y -q pytorch -c pytorch pytest --junitxml=test-results/pytest/results.xml tests/test_*.py pytest --junitxml=test-results/pytest/results-torch.xml tests/torch_*.py - name: Python tests (CPU + GPU) if: inputs.gpu == 'ON' shell: bash run: | - conda install -y -q pytorch pytorch-cuda=11.8 -c pytorch -c nvidia/label/cuda-11.8.0 pytest --junitxml=test-results/pytest/results.xml tests/test_*.py pytest --junitxml=test-results/pytest/results-torch.xml tests/torch_*.py cp tests/common_faiss_tests.py faiss/gpu/test From e65a910eb47e93e796a4fb8f1dd70a6a0a11136c Mon Sep 17 00:00:00 2001 From: Gergely Szilvasy Date: Fri, 14 Jun 2024 13:00:05 -0700 Subject: [PATCH 113/116] fix Windows build - signed int OMP for MSVC (#3517) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3517 MSVC doesn't support unsigned int for OMP Reviewed By: kuarora, junjieqi, ramilbakhshyiev Differential Revision: D58591594 fbshipit-source-id: ac7d6b37a82f9543be3e0fe418f0f6b439751475 --- 
faiss/IndexHNSW.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faiss/IndexHNSW.cpp b/faiss/IndexHNSW.cpp index fd80b87df7..8e5c654f04 100644 --- a/faiss/IndexHNSW.cpp +++ b/faiss/IndexHNSW.cpp @@ -468,7 +468,7 @@ void IndexHNSW::search_level_0( if (is_similarity_metric(this->metric_type)) { // we need to revert the negated distances #pragma omp parallel for - for (size_t i = 0; i < k * n; i++) { + for (int64_t i = 0; i < k * n; i++) { distances[i] = -distances[i]; } } From 849557a38a4acd87f18e62fa9e538b151964d582 Mon Sep 17 00:00:00 2001 From: Ramil Bakhshyiev Date: Mon, 17 Jun 2024 01:40:32 -0700 Subject: [PATCH 114/116] Unbreak RAFT conda builds (#3519) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3519 Fixing the conda conflicts because of `_openmp_mutex` build versions. This change pins that version for RAFT conda package builds. Reviewed By: algoriddle Differential Revision: D58646659 fbshipit-source-id: 4c1eaa9f08bd354da016b9399a36698007a497d8 --- conda/faiss-gpu-raft/meta.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/conda/faiss-gpu-raft/meta.yaml b/conda/faiss-gpu-raft/meta.yaml index 23e4835032..9a5fd542f1 100644 --- a/conda/faiss-gpu-raft/meta.yaml +++ b/conda/faiss-gpu-raft/meta.yaml @@ -50,14 +50,18 @@ outputs: - llvm-openmp # [osx] - cmake >=3.24.0 - make # [not win] + - _openmp_mutex =4.5=2_kmp_llvm # [x86_64] + - mkl =2023 # [x86_64] - mkl-devel =2023 # [x86_64] - cuda-toolkit {{ cudatoolkit }} host: + - _openmp_mutex =4.5=2_kmp_llvm # [x86_64] - mkl =2023 # [x86_64] - openblas # [not x86_64] - libraft =24.04 - cuda-version {{ cuda_constraints }} run: + - _openmp_mutex =4.5=2_kmp_llvm # [x86_64] - mkl =2023 # [x86_64] - openblas # [not x86_64] - cuda-cudart {{ cuda_constraints }} @@ -87,12 +91,16 @@ outputs: - swig - cmake >=3.24.0 - make # [not win] + - _openmp_mutex =4.5=2_kmp_llvm # [x86_64] + - mkl =2023 # [x86_64] - cuda-toolkit {{ cudatoolkit }} host: + - _openmp_mutex =4.5=2_kmp_llvm # [x86_64] - python {{ python }} - numpy >=1.19,<2 - {{ pin_subpackage('libfaiss', exact=True) }} run: + - _openmp_mutex =4.5=2_kmp_llvm # [x86_64] - python {{ python }} - numpy >=1.19,<2 - packaging From e188eb381026a25b8817e3846c9ce53710f8947a Mon Sep 17 00:00:00 2001 From: Ramil Bakhshyiev Date: Mon, 17 Jun 2024 17:59:13 -0700 Subject: [PATCH 115/116] Bump libraft to 24.06 to unblock nightly RAFT builds (#3522) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3522 Quick fix to unblock nightly Reviewed By: mlomeli1 Differential Revision: D58694193 fbshipit-source-id: ea323991cc2e2b958fc11ab614dcd6e09d4c072c --- conda/faiss-gpu-raft/meta.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/conda/faiss-gpu-raft/meta.yaml b/conda/faiss-gpu-raft/meta.yaml index 9a5fd542f1..1dde8e9868 100644 --- a/conda/faiss-gpu-raft/meta.yaml +++ b/conda/faiss-gpu-raft/meta.yaml @@ -58,7 +58,7 @@ outputs: - _openmp_mutex =4.5=2_kmp_llvm # [x86_64] - mkl =2023 # [x86_64] - openblas # [not x86_64] - - libraft =24.04 + - libraft =24.06 - cuda-version {{ cuda_constraints }} run: - _openmp_mutex =4.5=2_kmp_llvm # [x86_64] @@ -66,7 +66,7 @@ outputs: - openblas # [not x86_64] - cuda-cudart {{ cuda_constraints }} - libcublas {{ libcublas_constraints }} - - libraft =24.04 + - libraft =24.06 - cuda-version {{ cuda_constraints }} test: requires: From e758973fa08164728eb9e136631fe6c57d7edf6c Mon Sep 17 00:00:00 2001 From: Matthijs Douze Date: Tue, 18 Jun 2024 03:13:51 -0700 Subject: [PATCH 
116/116] Add ABS_INNER_PRODUCT metric (#3524) Summary: Pull Request resolved: https://github.com/facebookresearch/faiss/pull/3524 Searches with the metric abs(dot(query, database)) This makes it possible to search vectors that are closest to a hyperplane * adds support for alternative metrics in faiss.knn in python * checks that it works with HNSW * simplifies the extra distances interface by removing the template on Reviewed By: asadoughi Differential Revision: D58695971 fbshipit-source-id: 2a0ff49c7f7ac2c005d85f141cc5de148081c9c4 --- faiss/IndexFlat.cpp | 18 ++++++---- faiss/MetricType.h | 8 +++-- faiss/python/extra_wrappers.py | 12 +++++-- faiss/utils/extra_distances-inl.h | 12 +++++++ faiss/utils/extra_distances.cpp | 55 +++++++++++-------------------- faiss/utils/extra_distances.h | 5 +-- tests/test_extra_distances.py | 7 ++++ tests/test_graph_based.py | 15 +++++++++ 8 files changed, 82 insertions(+), 50 deletions(-) diff --git a/faiss/IndexFlat.cpp b/faiss/IndexFlat.cpp index f606f8e621..7d29ca5387 100644 --- a/faiss/IndexFlat.cpp +++ b/faiss/IndexFlat.cpp @@ -41,15 +41,19 @@ void IndexFlat::search( } else if (metric_type == METRIC_L2) { float_maxheap_array_t res = {size_t(n), size_t(k), labels, distances}; knn_L2sqr(x, get_xb(), d, n, ntotal, &res, nullptr, sel); - } else if (is_similarity_metric(metric_type)) { - float_minheap_array_t res = {size_t(n), size_t(k), labels, distances}; - knn_extra_metrics( - x, get_xb(), d, n, ntotal, metric_type, metric_arg, &res); } else { - FAISS_THROW_IF_NOT(!sel); - float_maxheap_array_t res = {size_t(n), size_t(k), labels, distances}; + FAISS_THROW_IF_NOT(!sel); // TODO implement with selector knn_extra_metrics( - x, get_xb(), d, n, ntotal, metric_type, metric_arg, &res); + x, + get_xb(), + d, + n, + ntotal, + metric_type, + metric_arg, + k, + distances, + labels); } } diff --git a/faiss/MetricType.h b/faiss/MetricType.h index 4689d4d018..8e889b1a03 100644 --- a/faiss/MetricType.h +++ b/faiss/MetricType.h @@ -31,9 +31,13 @@ enum MetricType { METRIC_Canberra = 20, METRIC_BrayCurtis, METRIC_JensenShannon, - METRIC_Jaccard, ///< defined as: sum_i(min(a_i, b_i)) / sum_i(max(a_i, b_i)) - ///< where a_i, b_i > 0 + + /// sum_i(min(a_i, b_i)) / sum_i(max(a_i, b_i)) where a_i, b_i > 0 + METRIC_Jaccard, + /// Squared Eucliden distance, ignoring NaNs METRIC_NaNEuclidean, + /// abs(x | y): the distance to a hyperplane + METRIC_ABS_INNER_PRODUCT, }; /// all vector indices are this type diff --git a/faiss/python/extra_wrappers.py b/faiss/python/extra_wrappers.py index d7fd05bc9f..a037b0280f 100644 --- a/faiss/python/extra_wrappers.py +++ b/faiss/python/extra_wrappers.py @@ -330,7 +330,7 @@ def lookup(self, keys): # KNN function ###################################################### -def knn(xq, xb, k, metric=METRIC_L2): +def knn(xq, xb, k, metric=METRIC_L2, metric_arg=0.0): """ Compute the k nearest neighbors of a vector without constructing an index @@ -374,10 +374,16 @@ def knn(xq, xb, k, metric=METRIC_L2): swig_ptr(xq), swig_ptr(xb), d, nq, nb, k, swig_ptr(D), swig_ptr(I) ) - else: - raise NotImplementedError("only L2 and INNER_PRODUCT are supported") + else: + knn_extra_metrics( + swig_ptr(xq), swig_ptr(xb), + d, nq, nb, metric, metric_arg, k, + swig_ptr(D), swig_ptr(I) + ) + return D, I + def knn_hamming(xq, xb, k, variant="hc"): """ Compute the k nearest neighbors of a set of vectors without constructing an index. 
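A minimal usage sketch of the extended Python wrapper above (illustrative only, not part of the patch): `faiss.knn` keeps its brute-force L2 / inner-product paths and now routes every other metric through `knn_extra_metrics`, forwarding `metric_arg` for metrics such as Lp. The array shapes and metric choices below are arbitrary examples.

import numpy as np
import faiss

xb = np.random.rand(10000, 32).astype('float32')   # database vectors
xq = np.random.rand(100, 32).astype('float32')     # query vectors

# unchanged default: brute-force L2
D, I = faiss.knn(xq, xb, 5)

# alternative metrics are now routed through knn_extra_metrics
D1, I1 = faiss.knn(xq, xb, 5, metric=faiss.METRIC_L1)
Dp, Ip = faiss.knn(xq, xb, 5, metric=faiss.METRIC_Lp, metric_arg=3.0)

# the metric added by this patch
Da, Ia = faiss.knn(xq, xb, 5, metric=faiss.METRIC_ABS_INNER_PRODUCT)
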
diff --git a/faiss/utils/extra_distances-inl.h b/faiss/utils/extra_distances-inl.h index 5b21482d18..3171580f8c 100644 --- a/faiss/utils/extra_distances-inl.h +++ b/faiss/utils/extra_distances-inl.h @@ -150,4 +150,16 @@ inline float VectorDistance::operator()( } return float(d) / float(present) * accu; } + +template <> +inline float VectorDistance::operator()( + const float* x, + const float* y) const { + float accu = 0; + for (size_t i = 0; i < d; i++) { + accu += fabs(x[i] * y[i]); + } + return accu; +} + } // namespace faiss diff --git a/faiss/utils/extra_distances.cpp b/faiss/utils/extra_distances.cpp index fb225e7c9e..407057e58e 100644 --- a/faiss/utils/extra_distances.cpp +++ b/faiss/utils/extra_distances.cpp @@ -50,16 +50,18 @@ void pairwise_extra_distances_template( } } -template +template void knn_extra_metrics_template( VD vd, const float* x, const float* y, size_t nx, size_t ny, - HeapArray* res) { - size_t k = res->k; + size_t k, + float* distances, + int64_t* labels) { size_t d = vd.d; + using C = typename VD::C; size_t check_period = InterruptCallback::get_period_hint(ny * d); check_period *= omp_get_max_threads(); @@ -71,18 +73,15 @@ void knn_extra_metrics_template( const float* x_i = x + i * d; const float* y_j = y; size_t j; - float* simi = res->get_val(i); - int64_t* idxi = res->get_ids(i); + float* simi = distances + k * i; + int64_t* idxi = labels + k * i; // maxheap_heapify(k, simi, idxi); heap_heapify(k, simi, idxi); for (j = 0; j < ny; j++) { float disij = vd(x_i, y_j); - // if (disij < simi[0]) { - if ((!vd.is_similarity && (disij < simi[0])) || - (vd.is_similarity && (disij > simi[0]))) { - // maxheap_replace_top(k, simi, idxi, disij, j); + if (C::cmp(simi[0], disij)) { heap_replace_top(k, simi, idxi, disij, j); } y_j += d; @@ -165,13 +164,13 @@ void pairwise_extra_distances( HANDLE_VAR(Lp); HANDLE_VAR(Jaccard); HANDLE_VAR(NaNEuclidean); + HANDLE_VAR(ABS_INNER_PRODUCT); #undef HANDLE_VAR default: FAISS_THROW_MSG("metric type not implemented"); } } -template void knn_extra_metrics( const float* x, const float* y, @@ -180,13 +179,15 @@ void knn_extra_metrics( size_t ny, MetricType mt, float metric_arg, - HeapArray* res) { + size_t k, + float* distances, + int64_t* indexes) { switch (mt) { -#define HANDLE_VAR(kw) \ - case METRIC_##kw: { \ - VectorDistance vd = {(size_t)d, metric_arg}; \ - knn_extra_metrics_template(vd, x, y, nx, ny, res); \ - break; \ +#define HANDLE_VAR(kw) \ + case METRIC_##kw: { \ + VectorDistance vd = {(size_t)d, metric_arg}; \ + knn_extra_metrics_template(vd, x, y, nx, ny, k, distances, indexes); \ + break; \ } HANDLE_VAR(L2); HANDLE_VAR(L1); @@ -197,32 +198,13 @@ void knn_extra_metrics( HANDLE_VAR(Lp); HANDLE_VAR(Jaccard); HANDLE_VAR(NaNEuclidean); + HANDLE_VAR(ABS_INNER_PRODUCT); #undef HANDLE_VAR default: FAISS_THROW_MSG("metric type not implemented"); } } -template void knn_extra_metrics>( - const float* x, - const float* y, - size_t d, - size_t nx, - size_t ny, - MetricType mt, - float metric_arg, - HeapArray>* res); - -template void knn_extra_metrics>( - const float* x, - const float* y, - size_t d, - size_t nx, - size_t ny, - MetricType mt, - float metric_arg, - HeapArray>* res); - FlatCodesDistanceComputer* get_extra_distance_computer( size_t d, MetricType mt, @@ -245,6 +227,7 @@ FlatCodesDistanceComputer* get_extra_distance_computer( HANDLE_VAR(Lp); HANDLE_VAR(Jaccard); HANDLE_VAR(NaNEuclidean); + HANDLE_VAR(ABS_INNER_PRODUCT); #undef HANDLE_VAR default: FAISS_THROW_MSG("metric type not implemented"); diff --git 
a/faiss/utils/extra_distances.h b/faiss/utils/extra_distances.h index 79b65bc1e9..f8b47cfba5 100644 --- a/faiss/utils/extra_distances.h +++ b/faiss/utils/extra_distances.h @@ -33,7 +33,6 @@ void pairwise_extra_distances( int64_t ldb = -1, int64_t ldd = -1); -template void knn_extra_metrics( const float* x, const float* y, @@ -42,7 +41,9 @@ void knn_extra_metrics( size_t ny, MetricType mt, float metric_arg, - HeapArray* res); + size_t k, + float* distances, + int64_t* indexes); /** get a DistanceComputer that refers to this type of distance and * indexes a flat array of size nb */ diff --git a/tests/test_extra_distances.py b/tests/test_extra_distances.py index 66318f76c5..fcaf4d383d 100644 --- a/tests/test_extra_distances.py +++ b/tests/test_extra_distances.py @@ -114,6 +114,13 @@ def test_nan_euclidean(self): new_dis = faiss.pairwise_distances(x, q, faiss.METRIC_NaNEuclidean) self.assertTrue(np.isnan(new_dis[0])) + def test_abs_inner_product(self): + xq, yb = self.make_example() + dis = faiss.pairwise_distances(xq, yb, faiss.METRIC_ABS_INNER_PRODUCT) + + gt_dis = np.abs(xq @ yb.T) + np.testing.assert_allclose(dis, gt_dis, atol=1e-5) + class TestKNN(unittest.TestCase): """ test that the knn search gives the same as distance matrix + argmin """ diff --git a/tests/test_graph_based.py b/tests/test_graph_based.py index c769e03ade..d5797186da 100644 --- a/tests/test_graph_based.py +++ b/tests/test_graph_based.py @@ -169,6 +169,21 @@ def test_io_no_storage(self): ) self.assertEquals(index3.storage, None) + def test_abs_inner_product(self): + """Test HNSW with abs inner product (not a real distance, so dubious that triangular inequality works)""" + d = self.xq.shape[1] + xb = self.xb - self.xb.mean(axis=0) # need to be centered to give interesting directions + xq = self.xq - self.xq.mean(axis=0) + Dref, Iref = faiss.knn(xq, xb, 10, faiss.METRIC_ABS_INNER_PRODUCT) + + index = faiss.IndexHNSWFlat(d, 32, faiss.METRIC_ABS_INNER_PRODUCT) + index.add(xb) + Dnew, Inew = index.search(xq, 10) + + inter = faiss.eval_intersection(Iref, Inew) + # 4769 vs. 500*10 + self.assertGreater(inter, Iref.size * 0.9) + class TestNSG(unittest.TestCase):
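Usage sketch (not part of the commits above): the snippet below exercises the new METRIC_ABS_INNER_PRODUCT from Python, mirroring the tests added by this patch in tests/test_extra_distances.py and tests/test_graph_based.py. Only the API calls (faiss.pairwise_distances, faiss.knn, faiss.IndexHNSWFlat, faiss.eval_intersection) come from the diffs; the dataset shapes, random seed, and printout are illustrative assumptions. For intuition, |<x, q>| / ||q|| is the distance from x to the hyperplane through the origin with normal q, which is the "distance to a hyperplane" reading given in the MetricType.h comment.

    # Sketch only: assumes a Faiss build that includes this patch.
    import faiss
    import numpy as np

    rng = np.random.default_rng(1234)
    d = 32
    xb = rng.standard_normal((2000, d)).astype("float32")
    xq = rng.standard_normal((50, d)).astype("float32")

    # Pairwise |<x, y>|, the same ground truth used in test_extra_distances.py.
    dis = faiss.pairwise_distances(xq, xb, faiss.METRIC_ABS_INNER_PRODUCT)
    np.testing.assert_allclose(dis, np.abs(xq @ xb.T), atol=1e-5)

    # Brute-force k-NN without building an index (new code path in faiss.knn).
    Dref, Iref = faiss.knn(xq, xb, 10, faiss.METRIC_ABS_INNER_PRODUCT)

    # The same metric through an HNSW graph, as in test_graph_based.py;
    # graph search is approximate, so only the overlap of results is reported.
    index = faiss.IndexHNSWFlat(d, 32, faiss.METRIC_ABS_INNER_PRODUCT)
    index.add(xb)
    Dnew, Inew = index.search(xq, 10)
    print("intersection:", faiss.eval_intersection(Iref, Inew), "of", Iref.size)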
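A related consequence of the generalized faiss.knn path: because metric and metric_arg are now forwarded to knn_extra_metrics, parameterized metrics can be used through the same call. The line below is a sketch under the assumption that metric_arg carries the exponent p for METRIC_Lp; it reuses xq and xb from the previous snippet.

    # Sketch: Minkowski L3 distance via the generalized faiss.knn code path
    # (assumes metric_arg is interpreted as the exponent p for METRIC_Lp).
    D3, I3 = faiss.knn(xq, xb, 10, metric=faiss.METRIC_Lp, metric_arg=3.0)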