diff --git a/.circleci/config.yml b/.circleci/config.yml index 94aad3b11e..0330939153 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -5,185 +5,8 @@ executors: docker: - image: continuumio/miniconda3 resource_class: large - linux-x86_64-gpu: - environment: - CONDA_ARCH: Linux-x86_64 - machine: - image: linux-cuda-12:default - resource_class: gpu.nvidia.medium - linux-arm64-cpu: - environment: - CONDA_ARCH: Linux-aarch64 - machine: - image: ubuntu-2204:current - resource_class: arm.medium - macosx-arm64-cpu: - environment: - CONDA_ARCH: MacOSX-arm64 - macos: - xcode: 14.2.0 # minimum supported for M1 - resource_class: macos.m1.large.gen1 - windows-x86_64-cpu: - machine: - image: windows-server-2019-vs2019:2023.04.1 - shell: bash.exe - resource_class: windows.medium jobs: - format: - docker: - - image: ubuntu:22.04 - steps: - - checkout - - run: - name: Install clang-format - command: | - apt-get update - apt-get install -y git-core clang-format-11 - - run: - name: Verify clang-format - command: | - git ls-files | grep -E '\.(cpp|h|cu|cuh)$' | xargs clang-format-11 -i - if git diff --quiet; then - echo "Formatting OK!" - else - echo "Formatting not OK!" - echo "------------------" - git --no-pager diff --color - exit 1 - fi - - build_conda: - parameters: - label: - type: string - default: "" - cuda: - type: string - default: "" - raft: - type: string - default: "" - cuda_archs: - type: string - default: "" - compiler_version: - type: string - default: "" - exec: - type: executor - executor: << parameters.exec >> - environment: - OMP_NUM_THREADS: 10 - PACKAGE_TYPE: <<parameters.label>> - CUDA_ARCHS: <<parameters.cuda_archs>> - steps: - - checkout - - run: - name: Install conda - command: | - if [ -n "${CONDA_ARCH}" ] - then - curl https://repo.anaconda.com/miniconda/Miniconda3-latest-${CONDA_ARCH}.sh --output miniconda.sh - bash miniconda.sh -b -p $HOME/miniconda - ~/miniconda/bin/conda init - fi - - run: - name: Install conda build tools - command: | - # conda config --set solver libmamba - # conda config --set verbosity 3 - conda update -y -q conda - conda install -y -q conda-build - - when: - condition: << parameters.label >> - steps: - - run: - name: Enable anaconda uploads - command: | - conda install -y -q anaconda-client - conda config --set anaconda_upload yes - - when: - condition: - and: - - not: << parameters.label >> - - not: << parameters.cuda >> - steps: - - run: - name: Conda build (CPU) - no_output_timeout: 30m - command: | - cd conda - conda build faiss --python 3.11 -c pytorch - - when: - condition: - and: - - << parameters.label >> - - not: << parameters.cuda >> - steps: - - run: - name: Conda build (CPU) w/ anaconda upload - no_output_timeout: 30m - command: | - cd conda - conda build faiss --user pytorch --label <<parameters.label>> -c pytorch - - when: - condition: - and: - - not: << parameters.label >> - - << parameters.cuda >> - - not: << parameters.raft >> - steps: - - run: - name: Conda build (GPU) - no_output_timeout: 60m - command: | - cd conda - conda build faiss-gpu --variants '{ "cudatoolkit": "<<parameters.cuda>>", "c_compiler_version": "<<parameters.compiler_version>>", "cxx_compiler_version": "<<parameters.compiler_version>>" }' \ - -c pytorch -c nvidia/label/cuda-<<parameters.cuda>> -c nvidia - - when: - condition: - and: - - << parameters.label >> - - << parameters.cuda >> - - not: << parameters.raft >> - steps: - - run: - name: Conda build (GPU) w/ anaconda upload - no_output_timeout: 60m - command: | - cd conda - conda build faiss-gpu --variants '{ "cudatoolkit": "<<parameters.cuda>>", "c_compiler_version": "<<parameters.compiler_version>>", "cxx_compiler_version": "<<parameters.compiler_version>>" }' \ - --user pytorch --label <<parameters.label>> -c pytorch -c nvidia/label/cuda-<<parameters.cuda>> -c nvidia - - when: - condition: - and: - - not: << parameters.label >> - - << parameters.cuda >> - - << parameters.raft >> - steps: - - run: - name: Conda build (GPU w/ RAFT) - no_output_timeout: 60m - command: | - cd conda - conda build faiss-gpu-raft --variants '{ "cudatoolkit": "<<parameters.cuda>>", "c_compiler_version": "<<parameters.compiler_version>>", "cxx_compiler_version": "<<parameters.compiler_version>>" }' \ - -c pytorch -c nvidia/label/cuda-<<parameters.cuda>> -c nvidia -c rapidsai -c conda-forge - - when: - condition: - and: - - << parameters.label >> - - << parameters.cuda >> - - << parameters.raft >> - steps: - - run: - name: Conda build (GPU w/ RAFT) w/ anaconda upload - no_output_timeout: 60m - command: | - cd conda - conda build faiss-gpu-raft --variants '{ "cudatoolkit": "<<parameters.cuda>>", "c_compiler_version": "<<parameters.compiler_version>>", "cxx_compiler_version": "<<parameters.compiler_version>>" }' \ - --user pytorch --label <<parameters.label>> -c pytorch -c nvidia/label/cuda-<<parameters.cuda>> -c nvidia -c rapidsai -c conda-forge - build_cmake: parameters: exec: @@ -191,12 +14,6 @@ jobs: opt_level: type: string default: generic - gpu: - type: string - default: "OFF" - raft: - type: string - default: "OFF" executor: << parameters.exec >> environment: OMP_NUM_THREADS: 10 @@ -217,32 +34,10 @@ command: | conda config --set solver libmamba conda update -y -q conda - - when: - condition: - equal: [ "OFF", << parameters.raft >> ] - steps: - - run: - name: Install env using main channel - command: | - conda install -y -q python=3.11 cmake make swig=4.0.2 mkl=2023 mkl-devel=2023 numpy scipy pytest gxx_linux-64 sysroot_linux-64 - - when: - condition: - equal: [ "ON", << parameters.raft >> ] - steps: - - run: - name: Install env using conda-forge channel - command: | - conda install -y -q python=3.11 cmake make swig=4.0.2 mkl=2023 mkl-devel=2023 numpy scipy pytest gxx_linux-64 sysroot_linux-64=2.28 libraft cuda-version=11.8 cuda-toolkit -c rapidsai-nightly -c "nvidia/label/cuda-11.8.0" -c conda-forge - - when: - condition: - and: - - equal: [ "ON", << parameters.gpu >> ] - - equal: [ "OFF", << parameters.raft >> ] - steps: - - run: - name: Install CUDA - command: | - conda install -y -q cuda-toolkit -c "nvidia/label/cuda-11.8.0" + - run: + name: Install env using main channel + command: | + conda install -y -q python=3.11 cmake make swig mkl=2023 mkl-devel=2023 numpy scipy pytest gxx_linux-64=11.2 sysroot_linux-64 - run: name: Build all targets no_output_timeout: 30m command: | @@ -252,8 +47,8 @@ cmake -B build \ -DBUILD_TESTING=ON \ -DBUILD_SHARED_LIBS=ON \ - -DFAISS_ENABLE_GPU=<< parameters.gpu >> \ - -DFAISS_ENABLE_RAFT=<< parameters.raft >> \ + -DFAISS_ENABLE_GPU=OFF \ + -DFAISS_ENABLE_RAFT=OFF \ -DFAISS_OPT_LEVEL=<< parameters.opt_level >> \ -DFAISS_ENABLE_C_API=ON \ -DPYTHON_EXECUTABLE=$(which python) \ @@ -272,38 +67,12 @@ command: | cd build/faiss/python python setup.py install - - when: - condition: - equal: [ "OFF", << parameters.gpu >> ] - steps: - - run: - name: Python tests (CPU only) - command: | - conda install -y -q pytorch -c pytorch - pytest --junitxml=test-results/pytest/results.xml tests/test_*.py - pytest --junitxml=test-results/pytest/results-torch.xml tests/torch_*.py - - when: - condition: - equal: [ "ON", << parameters.gpu >> ] - steps: - - run: - name: Python tests (CPU + GPU) - command: | - conda install -y -q pytorch pytorch-cuda=11.8 -c pytorch -c nvidia/label/cuda-11.8.0 - pytest --junitxml=test-results/pytest/results.xml tests/test_*.py - pytest --junitxml=test-results/pytest/results-torch.xml tests/torch_*.py - cp tests/common_faiss_tests.py faiss/gpu/test - pytest
--junitxml=test-results/pytest/results-gpu.xml faiss/gpu/test/test_*.py - pytest --junitxml=test-results/pytest/results-gpu-torch.xml faiss/gpu/test/torch_*.py - - when: - condition: - equal: [ "avx2", << parameters.opt_level >> ] - steps: - - run: - name: Test avx2 loading - command: | - FAISS_DISABLE_CPU_FEATURES=AVX2 LD_DEBUG=libs python -c "import faiss" 2>&1 | grep faiss.so - LD_DEBUG=libs python -c "import faiss" 2>&1 | grep faiss_avx2.so + - run: + name: Python tests (CPU only) + command: | + conda install -y -q pytorch -c pytorch + pytest --junitxml=test-results/pytest/results.xml tests/test_*.py + pytest --junitxml=test-results/pytest/results-torch.xml tests/torch_*.py - store_test_results: path: test-results @@ -311,180 +80,7 @@ workflows: version: 2 build: jobs: - - format: - name: Format - - build_cmake: - name: Linux x86_64 (cmake) - exec: linux-x86_64-cpu - - build_cmake: - name: Linux x86_64 AVX2 (cmake) - exec: linux-x86_64-cpu - opt_level: "avx2" - build_cmake: name: Linux x86_64 AVX512 (cmake) exec: linux-x86_64-cpu opt_level: "avx512" - - build_cmake: - name: Linux x86_64 GPU (cmake) - exec: linux-x86_64-gpu - gpu: "ON" - requires: - - Linux x86_64 AVX2 (cmake) - - build_cmake: - name: Linux x86_64 GPU w/ RAFT (cmake) - exec: linux-x86_64-gpu - gpu: "ON" - raft: "ON" - requires: - - Linux x86_64 GPU (cmake) - - build_conda: - name: Linux x86_64 (conda) - exec: linux-x86_64-cpu - - build_conda: - name: Windows x86_64 (conda) - exec: windows-x86_64-cpu - - build_conda: - name: Linux arm64 (conda) - exec: linux-arm64-cpu - - build_conda: - name: Linux x86_64 packages - exec: linux-x86_64-cpu - label: main - filters: - tags: - only: /^v.*/ - branches: - ignore: /.*/ - - build_conda: - name: Linux x86_64 GPU packages (CUDA 11.4.4) - exec: linux-x86_64-gpu - label: main - cuda: "11.4.4" - cuda_archs: "60-real;61-real;62-real;70-real;72-real;75-real;80;86-real" - compiler_version: "11.2" - filters: - tags: - only: /^v.*/ - branches: - ignore: /.*/ - - build_conda: - name: Linux x86_64 GPU w/ RAFT packages (CUDA 11.8.0) - exec: linux-x86_64-gpu - label: main - raft: "ON" - cuda: "11.8.0" - cuda_archs: "70-real;72-real;75-real;80;86-real" - compiler_version: "11.2" - filters: - tags: - only: /^v.*/ - branches: - ignore: /.*/ - - build_conda: - name: Linux x86_64 GPU packages (CUDA 12.1.1) - exec: linux-x86_64-gpu - label: main - cuda: "12.1.1" - cuda_archs: "70-real;72-real;75-real;80;86-real" - compiler_version: "11.2" - filters: - tags: - only: /^v.*/ - branches: - ignore: /.*/ - - build_conda: - name: Linux x86_64 GPU w/ RAFT packages (CUDA 12.1.1) - exec: linux-x86_64-gpu - label: main - raft: "ON" - cuda: "12.1.1" - cuda_archs: "70-real;72-real;75-real;80;86-real" - compiler_version: "11.2" - filters: - tags: - only: /^v.*/ - branches: - ignore: /.*/ - - build_conda: - name: Windows x86_64 packages - exec: windows-x86_64-cpu - label: main - filters: - tags: - only: /^v.*/ - branches: - ignore: /.*/ - - build_conda: - name: OSX arm64 packages - exec: macosx-arm64-cpu - label: main - filters: - tags: - only: /^v.*/ - branches: - ignore: /.*/ - - build_conda: - name: Linux arm64 packages - exec: linux-arm64-cpu - label: main - filters: - tags: - only: /^v.*/ - branches: - ignore: /.*/ - - nightly: - triggers: - - schedule: - cron: "0 0 * * *" - filters: - branches: - only: - - main - jobs: - - build_conda: - name: Linux x86_64 nightlies - exec: linux-x86_64-cpu - label: nightly - - build_conda: - name: Linux x86_64 GPU nightlies (CUDA 11.4.4) - exec: linux-x86_64-gpu - label: 
nightly - cuda: "11.4.4" - cuda_archs: "60-real;61-real;62-real;70-real;72-real;75-real;80;86-real" - compiler_version: "11.2" - - build_conda: - name: Linux x86_64 GPU w/ RAFT nightlies (CUDA 11.8.0) - exec: linux-x86_64-gpu - label: nightly - raft: "ON" - cuda: "11.8.0" - cuda_archs: "70-real;72-real;75-real;80;86-real" - compiler_version: "11.2" - - build_conda: - name: Linux x86_64 GPU nightlies (CUDA 12.1.1) - exec: linux-x86_64-gpu - label: nightly - cuda: "12.1.1" - cuda_archs: "70-real;72-real;75-real;80;86-real" - compiler_version: "11.2" - - build_conda: - name: Linux x86_64 GPU w/ RAFT nightlies (CUDA 12.1.1) - exec: linux-x86_64-gpu - label: nightly - raft: "ON" - cuda: "12.1.1" - cuda_archs: "70-real;72-real;75-real;80;86-real" - compiler_version: "11.2" - - build_conda: - name: Windows x86_64 nightlies - exec: windows-x86_64-cpu - label: nightly - - build_conda: - name: OSX arm64 nightlies - exec: macosx-arm64-cpu - label: nightly - - build_conda: - name: Linux arm64 nightlies - exec: linux-arm64-cpu - label: nightly diff --git a/.github/actions/build_cmake/action.yml b/.github/actions/build_cmake/action.yml new file mode 100644 index 0000000000..2bc476add5 --- /dev/null +++ b/.github/actions/build_cmake/action.yml @@ -0,0 +1,105 @@ +name: Build cmake +inputs: + opt_level: + description: 'Compile options / optimization level.' + required: false + default: generic + gpu: + description: 'Enable GPU support.' + required: false + default: OFF + raft: + description: 'Enable RAFT support.' + required: false + default: OFF +runs: + using: composite + steps: + - name: Setup miniconda + uses: conda-incubator/setup-miniconda@v3 + with: + python-version: '3.11' + miniconda-version: latest + - name: Configure build environment + shell: bash + run: | + # initialize Conda + conda config --set solver libmamba + conda update -y -q conda + echo "$CONDA/bin" >> $GITHUB_PATH + + # install base packages + conda install -y -q -c conda-forge gxx_linux-64=11.2 sysroot_linux-64=2.28 + conda install -y -q python=3.11 cmake make swig mkl=2023 mkl-devel=2023 numpy scipy pytest + + # install CUDA packages + if [ "${{ inputs.gpu }}" = "ON" ] && [ "${{ inputs.raft }}" = "OFF" ]; then + conda install -y -q cuda-toolkit -c "nvidia/label/cuda-11.8.0" + fi + + # install RAFT packages + if [ "${{ inputs.raft }}" = "ON" ]; then + conda install -y -q libraft cuda-version=11.8 cuda-toolkit -c rapidsai-nightly -c "nvidia/label/cuda-11.8.0" -c conda-forge + fi + + # install test packages + conda install -y pytest + if [ "${{ inputs.gpu }}" = "ON" ]; then + conda install -y -q pytorch pytorch-cuda=11.8 -c pytorch -c nvidia/label/cuda-11.8.0 + else + conda install -y -q pytorch -c pytorch + fi + - name: Build all targets + shell: bash + run: | + eval "$(conda shell.bash hook)" + conda activate + cmake -B build \ + -DBUILD_TESTING=ON \ + -DBUILD_SHARED_LIBS=ON \ + -DFAISS_ENABLE_GPU=${{ inputs.gpu }} \ + -DFAISS_ENABLE_RAFT=${{ inputs.raft }} \ + -DFAISS_OPT_LEVEL=${{ inputs.opt_level }} \ + -DFAISS_ENABLE_C_API=ON \ + -DPYTHON_EXECUTABLE=$CONDA/bin/python \ + -DCMAKE_BUILD_TYPE=Release \ + -DBLA_VENDOR=Intel10_64_dyn \ + -DCMAKE_CUDA_FLAGS="-gencode arch=compute_75,code=sm_75" \ + . 
+ make -k -C build -j$(nproc) + - name: C++ tests + shell: bash + run: | + export GTEST_OUTPUT="xml:$(realpath .)/test-results/googletest/" + make -C build test + - name: Install Python extension + shell: bash + working-directory: build/faiss/python + run: | + $CONDA/bin/python setup.py install + - name: Python tests (CPU only) + if: inputs.gpu == 'OFF' + shell: bash + run: | + pytest --junitxml=test-results/pytest/results.xml tests/test_*.py + pytest --junitxml=test-results/pytest/results-torch.xml tests/torch_*.py + - name: Python tests (CPU + GPU) + if: inputs.gpu == 'ON' + shell: bash + run: | + pytest --junitxml=test-results/pytest/results.xml tests/test_*.py + pytest --junitxml=test-results/pytest/results-torch.xml tests/torch_*.py + cp tests/common_faiss_tests.py faiss/gpu/test + pytest --junitxml=test-results/pytest/results-gpu.xml faiss/gpu/test/test_*.py + pytest --junitxml=test-results/pytest/results-gpu-torch.xml faiss/gpu/test/torch_*.py + - name: Test avx2 loading + if: inputs.opt_level == 'avx2' + shell: bash + run: | + FAISS_DISABLE_CPU_FEATURES=AVX2 LD_DEBUG=libs $CONDA/bin/python -c "import faiss" 2>&1 | grep faiss.so + LD_DEBUG=libs $CONDA/bin/python -c "import faiss" 2>&1 | grep faiss_avx2.so + - name: Upload test results + uses: actions/upload-artifact@v4 + with: + name: test-results-${{ inputs.opt_level }}-${{ inputs.gpu }}-${{ inputs.raft }} + path: test-results diff --git a/.github/actions/build_conda/action.yml b/.github/actions/build_conda/action.yml new file mode 100644 index 0000000000..982430c351 --- /dev/null +++ b/.github/actions/build_conda/action.yml @@ -0,0 +1,96 @@ +name: Conda build +description: Builds FAISS inside a Conda environment and uploads to the repository when a label is provided. +inputs: + label: + description: "The label to be used for uploads to Conda." + default: "" + required: false + cuda: + description: "CUDA toolkit version to use." + default: "" + required: false + raft: + description: "Enable RAFT support." + default: "" + required: false + compiler_version: + description: "Compiler version for C/C++/CUDA." + default: ""
+ required: false +runs: + using: composite + steps: + - name: Choose shell + shell: bash + id: choose_shell + run: | + # Use pwsh on Windows; bash everywhere else + if [ "${{ runner.os }}" != "Windows" ]; then + echo "shell=bash" >> "$GITHUB_OUTPUT" + else + echo "shell=pwsh" >> "$GITHUB_OUTPUT" + fi + - name: Setup miniconda + uses: conda-incubator/setup-miniconda@v3 + with: + python-version: '3.11' + miniconda-version: latest + - name: Install conda build tools + shell: ${{ steps.choose_shell.outputs.shell }} + run: | + conda update -y -q conda + conda install -y -q conda-build + - name: Enable anaconda uploads + if: inputs.label != '' + shell: ${{ steps.choose_shell.outputs.shell }} + env: + PACKAGE_TYPE: ${{ inputs.label }} + run: | + conda install -y -q anaconda-client + conda config --set anaconda_upload yes + - name: Conda build (CPU) + if: inputs.label == '' && inputs.cuda == '' + shell: ${{ steps.choose_shell.outputs.shell }} + working-directory: conda + run: | + conda build faiss --python 3.11 -c pytorch + - name: Conda build (CPU) w/ anaconda upload + if: inputs.label != '' && inputs.cuda == '' + shell: ${{ steps.choose_shell.outputs.shell }} + working-directory: conda + env: + PACKAGE_TYPE: ${{ inputs.label }} + run: | + conda build faiss --user pytorch --label ${{ inputs.label }} -c pytorch + - name: Conda build (GPU) + if: inputs.label == '' && inputs.cuda != '' && inputs.raft == '' + shell: ${{ steps.choose_shell.outputs.shell }} + working-directory: conda + run: | + conda build faiss-gpu --variants '{ "cudatoolkit": "${{ inputs.cuda }}", "c_compiler_version": "${{ inputs.compiler_version }}", "cxx_compiler_version": "${{ inputs.compiler_version }}" }' \ + -c pytorch -c nvidia/label/cuda-${{ inputs.cuda }} -c nvidia + - name: Conda build (GPU) w/ anaconda upload + if: inputs.label != '' && inputs.cuda != '' && inputs.raft == '' + shell: ${{ steps.choose_shell.outputs.shell }} + working-directory: conda + env: + PACKAGE_TYPE: ${{ inputs.label }} + run: | + conda build faiss-gpu --variants '{ "cudatoolkit": "${{ inputs.cuda }}", "c_compiler_version": "${{ inputs.compiler_version }}", "cxx_compiler_version": "${{ inputs.compiler_version }}" }' \ + --user pytorch --label ${{ inputs.label }} -c pytorch -c nvidia/label/cuda-${{ inputs.cuda }} -c nvidia + - name: Conda build (GPU w/ RAFT) + if: inputs.label == '' && inputs.cuda != '' && inputs.raft != '' + shell: ${{ steps.choose_shell.outputs.shell }} + working-directory: conda + run: | + conda build faiss-gpu-raft --variants '{ "cudatoolkit": "${{ inputs.cuda }}", "c_compiler_version": "${{ inputs.compiler_version }}", "cxx_compiler_version": "${{ inputs.compiler_version }}" }' \ + -c pytorch -c nvidia/label/cuda-${{ inputs.cuda }} -c nvidia -c rapidsai -c rapidsai-nightly -c conda-forge + - name: Conda build (GPU w/ RAFT) w/ anaconda upload + if: inputs.label != '' && inputs.cuda != '' && inputs.raft != '' + shell: ${{ steps.choose_shell.outputs.shell }} + working-directory: conda + env: + PACKAGE_TYPE: ${{ inputs.label }} + run: | + conda build faiss-gpu-raft --variants '{ "cudatoolkit": "${{ inputs.cuda }}", "c_compiler_version": "${{ inputs.compiler_version }}", "cxx_compiler_version": "${{ inputs.compiler_version }}" }' \ + --user pytorch --label ${{ inputs.label }} -c pytorch -c nvidia/label/cuda-${{ inputs.cuda }} -c nvidia -c rapidsai -c rapidsai-nightly -c conda-forge diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml new file mode 100644 index 0000000000..bd415dfce8 --- /dev/null +++ 
b/.github/workflows/build.yml @@ -0,0 +1,244 @@ +name: Build +on: + workflow_dispatch: + pull_request: + branches: + - main + push: + tags: + - 'v*' +env: + OMP_NUM_THREADS: '10' + MKL_THREADING_LAYER: GNU +jobs: + format: + name: Format + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Install clang-format + run: | + sudo apt-get update -y + sudo apt-get install -y wget + sudo apt install -y lsb-release wget software-properties-common gnupg + wget https://apt.llvm.org/llvm.sh + chmod u+x llvm.sh + sudo ./llvm.sh 18 + sudo apt-get install -y git-core clang-format-18 + - name: Verify clang-format + run: | + git ls-files | grep -E '\.(cpp|h|cu|cuh)$' | xargs clang-format-18 -i + if git diff --quiet; then + echo "Formatting OK!" + else + echo "Formatting not OK!" + echo "------------------" + git --no-pager diff --color + exit 1 + fi + linux-x86_64-cmake: + name: Linux x86_64 (cmake) + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + - uses: ./.github/actions/build_cmake + linux-x86_64-AVX2-cmake: + name: Linux x86_64 AVX2 (cmake) + needs: linux-x86_64-cmake + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + - uses: ./.github/actions/build_cmake + with: + opt_level: avx2 + linux-x86_64-AVX512-cmake: + name: Linux x86_64 AVX512 (cmake) + if: false # TODO: enable when GitHub Actions adds AVX-512 hosts + needs: linux-x86_64-cmake + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + - uses: ./.github/actions/build_cmake + with: + opt_level: avx512 + linux-x86_64-GPU-cmake: + name: Linux x86_64 GPU (cmake) + needs: linux-x86_64-cmake + runs-on: 4-core-ubuntu-gpu-t4 + steps: + - name: Checkout + uses: actions/checkout@v4 + - uses: ./.github/actions/build_cmake + with: + gpu: ON + linux-x86_64-GPU-w-RAFT-cmake: + name: Linux x86_64 GPU w/ RAFT (cmake) + needs: linux-x86_64-cmake + runs-on: 4-core-ubuntu-gpu-t4 + steps: + - name: Checkout + uses: actions/checkout@v4 + - uses: ./.github/actions/build_cmake + with: + gpu: ON + raft: ON + linux-x86_64-conda: + name: Linux x86_64 (conda) + needs: linux-x86_64-cmake + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + fetch-tags: true + - uses: ./.github/actions/build_conda + windows-x86_64-conda: + name: Windows x86_64 (conda) + needs: linux-x86_64-cmake + runs-on: windows-2019 + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + fetch-tags: true + - uses: ./.github/actions/build_conda + linux-arm64-conda: + name: Linux arm64 (conda) + needs: linux-x86_64-cmake + runs-on: 2-core-ubuntu-arm + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + fetch-tags: true + - uses: ./.github/actions/build_conda + linux-x86_64-packages: + name: Linux x86_64 packages + if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + fetch-tags: true + - uses: ./.github/actions/build_conda + with: + label: main + linux-x86_64-GPU-packages-CUDA-11-4-4: + name: Linux x86_64 GPU packages (CUDA 11.4.4) + if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') + runs-on: 4-core-ubuntu-gpu-t4 + env: + CUDA_ARCHS: "60-real;61-real;62-real;70-real;72-real;75-real;80;86-real" + FAISS_FLATTEN_CONDA_INCLUDES: "1" + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + fetch-tags: 
true + - uses: ./.github/actions/build_conda + with: + label: main + cuda: "11.4.4" + compiler_version: "11.2" + linux-x86_64-GPU-RAFT-packages-CUDA11-8-0: + name: Linux x86_64 GPU w/ RAFT packages (CUDA 11.8.0) + if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') + runs-on: 4-core-ubuntu-gpu-t4 + env: + CUDA_ARCHS: "70-real;72-real;75-real;80;86-real" + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + fetch-tags: true + - uses: ./.github/actions/build_conda + with: + label: main + raft: "ON" + cuda: "11.8.0" + compiler_version: "11.2" + linux-x86_64-GPU-packages-CUDA-12-1-1: + name: Linux x86_64 GPU packages (CUDA 12.1.1) + if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') + runs-on: 4-core-ubuntu-gpu-t4 + env: + CUDA_ARCHS: "70-real;72-real;75-real;80;86-real" + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + fetch-tags: true + - uses: ./.github/actions/build_conda + with: + label: main + cuda: "12.1.1" + compiler_version: "11.2" + linux-x86_64-GPU-RAFT-packages-CUDA12-1-1: + name: Linux x86_64 GPU w/ RAFT packages (CUDA 12.1.1) + if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') + runs-on: 4-core-ubuntu-gpu-t4 + env: + CUDA_ARCHS: "70-real;72-real;75-real;80;86-real" + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + fetch-tags: true + - uses: ./.github/actions/build_conda + with: + label: main + raft: "ON" + cuda: "12.1.1" + compiler_version: "11.2" + windows-x86_64-packages: + name: Windows x86_64 packages + if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') + runs-on: windows-2019 + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + fetch-tags: true + - uses: ./.github/actions/build_conda + with: + label: main + osx-arm64-packages: + name: OSX arm64 packages + if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') + runs-on: macos-14 + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + fetch-tags: true + - uses: ./.github/actions/build_conda + with: + label: main + linux-arm64-packages: + name: Linux arm64 packages + if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') + runs-on: 2-core-ubuntu-arm + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + fetch-tags: true + - uses: ./.github/actions/build_conda + with: + label: main diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml new file mode 100644 index 0000000000..eabee07744 --- /dev/null +++ b/.github/workflows/nightly.yml @@ -0,0 +1,139 @@ +name: Nightly +on: + schedule: + - cron: '10 1 * * *' +env: + OMP_NUM_THREADS: '10' + MKL_THREADING_LAYER: GNU +jobs: + linux-x86_64-nightly: + name: Linux x86_64 nightlies + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + fetch-tags: true + - uses: ./.github/actions/build_conda + env: + ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }} + with: + label: nightly + linux-x86_64-GPU-CUDA-11-4-4-nightly: + name: Linux x86_64 GPU nightlies (CUDA 11.4.4) + runs-on: 4-core-ubuntu-gpu-t4 + env: + CUDA_ARCHS: "60-real;61-real;62-real;70-real;72-real;75-real;80;86-real" + FAISS_FLATTEN_CONDA_INCLUDES: "1" + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + fetch-tags: true + - uses: ./.github/actions/build_conda + env: + ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }} + 
with: + label: nightly + cuda: "11.4.4" + compiler_version: "11.2" + linux-x86_64-GPU-RAFT-CUDA11-8-0-nightly: + name: Linux x86_64 GPU w/ RAFT nightlies (CUDA 11.8.0) + runs-on: 4-core-ubuntu-gpu-t4 + env: + CUDA_ARCHS: "70-real;72-real;75-real;80;86-real" + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + fetch-tags: true + - uses: ./.github/actions/build_conda + env: + ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }} + with: + label: nightly + raft: "ON" + cuda: "11.8.0" + compiler_version: "11.2" + linux-x86_64-GPU-CUDA-12-1-1-nightly: + name: Linux x86_64 GPU nightlies (CUDA 12.1.1) + runs-on: 4-core-ubuntu-gpu-t4 + env: + CUDA_ARCHS: "70-real;72-real;75-real;80;86-real" + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + fetch-tags: true + - uses: ./.github/actions/build_conda + env: + ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }} + with: + label: nightly + cuda: "12.1.1" + compiler_version: "11.2" + linux-x86_64-GPU-RAFT-CUDA12-1-1-nightly: + name: Linux x86_64 GPU w/ RAFT nightlies (CUDA 12.1.1) + runs-on: 4-core-ubuntu-gpu-t4 + env: + CUDA_ARCHS: "70-real;72-real;75-real;80;86-real" + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + fetch-tags: true + - uses: ./.github/actions/build_conda + env: + ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }} + with: + label: nightly + raft: "ON" + cuda: "12.1.1" + compiler_version: "11.2" + windows-x86_64-nightly: + name: Windows x86_64 nightlies + runs-on: windows-2019 + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + fetch-tags: true + - uses: ./.github/actions/build_conda + env: + ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }} + with: + label: nightly + osx-arm64-nightly: + name: OSX arm64 nightlies + runs-on: macos-14 + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + fetch-tags: true + - uses: ./.github/actions/build_conda + env: + ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }} + with: + label: nightly + linux-arm64-nightly: + name: Linux arm64 nightlies + runs-on: 2-core-ubuntu-arm + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + fetch-depth: 0 + fetch-tags: true + - uses: ./.github/actions/build_conda + env: + ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }} + with: + label: nightly diff --git a/.gitignore b/.gitignore index caab1304c8..d6df432fa5 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,7 @@ *.dylib *.pyc *~ +/build/ /config.* /aclocal.m4 /autom4te.cache/ diff --git a/CHANGELOG.md b/CHANGELOG.md index ae418b09b4..8d289ec2f6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,7 +9,36 @@ We try to indicate most contributions here with the contributor names who are no the Facebook Faiss team. Feel free to add entries here if you submit a PR. ## [Unreleased] -- Support for range search in HNSW and Fast scan IVF. +### Changed +- Previously, when moving indices to GPU with coarse quantizers that were not implemented on GPU, the cloner would silently fall back to CPU. This version now throws an exception instead, and calling code must explicitly allow the CPU fallback by setting a flag in the cloner config.
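With this change, moving an index whose coarse quantizer has no GPU implementation now requires an explicit opt-in. A minimal Python sketch of that opt-in, assuming the flag is exposed on `GpuClonerOptions` as `allowCpuCoarseQuantizer` (treat the flag name and the factory string as illustrative):

```python
import faiss

# IVF index with an HNSW coarse quantizer, which has no GPU implementation
cpu_index = faiss.index_factory(64, "IVF1024_HNSW32,Flat")

res = faiss.StandardGpuResources()
co = faiss.GpuClonerOptions()
co.allowCpuCoarseQuantizer = True  # assumed flag name: opt in to keeping the quantizer on CPU
gpu_index = faiss.index_cpu_to_gpu(res, 0, cpu_index, co)
```

Without the flag set, this clone is expected to raise instead of silently keeping the coarse quantizer on CPU.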
+ +## [1.8.0] - 2024-02-27 +### Added +- Added a new conda package faiss-gpu-raft alongside faiss-cpu and faiss-gpu +- Integrated IVF-Flat and IVF-PQ implementations in faiss-gpu-raft from RAFT by Nvidia [thanks Corey Nolet and Tarang Jain] +- Added a context parameter to InvertedLists and InvertedListsIterator +- Added a Faiss on RocksDB demo showing how inverted lists can be persisted in a key-value store +- Introduced Offline IVF framework powered by Faiss big batch search +- Added SIMD NEON Optimization for QT_FP16 in Scalar Quantizer. [thanks Naveen Tatikonda] +- Generalized ResultHandler and supported range search for HNSW and FastScan +- Introduced avx512 optimization mode and FAISS_OPT_LEVEL env variable [thanks Alexandr Ghuzva] +- Added search parameters for IndexRefine::search() and IndexRefineFlat::search() +- Supported large two-level clustering +- Added support for Python 3.11 and 3.12 +- Added support for CUDA 12 + +### Changed +- Used the benchmark to find Pareto-optimal indices. Intentionally limited to IVF(Flat|HNSW),PQ|SQ indices +- Split off RQ encoding steps into a separate file +- Improved NaN handling +- HNSW speedup + Distance 4 points [thanks Alexandr Ghuzva] + +### Fixed +- Fixed DeviceVector reallocations in Faiss GPU +- Used efSearch from params if provided in HNSW search +- Fixed warp synchronous behavior in Faiss GPU CUDA 12 + + ## [1.7.4] - 2023-04-12 ### Added - Added big batch IVF search for conducting efficient search with big batches of queries @@ -259,7 +288,8 @@ by conda install -c pytorch faiss-gpu cudatoolkit=10.0. - C bindings. - Extended tutorial to GPU indices. -[Unreleased]: https://github.com/facebookresearch/faiss/compare/v1.7.4...HEAD +[Unreleased]: https://github.com/facebookresearch/faiss/compare/v1.8.0...HEAD +[1.8.0]: https://github.com/facebookresearch/faiss/compare/v1.7.4...v1.8.0 [1.7.4]: https://github.com/facebookresearch/faiss/compare/v1.7.3...v1.7.4 [1.7.3]: https://github.com/facebookresearch/faiss/compare/v1.7.2...v1.7.3 [1.7.2]: https://github.com/facebookresearch/faiss/compare/v1.7.1...v1.7.2 diff --git a/CMakeLists.txt b/CMakeLists.txt index 7d02bad1a3..eea6aec814 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -18,7 +18,7 @@ # the License. # ============================================================================= -cmake_minimum_required(VERSION 3.23.1 FATAL_ERROR) +cmake_minimum_required(VERSION 3.24.0 FATAL_ERROR) set(FAISS_LANGUAGES CXX) @@ -40,7 +40,7 @@ rapids_cuda_init_architectures(faiss_c_library) endif() project(faiss - VERSION 1.7.4 + VERSION 1.8.0 DESCRIPTION "A library for efficient similarity search and clustering of dense vectors." HOMEPAGE_URL "https://github.com/facebookresearch/faiss" LANGUAGES ${FAISS_LANGUAGES}) diff --git a/INSTALL.md b/INSTALL.md index bb10b4b026..39a711c43c 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -4,44 +4,47 @@ The supported way to install Faiss is through [conda](https://docs.conda.io). Stable releases are pushed regularly to the pytorch conda channel, as well as pre-release nightly builds. -The CPU-only `faiss-cpu` conda package is currently available on Linux, OSX, and -Windows. The `faiss-gpu`, containing both CPU and GPU indices, is available on -Linux systems, for CUDA 11.4. Packages are built for Python versions 3.8-3.10.
+- The CPU-only faiss-cpu conda package is currently available on Linux (x86_64 and arm64), OSX (arm64 only), and Windows (x86_64). +- faiss-gpu, containing both CPU and GPU indices, is available on Linux (x86_64 only) for CUDA 11.4 and 12.1. +- NEW: faiss-gpu-raft, containing both CPU and GPU indices provided by NVIDIA RAFT, is available on Linux (x86_64 only) for CUDA 11.8 and 12.1. To install the latest stable release: ``` shell # CPU-only version -$ conda install -c pytorch faiss-cpu=1.7.4 mkl=2021 blas=1.0=mkl +$ conda install -c pytorch faiss-cpu=1.8.0 # GPU(+CPU) version -$ conda install -c pytorch -c nvidia faiss-gpu=1.7.4 mkl=2021 blas=1.0=mkl +$ conda install -c pytorch -c nvidia faiss-gpu=1.8.0 + +# GPU(+CPU) version with NVIDIA RAFT +$ conda install -c pytorch -c nvidia -c rapidsai -c conda-forge faiss-gpu-raft=1.8.0 ``` -For faiss-gpu, the nvidia channel is required for cudatoolkit=11.4, which is not +For faiss-gpu, the nvidia channel is required for CUDA, which is not published in the main anaconda channel. -NOTE: due to a bug in the latest 1.7.4 release, Intel MKL 2021 needs to be installed -separately where applicable. Remove the MKL reference when installing on -non-Intel platforms. +For faiss-gpu-raft, the nvidia, rapidsai and conda-forge channels are required. -Nightly pre-release packages can be installed as follows. There is no need to -install MKL separately, the correct package is automatically installed as a -dependency where necessary: +Nightly pre-release packages can be installed as follows: ``` shell # CPU-only version $ conda install -c pytorch/label/nightly faiss-cpu # GPU(+CPU) version $ conda install -c pytorch/label/nightly -c nvidia faiss-gpu=1.8.0 + +# GPU(+CPU) version with NVIDIA RAFT +$ conda install -c pytorch -c nvidia -c rapidsai -c conda-forge faiss-gpu-raft=1.8.0 pytorch pytorch-cuda numpy ``` +In the above commands, pytorch-cuda=11 or pytorch-cuda=12 selects a specific CUDA version, if one is required.
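After installing any of the variants above, a quick import-level sanity check confirms the package works; a minimal sketch (note that `get_num_gpus` is only exposed by the GPU-enabled builds):

```python
import faiss

print(faiss.__version__)  # expect 1.8.0 for the stable release
# present only in faiss-gpu / faiss-gpu-raft builds:
if hasattr(faiss, "get_num_gpus"):
    print("GPUs visible to Faiss:", faiss.get_num_gpus())
```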
-A combination of versions that installs GPU Faiss with CUDA 11.4 and Pytorch (as of 2023-06-19): +A combination of versions that installs GPU Faiss with CUDA and Pytorch (as of 2024-05-15): ``` -conda create --name faiss_1.7.4 python=3.10 -conda activate faiss_1.7.4 -conda install faiss-gpu=1.7.4 mkl=2021 pytorch pytorch-cuda numpy -c pytorch -c nvidia +conda create --name faiss_1.8.0 +conda activate faiss_1.8.0 +conda install -c pytorch -c nvidia faiss-gpu=1.8.0 pytorch=*=*cuda* pytorch-cuda=11 numpy ``` ## Installing from conda-forge diff --git a/benchs/bench_cppcontrib_sa_decode.cpp b/benchs/bench_cppcontrib_sa_decode.cpp index c5c6b0bf18..b960fb7c6a 100644 --- a/benchs/bench_cppcontrib_sa_decode.cpp +++ b/benchs/bench_cppcontrib_sa_decode.cpp @@ -213,10 +213,9 @@ static void verifyIndex2LevelDecoder( // evaluate the error double error = getError(n, d, outputFaiss, outputKernel1); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "store_seq" - << "\t" << nIterations << "\t" << timeFaiss << "\t" - << timeKernel << "\t" << error << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\tstore_seq\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel + << "\t" << error << std::endl; } ////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -262,11 +261,9 @@ static void verifyIndex2LevelDecoder( // evaluate the error const double error = getError(n, d, outputFaiss, outputKernel1); - - std::cout << description << "\t" << n << "\t" << d << "\t" - << "store_rnd" - << "\t" << nIterations << "\t" << timeFaiss << "\t" - << timeKernel << "\t" << error << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\tstore_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel + << "\t" << error << std::endl; } ////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -326,10 +323,9 @@ static void verifyIndex2LevelDecoder( // evaluate the error const double error1 = getError(n, d, outputFaiss, outputKernel1); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum_rnd" - << "\t" << nIterations << "\t" << timeFaiss << "\t" - << timeKernel1 << "\t" << error1 << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\taccum_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel1 + << "\t" << error1 << std::endl; // kernels: accum 2 points, shared centroids StopWatch swKernel2; @@ -356,10 +352,9 @@ static void verifyIndex2LevelDecoder( // evaluate the error const double error2 = getError(n, d, outputFaiss, outputKernel2); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum2_rnd" - << "\t" << nIterations << "\t" << timeFaiss << "\t" - << timeKernel2 << "\t" << error2 << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\taccum2_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel2 + << "\t" << error2 << std::endl; // kernels: accum 2 points, unique centroids StopWatch swKernel2u; @@ -388,10 +383,9 @@ static void verifyIndex2LevelDecoder( // evaluate the error const double error2u = getError(n, d, outputFaiss, outputKernel2u); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum2u_rnd" - << "\t" << nIterations << "\t" << timeFaiss << "\t" - << timeKernel2u << "\t" << error2u << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\taccum2u_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel2u + << 
"\t" << error2u << std::endl; // kernels: accum 3 points, shared centroids StopWatch swKernel3; @@ -423,10 +417,9 @@ static void verifyIndex2LevelDecoder( // evaluate the error const double error3 = getError(n, d, outputFaiss, outputKernel3); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum3_rnd" - << "\t" << nIterations << "\t" << timeFaiss << "\t" - << timeKernel3 << "\t" << error3 << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\taccum3_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel3 + << "\t" << error3 << std::endl; // kernels: accum 3 points, unique centroids StopWatch swKernel3u; @@ -462,10 +455,9 @@ static void verifyIndex2LevelDecoder( // evaluate the error const double error3u = getError(n, d, outputFaiss, outputKernel3u); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum3u_rnd" - << "\t" << nIterations << "\t" << timeFaiss << "\t" - << timeKernel3u << "\t" << error3u << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\taccum3u_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel3u + << "\t" << error3u << std::endl; } } @@ -531,10 +523,9 @@ static void verifyMinMaxIndex2LevelDecoder( // evaluate the error double error = getError(n, d, outputFaiss, outputKernel1); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "store_seq" - << "\t" << nIterations << "\t" << timeFaiss << "\t" - << timeKernel << "\t" << error << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\tstore_seq\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel + << "\t" << error << std::endl; } ////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -581,10 +572,9 @@ static void verifyMinMaxIndex2LevelDecoder( // evaluate the error const double error = getError(n, d, outputFaiss, outputKernel1); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "store_rnd" - << "\t" << nIterations << "\t" << timeFaiss << "\t" - << timeKernel << "\t" << error << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\tstore_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel + << "\t" << error << std::endl; } ////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -650,10 +640,9 @@ static void verifyMinMaxIndex2LevelDecoder( // evaluate the error const double error1 = getError(n, d, outputFaiss, outputKernel1); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum_rnd" - << "\t" << nIterations << "\t" << timeFaiss << "\t" - << timeKernel1 << "\t" << error1 << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\taccum_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel1 + << "\t" << error1 << std::endl; // kernels: accum 2 points, shared centroids StopWatch swKernel2; @@ -685,10 +674,9 @@ static void verifyMinMaxIndex2LevelDecoder( // evaluate the error const double error2 = getError(n, d, outputFaiss, outputKernel2); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum2_rnd" - << "\t" << nIterations << "\t" << timeFaiss << "\t" - << timeKernel2 << "\t" << error2 << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\taccum2_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel2 + << "\t" << error2 << std::endl; // kernels: accum 2 points, unique centroids StopWatch swKernel2u; @@ -722,10 +710,9 @@ static void 
verifyMinMaxIndex2LevelDecoder( // evaluate the error const double error2u = getError(n, d, outputFaiss, outputKernel2u); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum2u_rnd" - << "\t" << nIterations << "\t" << timeFaiss << "\t" - << timeKernel2u << "\t" << error2u << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\taccum2u_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel2u + << "\t" << error2u << std::endl; // kernels: accum 3 points, shared centroids StopWatch swKernel3; @@ -762,10 +749,9 @@ static void verifyMinMaxIndex2LevelDecoder( // evaluate the error const double error3 = getError(n, d, outputFaiss, outputKernel3); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum3_rnd" - << "\t" << nIterations << "\t" << timeFaiss << "\t" - << timeKernel3 << "\t" << error3 << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\taccum3_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel3 + << "\t" << error3 << std::endl; // kernels: accum 3 points, unique centroids StopWatch swKernel3u; @@ -806,10 +792,9 @@ static void verifyMinMaxIndex2LevelDecoder( // evaluate the error const double error3u = getError(n, d, outputFaiss, outputKernel3u); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum3u_rnd" - << "\t" << nIterations << "\t" << timeFaiss << "\t" - << timeKernel3u << "\t" << error3u << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\taccum3u_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel3u + << "\t" << error3u << std::endl; } } @@ -865,10 +850,9 @@ static void verifyIndexPQDecoder( // evaluate the error double error = getError(n, d, outputFaiss, outputKernel1); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "store_seq" - << "\t" << nIterations << "\t" << timeFaiss << "\t" - << timeKernel << "\t" << error << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\tstore_seq\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel + << "\t" << error << std::endl; } ////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -914,10 +898,9 @@ static void verifyIndexPQDecoder( // evaluate the error const double error = getError(n, d, outputFaiss, outputKernel1); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "store_rnd" - << "\t" << nIterations << "\t" << timeFaiss << "\t" - << timeKernel << "\t" << error << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\tstore_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel + << "\t" << error << std::endl; } ////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -977,10 +960,9 @@ static void verifyIndexPQDecoder( // evaluate the error const double error1 = getError(n, d, outputFaiss, outputKernel1); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum_rnd" - << "\t" << nIterations << "\t" << timeFaiss << "\t" - << timeKernel1 << "\t" << error1 << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\taccum_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel1 + << "\t" << error1 << std::endl; // kernels: accum 2 points, shared centroids StopWatch swKernel2; @@ -1006,10 +988,9 @@ static void verifyIndexPQDecoder( // evaluate the error const double error2 = getError(n, d, outputFaiss, outputKernel2); - std::cout << description << "\t" << n 
<< "\t" << d << "\t" - << "accum2_rnd" - << "\t" << nIterations << "\t" << timeFaiss << "\t" - << timeKernel2 << "\t" << error2 << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\taccum2_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel2 + << "\t" << error2 << std::endl; // kernels: accum 2 points, unique centroids StopWatch swKernel2u; @@ -1036,10 +1017,9 @@ static void verifyIndexPQDecoder( // evaluate the error const double error2u = getError(n, d, outputFaiss, outputKernel2u); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum2u_rnd" - << "\t" << nIterations << "\t" << timeFaiss << "\t" - << timeKernel2u << "\t" << error2u << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\taccum2u_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel2u + << "\t" << error2u << std::endl; // kernels: accum 3 points, shared centroids StopWatch swKernel3; @@ -1070,10 +1050,9 @@ static void verifyIndexPQDecoder( // evaluate the error const double error3 = getError(n, d, outputFaiss, outputKernel3); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum3_rnd" - << "\t" << nIterations << "\t" << timeFaiss << "\t" - << timeKernel3 << "\t" << error3 << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\taccum3_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel3 + << "\t" << error3 << std::endl; // kernels: accum 3 points, unique centroids StopWatch swKernel3u; @@ -1106,10 +1085,9 @@ static void verifyIndexPQDecoder( // evaluate the error const double error3u = getError(n, d, outputFaiss, outputKernel3u); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum3u_rnd" - << "\t" << nIterations << "\t" << timeFaiss << "\t" - << timeKernel3u << "\t" << error3u << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\taccum3u_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel3u + << "\t" << error3u << std::endl; } } @@ -1170,10 +1148,9 @@ static void verifyMinMaxIndexPQDecoder( // evaluate the error double error = getError(n, d, outputFaiss, outputKernel1); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "store_seq" - << "\t" << nIterations << "\t" << timeFaiss << "\t" - << timeKernel << "\t" << error << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\tstore_seq\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel + << "\t" << error << std::endl; } ////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -1219,10 +1196,9 @@ static void verifyMinMaxIndexPQDecoder( // evaluate the error const double error = getError(n, d, outputFaiss, outputKernel1); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "store_rnd" - << "\t" << nIterations << "\t" << timeFaiss << "\t" - << timeKernel << "\t" << error << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\tstore_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel + << "\t" << error << std::endl; } ////////////////////////////////////////////////////////////////////////////////////////////////////////// @@ -1287,10 +1263,9 @@ static void verifyMinMaxIndexPQDecoder( // evaluate the error const double error1 = getError(n, d, outputFaiss, outputKernel1); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum_rnd" - << "\t" << nIterations << "\t" << timeFaiss << "\t" - << timeKernel1 << "\t" << error1 << std::endl; + 
std::cout << description << "\t" << n << "\t" << d << "\taccum_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel1 + << "\t" << error1 << std::endl; // kernels: accum 2 points, shared centroids StopWatch swKernel2; @@ -1321,10 +1296,9 @@ static void verifyMinMaxIndexPQDecoder( // evaluate the error const double error2 = getError(n, d, outputFaiss, outputKernel2); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum2_rnd" - << "\t" << nIterations << "\t" << timeFaiss << "\t" - << timeKernel2 << "\t" << error2 << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\taccum2_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel2 + << "\t" << error2 << std::endl; // kernels: accum 2 points, unique centroids StopWatch swKernel2u; @@ -1356,10 +1330,9 @@ static void verifyMinMaxIndexPQDecoder( // evaluate the error const double error2u = getError(n, d, outputFaiss, outputKernel2u); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum2u_rnd" - << "\t" << nIterations << "\t" << timeFaiss << "\t" - << timeKernel2u << "\t" << error2u << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\taccum2u_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel2u + << "\t" << error2u << std::endl; // kernels: accum 3 points, shared centroids StopWatch swKernel3; @@ -1395,10 +1368,9 @@ static void verifyMinMaxIndexPQDecoder( // evaluate the error const double error3 = getError(n, d, outputFaiss, outputKernel3); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum3_rnd" - << "\t" << nIterations << "\t" << timeFaiss << "\t" - << timeKernel3 << "\t" << error3 << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\taccum3_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel3 + << "\t" << error3 << std::endl; // kernels: accum 3 points, unique centroids StopWatch swKernel3u; @@ -1436,10 +1408,9 @@ static void verifyMinMaxIndexPQDecoder( // evaluate the error const double error3u = getError(n, d, outputFaiss, outputKernel3u); - std::cout << description << "\t" << n << "\t" << d << "\t" - << "accum3u_rnd" - << "\t" << nIterations << "\t" << timeFaiss << "\t" - << timeKernel3u << "\t" << error3u << std::endl; + std::cout << description << "\t" << n << "\t" << d << "\taccum3u_rnd\t" + << nIterations << "\t" << timeFaiss << "\t" << timeKernel3u + << "\t" << error3u << std::endl; } } @@ -1512,14 +1483,11 @@ int main(int argc, char** argv) { (N_ITERATIONS % 6) == 0, "Number of iterations should be 6*x"); // print the header - std::cout << "Codec\t" - << "n\t" - << "d\t" - << "Experiment\t" - << "Iterations\t" - << "Faiss time\t" - << "SADecodeKernel time\t" - << "Error" << std::endl; + auto delim = "\t"; + std::cout << "Codec" << delim << "n" << delim << "d" << delim + << "Experiment" << delim << "Iterations" << delim << "Faiss time" + << delim << "SADecodeKernel time" << delim << "Error" + << std::endl; // The following experiment types are available: // * store_seq - decode a contiguous block of codes into vectors, one by one diff --git a/benchs/bench_fw/benchmark.py b/benchs/bench_fw/benchmark.py index 1053f99388..8ca68c4cd8 100644 --- a/benchs/bench_fw/benchmark.py +++ b/benchs/bench_fw/benchmark.py @@ -208,9 +208,11 @@ def set_io(self, benchmark_io): self.io.distance_metric = self.distance_metric self.io.distance_metric_type = self.distance_metric_type - def get_index_desc(self, factory: str) -> Optional[IndexDescriptor]: + def get_index_desc(self, 
factory_or_codec: str) -> Optional[IndexDescriptor]: for desc in self.index_descs: - if desc.factory == factory: + if desc.factory == factory_or_codec: + return desc + if desc.codec_alias == factory_or_codec: return desc return None @@ -232,7 +234,7 @@ def range_search_reference(self, index, parameters, range_metric): parameters, radius=m_radius, ) - flat = index.factory == "Flat" + flat = index.is_flat_index() ( gt_radius, range_search_metric_function, @@ -650,6 +652,7 @@ def benchmark( f"Range index {index_desc.factory} has no radius_score" ) results["metrics"] = {} + self.build_index_wrapper(index_desc) for metric_key, range_metric in index_desc.range_metrics.items(): ( gt_radius, diff --git a/benchs/bench_fw/descriptors.py b/benchs/bench_fw/descriptors.py index f1dd7354c2..173b07ce16 100644 --- a/benchs/bench_fw/descriptors.py +++ b/benchs/bench_fw/descriptors.py @@ -20,6 +20,7 @@ class IndexDescriptor: # but not both at the same time. path: Optional[str] = None factory: Optional[str] = None + codec_alias: Optional[str] = None construction_params: Optional[List[Dict[str, int]]] = None search_params: Optional[Dict[str, int]] = None # range metric definitions diff --git a/benchs/bench_fw/index.py b/benchs/bench_fw/index.py index 14f2158e64..3deaa4afcf 100644 --- a/benchs/bench_fw/index.py +++ b/benchs/bench_fw/index.py @@ -495,7 +495,7 @@ def range_search( radius: Optional[float] = None, ): logger.info("range_search: begin") - if search_parameters is not None and search_parameters["snap"] == 1: + if search_parameters is not None and search_parameters.get("snap") == 1: query_vectors = self.snap(query_vectors) filename = ( self.get_range_search_name( @@ -776,6 +776,9 @@ def add_range_or_val(name, range): ) return op + def is_flat_index(self): + return self.get_index_name().startswith("Flat") + # IndexFromCodec, IndexFromQuantizer and IndexFromPreTransform # are used to wrap pre-trained Faiss indices (codecs) @@ -807,6 +810,9 @@ def get_codec_name(self): name += Index.param_dict_list_to_name(self.construction_params) return name + def fetch_meta(self, dry_run=False): + return None, None + def fetch_codec(self): codec = self.io.read_index( os.path.basename(self.path), @@ -911,7 +917,7 @@ def fetch_codec(self, dry_run=False): assert codec_size is not None meta = { "training_time": training_time, - "training_size": self.training_vectors.num_vectors, + "training_size": self.training_vectors.num_vectors if self.training_vectors else 0, "codec_size": codec_size, "sa_code_size": self.get_sa_code_size(codec), "code_size": self.get_code_size(codec), diff --git a/benchs/bench_fw/optimize.py b/benchs/bench_fw/optimize.py index 473436ea68..a2653b7144 100644 --- a/benchs/bench_fw/optimize.py +++ b/benchs/bench_fw/optimize.py @@ -226,6 +226,7 @@ def optimize_codec( [ (None, "Flat"), (None, "SQfp16"), + (None, "SQbf16"), (None, "SQ8"), ] + [ (f"OPQ{M}_{M * dim}", f"PQ{M}x{b}") diff --git a/benchs/bench_fw_ivf.py b/benchs/bench_fw_ivf.py index 8c84743e27..e9e144c569 100644 --- a/benchs/bench_fw_ivf.py +++ b/benchs/bench_fw_ivf.py @@ -3,16 +3,20 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
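The new `codec_alias` field added to `IndexDescriptor` above lets `get_index_desc` resolve a descriptor by either its factory string or an alternate codec name. A hypothetical sketch of a descriptor using it (the factory string and alias values are made up for illustration):

```python
from faiss.benchs.bench_fw.descriptors import IndexDescriptor

desc = IndexDescriptor(
    factory="OPQ32,IVF4096,PQ32",    # made-up factory string
    codec_alias="opq_ivf4096_pq32",  # alternate name matched by get_index_desc
)
```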
-import logging import argparse +import logging import os -from bench_fw.benchmark import Benchmark -from bench_fw.benchmark_io import BenchmarkIO -from bench_fw.descriptors import DatasetDescriptor, IndexDescriptor +from faiss.benchs.bench_fw.benchmark import Benchmark +from faiss.benchs.bench_fw.benchmark_io import BenchmarkIO +from faiss.benchs.bench_fw.descriptors import ( + DatasetDescriptor, + IndexDescriptor, +) logging.basicConfig(level=logging.INFO) + def sift1M(bio): benchmark = Benchmark( num_threads=32, @@ -37,6 +41,7 @@ def sift1M(bio): benchmark.set_io(bio) benchmark.benchmark(result_file="result.json", local=False, train=True, reconstruct=False, knn=True, range=False) + def bigann(bio): for scale in [1, 2, 5, 10, 20, 50]: benchmark = Benchmark( diff --git a/benchs/link_and_code/README.md b/benchs/link_and_code/README.md index 697c7bdfc6..0c04cadac5 100644 --- a/benchs/link_and_code/README.md +++ b/benchs/link_and_code/README.md @@ -21,138 +21,5 @@ graph to improve the reconstruction. It is described in ArXiV [here](https://arxiv.org/abs/1804.09996) -Code structure --------------- - -The test runs with 3 files: - -- `bench_link_and_code.py`: driver script - -- `datasets.py`: code to load the datasets. The example code runs on the - deep1b and bigann datasets. See the [toplevel README](../README.md) - on how to download them. They should be put in a directory, edit - datasets.py to set the path. - -- `neighbor_codec.py`: this is where the representation is trained. - -The code runs on top of Faiss. The HNSW index can be extended with a -`ReconstructFromNeighbors` C++ object that refines the distances. The -training is implemented in Python. - -Update: 2023-12-28: the current Faiss dropped support for reconstruction with -this method. - -Reproducing Table 2 in the paper --------------------------------- - -The results of table 2 (accuracy on deep100M) in the paper can be -obtained with: - -```bash -python bench_link_and_code.py \ - --db deep100M \ - --M0 6 \ - --indexkey OPQ36_144,HNSW32_PQ36 \ - --indexfile $bdir/deep100M_PQ36_L6.index \ - --beta_nsq 4 \ - --beta_centroids $bdir/deep100M_PQ36_L6_nsq4.npy \ - --neigh_recons_codes $bdir/deep100M_PQ36_L6_nsq4_codes.npy \ - --k_reorder 0,5 --efSearch 1,1024 -``` - -Set `bdir` to a scratch directory. - -Explanation of the flags: - -- `--db deep1M`: dataset to process - -- `--M0 6`: number of links on the base level (L6) - -- `--indexkey OPQ36_144,HNSW32_PQ36`: Faiss index key to construct the - HNSW structure. It means that vectors are transformed by OPQ and - encoded with PQ 36x8 (with an intermediate size of 144D). The HNSW - level>0 nodes have 32 links (theses ones are "cheap" to store - because there are fewer nodes in the upper levels. - -- `--indexfile $bdir/deep1M_PQ36_M6.index`: name of the index file - (without information for the L&C extension) - -- `--beta_nsq 4`: number of bytes to allocate for the codes (M in the - paper) - -- `--beta_centroids $bdir/deep1M_PQ36_M6_nsq4.npy`: filename to store - the trained beta centroids - -- `--neigh_recons_codes $bdir/deep1M_PQ36_M6_nsq4_codes.npy`: filename - for the encoded weights (beta) of the combination - -- `--k_reorder 0,5`: number of results to reorder. 0 = baseline - without reordering, 5 = value used throughout the paper - -- `--efSearch 1,1024`: number of nodes to visit (T in the paper) - -The script will proceed with the following steps: - -0. load dataset (and possibly compute the ground-truth if the -ground-truth file is not provided) - -1. train the OPQ encoder - -2. 
build the index and store it - -3. compute the residuals and train the beta vocabulary to do the reconstruction - -4. encode the vertices - -5. search and evaluate the search results. - -With option `--exhaustive` the results of the exhaustive column can be -obtained. - -The run above should output: -```bash -... -setting k_reorder=5 -... -efSearch=1024 0.3132 ms per query, R@1: 0.4283 R@10: 0.6337 R@100: 0.6520 ndis 40941919 nreorder 50000 - -``` -which matches the paper's table 2. - -Note that in multi-threaded mode, the building of the HNSW structure -is not deterministic. Therefore, the results across runs may not be exactly the same. - -Reproducing Figure 5 in the paper ---------------------------------- - -Figure 5 just evaluates the combination of HNSW and PQ. For example, -the operating point L6&OPQ40 can be obtained with - -```bash -python bench_link_and_code.py \ - --db deep1M \ - --M0 6 \ - --indexkey OPQ40_160,HNSW32_PQ40 \ - --indexfile $bdir/deep1M_PQ40_M6.index \ - --beta_nsq 1 --beta_k 1 \ - --beta_centroids $bdir/deep1M_PQ40_M6_nsq0.npy \ - --neigh_recons_codes $bdir/deep1M_PQ36_M6_nsq0_codes.npy \ - --k_reorder 0 --efSearch 16,64,256,1024 -``` - -The arguments are similar to the previous table. Note that nsq = 0 is -simulated by setting beta_nsq = 1 and beta_k = 1 (ie a code with a single -reproduction value). - -The output should look like: - -```bash -setting k_reorder=0 -efSearch=16 0.0147 ms per query, R@1: 0.3409 R@10: 0.4388 R@100: 0.4394 ndis 2629735 nreorder 0 -efSearch=64 0.0122 ms per query, R@1: 0.4836 R@10: 0.6490 R@100: 0.6509 ndis 4623221 nreorder 0 -efSearch=256 0.0344 ms per query, R@1: 0.5730 R@10: 0.7915 R@100: 0.7951 ndis 11090176 nreorder 0 -efSearch=1024 0.2656 ms per query, R@1: 0.6212 R@10: 0.8722 R@100: 0.8765 ndis 33501951 nreorder 0 -``` - -The results with k_reorder=5 are not reported in the paper, they -represent the performance of a "free coding" version of the algorithm. +The necessary code for this paper was removed from Faiss in version 1.8.0. +For a functioning version, use Faiss 1.7.4. diff --git a/benchs/link_and_code/bench_link_and_code.py b/benchs/link_and_code/bench_link_and_code.py deleted file mode 100755 index ed8f86d631..0000000000 --- a/benchs/link_and_code/bench_link_and_code.py +++ /dev/null @@ -1,300 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree.
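Since the README now points users back to Faiss 1.7.4, a small guard along these lines could be added to any remaining driver script. This is a hypothetical sketch, not part of the change; it only assumes that the Faiss Python module exposes `__version__`, which recent packages do.

```python
import faiss

# link_and_code depends on ReconstructFromNeighbors, which was removed in
# Faiss 1.8.0, so fail fast on newer versions.
major, minor = (int(v) for v in faiss.__version__.split(".")[:2])
if (major, minor) > (1, 7):
    raise RuntimeError(
        f"link_and_code requires faiss <= 1.7.4, found {faiss.__version__}")
```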
- -from __future__ import print_function -import os -import sys -import time -import numpy as np -import faiss -import argparse -import datasets -from datasets import sanitize -import neighbor_codec - -###################################################### -# Command-line parsing -###################################################### - - -parser = argparse.ArgumentParser() - -def aa(*args, **kwargs): - group.add_argument(*args, **kwargs) - -group = parser.add_argument_group('dataset options') - -aa('--db', default='deep1M', help='dataset') -aa( '--compute_gt', default=False, action='store_true', - help='compute and store the groundtruth') - -group = parser.add_argument_group('index consturction') - -aa('--indexkey', default='HNSW32', help='index_factory type') -aa('--efConstruction', default=200, type=int, - help='HNSW construction factor') -aa('--M0', default=-1, type=int, help='size of base level') -aa('--maxtrain', default=256 * 256, type=int, - help='maximum number of training points') -aa('--indexfile', default='', help='file to read or write index from') -aa('--add_bs', default=-1, type=int, - help='add elements index by batches of this size') -aa('--link_singletons', default=False, action='store_true', - help='do a pass to link in the singletons') - -group = parser.add_argument_group( - 'searching (reconstruct_from_neighbors options)') - -aa('--beta_centroids', default='', - help='file with codebook') -aa('--neigh_recons_codes', default='', - help='file with codes for reconstruction') -aa('--beta_ntrain', default=250000, type=int, help='') -aa('--beta_k', default=256, type=int, help='beta codebook size') -aa('--beta_nsq', default=1, type=int, help='number of beta sub-vectors') -aa('--beta_niter', default=10, type=int, help='') -aa('--k_reorder', default='-1', help='') - -group = parser.add_argument_group('searching') - -aa('--k', default=100, type=int, help='nb of nearest neighbors') -aa('--exhaustive', default=False, action='store_true', - help='report the exhaustive search topline') -aa('--searchthreads', default=-1, type=int, - help='nb of threads to use at search time') -aa('--efSearch', default='', type=str, - help='comma-separated values of efSearch to try') - -args = parser.parse_args() - -print("args:", args) - - -###################################################### -# Load dataset -###################################################### - -xt, xb, xq, gt = datasets.load_data( - dataset=args.db, compute_gt=args.compute_gt) - -nq, d = xq.shape -nb, d = xb.shape - - -###################################################### -# Make index -###################################################### - -if os.path.exists(args.indexfile): - - print("reading", args.indexfile) - index = faiss.read_index(args.indexfile) - - if isinstance(index, faiss.IndexPreTransform): - index_hnsw = faiss.downcast_index(index.index) - vec_transform = index.chain.at(0).apply_py - else: - index_hnsw = index - vec_transform = lambda x:x - - hnsw = index_hnsw.hnsw - hnsw_stats = faiss.cvar.hnsw_stats - -else: - - print("build index, key=", args.indexkey) - - index = faiss.index_factory(d, args.indexkey) - - if isinstance(index, faiss.IndexPreTransform): - index_hnsw = faiss.downcast_index(index.index) - vec_transform = index.chain.at(0).apply_py - else: - index_hnsw = index - vec_transform = lambda x:x - - hnsw = index_hnsw.hnsw - hnsw.efConstruction = args.efConstruction - hnsw_stats = faiss.cvar.hnsw_stats - index.verbose = True - index_hnsw.verbose = True - index_hnsw.storage.verbose = True - - if args.M0 
!= -1: - print("set level 0 nb of neighbors to", args.M0) - hnsw.set_nb_neighbors(0, args.M0) - - xt2 = sanitize(xt[:args.maxtrain]) - assert np.all(np.isfinite(xt2)) - - print("train, size", xt.shape) - t0 = time.time() - index.train(xt2) - print(" train in %.3f s" % (time.time() - t0)) - - print("adding") - t0 = time.time() - if args.add_bs == -1: - index.add(sanitize(xb)) - else: - for i0 in range(0, nb, args.add_bs): - i1 = min(nb, i0 + args.add_bs) - print(" adding %d:%d / %d" % (i0, i1, nb)) - index.add(sanitize(xb[i0:i1])) - - print(" add in %.3f s" % (time.time() - t0)) - print("storing", args.indexfile) - faiss.write_index(index, args.indexfile) - - -###################################################### -# Train beta centroids and encode dataset -###################################################### - -if args.beta_centroids: - print("reordering links") - index_hnsw.reorder_links() - - if os.path.exists(args.beta_centroids): - print("load", args.beta_centroids) - beta_centroids = np.load(args.beta_centroids) - nsq, k, M1 = beta_centroids.shape - assert M1 == hnsw.nb_neighbors(0) + 1 - - rfn = faiss.ReconstructFromNeighbors(index_hnsw, k, nsq) - else: - print("train beta centroids") - rfn = faiss.ReconstructFromNeighbors( - index_hnsw, args.beta_k, args.beta_nsq) - - xb_full = vec_transform(sanitize(xb[:args.beta_ntrain])) - - beta_centroids = neighbor_codec.train_beta_codebook( - rfn, xb_full, niter=args.beta_niter) - - print(" storing", args.beta_centroids) - np.save(args.beta_centroids, beta_centroids) - - - faiss.copy_array_to_vector(beta_centroids.ravel(), - rfn.codebook) - index_hnsw.reconstruct_from_neighbors = rfn - - if rfn.k == 1: - pass # no codes to take care of - elif os.path.exists(args.neigh_recons_codes): - print("loading neigh codes", args.neigh_recons_codes) - codes = np.load(args.neigh_recons_codes) - assert codes.size == rfn.code_size * index.ntotal - faiss.copy_array_to_vector(codes.astype('uint8'), - rfn.codes) - rfn.ntotal = index.ntotal - else: - print("encoding neigh codes") - t0 = time.time() - - bs = 1000000 if args.add_bs == -1 else args.add_bs - - for i0 in range(0, nb, bs): - i1 = min(i0 + bs, nb) - print(" encode %d:%d / %d [%.3f s]\r" % ( - i0, i1, nb, time.time() - t0), end=' ') - sys.stdout.flush() - xbatch = vec_transform(sanitize(xb[i0:i1])) - rfn.add_codes(i1 - i0, faiss.swig_ptr(xbatch)) - print() - - print("storing %s" % args.neigh_recons_codes) - codes = faiss.vector_to_array(rfn.codes) - np.save(args.neigh_recons_codes, codes) - -###################################################### -# Exhaustive evaluation -###################################################### - -if args.exhaustive: - print("exhaustive evaluation") - xq_tr = vec_transform(sanitize(xq)) - index2 = faiss.IndexFlatL2(index_hnsw.d) - accu_recons_error = 0.0 - - if faiss.get_num_gpus() > 0: - print("do eval on GPU") - co = faiss.GpuMultipleClonerOptions() - co.shard = False - index2 = faiss.index_cpu_to_all_gpus(index2, co) - - # process in batches in case the dataset does not fit in RAM - rh = datasets.ResultHeap(xq_tr.shape[0], 100) - t0 = time.time() - bs = 500000 - for i0 in range(0, nb, bs): - i1 = min(nb, i0 + bs) - print(' handling batch %d:%d' % (i0, i1)) - - xb_recons = np.empty( - (i1 - i0, index_hnsw.d), dtype='float32') - rfn.reconstruct_n(i0, i1 - i0, faiss.swig_ptr(xb_recons)) - - accu_recons_error += ( - (vec_transform(sanitize(xb[i0:i1])) - - xb_recons)**2).sum() - - index2.reset() - index2.add(xb_recons) - D, I = index2.search(xq_tr, 100) - 
rh.add_batch_result(D, I, i0) - - rh.finalize() - del index2 - t1 = time.time() - print("done in %.3f s" % (t1 - t0)) - print("total reconstruction error: ", accu_recons_error) - print("eval retrieval:") - datasets.evaluate_DI(rh.D, rh.I, gt) - - -def get_neighbors(hnsw, i, level): - " list the neighbors for node i at level " - assert i < hnsw.levels.size() - assert level < hnsw.levels.at(i) - be = np.empty(2, 'uint64') - hnsw.neighbor_range(i, level, faiss.swig_ptr(be), faiss.swig_ptr(be[1:])) - return [hnsw.neighbors.at(j) for j in range(be[0], be[1])] - - -############################################################# -# Index is ready -############################################################# - -xq = sanitize(xq) - -if args.searchthreads != -1: - print("Setting nb of threads to", args.searchthreads) - faiss.omp_set_num_threads(args.searchthreads) - - -if gt is None: - print("no valid groundtruth -- exit") - sys.exit() - - -k_reorders = [int(x) for x in args.k_reorder.split(',')] -efSearchs = [int(x) for x in args.efSearch.split(',')] - - -for k_reorder in k_reorders: - - if index_hnsw.reconstruct_from_neighbors: - print("setting k_reorder=%d" % k_reorder) - index_hnsw.reconstruct_from_neighbors.k_reorder = k_reorder - - for efSearch in efSearchs: - print("efSearch=%-4d" % efSearch, end=' ') - hnsw.efSearch = efSearch - hnsw_stats.reset() - datasets.evaluate(xq, gt, index, k=args.k, endl=False) - - print("ndis %d nreorder %d" % (hnsw_stats.ndis, hnsw_stats.nreorder)) diff --git a/benchs/link_and_code/datasets.py b/benchs/link_and_code/datasets.py deleted file mode 100755 index a043eb8883..0000000000 --- a/benchs/link_and_code/datasets.py +++ /dev/null @@ -1,236 +0,0 @@ -#! /usr/bin/env python2 - -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -""" -Common functions to load datasets and compute their ground-truth -""" -from __future__ import print_function - -import time -import numpy as np -import faiss -import pdb -import sys - -# set this to the directory that contains the datafiles. 
-# deep1b data should be at simdir + 'deep1b' -# bigann data should be at simdir + 'bigann' -simdir = '/mnt/vol/gfsai-east/ai-group/datasets/simsearch/' - -################################################################# -# Small I/O functions -################################################################# - - -def ivecs_read(fname): - a = np.fromfile(fname, dtype='int32') - d = a[0] - return a.reshape(-1, d + 1)[:, 1:].copy() - - -def fvecs_read(fname): - return ivecs_read(fname).view('float32') - - -def ivecs_mmap(fname): - a = np.memmap(fname, dtype='int32', mode='r') - d = a[0] - return a.reshape(-1, d + 1)[:, 1:] - - -def fvecs_mmap(fname): - return ivecs_mmap(fname).view('float32') - - -def bvecs_mmap(fname): - x = np.memmap(fname, dtype='uint8', mode='r') - d = x[:4].view('int32')[0] - return x.reshape(-1, d + 4)[:, 4:] - - -def ivecs_write(fname, m): - n, d = m.shape - m1 = np.empty((n, d + 1), dtype='int32') - m1[:, 0] = d - m1[:, 1:] = m - m1.tofile(fname) - - -def fvecs_write(fname, m): - m = m.astype('float32') - ivecs_write(fname, m.view('int32')) - - -################################################################# -# Dataset -################################################################# - -def sanitize(x): - return np.ascontiguousarray(x, dtype='float32') - - -class ResultHeap: - """ Combine query results from a sliced dataset """ - - def __init__(self, nq, k): - " nq: number of query vectors, k: number of results per query " - self.I = np.zeros((nq, k), dtype='int64') - self.D = np.zeros((nq, k), dtype='float32') - self.nq, self.k = nq, k - heaps = faiss.float_maxheap_array_t() - heaps.k = k - heaps.nh = nq - heaps.val = faiss.swig_ptr(self.D) - heaps.ids = faiss.swig_ptr(self.I) - heaps.heapify() - self.heaps = heaps - - def add_batch_result(self, D, I, i0): - assert D.shape == (self.nq, self.k) - assert I.shape == (self.nq, self.k) - I += i0 - self.heaps.addn_with_ids( - self.k, faiss.swig_ptr(D), - faiss.swig_ptr(I), self.k) - - def finalize(self): - self.heaps.reorder() - - - -def compute_GT_sliced(xb, xq, k): - print("compute GT") - t0 = time.time() - nb, d = xb.shape - nq, d = xq.shape - rh = ResultHeap(nq, k) - bs = 10 ** 5 - - xqs = sanitize(xq) - - db_gt = faiss.index_cpu_to_all_gpus(faiss.IndexFlatL2(d)) - - # compute ground-truth by blocks of bs, and add to heaps - for i0 in range(0, nb, bs): - i1 = min(nb, i0 + bs) - xsl = sanitize(xb[i0:i1]) - db_gt.add(xsl) - D, I = db_gt.search(xqs, k) - rh.add_batch_result(D, I, i0) - db_gt.reset() - print("\r %d/%d, %.3f s" % (i0, nb, time.time() - t0), end=' ') - sys.stdout.flush() - print() - rh.finalize() - gt_I = rh.I - - print("GT time: %.3f s" % (time.time() - t0)) - return gt_I - - -def do_compute_gt(xb, xq, k): - print("computing GT") - nb, d = xb.shape - index = faiss.index_cpu_to_all_gpus(faiss.IndexFlatL2(d)) - if nb < 100 * 1000: - print(" add") - index.add(np.ascontiguousarray(xb, dtype='float32')) - print(" search") - D, I = index.search(np.ascontiguousarray(xq, dtype='float32'), k) - else: - I = compute_GT_sliced(xb, xq, k) - - return I.astype('int32') - - -def load_data(dataset='deep1M', compute_gt=False): - - print("load data", dataset) - - if dataset == 'sift1M': - basedir = simdir + 'sift1M/' - - xt = fvecs_read(basedir + "sift_learn.fvecs") - xb = fvecs_read(basedir + "sift_base.fvecs") - xq = fvecs_read(basedir + "sift_query.fvecs") - gt = ivecs_read(basedir + "sift_groundtruth.ivecs") - - elif dataset.startswith('bigann'): - basedir = simdir + 'bigann/' - - dbsize = 1000 if dataset == 
"bigann1B" else int(dataset[6:-1]) - xb = bvecs_mmap(basedir + 'bigann_base.bvecs') - xq = bvecs_mmap(basedir + 'bigann_query.bvecs') - xt = bvecs_mmap(basedir + 'bigann_learn.bvecs') - # trim xb to correct size - xb = xb[:dbsize * 1000 * 1000] - gt = ivecs_read(basedir + 'gnd/idx_%dM.ivecs' % dbsize) - - elif dataset.startswith("deep"): - basedir = simdir + 'deep1b/' - szsuf = dataset[4:] - if szsuf[-1] == 'M': - dbsize = 10 ** 6 * int(szsuf[:-1]) - elif szsuf == '1B': - dbsize = 10 ** 9 - elif szsuf[-1] == 'k': - dbsize = 1000 * int(szsuf[:-1]) - else: - assert False, "did not recognize suffix " + szsuf - - xt = fvecs_mmap(basedir + "learn.fvecs") - xb = fvecs_mmap(basedir + "base.fvecs") - xq = fvecs_read(basedir + "deep1B_queries.fvecs") - - xb = xb[:dbsize] - - gt_fname = basedir + "%s_groundtruth.ivecs" % dataset - if compute_gt: - gt = do_compute_gt(xb, xq, 100) - print("store", gt_fname) - ivecs_write(gt_fname, gt) - - gt = ivecs_read(gt_fname) - - else: - assert False - - print("dataset %s sizes: B %s Q %s T %s" % ( - dataset, xb.shape, xq.shape, xt.shape)) - - return xt, xb, xq, gt - -################################################################# -# Evaluation -################################################################# - - -def evaluate_DI(D, I, gt): - nq = gt.shape[0] - k = I.shape[1] - rank = 1 - while rank <= k: - recall = (I[:, :rank] == gt[:, :1]).sum() / float(nq) - print("R@%d: %.4f" % (rank, recall), end=' ') - rank *= 10 - - -def evaluate(xq, gt, index, k=100, endl=True): - t0 = time.time() - D, I = index.search(xq, k) - t1 = time.time() - nq = xq.shape[0] - print("\t %8.4f ms per query, " % ( - (t1 - t0) * 1000.0 / nq), end=' ') - rank = 1 - while rank <= k: - recall = (I[:, :rank] == gt[:, :1]).sum() / float(nq) - print("R@%d: %.4f" % (rank, recall), end=' ') - rank *= 10 - if endl: - print() - return D, I diff --git a/benchs/link_and_code/neighbor_codec.py b/benchs/link_and_code/neighbor_codec.py deleted file mode 100755 index 54cad8168a..0000000000 --- a/benchs/link_and_code/neighbor_codec.py +++ /dev/null @@ -1,241 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. -# -# This source code is licensed under the MIT license found in the -# LICENSE file in the root directory of this source tree. - -""" -This is the training code for the link and code. Especially the -neighbors_kmeans function implements the EM-algorithm to find the -appropriate weightings and cluster them. 
-""" -from __future__ import print_function - -import time -import numpy as np -import faiss - -#---------------------------------------------------------- -# Utils -#---------------------------------------------------------- - -def sanitize(x): - return np.ascontiguousarray(x, dtype='float32') - - -def train_kmeans(x, k, ngpu, max_points_per_centroid=256): - "Runs kmeans on one or several GPUs" - d = x.shape[1] - clus = faiss.Clustering(d, k) - clus.verbose = True - clus.niter = 20 - clus.max_points_per_centroid = max_points_per_centroid - - if ngpu == 0: - index = faiss.IndexFlatL2(d) - else: - res = [faiss.StandardGpuResources() for i in range(ngpu)] - - flat_config = [] - for i in range(ngpu): - cfg = faiss.GpuIndexFlatConfig() - cfg.useFloat16 = False - cfg.device = i - flat_config.append(cfg) - - if ngpu == 1: - index = faiss.GpuIndexFlatL2(res[0], d, flat_config[0]) - else: - indexes = [faiss.GpuIndexFlatL2(res[i], d, flat_config[i]) - for i in range(ngpu)] - index = faiss.IndexReplicas() - for sub_index in indexes: - index.addIndex(sub_index) - - # perform the training - clus.train(x, index) - centroids = faiss.vector_float_to_array(clus.centroids) - - stats = clus.iteration_stats - stats = [stats.at(i) for i in range(stats.size())] - obj = np.array([st.obj for st in stats]) - print("final objective: %.4g" % obj[-1]) - - return centroids.reshape(k, d) - - -#---------------------------------------------------------- -# Learning the codebook from neighbors -#---------------------------------------------------------- - - -# works with both a full Inn table and dynamically generated neighbors - -def get_Inn_shape(Inn): - if type(Inn) != tuple: - return Inn.shape - return Inn[:2] - -def get_neighbor_table(x_coded, Inn, i): - if type(Inn) != tuple: - return x_coded[Inn[i,:],:] - rfn = x_coded - M, d = rfn.M, rfn.index.d - out = np.zeros((M + 1, d), dtype='float32') - int_i = int(i) - rfn.get_neighbor_table(int_i, faiss.swig_ptr(out)) - _, _, sq = Inn - return out[:, sq * rfn.dsub : (sq + 1) * rfn.dsub] - - -# Function that produces the best regression values from the vector -# and its neighbors -def regress_from_neighbors (x, x_coded, Inn): - (N, knn) = get_Inn_shape(Inn) - betas = np.zeros((N,knn)) - t0 = time.time() - for i in range (N): - xi = x[i,:] - NNi = get_neighbor_table(x_coded, Inn, i) - betas[i,:] = np.linalg.lstsq(NNi.transpose(), xi, rcond=0.01)[0] - if i % (N / 10) == 0: - print ("[%d:%d] %6.3fs" % (i, i + N / 10, time.time() - t0)) - return betas - - - -# find the best beta minimizing ||x-x_coded[Inn,:]*beta||^2 -def regress_opt_beta (x, x_coded, Inn): - (N, knn) = get_Inn_shape(Inn) - d = x.shape[1] - - # construct the linear system to be solved - X = np.zeros ((d*N)) - Y = np.zeros ((d*N, knn)) - for i in range (N): - X[i*d:(i+1)*d] = x[i,:] - neighbor_table = get_neighbor_table(x_coded, Inn, i) - Y[i*d:(i+1)*d, :] = neighbor_table.transpose() - beta_opt = np.linalg.lstsq(Y, X, rcond=0.01)[0] - return beta_opt - - -# Find the best encoding by minimizing the reconstruction error using -# a set of pre-computed beta values -def assign_beta (beta_centroids, x, x_coded, Inn, verbose=True): - if type(Inn) == tuple: - return assign_beta_2(beta_centroids, x, x_coded, Inn) - (N, knn) = Inn.shape - x_ibeta = np.zeros ((N), dtype='int32') - t0= time.time() - for i in range (N): - NNi = x_coded[Inn[i,:]] - # Consider all possible betas for the encoding and compute the - # encoding error - x_reg_all = np.dot (beta_centroids, NNi) - err = ((x_reg_all - x[i,:]) ** 2).sum(axis=1) - 
x_ibeta[i] = err.argmin() - if verbose: - if i % (N / 10) == 0: - print ("[%d:%d] %6.3fs" % (i, i + N / 10, time.time() - t0)) - return x_ibeta - - -# Reconstruct a set of vectors using the beta_centroids, the -# assignment, the encoded neighbors identified by the list Inn (which -# includes the vector itself) -def recons_from_neighbors (beta_centroids, x_ibeta, x_coded, Inn): - (N, knn) = Inn.shape - x_rec = np.zeros(x_coded.shape) - t0= time.time() - for i in range (N): - NNi = x_coded[Inn[i,:]] - x_rec[i, :] = np.dot (beta_centroids[x_ibeta[i]], NNi) - if i % (N / 10) == 0: - print ("[%d:%d] %6.3fs" % (i, i + N / 10, time.time() - t0)) - return x_rec - - -# Compute a EM-like algorithm trying at optimizing the beta such as they -# minimize the reconstruction error from the neighbors -def neighbors_kmeans (x, x_coded, Inn, K, ngpus=1, niter=5): - # First compute centroids using a regular k-means algorithm - betas = regress_from_neighbors (x, x_coded, Inn) - beta_centroids = train_kmeans( - sanitize(betas), K, ngpus, max_points_per_centroid=1000000) - _, knn = get_Inn_shape(Inn) - d = x.shape[1] - - rs = np.random.RandomState() - for iter in range(niter): - print('iter', iter) - idx = assign_beta (beta_centroids, x, x_coded, Inn, verbose=False) - - hist = np.bincount(idx) - for cl0 in np.where(hist == 0)[0]: - print(" cluster %d empty, split" % cl0, end=' ') - cl1 = idx[np.random.randint(idx.size)] - pos = np.nonzero (idx == cl1)[0] - pos = rs.choice(pos, pos.size / 2) - print(" cl %d -> %d + %d" % (cl1, len(pos), hist[cl1] - len(pos))) - idx[pos] = cl0 - hist = np.bincount(idx) - - tot_err = 0 - for k in range (K): - pos = np.nonzero (idx == k)[0] - npos = pos.shape[0] - - X = np.zeros (d*npos) - Y = np.zeros ((d*npos, knn)) - - for i in range(npos): - X[i*d:(i+1)*d] = x[pos[i],:] - neighbor_table = get_neighbor_table(x_coded, Inn, pos[i]) - Y[i*d:(i+1)*d, :] = neighbor_table.transpose() - sol, residuals, _, _ = np.linalg.lstsq(Y, X, rcond=0.01) - if residuals.size > 0: - tot_err += residuals.sum() - beta_centroids[k, :] = sol - print(' err=%g' % tot_err) - return beta_centroids - - -# assign the betas in C++ -def assign_beta_2(beta_centroids, x, rfn, Inn): - _, _, sq = Inn - if rfn.k == 1: - return np.zeros(x.shape[0], dtype=int) - # add dummy dimensions to beta_centroids and x - all_beta_centroids = np.zeros( - (rfn.nsq, rfn.k, rfn.M + 1), dtype='float32') - all_beta_centroids[sq] = beta_centroids - all_x = np.zeros((len(x), rfn.d), dtype='float32') - all_x[:, sq * rfn.dsub : (sq + 1) * rfn.dsub] = x - rfn.codes.clear() - rfn.ntotal = 0 - faiss.copy_array_to_vector( - all_beta_centroids.ravel(), rfn.codebook) - rfn.add_codes(len(x), faiss.swig_ptr(all_x)) - codes = faiss.vector_to_array(rfn.codes) - codes = codes.reshape(-1, rfn.nsq) - return codes[:, sq] - - -####################################################### -# For usage from bench_storages.py - -def train_beta_codebook(rfn, xb_full, niter=10): - beta_centroids = [] - for sq in range(rfn.nsq): - d0, d1 = sq * rfn.dsub, (sq + 1) * rfn.dsub - print("training subquantizer %d/%d on dimensions %d:%d" % ( - sq, rfn.nsq, d0, d1)) - beta_centroids_i = neighbors_kmeans( - xb_full[:, d0:d1], rfn, (xb_full.shape[0], rfn.M + 1, sq), - rfn.k, - ngpus=0, niter=niter) - beta_centroids.append(beta_centroids_i) - rfn.ntotal = 0 - rfn.codes.clear() - rfn.codebook.clear() - return np.stack(beta_centroids) diff --git a/c_api/IndexScalarQuantizer_c.h b/c_api/IndexScalarQuantizer_c.h index 2c5e3f2942..87fe6d3415 100644 --- 
a/c_api/IndexScalarQuantizer_c.h +++ b/c_api/IndexScalarQuantizer_c.h @@ -26,6 +26,7 @@ typedef enum FaissQuantizerType { QT_fp16, QT_8bit_direct, ///< fast indexing of uint8s QT_6bit, ///< 6 bits per component + QT_bf16, } FaissQuantizerType; // forward declaration diff --git a/c_api/clone_index_c.cpp b/c_api/clone_index_c.cpp index 8211156aaa..606e5f9b0a 100644 --- a/c_api/clone_index_c.cpp +++ b/c_api/clone_index_c.cpp @@ -14,6 +14,7 @@ #include "macros_impl.h" using faiss::Index; +using faiss::IndexBinary; int faiss_clone_index(const FaissIndex* idx, FaissIndex** p_out) { try { @@ -22,3 +23,14 @@ int faiss_clone_index(const FaissIndex* idx, FaissIndex** p_out) { } CATCH_AND_HANDLE } + +int faiss_clone_index_binary( + const FaissIndexBinary* idx, + FaissIndexBinary** p_out) { + try { + auto out = faiss::clone_binary_index( + reinterpret_cast<const IndexBinary*>(idx)); + *p_out = reinterpret_cast<FaissIndexBinary*>(out); + } + CATCH_AND_HANDLE +} \ No newline at end of file diff --git a/c_api/clone_index_c.h b/c_api/clone_index_c.h index 3d0bd6745f..d2da35b82f 100644 --- a/c_api/clone_index_c.h +++ b/c_api/clone_index_c.h @@ -13,6 +13,7 @@ #define FAISS_CLONE_INDEX_C_H #include +#include "IndexBinary_c.h" #include "Index_c.h" #include "faiss_c.h" @@ -25,6 +26,9 @@ extern "C" { /** Clone an index. This is equivalent to `faiss::clone_index` */ int faiss_clone_index(const FaissIndex*, FaissIndex** p_out); +/** Clone a binary index. This is equivalent to `faiss::clone_binary_index` */ +int faiss_clone_index_binary(const FaissIndexBinary*, FaissIndexBinary** p_out); + #ifdef __cplusplus } #endif diff --git a/c_api/index_factory_c.cpp b/c_api/index_factory_c.cpp index e9abf141f8..3a1ab9bab9 100644 --- a/c_api/index_factory_c.cpp +++ b/c_api/index_factory_c.cpp @@ -15,7 +15,7 @@ using faiss::Index; -/** Build and index with the sequence of processing steps described in +/** Build an index with the sequence of processing steps described in * the string. */ int faiss_index_factory( @@ -29,3 +29,17 @@ } CATCH_AND_HANDLE } + +/** Build a binary index with the sequence of processing steps described in + * the string. + */ +int faiss_index_binary_factory( + FaissIndexBinary** p_index, + int d, + const char* description) { + try { + *p_index = reinterpret_cast<FaissIndexBinary*>( + faiss::index_binary_factory(d, description)); + } + CATCH_AND_HANDLE +} \ No newline at end of file diff --git a/c_api/index_factory_c.h b/c_api/index_factory_c.h index 11fb0faa16..ccd58ac778 100644 --- a/c_api/index_factory_c.h +++ b/c_api/index_factory_c.h @@ -11,6 +11,7 @@ #ifndef FAISS_INDEX_FACTORY_C_H #define FAISS_INDEX_FACTORY_C_H +#include "IndexBinary_c.h" #include "Index_c.h" #include "faiss_c.h" @@ -18,7 +19,7 @@ extern "C" { #endif -/** Build and index with the sequence of processing steps described in +/** Build an index with the sequence of processing steps described in * the string. */ int faiss_index_factory( @@ -27,6 +28,14 @@ const char* description, FaissMetricType metric); +/** Build a binary index with the sequence of processing steps described in + * the string. 
+ */ +int faiss_index_binary_factory( + FaissIndexBinary** p_index, + int d, + const char* description); + #ifdef __cplusplus } #endif diff --git a/conda/faiss-gpu-raft/meta.yaml b/conda/faiss-gpu-raft/meta.yaml index b365571777..1dde8e9868 100644 --- a/conda/faiss-gpu-raft/meta.yaml +++ b/conda/faiss-gpu-raft/meta.yaml @@ -48,21 +48,25 @@ outputs: - {{ compiler('cxx') }} - sysroot_linux-64 # [linux64] - llvm-openmp # [osx] - - cmake >=3.23.1 + - cmake >=3.24.0 - make # [not win] + - _openmp_mutex =4.5=2_kmp_llvm # [x86_64] + - mkl =2023 # [x86_64] - mkl-devel =2023 # [x86_64] - cuda-toolkit {{ cudatoolkit }} host: + - _openmp_mutex =4.5=2_kmp_llvm # [x86_64] - mkl =2023 # [x86_64] - openblas # [not x86_64] - - libraft =24.02 + - libraft =24.06 - cuda-version {{ cuda_constraints }} run: + - _openmp_mutex =4.5=2_kmp_llvm # [x86_64] - mkl =2023 # [x86_64] - openblas # [not x86_64] - cuda-cudart {{ cuda_constraints }} - libcublas {{ libcublas_constraints }} - - libraft =24.02 + - libraft =24.06 - cuda-version {{ cuda_constraints }} test: requires: @@ -84,14 +88,19 @@ outputs: build: - {{ compiler('cxx') }} - sysroot_linux-64 =2.17 # [linux64] - - swig =4.0.2 - - cmake >=3.23.1 + - swig + - cmake >=3.24.0 - make # [not win] + - _openmp_mutex =4.5=2_kmp_llvm # [x86_64] + - mkl =2023 # [x86_64] + - cuda-toolkit {{ cudatoolkit }} host: + - _openmp_mutex =4.5=2_kmp_llvm # [x86_64] - python {{ python }} - numpy >=1.19,<2 - {{ pin_subpackage('libfaiss', exact=True) }} run: + - _openmp_mutex =4.5=2_kmp_llvm # [x86_64] - python {{ python }} - numpy >=1.19,<2 - packaging diff --git a/conda/faiss-gpu/build-lib.sh b/conda/faiss-gpu/build-lib.sh index 2d25e9c5e6..9957be96ea 100755 --- a/conda/faiss-gpu/build-lib.sh +++ b/conda/faiss-gpu/build-lib.sh @@ -6,6 +6,12 @@ set -e +# Workaround for CUDA 11.4.4 builds. Moves all necessary headers to include root. 
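For reference, the new C entry points mirror functionality reachable from Python. A minimal sketch of the equivalent Python-side usage, assuming a Faiss 1.8.0 build where `QT_bf16` and the binary factory are exposed as shown above:

```python
import numpy as np
import faiss

d = 64
# "SQbf16" selects the new QT_bf16 scalar quantizer: 2 bytes per component,
# same code size as fp16 but with the bfloat16 exponent range.
index = faiss.index_factory(d, "SQbf16")
xb = np.random.rand(1000, d).astype("float32")
index.train(xb)
index.add(xb)
D, I = index.search(xb[:5], 10)

# The binary factory now has a C API twin, faiss_index_binary_factory;
# for binary indexes, d is expressed in bits.
bindex = faiss.index_binary_factory(d, "BIVF8")
```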
+if [ -n "$FAISS_FLATTEN_CONDA_INCLUDES" ] && [ "$FAISS_FLATTEN_CONDA_INCLUDES" = "1" ]; then + cp -r -n "$CONDA_PREFIX/x86_64-conda-linux-gnu/sysroot/usr/include/"* "$CONDA_PREFIX/include/" + cp -r -n "$CONDA_PREFIX/x86_64-conda-linux-gnu/include/c++/11.2.0/"* "$CONDA_PREFIX/include/" + cp -r -n "$CONDA_PREFIX/x86_64-conda-linux-gnu/include/c++/11.2.0/x86_64-conda-linux-gnu/"* "$CONDA_PREFIX/include/" +fi # Build libfaiss.so/libfaiss_avx2.so/libfaiss_avx512.so cmake -B _build \ diff --git a/conda/faiss-gpu/meta.yaml b/conda/faiss-gpu/meta.yaml index b0df707181..05f7b59008 100644 --- a/conda/faiss-gpu/meta.yaml +++ b/conda/faiss-gpu/meta.yaml @@ -43,12 +43,13 @@ outputs: - {{ pin_compatible('libfaiss', exact=True) }} script_env: - CUDA_ARCHS + - FAISS_FLATTEN_CONDA_INCLUDES requirements: build: - {{ compiler('cxx') }} - sysroot_linux-64 # [linux64] - llvm-openmp # [osx] - - cmake >=3.23.1 + - cmake >=3.24.0 - make # [not win] - mkl-devel =2023 # [x86_64] - cuda-toolkit {{ cudatoolkit }} @@ -81,8 +82,9 @@ outputs: - {{ compiler('cxx') }} - sysroot_linux-64 =2.17 # [linux64] - swig - - cmake >=3.23.1 + - cmake >=3.24.0 - make # [not win] + - cuda-toolkit {{ cudatoolkit }} host: - python {{ python }} - numpy >=1.19,<2 diff --git a/conda/faiss/meta.yaml b/conda/faiss/meta.yaml index c4d66ca0d3..79e7be953e 100644 --- a/conda/faiss/meta.yaml +++ b/conda/faiss/meta.yaml @@ -39,7 +39,7 @@ outputs: - {{ compiler('cxx') }} - sysroot_linux-64 # [linux64] - llvm-openmp # [osx] - - cmake >=3.23.1 + - cmake >=3.24.0 - make # [not win] - mkl-devel =2023 # [x86_64] host: @@ -69,7 +69,7 @@ outputs: - {{ compiler('cxx') }} - sysroot_linux-64 =2.17 # [linux64] - swig - - cmake >=3.23.1 + - cmake >=3.24.0 - make # [not win] host: - python {{ python }} diff --git a/contrib/datasets.py b/contrib/datasets.py index f37a2fb6e4..281f16e2fa 100644 --- a/contrib/datasets.py +++ b/contrib/datasets.py @@ -6,6 +6,8 @@ import os import numpy as np import faiss +import getpass + from .vecs_io import fvecs_read, ivecs_read, bvecs_mmap, fvecs_mmap from .exhaustive_search import knn @@ -115,10 +117,12 @@ def get_groundtruth(self, k=100): # that directory is ############################################################################ +username = getpass.getuser() for dataset_basedir in ( '/datasets01/simsearch/041218/', - '/mnt/vol/gfsai-flash3-east/ai-group/datasets/simsearch/'): + '/mnt/vol/gfsai-flash3-east/ai-group/datasets/simsearch/', + f'/home/{username}/simsearch/data/'): if os.path.exists(dataset_basedir): break else: diff --git a/contrib/factory_tools.py b/contrib/factory_tools.py index 745dc7f7ff..cfad7c7b5c 100644 --- a/contrib/factory_tools.py +++ b/contrib/factory_tools.py @@ -56,6 +56,8 @@ def get_code_size(d, indexkey): return (d * 6 + 7) // 8 elif indexkey == 'SQfp16': return d * 2 + elif indexkey == 'SQbf16': + return d * 2 mo = re.match('PCAR?(\\d+),(.*)$', indexkey) if mo: @@ -140,6 +142,7 @@ def reverse_index_factory(index): faiss.ScalarQuantizer.QT_4bit: "4", faiss.ScalarQuantizer.QT_6bit: "6", faiss.ScalarQuantizer.QT_fp16: "fp16", + faiss.ScalarQuantizer.QT_bf16: "bf16", } return f"SQ{sqtypes[index.sq.qtype]}" diff --git a/contrib/ondisk.py b/contrib/ondisk.py index 26a95f44f5..81ec71941c 100644 --- a/contrib/ondisk.py +++ b/contrib/ondisk.py @@ -11,7 +11,7 @@ def merge_ondisk( - trained_index: faiss.Index, shard_fnames: List[str], ivfdata_fname: str + trained_index: faiss.Index, shard_fnames: List[str], ivfdata_fname: str, shift_ids=False ) -> None: """Add the contents of the indexes stored in 
shard_fnames into the index trained_index. The on-disk data is stored in ivfdata_fname""" @@ -51,7 +51,7 @@ def merge_ondisk( ivf_vector.push_back(ivf) LOG.info("merge %d inverted lists " % ivf_vector.size()) - ntotal = invlists.merge_from(ivf_vector.data(), ivf_vector.size()) + ntotal = invlists.merge_from_multiple(ivf_vector.data(), ivf_vector.size(), shift_ids) # now replace the inverted lists in the output index index.ntotal = index_ivf.ntotal = ntotal diff --git a/contrib/torch_utils.py b/contrib/torch_utils.py index 790c295e48..18f136e914 100644 --- a/contrib/torch_utils.py +++ b/contrib/torch_utils.py @@ -33,7 +33,7 @@ def swig_ptr_from_UInt8Tensor(x): assert x.is_contiguous() assert x.dtype == torch.uint8 return faiss.cast_integer_to_uint8_ptr( - x.storage().data_ptr() + x.storage_offset()) + x.untyped_storage().data_ptr() + x.storage_offset()) def swig_ptr_from_HalfTensor(x): """ gets a Faiss SWIG pointer from a pytorch tensor (on CPU or GPU) """ @@ -41,28 +41,28 @@ def swig_ptr_from_HalfTensor(x): assert x.dtype == torch.float16 # no canonical half type in C/C++ return faiss.cast_integer_to_void_ptr( - x.storage().data_ptr() + x.storage_offset() * 2) + x.untyped_storage().data_ptr() + x.storage_offset() * 2) def swig_ptr_from_FloatTensor(x): """ gets a Faiss SWIG pointer from a pytorch tensor (on CPU or GPU) """ assert x.is_contiguous() assert x.dtype == torch.float32 return faiss.cast_integer_to_float_ptr( - x.storage().data_ptr() + x.storage_offset() * 4) + x.untyped_storage().data_ptr() + x.storage_offset() * 4) def swig_ptr_from_IntTensor(x): """ gets a Faiss SWIG pointer from a pytorch tensor (on CPU or GPU) """ assert x.is_contiguous() assert x.dtype == torch.int32, 'dtype=%s' % x.dtype return faiss.cast_integer_to_int_ptr( - x.storage().data_ptr() + x.storage_offset() * 4) + x.untyped_storage().data_ptr() + x.storage_offset() * 4) def swig_ptr_from_IndicesTensor(x): """ gets a Faiss SWIG pointer from a pytorch tensor (on CPU or GPU) """ assert x.is_contiguous() assert x.dtype == torch.int64, 'dtype=%s' % x.dtype return faiss.cast_integer_to_idx_t_ptr( - x.storage().data_ptr() + x.storage_offset() * 8) + x.untyped_storage().data_ptr() + x.storage_offset() * 8) @contextlib.contextmanager def using_stream(res, pytorch_stream=None): @@ -492,8 +492,9 @@ def torch_replacement_sa_decode(self, codes, x=None): if issubclass(the_class, faiss.Index): handle_torch_Index(the_class) + # allows torch tensor usage with bfKnn -def torch_replacement_knn_gpu(res, xq, xb, k, D=None, I=None, metric=faiss.METRIC_L2, device=-1): +def torch_replacement_knn_gpu(res, xq, xb, k, D=None, I=None, metric=faiss.METRIC_L2, device=-1, use_raft=False): if type(xb) is np.ndarray: # Forward to faiss __init__.py base method return faiss.knn_gpu_numpy(res, xq, xb, k, D, I, metric, device) @@ -574,6 +575,7 @@ def torch_replacement_knn_gpu(res, xq, xb, k, D=None, I=None, metric=faiss.METRI args.outIndices = I_ptr args.outIndicesType = I_type args.device = device + args.use_raft = use_raft with using_stream(res): faiss.bfKnn(res, args) diff --git a/contrib/vecs_io.py b/contrib/vecs_io.py index ea75d5f94d..9ef9e0ab64 100644 --- a/contrib/vecs_io.py +++ b/contrib/vecs_io.py @@ -3,6 +3,7 @@ # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. 
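The `untyped_storage()` change tracks the PyTorch 2.x storage API; usage through the contrib wrappers is unchanged. A minimal sketch of that usage (assuming a PyTorch version that provides `Tensor.untyped_storage`):

```python
import torch
import faiss
import faiss.contrib.torch_utils  # installs the tensor-aware wrappers patched above

d = 64
index = faiss.IndexFlatL2(d)
xb = torch.rand(1000, d, dtype=torch.float32)
xq = torch.rand(10, d, dtype=torch.float32)

index.add(xb)               # accepted directly via the SWIG pointer helpers
D, I = index.search(xq, 5)  # results come back as torch tensors
```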
+import sys import numpy as np """ @@ -13,6 +14,8 @@ def ivecs_read(fname): a = np.fromfile(fname, dtype='int32') + if sys.byteorder == 'big': + a.byteswap(inplace=True) d = a[0] return a.reshape(-1, d + 1)[:, 1:].copy() @@ -22,6 +25,7 @@ def fvecs_read(fname): def ivecs_mmap(fname): + assert sys.byteorder != 'big' a = np.memmap(fname, dtype='int32', mode='r') d = a[0] return a.reshape(-1, d + 1)[:, 1:] @@ -33,7 +37,11 @@ def fvecs_mmap(fname): def bvecs_mmap(fname): x = np.memmap(fname, dtype='uint8', mode='r') - d = x[:4].view('int32')[0] + if sys.byteorder == 'big': + da = x[:4][::-1].copy() + d = da.view('int32')[0] + else: + d = x[:4].view('int32')[0] return x.reshape(-1, d + 4)[:, 4:] @@ -42,6 +50,8 @@ def ivecs_write(fname, m): m1 = np.empty((n, d + 1), dtype='int32') m1[:, 0] = d m1[:, 1:] = m + if sys.byteorder == 'big': + m1.byteswap(inplace=True) m1.tofile(fname) diff --git a/demos/demo_imi_pq.cpp b/demos/demo_imi_pq.cpp index a2af65e792..4fab0778d8 100644 --- a/demos/demo_imi_pq.cpp +++ b/demos/demo_imi_pq.cpp @@ -77,7 +77,6 @@ int main() { // the coarse quantizer should not be dealloced before the index // 4 = nb of bytes per code (d must be a multiple of this) // 8 = nb of bits per sub-code (almost always 8) - faiss::MetricType metric = faiss::METRIC_L2; // can be METRIC_INNER_PRODUCT faiss::IndexIVFPQ index( &coarse_quantizer, d, ncentroids, bytes_per_code, 8); index.quantizer_trains_alone = true; diff --git a/demos/offline_ivf/config_ssnpp.yaml b/demos/offline_ivf/config_ssnpp.yaml index 690f0de156..88e0394155 100644 --- a/demos/offline_ivf/config_ssnpp.yaml +++ b/demos/offline_ivf/config_ssnpp.yaml @@ -6,6 +6,7 @@ index: non-prod: - 'IVF16384,PQ128' - 'IVF32768,PQ128' + - 'OPQ64_128,IVF4096,PQ64' nprobe: prod: - 512 diff --git a/demos/offline_ivf/offline_ivf.py b/demos/offline_ivf/offline_ivf.py index 5c316178cb..eccd2b95cb 100644 --- a/demos/offline_ivf/offline_ivf.py +++ b/demos/offline_ivf/offline_ivf.py @@ -178,7 +178,7 @@ def dedupe(self): idxs.append(np.empty((0,), dtype=np.uint32)) bs = 1_000_000 i = 0 - for buffer in tqdm(self.xb_ds.iterate(0, bs, np.float32)): + for buffer in tqdm(self._iterate_transformed(self.xb_ds, 0, bs, np.float32)): for j in range(len(codecs)): codec, codeset, idx = codecs[j], codesets[j], idxs[j] uniq = codeset.insert(codec.sa_encode(buffer)) @@ -227,64 +227,6 @@ def _iterate_transformed(self, ds, start, batch_size, dt): for buffer in ds.iterate(start, batch_size, dt): yield buffer - def index_shard_and_quantize(self): - assert os.path.exists(self.index_template_file) - index = faiss.read_index(self.index_template_file) - index_ivf = faiss.downcast_index(faiss.extract_index_ivf(index)) - assert self.nprobe <= index_ivf.quantizer.ntotal, ( - f"the number of vectors {index_ivf.quantizer.ntotal} is not enough" - f" to retrieve {self.nprobe} neighbours, check." 
- ) - - if is_pretransform_index(index): - d = index.chain.at(0).d_out - else: - d = self.input_d - for i in range(0, self.nshards): - sfn = f"{self.index_shard_prefix}{i}" - cqfn = f"{self.coarse_quantization_prefix}{i}" # fixme - if os.path.exists(sfn) or os.path.exists(cqfn): - logging.info(f"skipping shard: {i}") - continue - try: - with open(cqfn, "xb") as cqf: - index.reset() - start = i * self.shard_size - j = 0 - quantizer = faiss.index_cpu_to_all_gpus( - index_ivf.quantizer - ) - for xb_j in tqdm( - self._iterate_transformed( - self.xb_ds, - start, - EMBEDDINGS_BATCH_SIZE, - np.float32, - ), - file=sys.stdout, - ): - assert xb_j.shape[1] == d - _, I = quantizer.search(xb_j, self.nprobe) - assert np.amin(I) >= 0, f"{I}" - assert np.amax(I) < index_ivf.nlist - cqf.write(I) - self._index_add_core_wrapper( # fixme - index_ivf, - xb_j, - np.arange(start + j, start + j + xb_j.shape[0]), - I[:, 0], - ) - j += xb_j.shape[0] - assert j <= self.shard_size - if j == self.shard_size: - break - logging.info(f"writing {sfn}...") - faiss.write_index(index, sfn) - except FileExistsError: - logging.info(f"skipping shard: {i}") - continue - logging.info("done") - def index_shard(self): assert os.path.exists(self.index_template_file) index = faiss.read_index(self.index_template_file) @@ -325,11 +267,18 @@ def index_shard(self): ), file=sys.stdout, ): - assert xb_j.shape[1] == index.d - index.add_with_ids( - xb_j, - np.arange(start + jj, start + jj + xb_j.shape[0]), - ) + if is_pretransform_index(index): + assert xb_j.shape[1] == index.chain.at(0).d_out + index_ivf.add_with_ids( + xb_j, + np.arange(start + jj, start + jj + xb_j.shape[0]), + ) + else: + assert xb_j.shape[1] == index.d + index.add_with_ids( + xb_j, + np.arange(start + jj, start + jj + xb_j.shape[0]), + ) jj += xb_j.shape[0] logging.info(jj) assert ( @@ -728,10 +677,14 @@ def search(self): os.remove(Ifn) os.remove(Dfn) - try: # TODO: modify shape for pretransform case + try: + if is_pretransform_index(index): + d = index.chain.at(0).d_out + else: + d = self.input_d with open(Ifn, "xb") as f, open(Dfn, "xb") as g: xq_i = np.empty( - shape=(self.xq_bs, self.input_d), dtype=np.float16 + shape=(self.xq_bs, d), dtype=np.float16 ) q_assign = np.empty( (self.xq_bs, self.nprobe), dtype=np.int32 @@ -893,8 +846,7 @@ def consistency_check(self): for j in range(SMALL_DATA_SAMPLE): assert np.where(I[j] == j + r)[0].size > 0, ( f"I[j]: {I[j]}, j: {j}, i: {i}, shard_size:" - f" {self.shard_size}" - ) + f" {self.shard_size}") logging.info("search results...") index_ivf.nprobe = self.nprobe diff --git a/demos/rocksdb_ivf/RocksDBInvertedLists.cpp b/demos/rocksdb_ivf/RocksDBInvertedLists.cpp index 99c51c1456..8d692f0b54 100644 --- a/demos/rocksdb_ivf/RocksDBInvertedLists.cpp +++ b/demos/rocksdb_ivf/RocksDBInvertedLists.cpp @@ -101,7 +101,8 @@ void RocksDBInvertedLists::resize(size_t /*list_no*/, size_t /*new_size*/) { } InvertedListsIterator* RocksDBInvertedLists::get_iterator( - size_t list_no) const { + size_t list_no, + void* inverted_list_context) const { return new RocksDBInvertedListsIterator(db_.get(), list_no, code_size); } diff --git a/demos/rocksdb_ivf/RocksDBInvertedLists.h b/demos/rocksdb_ivf/RocksDBInvertedLists.h index fdc83d1d27..f9d70a4f97 100644 --- a/demos/rocksdb_ivf/RocksDBInvertedLists.h +++ b/demos/rocksdb_ivf/RocksDBInvertedLists.h @@ -49,7 +49,9 @@ struct RocksDBInvertedLists : faiss::InvertedLists { void resize(size_t list_no, size_t new_size) override; - faiss::InvertedListsIterator* get_iterator(size_t list_no) const override; + 
faiss::InvertedListsIterator* get_iterator( + size_t list_no, + void* inverted_list_context) const override; private: std::unique_ptr db_; diff --git a/faiss/CMakeLists.txt b/faiss/CMakeLists.txt index 0ef15f1002..7e2a55740c 100644 --- a/faiss/CMakeLists.txt +++ b/faiss/CMakeLists.txt @@ -183,6 +183,7 @@ set(FAISS_HEADERS invlists/InvertedLists.h invlists/InvertedListsIOHook.h utils/AlignedTable.h + utils/bf16.h utils/Heap.h utils/WorkerThread.h utils/distances.h @@ -299,7 +300,10 @@ if(WIN32) target_compile_definitions(faiss_avx512 PRIVATE FAISS_MAIN_LIB) endif() -target_compile_definitions(faiss PRIVATE FINTEGER=int) +string(FIND "${CMAKE_CXX_FLAGS}" "FINTEGER" finteger_idx) +if (${finteger_idx} EQUAL -1) + target_compile_definitions(faiss PRIVATE FINTEGER=int) +endif() target_compile_definitions(faiss_avx2 PRIVATE FINTEGER=int) target_compile_definitions(faiss_avx512 PRIVATE FINTEGER=int) diff --git a/faiss/IVFlib.cpp b/faiss/IVFlib.cpp index 91aa7af7f3..f2c975f4de 100644 --- a/faiss/IVFlib.cpp +++ b/faiss/IVFlib.cpp @@ -352,7 +352,10 @@ void search_with_parameters( const IndexIVF* index_ivf = dynamic_cast(index); FAISS_THROW_IF_NOT(index_ivf); - index_ivf->quantizer->search(n, x, params->nprobe, Dq.data(), Iq.data()); + SearchParameters* quantizer_params = + (params) ? params->quantizer_params : nullptr; + index_ivf->quantizer->search( + n, x, params->nprobe, Dq.data(), Iq.data(), quantizer_params); if (nb_dis_ptr) { *nb_dis_ptr = count_ndis(index_ivf, n * params->nprobe, Iq.data()); diff --git a/faiss/Index.h b/faiss/Index.h index 4b4b302b47..3d1bdb996a 100644 --- a/faiss/Index.h +++ b/faiss/Index.h @@ -17,8 +17,8 @@ #include #define FAISS_VERSION_MAJOR 1 -#define FAISS_VERSION_MINOR 7 -#define FAISS_VERSION_PATCH 4 +#define FAISS_VERSION_MINOR 8 +#define FAISS_VERSION_PATCH 0 /** * @namespace faiss diff --git a/faiss/IndexBinaryIVF.cpp b/faiss/IndexBinaryIVF.cpp index 686785a987..ab1b9fd89a 100644 --- a/faiss/IndexBinaryIVF.cpp +++ b/faiss/IndexBinaryIVF.cpp @@ -456,7 +456,7 @@ void search_knn_hamming_heap( } } // parallel for - } // parallel + } // parallel indexIVF_stats.nq += n; indexIVF_stats.nlist += nlistv; diff --git a/faiss/IndexFastScan.cpp b/faiss/IndexFastScan.cpp index 2dfb2f55fd..529465da3e 100644 --- a/faiss/IndexFastScan.cpp +++ b/faiss/IndexFastScan.cpp @@ -189,6 +189,7 @@ void estimators_from_tables_generic( dt += index.ksub; } } + if (C::cmp(heap_dis[0], dis)) { heap_pop(k, heap_dis, heap_ids); heap_push(k, heap_dis, heap_ids, dis, j); @@ -203,17 +204,18 @@ ResultHandlerCompare* make_knn_handler( idx_t k, size_t ntotal, float* distances, - idx_t* labels) { + idx_t* labels, + const IDSelector* sel = nullptr) { using HeapHC = HeapHandler; using ReservoirHC = ReservoirHandler; using SingleResultHC = SingleResultHandler; if (k == 1) { - return new SingleResultHC(n, ntotal, distances, labels); + return new SingleResultHC(n, ntotal, distances, labels, sel); } else if (impl % 2 == 0) { - return new HeapHC(n, ntotal, k, distances, labels); + return new HeapHC(n, ntotal, k, distances, labels, sel); } else /* if (impl % 2 == 1) */ { - return new ReservoirHC(n, ntotal, k, 2 * k, distances, labels); + return new ReservoirHC(n, ntotal, k, 2 * k, distances, labels, sel); } } diff --git a/faiss/IndexFlat.cpp b/faiss/IndexFlat.cpp index f606f8e621..7d29ca5387 100644 --- a/faiss/IndexFlat.cpp +++ b/faiss/IndexFlat.cpp @@ -41,15 +41,19 @@ void IndexFlat::search( } else if (metric_type == METRIC_L2) { float_maxheap_array_t res = {size_t(n), size_t(k), labels, distances}; knn_L2sqr(x, 
get_xb(), d, n, ntotal, &res, nullptr, sel); - } else if (is_similarity_metric(metric_type)) { - float_minheap_array_t res = {size_t(n), size_t(k), labels, distances}; - knn_extra_metrics( - x, get_xb(), d, n, ntotal, metric_type, metric_arg, &res); } else { - FAISS_THROW_IF_NOT(!sel); - float_maxheap_array_t res = {size_t(n), size_t(k), labels, distances}; + FAISS_THROW_IF_NOT(!sel); // TODO implement with selector knn_extra_metrics( - x, get_xb(), d, n, ntotal, metric_type, metric_arg, &res); + x, + get_xb(), + d, + n, + ntotal, + metric_type, + metric_arg, + k, + distances, + labels); } } diff --git a/faiss/IndexHNSW.cpp b/faiss/IndexHNSW.cpp index 9a67332d67..8e5c654f04 100644 --- a/faiss/IndexHNSW.cpp +++ b/faiss/IndexHNSW.cpp @@ -5,8 +5,6 @@ * LICENSE file in the root directory of this source tree. */ -// -*- c++ -*- - #include #include @@ -17,7 +15,10 @@ #include #include +#include +#include #include +#include #include #include @@ -68,52 +69,6 @@ HNSWStats hnsw_stats; namespace { -/* Wrap the distance computer into one that negates the - distances. This makes supporting INNER_PRODUCE search easier */ - -struct NegativeDistanceComputer : DistanceComputer { - /// owned by this - DistanceComputer* basedis; - - explicit NegativeDistanceComputer(DistanceComputer* basedis) - : basedis(basedis) {} - - void set_query(const float* x) override { - basedis->set_query(x); - } - - /// compute distance of vector i to current query - float operator()(idx_t i) override { - return -(*basedis)(i); - } - - void distances_batch_4( - const idx_t idx0, - const idx_t idx1, - const idx_t idx2, - const idx_t idx3, - float& dis0, - float& dis1, - float& dis2, - float& dis3) override { - basedis->distances_batch_4( - idx0, idx1, idx2, idx3, dis0, dis1, dis2, dis3); - dis0 = -dis0; - dis1 = -dis1; - dis2 = -dis2; - dis3 = -dis3; - } - - /// compute distance between two stored vectors - float symmetric_dis(idx_t i, idx_t j) override { - return -basedis->symmetric_dis(i, j); - } - - virtual ~NegativeDistanceComputer() { - delete basedis; - } -}; - DistanceComputer* storage_distance_computer(const Index* storage) { if (is_similarity_metric(storage->metric_type)) { return new NegativeDistanceComputer(storage->get_distance_computer()); @@ -192,7 +147,9 @@ void hnsw_add_vertices( int i1 = n; - for (int pt_level = hist.size() - 1; pt_level >= 0; pt_level--) { + for (int pt_level = hist.size() - 1; + pt_level >= !index_hnsw.init_level0; + pt_level--) { int i0 = i1 - hist[pt_level]; if (verbose) { @@ -228,7 +185,13 @@ void hnsw_add_vertices( continue; } - hnsw.add_with_locks(*dis, pt_level, pt_id, locks, vt); + hnsw.add_with_locks( + *dis, + pt_level, + pt_id, + locks, + vt, + index_hnsw.keep_max_size_level0 && (pt_level == 0)); if (prev_display >= 0 && i - i0 > prev_display + 10000) { prev_display = i - i0; @@ -248,7 +211,11 @@ void hnsw_add_vertices( } i1 = i0; } - FAISS_ASSERT(i1 == 0); + if (index_hnsw.init_level0) { + FAISS_ASSERT(i1 == 0); + } else { + FAISS_ASSERT((i1 - hist[0]) == 0); + } } if (verbose) { printf("Done in %.3f ms\n", getmillisecs() - t0); @@ -297,7 +264,8 @@ void hnsw_search( const SearchParameters* params_in) { FAISS_THROW_IF_NOT_MSG( index->storage, - "Please use IndexHNSWFlat (or variants) instead of IndexHNSW directly"); + "No storage index, please use IndexHNSWFlat (or variants) " + "instead of IndexHNSW directly"); const SearchParametersHNSW* params = nullptr; const HNSW& hnsw = index->hnsw; @@ -307,7 +275,7 @@ void hnsw_search( FAISS_THROW_IF_NOT_MSG(params, "params type invalid"); 
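Since the HNSW search paths now accept a `SearchParameters` argument, per-query settings no longer require mutating the index. A minimal Python sketch, assuming the standard SWIG wrappers that accept keyword-initialized parameter objects:

```python
import numpy as np
import faiss

d = 32
index = faiss.index_factory(d, "HNSW32")
xb = np.random.rand(2000, d).astype("float32")
index.add(xb)

# Per-call efSearch, instead of setting index.hnsw.efSearch globally.
params = faiss.SearchParametersHNSW(efSearch=64)
D, I = index.search(xb[:5], 10, params=params)
```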
efSearch = params->efSearch; } - size_t n1 = 0, n2 = 0, n3 = 0, ndis = 0, nreorder = 0; + size_t n1 = 0, n2 = 0, ndis = 0; idx_t check_period = InterruptCallback::get_period_hint( hnsw.max_level * index->d * efSearch); @@ -323,7 +291,7 @@ void hnsw_search( std::unique_ptr dis( storage_distance_computer(index->storage)); -#pragma omp for reduction(+ : n1, n2, n3, ndis, nreorder) schedule(guided) +#pragma omp for reduction(+ : n1, n2, ndis) schedule(guided) for (idx_t i = i0; i < i1; i++) { res.begin(i); dis->set_query(x + i * index->d); @@ -331,16 +299,14 @@ void hnsw_search( HNSWStats stats = hnsw.search(*dis, res, vt, params); n1 += stats.n1; n2 += stats.n2; - n3 += stats.n3; ndis += stats.ndis; - nreorder += stats.nreorder; res.end(); } } InterruptCallback::check(); } - hnsw_stats.combine({n1, n2, n3, ndis, nreorder}); + hnsw_stats.combine({n1, n2, ndis}); } } // anonymous namespace @@ -453,10 +419,18 @@ void IndexHNSW::search_level_0( float* distances, idx_t* labels, int nprobe, - int search_type) const { + int search_type, + const SearchParameters* params_in) const { FAISS_THROW_IF_NOT(k > 0); FAISS_THROW_IF_NOT(nprobe > 0); + const SearchParametersHNSW* params = nullptr; + + if (params_in) { + params = dynamic_cast(params_in); + FAISS_THROW_IF_NOT_MSG(params, "params type invalid"); + } + storage_idx_t ntotal = hnsw.levels.size(); using RH = HeapBlockResultHandler; @@ -483,13 +457,21 @@ void IndexHNSW::search_level_0( nearest_d + i * nprobe, search_type, search_stats, - vt); + vt, + params); res.end(); vt.advance(); } #pragma omp critical { hnsw_stats.combine(search_stats); } } + if (is_similarity_metric(this->metric_type)) { +// we need to revert the negated distances +#pragma omp parallel for + for (int64_t i = 0; i < k * n; i++) { + distances[i] = -distances[i]; + } + } } void IndexHNSW::init_level_0_from_knngraph( @@ -800,7 +782,7 @@ void IndexHNSW2Level::search( IndexHNSW::search(n, x, k, distances, labels); } else { // "mixed" search - size_t n1 = 0, n2 = 0, n3 = 0, ndis = 0, nreorder = 0; + size_t n1 = 0, n2 = 0, ndis = 0; const IndexIVFPQ* index_ivfpq = dynamic_cast(storage); @@ -832,7 +814,7 @@ void IndexHNSW2Level::search( int candidates_size = hnsw.upper_beam; MinimaxHeap candidates(candidates_size); -#pragma omp for reduction(+ : n1, n2, n3, ndis, nreorder) +#pragma omp for reduction(+ : n1, n2, ndis) for (idx_t i = 0; i < n; i++) { idx_t* idxi = labels + i * k; float* simi = distances + i * k; @@ -877,9 +859,7 @@ void IndexHNSW2Level::search( k); n1 += search_stats.n1; n2 += search_stats.n2; - n3 += search_stats.n3; ndis += search_stats.ndis; - nreorder += search_stats.nreorder; vt.advance(); vt.advance(); @@ -888,7 +868,7 @@ void IndexHNSW2Level::search( } } - hnsw_stats.combine({n1, n2, n3, ndis, nreorder}); + hnsw_stats.combine({n1, n2, ndis}); } } @@ -914,4 +894,86 @@ void IndexHNSW2Level::flip_to_ivf() { delete storage2l; } +/************************************************************** + * IndexHNSWCagra implementation + **************************************************************/ + +IndexHNSWCagra::IndexHNSWCagra() { + is_trained = true; +} + +IndexHNSWCagra::IndexHNSWCagra(int d, int M, MetricType metric) + : IndexHNSW( + (metric == METRIC_L2) + ? 
static_cast(new IndexFlatL2(d)) + : static_cast(new IndexFlatIP(d)), + M) { + FAISS_THROW_IF_NOT_MSG( + ((metric == METRIC_L2) || (metric == METRIC_INNER_PRODUCT)), + "unsupported metric type for IndexHNSWCagra"); + own_fields = true; + is_trained = true; + init_level0 = true; + keep_max_size_level0 = true; +} + +void IndexHNSWCagra::add(idx_t n, const float* x) { + FAISS_THROW_IF_NOT_MSG( + !base_level_only, + "Cannot add vectors when base_level_only is set to True"); + + IndexHNSW::add(n, x); +} + +void IndexHNSWCagra::search( + idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels, + const SearchParameters* params) const { + if (!base_level_only) { + IndexHNSW::search(n, x, k, distances, labels, params); + } else { + std::vector nearest(n); + std::vector nearest_d(n); + +#pragma omp for + for (idx_t i = 0; i < n; i++) { + std::unique_ptr dis( + storage_distance_computer(this->storage)); + dis->set_query(x + i * d); + nearest[i] = -1; + nearest_d[i] = std::numeric_limits::max(); + + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution distrib(0, this->ntotal); + + for (idx_t j = 0; j < num_base_level_search_entrypoints; j++) { + auto idx = distrib(gen); + auto distance = (*dis)(idx); + if (distance < nearest_d[i]) { + nearest[i] = idx; + nearest_d[i] = distance; + } + } + FAISS_THROW_IF_NOT_MSG( + nearest[i] >= 0, "Could not find a valid entrypoint."); + } + + search_level_0( + n, + x, + k, + nearest.data(), + nearest_d.data(), + distances, + labels, + 1, // n_probes + 1, // search_type + params); + } +} + } // namespace faiss diff --git a/faiss/IndexHNSW.h b/faiss/IndexHNSW.h index e0b65fca9d..71807c6537 100644 --- a/faiss/IndexHNSW.h +++ b/faiss/IndexHNSW.h @@ -34,6 +34,18 @@ struct IndexHNSW : Index { bool own_fields = false; Index* storage = nullptr; + // When set to false, level 0 in the knn graph is not initialized. + // This option is used by GpuIndexCagra::copyTo(IndexHNSWCagra*) + // as level 0 knn graph is copied over from the index built by + // GpuIndexCagra. + bool init_level0 = true; + + // When set to true, all neighbors in level 0 are filled up + // to the maximum size allowed (2 * M). This option is used by + // IndexHHNSWCagra to create a full base layer graph that is + // used when GpuIndexCagra::copyFrom(IndexHNSWCagra*) is invoked. + bool keep_max_size_level0 = false; + explicit IndexHNSW(int d = 0, int M = 32, MetricType metric = METRIC_L2); explicit IndexHNSW(Index* storage, int M = 32); @@ -81,7 +93,8 @@ struct IndexHNSW : Index { float* distances, idx_t* labels, int nprobe = 1, - int search_type = 1) const; + int search_type = 1, + const SearchParameters* params = nullptr) const; /// alternative graph building void init_level_0_from_knngraph(int k, const float* D, const idx_t* I); @@ -148,4 +161,33 @@ struct IndexHNSW2Level : IndexHNSW { const SearchParameters* params = nullptr) const override; }; +struct IndexHNSWCagra : IndexHNSW { + IndexHNSWCagra(); + IndexHNSWCagra(int d, int M, MetricType metric = METRIC_L2); + + /// When set to true, the index is immutable. + /// This option is used to copy the knn graph from GpuIndexCagra + /// to the base level of IndexHNSWCagra without adding upper levels. + /// Doing so enables to search the HNSW index, but removes the + /// ability to add vectors. + bool base_level_only = false; + + /// When `base_level_only` is set to `True`, the search function + /// searches only the base level knn graph of the HNSW index. 
+ /// This parameter selects the entry point by randomly selecting + /// some points and using the best one. + int num_base_level_search_entrypoints = 32; + + void add(idx_t n, const float* x) override; + + /// entry point for search + void search( + idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels, + const SearchParameters* params = nullptr) const override; +}; + } // namespace faiss diff --git a/faiss/IndexIVF.cpp b/faiss/IndexIVF.cpp index 95d3bc9e68..548aaa4cc7 100644 --- a/faiss/IndexIVF.cpp +++ b/faiss/IndexIVF.cpp @@ -444,7 +444,7 @@ void IndexIVF::search_preassigned( max_codes = unlimited_list_size; } - bool do_parallel = omp_get_max_threads() >= 2 && + [[maybe_unused]] bool do_parallel = omp_get_max_threads() >= 2 && (pmode == 0 ? false : pmode == 3 ? n > 1 : pmode == 1 ? nprobe > 1 @@ -784,7 +784,7 @@ void IndexIVF::range_search_preassigned( int pmode = this->parallel_mode & ~PARALLEL_MODE_NO_HEAP_INIT; // don't start parallel section if single query - bool do_parallel = omp_get_max_threads() >= 2 && + [[maybe_unused]] bool do_parallel = omp_get_max_threads() >= 2 && (pmode == 3 ? false : pmode == 0 ? nx > 1 : pmode == 1 ? nprobe > 1 diff --git a/faiss/IndexIVF.h b/faiss/IndexIVF.h index 45c65ef839..185561d086 100644 --- a/faiss/IndexIVF.h +++ b/faiss/IndexIVF.h @@ -433,6 +433,14 @@ struct IndexIVF : Index, IndexIVFInterface { /* The standalone codec interface (except sa_decode that is specific) */ size_t sa_code_size() const override; + + /** encode a set of vectors + * sa_encode will call encode_vector with include_listno=true + * @param n nb of vectors to encode + * @param x the vectors to encode + * @param bytes output array for the codes + * @return nb of bytes written to codes + */ void sa_encode(idx_t n, const float* x, uint8_t* bytes) const override; IndexIVF(); diff --git a/faiss/IndexIVFFastScan.cpp b/faiss/IndexIVFFastScan.cpp index 00bc6c823e..3e40f7a3da 100644 --- a/faiss/IndexIVFFastScan.cpp +++ b/faiss/IndexIVFFastScan.cpp @@ -211,7 +211,7 @@ void estimators_from_tables_generic( int64_t* heap_ids, const NormTableScaler* scaler) { using accu_t = typename C::T; - int nscale = scaler ? scaler->nscale : 0; + size_t nscale = scaler ? 
scaler->nscale : 0; for (size_t j = 0; j < ncodes; ++j) { BitstringReader bsr(codes + j * index.code_size, index.code_size); accu_t dis = bias; @@ -270,6 +270,7 @@ void IndexIVFFastScan::compute_LUT_uint8( biases.resize(n * nprobe); } + // OMP for MSVC requires i to have signed integral type #pragma omp parallel for if (n > 100) for (int64_t i = 0; i < n; i++) { const float* t_in = dis_tables_float.get() + i * dim123; @@ -306,11 +307,16 @@ void IndexIVFFastScan::search( idx_t k, float* distances, idx_t* labels, - const SearchParameters* params) const { - auto paramsi = dynamic_cast(params); - FAISS_THROW_IF_NOT_MSG(!params || paramsi, "need IVFSearchParameters"); + const SearchParameters* params_in) const { + const IVFSearchParameters* params = nullptr; + if (params_in) { + params = dynamic_cast(params_in); + FAISS_THROW_IF_NOT_MSG( + params, "IndexIVFFastScan params have incorrect type"); + } + search_preassigned( - n, x, k, nullptr, nullptr, distances, labels, false, paramsi); + n, x, k, nullptr, nullptr, distances, labels, false, params); } void IndexIVFFastScan::search_preassigned( @@ -326,18 +332,17 @@ void IndexIVFFastScan::search_preassigned( IndexIVFStats* stats) const { size_t nprobe = this->nprobe; if (params) { - FAISS_THROW_IF_NOT_MSG( - !params->quantizer_params, "quantizer params not supported"); FAISS_THROW_IF_NOT(params->max_codes == 0); nprobe = params->nprobe; } + FAISS_THROW_IF_NOT_MSG( !store_pairs, "store_pairs not supported for this index"); FAISS_THROW_IF_NOT_MSG(!stats, "stats not supported for this index"); FAISS_THROW_IF_NOT(k > 0); const CoarseQuantized cq = {nprobe, centroid_dis, assign}; - search_dispatch_implem(n, x, k, distances, labels, cq, nullptr); + search_dispatch_implem(n, x, k, distances, labels, cq, nullptr, params); } void IndexIVFFastScan::range_search( @@ -345,10 +350,18 @@ void IndexIVFFastScan::range_search( const float* x, float radius, RangeSearchResult* result, - const SearchParameters* params) const { - FAISS_THROW_IF_NOT(!params); + const SearchParameters* params_in) const { + size_t nprobe = this->nprobe; + const IVFSearchParameters* params = nullptr; + if (params_in) { + params = dynamic_cast(params_in); + FAISS_THROW_IF_NOT_MSG( + params, "IndexIVFFastScan params have incorrect type"); + nprobe = params->nprobe; + } + const CoarseQuantized cq = {nprobe, nullptr, nullptr}; - range_search_dispatch_implem(n, x, radius, *result, cq, nullptr); + range_search_dispatch_implem(n, x, radius, *result, cq, nullptr, params); } namespace { @@ -359,17 +372,18 @@ ResultHandlerCompare* make_knn_handler_fixC( idx_t n, idx_t k, float* distances, - idx_t* labels) { + idx_t* labels, + const IDSelector* sel) { using HeapHC = HeapHandler; using ReservoirHC = ReservoirHandler; using SingleResultHC = SingleResultHandler; if (k == 1) { - return new SingleResultHC(n, 0, distances, labels); + return new SingleResultHC(n, 0, distances, labels, sel); } else if (impl % 2 == 0) { - return new HeapHC(n, 0, k, distances, labels); + return new HeapHC(n, 0, k, distances, labels, sel); } else /* if (impl % 2 == 1) */ { - return new ReservoirHC(n, 0, k, 2 * k, distances, labels); + return new ReservoirHC(n, 0, k, 2 * k, distances, labels, sel); } } @@ -379,13 +393,14 @@ SIMDResultHandlerToFloat* make_knn_handler( idx_t n, idx_t k, float* distances, - idx_t* labels) { + idx_t* labels, + const IDSelector* sel) { if (is_max) { return make_knn_handler_fixC>( - impl, n, k, distances, labels); + impl, n, k, distances, labels, sel); } else { return make_knn_handler_fixC>( - impl, 
n, k, distances, labels); + impl, n, k, distances, labels, sel); } } @@ -402,10 +417,20 @@ struct CoarseQuantizedWithBuffer : CoarseQuantized { std::vector ids_buffer; std::vector dis_buffer; - void quantize(const Index* quantizer, idx_t n, const float* x) { + void quantize( + const Index* quantizer, + idx_t n, + const float* x, + const SearchParameters* quantizer_params) { dis_buffer.resize(nprobe * n); ids_buffer.resize(nprobe * n); - quantizer->search(n, x, nprobe, dis_buffer.data(), ids_buffer.data()); + quantizer->search( + n, + x, + nprobe, + dis_buffer.data(), + ids_buffer.data(), + quantizer_params); dis = dis_buffer.data(); ids = ids_buffer.data(); } @@ -421,8 +446,11 @@ struct CoarseQuantizedSlice : CoarseQuantizedWithBuffer { } } - void quantize_slice(const Index* quantizer, const float* x) { - quantize(quantizer, i1 - i0, x + quantizer->d * i0); + void quantize_slice( + const Index* quantizer, + const float* x, + const SearchParameters* quantizer_params) { + quantize(quantizer, i1 - i0, x + quantizer->d * i0, quantizer_params); } }; @@ -459,7 +487,13 @@ void IndexIVFFastScan::search_dispatch_implem( float* distances, idx_t* labels, const CoarseQuantized& cq_in, - const NormTableScaler* scaler) const { + const NormTableScaler* scaler, + const IVFSearchParameters* params) const { + const idx_t nprobe = params ? params->nprobe : this->nprobe; + const IDSelector* sel = (params) ? params->sel : nullptr; + const SearchParameters* quantizer_params = + params ? params->quantizer_params : nullptr; + bool is_max = !is_similarity_metric(metric_type); using RH = SIMDResultHandlerToFloat; @@ -489,52 +523,70 @@ void IndexIVFFastScan::search_dispatch_implem( } CoarseQuantizedWithBuffer cq(cq_in); + cq.nprobe = nprobe; if (!cq.done() && !multiple_threads) { // we do the coarse quantization here execpt when search is // sliced over threads (then it is more efficient to have each thread do // its own coarse quantization) - cq.quantize(quantizer, n, x); + cq.quantize(quantizer, n, x, quantizer_params); + invlists->prefetch_lists(cq.ids, n * cq.nprobe); } if (impl == 1) { if (is_max) { search_implem_1>( - n, x, k, distances, labels, cq, scaler); + n, x, k, distances, labels, cq, scaler, params); } else { search_implem_1>( - n, x, k, distances, labels, cq, scaler); + n, x, k, distances, labels, cq, scaler, params); } } else if (impl == 2) { if (is_max) { search_implem_2>( - n, x, k, distances, labels, cq, scaler); + n, x, k, distances, labels, cq, scaler, params); } else { search_implem_2>( - n, x, k, distances, labels, cq, scaler); + n, x, k, distances, labels, cq, scaler, params); } - } else if (impl >= 10 && impl <= 15) { size_t ndis = 0, nlist_visited = 0; if (!multiple_threads) { // clang-format off if (impl == 12 || impl == 13) { - std::unique_ptr handler(make_knn_handler(is_max, impl, n, k, distances, labels)); + std::unique_ptr handler( + make_knn_handler( + is_max, + impl, + n, + k, + distances, + labels, sel + ) + ); search_implem_12( n, x, *handler.get(), - cq, &ndis, &nlist_visited, scaler); - + cq, &ndis, &nlist_visited, scaler, params); } else if (impl == 14 || impl == 15) { - search_implem_14( n, x, k, distances, labels, - cq, impl, scaler); + cq, impl, scaler, params); } else { - std::unique_ptr handler(make_knn_handler(is_max, impl, n, k, distances, labels)); + std::unique_ptr handler( + make_knn_handler( + is_max, + impl, + n, + k, + distances, + labels, + sel + ) + ); search_implem_10( n, x, *handler.get(), cq, - &ndis, &nlist_visited, scaler); + &ndis, &nlist_visited, scaler, 
params); } // clang-format on } else { @@ -543,7 +595,8 @@ void IndexIVFFastScan::search_dispatch_implem( if (impl == 14 || impl == 15) { // this might require slicing if there are too // many queries (for now we keep this simple) - search_implem_14(n, x, k, distances, labels, cq, impl, scaler); + search_implem_14( + n, x, k, distances, labels, cq, impl, scaler, params); } else { #pragma omp parallel for reduction(+ : ndis, nlist_visited) for (int slice = 0; slice < nslice; slice++) { @@ -553,19 +606,19 @@ void IndexIVFFastScan::search_dispatch_implem( idx_t* lab_i = labels + i0 * k; CoarseQuantizedSlice cq_i(cq, i0, i1); if (!cq_i.done()) { - cq_i.quantize_slice(quantizer, x); + cq_i.quantize_slice(quantizer, x, quantizer_params); } std::unique_ptr handler(make_knn_handler( - is_max, impl, i1 - i0, k, dis_i, lab_i)); + is_max, impl, i1 - i0, k, dis_i, lab_i, sel)); // clang-format off if (impl == 12 || impl == 13) { search_implem_12( i1 - i0, x + i0 * d, *handler.get(), - cq_i, &ndis, &nlist_visited, scaler); + cq_i, &ndis, &nlist_visited, scaler, params); } else { search_implem_10( i1 - i0, x + i0 * d, *handler.get(), - cq_i, &ndis, &nlist_visited, scaler); + cq_i, &ndis, &nlist_visited, scaler, params); } // clang-format on } @@ -585,7 +638,13 @@ void IndexIVFFastScan::range_search_dispatch_implem( float radius, RangeSearchResult& rres, const CoarseQuantized& cq_in, - const NormTableScaler* scaler) const { + const NormTableScaler* scaler, + const IVFSearchParameters* params) const { + // const idx_t nprobe = params ? params->nprobe : this->nprobe; + const IDSelector* sel = (params) ? params->sel : nullptr; + const SearchParameters* quantizer_params = + params ? params->quantizer_params : nullptr; + bool is_max = !is_similarity_metric(metric_type); if (n == 0) { @@ -613,7 +672,8 @@ void IndexIVFFastScan::range_search_dispatch_implem( } if (!multiple_threads && !cq.done()) { - cq.quantize(quantizer, n, x); + cq.quantize(quantizer, n, x, quantizer_params); + invlists->prefetch_lists(cq.ids, n * cq.nprobe); } size_t ndis = 0, nlist_visited = 0; @@ -622,10 +682,10 @@ void IndexIVFFastScan::range_search_dispatch_implem( std::unique_ptr handler; if (is_max) { handler.reset(new RangeHandler, true>( - rres, radius, 0)); + rres, radius, 0, sel)); } else { handler.reset(new RangeHandler, true>( - rres, radius, 0)); + rres, radius, 0, sel)); } if (impl == 12) { search_implem_12( @@ -649,17 +709,17 @@ void IndexIVFFastScan::range_search_dispatch_implem( idx_t i1 = n * (slice + 1) / nslice; CoarseQuantizedSlice cq_i(cq, i0, i1); if (!cq_i.done()) { - cq_i.quantize_slice(quantizer, x); + cq_i.quantize_slice(quantizer, x, quantizer_params); } std::unique_ptr handler; if (is_max) { handler.reset(new PartialRangeHandler< CMax, - true>(pres, radius, 0, i0, i1)); + true>(pres, radius, 0, i0, i1, sel)); } else { handler.reset(new PartialRangeHandler< CMin, - true>(pres, radius, 0, i0, i1)); + true>(pres, radius, 0, i0, i1, sel)); } if (impl == 12 || impl == 13) { @@ -670,7 +730,8 @@ void IndexIVFFastScan::range_search_dispatch_implem( cq_i, &ndis, &nlist_visited, - scaler); + scaler, + params); } else { search_implem_10( i1 - i0, @@ -679,7 +740,8 @@ void IndexIVFFastScan::range_search_dispatch_implem( cq_i, &ndis, &nlist_visited, - scaler); + scaler, + params); } } pres.finalize(); @@ -699,7 +761,8 @@ void IndexIVFFastScan::search_implem_1( float* distances, idx_t* labels, const CoarseQuantized& cq, - const NormTableScaler* scaler) const { + const NormTableScaler* scaler, + const IVFSearchParameters* 
params) const { FAISS_THROW_IF_NOT(orig_invlists); size_t dim12 = ksub * M; @@ -766,7 +829,8 @@ void IndexIVFFastScan::search_implem_2( float* distances, idx_t* labels, const CoarseQuantized& cq, - const NormTableScaler* scaler) const { + const NormTableScaler* scaler, + const IVFSearchParameters* params) const { FAISS_THROW_IF_NOT(orig_invlists); size_t dim12 = ksub * M2; @@ -848,7 +912,8 @@ void IndexIVFFastScan::search_implem_10( const CoarseQuantized& cq, size_t* ndis_out, size_t* nlist_out, - const NormTableScaler* scaler) const { + const NormTableScaler* scaler, + const IVFSearchParameters* params) const { size_t dim12 = ksub * M2; AlignedTable dis_tables; AlignedTable biases; @@ -909,6 +974,7 @@ void IndexIVFFastScan::search_implem_10( ndis++; } } + handler.end(); *ndis_out = ndis; *nlist_out = nlist; @@ -921,7 +987,8 @@ void IndexIVFFastScan::search_implem_12( const CoarseQuantized& cq, size_t* ndis_out, size_t* nlist_out, - const NormTableScaler* scaler) const { + const NormTableScaler* scaler, + const IVFSearchParameters* params) const { if (n == 0) { // does not work well with reservoir return; } @@ -933,6 +1000,7 @@ void IndexIVFFastScan::search_implem_12( std::unique_ptr normalizers(new float[2 * n]); compute_LUT_uint8(n, x, cq, dis_tables, biases, normalizers.get()); + handler.begin(skip & 16 ? nullptr : normalizers.get()); struct QC { @@ -958,6 +1026,7 @@ void IndexIVFFastScan::search_implem_12( return a.list_no < b.list_no; }); } + // prepare the result handlers int qbs2 = this->qbs2 ? this->qbs2 : 11; @@ -1049,12 +1118,15 @@ void IndexIVFFastScan::search_implem_14( idx_t* labels, const CoarseQuantized& cq, int impl, - const NormTableScaler* scaler) const { + const NormTableScaler* scaler, + const IVFSearchParameters* params) const { if (n == 0) { // does not work well with reservoir return; } FAISS_THROW_IF_NOT(bbs == 32); + const IDSelector* sel = params ? params->sel : nullptr; + size_t dim12 = ksub * M2; AlignedTable dis_tables; AlignedTable biases; @@ -1157,7 +1229,7 @@ void IndexIVFFastScan::search_implem_14( // prepare the result handlers std::unique_ptr handler(make_knn_handler( - is_max, impl, n, k, local_dis.data(), local_idx.data())); + is_max, impl, n, k, local_dis.data(), local_idx.data(), sel)); handler->begin(normalizers.get()); int qbs2 = this->qbs2 ? 
this->qbs2 : 11; @@ -1167,6 +1239,7 @@ void IndexIVFFastScan::search_implem_14( tmp_bias.resize(qbs2); handler->dbias = tmp_bias.data(); } + std::set q_set; uint64_t t_copy_pack = 0, t_scan = 0; #pragma omp for schedule(dynamic) diff --git a/faiss/IndexIVFFastScan.h b/faiss/IndexIVFFastScan.h index 159a3a7098..9d4c4910d3 100644 --- a/faiss/IndexIVFFastScan.h +++ b/faiss/IndexIVFFastScan.h @@ -148,7 +148,8 @@ struct IndexIVFFastScan : IndexIVF { float* distances, idx_t* labels, const CoarseQuantized& cq, - const NormTableScaler* scaler) const; + const NormTableScaler* scaler, + const IVFSearchParameters* params = nullptr) const; void range_search_dispatch_implem( idx_t n, @@ -156,7 +157,8 @@ struct IndexIVFFastScan : IndexIVF { float radius, RangeSearchResult& rres, const CoarseQuantized& cq_in, - const NormTableScaler* scaler) const; + const NormTableScaler* scaler, + const IVFSearchParameters* params = nullptr) const; // impl 1 and 2 are just for verification template @@ -167,7 +169,8 @@ struct IndexIVFFastScan : IndexIVF { float* distances, idx_t* labels, const CoarseQuantized& cq, - const NormTableScaler* scaler) const; + const NormTableScaler* scaler, + const IVFSearchParameters* params = nullptr) const; template void search_implem_2( @@ -177,7 +180,8 @@ struct IndexIVFFastScan : IndexIVF { float* distances, idx_t* labels, const CoarseQuantized& cq, - const NormTableScaler* scaler) const; + const NormTableScaler* scaler, + const IVFSearchParameters* params = nullptr) const; // implem 10 and 12 are not multithreaded internally, so // export search stats @@ -188,7 +192,8 @@ struct IndexIVFFastScan : IndexIVF { const CoarseQuantized& cq, size_t* ndis_out, size_t* nlist_out, - const NormTableScaler* scaler) const; + const NormTableScaler* scaler, + const IVFSearchParameters* params = nullptr) const; void search_implem_12( idx_t n, @@ -197,7 +202,8 @@ struct IndexIVFFastScan : IndexIVF { const CoarseQuantized& cq, size_t* ndis_out, size_t* nlist_out, - const NormTableScaler* scaler) const; + const NormTableScaler* scaler, + const IVFSearchParameters* params = nullptr) const; // implem 14 is multithreaded internally across nprobes and queries void search_implem_14( @@ -208,7 +214,8 @@ struct IndexIVFFastScan : IndexIVF { idx_t* labels, const CoarseQuantized& cq, int impl, - const NormTableScaler* scaler) const; + const NormTableScaler* scaler, + const IVFSearchParameters* params = nullptr) const; // reconstruct vectors from packed invlists void reconstruct_from_offset(int64_t list_no, int64_t offset, float* recons) diff --git a/faiss/IndexIVFPQFastScan.cpp b/faiss/IndexIVFPQFastScan.cpp index d069db1354..2844ae4936 100644 --- a/faiss/IndexIVFPQFastScan.cpp +++ b/faiss/IndexIVFPQFastScan.cpp @@ -286,9 +286,28 @@ void IndexIVFPQFastScan::compute_LUT( } } -void IndexIVFPQFastScan::sa_decode(idx_t n, const uint8_t* bytes, float* x) +void IndexIVFPQFastScan::sa_decode(idx_t n, const uint8_t* codes, float* x) const { - pq.decode(bytes, x, n); + size_t coarse_size = coarse_code_size(); + +#pragma omp parallel if (n > 1) + { + std::vector residual(d); + +#pragma omp for + for (idx_t i = 0; i < n; i++) { + const uint8_t* code = codes + i * (code_size + coarse_size); + int64_t list_no = decode_listno(code); + float* xi = x + i * d; + pq.decode(code + coarse_size, xi); + if (by_residual) { + quantizer->reconstruct(list_no, residual.data()); + for (size_t j = 0; j < d; j++) { + xi[j] += residual[j]; + } + } + } + } } } // namespace faiss diff --git a/faiss/IndexNNDescent.cpp b/faiss/IndexNNDescent.cpp 
index 27bd6e33ee..382e9c41c6 100644 --- a/faiss/IndexNNDescent.cpp +++ b/faiss/IndexNNDescent.cpp @@ -58,35 +58,6 @@ using storage_idx_t = NNDescent::storage_idx_t; namespace { -/* Wrap the distance computer into one that negates the - distances. This makes supporting INNER_PRODUCE search easier */ - -struct NegativeDistanceComputer : DistanceComputer { - /// owned by this - DistanceComputer* basedis; - - explicit NegativeDistanceComputer(DistanceComputer* basedis) - : basedis(basedis) {} - - void set_query(const float* x) override { - basedis->set_query(x); - } - - /// compute distance of vector i to current query - float operator()(idx_t i) override { - return -(*basedis)(i); - } - - /// compute distance between two stored vectors - float symmetric_dis(idx_t i, idx_t j) override { - return -basedis->symmetric_dis(i, j); - } - - ~NegativeDistanceComputer() override { - delete basedis; - } -}; - DistanceComputer* storage_distance_computer(const Index* storage) { if (is_similarity_metric(storage->metric_type)) { return new NegativeDistanceComputer(storage->get_distance_computer()); diff --git a/faiss/IndexScalarQuantizer.cpp b/faiss/IndexScalarQuantizer.cpp index 9203a98932..7ce838db5e 100644 --- a/faiss/IndexScalarQuantizer.cpp +++ b/faiss/IndexScalarQuantizer.cpp @@ -32,7 +32,8 @@ IndexScalarQuantizer::IndexScalarQuantizer( MetricType metric) : IndexFlatCodes(0, d, metric), sq(d, qtype) { is_trained = qtype == ScalarQuantizer::QT_fp16 || - qtype == ScalarQuantizer::QT_8bit_direct; + qtype == ScalarQuantizer::QT_8bit_direct || + qtype == ScalarQuantizer::QT_bf16; code_size = sq.code_size; } diff --git a/faiss/MetricType.h b/faiss/MetricType.h index 538b0a8e72..8e889b1a03 100644 --- a/faiss/MetricType.h +++ b/faiss/MetricType.h @@ -31,8 +31,13 @@ enum MetricType { METRIC_Canberra = 20, METRIC_BrayCurtis, METRIC_JensenShannon, - METRIC_Jaccard, ///< defined as: sum_i(min(a_i, b_i)) / sum_i(max(a_i, b_i)) - ///< where a_i, b_i > 0 + + /// sum_i(min(a_i, b_i)) / sum_i(max(a_i, b_i)) where a_i, b_i > 0 + METRIC_Jaccard, + /// Squared Eucliden distance, ignoring NaNs + METRIC_NaNEuclidean, + /// abs(x | y): the distance to a hyperplane + METRIC_ABS_INNER_PRODUCT, }; /// all vector indices are this type diff --git a/faiss/cppcontrib/detail/UintReader.h b/faiss/cppcontrib/detail/UintReader.h index 81e600f410..4a64a1a254 100644 --- a/faiss/cppcontrib/detail/UintReader.h +++ b/faiss/cppcontrib/detail/UintReader.h @@ -7,6 +7,7 @@ #pragma once +#include #include namespace faiss { @@ -31,7 +32,11 @@ struct Uint8Reader { if (N_ELEMENTS > CPOS + 3) { const uint32_t code32 = *reinterpret_cast( codes + ELEMENT_TO_READ * 4); +#ifdef FAISS_BIG_ENDIAN + return (code32) >> 24; +#else return (code32 & 0x000000FF); +#endif } else { return codes[CPOS]; } @@ -40,7 +45,11 @@ struct Uint8Reader { if (N_ELEMENTS > CPOS + 2) { const uint32_t code32 = *reinterpret_cast( codes + ELEMENT_TO_READ * 4); +#ifdef FAISS_BIG_ENDIAN + return (code32 & 0x00FF0000) >> 16; +#else return (code32 & 0x0000FF00) >> 8; +#endif } else { return codes[CPOS]; } @@ -49,7 +58,11 @@ struct Uint8Reader { if (N_ELEMENTS > CPOS + 1) { const uint32_t code32 = *reinterpret_cast( codes + ELEMENT_TO_READ * 4); +#ifdef FAISS_BIG_ENDIAN + return (code32 & 0x0000FF00) >> 8; +#else return (code32 & 0x00FF0000) >> 16; +#endif } else { return codes[CPOS]; } @@ -58,7 +71,11 @@ struct Uint8Reader { if (N_ELEMENTS > CPOS) { const uint32_t code32 = *reinterpret_cast( codes + ELEMENT_TO_READ * 4); +#ifdef FAISS_BIG_ENDIAN + return (code32 & 0x000000FF); +#else 
return (code32) >> 24; +#endif } else { return codes[CPOS]; } @@ -87,40 +104,61 @@ struct Uint10Reader { switch (SUB_ELEMENT) { case 0: { if (N_ELEMENTS > CPOS + 2) { - const uint32_t code32 = *reinterpret_cast( + uint32_t code32 = *reinterpret_cast( codes + ELEMENT_TO_READ * 5); +#ifdef FAISS_BIG_ENDIAN + code32 = Swap4Bytes(code32); +#endif return (code32 & 0b0000001111111111); } else { - const uint16_t code16 = *reinterpret_cast( + uint16_t code16 = *reinterpret_cast( codes + ELEMENT_TO_READ * 5 + 0); +#ifdef FAISS_BIG_ENDIAN + code16 = Swap2Bytes(code16); +#endif return (code16 & 0b0000001111111111); } } case 1: { if (N_ELEMENTS > CPOS + 1) { - const uint32_t code32 = *reinterpret_cast( + uint32_t code32 = *reinterpret_cast( codes + ELEMENT_TO_READ * 5); +#ifdef FAISS_BIG_ENDIAN + code32 = Swap4Bytes(code32); +#endif return (code32 & 0b000011111111110000000000) >> 10; } else { - const uint16_t code16 = *reinterpret_cast( + uint16_t code16 = *reinterpret_cast( codes + ELEMENT_TO_READ * 5 + 1); +#ifdef FAISS_BIG_ENDIAN + code16 = Swap2Bytes(code16); +#endif return (code16 & 0b0000111111111100) >> 2; } } case 2: { if (N_ELEMENTS > CPOS) { - const uint32_t code32 = *reinterpret_cast( + uint32_t code32 = *reinterpret_cast( codes + ELEMENT_TO_READ * 5); +#ifdef FAISS_BIG_ENDIAN + code32 = Swap4Bytes(code32); +#endif return (code32 & 0b00111111111100000000000000000000) >> 20; } else { - const uint16_t code16 = *reinterpret_cast( + uint16_t code16 = *reinterpret_cast( codes + ELEMENT_TO_READ * 5 + 2); +#ifdef FAISS_BIG_ENDIAN + code16 = Swap2Bytes(code16); +#endif return (code16 & 0b0011111111110000) >> 4; } } case 3: { - const uint16_t code16 = *reinterpret_cast( + uint16_t code16 = *reinterpret_cast( codes + ELEMENT_TO_READ * 5 + 3); +#ifdef FAISS_BIG_ENDIAN + code16 = Swap2Bytes(code16); +#endif return (code16 & 0b1111111111000000) >> 6; } } @@ -147,45 +185,69 @@ struct Uint12Reader { switch (SUB_ELEMENT) { case 0: { if (N_ELEMENTS > CPOS + 2) { - const uint32_t code32 = *reinterpret_cast( + uint32_t code32 = *reinterpret_cast( codes + ELEMENT_TO_READ * 6); +#ifdef FAISS_BIG_ENDIAN + code32 = Swap4Bytes(code32); +#endif return (code32 & 0b0000111111111111); } else { - const uint16_t code16 = *reinterpret_cast( + uint16_t code16 = *reinterpret_cast( codes + ELEMENT_TO_READ * 6 + 0); +#ifdef FAISS_BIG_ENDIAN + code16 = Swap2Bytes(code16); +#endif return (code16 & 0b0000111111111111); } } case 1: { if (N_ELEMENTS > CPOS + 1) { - const uint32_t code32 = *reinterpret_cast( + uint32_t code32 = *reinterpret_cast( codes + ELEMENT_TO_READ * 6); +#ifdef FAISS_BIG_ENDIAN + code32 = Swap4Bytes(code32); +#endif return (code32 & 0b111111111111000000000000) >> 12; } else { - const uint16_t code16 = *reinterpret_cast( + uint16_t code16 = *reinterpret_cast( codes + ELEMENT_TO_READ * 6 + 1); +#ifdef FAISS_BIG_ENDIAN + code16 = Swap2Bytes(code16); +#endif return (code16 & 0b1111111111110000) >> 4; } } case 2: { if (N_ELEMENTS > CPOS + 1) { - const uint32_t code32 = *reinterpret_cast( + uint32_t code32 = *reinterpret_cast( codes + ELEMENT_TO_READ * 6 + 2); +#ifdef FAISS_BIG_ENDIAN + code32 = Swap4Bytes(code32); +#endif return (code32 & 0b000011111111111100000000) >> 8; } else { - const uint16_t code16 = *reinterpret_cast( + uint16_t code16 = *reinterpret_cast( codes + ELEMENT_TO_READ * 6 + 3); +#ifdef FAISS_BIG_ENDIAN + code16 = Swap2Bytes(code16); +#endif return (code16 & 0b0000111111111111); } } case 3: { if (N_ELEMENTS > CPOS) { - const uint32_t code32 = *reinterpret_cast( + uint32_t code32 = 
*reinterpret_cast( codes + ELEMENT_TO_READ * 6 + 2); +#ifdef FAISS_BIG_ENDIAN + code32 = Swap4Bytes(code32); +#endif return (code32 & 0b11111111111100000000000000000000) >> 20; } else { - const uint16_t code16 = *reinterpret_cast( + uint16_t code16 = *reinterpret_cast( codes + ELEMENT_TO_READ * 6 + 4); +#ifdef FAISS_BIG_ENDIAN + code16 = Swap2Bytes(code16); +#endif return (code16 & 0b1111111111110000) >> 4; } } @@ -208,23 +270,39 @@ struct Uint16Reader { switch (SUB_ELEMENT) { case 0: { if (N_ELEMENTS > CPOS + 1) { - const uint32_t code32 = *reinterpret_cast( + uint32_t code32 = *reinterpret_cast( codes + ELEMENT_TO_READ * 4); +#ifdef FAISS_BIG_ENDIAN + code32 = Swap4Bytes(code32); +#endif return (code32 & 0x0000FFFF); } else { const uint16_t* const __restrict codesFp16 = reinterpret_cast(codes); +#ifdef FAISS_BIG_ENDIAN + uint16_t rt = codesFp16[CPOS]; + rt = Swap2Bytes(rt); + return rt; +#endif return codesFp16[CPOS]; } } case 1: { if (N_ELEMENTS > CPOS) { - const uint32_t code32 = *reinterpret_cast( + uint32_t code32 = *reinterpret_cast( codes + ELEMENT_TO_READ * 4); +#ifdef FAISS_BIG_ENDIAN + code32 = Swap4Bytes(code32); +#endif return code32 >> 16; } else { const uint16_t* const __restrict codesFp16 = reinterpret_cast(codes); +#ifdef FAISS_BIG_ENDIAN + uint16_t rt = codesFp16[CPOS]; + rt = Swap2Bytes(rt); + return rt; +#endif return codesFp16[CPOS]; } } diff --git a/faiss/cppcontrib/sa_decode/Level2-inl.h b/faiss/cppcontrib/sa_decode/Level2-inl.h index 36355af001..1eb7767ba8 100644 --- a/faiss/cppcontrib/sa_decode/Level2-inl.h +++ b/faiss/cppcontrib/sa_decode/Level2-inl.h @@ -12,10 +12,19 @@ #include #include +#include namespace faiss { namespace cppcontrib { +bool isBigEndian() { +#ifdef FAISS_BIG_ENDIAN + return true; +#else + return false; +#endif +} + //////////////////////////////////////////////////////////////////////////////////// /// Index2LevelDecoder //////////////////////////////////////////////////////////////////////////////////// @@ -72,9 +81,14 @@ struct Index2LevelDecoder { const intptr_t coarseCentroidOffset = i % COARSE_SIZE; const intptr_t fineCentroidIdx = i / FINE_SIZE; const intptr_t fineCentroidOffset = i % FINE_SIZE; - - const intptr_t coarseCode = coarse[coarseCentroidIdx]; - const intptr_t fineCode = fine[fineCentroidIdx]; + intptr_t coarseCode, fineCode; + if (isBigEndian() && sizeof(coarse_storage_type) == 2) { + coarseCode = Swap2Bytes(coarse[coarseCentroidIdx]); + fineCode = Swap2Bytes(fine[fineCentroidIdx]); + } else { + coarseCode = coarse[coarseCentroidIdx]; + fineCode = fine[fineCentroidIdx]; + } const float* const __restrict coarsePtr = pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode) * @@ -112,9 +126,14 @@ struct Index2LevelDecoder { const intptr_t fineCentroidIdx = i / FINE_SIZE; const intptr_t fineCentroidOffset = i % FINE_SIZE; - const intptr_t coarseCode = coarse[coarseCentroidIdx]; - const intptr_t fineCode = fine[fineCentroidIdx]; - + intptr_t coarseCode, fineCode; + if (isBigEndian() && sizeof(coarse_storage_type) == 2) { + coarseCode = Swap2Bytes(coarse[coarseCentroidIdx]); + fineCode = Swap2Bytes(fine[fineCentroidIdx]); + } else { + coarseCode = coarse[coarseCentroidIdx]; + fineCode = fine[fineCentroidIdx]; + } const float* const __restrict coarsePtr = pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode) * COARSE_SIZE + @@ -162,11 +181,18 @@ struct Index2LevelDecoder { const intptr_t coarseCentroidOffset = i % COARSE_SIZE; const intptr_t fineCentroidIdx = i / FINE_SIZE; const intptr_t 
fineCentroidOffset = i % FINE_SIZE; - - const intptr_t coarseCode0 = coarse0[coarseCentroidIdx]; - const intptr_t fineCode0 = fine0[fineCentroidIdx]; - const intptr_t coarseCode1 = coarse1[coarseCentroidIdx]; - const intptr_t fineCode1 = fine1[fineCentroidIdx]; + intptr_t coarseCode0, coarseCode1, fineCode0, fineCode1; + if (isBigEndian() && sizeof(coarse_storage_type) == 2) { + coarseCode0 = Swap2Bytes(coarse0[coarseCentroidIdx]); + fineCode0 = Swap2Bytes(fine0[fineCentroidIdx]); + coarseCode1 = Swap2Bytes(coarse1[coarseCentroidIdx]); + fineCode1 = Swap2Bytes(fine1[fineCentroidIdx]); + } else { + coarseCode0 = coarse0[coarseCentroidIdx]; + fineCode0 = fine0[fineCentroidIdx]; + coarseCode1 = coarse1[coarseCentroidIdx]; + fineCode1 = fine1[fineCentroidIdx]; + } const float* const __restrict coarsePtr0 = pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * @@ -222,11 +248,18 @@ struct Index2LevelDecoder { const intptr_t coarseCentroidOffset = i % COARSE_SIZE; const intptr_t fineCentroidIdx = i / FINE_SIZE; const intptr_t fineCentroidOffset = i % FINE_SIZE; - - const intptr_t coarseCode0 = coarse0[coarseCentroidIdx]; - const intptr_t fineCode0 = fine0[fineCentroidIdx]; - const intptr_t coarseCode1 = coarse1[coarseCentroidIdx]; - const intptr_t fineCode1 = fine1[fineCentroidIdx]; + intptr_t coarseCode0, coarseCode1, fineCode0, fineCode1; + if (isBigEndian() && sizeof(coarse_storage_type) == 2) { + coarseCode0 = Swap2Bytes(coarse0[coarseCentroidIdx]); + fineCode0 = Swap2Bytes(fine0[fineCentroidIdx]); + coarseCode1 = Swap2Bytes(coarse1[coarseCentroidIdx]); + fineCode1 = Swap2Bytes(fine1[fineCentroidIdx]); + } else { + coarseCode0 = coarse0[coarseCentroidIdx]; + fineCode0 = fine0[fineCentroidIdx]; + coarseCode1 = coarse1[coarseCentroidIdx]; + fineCode1 = fine1[fineCentroidIdx]; + } const float* const __restrict coarsePtr0 = pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * @@ -292,13 +325,23 @@ struct Index2LevelDecoder { const intptr_t coarseCentroidOffset = i % COARSE_SIZE; const intptr_t fineCentroidIdx = i / FINE_SIZE; const intptr_t fineCentroidOffset = i % FINE_SIZE; - - const intptr_t coarseCode0 = coarse0[coarseCentroidIdx]; - const intptr_t fineCode0 = fine0[fineCentroidIdx]; - const intptr_t coarseCode1 = coarse1[coarseCentroidIdx]; - const intptr_t fineCode1 = fine1[fineCentroidIdx]; - const intptr_t coarseCode2 = coarse2[coarseCentroidIdx]; - const intptr_t fineCode2 = fine2[fineCentroidIdx]; + intptr_t coarseCode0, coarseCode1, fineCode0, fineCode1; + intptr_t coarseCode2, fineCode2; + if (isBigEndian() && sizeof(coarse_storage_type) == 2) { + coarseCode0 = Swap2Bytes(coarse0[coarseCentroidIdx]); + fineCode0 = Swap2Bytes(fine0[fineCentroidIdx]); + coarseCode1 = Swap2Bytes(coarse1[coarseCentroidIdx]); + fineCode1 = Swap2Bytes(fine1[fineCentroidIdx]); + coarseCode2 = Swap2Bytes(coarse2[coarseCentroidIdx]); + fineCode2 = Swap2Bytes(fine2[fineCentroidIdx]); + } else { + coarseCode0 = coarse0[coarseCentroidIdx]; + fineCode0 = fine0[fineCentroidIdx]; + coarseCode1 = coarse1[coarseCentroidIdx]; + fineCode1 = fine1[fineCentroidIdx]; + coarseCode2 = coarse2[coarseCentroidIdx]; + fineCode2 = fine2[fineCentroidIdx]; + } const float* const __restrict coarsePtr0 = pqCoarseCentroids0 + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * @@ -369,13 +412,23 @@ struct Index2LevelDecoder { const intptr_t coarseCentroidOffset = i % COARSE_SIZE; const intptr_t fineCentroidIdx = i / FINE_SIZE; const intptr_t fineCentroidOffset = i % FINE_SIZE; - - 
const intptr_t coarseCode0 = coarse0[coarseCentroidIdx]; - const intptr_t fineCode0 = fine0[fineCentroidIdx]; - const intptr_t coarseCode1 = coarse1[coarseCentroidIdx]; - const intptr_t fineCode1 = fine1[fineCentroidIdx]; - const intptr_t coarseCode2 = coarse2[coarseCentroidIdx]; - const intptr_t fineCode2 = fine2[fineCentroidIdx]; + intptr_t coarseCode0, fineCode0, coarseCode1, fineCode1; + intptr_t coarseCode2, fineCode2; + if (isBigEndian() && sizeof(coarse_storage_type) == 2) { + coarseCode0 = Swap2Bytes(coarse0[coarseCentroidIdx]); + fineCode0 = Swap2Bytes(fine0[fineCentroidIdx]); + coarseCode1 = Swap2Bytes(coarse1[coarseCentroidIdx]); + fineCode1 = Swap2Bytes(fine1[fineCentroidIdx]); + coarseCode2 = Swap2Bytes(coarse2[coarseCentroidIdx]); + fineCode2 = Swap2Bytes(fine2[fineCentroidIdx]); + } else { + coarseCode0 = coarse0[coarseCentroidIdx]; + fineCode0 = fine0[fineCentroidIdx]; + coarseCode1 = coarse1[coarseCentroidIdx]; + fineCode1 = fine1[fineCentroidIdx]; + coarseCode2 = coarse2[coarseCentroidIdx]; + fineCode2 = fine2[fineCentroidIdx]; + } const float* const __restrict coarsePtr0 = pqCoarseCentroids + (coarseCentroidIdx * COARSE_TABLE_BYTES + coarseCode0) * diff --git a/faiss/gpu/CMakeLists.txt b/faiss/gpu/CMakeLists.txt index 126cbe5044..d20f3b7f8e 100644 --- a/faiss/gpu/CMakeLists.txt +++ b/faiss/gpu/CMakeLists.txt @@ -238,11 +238,15 @@ generate_ivf_interleaved_code() if(FAISS_ENABLE_RAFT) list(APPEND FAISS_GPU_HEADERS + GpuIndexCagra.h + impl/RaftCagra.cuh impl/RaftFlatIndex.cuh impl/RaftIVFFlat.cuh impl/RaftIVFPQ.cuh utils/RaftUtils.h) list(APPEND FAISS_GPU_SRC + GpuIndexCagra.cu + impl/RaftCagra.cu impl/RaftFlatIndex.cu impl/RaftIVFFlat.cu impl/RaftIVFPQ.cu @@ -316,5 +320,5 @@ __nv_relfatbin : { *(__nv_relfatbin) } target_link_options(faiss_gpu PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld") find_package(CUDAToolkit REQUIRED) -target_link_libraries(faiss_gpu PRIVATE CUDA::cudart CUDA::cublas $<$:raft::raft> $<$:raft::compiled> $<$:nvidia::cutlass::cutlass>) -target_compile_options(faiss_gpu PRIVATE $<$:-Xfatbin=-compress-all --expt-extended-lambda --expt-relaxed-constexpr>) +target_link_libraries(faiss_gpu PRIVATE CUDA::cudart CUDA::cublas $<$:raft::raft> $<$:raft::compiled> $<$:nvidia::cutlass::cutlass> $<$:OpenMP::OpenMP_CXX>) +target_compile_options(faiss_gpu PRIVATE $<$:-Xfatbin=-compress-all --expt-extended-lambda --expt-relaxed-constexpr $<$:-Xcompiler=${OpenMP_CXX_FLAGS}>>) diff --git a/faiss/gpu/GpuCloner.cpp b/faiss/gpu/GpuCloner.cpp index 20583720f3..b6d55a47aa 100644 --- a/faiss/gpu/GpuCloner.cpp +++ b/faiss/gpu/GpuCloner.cpp @@ -14,6 +14,9 @@ #include #include +#if defined USE_NVIDIA_RAFT +#include +#endif #include #include #include @@ -24,6 +27,9 @@ #include #include #include +#if defined USE_NVIDIA_RAFT +#include +#endif #include #include #include @@ -85,7 +91,15 @@ Index* ToCPUCloner::clone_Index(const Index* index) { // objective is to make a single component out of them // (inverse op of ToGpuClonerMultiple) - } else if (auto ish = dynamic_cast(index)) { + } +#if defined USE_NVIDIA_RAFT + else if (auto icg = dynamic_cast(index)) { + IndexHNSWCagra* res = new IndexHNSWCagra(); + icg->copyTo(res); + return res; + } +#endif + else if (auto ish = dynamic_cast(index)) { int nshard = ish->count(); FAISS_ASSERT(nshard > 0); Index* res = clone_Index(ish->at(0)); @@ -153,6 +167,7 @@ Index* ToGpuCloner::clone_Index(const Index* index) { config.indicesOptions = indicesOptions; config.flatConfig.useFloat16 = useFloat16CoarseQuantizer; config.use_raft = use_raft; + 
config.allowCpuCoarseQuantizer = allowCpuCoarseQuantizer; GpuIndexIVFFlat* res = new GpuIndexIVFFlat( provider, ifl->d, ifl->nlist, ifl->metric_type, config); @@ -205,6 +220,7 @@ Index* ToGpuCloner::clone_Index(const Index* index) { config.usePrecomputedTables = usePrecomputed; config.use_raft = use_raft; config.interleavedLayout = use_raft; + config.allowCpuCoarseQuantizer = allowCpuCoarseQuantizer; GpuIndexIVFPQ* res = new GpuIndexIVFPQ(provider, ipq, config); @@ -213,9 +229,25 @@ Index* ToGpuCloner::clone_Index(const Index* index) { } return res; - } else { - // default: use CPU cloner - return Cloner::clone_Index(index); + } +#if defined USE_NVIDIA_RAFT + else if (auto icg = dynamic_cast(index)) { + GpuIndexCagraConfig config; + config.device = device; + GpuIndexCagra* res = + new GpuIndexCagra(provider, icg->d, icg->metric_type, config); + res->copyFrom(icg); + return res; + } +#endif + else { + // use CPU cloner for IDMap and PreTransform + auto index_idmap = dynamic_cast(index); + auto index_pt = dynamic_cast(index); + if (index_idmap || index_pt) { + return Cloner::clone_Index(index); + } + FAISS_THROW_MSG("This index type is not implemented on GPU."); } } diff --git a/faiss/gpu/GpuClonerOptions.h b/faiss/gpu/GpuClonerOptions.h index 197e09dc88..e643e848fb 100644 --- a/faiss/gpu/GpuClonerOptions.h +++ b/faiss/gpu/GpuClonerOptions.h @@ -43,6 +43,12 @@ struct GpuClonerOptions { #else bool use_raft = false; #endif + + /// This flag controls the CPU fallback logic for coarse quantizer + /// component of the index. When set to false (default), the cloner will + /// throw an exception for indices not implemented on GPU. When set to + /// true, it will fallback to a CPU implementation. + bool allowCpuCoarseQuantizer = false; }; struct GpuMultipleClonerOptions : public GpuClonerOptions { diff --git a/faiss/gpu/GpuDistance.cu b/faiss/gpu/GpuDistance.cu index a235404b14..38a62f03bb 100644 --- a/faiss/gpu/GpuDistance.cu +++ b/faiss/gpu/GpuDistance.cu @@ -327,7 +327,7 @@ void bfKnn(GpuResourcesProvider* prov, const GpuDistanceParams& args) { int64_t, raft::col_major>> index_vec = {index.view()}; - RAFT_LOG_INFO("Invoking flat bfknn"); + brute_force::knn( handle, index_vec, @@ -354,10 +354,7 @@ void bfKnn(GpuResourcesProvider* prov, const GpuDistanceParams& args) { [] __device__(const float& a) { return powf(a, 2); }); } - RAFT_LOG_INFO("Done."); - handle.sync_stream(); - RAFT_LOG_INFO("All synced."); } else #else if (should_use_raft(args)) { diff --git a/faiss/gpu/GpuIcmEncoder.cu b/faiss/gpu/GpuIcmEncoder.cu index 434fae9e36..8bd60f91b8 100644 --- a/faiss/gpu/GpuIcmEncoder.cu +++ b/faiss/gpu/GpuIcmEncoder.cu @@ -82,7 +82,7 @@ void GpuIcmEncoder::encode( size_t n, size_t ils_iters) const { size_t nshards = shards->size(); - size_t shard_size = (n + nshards - 1) / nshards; + size_t base_shard_size = n / nshards; auto codebooks = lsq->codebooks.data(); auto M = lsq->M; @@ -94,8 +94,14 @@ void GpuIcmEncoder::encode( // split input data auto fn = [=](int idx, IcmEncoderImpl* encoder) { - size_t i0 = idx * shard_size; - size_t ni = std::min(shard_size, n - i0); + size_t i0 = idx * base_shard_size + std::min(size_t(idx), n % nshards); + size_t ni = base_shard_size; + if (ni < n % nshards) { + ++ni; + } + if (ni <= 0) { // only if n < nshards + return; + } auto xi = x + i0 * d; auto ci = codes + i0 * M; std::mt19937 geni(idx + seed); // different seed for each shard diff --git a/faiss/gpu/GpuIndex.h b/faiss/gpu/GpuIndex.h index 36de98c098..cc10f21589 100644 --- a/faiss/gpu/GpuIndex.h +++ 
b/faiss/gpu/GpuIndex.h @@ -84,19 +84,14 @@ class GpuIndex : public faiss::Index { /// `x` and `labels` can be resident on the CPU or any GPU; copies are /// performed as needed - void assign( - idx_t n, - const float* x, - idx_t* labels, - // faiss::Index has idx_t for k - idx_t k = 1) const override; + void assign(idx_t n, const float* x, idx_t* labels, idx_t k = 1) + const override; /// `x`, `distances` and `labels` can be resident on the CPU or any /// GPU; copies are performed as needed void search( idx_t n, const float* x, - // faiss::Index has idx_t for k idx_t k, float* distances, idx_t* labels, @@ -107,7 +102,6 @@ class GpuIndex : public faiss::Index { void search_and_reconstruct( idx_t n, const float* x, - // faiss::Index has idx_t for k idx_t k, float* distances, idx_t* labels, diff --git a/faiss/gpu/GpuIndexCagra.cu b/faiss/gpu/GpuIndexCagra.cu new file mode 100644 index 0000000000..4ae56df10d --- /dev/null +++ b/faiss/gpu/GpuIndexCagra.cu @@ -0,0 +1,274 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include + +namespace faiss { +namespace gpu { + +GpuIndexCagra::GpuIndexCagra( + GpuResourcesProvider* provider, + int dims, + faiss::MetricType metric, + GpuIndexCagraConfig config) + : GpuIndex(provider->getResources(), dims, metric, 0.0f, config), + cagraConfig_(config) { + this->is_trained = false; +} + +void GpuIndexCagra::train(idx_t n, const float* x) { + if (this->is_trained) { + FAISS_ASSERT(index_); + return; + } + + FAISS_ASSERT(!index_); + + std::optional ivf_pq_params = + std::nullopt; + std::optional ivf_pq_search_params = + std::nullopt; + if (cagraConfig_.ivf_pq_params != nullptr) { + ivf_pq_params = + std::make_optional(); + ivf_pq_params->n_lists = cagraConfig_.ivf_pq_params->n_lists; + ivf_pq_params->kmeans_n_iters = + cagraConfig_.ivf_pq_params->kmeans_n_iters; + ivf_pq_params->kmeans_trainset_fraction = + cagraConfig_.ivf_pq_params->kmeans_trainset_fraction; + ivf_pq_params->pq_bits = cagraConfig_.ivf_pq_params->pq_bits; + ivf_pq_params->pq_dim = cagraConfig_.ivf_pq_params->pq_dim; + ivf_pq_params->codebook_kind = + static_cast( + cagraConfig_.ivf_pq_params->codebook_kind); + ivf_pq_params->force_random_rotation = + cagraConfig_.ivf_pq_params->force_random_rotation; + ivf_pq_params->conservative_memory_allocation = + cagraConfig_.ivf_pq_params->conservative_memory_allocation; + } + if (cagraConfig_.ivf_pq_search_params != nullptr) { + ivf_pq_search_params = + std::make_optional(); + ivf_pq_search_params->n_probes = + cagraConfig_.ivf_pq_search_params->n_probes; + ivf_pq_search_params->lut_dtype = + cagraConfig_.ivf_pq_search_params->lut_dtype; + ivf_pq_search_params->preferred_shmem_carveout = + cagraConfig_.ivf_pq_search_params->preferred_shmem_carveout; + } + index_ = std::make_shared( + this->resources_.get(), + this->d, + cagraConfig_.intermediate_graph_degree, + cagraConfig_.graph_degree, + static_cast(cagraConfig_.build_algo), + cagraConfig_.nn_descent_niter, + this->metric_type, + this->metric_arg, + INDICES_64_BIT, + ivf_pq_params, + ivf_pq_search_params); + + index_->train(n, x); + + this->is_trained = true; + this->ntotal = n; +} + +bool GpuIndexCagra::addImplRequiresIDs_() const { + return false; +}; + +void GpuIndexCagra::addImpl_(idx_t n, const float* x, const idx_t* ids) { + FAISS_THROW_MSG("adding vectors is not supported by GpuIndexCagra."); +}; + +void GpuIndexCagra::searchImpl_( + idx_t n, + const float* x, + int k, + float* distances, + idx_t* labels, + const SearchParameters* search_params) const { + FAISS_ASSERT(this->is_trained && index_); + FAISS_ASSERT(n > 0); + + Tensor queries(const_cast(x), {n, this->d}); + Tensor outDistances(distances, {n, k}); + Tensor outLabels(const_cast(labels), {n, k}); + + SearchParametersCagra* params; + if (search_params) { + params = dynamic_cast( + const_cast(search_params)); + } else { + params = new SearchParametersCagra{}; + } + + index_->search( + queries, + k, + outDistances, + outLabels, + params->max_queries, + params->itopk_size, + params->max_iterations, + static_cast(params->algo), + params->team_size, + params->search_width, + params->min_iterations, + params->thread_block_size, + static_cast(params->hashmap_mode), + params->hashmap_min_bitlen, + params->hashmap_max_fill_rate, + params->num_random_samplings, + params->seed); + + if (not search_params) { + delete params; + } +} + +void GpuIndexCagra::copyFrom(const faiss::IndexHNSWCagra* index) { + FAISS_ASSERT(index); + + DeviceScope scope(config_.device); + + GpuIndex::copyFrom(index); + + auto 
base_index = dynamic_cast<IndexFlat*>(index->storage);
+ FAISS_ASSERT(base_index);
+ auto distances = base_index->get_xb();
+
+ auto hnsw = index->hnsw;
+ // copy level 0 to a dense knn graph matrix
+ std::vector<idx_t> knn_graph;
+ // resize (not reserve): the elements are written by index below,
+ // possibly from several OpenMP threads
+ knn_graph.resize(index->ntotal * hnsw.nb_neighbors(0));
+
+#pragma omp parallel for
+ for (size_t i = 0; i < index->ntotal; ++i) {
+ size_t begin, end;
+ hnsw.neighbor_range(i, 0, &begin, &end);
+ for (size_t j = begin; j < end; j++) {
+ knn_graph[i * hnsw.nb_neighbors(0) + (j - begin)] =
+ hnsw.neighbors[j];
+ }
+ }
+
+ index_ = std::make_shared<RaftCagra>(
+ this->resources_.get(),
+ this->d,
+ index->ntotal,
+ hnsw.nb_neighbors(0),
+ distances,
+ knn_graph.data(),
+ this->metric_type,
+ this->metric_arg,
+ INDICES_64_BIT);
+
+ this->is_trained = true;
+}
+
+void GpuIndexCagra::copyTo(faiss::IndexHNSWCagra* index) const {
+ FAISS_ASSERT(index_ && this->is_trained && index);
+
+ DeviceScope scope(config_.device);
+
+ //
+ // Index information
+ //
+ GpuIndex::copyTo(index);
+ // This needs to be zeroed out as this implementation adds vectors to the
+ // cpuIndex instead of copying fields
+ index->ntotal = 0;
+
+ auto graph_degree = index_->get_knngraph_degree();
+ auto M = graph_degree / 2;
+ if (index->storage and index->own_fields) {
+ delete index->storage;
+ }
+
+ if (this->metric_type == METRIC_L2) {
+ index->storage = new IndexFlatL2(index->d);
+ } else if (this->metric_type == METRIC_INNER_PRODUCT) {
+ index->storage = new IndexFlatIP(index->d);
+ }
+ index->own_fields = true;
+ index->keep_max_size_level0 = true;
+ index->hnsw.reset();
+ index->hnsw.assign_probas.clear();
+ index->hnsw.cum_nneighbor_per_level.clear();
+ index->hnsw.set_default_probas(M, 1.0 / log(M));
+
+ auto n_train = this->ntotal;
+ auto train_dataset = index_->get_training_dataset();
+
+ // turn off as level 0 is copied from CAGRA graph
+ index->init_level0 = false;
+ if (!index->base_level_only) {
+ index->add(n_train, train_dataset.data());
+ } else {
+ index->hnsw.prepare_level_tab(n_train, false);
+ index->storage->add(n_train, train_dataset.data());
+ index->ntotal = n_train;
+ }
+
+ auto graph = get_knngraph();
+
+#pragma omp parallel for
+ for (idx_t i = 0; i < n_train; i++) {
+ size_t begin, end;
+ index->hnsw.neighbor_range(i, 0, &begin, &end);
+ for (size_t j = begin; j < end; j++) {
+ index->hnsw.neighbors[j] = graph[i * graph_degree + (j - begin)];
+ }
+ }
+
+ // turn back on to allow new vectors to be added to level 0
+ index->init_level0 = true;
+}
+
+void GpuIndexCagra::reset() {
+ DeviceScope scope(config_.device);
+
+ if (index_) {
+ index_->reset();
+ this->ntotal = 0;
+ this->is_trained = false;
+ } else {
+ FAISS_ASSERT(this->ntotal == 0);
+ }
+}
+
+std::vector<idx_t> GpuIndexCagra::get_knngraph() const {
+ FAISS_ASSERT(index_ && this->is_trained);
+
+ return index_->get_knngraph();
+}
+
+} // namespace gpu
+} // namespace faiss
diff --git a/faiss/gpu/GpuIndexCagra.h b/faiss/gpu/GpuIndexCagra.h
new file mode 100644
index 0000000000..6ecee3ae03
--- /dev/null
+++ b/faiss/gpu/GpuIndexCagra.h
@@ -0,0 +1,282 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include
+#include
+#include
+
+namespace faiss {
+struct IndexHNSWCagra;
+}
+
+namespace faiss {
+namespace gpu {
+
+class RaftCagra;
+
+enum class graph_build_algo {
+ /// Use IVF-PQ to build all-neighbors knn graph
+ IVF_PQ,
+ /// Experimental, use NN-Descent to build all-neighbors knn graph
+ NN_DESCENT
+};
+
+/// A type for specifying how PQ codebooks are created.
+enum class codebook_gen { // NOLINT
+ PER_SUBSPACE = 0, // NOLINT
+ PER_CLUSTER = 1, // NOLINT
+};
+
+struct IVFPQBuildCagraConfig {
+ ///
+ /// The number of inverted lists (clusters)
+ ///
+ /// Hint: the number of vectors per cluster (`n_rows/n_lists`) should be
+ /// approximately 1,000 to 10,000.
+
+ uint32_t n_lists = 1024;
+ /// The number of iterations searching for kmeans centers (index building).
+ uint32_t kmeans_n_iters = 20;
+ /// The fraction of data to use during iterative kmeans building.
+ double kmeans_trainset_fraction = 0.5;
+ ///
+ /// The bit length of the vector element after compression by PQ.
+ ///
+ /// Possible values: [4, 5, 6, 7, 8].
+ ///
+ /// Hint: the smaller the 'pq_bits', the smaller the index size and the
+ /// better the search performance, but the lower the recall.
+
+ uint32_t pq_bits = 8;
+ ///
+ /// The dimensionality of the vector after compression by PQ. When zero, an
+ /// optimal value is selected using a heuristic.
+ ///
+ /// NB: `pq_dim * pq_bits` must be a multiple of 8. For example,
+ /// pq_dim = 10 with pq_bits = 6 gives 60 bits, which is invalid.
+ ///
+ /// Hint: a smaller 'pq_dim' results in a smaller index size and better
+ /// search performance, but lower recall. If 'pq_bits' is 8, 'pq_dim' can be
+ /// set to any number, but multiples of 8 are desirable for good
+ /// performance. If 'pq_bits' is not 8, 'pq_dim' should be a multiple of 8.
+ /// For good performance, it is desirable that 'pq_dim' is a multiple of 32.
+ /// Ideally, 'pq_dim' should also be a divisor of the dataset dim.
+
+ uint32_t pq_dim = 0;
+ /// How PQ codebooks are created.
+ codebook_gen codebook_kind = codebook_gen::PER_SUBSPACE;
+ ///
+ /// Apply a random rotation matrix on the input data and queries even if
+ /// `dim % pq_dim == 0`.
+ ///
+ /// Note: if `dim` is not a multiple of `pq_dim`, a random rotation is
+ /// always applied to the input data and queries to transform the working
+ /// space from `dim` to `rot_dim`, which may be slightly larger than the
+ /// original space and is a multiple of `pq_dim` (`rot_dim % pq_dim == 0`).
+ /// However, this transform is not necessary when `dim` is a multiple of
+ /// `pq_dim` (`dim == rot_dim`, hence no need to add "extra" data columns /
+ /// features).
+ ///
+ /// By default, if `dim == rot_dim`, the rotation transform is initialized
+ /// with the identity matrix. When `force_random_rotation == true`, a random
+ /// orthogonal transform matrix is generated regardless of the values of
+ /// `dim` and `pq_dim`.
+
+ bool force_random_rotation = false;
+ ///
+ /// By default, the algorithm allocates more space than necessary for
+ /// individual clusters (`list_data`). This allows amortizing the cost of
+ /// memory allocation and reducing the number of data copies during
+ /// repeated calls to `extend` (extending the database).
+ ///
+ /// The alternative is the conservative allocation behavior; when enabled,
+ /// the algorithm always allocates the minimum amount of memory required to
+ /// store the given number of records. Set this flag to `true` if you prefer
+ /// to use as little GPU memory for the database as possible.
+
+ bool conservative_memory_allocation = false;
+};
+
+struct IVFPQSearchCagraConfig {
+ /// The number of clusters to search.
+ uint32_t n_probes = 20;
+ ///
+ /// Data type of the look-up table to be created dynamically at search
+ /// time.
+ ///
+ /// Possible values: [CUDA_R_32F, CUDA_R_16F, CUDA_R_8U]
+ ///
+ /// The use of low-precision types reduces the amount of shared memory
+ /// required at search time, so fast shared memory kernels can be used even
+ /// for datasets with large dimensionality. Note that the recall is
+ /// slightly degraded when a low-precision type is selected.
+
+ cudaDataType_t lut_dtype = CUDA_R_32F;
+ ///
+ /// Storage data type for distance/similarity computed at search time.
+ ///
+ /// Possible values: [CUDA_R_16F, CUDA_R_32F]
+ ///
+ /// If the performance limiter at search time is device memory access,
+ /// selecting FP16 will improve performance slightly.
+
+ cudaDataType_t internal_distance_dtype = CUDA_R_32F;
+ ///
+ /// Preferred fraction of SM's unified memory / L1 cache to be used as
+ /// shared memory.
+ ///
+ /// Possible values: [0.0 - 1.0] as a fraction of the
+ /// `sharedMemPerMultiprocessor`.
+ ///
+ /// One wants to increase the carveout to ensure good GPU occupancy for the
+ /// main search kernel, but not to set it so high that no memory is left to
+ /// serve as L1 cache. Note that this value is interpreted only as a hint.
+ /// Moreover, a GPU usually allows only a fixed set of cache configurations,
+ /// so the provided value is rounded up to the nearest configuration. Refer
+ /// to the NVIDIA tuning guide for the target GPU architecture.
+ ///
+ /// Note that this is a low-level tuning parameter that can have drastic
+ /// negative effects on the search performance if tweaked incorrectly.
+
+ double preferred_shmem_carveout = 1.0;
+};
+
+struct GpuIndexCagraConfig : public GpuIndexConfig {
+ /// Degree of input graph for pruning.
+ size_t intermediate_graph_degree = 128;
+ /// Degree of output graph.
+ size_t graph_degree = 64;
+ /// ANN algorithm to build knn graph.
+ graph_build_algo build_algo = graph_build_algo::IVF_PQ;
+ /// Number of iterations to run if building with NN_DESCENT
+ size_t nn_descent_niter = 20;
+
+ IVFPQBuildCagraConfig* ivf_pq_params = nullptr;
+ IVFPQSearchCagraConfig* ivf_pq_search_params = nullptr;
+};
+
+enum class search_algo {
+ /// For large batch sizes.
+ SINGLE_CTA,
+ /// For small batch sizes.
+ MULTI_CTA,
+ MULTI_KERNEL,
+ AUTO
+};
+
+enum class hash_mode { HASH, SMALL, AUTO };
+
+struct SearchParametersCagra : SearchParameters {
+ /// Maximum number of queries to search at the same time (batch size). Auto
+ /// select when 0.
+ size_t max_queries = 0;
+
+ /// Number of intermediate search results retained during the search.
+ ///
+ /// This is the main knob for adjusting the trade-off between accuracy and
+ /// search speed. Higher values improve the search accuracy.
+
+ size_t itopk_size = 64;
+
+ /// Upper limit of search iterations. Auto select when 0.
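+ ///
+ /// A sketch of typical usage (values are illustrative; `gpu_cagra`,
+ /// `queries` etc. are hypothetical):
+ ///   faiss::gpu::SearchParametersCagra sp;
+ ///   sp.itopk_size = 128; // wider beam: better recall, slower search
+ ///   sp.max_queries = 0;  // auto batch size
+ ///   gpu_cagra.search(nq, queries, k, distances, labels, &sp);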
+ size_t max_iterations = 0; + + // In the following we list additional search parameters for fine tuning. + // Reasonable default values are automatically chosen. + + /// Which search implementation to use. + search_algo algo = search_algo::AUTO; + + /// Number of threads used to calculate a single distance. 4, 8, 16, or 32. + + size_t team_size = 0; + + /// Number of graph nodes to select as the starting point for the search in + /// each iteration. aka search width? + size_t search_width = 1; + /// Lower limit of search iterations. + size_t min_iterations = 0; + + /// Thread block size. 0, 64, 128, 256, 512, 1024. Auto selection when 0. + size_t thread_block_size = 0; + /// Hashmap type. Auto selection when AUTO. + hash_mode hashmap_mode = hash_mode::AUTO; + /// Lower limit of hashmap bit length. More than 8. + size_t hashmap_min_bitlen = 0; + /// Upper limit of hashmap fill rate. More than 0.1, less than 0.9. + float hashmap_max_fill_rate = 0.5; + + /// Number of iterations of initial random seed node selection. 1 or more. + + uint32_t num_random_samplings = 1; + /// Bit mask used for initial random seed node selection. + uint64_t seed = 0x128394; +}; + +struct GpuIndexCagra : public GpuIndex { + public: + GpuIndexCagra( + GpuResourcesProvider* provider, + int dims, + faiss::MetricType metric = faiss::METRIC_L2, + GpuIndexCagraConfig config = GpuIndexCagraConfig()); + + /// Trains CAGRA based on the given vector data + void train(idx_t n, const float* x) override; + + /// Initialize ourselves from the given CPU index; will overwrite + /// all data in ourselves + void copyFrom(const faiss::IndexHNSWCagra* index); + + /// Copy ourselves to the given CPU index; will overwrite all data + /// in the index instance + void copyTo(faiss::IndexHNSWCagra* index) const; + + void reset() override; + + std::vector get_knngraph() const; + + protected: + bool addImplRequiresIDs_() const override; + + void addImpl_(idx_t n, const float* x, const idx_t* ids) override; + + /// Called from GpuIndex for search + void searchImpl_( + idx_t n, + const float* x, + int k, + float* distances, + idx_t* labels, + const SearchParameters* search_params) const override; + + /// Our configuration options + const GpuIndexCagraConfig cagraConfig_; + + /// Instance that we own; contains the inverted lists + std::shared_ptr index_; +}; + +} // namespace gpu +} // namespace faiss diff --git a/faiss/gpu/GpuIndexIVF.cu b/faiss/gpu/GpuIndexIVF.cu index 0c5b8db686..40129a54c5 100644 --- a/faiss/gpu/GpuIndexIVF.cu +++ b/faiss/gpu/GpuIndexIVF.cu @@ -7,6 +7,7 @@ #include #include +#include #include #include #include @@ -172,10 +173,29 @@ void GpuIndexIVF::copyFrom(const faiss::IndexIVF* index) { // over to the GPU, on the same device that we are on. GpuResourcesProviderFromInstance pfi(getResources()); - GpuClonerOptions options; - auto cloner = ToGpuCloner(&pfi, getDevice(), options); - - quantizer = cloner.clone_Index(index->quantizer); + // Attempt to clone the index to GPU. If it fails because the coarse + // quantizer is not implemented on GPU and the flag to allow CPU + // fallback is set, retry it with CPU cloner and re-throw errors. 
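+ //
+ // Caller-side sketch of opting into the fallback (`res` and `cpu_ivf`
+ // are hypothetical; the flag lives in GpuIndexIVFConfig):
+ //   GpuIndexIVFFlatConfig cfg;
+ //   cfg.allowCpuCoarseQuantizer = true;
+ //   GpuIndexIVFFlat gpu_ivf(&res, cpu_ivf.d, cpu_ivf.nlist,
+ //           cpu_ivf.metric_type, cfg);
+ //   gpu_ivf.copyFrom(&cpu_ivf);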
+ try { + GpuClonerOptions options; + auto cloner = ToGpuCloner(&pfi, getDevice(), options); + quantizer = cloner.clone_Index(index->quantizer); + } catch (const std::exception& e) { + if (strstr(e.what(), "not implemented on GPU")) { + if (ivfConfig_.allowCpuCoarseQuantizer) { + Cloner cpuCloner; + quantizer = cpuCloner.clone_Index(index->quantizer); + } else { + FAISS_THROW_MSG( + "This index type is not implemented on " + "GPU and allowCpuCoarseQuantizer is set to false. " + "Please set the flag to true to allow the CPU " + "fallback in cloning."); + } + } else { + throw; + } + } own_fields = true; } else { // Otherwise, this is a GPU coarse quantizer index instance found in a diff --git a/faiss/gpu/GpuIndexIVF.h b/faiss/gpu/GpuIndexIVF.h index a9f092d35b..65a27aa94e 100644 --- a/faiss/gpu/GpuIndexIVF.h +++ b/faiss/gpu/GpuIndexIVF.h @@ -26,6 +26,12 @@ struct GpuIndexIVFConfig : public GpuIndexConfig { /// Configuration for the coarse quantizer object GpuIndexFlatConfig flatConfig; + + /// This flag controls the CPU fallback logic for the coarse quantizer + /// component of the index. When set to false (default), the cloner will + /// throw an exception for indices not implemented on GPU. When set to + /// true, it will fall back to a CPU implementation. + bool allowCpuCoarseQuantizer = false; }; /// Base class of all GPU IVF index types. This (for now) deliberately does not diff --git a/faiss/gpu/GpuIndexIVFFlat.cu b/faiss/gpu/GpuIndexIVFFlat.cu index 440b449a50..884b5b0fc0 100644 --- a/faiss/gpu/GpuIndexIVFFlat.cu +++ b/faiss/gpu/GpuIndexIVFFlat.cu @@ -356,5 +356,27 @@ void GpuIndexIVFFlat::setIndex_( } } +void GpuIndexIVFFlat::reconstruct_n(idx_t i0, idx_t ni, float* out) const { + FAISS_ASSERT(index_); + + if (ni == 0) { + // nothing to do + return; + } + + FAISS_THROW_IF_NOT_FMT( + i0 < this->ntotal, + "start index (%zu) out of bounds (ntotal %zu)", + i0, + this->ntotal); + FAISS_THROW_IF_NOT_FMT( + i0 + ni - 1 < this->ntotal, + "max index requested (%zu) out of bounds (ntotal %zu)", + i0 + ni - 1, + this->ntotal); + + index_->reconstruct_n(i0, ni, out); +} + } // namespace gpu } // namespace faiss diff --git a/faiss/gpu/GpuIndexIVFFlat.h b/faiss/gpu/GpuIndexIVFFlat.h index 678bf8e7f4..1401e2b291 100644 --- a/faiss/gpu/GpuIndexIVFFlat.h +++ b/faiss/gpu/GpuIndexIVFFlat.h @@ -87,6 +87,8 @@ class GpuIndexIVFFlat : public GpuIndexIVF { /// Trains the coarse quantizer based on the given vector data void train(idx_t n, const float* x) override; + void reconstruct_n(idx_t i0, idx_t n, float* out) const override; + protected: /// Initialize appropriate index void setIndex_( diff --git a/faiss/gpu/StandardGpuResources.cpp b/faiss/gpu/StandardGpuResources.cpp index 004f80a27e..78336b4994 100644 --- a/faiss/gpu/StandardGpuResources.cpp +++ b/faiss/gpu/StandardGpuResources.cpp @@ -257,6 +257,14 @@ void StandardGpuResourcesImpl::setDefaultStream( if (prevStream != stream) { streamWait({stream}, {prevStream}); } +#if defined USE_NVIDIA_RAFT + // delete the raft handle for this device, which will be initialized + // with the updated stream during any subsequent calls to getRaftHandle + auto it2 = raftHandles_.find(device); + if (it2 != raftHandles_.end()) { + raftHandles_.erase(it2); + } +#endif } userDefaultStreams_[device] = stream; @@ -275,6 +283,14 @@ void StandardGpuResourcesImpl::revertDefaultStream(int device) { streamWait({newStream}, {prevStream}); } +#if defined USE_NVIDIA_RAFT + // delete the raft handle for this device, which will be initialized + // with the updated stream during any
subsequent calls to getRaftHandle + auto it2 = raftHandles_.find(device); + if (it2 != raftHandles_.end()) { + raftHandles_.erase(it2); + } +#endif } userDefaultStreams_.erase(device); diff --git a/faiss/gpu/impl/IVFBase.cu b/faiss/gpu/impl/IVFBase.cu index 890d489440..3b373b8280 100644 --- a/faiss/gpu/impl/IVFBase.cu +++ b/faiss/gpu/impl/IVFBase.cu @@ -340,6 +340,10 @@ void IVFBase::copyInvertedListsTo(InvertedLists* ivf) { } } +void IVFBase::reconstruct_n(idx_t i0, idx_t n, float* out) { + FAISS_THROW_MSG("not implemented"); +} + void IVFBase::addEncodedVectorsToList_( idx_t listId, const void* codes, diff --git a/faiss/gpu/impl/IVFBase.cuh b/faiss/gpu/impl/IVFBase.cuh index 6b1f2ac394..04af9a906e 100644 --- a/faiss/gpu/impl/IVFBase.cuh +++ b/faiss/gpu/impl/IVFBase.cuh @@ -109,9 +109,18 @@ class IVFBase { Tensor& outIndices, bool storePairs) = 0; + /* Reconstruct a given number of vectors from an Inverted File (IVF) + * index. + * @param i0 index of the first vector to reconstruct + * @param n number of vectors to reconstruct + * @param out pointer to a buffer where the reconstructed vectors will + * be stored + */ + virtual void reconstruct_n(idx_t i0, idx_t n, float* out); + protected: - /// Adds a set of codes and indices to a list, with the representation - /// coming from the CPU equivalent + /// Adds a set of codes and indices to a list, with the + /// representation coming from the CPU equivalent virtual void addEncodedVectorsToList_( idx_t listId, // resident on the host diff --git a/faiss/gpu/impl/IVFFlat.cu b/faiss/gpu/impl/IVFFlat.cu index 4607e49870..e0ecfd82cf 100644 --- a/faiss/gpu/impl/IVFFlat.cu +++ b/faiss/gpu/impl/IVFFlat.cu @@ -283,6 +283,53 @@ void IVFFlat::searchPreassigned( storePairs); } +void IVFFlat::reconstruct_n(idx_t i0, idx_t ni, float* out) { + if (ni == 0) { + // nothing to do + return; + } + + auto stream = resources_->getDefaultStreamCurrentDevice(); + + for (idx_t list_no = 0; list_no < numLists_; list_no++) { + size_t list_size = deviceListData_[list_no]->numVecs; + + auto idlist = getListIndices(list_no); + + for (idx_t offset = 0; offset < list_size; offset++) { + idx_t id = idlist[offset]; + if (!(id >= i0 && id < i0 + ni)) { + continue; + } + + // vector data in the non-interleaved format is laid out like: + // v0d0 v0d1 ... v0d(dim-1) v1d0 v1d1 ... v1d(dim-1) + + // vector data in the interleaved format is laid out like: + // (v0d0 v1d0 ... v31d0) (v0d1 v1d1 ... v31d1) + // (v0d(dim - 1) ... v31d(dim-1)) + // (v32d0 v33d0 ... v63d0) (... v63d(dim-1)) (v64d0 ...)
+ + // where vectors are chunked into groups of 32, and each dimension + // for each of the 32 vectors is contiguous + + auto vectorChunk = offset / 32; + auto vectorWithinChunk = offset % 32; + + auto listDataPtr = (float*)deviceListData_[list_no]->data.data(); + listDataPtr += vectorChunk * 32 * dim_ + vectorWithinChunk; + + for (int d = 0; d < dim_; ++d) { + fromDevice( + listDataPtr + 32 * d, + out + (id - i0) * dim_ + d, + 1, + stream); + } + } + } +} + void IVFFlat::searchImpl_( Tensor& queries, Tensor& coarseDistances, diff --git a/faiss/gpu/impl/IVFFlat.cuh b/faiss/gpu/impl/IVFFlat.cuh index 246fc18b16..889b510795 100644 --- a/faiss/gpu/impl/IVFFlat.cuh +++ b/faiss/gpu/impl/IVFFlat.cuh @@ -51,6 +51,8 @@ class IVFFlat : public IVFBase { Tensor& outIndices, bool storePairs) override; + void reconstruct_n(idx_t i0, idx_t n, float* out) override; + protected: /// Returns the number of bytes in which an IVF list containing numVecs /// vectors is encoded on the device. Note that due to padding this is not diff --git a/faiss/gpu/impl/RaftCagra.cu b/faiss/gpu/impl/RaftCagra.cu new file mode 100644 index 0000000000..292079321d --- /dev/null +++ b/faiss/gpu/impl/RaftCagra.cu @@ -0,0 +1,371 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace faiss { +namespace gpu { + +RaftCagra::RaftCagra( + GpuResources* resources, + int dim, + idx_t intermediate_graph_degree, + idx_t graph_degree, + faiss::cagra_build_algo graph_build_algo, + size_t nn_descent_niter, + faiss::MetricType metric, + float metricArg, + IndicesOptions indicesOptions, + std::optional ivf_pq_params, + std::optional + ivf_pq_search_params) + : resources_(resources), + dim_(dim), + metric_(metric), + metricArg_(metricArg), + index_params_(), + ivf_pq_params_(ivf_pq_params), + ivf_pq_search_params_(ivf_pq_search_params) { + FAISS_THROW_IF_NOT_MSG( + metric == faiss::METRIC_L2 || metric == faiss::METRIC_INNER_PRODUCT, + "CAGRA currently only supports L2 or Inner Product metric."); + FAISS_THROW_IF_NOT_MSG( + indicesOptions == faiss::gpu::INDICES_64_BIT, + "only INDICES_64_BIT is supported for RAFT CAGRA index"); + + index_params_.intermediate_graph_degree = intermediate_graph_degree; + index_params_.graph_degree = graph_degree; + index_params_.build_algo = + static_cast( + graph_build_algo); + index_params_.nn_descent_niter = nn_descent_niter; + + if (!ivf_pq_params_) { + ivf_pq_params_ = + std::make_optional(); + } + if (!ivf_pq_search_params_) { + ivf_pq_search_params_ = + std::make_optional(); + } + index_params_.metric = metric_ == faiss::METRIC_L2 + ? 
raft::distance::DistanceType::L2Expanded + : raft::distance::DistanceType::InnerProduct; + ivf_pq_params_->metric = metric_ == faiss::METRIC_L2 + ? raft::distance::DistanceType::L2Expanded + : raft::distance::DistanceType::InnerProduct; + + reset(); +} + +RaftCagra::RaftCagra( + GpuResources* resources, + int dim, + idx_t n, + int graph_degree, + const float* distances, + const idx_t* knn_graph, + faiss::MetricType metric, + float metricArg, + IndicesOptions indicesOptions) + : resources_(resources), + dim_(dim), + metric_(metric), + metricArg_(metricArg) { + FAISS_THROW_IF_NOT_MSG( + metric == faiss::METRIC_L2 || metric == faiss::METRIC_INNER_PRODUCT, + "CAGRA currently only supports L2 or Inner Product metric."); + FAISS_THROW_IF_NOT_MSG( + indicesOptions == faiss::gpu::INDICES_64_BIT, + "only INDICES_64_BIT is supported for RAFT CAGRA index"); + + auto distances_on_gpu = getDeviceForAddress(distances) >= 0; + auto knn_graph_on_gpu = getDeviceForAddress(knn_graph) >= 0; + + FAISS_ASSERT(distances_on_gpu == knn_graph_on_gpu); + + const raft::device_resources& raft_handle = + resources_->getRaftHandleCurrentDevice(); + + if (distances_on_gpu && knn_graph_on_gpu) { + raft_handle.sync_stream(); + // Copying to host so that raft::neighbors::cagra::index + // creates an owning copy of the knn graph on device + auto knn_graph_copy = + raft::make_host_matrix(n, graph_degree); + thrust::copy( + thrust::device_ptr(knn_graph), + thrust::device_ptr(knn_graph + (n * graph_degree)), + knn_graph_copy.data_handle()); + + auto distances_mds = + raft::make_device_matrix_view( + distances, n, dim); + + raft_knn_index = raft::neighbors::cagra::index( + raft_handle, + metric_ == faiss::METRIC_L2 + ? raft::distance::DistanceType::L2Expanded + : raft::distance::DistanceType::InnerProduct, + distances_mds, + raft::make_const_mdspan(knn_graph_copy.view())); + } else if (!distances_on_gpu && !knn_graph_on_gpu) { + // copy idx_t (int64_t) host knn_graph to uint32_t host knn_graph + auto knn_graph_copy = + raft::make_host_matrix(n, graph_degree); + std::copy( + knn_graph, + knn_graph + (n * graph_degree), + knn_graph_copy.data_handle()); + + auto distances_mds = raft::make_host_matrix_view( + distances, n, dim); + + raft_knn_index = raft::neighbors::cagra::index( + raft_handle, + metric_ == faiss::METRIC_L2 + ? 
raft::distance::DistanceType::L2Expanded + : raft::distance::DistanceType::InnerProduct, + distances_mds, + raft::make_const_mdspan(knn_graph_copy.view())); + } else { + FAISS_THROW_MSG( + "distances and knn_graph must both be in device or host memory"); + } +} + +void RaftCagra::train(idx_t n, const float* x) { + const raft::device_resources& raft_handle = + resources_->getRaftHandleCurrentDevice(); + if (index_params_.build_algo == + raft::neighbors::cagra::graph_build_algo::IVF_PQ) { + std::optional> knn_graph( + raft::make_host_matrix( + n, index_params_.intermediate_graph_degree)); + if (getDeviceForAddress(x) >= 0) { + auto dataset_d = + raft::make_device_matrix_view( + x, n, dim_); + raft::neighbors::cagra::build_knn_graph( + raft_handle, + dataset_d, + knn_graph->view(), + 1.0f, + ivf_pq_params_, + ivf_pq_search_params_); + } else { + auto dataset_h = raft::make_host_matrix_view( + x, n, dim_); + raft::neighbors::cagra::build_knn_graph( + raft_handle, + dataset_h, + knn_graph->view(), + 1.0f, + ivf_pq_params_, + ivf_pq_search_params_); + } + auto cagra_graph = raft::make_host_matrix( + n, index_params_.graph_degree); + + raft::neighbors::cagra::optimize( + raft_handle, knn_graph->view(), cagra_graph.view()); + + // free intermediate graph before trying to create the index + knn_graph.reset(); + + if (getDeviceForAddress(x) >= 0) { + auto dataset_d = + raft::make_device_matrix_view( + x, n, dim_); + raft_knn_index = raft::neighbors::cagra::index( + raft_handle, + metric_ == faiss::METRIC_L2 + ? raft::distance::DistanceType::L2Expanded + : raft::distance::DistanceType::InnerProduct, + dataset_d, + raft::make_const_mdspan(cagra_graph.view())); + } else { + auto dataset_h = raft::make_host_matrix_view( + x, n, dim_); + raft_knn_index = raft::neighbors::cagra::index( + raft_handle, + metric_ == faiss::METRIC_L2 + ? 
raft::distance::DistanceType::L2Expanded + : raft::distance::DistanceType::InnerProduct, + dataset_h, + raft::make_const_mdspan(cagra_graph.view())); + } + + } else { + if (getDeviceForAddress(x) >= 0) { + raft_knn_index = raft::runtime::neighbors::cagra::build( + raft_handle, + index_params_, + raft::make_device_matrix_view( + x, n, dim_)); + } else { + raft_knn_index = raft::runtime::neighbors::cagra::build( + raft_handle, + index_params_, + raft::make_host_matrix_view( + x, n, dim_)); + } + } +} + +void RaftCagra::search( + Tensor& queries, + int k, + Tensor& outDistances, + Tensor& outIndices, + idx_t max_queries, + idx_t itopk_size, + idx_t max_iterations, + faiss::cagra_search_algo graph_search_algo, + idx_t team_size, + idx_t search_width, + idx_t min_iterations, + idx_t thread_block_size, + faiss::cagra_hash_mode hash_mode, + idx_t hashmap_min_bitlen, + float hashmap_max_fill_rate, + idx_t num_random_samplings, + idx_t rand_xor_mask) { + const raft::device_resources& raft_handle = + resources_->getRaftHandleCurrentDevice(); + idx_t numQueries = queries.getSize(0); + idx_t cols = queries.getSize(1); + idx_t k_ = k; + + FAISS_ASSERT(raft_knn_index.has_value()); + FAISS_ASSERT(numQueries > 0); + FAISS_ASSERT(cols == dim_); + + auto queries_view = raft::make_device_matrix_view( + queries.data(), numQueries, cols); + auto distances_view = raft::make_device_matrix_view( + outDistances.data(), numQueries, k_); + auto indices_view = raft::make_device_matrix_view( + outIndices.data(), numQueries, k_); + + raft::neighbors::cagra::search_params search_pams; + search_pams.max_queries = max_queries; + search_pams.itopk_size = itopk_size; + search_pams.max_iterations = max_iterations; + search_pams.algo = + static_cast(graph_search_algo); + search_pams.team_size = team_size; + search_pams.search_width = search_width; + search_pams.min_iterations = min_iterations; + search_pams.thread_block_size = thread_block_size; + search_pams.hashmap_mode = + static_cast(hash_mode); + search_pams.hashmap_min_bitlen = hashmap_min_bitlen; + search_pams.hashmap_max_fill_rate = hashmap_max_fill_rate; + search_pams.num_random_samplings = num_random_samplings; + search_pams.rand_xor_mask = rand_xor_mask; + + auto indices_copy = raft::make_device_matrix( + raft_handle, numQueries, k_); + + raft::runtime::neighbors::cagra::search( + raft_handle, + search_pams, + raft_knn_index.value(), + queries_view, + indices_copy.view(), + distances_view); + thrust::copy( + raft::resource::get_thrust_policy(raft_handle), + indices_copy.data_handle(), + indices_copy.data_handle() + indices_copy.size(), + indices_view.data_handle()); +} + +void RaftCagra::reset() { + raft_knn_index.reset(); +} + +idx_t RaftCagra::get_knngraph_degree() const { + FAISS_ASSERT(raft_knn_index.has_value()); + return static_cast(raft_knn_index.value().graph_degree()); +} + +std::vector RaftCagra::get_knngraph() const { + FAISS_ASSERT(raft_knn_index.has_value()); + const raft::device_resources& raft_handle = + resources_->getRaftHandleCurrentDevice(); + auto stream = raft_handle.get_stream(); + + auto device_graph = raft_knn_index.value().graph(); + + std::vector host_graph( + device_graph.extent(0) * device_graph.extent(1)); + + raft_handle.sync_stream(); + + thrust::copy( + thrust::device_ptr(device_graph.data_handle()), + thrust::device_ptr( + device_graph.data_handle() + device_graph.size()), + host_graph.data()); + + return host_graph; +} + +std::vector RaftCagra::get_training_dataset() const { + FAISS_ASSERT(raft_knn_index.has_value()); + const 
raft::device_resources& raft_handle = + resources_->getRaftHandleCurrentDevice(); + auto stream = raft_handle.get_stream(); + + auto device_dataset = raft_knn_index.value().dataset(); + + std::vector host_dataset( + device_dataset.extent(0) * device_dataset.extent(1)); + + RAFT_CUDA_TRY(cudaMemcpy2DAsync( + host_dataset.data(), + sizeof(float) * dim_, + device_dataset.data_handle(), + sizeof(float) * device_dataset.stride(0), + sizeof(float) * dim_, + device_dataset.extent(0), + cudaMemcpyDefault, + raft_handle.get_stream())); + raft_handle.sync_stream(); + + return host_dataset; +} + +} // namespace gpu +} // namespace faiss diff --git a/faiss/gpu/impl/RaftCagra.cuh b/faiss/gpu/impl/RaftCagra.cuh new file mode 100644 index 0000000000..95f6c03fca --- /dev/null +++ b/faiss/gpu/impl/RaftCagra.cuh @@ -0,0 +1,132 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include +#include + +#include + +#include +#include + +namespace faiss { + +/// Algorithm used to build underlying CAGRA graph +enum class cagra_build_algo { IVF_PQ, NN_DESCENT }; + +enum class cagra_search_algo { SINGLE_CTA, MULTI_CTA }; + +enum class cagra_hash_mode { HASH, SMALL, AUTO }; + +namespace gpu { + +class RaftCagra { + public: + RaftCagra( + GpuResources* resources, + int dim, + idx_t intermediate_graph_degree, + idx_t graph_degree, + faiss::cagra_build_algo graph_build_algo, + size_t nn_descent_niter, + faiss::MetricType metric, + float metricArg, + IndicesOptions indicesOptions, + std::optional ivf_pq_params = + std::nullopt, + std::optional + ivf_pq_search_params = std::nullopt); + + RaftCagra( + GpuResources* resources, + int dim, + idx_t n, + int graph_degree, + const float* distances, + const idx_t* knn_graph, + faiss::MetricType metric, + float metricArg, + IndicesOptions indicesOptions); + + ~RaftCagra() = default; + + void train(idx_t n, const float* x); + + void search( + Tensor& queries, + int k, + Tensor& outDistances, + Tensor& outIndices, + idx_t max_queries, + idx_t itopk_size, + idx_t max_iterations, + faiss::cagra_search_algo graph_search_algo, + idx_t team_size, + idx_t search_width, + idx_t min_iterations, + idx_t thread_block_size, + faiss::cagra_hash_mode hash_mode, + idx_t hashmap_min_bitlen, + float hashmap_max_fill_rate, + idx_t num_random_samplings, + idx_t rand_xor_mask); + + void reset(); + + idx_t get_knngraph_degree() const; + + std::vector get_knngraph() const; + + std::vector get_training_dataset() const; + + private: + /// Collection of GPU resources that we use + GpuResources* resources_; + + /// Expected dimensionality of the vectors + const int dim_; + + /// Metric type of the index + faiss::MetricType metric_; + + /// Metric arg + float metricArg_; + + /// Parameters to build RAFT CAGRA 
index + raft::neighbors::cagra::index_params index_params_; + + /// Parameters to build CAGRA graph using IVF PQ + std::optional ivf_pq_params_; + std::optional ivf_pq_search_params_; + + /// Instance of trained RAFT CAGRA index + std::optional> + raft_knn_index{std::nullopt}; +}; + +} // namespace gpu +} // namespace faiss diff --git a/faiss/gpu/impl/RaftIVFFlat.cu b/faiss/gpu/impl/RaftIVFFlat.cu index 1e310723d0..0906a60f46 100644 --- a/faiss/gpu/impl/RaftIVFFlat.cu +++ b/faiss/gpu/impl/RaftIVFFlat.cu @@ -403,7 +403,8 @@ void RaftIVFFlat::copyInvertedListsFrom(const InvertedLists* ivf) { } // Update the pointers and the sizes - raft_knn_index.value().recompute_internal_state(raft_handle); + raft::neighbors::ivf_flat::helpers::recompute_internal_state( + raft_handle, &(raft_knn_index.value())); for (size_t i = 0; i < nlist; ++i) { size_t listSize = ivf->list_size(i); diff --git a/faiss/gpu/perf/PerfClustering.cpp b/faiss/gpu/perf/PerfClustering.cpp index 0322f0e490..532557fe20 100644 --- a/faiss/gpu/perf/PerfClustering.cpp +++ b/faiss/gpu/perf/PerfClustering.cpp @@ -17,6 +17,7 @@ #include #include +#include DEFINE_int32(num, 10000, "# of vecs"); DEFINE_int32(k, 100, "# of clusters"); @@ -34,6 +35,7 @@ DEFINE_int64( "minimum size to use CPU -> GPU paged copies"); DEFINE_int64(pinned_mem, -1, "pinned memory allocation to use"); DEFINE_int32(max_points, -1, "max points per centroid"); +DEFINE_double(timeout, 0, "timeout in seconds"); using namespace faiss::gpu; @@ -99,10 +101,14 @@ int main(int argc, char** argv) { cp.max_points_per_centroid = FLAGS_max_points; } + auto tc = new faiss::TimeoutCallback(); + faiss::InterruptCallback::instance.reset(tc); + faiss::Clustering kmeans(FLAGS_dim, FLAGS_k, cp); // Time k-means { + tc->set_timeout(FLAGS_timeout); CpuTimer timer; kmeans.train(FLAGS_num, vecs.data(), *(gpuIndex.getIndex())); diff --git a/faiss/gpu/test/CMakeLists.txt b/faiss/gpu/test/CMakeLists.txt index 9300deead9..60f78ef74f 100644 --- a/faiss/gpu/test/CMakeLists.txt +++ b/faiss/gpu/test/CMakeLists.txt @@ -21,7 +21,6 @@ find_package(CUDAToolkit REQUIRED) # Defines `gtest_discover_tests()`. include(GoogleTest) - add_library(faiss_gpu_test_helper TestUtils.cpp) target_link_libraries(faiss_gpu_test_helper PUBLIC faiss gtest CUDA::cudart $<$:raft::raft> $<$:raft::compiled>) @@ -42,6 +41,9 @@ faiss_gpu_test(TestGpuIndexIVFPQ.cpp) faiss_gpu_test(TestGpuIndexIVFScalarQuantizer.cpp) faiss_gpu_test(TestGpuDistance.cu) faiss_gpu_test(TestGpuSelect.cu) +if(FAISS_ENABLE_RAFT) + faiss_gpu_test(TestGpuIndexCagra.cu) +endif() add_executable(demo_ivfpq_indexing_gpu EXCLUDE_FROM_ALL demo_ivfpq_indexing_gpu.cpp) diff --git a/faiss/gpu/test/TestGpuIndexCagra.cu b/faiss/gpu/test/TestGpuIndexCagra.cu new file mode 100644 index 0000000000..8d330a81cb --- /dev/null +++ b/faiss/gpu/test/TestGpuIndexCagra.cu @@ -0,0 +1,474 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +struct Options { + Options() { + numTrain = 2 * faiss::gpu::randVal(2000, 5000); + dim = faiss::gpu::randVal(4, 10); + numAdd = faiss::gpu::randVal(1000, 3000); + + graphDegree = faiss::gpu::randSelect({32, 64}); + intermediateGraphDegree = faiss::gpu::randSelect({64, 98}); + buildAlgo = faiss::gpu::randSelect( + {faiss::gpu::graph_build_algo::IVF_PQ, + faiss::gpu::graph_build_algo::NN_DESCENT}); + + numQuery = faiss::gpu::randVal(32, 100); + k = faiss::gpu::randVal(10, 30); + + device = faiss::gpu::randVal(0, faiss::gpu::getNumDevices() - 1); + } + + std::string toString() const { + std::stringstream str; + str << "CAGRA device " << device << " numVecs " << numTrain << " dim " + << dim << " graphDegree " << graphDegree + << " intermediateGraphDegree " << intermediateGraphDegree + << " buildAlgo " << static_cast(buildAlgo) << " numQuery " + << numQuery << " k " << k; + + return str.str(); + } + + int numTrain; + int numAdd; + int dim; + size_t graphDegree; + size_t intermediateGraphDegree; + faiss::gpu::graph_build_algo buildAlgo; + int numQuery; + int k; + int device; +}; + +void queryTest(faiss::MetricType metric, double expected_recall) { + for (int tries = 0; tries < 5; ++tries) { + Options opt; + if (opt.buildAlgo == faiss::gpu::graph_build_algo::NN_DESCENT && + metric == faiss::METRIC_INNER_PRODUCT) { + continue; + } + + std::vector trainVecs = + faiss::gpu::randVecs(opt.numTrain, opt.dim); + if (metric == faiss::METRIC_INNER_PRODUCT) { + faiss::fvec_renorm_L2(opt.numTrain, opt.dim, trainVecs.data()); + } + + // train cpu index + faiss::IndexHNSWFlat cpuIndex(opt.dim, opt.graphDegree / 2, metric); + cpuIndex.hnsw.efConstruction = opt.k * 2; + cpuIndex.add(opt.numTrain, trainVecs.data()); + + // train gpu index + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); + + faiss::gpu::GpuIndexCagraConfig config; + config.device = opt.device; + config.graph_degree = opt.graphDegree; + config.intermediate_graph_degree = opt.intermediateGraphDegree; + config.build_algo = opt.buildAlgo; + + faiss::gpu::GpuIndexCagra gpuIndex(&res, cpuIndex.d, metric, config); + gpuIndex.train(opt.numTrain, trainVecs.data()); + + // query + auto queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); + if (metric == faiss::METRIC_INNER_PRODUCT) { + faiss::fvec_renorm_L2(opt.numQuery, opt.dim, queryVecs.data()); + } + + std::vector refDistance(opt.numQuery * opt.k, 0); + std::vector refIndices(opt.numQuery * opt.k, -1); + faiss::SearchParametersHNSW cpuSearchParams; + cpuSearchParams.efSearch = opt.k * 2; + cpuIndex.search( + opt.numQuery, + queryVecs.data(), + opt.k, + refDistance.data(), + refIndices.data(), + &cpuSearchParams); + + // test quality of searches + auto gpuRes = res.getResources(); + auto devAlloc = faiss::gpu::makeDevAlloc( + faiss::gpu::AllocType::FlatData, + gpuRes->getDefaultStreamCurrentDevice()); + faiss::gpu::DeviceTensor testDistance( + gpuRes.get(), devAlloc, {opt.numQuery, opt.k}); + faiss::gpu::DeviceTensor
testIndices( + gpuRes.get(), devAlloc, {opt.numQuery, opt.k}); + gpuIndex.search( + opt.numQuery, + queryVecs.data(), + opt.k, + testDistance.data(), + testIndices.data()); + + auto refDistanceDev = faiss::gpu::toDeviceTemporary( + gpuRes.get(), + refDistance, + gpuRes->getDefaultStreamCurrentDevice()); + auto refIndicesDev = faiss::gpu::toDeviceTemporary( + gpuRes.get(), + refIndices, + gpuRes->getDefaultStreamCurrentDevice()); + + auto raft_handle = gpuRes->getRaftHandleCurrentDevice(); + + auto ref_dis_mds = raft::make_device_matrix_view( + refDistanceDev.data(), opt.numQuery, opt.k); + auto ref_dis_mds_opt = + std::optional>( + ref_dis_mds); + auto ref_ind_mds = + raft::make_device_matrix_view( + refIndicesDev.data(), opt.numQuery, opt.k); + + auto test_dis_mds = raft::make_device_matrix_view( + testDistance.data(), opt.numQuery, opt.k); + auto test_dis_mds_opt = + std::optional>( + test_dis_mds); + + auto test_ind_mds = + raft::make_device_matrix_view( + testIndices.data(), opt.numQuery, opt.k); + + double scalar_init = 0; + auto recall_score = raft::make_host_scalar(scalar_init); + + raft::stats::neighborhood_recall( + raft_handle, + test_ind_mds, + ref_ind_mds, + recall_score.view(), + test_dis_mds_opt, + ref_dis_mds_opt); + ASSERT_TRUE(*recall_score.data_handle() > expected_recall); + } +} + +TEST(TestGpuIndexCagra, Float32_Query_L2) { + queryTest(faiss::METRIC_L2, 0.98); +} + +TEST(TestGpuIndexCagra, Float32_Query_IP) { + queryTest(faiss::METRIC_INNER_PRODUCT, 0.98); +} + +void copyToTest( + faiss::MetricType metric, + double expected_recall, + bool base_level_only) { + for (int tries = 0; tries < 5; ++tries) { + Options opt; + if (opt.buildAlgo == faiss::gpu::graph_build_algo::NN_DESCENT && + metric == faiss::METRIC_INNER_PRODUCT) { + continue; + } + + std::vector trainVecs = + faiss::gpu::randVecs(opt.numTrain, opt.dim); + if (metric == faiss::METRIC_INNER_PRODUCT) { + faiss::fvec_renorm_L2(opt.numTrain, opt.dim, trainVecs.data()); + } + std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); + if (metric == faiss::METRIC_INNER_PRODUCT) { + faiss::fvec_renorm_L2(opt.numAdd, opt.dim, addVecs.data()); + } + + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); + + // train gpu index and copy to cpu index + faiss::gpu::GpuIndexCagraConfig config; + config.device = opt.device; + config.graph_degree = opt.graphDegree; + config.intermediate_graph_degree = opt.intermediateGraphDegree; + config.build_algo = opt.buildAlgo; + + faiss::gpu::GpuIndexCagra gpuIndex(&res, opt.dim, metric, config); + gpuIndex.train(opt.numTrain, trainVecs.data()); + + faiss::IndexHNSWCagra copiedCpuIndex( + opt.dim, opt.graphDegree / 2, metric); + copiedCpuIndex.base_level_only = base_level_only; + gpuIndex.copyTo(&copiedCpuIndex); + copiedCpuIndex.hnsw.efConstruction = opt.k * 2; + + // add more vecs to copied cpu index + if (!base_level_only) { + copiedCpuIndex.add(opt.numAdd, addVecs.data()); + } + + // train cpu index + faiss::IndexHNSWFlat cpuIndex(opt.dim, opt.graphDegree / 2, metric); + cpuIndex.hnsw.efConstruction = opt.k * 2; + cpuIndex.add(opt.numTrain, trainVecs.data()); + + // add more vecs to cpu index + if (!base_level_only) { + cpuIndex.add(opt.numAdd, addVecs.data()); + } + + // query indexes + auto queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); + if (metric == faiss::METRIC_INNER_PRODUCT) { + faiss::fvec_renorm_L2(opt.numQuery, opt.dim, queryVecs.data()); + } + + std::vector refDistance(opt.numQuery * opt.k, 0); + std::vector refIndices(opt.numQuery * opt.k, -1); + 
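// [Editor's note, not part of the patch] As in queryTest above, correctness
// is asserted via recall rather than exact equality of results: the CPU
// reference index and the index under test answer the same queries, and
// raft::stats::neighborhood_recall computes the fraction of reference
// neighbors that the test index also returned (the optional distance views
// allow equal-distance ties to count as matches). Roughly:
//
//     recall = matched_neighbors / (numQuery * k)
//
// and the ASSERT below then requires recall > expected_recall (0.98 in
// these tests).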
faiss::SearchParametersHNSW cpuSearchParams; + cpuSearchParams.efSearch = opt.k * 2; + cpuIndex.search( + opt.numQuery, + queryVecs.data(), + opt.k, + refDistance.data(), + refIndices.data(), + &cpuSearchParams); + + std::vector copyRefDistance(opt.numQuery * opt.k, 0); + std::vector copyRefIndices(opt.numQuery * opt.k, -1); + faiss::SearchParametersHNSW cpuSearchParamstwo; + cpuSearchParamstwo.efSearch = opt.k * 2; + copiedCpuIndex.search( + opt.numQuery, + queryVecs.data(), + opt.k, + copyRefDistance.data(), + copyRefIndices.data(), + &cpuSearchParamstwo); + + // test quality of search + auto gpuRes = res.getResources(); + + auto refDistanceDev = faiss::gpu::toDeviceTemporary( + gpuRes.get(), + refDistance, + gpuRes->getDefaultStreamCurrentDevice()); + auto refIndicesDev = faiss::gpu::toDeviceTemporary( + gpuRes.get(), + refIndices, + gpuRes->getDefaultStreamCurrentDevice()); + + auto copyRefDistanceDev = faiss::gpu::toDeviceTemporary( + gpuRes.get(), + copyRefDistance, + gpuRes->getDefaultStreamCurrentDevice()); + auto copyRefIndicesDev = faiss::gpu::toDeviceTemporary( + gpuRes.get(), + copyRefIndices, + gpuRes->getDefaultStreamCurrentDevice()); + + auto raft_handle = gpuRes->getRaftHandleCurrentDevice(); + + auto ref_dis_mds = raft::make_device_matrix_view( + refDistanceDev.data(), opt.numQuery, opt.k); + auto ref_dis_mds_opt = + std::optional>( + ref_dis_mds); + auto ref_ind_mds = + raft::make_device_matrix_view( + refIndicesDev.data(), opt.numQuery, opt.k); + + auto copy_ref_dis_mds = raft::make_device_matrix_view( + copyRefDistanceDev.data(), opt.numQuery, opt.k); + auto copy_ref_dis_mds_opt = + std::optional>( + copy_ref_dis_mds); + auto copy_ref_ind_mds = + raft::make_device_matrix_view( + copyRefIndicesDev.data(), opt.numQuery, opt.k); + + double scalar_init = 0; + auto recall_score = raft::make_host_scalar(scalar_init); + + raft::stats::neighborhood_recall( + raft_handle, + copy_ref_ind_mds, + ref_ind_mds, + recall_score.view(), + copy_ref_dis_mds_opt, + ref_dis_mds_opt); + ASSERT_TRUE(*recall_score.data_handle() > expected_recall); + } +} + +TEST(TestGpuIndexCagra, Float32_CopyTo_L2) { + copyToTest(faiss::METRIC_L2, 0.98, false); +} + +TEST(TestGpuIndexCagra, Float32_CopyTo_L2_BaseLevelOnly) { + copyToTest(faiss::METRIC_L2, 0.98, true); +} + +TEST(TestGpuIndexCagra, Float32_CopyTo_IP) { + copyToTest(faiss::METRIC_INNER_PRODUCT, 0.98, false); +} + +TEST(TestGpuIndexCagra, Float32_CopyTo_IP_BaseLevelOnly) { + copyToTest(faiss::METRIC_INNER_PRODUCT, 0.98, true); +} + +void copyFromTest(faiss::MetricType metric, double expected_recall) { + for (int tries = 0; tries < 5; ++tries) { + Options opt; + if (opt.buildAlgo == faiss::gpu::graph_build_algo::NN_DESCENT && + metric == faiss::METRIC_INNER_PRODUCT) { + continue; + } + + std::vector trainVecs = + faiss::gpu::randVecs(opt.numTrain, opt.dim); + if (metric == faiss::METRIC_INNER_PRODUCT) { + faiss::fvec_renorm_L2(opt.numTrain, opt.dim, trainVecs.data()); + } + + // train cpu index + faiss::IndexHNSWCagra cpuIndex(opt.dim, opt.graphDegree / 2, metric); + cpuIndex.hnsw.efConstruction = opt.k * 2; + cpuIndex.add(opt.numTrain, trainVecs.data()); + + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); + + // convert to gpu index + faiss::gpu::GpuIndexCagra copiedGpuIndex(&res, cpuIndex.d, metric); + copiedGpuIndex.copyFrom(&cpuIndex); + + // train gpu index + faiss::gpu::GpuIndexCagraConfig config; + config.device = opt.device; + config.graph_degree = opt.graphDegree; + config.intermediate_graph_degree = 
opt.intermediateGraphDegree; + config.build_algo = opt.buildAlgo; + + faiss::gpu::GpuIndexCagra gpuIndex(&res, opt.dim, metric, config); + gpuIndex.train(opt.numTrain, trainVecs.data()); + + // query + auto queryVecs = faiss::gpu::randVecs(opt.numQuery, opt.dim); + if (metric == faiss::METRIC_INNER_PRODUCT) { + faiss::fvec_renorm_L2(opt.numQuery, opt.dim, queryVecs.data()); + } + + auto gpuRes = res.getResources(); + auto devAlloc = faiss::gpu::makeDevAlloc( + faiss::gpu::AllocType::FlatData, + gpuRes->getDefaultStreamCurrentDevice()); + faiss::gpu::DeviceTensor copyTestDistance( + gpuRes.get(), devAlloc, {opt.numQuery, opt.k}); + faiss::gpu::DeviceTensor copyTestIndices( + gpuRes.get(), devAlloc, {opt.numQuery, opt.k}); + copiedGpuIndex.search( + opt.numQuery, + queryVecs.data(), + opt.k, + copyTestDistance.data(), + copyTestIndices.data()); + + faiss::gpu::DeviceTensor testDistance( + gpuRes.get(), devAlloc, {opt.numQuery, opt.k}); + faiss::gpu::DeviceTensor testIndices( + gpuRes.get(), devAlloc, {opt.numQuery, opt.k}); + gpuIndex.search( + opt.numQuery, + queryVecs.data(), + opt.k, + testDistance.data(), + testIndices.data()); + + // test quality of searches + auto raft_handle = gpuRes->getRaftHandleCurrentDevice(); + + auto test_dis_mds = raft::make_device_matrix_view( + testDistance.data(), opt.numQuery, opt.k); + auto test_dis_mds_opt = + std::optional>( + test_dis_mds); + + auto test_ind_mds = + raft::make_device_matrix_view( + testIndices.data(), opt.numQuery, opt.k); + + auto copy_test_dis_mds = + raft::make_device_matrix_view( + copyTestDistance.data(), opt.numQuery, opt.k); + auto copy_test_dis_mds_opt = + std::optional>( + copy_test_dis_mds); + + auto copy_test_ind_mds = + raft::make_device_matrix_view( + copyTestIndices.data(), opt.numQuery, opt.k); + + double scalar_init = 0; + auto recall_score = raft::make_host_scalar(scalar_init); + + raft::stats::neighborhood_recall( + raft_handle, + copy_test_ind_mds, + test_ind_mds, + recall_score.view(), + copy_test_dis_mds_opt, + test_dis_mds_opt); + ASSERT_TRUE(*recall_score.data_handle() > expected_recall); + } +} + +TEST(TestGpuIndexCagra, Float32_CopyFrom_L2) { + copyFromTest(faiss::METRIC_L2, 0.98); +} + +TEST(TestGpuIndexCagra, Float32_CopyFrom_IP) { + copyFromTest(faiss::METRIC_INNER_PRODUCT, 0.98); +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + + // just run with a fixed test seed + faiss::gpu::setTestSeed(100); + + return RUN_ALL_TESTS(); +} diff --git a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp index 6e423e582e..28eefec308 100644 --- a/faiss/gpu/test/TestGpuIndexIVFFlat.cpp +++ b/faiss/gpu/test/TestGpuIndexIVFFlat.cpp @@ -842,6 +842,71 @@ TEST(TestGpuIndexIVFFlat, LongIVFList) { #endif } +TEST(TestGpuIndexIVFFlat, Reconstruct_n) { + Options opt; + + std::vector trainVecs = faiss::gpu::randVecs(opt.numTrain, opt.dim); + std::vector addVecs = faiss::gpu::randVecs(opt.numAdd, opt.dim); + + faiss::IndexFlatL2 cpuQuantizer(opt.dim); + faiss::IndexIVFFlat cpuIndex( + &cpuQuantizer, opt.dim, opt.numCentroids, faiss::METRIC_L2); + cpuIndex.nprobe = opt.nprobe; + cpuIndex.train(opt.numTrain, trainVecs.data()); + cpuIndex.add(opt.numAdd, addVecs.data()); + + faiss::gpu::StandardGpuResources res; + res.noTempMemory(); + + faiss::gpu::GpuIndexIVFFlatConfig config; + config.device = opt.device; + config.indicesOptions = faiss::gpu::INDICES_64_BIT; + config.use_raft = false; + + faiss::gpu::GpuIndexIVFFlat gpuIndex( + &res, opt.dim, opt.numCentroids, 
faiss::METRIC_L2, config); + gpuIndex.nprobe = opt.nprobe; + + gpuIndex.train(opt.numTrain, trainVecs.data()); + gpuIndex.add(opt.numAdd, addVecs.data()); + + std::vector gpuVals(opt.numAdd * opt.dim); + + gpuIndex.reconstruct_n(0, gpuIndex.ntotal, gpuVals.data()); + + std::vector cpuVals(opt.numAdd * opt.dim); + + cpuIndex.reconstruct_n(0, cpuIndex.ntotal, cpuVals.data()); + + EXPECT_EQ(gpuVals, cpuVals); + + config.indicesOptions = faiss::gpu::INDICES_32_BIT; + + faiss::gpu::GpuIndexIVFFlat gpuIndex1( + &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); + gpuIndex1.nprobe = opt.nprobe; + + gpuIndex1.train(opt.numTrain, trainVecs.data()); + gpuIndex1.add(opt.numAdd, addVecs.data()); + + gpuIndex1.reconstruct_n(0, gpuIndex1.ntotal, gpuVals.data()); + + EXPECT_EQ(gpuVals, cpuVals); + + config.indicesOptions = faiss::gpu::INDICES_CPU; + + faiss::gpu::GpuIndexIVFFlat gpuIndex2( + &res, opt.dim, opt.numCentroids, faiss::METRIC_L2, config); + gpuIndex2.nprobe = opt.nprobe; + + gpuIndex2.train(opt.numTrain, trainVecs.data()); + gpuIndex2.add(opt.numAdd, addVecs.data()); + + gpuIndex2.reconstruct_n(0, gpuIndex2.ntotal, gpuVals.data()); + + EXPECT_EQ(gpuVals, cpuVals); +} + int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); diff --git a/faiss/gpu/test/test_cagra.py b/faiss/gpu/test/test_cagra.py new file mode 100644 index 0000000000..4c7e532c2b --- /dev/null +++ b/faiss/gpu/test/test_cagra.py @@ -0,0 +1,71 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import unittest + +import faiss + +from faiss.contrib import datasets, evaluation + + +@unittest.skipIf( + "RAFT" not in faiss.get_compile_options(), + "only if RAFT is compiled in") +class TestComputeGT(unittest.TestCase): + + def do_compute_GT(self, metric): + d = 64 + k = 12 + ds = datasets.SyntheticDataset(d, 0, 10000, 100) + Dref, Iref = faiss.knn(ds.get_queries(), ds.get_database(), k, metric) + + res = faiss.StandardGpuResources() + + index = faiss.GpuIndexCagra(res, d, metric) + index.train(ds.get_database()) + Dnew, Inew = index.search(ds.get_queries(), k) + + evaluation.check_ref_knn_with_draws(Dref, Iref, Dnew, Inew, k) + + def test_compute_GT_L2(self): + self.do_compute_GT(faiss.METRIC_L2) + + def test_compute_GT_IP(self): + self.do_compute_GT(faiss.METRIC_INNER_PRODUCT) + +@unittest.skipIf( + "RAFT" not in faiss.get_compile_options(), + "only if RAFT is compiled in") +class TestInterop(unittest.TestCase): + + def do_interop(self, metric): + d = 64 + k = 12 + ds = datasets.SyntheticDataset(d, 0, 10000, 100) + + res = faiss.StandardGpuResources() + + index = faiss.GpuIndexCagra(res, d, metric) + index.train(ds.get_database()) + Dnew, Inew = index.search(ds.get_queries(), k) + + cpu_index = faiss.index_gpu_to_cpu(index) + Dref, Iref = cpu_index.search(ds.get_queries(), k) + + evaluation.check_ref_knn_with_draws(Dref, Iref, Dnew, Inew, k) + + deserialized_index = faiss.deserialize_index( + faiss.serialize_index(cpu_index)) + + gpu_index = faiss.index_cpu_to_gpu(res, 0, deserialized_index) + Dnew2, Inew2 = gpu_index.search(ds.get_queries(), k) + + evaluation.check_ref_knn_with_draws(Dnew2, Inew2, Dnew, Inew, k) + + def test_interop_L2(self): + self.do_interop(faiss.METRIC_L2) + + def test_interop_IP(self): + self.do_interop(faiss.METRIC_INNER_PRODUCT) diff --git a/faiss/gpu/test/test_gpu_basics.py b/faiss/gpu/test/test_gpu_basics.py index f3f0a525d4..4b4024d236 100755 --- 
a/faiss/gpu/test/test_gpu_basics.py +++ b/faiss/gpu/test/test_gpu_basics.py @@ -11,6 +11,7 @@ import random from common_faiss_tests import get_dataset_2 + class ReferencedObject(unittest.TestCase): d = 16 diff --git a/faiss/gpu/test/test_gpu_index.py b/faiss/gpu/test/test_gpu_index.py index 620bfea198..28572ebcb4 100755 --- a/faiss/gpu/test/test_gpu_index.py +++ b/faiss/gpu/test/test_gpu_index.py @@ -589,7 +589,10 @@ class TestGpuAutoTune(unittest.TestCase): def test_params(self): index = faiss.index_factory(32, "IVF65536_HNSW,PQ16") - index = faiss.index_cpu_to_gpu(faiss.StandardGpuResources(), 0, index) + res = faiss.StandardGpuResources() + options = faiss.GpuClonerOptions() + options.allowCpuCoarseQuantizer = True + index = faiss.index_cpu_to_gpu(res, 0, index, options) ps = faiss.GpuParameterSpace() ps.initialize(index) for i in range(ps.parameter_ranges.size()): diff --git a/faiss/gpu/test/test_gpu_index_ivfflat.py b/faiss/gpu/test/test_gpu_index_ivfflat.py new file mode 100644 index 0000000000..099615aff5 --- /dev/null +++ b/faiss/gpu/test/test_gpu_index_ivfflat.py @@ -0,0 +1,25 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import unittest + +import faiss +import numpy as np + + +class TestGpuIndexIvfflat(unittest.TestCase): + def test_reconstruct_n(self): + index = faiss.index_factory(4, "IVF10,Flat") + x = np.random.RandomState(123).rand(10, 4).astype('float32') + index.train(x) + index.add(x) + res = faiss.StandardGpuResources() + res.noTempMemory() + config = faiss.GpuIndexIVFFlatConfig() + config.use_raft = False + index2 = faiss.GpuIndexIVFFlat(res, index, config) + recons = index2.reconstruct_n(0, 10) + + np.testing.assert_array_equal(recons, x) diff --git a/faiss/gpu/test/test_index_cpu_to_gpu.py b/faiss/gpu/test/test_index_cpu_to_gpu.py new file mode 100644 index 0000000000..088ea2bf74 --- /dev/null +++ b/faiss/gpu/test/test_index_cpu_to_gpu.py @@ -0,0 +1,89 @@ +import numpy as np +import unittest +import faiss + + +class TestMoveToGpu(unittest.TestCase): + + @classmethod + def setUpClass(cls): + cls.res = faiss.StandardGpuResources() + + def create_index(self, factory_string): + dimension = 128 + n = 2500 + db_vectors = np.random.random((n, dimension)).astype('float32') + index = faiss.index_factory(dimension, factory_string) + index.train(db_vectors) + if factory_string.startswith("IDMap"): + index.add_with_ids(db_vectors, np.arange(n)) + else: + index.add(db_vectors) + return index + + def create_and_clone(self, factory_string, + allowCpuCoarseQuantizer=None, + use_raft=None): + idx = self.create_index(factory_string) + config = faiss.GpuClonerOptions() + if allowCpuCoarseQuantizer is not None: + config.allowCpuCoarseQuantizer = allowCpuCoarseQuantizer + if use_raft is not None: + config.use_raft = use_raft + faiss.index_cpu_to_gpu(self.res, 0, idx, config) + + def verify_throws_not_implemented_exception(self, factory_string): + try: + self.create_and_clone(factory_string) + except Exception as e: + if "not implemented" not in str(e): + self.fail("Expected an exception but no exception was " + "thrown for factory_string: %s." 
% factory_string) + + def verify_clones_successfully(self, factory_string, + allowCpuCoarseQuantizer=None, + use_raft=None): + try: + self.create_and_clone( + factory_string, + allowCpuCoarseQuantizer=allowCpuCoarseQuantizer, + use_raft=use_raft) + except Exception as e: + self.fail("Unexpected exception thrown for factory_string: " + "%s; error message: %s." % (factory_string, str(e))) + + def test_not_implemented_indices(self): + self.verify_throws_not_implemented_exception("PQ16") + self.verify_throws_not_implemented_exception("LSHrt") + self.verify_throws_not_implemented_exception("HNSW") + self.verify_throws_not_implemented_exception("HNSW,PQ16") + self.verify_throws_not_implemented_exception("IDMap,PQ16") + self.verify_throws_not_implemented_exception("IVF256,ITQ64,SH1.2") + + def test_implemented_indices(self): + self.verify_clones_successfully("Flat") + self.verify_clones_successfully("IVF1,Flat") + self.verify_clones_successfully("IVF32,PQ8") + self.verify_clones_successfully("IDMap,Flat") + self.verify_clones_successfully("PCA12,IVF32,Flat") + self.verify_clones_successfully("PCA32,IVF32,PQ8") + self.verify_clones_successfully("PCA32,IVF32,PQ8np") + + # set use_raft to False; these index types are not supported on RAFT + self.verify_clones_successfully("IVF32,SQ8", use_raft=False) + self.verify_clones_successfully( + "PCA32,IVF32,SQ8", use_raft=False) + + def test_with_flag(self): + self.verify_clones_successfully("IVF32_HNSW,Flat", + allowCpuCoarseQuantizer=True) + self.verify_clones_successfully("IVF256(PQ2x4fs),Flat", + allowCpuCoarseQuantizer=True) + + def test_with_flag_set_to_false(self): + try: + self.verify_clones_successfully("IVF32_HNSW,Flat", + allowCpuCoarseQuantizer=False) + except Exception as e: + if "set the flag to true to allow the CPU fallback" not in str(e): + self.fail("Unexpected error message thrown: %s."
% str(e)) diff --git a/faiss/gpu/test/torch_test_contrib_gpu.py b/faiss/gpu/test/torch_test_contrib_gpu.py index 1510b10f1d..f7444337f1 100644 --- a/faiss/gpu/test/torch_test_contrib_gpu.py +++ b/faiss/gpu/test/torch_test_contrib_gpu.py @@ -108,7 +108,7 @@ def test_train_add_with_ids(self): self.assertTrue(np.array_equal(I.reshape(10), ids_np[10:20])) # tests reconstruct, reconstruct_n - def test_reconstruct(self): + def test_flat_reconstruct(self): d = 32 res = faiss.StandardGpuResources() res.noTempMemory() @@ -157,6 +157,40 @@ def test_reconstruct(self): index.reconstruct_n(50, 10, y) self.assertTrue(torch.equal(xb[50:60], y)) + def test_ivfflat_reconstruct(self): + d = 32 + nlist = 5 + res = faiss.StandardGpuResources() + res.noTempMemory() + config = faiss.GpuIndexIVFFlatConfig() + config.use_raft = False + + index = faiss.GpuIndexIVFFlat(res, d, nlist, faiss.METRIC_L2, config) + + xb = torch.rand(100, d, device=torch.device('cuda', 0), dtype=torch.float32) + index.train(xb) + index.add(xb) + + # Test reconstruct_n with torch gpu (native return) + y = index.reconstruct_n(10, 10) + self.assertTrue(y.is_cuda) + self.assertTrue(torch.equal(xb[10:20], y)) + + # Test reconstruct with numpy output provided + y = np.empty((10, d), dtype='float32') + index.reconstruct_n(20, 10, y) + self.assertTrue(np.array_equal(xb.cpu().numpy()[20:30], y)) + + # Test reconstruct_n with torch cpu output provided + y = torch.empty(10, d, dtype=torch.float32) + index.reconstruct_n(40, 10, y) + self.assertTrue(torch.equal(xb[40:50].cpu(), y)) + + # Test reconstruct_n with torch gpu output provided + y = torch.empty(10, d, device=torch.device('cuda', 0), dtype=torch.float32) + index.reconstruct_n(50, 10, y) + self.assertTrue(torch.equal(xb[50:60], y)) + # tests assign def test_assign(self): d = 32 @@ -215,7 +249,7 @@ def test_sa_encode_decode(self): return class TestTorchUtilsKnnGpu(unittest.TestCase): - def test_knn_gpu(self): + def test_knn_gpu(self, use_raft=False): torch.manual_seed(10) d = 32 nb = 1024 @@ -252,7 +286,7 @@ def test_knn_gpu(self): else: xb_c = xb_np - D, I = faiss.knn_gpu(res, xq_c, xb_c, k) + D, I = faiss.knn_gpu(res, xq_c, xb_c, k, use_raft=use_raft) self.assertTrue(torch.equal(torch.from_numpy(I), gt_I)) self.assertLess((torch.from_numpy(D) - gt_D).abs().max(), 1e-4) @@ -278,7 +312,7 @@ def test_knn_gpu(self): xb_c = to_column_major_torch(xb) assert not xb_c.is_contiguous() - D, I = faiss.knn_gpu(res, xq_c, xb_c, k) + D, I = faiss.knn_gpu(res, xq_c, xb_c, k, use_raft=use_raft) self.assertTrue(torch.equal(I.cpu(), gt_I)) self.assertLess((D.cpu() - gt_D).abs().max(), 1e-4) @@ -286,7 +320,7 @@ def test_knn_gpu(self): # test on subset try: # This internally uses the current pytorch stream - D, I = faiss.knn_gpu(res, xq_c[6:8], xb_c, k) + D, I = faiss.knn_gpu(res, xq_c[6:8], xb_c, k, use_raft=use_raft) except TypeError: if not xq_row_major: # then it is expected @@ -297,7 +331,13 @@ def test_knn_gpu(self): self.assertTrue(torch.equal(I.cpu(), gt_I[6:8])) self.assertLess((D.cpu() - gt_D[6:8]).abs().max(), 1e-4) - def test_knn_gpu_datatypes(self): + @unittest.skipUnless( + "RAFT" in faiss.get_compile_options(), + "only if RAFT is compiled in") + def test_knn_gpu_raft(self): + self.test_knn_gpu(use_raft=True) + + def test_knn_gpu_datatypes(self, use_raft=False): torch.manual_seed(10) d = 10 nb = 1024 @@ -320,7 +360,7 @@ def test_knn_gpu_datatypes(self): D = torch.zeros(nq, k, device=xb_c.device, dtype=torch.float32) I = torch.zeros(nq, k, device=xb_c.device, dtype=torch.int32) - faiss.knn_gpu(res, 
xq_c, xb_c, k, D, I) + faiss.knn_gpu(res, xq_c, xb_c, k, D, I, use_raft=use_raft) self.assertTrue(torch.equal(I.long().cpu(), gt_I)) self.assertLess((D.float().cpu() - gt_D).abs().max(), 1.5e-3) @@ -332,7 +372,7 @@ def test_knn_gpu_datatypes(self): xb_c = xb.half().numpy() xq_c = xq.half().numpy() - faiss.knn_gpu(res, xq_c, xb_c, k, D, I) + faiss.knn_gpu(res, xq_c, xb_c, k, D, I, use_raft=use_raft) self.assertTrue(torch.equal(torch.from_numpy(I).long(), gt_I)) self.assertLess((torch.from_numpy(D) - gt_D).abs().max(), 1.5e-3) diff --git a/faiss/gpu/utils/DeviceVector.cuh b/faiss/gpu/utils/DeviceVector.cuh index 0517d06c32..51cb7c8d37 100644 --- a/faiss/gpu/utils/DeviceVector.cuh +++ b/faiss/gpu/utils/DeviceVector.cuh @@ -169,6 +169,8 @@ class DeviceVector { T out; CUDA_VERIFY(cudaMemcpyAsync( &out, data() + idx, sizeof(T), cudaMemcpyDeviceToHost, stream)); + + return out; } // Clean up after oversized allocations, while leaving some space to diff --git a/faiss/gpu/utils/Tensor.cuh b/faiss/gpu/utils/Tensor.cuh index b13d0e1496..0fbb2417b3 100644 --- a/faiss/gpu/utils/Tensor.cuh +++ b/faiss/gpu/utils/Tensor.cuh @@ -232,13 +232,12 @@ class Tensor { } /// Returns a read/write view of a portion of our tensor. - __host__ __device__ inline detail::SubTensor - operator[](IndexT); + __host__ __device__ inline detail:: + SubTensor operator[](IndexT); /// Returns a read/write view of a portion of our tensor (const). __host__ __device__ inline const detail:: - SubTensor - operator[](IndexT) const; + SubTensor operator[](IndexT) const; /// Returns the size of a given dimension, `[0, Dim - 1]`. No bounds /// checking. diff --git a/faiss/impl/AuxIndexStructures.cpp b/faiss/impl/AuxIndexStructures.cpp index cebe8a1e23..e2b2791e55 100644 --- a/faiss/impl/AuxIndexStructures.cpp +++ b/faiss/impl/AuxIndexStructures.cpp @@ -236,4 +236,29 @@ size_t InterruptCallback::get_period_hint(size_t flops) { return std::max((size_t)10 * 10 * 1000 * 1000 / (flops + 1), (size_t)1); } +void TimeoutCallback::set_timeout(double timeout_in_seconds) { + timeout = timeout_in_seconds; + start = std::chrono::steady_clock::now(); +} + +bool TimeoutCallback::want_interrupt() { + if (timeout == 0) { + return false; + } + auto end = std::chrono::steady_clock::now(); + std::chrono::duration duration = end - start; + float elapsed_in_seconds = duration.count() / 1000.0; + if (elapsed_in_seconds > timeout) { + timeout = 0; + return true; + } + return false; +} + +void TimeoutCallback::reset(double timeout_in_seconds) { + auto tc(new faiss::TimeoutCallback()); + faiss::InterruptCallback::instance.reset(tc); + tc->set_timeout(timeout_in_seconds); +} + } // namespace faiss diff --git a/faiss/impl/AuxIndexStructures.h b/faiss/impl/AuxIndexStructures.h index f8b5cca842..7e12a1a3af 100644 --- a/faiss/impl/AuxIndexStructures.h +++ b/faiss/impl/AuxIndexStructures.h @@ -161,6 +161,14 @@ struct FAISS_API InterruptCallback { static size_t get_period_hint(size_t flops); }; +struct TimeoutCallback : InterruptCallback { + std::chrono::time_point start; + double timeout; + bool want_interrupt() override; + void set_timeout(double timeout_in_seconds); + static void reset(double timeout_in_seconds); +}; + /// set implementation optimized for fast access. 
struct VisitedTable { std::vector visited; diff --git a/faiss/impl/DistanceComputer.h b/faiss/impl/DistanceComputer.h index dc46d113fb..5ac3a702c9 100644 --- a/faiss/impl/DistanceComputer.h +++ b/faiss/impl/DistanceComputer.h @@ -59,6 +59,52 @@ struct DistanceComputer { virtual ~DistanceComputer() {} }; +/* Wrap the distance computer into one that negates the + distances. This makes supporting INNER_PRODUCT search easier */ + +struct NegativeDistanceComputer : DistanceComputer { + /// owned by this + DistanceComputer* basedis; + + explicit NegativeDistanceComputer(DistanceComputer* basedis) + : basedis(basedis) {} + + void set_query(const float* x) override { + basedis->set_query(x); + } + + /// compute distance of vector i to current query + float operator()(idx_t i) override { + return -(*basedis)(i); + } + + void distances_batch_4( + const idx_t idx0, + const idx_t idx1, + const idx_t idx2, + const idx_t idx3, + float& dis0, + float& dis1, + float& dis2, + float& dis3) override { + basedis->distances_batch_4( + idx0, idx1, idx2, idx3, dis0, dis1, dis2, dis3); + dis0 = -dis0; + dis1 = -dis1; + dis2 = -dis2; + dis3 = -dis3; + } + + /// compute distance between two stored vectors + float symmetric_dis(idx_t i, idx_t j) override { + return -basedis->symmetric_dis(i, j); + } + + virtual ~NegativeDistanceComputer() { + delete basedis; + } +}; + /************************************************************* * Specialized version of the DistanceComputer when we know that codes are * laid out in a flat index. diff --git a/faiss/impl/FaissAssert.h b/faiss/impl/FaissAssert.h index 6f666f684c..9d357823d0 100644 --- a/faiss/impl/FaissAssert.h +++ b/faiss/impl/FaissAssert.h @@ -94,13 +94,15 @@ } \ } while (false) -#define FAISS_THROW_IF_NOT_MSG(X, MSG) \ +#define FAISS_THROW_IF_MSG(X, MSG) \ do { \ - if (!(X)) { \ + if (X) { \ FAISS_THROW_FMT("Error: '%s' failed: " MSG, #X); \ } \ } while (false) +#define FAISS_THROW_IF_NOT_MSG(X, MSG) FAISS_THROW_IF_MSG(!(X), MSG) + #define FAISS_THROW_IF_NOT_FMT(X, FMT, ...) \ do { \ if (!(X)) { \ diff --git a/faiss/impl/HNSW.cpp b/faiss/impl/HNSW.cpp index a9fb9daf5b..3ba5f72f68 100644 --- a/faiss/impl/HNSW.cpp +++ b/faiss/impl/HNSW.cpp @@ -7,6 +7,7 @@ #include +#include #include #include @@ -110,8 +111,8 @@ void HNSW::print_neighbor_stats(int level) const { level, nb_neighbors(level)); size_t tot_neigh = 0, tot_common = 0, tot_reciprocal = 0, n_node = 0; -#pragma omp parallel for reduction(+: tot_neigh) reduction(+: tot_common) \ - reduction(+: tot_reciprocal) reduction(+: n_node) +#pragma omp parallel for reduction(+ : tot_neigh) reduction(+ : tot_common) \ + reduction(+ : tot_reciprocal) reduction(+ : n_node) for (int i = 0; i < levels.size(); i++) { if (levels[i] > level) { n_node++; @@ -215,8 +216,8 @@ int HNSW::prepare_level_tab(size_t n, bool preset_levels) { if (pt_level > max_level) max_level = pt_level; offsets.push_back(offsets.back() + cum_nb_neighbors(pt_level + 1)); - neighbors.resize(offsets.back(), -1); } + neighbors.resize(offsets.back(), -1); return max_level; } @@ -229,7 +230,14 @@ void HNSW::shrink_neighbor_list( DistanceComputer& qdis, std::priority_queue& input, std::vector& output, - int max_size) { + int max_size, + bool keep_max_size_level0) { + // This prevents the number of neighbors at + // level 0 from being shrunk to fewer than 2 * M.
+ // This is essential in making sure + // `faiss::gpu::GpuIndexCagra::copyFrom(IndexHNSWCagra*)` is functional + std::vector outsiders; + while (input.size() > 0) { NodeDistFarther v1 = input.top(); input.pop(); @@ -250,8 +258,15 @@ void HNSW::shrink_neighbor_list( if (output.size() >= max_size) { return; } + } else if (keep_max_size_level0) { + outsiders.push_back(v1); } } + size_t idx = 0; + while (keep_max_size_level0 && (output.size() < max_size) && + (idx < outsiders.size())) { + output.push_back(outsiders[idx++]); + } } namespace { @@ -268,7 +283,8 @@ using NodeDistFarther = HNSW::NodeDistFarther; void shrink_neighbor_list( DistanceComputer& qdis, std::priority_queue& resultSet1, - int max_size) { + int max_size, + bool keep_max_size_level0 = false) { if (resultSet1.size() < max_size) { return; } @@ -280,7 +296,8 @@ void shrink_neighbor_list( resultSet1.pop(); } - HNSW::shrink_neighbor_list(qdis, resultSet, returnlist, max_size); + HNSW::shrink_neighbor_list( + qdis, resultSet, returnlist, max_size, keep_max_size_level0); for (NodeDistFarther curen2 : returnlist) { resultSet1.emplace(curen2.d, curen2.id); @@ -294,7 +311,8 @@ void add_link( DistanceComputer& qdis, storage_idx_t src, storage_idx_t dest, - int level) { + int level, + bool keep_max_size_level0 = false) { size_t begin, end; hnsw.neighbor_range(src, level, &begin, &end); if (hnsw.neighbors[end - 1] == -1) { @@ -319,7 +337,7 @@ void add_link( resultSet.emplace(qdis.symmetric_dis(src, neigh), neigh); } - shrink_neighbor_list(qdis, resultSet, end - begin); + shrink_neighbor_list(qdis, resultSet, end - begin, keep_max_size_level0); // ...and back size_t i = begin; @@ -429,7 +447,8 @@ void HNSW::add_links_starting_from( float d_nearest, int level, omp_lock_t* locks, - VisitedTable& vt) { + VisitedTable& vt, + bool keep_max_size_level0) { std::priority_queue link_targets; search_neighbors_to_add( @@ -438,13 +457,13 @@ void HNSW::add_links_starting_from( // but we can afford only this many neighbors int M = nb_neighbors(level); - ::faiss::shrink_neighbor_list(ptdis, link_targets, M); + ::faiss::shrink_neighbor_list(ptdis, link_targets, M, keep_max_size_level0); std::vector neighbors; neighbors.reserve(link_targets.size()); while (!link_targets.empty()) { storage_idx_t other_id = link_targets.top().id; - add_link(*this, ptdis, pt_id, other_id, level); + add_link(*this, ptdis, pt_id, other_id, level, keep_max_size_level0); neighbors.push_back(other_id); link_targets.pop(); } @@ -452,7 +471,7 @@ void HNSW::add_links_starting_from( omp_unset_lock(&locks[pt_id]); for (storage_idx_t other_id : neighbors) { omp_set_lock(&locks[other_id]); - add_link(*this, ptdis, other_id, pt_id, level); + add_link(*this, ptdis, other_id, pt_id, level, keep_max_size_level0); omp_unset_lock(&locks[other_id]); } omp_set_lock(&locks[pt_id]); @@ -467,7 +486,8 @@ void HNSW::add_with_locks( int pt_level, int pt_id, std::vector& locks, - VisitedTable& vt) { + VisitedTable& vt, + bool keep_max_size_level0) { // greedy search on upper levels storage_idx_t nearest; @@ -496,7 +516,14 @@ void HNSW::add_with_locks( for (; level >= 0; level--) { add_links_starting_from( - ptdis, pt_id, nearest, d_nearest, level, locks.data(), vt); + ptdis, + pt_id, + nearest, + d_nearest, + level, + locks.data(), + vt, + keep_max_size_level0); } omp_unset_lock(&locks[pt_id]); @@ -664,7 +691,7 @@ int search_from_candidates( if (candidates.size() == 0) { stats.n2++; } - stats.n3 += ndis; + stats.ndis += ndis; } return nres; @@ -793,7 +820,7 @@ std::priority_queue 
search_from_candidate_unbounded( if (candidates.size() == 0) { ++stats.n2; } - stats.n3 += ndis; + stats.ndis += ndis; return top_candidates; } @@ -910,9 +937,12 @@ void HNSW::search_level_0( const float* nearest_d, int search_type, HNSWStats& search_stats, - VisitedTable& vt) const { + VisitedTable& vt, + const SearchParametersHNSW* params) const { const HNSW& hnsw = *this; + auto efSearch = params ? params->efSearch : hnsw.efSearch; int k = extract_k_from_ResultHandler(res); + if (search_type == 1) { int nres = 0; @@ -925,16 +955,24 @@ void HNSW::search_level_0( if (vt.get(cj)) continue; - int candidates_size = std::max(hnsw.efSearch, k); + int candidates_size = std::max(efSearch, k); MinimaxHeap candidates(candidates_size); candidates.push(cj, nearest_d[j]); nres = search_from_candidates( - hnsw, qdis, res, candidates, vt, search_stats, 0, nres); + hnsw, + qdis, + res, + candidates, + vt, + search_stats, + 0, + nres, + params); } } else if (search_type == 2) { - int candidates_size = std::max(hnsw.efSearch, int(k)); + int candidates_size = std::max(efSearch, int(k)); candidates_size = std::max(candidates_size, int(nprobe)); MinimaxHeap candidates(candidates_size); @@ -947,7 +985,7 @@ void HNSW::search_level_0( } search_from_candidates( - hnsw, qdis, res, candidates, vt, search_stats, 0); + hnsw, qdis, res, candidates, vt, search_stats, 0, 0, params); } } diff --git a/faiss/impl/HNSW.h b/faiss/impl/HNSW.h index cb6b422c3d..f3aacf8a5b 100644 --- a/faiss/impl/HNSW.h +++ b/faiss/impl/HNSW.h @@ -184,7 +184,8 @@ struct HNSW { float d_nearest, int level, omp_lock_t* locks, - VisitedTable& vt); + VisitedTable& vt, + bool keep_max_size_level0 = false); /** add point pt_id on all levels <= pt_level and build the link * structure for them. */ @@ -193,7 +194,8 @@ struct HNSW { int pt_level, int pt_id, std::vector<omp_lock_t>& locks, - VisitedTable& vt); + VisitedTable& vt, + bool keep_max_size_level0 = false); /// search interface for 1 point, single thread HNSWStats search( @@ -211,7 +213,8 @@ struct HNSW { const float* nearest_d, int search_type, HNSWStats& search_stats, - VisitedTable& vt) const; + VisitedTable& vt, + const SearchParametersHNSW* params = nullptr) const; void reset(); @@ -224,36 +227,27 @@ struct HNSW { DistanceComputer& qdis, std::priority_queue<NodeDistFarther>& input, std::vector<NodeDistFarther>& output, - int max_size); + int max_size, + bool keep_max_size_level0 = false); void permute_entries(const idx_t* map); }; struct HNSWStats { - size_t n1, n2, n3; - size_t ndis; - size_t nreorder; - - HNSWStats( - size_t n1 = 0, - size_t n2 = 0, - size_t n3 = 0, - size_t ndis = 0, - size_t nreorder = 0) - : n1(n1), n2(n2), n3(n3), ndis(ndis), nreorder(nreorder) {} + size_t n1 = 0; /// number of vectors searched + size_t n2 = + 0; /// number of queries for which the candidate list is exhausted + size_t ndis = 0; /// number of distances computed void reset() { - n1 = n2 = n3 = 0; + n1 = n2 = 0; ndis = 0; - nreorder = 0; } void combine(const HNSWStats& other) { n1 += other.n1; n2 += other.n2; - n3 += other.n3; ndis += other.ndis; - nreorder += other.nreorder; } }; diff --git a/faiss/impl/LocalSearchQuantizer.cpp b/faiss/impl/LocalSearchQuantizer.cpp index 8da989a9a4..943fe32c9d 100644 --- a/faiss/impl/LocalSearchQuantizer.cpp +++ b/faiss/impl/LocalSearchQuantizer.cpp @@ -104,10 +104,10 @@ int dgemm_( namespace { -void fmat_inverse(float* a, int n) { - int info; - int lwork = n * n; - std::vector<int> ipiv(n); +void fmat_inverse(float* a, FINTEGER n) { + FINTEGER info; + FINTEGER lwork = n * n; + std::vector<FINTEGER> ipiv(n); std::vector<float>
workspace(lwork); sgetrf_(&n, &n, a, &n, ipiv.data(), &info); @@ -123,10 +123,10 @@ void dfvec_add(size_t d, const double* a, const float* b, double* c) { } } -void dmat_inverse(double* a, int n) { - int info; - int lwork = n * n; - std::vector ipiv(n); +void dmat_inverse(double* a, FINTEGER n) { + FINTEGER info; + FINTEGER lwork = n * n; + std::vector ipiv(n); std::vector workspace(lwork); dgetrf_(&n, &n, a, &n, ipiv.data(), &info); diff --git a/faiss/impl/LookupTableScaler.h b/faiss/impl/LookupTableScaler.h index c553a0f14d..b6438307fb 100644 --- a/faiss/impl/LookupTableScaler.h +++ b/faiss/impl/LookupTableScaler.h @@ -38,6 +38,23 @@ struct DummyScaler { return simd16uint16(0); } +#ifdef __AVX512F__ + inline simd64uint8 lookup(const simd64uint8&, const simd64uint8&) const { + FAISS_THROW_MSG("DummyScaler::lookup should not be called."); + return simd64uint8(0); + } + + inline simd32uint16 scale_lo(const simd64uint8&) const { + FAISS_THROW_MSG("DummyScaler::scale_lo should not be called."); + return simd32uint16(0); + } + + inline simd32uint16 scale_hi(const simd64uint8&) const { + FAISS_THROW_MSG("DummyScaler::scale_hi should not be called."); + return simd32uint16(0); + } +#endif + template inline dist_t scale_one(const dist_t&) const { FAISS_THROW_MSG("DummyScaler::scale_one should not be called."); @@ -67,6 +84,23 @@ struct NormTableScaler { return (simd16uint16(res) >> 8) * scale_simd; } +#ifdef __AVX512F__ + inline simd64uint8 lookup(const simd64uint8& lut, const simd64uint8& c) + const { + return lut.lookup_4_lanes(c); + } + + inline simd32uint16 scale_lo(const simd64uint8& res) const { + auto scale_simd_wide = simd32uint16(scale_simd, scale_simd); + return simd32uint16(res) * scale_simd_wide; + } + + inline simd32uint16 scale_hi(const simd64uint8& res) const { + auto scale_simd_wide = simd32uint16(scale_simd, scale_simd); + return (simd32uint16(res) >> 8) * scale_simd_wide; + } +#endif + // for non-SIMD implem 2, 3, 4 template inline dist_t scale_one(const dist_t& x) const { diff --git a/faiss/impl/NNDescent.cpp b/faiss/impl/NNDescent.cpp index b609aba390..5afcdaf5b7 100644 --- a/faiss/impl/NNDescent.cpp +++ b/faiss/impl/NNDescent.cpp @@ -154,15 +154,20 @@ NNDescent::NNDescent(const int d, const int K) : K(K), d(d) { NNDescent::~NNDescent() {} void NNDescent::join(DistanceComputer& qdis) { + idx_t check_period = InterruptCallback::get_period_hint(d * search_L); + for (idx_t i0 = 0; i0 < (idx_t)ntotal; i0 += check_period) { + idx_t i1 = std::min(i0 + check_period, (idx_t)ntotal); #pragma omp parallel for default(shared) schedule(dynamic, 100) - for (int n = 0; n < ntotal; n++) { - graph[n].join([&](int i, int j) { - if (i != j) { - float dist = qdis.symmetric_dis(i, j); - graph[i].insert(j, dist); - graph[j].insert(i, dist); - } - }); + for (idx_t n = i0; n < i1; n++) { + graph[n].join([&](int i, int j) { + if (i != j) { + float dist = qdis.symmetric_dis(i, j); + graph[i].insert(j, dist); + graph[j].insert(i, dist); + } + }); + } + InterruptCallback::check(); } } diff --git a/faiss/impl/NSG.cpp b/faiss/impl/NSG.cpp index 1f30b576b9..c974943343 100644 --- a/faiss/impl/NSG.cpp +++ b/faiss/impl/NSG.cpp @@ -25,35 +25,6 @@ namespace { // It needs to be smaller than 0 constexpr int EMPTY_ID = -1; -/* Wrap the distance computer into one that negates the - distances. 
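The `NNDescent::join` rewrite a few hunks up follows a standard Faiss pattern: split a parallel loop into chunks sized by `get_period_hint` and poll the interrupt callback between chunks, so Ctrl-C or the new `TimeoutCallback` can stop graph construction. The pattern in isolation, with a placeholder work body:

```cpp
#include <algorithm>
#include <faiss/impl/AuxIndexStructures.h>

void chunked_parallel_loop(faiss::idx_t n, size_t flops_per_item) {
    faiss::idx_t check_period =
            faiss::InterruptCallback::get_period_hint(flops_per_item);
    for (faiss::idx_t i0 = 0; i0 < n; i0 += check_period) {
        faiss::idx_t i1 = std::min(i0 + check_period, n);
#pragma omp parallel for
        for (faiss::idx_t i = i0; i < i1; i++) {
            // ... per-item work, e.g. the local joins above ...
        }
        // single-threaded between chunks: throws if an interrupt is pending
        faiss::InterruptCallback::check();
    }
}
```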
This makes supporting INNER_PRODUCE search easier */ - -struct NegativeDistanceComputer : DistanceComputer { - /// owned by this - DistanceComputer* basedis; - - explicit NegativeDistanceComputer(DistanceComputer* basedis) - : basedis(basedis) {} - - void set_query(const float* x) override { - basedis->set_query(x); - } - - /// compute distance of vector i to current query - float operator()(idx_t i) override { - return -(*basedis)(i); - } - - /// compute distance between two stored vectors - float symmetric_dis(idx_t i, idx_t j) override { - return -basedis->symmetric_dis(i, j); - } - - ~NegativeDistanceComputer() override { - delete basedis; - } -}; - } // namespace DistanceComputer* storage_distance_computer(const Index* storage) { diff --git a/faiss/impl/ResultHandler.h b/faiss/impl/ResultHandler.h index 270de8dcd6..713fe8e49f 100644 --- a/faiss/impl/ResultHandler.h +++ b/faiss/impl/ResultHandler.h @@ -12,8 +12,10 @@ #pragma once #include +#include #include #include +#include namespace faiss { @@ -504,7 +506,15 @@ struct RangeSearchBlockResultHandler : BlockResultHandler { void end() {} ~SingleResultHandler() { - pres.finalize(); + try { + // finalize the partial result + pres.finalize(); + } catch (const faiss::FaissException& e) { + // Do nothing if allocation fails in finalizing partial results. +#ifndef NDEBUG + std::cerr << e.what() << std::endl; +#endif + } } }; @@ -559,8 +569,15 @@ struct RangeSearchBlockResultHandler : BlockResultHandler { } ~RangeSearchBlockResultHandler() { - if (partial_results.size() > 0) { - RangeSearchPartialResult::merge(partial_results); + try { + if (partial_results.size() > 0) { + RangeSearchPartialResult::merge(partial_results); + } + } catch (const faiss::FaissException& e) { + // Do nothing if allocation fails in merge. 
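Circling back to the HNSW hunks above: threading `SearchParametersHNSW` into `search_from_candidates` and `search_level_0` makes `efSearch` a per-call setting instead of index-wide mutable state. Caller-side sketch; the value 128 is illustrative:

```cpp
#include <faiss/IndexHNSW.h>

void search_with_wider_beam(
        faiss::IndexHNSW& index,
        faiss::idx_t nq,
        const float* xq,
        faiss::idx_t k,
        float* D,
        faiss::idx_t* I) {
    faiss::SearchParametersHNSW params;
    params.efSearch = 128; // overrides hnsw.efSearch for this call only
    index.search(nq, xq, k, D, I, &params);
}
```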
+#ifndef NDEBUG + std::cerr << e.what() << std::endl; +#endif } } }; diff --git a/faiss/impl/ScalarQuantizer.cpp b/faiss/impl/ScalarQuantizer.cpp index 07d77d5622..7ad50189e4 100644 --- a/faiss/impl/ScalarQuantizer.cpp +++ b/faiss/impl/ScalarQuantizer.cpp @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -101,8 +102,7 @@ struct Codec8bit { } float32x4_t res1 = vld1q_f32(result); float32x4_t res2 = vld1q_f32(result + 4); - float32x4x2_t res = vzipq_f32(res1, res2); - return vuzpq_f32(res.val[0], res.val[1]); + return {res1, res2}; } #endif }; @@ -153,8 +153,7 @@ struct Codec4bit { } float32x4_t res1 = vld1q_f32(result); float32x4_t res2 = vld1q_f32(result + 4); - float32x4x2_t res = vzipq_f32(res1, res2); - return vuzpq_f32(res.val[0], res.val[1]); + return {res1, res2}; } #endif }; @@ -266,8 +265,7 @@ struct Codec6bit { } float32x4_t res1 = vld1q_f32(result); float32x4_t res2 = vld1q_f32(result + 4); - float32x4x2_t res = vzipq_f32(res1, res2); - return vuzpq_f32(res.val[0], res.val[1]); + return {res1, res2}; } #endif }; @@ -345,16 +343,14 @@ struct QuantizerTemplate : QuantizerTemplate { FAISS_ALWAYS_INLINE float32x4x2_t reconstruct_8_components(const uint8_t* code, int i) const { float32x4x2_t xi = Codec::decode_8_components(code, i); - float32x4x2_t res = vzipq_f32( - vfmaq_f32( + return {vfmaq_f32( vdupq_n_f32(this->vmin), xi.val[0], vdupq_n_f32(this->vdiff)), vfmaq_f32( vdupq_n_f32(this->vmin), xi.val[1], - vdupq_n_f32(this->vdiff))); - return vuzpq_f32(res.val[0], res.val[1]); + vdupq_n_f32(this->vdiff))}; } }; @@ -431,10 +427,8 @@ struct QuantizerTemplate : QuantizerTemplate { float32x4x2_t vmin_8 = vld1q_f32_x2(this->vmin + i); float32x4x2_t vdiff_8 = vld1q_f32_x2(this->vdiff + i); - float32x4x2_t res = vzipq_f32( - vfmaq_f32(vmin_8.val[0], xi.val[0], vdiff_8.val[0]), - vfmaq_f32(vmin_8.val[1], xi.val[1], vdiff_8.val[1])); - return vuzpq_f32(res.val[0], res.val[1]); + return {vfmaq_f32(vmin_8.val[0], xi.val[0], vdiff_8.val[0]), + vfmaq_f32(vmin_8.val[1], xi.val[1], vdiff_8.val[1])}; } }; @@ -496,10 +490,75 @@ struct QuantizerFP16<8> : QuantizerFP16<1> { FAISS_ALWAYS_INLINE float32x4x2_t reconstruct_8_components(const uint8_t* code, int i) const { - uint16x4x2_t codei = vld2_u16((const uint16_t*)(code + 2 * i)); - return vzipq_f32( - vcvt_f32_f16(vreinterpret_f16_u16(codei.val[0])), - vcvt_f32_f16(vreinterpret_f16_u16(codei.val[1]))); + uint16x4x2_t codei = vld1_u16_x2((const uint16_t*)(code + 2 * i)); + return {vcvt_f32_f16(vreinterpret_f16_u16(codei.val[0])), + vcvt_f32_f16(vreinterpret_f16_u16(codei.val[1]))}; + } +}; +#endif + +/******************************************************************* + * BF16 quantizer + *******************************************************************/ + +template +struct QuantizerBF16 {}; + +template <> +struct QuantizerBF16<1> : ScalarQuantizer::SQuantizer { + const size_t d; + + QuantizerBF16(size_t d, const std::vector& /* unused */) : d(d) {} + + void encode_vector(const float* x, uint8_t* code) const final { + for (size_t i = 0; i < d; i++) { + ((uint16_t*)code)[i] = encode_bf16(x[i]); + } + } + + void decode_vector(const uint8_t* code, float* x) const final { + for (size_t i = 0; i < d; i++) { + x[i] = decode_bf16(((uint16_t*)code)[i]); + } + } + + FAISS_ALWAYS_INLINE float reconstruct_component(const uint8_t* code, int i) + const { + return decode_bf16(((uint16_t*)code)[i]); + } +}; + +#ifdef __AVX2__ + +template <> +struct QuantizerBF16<8> : QuantizerBF16<1> { + QuantizerBF16(size_t d, const std::vector& 
trained) + : QuantizerBF16<1>(d, trained) {} + + FAISS_ALWAYS_INLINE __m256 + reconstruct_8_components(const uint8_t* code, int i) const { + __m128i code_128i = _mm_loadu_si128((const __m128i*)(code + 2 * i)); + __m256i code_256i = _mm256_cvtepu16_epi32(code_128i); + code_256i = _mm256_slli_epi32(code_256i, 16); + return _mm256_castsi256_ps(code_256i); + } +}; + +#endif + +#ifdef __aarch64__ + +template <> +struct QuantizerBF16<8> : QuantizerBF16<1> { + QuantizerBF16(size_t d, const std::vector& trained) + : QuantizerBF16<1>(d, trained) {} + + FAISS_ALWAYS_INLINE float32x4x2_t + reconstruct_8_components(const uint8_t* code, int i) const { + uint16x4x2_t codei = vld1_u16_x2((const uint16_t*)(code + 2 * i)); + return {vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(codei.val[0]), 16)), + vreinterpretq_f32_u32( + vshlq_n_u32(vmovl_u16(codei.val[1]), 16))}; } }; #endif @@ -568,8 +627,7 @@ struct Quantizer8bitDirect<8> : Quantizer8bitDirect<1> { } float32x4_t res1 = vld1q_f32(result); float32x4_t res2 = vld1q_f32(result + 4); - float32x4x2_t res = vzipq_f32(res1, res2); - return vuzpq_f32(res.val[0], res.val[1]); + return {res1, res2}; } }; @@ -598,6 +656,8 @@ ScalarQuantizer::SQuantizer* select_quantizer_1( d, trained); case ScalarQuantizer::QT_fp16: return new QuantizerFP16(d, trained); + case ScalarQuantizer::QT_bf16: + return new QuantizerBF16(d, trained); case ScalarQuantizer::QT_8bit_direct: return new Quantizer8bitDirect(d, trained); } @@ -868,7 +928,7 @@ struct SimilarityL2<8> { float32x4x2_t accu8; FAISS_ALWAYS_INLINE void begin_8() { - accu8 = vzipq_f32(vdupq_n_f32(0.0f), vdupq_n_f32(0.0f)); + accu8 = {vdupq_n_f32(0.0f), vdupq_n_f32(0.0f)}; yi = y; } @@ -882,8 +942,7 @@ struct SimilarityL2<8> { float32x4_t accu8_0 = vfmaq_f32(accu8.val[0], sub0, sub0); float32x4_t accu8_1 = vfmaq_f32(accu8.val[1], sub1, sub1); - float32x4x2_t accu8_temp = vzipq_f32(accu8_0, accu8_1); - accu8 = vuzpq_f32(accu8_temp.val[0], accu8_temp.val[1]); + accu8 = {accu8_0, accu8_1}; } FAISS_ALWAYS_INLINE void add_8_components_2( @@ -895,8 +954,7 @@ struct SimilarityL2<8> { float32x4_t accu8_0 = vfmaq_f32(accu8.val[0], sub0, sub0); float32x4_t accu8_1 = vfmaq_f32(accu8.val[1], sub1, sub1); - float32x4x2_t accu8_temp = vzipq_f32(accu8_0, accu8_1); - accu8 = vuzpq_f32(accu8_temp.val[0], accu8_temp.val[1]); + accu8 = {accu8_0, accu8_1}; } FAISS_ALWAYS_INLINE float result_8() { @@ -996,7 +1054,7 @@ struct SimilarityIP<8> { float32x4x2_t accu8; FAISS_ALWAYS_INLINE void begin_8() { - accu8 = vzipq_f32(vdupq_n_f32(0.0f), vdupq_n_f32(0.0f)); + accu8 = {vdupq_n_f32(0.0f), vdupq_n_f32(0.0f)}; yi = y; } @@ -1006,8 +1064,7 @@ struct SimilarityIP<8> { float32x4_t accu8_0 = vfmaq_f32(accu8.val[0], yiv.val[0], x.val[0]); float32x4_t accu8_1 = vfmaq_f32(accu8.val[1], yiv.val[1], x.val[1]); - float32x4x2_t accu8_temp = vzipq_f32(accu8_0, accu8_1); - accu8 = vuzpq_f32(accu8_temp.val[0], accu8_temp.val[1]); + accu8 = {accu8_0, accu8_1}; } FAISS_ALWAYS_INLINE void add_8_components_2( @@ -1015,19 +1072,17 @@ struct SimilarityIP<8> { float32x4x2_t x2) { float32x4_t accu8_0 = vfmaq_f32(accu8.val[0], x1.val[0], x2.val[0]); float32x4_t accu8_1 = vfmaq_f32(accu8.val[1], x1.val[1], x2.val[1]); - float32x4x2_t accu8_temp = vzipq_f32(accu8_0, accu8_1); - accu8 = vuzpq_f32(accu8_temp.val[0], accu8_temp.val[1]); + accu8 = {accu8_0, accu8_1}; } FAISS_ALWAYS_INLINE float result_8() { - float32x4x2_t sum_tmp = vzipq_f32( + float32x4x2_t sum = { vpaddq_f32(accu8.val[0], accu8.val[0]), - vpaddq_f32(accu8.val[1], accu8.val[1])); - float32x4x2_t sum = 
vuzpq_f32(sum_tmp.val[0], sum_tmp.val[1]); - float32x4x2_t sum2_tmp = vzipq_f32( + vpaddq_f32(accu8.val[1], accu8.val[1])}; + + float32x4x2_t sum2 = { vpaddq_f32(sum.val[0], sum.val[0]), - vpaddq_f32(sum.val[1], sum.val[1])); - float32x4x2_t sum2 = vuzpq_f32(sum2_tmp.val[0], sum2_tmp.val[1]); + vpaddq_f32(sum.val[1], sum.val[1])}; return vgetq_lane_f32(sum2.val[0], 0) + vgetq_lane_f32(sum2.val[1], 0); } }; @@ -1392,6 +1447,10 @@ SQDistanceComputer* select_distance_computer( return new DCTemplate, Sim, SIMDWIDTH>( d, trained); + case ScalarQuantizer::QT_bf16: + return new DCTemplate, Sim, SIMDWIDTH>( + d, trained); + case ScalarQuantizer::QT_8bit_direct: if (d % 16 == 0) { return new DistanceComputerByte(d, trained); @@ -1440,6 +1499,10 @@ void ScalarQuantizer::set_derived_sizes() { code_size = d * 2; bits = 16; break; + case QT_bf16: + code_size = d * 2; + bits = 16; + break; } } @@ -1476,6 +1539,7 @@ void ScalarQuantizer::train(size_t n, const float* x) { break; case QT_fp16: case QT_8bit_direct: + case QT_bf16: // no training necessary break; } @@ -1805,6 +1869,11 @@ InvertedListScanner* sel1_InvertedListScanner( QuantizerFP16, Similarity, SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); + case ScalarQuantizer::QT_bf16: + return sel2_InvertedListScanner, + Similarity, + SIMDWIDTH>>(sq, quantizer, store_pairs, sel, r); case ScalarQuantizer::QT_8bit_direct: if (sq->d % 16 == 0) { return sel2_InvertedListScanner< diff --git a/faiss/impl/ScalarQuantizer.h b/faiss/impl/ScalarQuantizer.h index 550a979092..49fd42cc31 100644 --- a/faiss/impl/ScalarQuantizer.h +++ b/faiss/impl/ScalarQuantizer.h @@ -32,6 +32,7 @@ struct ScalarQuantizer : Quantizer { QT_fp16, QT_8bit_direct, ///< fast indexing of uint8s QT_6bit, ///< 6 bits per component + QT_bf16, }; QuantizerType qtype = QT_8bit; diff --git a/faiss/impl/code_distance/code_distance-avx2.h b/faiss/impl/code_distance/code_distance-avx2.h index 0aa1535b28..d37b022441 100644 --- a/faiss/impl/code_distance/code_distance-avx2.h +++ b/faiss/impl/code_distance/code_distance-avx2.h @@ -16,6 +16,11 @@ #include #include +// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=78782 +#if defined(__GNUC__) && __GNUC__ < 9 +#define _mm_loadu_si64(x) (_mm_loadl_epi64((__m128i_u*)x)) +#endif + namespace { inline float horizontal_sum(const __m128 v) { diff --git a/faiss/impl/index_read.cpp b/faiss/impl/index_read.cpp index ac62e0269e..aa041c0fac 100644 --- a/faiss/impl/index_read.cpp +++ b/faiss/impl/index_read.cpp @@ -5,8 +5,6 @@ * LICENSE file in the root directory of this source tree. 
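With the quantizer, distance computer, inverted-list scanner and `code_size = d * 2` cases above, `QT_bf16` is usable end to end; the `SQbf16` factory string registered further down exposes the same codec. A minimal sketch, assuming `xb` holds `n` d-dimensional vectors:

```cpp
#include <faiss/IndexScalarQuantizer.h>

faiss::IndexScalarQuantizer* build_bf16_index(
        int d,
        faiss::idx_t n,
        const float* xb) {
    auto* index = new faiss::IndexScalarQuantizer(
            d, faiss::ScalarQuantizer::QT_bf16, faiss::METRIC_L2);
    index->train(n, xb); // a no-op for QT_bf16, per the train() switch above
    index->add(n, xb);   // stores 2 bytes per component
    return index;
}
```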
*/ -// -*- c++ -*- - #include #include @@ -531,7 +529,11 @@ Index* read_index(IOReader* f, int io_flags) { Index* idx = nullptr; uint32_t h; READ1(h); - if (h == fourcc("IxFI") || h == fourcc("IxF2") || h == fourcc("IxFl")) { + if (h == fourcc("null")) { + // denotes a missing index, useful for some cases + return nullptr; + } else if ( + h == fourcc("IxFI") || h == fourcc("IxF2") || h == fourcc("IxFl")) { IndexFlat* idxf; if (h == fourcc("IxFI")) { idxf = new IndexFlatIP(); @@ -948,7 +950,7 @@ Index* read_index(IOReader* f, int io_flags) { idx = idxp; } else if ( h == fourcc("IHNf") || h == fourcc("IHNp") || h == fourcc("IHNs") || - h == fourcc("IHN2")) { + h == fourcc("IHN2") || h == fourcc("IHNc")) { IndexHNSW* idxhnsw = nullptr; if (h == fourcc("IHNf")) idxhnsw = new IndexHNSWFlat(); @@ -958,11 +960,19 @@ Index* read_index(IOReader* f, int io_flags) { idxhnsw = new IndexHNSWSQ(); if (h == fourcc("IHN2")) idxhnsw = new IndexHNSW2Level(); + if (h == fourcc("IHNc")) + idxhnsw = new IndexHNSWCagra(); read_index_header(idxhnsw, f); + if (h == fourcc("IHNc")) { + READ1(idxhnsw->keep_max_size_level0); + auto idx_hnsw_cagra = dynamic_cast(idxhnsw); + READ1(idx_hnsw_cagra->base_level_only); + READ1(idx_hnsw_cagra->num_base_level_search_entrypoints); + } read_HNSW(&idxhnsw->hnsw, f); idxhnsw->storage = read_index(f, io_flags); - idxhnsw->own_fields = true; - if (h == fourcc("IHNp")) { + idxhnsw->own_fields = idxhnsw->storage != nullptr; + if (h == fourcc("IHNp") && !(io_flags & IO_FLAG_PQ_SKIP_SDC_TABLE)) { dynamic_cast(idxhnsw->storage)->pq.compute_sdc_table(); } idx = idxhnsw; diff --git a/faiss/impl/index_write.cpp b/faiss/impl/index_write.cpp index b2808d7170..0a924d0225 100644 --- a/faiss/impl/index_write.cpp +++ b/faiss/impl/index_write.cpp @@ -5,8 +5,6 @@ * LICENSE file in the root directory of this source tree. */ -// -*- c++ -*- - #include #include @@ -390,8 +388,12 @@ static void write_ivf_header(const IndexIVF* ivf, IOWriter* f) { write_direct_map(&ivf->direct_map, f); } -void write_index(const Index* idx, IOWriter* f) { - if (const IndexFlat* idxf = dynamic_cast(idx)) { +void write_index(const Index* idx, IOWriter* f, int io_flags) { + if (idx == nullptr) { + // eg. for a storage component of HNSW that is set to nullptr + uint32_t h = fourcc("null"); + WRITE1(h); + } else if (const IndexFlat* idxf = dynamic_cast(idx)) { uint32_t h = fourcc(idxf->metric_type == METRIC_INNER_PRODUCT ? "IxFI" : idxf->metric_type == METRIC_L2 ? "IxF2" @@ -760,12 +762,24 @@ void write_index(const Index* idx, IOWriter* f) { : dynamic_cast(idx) ? fourcc("IHNp") : dynamic_cast(idx) ? fourcc("IHNs") : dynamic_cast(idx) ? fourcc("IHN2") + : dynamic_cast(idx) ? fourcc("IHNc") : 0; FAISS_THROW_IF_NOT(h != 0); WRITE1(h); write_index_header(idxhnsw, f); + if (h == fourcc("IHNc")) { + WRITE1(idxhnsw->keep_max_size_level0); + auto idx_hnsw_cagra = dynamic_cast(idxhnsw); + WRITE1(idx_hnsw_cagra->base_level_only); + WRITE1(idx_hnsw_cagra->num_base_level_search_entrypoints); + } write_HNSW(&idxhnsw->hnsw, f); - write_index(idxhnsw->storage, f); + if (io_flags & IO_FLAG_SKIP_STORAGE) { + uint32_t n4 = fourcc("null"); + WRITE1(n4); + } else { + write_index(idxhnsw->storage, f); + } } else if (const IndexNSG* idxnsg = dynamic_cast(idx)) { uint32_t h = dynamic_cast(idx) ? fourcc("INSf") : dynamic_cast(idx) ? 
fourcc("INSp") @@ -841,14 +855,14 @@ void write_index(const Index* idx, IOWriter* f) { } } -void write_index(const Index* idx, FILE* f) { +void write_index(const Index* idx, FILE* f, int io_flags) { FileIOWriter writer(f); - write_index(idx, &writer); + write_index(idx, &writer, io_flags); } -void write_index(const Index* idx, const char* fname) { +void write_index(const Index* idx, const char* fname, int io_flags) { FileIOWriter writer(fname); - write_index(idx, &writer); + write_index(idx, &writer, io_flags); } void write_VectorTransform(const VectorTransform* vt, const char* fname) { diff --git a/faiss/impl/io.cpp b/faiss/impl/io.cpp index 5d24e58591..5f5b2d5ebd 100644 --- a/faiss/impl/io.cpp +++ b/faiss/impl/io.cpp @@ -20,11 +20,11 @@ namespace faiss { * IO functions ***********************************************************************/ -int IOReader::fileno() { +int IOReader::filedescriptor() { FAISS_THROW_MSG("IOReader does not support memory mapping"); } -int IOWriter::fileno() { +int IOWriter::filedescriptor() { FAISS_THROW_MSG("IOWriter does not support memory mapping"); } @@ -85,8 +85,12 @@ size_t FileIOReader::operator()(void* ptr, size_t size, size_t nitems) { return fread(ptr, size, nitems, f); } -int FileIOReader::fileno() { +int FileIOReader::filedescriptor() { +#ifdef _AIX + return fileno(f); +#else return ::fileno(f); +#endif } FileIOWriter::FileIOWriter(FILE* wf) : f(wf) {} @@ -116,8 +120,12 @@ size_t FileIOWriter::operator()(const void* ptr, size_t size, size_t nitems) { return fwrite(ptr, size, nitems, f); } -int FileIOWriter::fileno() { +int FileIOWriter::filedescriptor() { +#ifdef _AIX + return fileno(f); +#else return ::fileno(f); +#endif } /*********************************************************************** @@ -259,7 +267,7 @@ std::string fourcc_inv_printable(uint32_t x) { str += c; } else { char buf[10]; - sprintf(buf, "\\x%02x", c); + snprintf(buf, sizeof(buf), "\\x%02x", c); str += buf; } } diff --git a/faiss/impl/io.h b/faiss/impl/io.h index 8d0605a5a6..59c2e31539 100644 --- a/faiss/impl/io.h +++ b/faiss/impl/io.h @@ -32,7 +32,7 @@ struct IOReader { virtual size_t operator()(void* ptr, size_t size, size_t nitems) = 0; // return a file number that can be memory-mapped - virtual int fileno(); + virtual int filedescriptor(); virtual ~IOReader() {} }; @@ -45,7 +45,7 @@ struct IOWriter { virtual size_t operator()(const void* ptr, size_t size, size_t nitems) = 0; // return a file number that can be memory-mapped - virtual int fileno(); + virtual int filedescriptor(); virtual ~IOWriter() noexcept(false) {} }; @@ -73,7 +73,7 @@ struct FileIOReader : IOReader { size_t operator()(void* ptr, size_t size, size_t nitems) override; - int fileno() override; + int filedescriptor() override; }; struct FileIOWriter : IOWriter { @@ -88,7 +88,7 @@ struct FileIOWriter : IOWriter { size_t operator()(const void* ptr, size_t size, size_t nitems) override; - int fileno() override; + int filedescriptor() override; }; /******************************************************* diff --git a/faiss/impl/platform_macros.h b/faiss/impl/platform_macros.h index 2aecc51222..3fc328535b 100644 --- a/faiss/impl/platform_macros.h +++ b/faiss/impl/platform_macros.h @@ -127,6 +127,13 @@ inline int __builtin_clzll(uint64_t x) { __pragma(float_control(precise, off, push)) #define FAISS_PRAGMA_IMPRECISE_FUNCTION_END __pragma(float_control(pop)) #elif defined(__clang__) +#if defined(__PPC__) +#define FAISS_PRAGMA_IMPRECISE_LOOP \ + _Pragma("clang loop vectorize_width(4) interleave_count(8)") +#define 
FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN \ + _Pragma("float_control(precise, off, push)") +#define FAISS_PRAGMA_IMPRECISE_FUNCTION_END _Pragma("float_control(pop)") +#else #define FAISS_PRAGMA_IMPRECISE_LOOP \ _Pragma("clang loop vectorize(enable) interleave(enable)") @@ -144,6 +151,7 @@ inline int __builtin_clzll(uint64_t x) { #define FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN #define FAISS_PRAGMA_IMPRECISE_FUNCTION_END #endif +#endif #elif defined(__GNUC__) // Unfortunately, GCC does not provide a pragma for detecting it. // So, we have to stick to GNUC, which is defined by MANY compilers. @@ -165,3 +173,17 @@ inline int __builtin_clzll(uint64_t x) { #endif // clang-format on + +/******************************************************* + * BIGENDIAN specific macros + *******************************************************/ +#if !defined(_MSC_VER) && \ + (defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)) +#define FAISS_BIG_ENDIAN +#endif + +#define Swap2Bytes(val) ((((val) >> 8) & 0x00FF) | (((val) << 8) & 0xFF00)) + +#define Swap4Bytes(val) \ + ((((val) >> 24) & 0x000000FF) | (((val) >> 8) & 0x0000FF00) | \ + (((val) << 8) & 0x00FF0000) | (((val) << 24) & 0xFF000000)) diff --git a/faiss/impl/pq4_fast_scan.cpp b/faiss/impl/pq4_fast_scan.cpp index 6173ecef47..127646e0eb 100644 --- a/faiss/impl/pq4_fast_scan.cpp +++ b/faiss/impl/pq4_fast_scan.cpp @@ -6,6 +6,7 @@ */ #include +#include #include #include @@ -58,8 +59,13 @@ void pq4_pack_codes( return; } memset(blocks, 0, nb * nsq / 2); +#ifdef FAISS_BIG_ENDIAN + const uint8_t perm0[16] = { + 8, 0, 9, 1, 10, 2, 11, 3, 12, 4, 13, 5, 14, 6, 15, 7}; +#else const uint8_t perm0[16] = { 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15}; +#endif uint8_t* codes2 = blocks; for (size_t i0 = 0; i0 < nb; i0 += bbs) { @@ -93,8 +99,13 @@ void pq4_pack_codes_range( size_t bbs, size_t nsq, uint8_t* blocks) { +#ifdef FAISS_BIG_ENDIAN + const uint8_t perm0[16] = { + 8, 0, 9, 1, 10, 2, 11, 3, 12, 4, 13, 5, 14, 6, 15, 7}; +#else const uint8_t perm0[16] = { 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15}; +#endif // range of affected blocks size_t block0 = i0 / bbs; diff --git a/faiss/impl/pq4_fast_scan_search_qbs.cpp b/faiss/impl/pq4_fast_scan_search_qbs.cpp index d69542c309..bf2ccd1f76 100644 --- a/faiss/impl/pq4_fast_scan_search_qbs.cpp +++ b/faiss/impl/pq4_fast_scan_search_qbs.cpp @@ -31,6 +31,8 @@ namespace { * writes results in a ResultHandler */ +#ifndef __AVX512F__ + template void kernel_accumulate_block( int nsq, @@ -111,6 +113,451 @@ void kernel_accumulate_block( } } +#else + +// a special version for NQ=1. +// Despite the function being large in the text form, it compiles to a very +// compact assembler code. +FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN +template +void kernel_accumulate_block_avx512_nq1( + int nsq, + const uint8_t* codes, + const uint8_t* LUT, + ResultHandler& res, + const Scaler& scaler) { + // NQ is kept in order to match the similarity to baseline function + constexpr int NQ = 1; + // distance accumulators. 
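The `FAISS_BIG_ENDIAN` detection and the two swap macros above are plain expression macros. A self-contained sanity check of what they compute; the macros are restated here only so the snippet compiles on its own:

```cpp
#include <cassert>
#include <cstdint>

#define Swap2Bytes(val) ((((val) >> 8) & 0x00FF) | (((val) << 8) & 0xFF00))

#define Swap4Bytes(val)                                           \
    ((((val) >> 24) & 0x000000FF) | (((val) >> 8) & 0x0000FF00) | \
     (((val) << 8) & 0x00FF0000) | (((val) << 24) & 0xFF000000))

int main() {
    assert(Swap2Bytes(uint16_t(0x1234)) == 0x3412);          // reverse 2 bytes
    assert(Swap4Bytes(uint32_t(0x12345678)) == 0x78563412);  // reverse 4 bytes
    return 0;
}
```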
We can accept more for NQ=1 + // layout: accu[q][b]: distance accumulator for vectors 32*b..32*b+15 + simd32uint16 accu[NQ][4]; + // layout: accu[q][b]: distance accumulator for vectors 32*b+16..32*b+31 + simd32uint16 accu1[NQ][4]; + + for (int q = 0; q < NQ; q++) { + for (int b = 0; b < 4; b++) { + accu[q][b].clear(); + accu1[q][b].clear(); + } + } + + // process "nsq - scaler.nscale" part + const int nsq_minus_nscale = nsq - scaler.nscale; + const int nsq_minus_nscale_8 = (nsq_minus_nscale / 8) * 8; + const int nsq_minus_nscale_4 = (nsq_minus_nscale / 4) * 4; + + // process in chunks of 8 + for (int sq = 0; sq < nsq_minus_nscale_8; sq += 8) { + // prefetch + simd64uint8 c(codes); + codes += 64; + + simd64uint8 c1(codes); + codes += 64; + + simd64uint8 mask(0xf); + // shift op does not exist for int8... + simd64uint8 chi = simd64uint8(simd32uint16(c) >> 4) & mask; + simd64uint8 clo = c & mask; + + simd64uint8 c1hi = simd64uint8(simd32uint16(c1) >> 4) & mask; + simd64uint8 c1lo = c1 & mask; + + for (int q = 0; q < NQ; q++) { + // load LUTs for 4 quantizers + simd64uint8 lut(LUT); + LUT += 64; + + { + simd64uint8 res0 = lut.lookup_4_lanes(clo); + simd64uint8 res1 = lut.lookup_4_lanes(chi); + + accu[q][0] += simd32uint16(res0); + accu[q][1] += simd32uint16(res0) >> 8; + + accu[q][2] += simd32uint16(res1); + accu[q][3] += simd32uint16(res1) >> 8; + } + } + + for (int q = 0; q < NQ; q++) { + // load LUTs for 4 quantizers + simd64uint8 lut(LUT); + LUT += 64; + + { + simd64uint8 res0 = lut.lookup_4_lanes(c1lo); + simd64uint8 res1 = lut.lookup_4_lanes(c1hi); + + accu1[q][0] += simd32uint16(res0); + accu1[q][1] += simd32uint16(res0) >> 8; + + accu1[q][2] += simd32uint16(res1); + accu1[q][3] += simd32uint16(res1) >> 8; + } + } + } + + // process leftovers: a single chunk of size 4 + if (nsq_minus_nscale_8 != nsq_minus_nscale_4) { + // prefetch + simd64uint8 c(codes); + codes += 64; + + simd64uint8 mask(0xf); + // shift op does not exist for int8... + simd64uint8 chi = simd64uint8(simd32uint16(c) >> 4) & mask; + simd64uint8 clo = c & mask; + + for (int q = 0; q < NQ; q++) { + // load LUTs for 4 quantizers + simd64uint8 lut(LUT); + LUT += 64; + + simd64uint8 res0 = lut.lookup_4_lanes(clo); + simd64uint8 res1 = lut.lookup_4_lanes(chi); + + accu[q][0] += simd32uint16(res0); + accu[q][1] += simd32uint16(res0) >> 8; + + accu[q][2] += simd32uint16(res1); + accu[q][3] += simd32uint16(res1) >> 8; + } + } + + // process leftovers: a single chunk of size 2 + if (nsq_minus_nscale_4 != nsq_minus_nscale) { + // prefetch + simd32uint8 c(codes); + codes += 32; + + simd32uint8 mask(0xf); + // shift op does not exist for int8... + simd32uint8 chi = simd32uint8(simd16uint16(c) >> 4) & mask; + simd32uint8 clo = c & mask; + + for (int q = 0; q < NQ; q++) { + // load LUTs for 2 quantizers + simd32uint8 lut(LUT); + LUT += 32; + + simd32uint8 res0 = lut.lookup_2_lanes(clo); + simd32uint8 res1 = lut.lookup_2_lanes(chi); + + accu[q][0] += simd32uint16(simd16uint16(res0)); + accu[q][1] += simd32uint16(simd16uint16(res0) >> 8); + + accu[q][2] += simd32uint16(simd16uint16(res1)); + accu[q][3] += simd32uint16(simd16uint16(res1) >> 8); + } + } + + // process "sq" part + const int nscale = scaler.nscale; + const int nscale_8 = (nscale / 8) * 8; + const int nscale_4 = (nscale / 4) * 4; + + // process in chunks of 8 + for (int sq = 0; sq < nscale_8; sq += 8) { + // prefetch + simd64uint8 c(codes); + codes += 64; + + simd64uint8 c1(codes); + codes += 64; + + simd64uint8 mask(0xf); + // shift op does not exist for int8... 
+ simd64uint8 chi = simd64uint8(simd32uint16(c) >> 4) & mask; + simd64uint8 clo = c & mask; + + simd64uint8 c1hi = simd64uint8(simd32uint16(c1) >> 4) & mask; + simd64uint8 c1lo = c1 & mask; + + for (int q = 0; q < NQ; q++) { + // load LUTs for 4 quantizers + simd64uint8 lut(LUT); + LUT += 64; + + { + simd64uint8 res0 = scaler.lookup(lut, clo); + accu[q][0] += scaler.scale_lo(res0); // handle vectors 0..15 + accu[q][1] += scaler.scale_hi(res0); // handle vectors 16..31 + + simd64uint8 res1 = scaler.lookup(lut, chi); + accu[q][2] += scaler.scale_lo(res1); // handle vectors 32..47 + accu[q][3] += scaler.scale_hi(res1); // handle vectors 48..63 + } + } + + for (int q = 0; q < NQ; q++) { + // load LUTs for 4 quantizers + simd64uint8 lut(LUT); + LUT += 64; + + { + simd64uint8 res0 = scaler.lookup(lut, c1lo); + accu1[q][0] += scaler.scale_lo(res0); // handle vectors 0..7 + accu1[q][1] += scaler.scale_hi(res0); // handle vectors 8..15 + + simd64uint8 res1 = scaler.lookup(lut, c1hi); + accu1[q][2] += scaler.scale_lo(res1); // handle vectors 16..23 + accu1[q][3] += scaler.scale_hi(res1); // handle vectors 24..31 + } + } + } + + // process leftovers: a single chunk of size 4 + if (nscale_8 != nscale_4) { + // prefetch + simd64uint8 c(codes); + codes += 64; + + simd64uint8 mask(0xf); + // shift op does not exist for int8... + simd64uint8 chi = simd64uint8(simd32uint16(c) >> 4) & mask; + simd64uint8 clo = c & mask; + + for (int q = 0; q < NQ; q++) { + // load LUTs for 4 quantizers + simd64uint8 lut(LUT); + LUT += 64; + + simd64uint8 res0 = scaler.lookup(lut, clo); + accu[q][0] += scaler.scale_lo(res0); // handle vectors 0..15 + accu[q][1] += scaler.scale_hi(res0); // handle vectors 16..31 + + simd64uint8 res1 = scaler.lookup(lut, chi); + accu[q][2] += scaler.scale_lo(res1); // handle vectors 32..47 + accu[q][3] += scaler.scale_hi(res1); // handle vectors 48..63 + } + } + + // process leftovers: a single chunk of size 2 + if (nscale_4 != nscale) { + // prefetch + simd32uint8 c(codes); + codes += 32; + + simd32uint8 mask(0xf); + // shift op does not exist for int8... + simd32uint8 chi = simd32uint8(simd16uint16(c) >> 4) & mask; + simd32uint8 clo = c & mask; + + for (int q = 0; q < NQ; q++) { + // load LUTs for 2 quantizers + simd32uint8 lut(LUT); + LUT += 32; + + simd32uint8 res0 = scaler.lookup(lut, clo); + accu[q][0] += + simd32uint16(scaler.scale_lo(res0)); // handle vectors 0..7 + accu[q][1] += + simd32uint16(scaler.scale_hi(res0)); // handle vectors 8..15 + + simd32uint8 res1 = scaler.lookup(lut, chi); + accu[q][2] += simd32uint16( + scaler.scale_lo(res1)); // handle vectors 16..23 + accu[q][3] += simd32uint16( + scaler.scale_hi(res1)); // handle vectors 24..31 + } + } + + for (int q = 0; q < NQ; q++) { + for (int b = 0; b < 4; b++) { + accu[q][b] += accu1[q][b]; + } + } + + for (int q = 0; q < NQ; q++) { + accu[q][0] -= accu[q][1] << 8; + simd16uint16 dis0 = combine4x2(accu[q][0], accu[q][1]); + accu[q][2] -= accu[q][3] << 8; + simd16uint16 dis1 = combine4x2(accu[q][2], accu[q][3]); + res.handle(q, 0, dis0, dis1); + } +} + +// general-purpose case +FAISS_PRAGMA_IMPRECISE_FUNCTION_BEGIN +template +void kernel_accumulate_block_avx512_nqx( + int nsq, + const uint8_t* codes, + const uint8_t* LUT, + ResultHandler& res, + const Scaler& scaler) { + // dummy alloc to keep the windows compiler happy + constexpr int NQA = NQ > 0 ? 
NQ : 1; + // distance accumulators + // layout: accu[q][b]: distance accumulator for vectors 8*b..8*b+7 + simd32uint16 accu[NQA][4]; + + for (int q = 0; q < NQ; q++) { + for (int b = 0; b < 4; b++) { + accu[q][b].clear(); + } + } + + // process "nsq - scaler.nscale" part + const int nsq_minus_nscale = nsq - scaler.nscale; + const int nsq_minus_nscale_4 = (nsq_minus_nscale / 4) * 4; + + // process in chunks of 8 + for (int sq = 0; sq < nsq_minus_nscale_4; sq += 4) { + // prefetch + simd64uint8 c(codes); + codes += 64; + + simd64uint8 mask(0xf); + // shift op does not exist for int8... + simd64uint8 chi = simd64uint8(simd32uint16(c) >> 4) & mask; + simd64uint8 clo = c & mask; + + for (int q = 0; q < NQ; q++) { + // load LUTs for 4 quantizers + simd32uint8 lut_a(LUT); + simd32uint8 lut_b(LUT + NQ * 32); + + simd64uint8 lut(lut_a, lut_b); + LUT += 32; + + { + simd64uint8 res0 = lut.lookup_4_lanes(clo); + simd64uint8 res1 = lut.lookup_4_lanes(chi); + + accu[q][0] += simd32uint16(res0); + accu[q][1] += simd32uint16(res0) >> 8; + + accu[q][2] += simd32uint16(res1); + accu[q][3] += simd32uint16(res1) >> 8; + } + } + + LUT += NQ * 32; + } + + // process leftovers: a single chunk of size 2 + if (nsq_minus_nscale_4 != nsq_minus_nscale) { + // prefetch + simd32uint8 c(codes); + codes += 32; + + simd32uint8 mask(0xf); + // shift op does not exist for int8... + simd32uint8 chi = simd32uint8(simd16uint16(c) >> 4) & mask; + simd32uint8 clo = c & mask; + + for (int q = 0; q < NQ; q++) { + // load LUTs for 2 quantizers + simd32uint8 lut(LUT); + LUT += 32; + + simd32uint8 res0 = lut.lookup_2_lanes(clo); + simd32uint8 res1 = lut.lookup_2_lanes(chi); + + accu[q][0] += simd32uint16(simd16uint16(res0)); + accu[q][1] += simd32uint16(simd16uint16(res0) >> 8); + + accu[q][2] += simd32uint16(simd16uint16(res1)); + accu[q][3] += simd32uint16(simd16uint16(res1) >> 8); + } + } + + // process "sq" part + const int nscale = scaler.nscale; + const int nscale_4 = (nscale / 4) * 4; + + // process in chunks of 4 + for (int sq = 0; sq < nscale_4; sq += 4) { + // prefetch + simd64uint8 c(codes); + codes += 64; + + simd64uint8 mask(0xf); + // shift op does not exist for int8... + simd64uint8 chi = simd64uint8(simd32uint16(c) >> 4) & mask; + simd64uint8 clo = c & mask; + + for (int q = 0; q < NQ; q++) { + // load LUTs for 4 quantizers + simd32uint8 lut_a(LUT); + simd32uint8 lut_b(LUT + NQ * 32); + + simd64uint8 lut(lut_a, lut_b); + LUT += 32; + + { + simd64uint8 res0 = scaler.lookup(lut, clo); + accu[q][0] += scaler.scale_lo(res0); // handle vectors 0..7 + accu[q][1] += scaler.scale_hi(res0); // handle vectors 8..15 + + simd64uint8 res1 = scaler.lookup(lut, chi); + accu[q][2] += scaler.scale_lo(res1); // handle vectors 16..23 + accu[q][3] += scaler.scale_hi(res1); // handle vectors 24..31 + } + } + + LUT += NQ * 32; + } + + // process leftovers: a single chunk of size 2 + if (nscale_4 != nscale) { + // prefetch + simd32uint8 c(codes); + codes += 32; + + simd32uint8 mask(0xf); + // shift op does not exist for int8... 
+ simd32uint8 chi = simd32uint8(simd16uint16(c) >> 4) & mask; + simd32uint8 clo = c & mask; + + for (int q = 0; q < NQ; q++) { + // load LUTs for 2 quantizers + simd32uint8 lut(LUT); + LUT += 32; + + simd32uint8 res0 = scaler.lookup(lut, clo); + accu[q][0] += + simd32uint16(scaler.scale_lo(res0)); // handle vectors 0..7 + accu[q][1] += + simd32uint16(scaler.scale_hi(res0)); // handle vectors 8..15 + + simd32uint8 res1 = scaler.lookup(lut, chi); + accu[q][2] += simd32uint16( + scaler.scale_lo(res1)); // handle vectors 16..23 + accu[q][3] += simd32uint16( + scaler.scale_hi(res1)); // handle vectors 24..31 + } + } + + for (int q = 0; q < NQ; q++) { + accu[q][0] -= accu[q][1] << 8; + simd16uint16 dis0 = combine4x2(accu[q][0], accu[q][1]); + accu[q][2] -= accu[q][3] << 8; + simd16uint16 dis1 = combine4x2(accu[q][2], accu[q][3]); + res.handle(q, 0, dis0, dis1); + } +} + +template +void kernel_accumulate_block( + int nsq, + const uint8_t* codes, + const uint8_t* LUT, + ResultHandler& res, + const Scaler& scaler) { + if constexpr (NQ == 1) { + kernel_accumulate_block_avx512_nq1( + nsq, codes, LUT, res, scaler); + } else { + kernel_accumulate_block_avx512_nqx( + nsq, codes, LUT, res, scaler); + } +} + +#endif + // handle at most 4 blocks of queries template void accumulate_q_4step( diff --git a/faiss/impl/simd_result_handlers.h b/faiss/impl/simd_result_handlers.h index 2d8e5388d9..2fa18fa340 100644 --- a/faiss/impl/simd_result_handlers.h +++ b/faiss/impl/simd_result_handlers.h @@ -15,6 +15,7 @@ #include #include +#include #include #include #include @@ -137,6 +138,7 @@ struct FixedStorageHandler : SIMDResultHandler { } } } + virtual ~FixedStorageHandler() {} }; @@ -150,8 +152,10 @@ struct ResultHandlerCompare : SIMDResultHandlerToFloat { int64_t i0 = 0; // query origin int64_t j0 = 0; // db origin - ResultHandlerCompare(size_t nq, size_t ntotal) - : SIMDResultHandlerToFloat(nq, ntotal) { + const IDSelector* sel; + + ResultHandlerCompare(size_t nq, size_t ntotal, const IDSelector* sel_in) + : SIMDResultHandlerToFloat(nq, ntotal), sel{sel_in} { this->is_CMax = C::is_max; this->sizeof_ids = sizeof(typename C::TI); this->with_fields = with_id_map; @@ -232,9 +236,14 @@ struct SingleResultHandler : ResultHandlerCompare { float* dis; int64_t* ids; - SingleResultHandler(size_t nq, size_t ntotal, float* dis, int64_t* ids) - : RHC(nq, ntotal), idis(nq), dis(dis), ids(ids) { - for (int i = 0; i < nq; i++) { + SingleResultHandler( + size_t nq, + size_t ntotal, + float* dis, + int64_t* ids, + const IDSelector* sel_in) + : RHC(nq, ntotal, sel_in), idis(nq), dis(dis), ids(ids) { + for (size_t i = 0; i < nq; i++) { ids[i] = -1; idis[i] = C::neutral(); } @@ -256,20 +265,36 @@ struct SingleResultHandler : ResultHandlerCompare { d0.store(d32tab); d1.store(d32tab + 16); - while (lt_mask) { - // find first non-zero - int j = __builtin_ctz(lt_mask); - lt_mask -= 1 << j; - T d = d32tab[j]; - if (C::cmp(idis[q], d)) { - idis[q] = d; - ids[q] = this->adjust_id(b, j); + if (this->sel != nullptr) { + while (lt_mask) { + // find first non-zero + int j = __builtin_ctz(lt_mask); + auto real_idx = this->adjust_id(b, j); + lt_mask -= 1 << j; + if (this->sel->is_member(real_idx)) { + T d = d32tab[j]; + if (C::cmp(idis[q], d)) { + idis[q] = d; + ids[q] = real_idx; + } + } + } + } else { + while (lt_mask) { + // find first non-zero + int j = __builtin_ctz(lt_mask); + lt_mask -= 1 << j; + T d = d32tab[j]; + if (C::cmp(idis[q], d)) { + idis[q] = d; + ids[q] = this->adjust_id(b, j); + } } } } void end() { - for (int q = 0; q < this->nq; 
q++) { + for (size_t q = 0; q < this->nq; q++) { if (!normalizers) { dis[q] = idis[q]; } else { @@ -296,8 +321,14 @@ struct HeapHandler : ResultHandlerCompare { int64_t k; // number of results to keep - HeapHandler(size_t nq, size_t ntotal, int64_t k, float* dis, int64_t* ids) - : RHC(nq, ntotal), + HeapHandler( + size_t nq, + size_t ntotal, + int64_t k, + float* dis, + int64_t* ids, + const IDSelector* sel_in) + : RHC(nq, ntotal, sel_in), idis(nq * k), iids(nq * k), dis(dis), @@ -330,21 +361,36 @@ struct HeapHandler : ResultHandlerCompare { d0.store(d32tab); d1.store(d32tab + 16); - while (lt_mask) { - // find first non-zero - int j = __builtin_ctz(lt_mask); - lt_mask -= 1 << j; - T dis = d32tab[j]; - if (C::cmp(heap_dis[0], dis)) { - int64_t idx = this->adjust_id(b, j); - heap_pop(k, heap_dis, heap_ids); - heap_push(k, heap_dis, heap_ids, dis, idx); + if (this->sel != nullptr) { + while (lt_mask) { + // find first non-zero + int j = __builtin_ctz(lt_mask); + auto real_idx = this->adjust_id(b, j); + lt_mask -= 1 << j; + if (this->sel->is_member(real_idx)) { + T dis = d32tab[j]; + if (C::cmp(heap_dis[0], dis)) { + heap_replace_top( + k, heap_dis, heap_ids, dis, real_idx); + } + } + } + } else { + while (lt_mask) { + // find first non-zero + int j = __builtin_ctz(lt_mask); + lt_mask -= 1 << j; + T dis = d32tab[j]; + if (C::cmp(heap_dis[0], dis)) { + int64_t idx = this->adjust_id(b, j); + heap_replace_top(k, heap_dis, heap_ids, dis, idx); + } } } } void end() override { - for (int q = 0; q < this->nq; q++) { + for (size_t q = 0; q < this->nq; q++) { T* heap_dis_in = idis.data() + q * k; TI* heap_ids_in = iids.data() + q * k; heap_reorder(k, heap_dis_in, heap_ids_in); @@ -393,8 +439,12 @@ struct ReservoirHandler : ResultHandlerCompare { size_t k, size_t cap, float* dis, - int64_t* ids) - : RHC(nq, ntotal), capacity((cap + 15) & ~15), dis(dis), ids(ids) { + int64_t* ids, + const IDSelector* sel_in) + : RHC(nq, ntotal, sel_in), + capacity((cap + 15) & ~15), + dis(dis), + ids(ids) { assert(capacity % 16 == 0); all_ids.resize(nq * capacity); all_vals.resize(nq * capacity); @@ -423,12 +473,25 @@ struct ReservoirHandler : ResultHandlerCompare { d0.store(d32tab); d1.store(d32tab + 16); - while (lt_mask) { - // find first non-zero - int j = __builtin_ctz(lt_mask); - lt_mask -= 1 << j; - T dis = d32tab[j]; - res.add(dis, this->adjust_id(b, j)); + if (this->sel != nullptr) { + while (lt_mask) { + // find first non-zero + int j = __builtin_ctz(lt_mask); + auto real_idx = this->adjust_id(b, j); + lt_mask -= 1 << j; + if (this->sel->is_member(real_idx)) { + T dis = d32tab[j]; + res.add(dis, real_idx); + } + } + } else { + while (lt_mask) { + // find first non-zero + int j = __builtin_ctz(lt_mask); + lt_mask -= 1 << j; + T dis = d32tab[j]; + res.add(dis, this->adjust_id(b, j)); + } } } @@ -439,7 +502,7 @@ struct ReservoirHandler : ResultHandlerCompare { CMin>::type; std::vector perm(reservoirs[0].n); - for (int q = 0; q < reservoirs.size(); q++) { + for (size_t q = 0; q < reservoirs.size(); q++) { ReservoirTopN& res = reservoirs[q]; size_t n = res.n; @@ -454,14 +517,14 @@ struct ReservoirHandler : ResultHandlerCompare { one_a = 1 / normalizers[2 * q]; b = normalizers[2 * q + 1]; } - for (int i = 0; i < res.i; i++) { + for (size_t i = 0; i < res.i; i++) { perm[i] = i; } // indirect sort of result arrays std::sort(perm.begin(), perm.begin() + res.i, [&res](int i, int j) { return C::cmp(res.vals[j], res.vals[i]); }); - for (int i = 0; i < res.i; i++) { + for (size_t i = 0; i < res.i; i++) { heap_dis[i] = 
res.vals[perm[i]] * one_a + b; heap_ids[i] = res.ids[perm[i]]; } @@ -499,13 +562,17 @@ struct RangeHandler : ResultHandlerCompare { }; std::vector triplets; - RangeHandler(RangeSearchResult& rres, float radius, size_t ntotal) - : RHC(rres.nq, ntotal), rres(rres), radius(radius) { + RangeHandler( + RangeSearchResult& rres, + float radius, + size_t ntotal, + const IDSelector* sel_in) + : RHC(rres.nq, ntotal, sel_in), rres(rres), radius(radius) { thresholds.resize(nq); n_per_query.resize(nq + 1); } - virtual void begin(const float* norms) { + virtual void begin(const float* norms) override { normalizers = norms; for (int q = 0; q < nq; ++q) { thresholds[q] = @@ -528,13 +595,28 @@ struct RangeHandler : ResultHandlerCompare { d0.store(d32tab); d1.store(d32tab + 16); - while (lt_mask) { - // find first non-zero - int j = __builtin_ctz(lt_mask); - lt_mask -= 1 << j; - T dis = d32tab[j]; - n_per_query[q]++; - triplets.push_back({idx_t(q + q0), this->adjust_id(b, j), dis}); + if (this->sel != nullptr) { + while (lt_mask) { + // find first non-zero + int j = __builtin_ctz(lt_mask); + lt_mask -= 1 << j; + + auto real_idx = this->adjust_id(b, j); + if (this->sel->is_member(real_idx)) { + T dis = d32tab[j]; + n_per_query[q]++; + triplets.push_back({idx_t(q + q0), real_idx, dis}); + } + } + } else { + while (lt_mask) { + // find first non-zero + int j = __builtin_ctz(lt_mask); + lt_mask -= 1 << j; + T dis = d32tab[j]; + n_per_query[q]++; + triplets.push_back({idx_t(q + q0), this->adjust_id(b, j), dis}); + } } } @@ -578,8 +660,9 @@ struct PartialRangeHandler : RangeHandler { float radius, size_t ntotal, size_t q0, - size_t q1) - : RangeHandler(*pres.res, radius, ntotal), + size_t q1, + const IDSelector* sel_in) + : RangeHandler(*pres.res, radius, ntotal, sel_in), pres(pres) { nq = q1 - q0; this->q0 = q0; @@ -698,6 +781,7 @@ void dispatch_SIMDResultHanlder( FAISS_THROW_FMT("Unknown id size %d", res.sizeof_ids); } } + } // namespace simd_result_handlers } // namespace faiss diff --git a/faiss/index_factory.cpp b/faiss/index_factory.cpp index 0d61b73ecd..d88fe7b393 100644 --- a/faiss/index_factory.cpp +++ b/faiss/index_factory.cpp @@ -140,8 +140,9 @@ std::map sq_types = { {"SQ4", ScalarQuantizer::QT_4bit}, {"SQ6", ScalarQuantizer::QT_6bit}, {"SQfp16", ScalarQuantizer::QT_fp16}, + {"SQbf16", ScalarQuantizer::QT_bf16}, }; -const std::string sq_pattern = "(SQ4|SQ8|SQ6|SQfp16)"; +const std::string sq_pattern = "(SQ4|SQ8|SQ6|SQfp16|SQbf16)"; std::map aq_search_type = { {"_Nfloat", AdditiveQuantizer::ST_norm_float}, diff --git a/faiss/index_io.h b/faiss/index_io.h index 8d52ee1afd..3e77d0227c 100644 --- a/faiss/index_io.h +++ b/faiss/index_io.h @@ -5,8 +5,6 @@ * LICENSE file in the root directory of this source tree. 
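The `sel` member threaded through every SIMD result handler above is what makes ID filtering work under fast-scan: candidates are checked with `sel->is_member(real_idx)` before being accepted, in a separate branch so the unfiltered path stays tight. Caller-side sketch, assuming the fast-scan index forwards `params->sel` to these handlers as this patch sets up:

```cpp
#include <faiss/IndexPQFastScan.h>
#include <faiss/impl/IDSelector.h>

void filtered_search(
        faiss::IndexPQFastScan& index,
        faiss::idx_t nq,
        const float* xq,
        faiss::idx_t k,
        float* D,
        faiss::idx_t* I) {
    faiss::IDSelectorRange sel(0, 1000); // only ids in [0, 1000) may match
    faiss::SearchParameters params;
    params.sel = &sel;
    index.search(nq, xq, k, D, I, &params);
}
```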
*/ -// -*- c++ -*- - // I/O code for indexes #ifndef FAISS_INDEX_IO_H @@ -35,9 +33,12 @@ struct IOReader; struct IOWriter; struct InvertedLists; -void write_index(const Index* idx, const char* fname); -void write_index(const Index* idx, FILE* f); -void write_index(const Index* idx, IOWriter* writer); +/// skip the storage for graph-based indexes +const int IO_FLAG_SKIP_STORAGE = 1; + +void write_index(const Index* idx, const char* fname, int io_flags = 0); +void write_index(const Index* idx, FILE* f, int io_flags = 0); +void write_index(const Index* idx, IOWriter* writer, int io_flags = 0); void write_index_binary(const IndexBinary* idx, const char* fname); void write_index_binary(const IndexBinary* idx, FILE* f); @@ -52,6 +53,12 @@ const int IO_FLAG_ONDISK_SAME_DIR = 4; const int IO_FLAG_SKIP_IVF_DATA = 8; // don't initialize precomputed table after loading const int IO_FLAG_SKIP_PRECOMPUTE_TABLE = 16; +// don't compute the sdc table for PQ-based indices +// this will prevent distances from being computed +// between elements in the index. For indices like HNSWPQ, +// this will prevent graph building because sdc +// computations are required to construct the graph +const int IO_FLAG_PQ_SKIP_SDC_TABLE = 32; // try to memmap data (useful to load an ArrayInvertedLists as an // OnDiskInvertedLists) const int IO_FLAG_MMAP = IO_FLAG_SKIP_IVF_DATA | 0x646f0000; diff --git a/faiss/invlists/BlockInvertedLists.cpp b/faiss/invlists/BlockInvertedLists.cpp index 6370d11871..dbdb0302dc 100644 --- a/faiss/invlists/BlockInvertedLists.cpp +++ b/faiss/invlists/BlockInvertedLists.cpp @@ -9,6 +9,7 @@ #include #include +#include #include #include @@ -54,7 +55,9 @@ size_t BlockInvertedLists::add_entries( codes[list_no].resize(n_block * block_size); if (o % block_size == 0) { // copy whole blocks - memcpy(&codes[list_no][o * code_size], code, n_block * block_size); + memcpy(&codes[list_no][o * packer->code_size], + code, + n_block * block_size); } else { FAISS_THROW_IF_NOT_MSG(packer, "missing code packer"); std::vector buffer(packer->code_size); @@ -76,6 +79,29 @@ const uint8_t* BlockInvertedLists::get_codes(size_t list_no) const { return codes[list_no].get(); } +size_t BlockInvertedLists::remove_ids(const IDSelector& sel) { + idx_t nremove = 0; +#pragma omp parallel for + for (idx_t i = 0; i < nlist; i++) { + std::vector buffer(packer->code_size); + idx_t l = ids[i].size(), j = 0; + while (j < l) { + if (sel.is_member(ids[i][j])) { + l--; + ids[i][j] = ids[i][l]; + packer->unpack_1(codes[i].data(), l, buffer.data()); + packer->pack_1(buffer.data(), j, codes[i].data()); + } else { + j++; + } + } + resize(i, l); + nremove += ids[i].size() - l; + } + + return nremove; +} + const idx_t* BlockInvertedLists::get_ids(size_t list_no) const { assert(list_no < nlist); return ids[list_no].data(); @@ -102,12 +128,6 @@ void BlockInvertedLists::update_entries( const idx_t*, const uint8_t*) { FAISS_THROW_MSG("not impemented"); - /* - assert (list_no < nlist); - assert (n_entry + offset <= ids[list_no].size()); - memcpy (&ids[list_no][offset], ids_in, sizeof(ids_in[0]) * n_entry); - memcpy (&codes[list_no][offset * code_size], codes_in, code_size * n_entry); - */ } BlockInvertedLists::~BlockInvertedLists() { diff --git a/faiss/invlists/BlockInvertedLists.h b/faiss/invlists/BlockInvertedLists.h index 8d8df720bf..2b9cbba455 100644 --- a/faiss/invlists/BlockInvertedLists.h +++ b/faiss/invlists/BlockInvertedLists.h @@ -15,6 +15,7 @@ namespace faiss { struct CodePacker; +struct IDSelector; /** Inverted Lists that are organized by 
  *
@@ -47,6 +48,8 @@ struct BlockInvertedLists : InvertedLists {
     size_t list_size(size_t list_no) const override;
     const uint8_t* get_codes(size_t list_no) const override;
     const idx_t* get_ids(size_t list_no) const override;

+    /// remove ids from the InvertedLists
+    size_t remove_ids(const IDSelector& sel);

     // works only on empty BlockInvertedLists
     // the codes should be of size ceil(n_entry / n_per_block) * block_size
diff --git a/faiss/invlists/DirectMap.cpp b/faiss/invlists/DirectMap.cpp
index 2b272922d5..dc2b92aa1c 100644
--- a/faiss/invlists/DirectMap.cpp
+++ b/faiss/invlists/DirectMap.cpp
@@ -15,6 +15,7 @@
 #include
 #include
 #include
+#include

 namespace faiss {

@@ -148,8 +149,12 @@ size_t DirectMap::remove_ids(const IDSelector& sel, InvertedLists* invlists) {
     std::vector toremove(nlist);

     size_t nremove = 0;
-
+    BlockInvertedLists* block_invlists =
+            dynamic_cast<BlockInvertedLists*>(invlists);
     if (type == NoMap) {
+        if (block_invlists != nullptr) {
+            return block_invlists->remove_ids(sel);
+        }
         // exhaustive scan of IVF
 #pragma omp parallel for
         for (idx_t i = 0; i < nlist; i++) {
@@ -178,6 +183,9 @@ size_t DirectMap::remove_ids(const IDSelector& sel, InvertedLists* invlists) {
             }
         }
     } else if (type == Hashtable) {
+        FAISS_THROW_IF_MSG(
+                block_invlists,
+                "remove with hashtable is not supported with BlockInvertedLists");
         const IDSelectorArray* sela = dynamic_cast<const IDSelectorArray*>(&sel);
         FAISS_THROW_IF_NOT_MSG(
diff --git a/faiss/invlists/InvertedLists.cpp b/faiss/invlists/InvertedLists.cpp
index cc337d004b..c2bfa2cabc 100644
--- a/faiss/invlists/InvertedLists.cpp
+++ b/faiss/invlists/InvertedLists.cpp
@@ -5,8 +5,6 @@
  * LICENSE file in the root directory of this source tree.
  */

-// -*- c++ -*-
-
 #include
 #include

@@ -24,18 +22,10 @@ InvertedListsIterator::~InvertedListsIterator() {}
 ******************************************/

 InvertedLists::InvertedLists(size_t nlist, size_t code_size)
-        : nlist(nlist), code_size(code_size), use_iterator(false) {}
+        : nlist(nlist), code_size(code_size) {}

 InvertedLists::~InvertedLists() {}

-bool InvertedLists::is_empty(size_t list_no, void* inverted_list_context)
-        const {
-    return use_iterator
-            ? !std::unique_ptr<InvertedListsIterator>(
-                       get_iterator(list_no, inverted_list_context))
-                       ->is_available()
-            : list_size(list_no) == 0;
-}
-
 idx_t InvertedLists::get_single_id(size_t list_no, size_t offset) const {
     assert(offset < list_size(list_no));
     const idx_t* ids = get_ids(list_no);
@@ -78,12 +68,6 @@ void InvertedLists::reset() {
     }
 }

-InvertedListsIterator* InvertedLists::get_iterator(
-        size_t /*list_no*/,
-        void* /*inverted_list_context*/) const {
-    FAISS_THROW_MSG("get_iterator is not supported");
-}
-
 void InvertedLists::merge_from(InvertedLists* oivf, size_t add_id) {
 #pragma omp parallel for
     for (idx_t i = 0; i < nlist; i++) {
@@ -233,6 +217,54 @@ size_t InvertedLists::compute_ntotal() const {
     return tot;
 }

+bool InvertedLists::is_empty(size_t list_no, void* inverted_list_context)
+        const {
+    if (use_iterator) {
+        return !std::unique_ptr<InvertedListsIterator>(
+                        get_iterator(list_no, inverted_list_context))
+                        ->is_available();
+    } else {
+        FAISS_THROW_IF_NOT(inverted_list_context == nullptr);
+        return list_size(list_no) == 0;
+    }
+}
+
+// implement an iterator on top of get_codes / get_ids
+namespace {
+
+struct CodeArrayIterator : InvertedListsIterator {
+    size_t list_size;
+    size_t code_size;
+    InvertedLists::ScopedCodes codes;
+    InvertedLists::ScopedIds ids;
+    size_t idx = 0;
+
+    CodeArrayIterator(const InvertedLists* il, size_t list_no)
+            : list_size(il->list_size(list_no)),
+              code_size(il->code_size),
+              codes(il, list_no),
+              ids(il, list_no) {}
+
+    bool is_available() const override {
+        return idx < list_size;
+    }
+    void next() override {
+        idx++;
+    }
+    std::pair<idx_t, const uint8_t*> get_id_and_codes() override {
+        return {ids[idx], codes.get() + code_size * idx};
+    }
+};
+
+} // namespace
+
+InvertedListsIterator* InvertedLists::get_iterator(
+        size_t list_no,
+        void* inverted_list_context) const {
+    FAISS_THROW_IF_NOT(inverted_list_context == nullptr);
+    return new CodeArrayIterator(this, list_no);
+}
+
 /*****************************************
  * ArrayInvertedLists implementation
  ******************************************/
@@ -264,6 +296,12 @@ size_t ArrayInvertedLists::list_size(size_t list_no) const {
     return ids[list_no].size();
 }

+bool ArrayInvertedLists::is_empty(size_t list_no, void* inverted_list_context)
+        const {
+    FAISS_THROW_IF_NOT(inverted_list_context == nullptr);
+    return ids[list_no].size() == 0;
+}
+
 const uint8_t* ArrayInvertedLists::get_codes(size_t list_no) const {
     assert(list_no < nlist);
     return codes[list_no].data();
diff --git a/faiss/invlists/InvertedLists.h b/faiss/invlists/InvertedLists.h
index 90a9d65411..b24700fad1 100644
--- a/faiss/invlists/InvertedLists.h
+++ b/faiss/invlists/InvertedLists.h
@@ -37,7 +37,9 @@ struct InvertedListsIterator {
 struct InvertedLists {
     size_t nlist;     ///< number of possible key values
     size_t code_size; ///< code size per vector in bytes
-    bool use_iterator;
+
+    /// request to use iterator rather than get_codes / get_ids
+    bool use_iterator = false;

     InvertedLists(size_t nlist, size_t code_size);

@@ -50,17 +52,9 @@ struct InvertedLists {
     /*************************
      *  Read only functions */

-    // check if the list is empty
-    bool is_empty(size_t list_no, void* inverted_list_context) const;
-
     /// get the size of a list
     virtual size_t list_size(size_t list_no) const = 0;

-    /// get iterable for lists that use_iterator
-    virtual InvertedListsIterator* get_iterator(
-            size_t list_no,
-            void* inverted_list_context) const;
-
     /** get the codes for an inverted list
      * must be released by release_codes
      *
@@ -92,6 +86,18 @@ struct InvertedLists {
     /// a list can be -1 hence the signed long
     virtual
void prefetch_lists(const idx_t* list_nos, int nlist) const; + /***************************************** + * Iterator interface (with context) */ + + /// check if the list is empty + virtual bool is_empty(size_t list_no, void* inverted_list_context = nullptr) + const; + + /// get iterable for lists that use_iterator + virtual InvertedListsIterator* get_iterator( + size_t list_no, + void* inverted_list_context = nullptr) const; + /************************* * writing functions */ @@ -262,6 +268,9 @@ struct ArrayInvertedLists : InvertedLists { /// permute the inverted lists, map maps new_id to old_id void permute_invlists(const idx_t* map); + bool is_empty(size_t list_no, void* inverted_list_context = nullptr) + const override; + ~ArrayInvertedLists() override; }; diff --git a/faiss/invlists/OnDiskInvertedLists.cpp b/faiss/invlists/OnDiskInvertedLists.cpp index 3017d164c6..8565572a9b 100644 --- a/faiss/invlists/OnDiskInvertedLists.cpp +++ b/faiss/invlists/OnDiskInvertedLists.cpp @@ -394,8 +394,8 @@ const idx_t* OnDiskInvertedLists::get_ids(size_t list_no) const { return nullptr; } - return ( - const idx_t*)(ptr + lists[list_no].offset + code_size * lists[list_no].capacity); + return (const idx_t*)(ptr + lists[list_no].offset + + code_size * lists[list_no].capacity); } void OnDiskInvertedLists::update_entries( @@ -565,15 +565,16 @@ void OnDiskInvertedLists::free_slot(size_t offset, size_t capacity) { /***************************************** * Compact form *****************************************/ - -size_t OnDiskInvertedLists::merge_from( +size_t OnDiskInvertedLists::merge_from_multiple( const InvertedLists** ils, int n_il, + bool shift_ids, bool verbose) { FAISS_THROW_IF_NOT_MSG( totsize == 0, "works only on an empty InvertedLists"); std::vector sizes(nlist); + std::vector shift_id_offsets(n_il); for (int i = 0; i < n_il; i++) { const InvertedLists* il = ils[i]; FAISS_THROW_IF_NOT(il->nlist == nlist && il->code_size == code_size); @@ -581,6 +582,10 @@ size_t OnDiskInvertedLists::merge_from( for (size_t j = 0; j < nlist; j++) { sizes[j] += il->list_size(j); } + + size_t il_totsize = il->compute_ntotal(); + shift_id_offsets[i] = + (shift_ids && i > 0) ? 
shift_id_offsets[i - 1] + il_totsize : 0; } size_t cums = 0; @@ -605,11 +610,21 @@ size_t OnDiskInvertedLists::merge_from( const InvertedLists* il = ils[i]; size_t n_entry = il->list_size(j); l.size += n_entry; + ScopedIds scope_ids(il, j); + const idx_t* scope_ids_data = scope_ids.get(); + std::vector new_ids; + if (shift_ids) { + new_ids.resize(n_entry); + for (size_t k = 0; k < n_entry; k++) { + new_ids[k] = scope_ids[k] + shift_id_offsets[i]; + } + scope_ids_data = new_ids.data(); + } update_entries( j, l.size - n_entry, n_entry, - ScopedIds(il, j).get(), + scope_ids_data, ScopedCodes(il, j).get()); } assert(l.size == l.capacity); @@ -638,7 +653,7 @@ size_t OnDiskInvertedLists::merge_from( size_t OnDiskInvertedLists::merge_from_1( const InvertedLists* ils, bool verbose) { - return merge_from(&ils, 1, verbose); + return merge_from_multiple(&ils, 1, verbose); } void OnDiskInvertedLists::crop_invlists(size_t l0, size_t l1) { diff --git a/faiss/invlists/OnDiskInvertedLists.h b/faiss/invlists/OnDiskInvertedLists.h index 98cb653a7a..01c7f3481e 100644 --- a/faiss/invlists/OnDiskInvertedLists.h +++ b/faiss/invlists/OnDiskInvertedLists.h @@ -101,9 +101,10 @@ struct OnDiskInvertedLists : InvertedLists { // copy all inverted lists into *this, in compact form (without // allocating slots) - size_t merge_from( + size_t merge_from_multiple( const InvertedLists** ils, int n_il, + bool shift_ids = false, bool verbose = false); /// same as merge_from for a single invlist diff --git a/faiss/python/CMakeLists.txt b/faiss/python/CMakeLists.txt index 8bca710f5f..0073c20e04 100644 --- a/faiss/python/CMakeLists.txt +++ b/faiss/python/CMakeLists.txt @@ -38,6 +38,11 @@ macro(configure_swigfaiss source) set_source_files_properties(${source} PROPERTIES COMPILE_DEFINITIONS GPU_WRAPPER ) + if (FAISS_ENABLE_RAFT) + set_property(SOURCE ${source} APPEND PROPERTY + COMPILE_DEFINITIONS FAISS_ENABLE_RAFT + ) + endif() endif() endmacro() @@ -67,11 +72,20 @@ else() find_package(faiss REQUIRED) endif() +if (${CMAKE_SYSTEM_NAME} MATCHES "AIX") +swig_add_library(swigfaiss + TYPE MODULE + LANGUAGE python + SOURCES swigfaiss.swig +) +else () swig_add_library(swigfaiss TYPE SHARED LANGUAGE python SOURCES swigfaiss.swig ) +endif() + set_property(TARGET swigfaiss PROPERTY SWIG_COMPILE_OPTIONS -doxygen) set_property(SOURCE swigfaiss_avx2.swig @@ -160,6 +174,10 @@ set_property(TARGET faiss_python_callbacks PROPERTY POSITION_INDEPENDENT_CODE ON ) +if (${CMAKE_SYSTEM_NAME} MATCHES "AIX") +target_link_libraries(faiss_python_callbacks PRIVATE faiss) +endif() + # Hack so that python_callbacks.h can be included as # `#include `. target_include_directories(faiss_python_callbacks PRIVATE ${PROJECT_SOURCE_DIR}/../..) 
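
For readers merging sharded IVF indexes: a minimal sketch (not the C++ implementation) of the id-shifting idea behind the new `shift_ids` flag of `merge_from_multiple`. Each shard's ids receive a cumulative offset derived from the shard sizes so that ids stay unique after the merge; this assumes each shard numbered its own vectors 0 .. ntotal-1, the usual case when shards were built without explicit ids. `shards` and `shift_offsets` below are hypothetical names, not part of the FAISS API:

```python
def shift_offsets(shards):
    # shards: InvertedLists-like objects exposing compute_ntotal();
    # shard i is offset by the number of ids stored before it
    offsets, acc = [], 0
    for il in shards:
        offsets.append(acc)
        acc += il.compute_ntotal()
    return offsets  # shard i's ids are written as id + offsets[i]
```
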
diff --git a/faiss/python/__init__.py b/faiss/python/__init__.py index 95be4254dc..ce4b42c618 100644 --- a/faiss/python/__init__.py +++ b/faiss/python/__init__.py @@ -292,10 +292,10 @@ def range_search_with_parameters(index, x, radius, params=None, output_stats=Fal ########################################### -def serialize_index(index): +def serialize_index(index, io_flags=0): """ convert an index to a numpy uint8 array """ writer = VectorIOWriter() - write_index(index, writer) + write_index(index, writer, io_flags) return vector_to_array(writer.data) @@ -316,3 +316,14 @@ def deserialize_index_binary(data): reader = VectorIOReader() copy_array_to_vector(data, reader.data) return read_index_binary(reader) + + +class TimeoutGuard: + def __init__(self, timeout_in_seconds: float): + self.timeout = timeout_in_seconds + + def __enter__(self): + TimeoutCallback.reset(self.timeout) + + def __exit__(self, exc_type, exc_value, traceback): + PythonInterruptCallback.reset() diff --git a/faiss/python/class_wrappers.py b/faiss/python/class_wrappers.py index 4a6808d286..4af2345009 100644 --- a/faiss/python/class_wrappers.py +++ b/faiss/python/class_wrappers.py @@ -956,10 +956,44 @@ def replacement_remove_ids(self, x): sel = IDSelectorBatch(x.size, swig_ptr(x)) return self.remove_ids_c(sel) + def replacement_assign(self, x, k, labels=None): + """Find the k nearest neighbors of the set of vectors x in the index. + This is the same as the `search` method, but discards the distances. + + Parameters + ---------- + x : array_like + Query vectors, shape (n, d) where d is appropriate for the index. + `dtype` must be uint8. + k : int + Number of nearest neighbors. + labels : array_like, optional + Labels array to store the results. + + Returns + ------- + labels: array_like + Labels of the nearest neighbors, shape (n, k). 
+ When not enough results are found, the label is set to -1 + """ + n, d = x.shape + x = _check_dtype_uint8(x) + assert d == self.code_size + assert k > 0 + + if labels is None: + labels = np.empty((n, k), dtype=np.int64) + else: + assert labels.shape == (n, k) + + self.assign_c(n, swig_ptr(x), swig_ptr(labels), k) + return labels + replace_method(the_class, 'add', replacement_add) replace_method(the_class, 'add_with_ids', replacement_add_with_ids) replace_method(the_class, 'train', replacement_train) replace_method(the_class, 'search', replacement_search) + replace_method(the_class, 'assign', replacement_assign) replace_method(the_class, 'range_search', replacement_range_search) replace_method(the_class, 'reconstruct', replacement_reconstruct) replace_method(the_class, 'reconstruct_n', replacement_reconstruct_n) diff --git a/faiss/python/extra_wrappers.py b/faiss/python/extra_wrappers.py index d7fd05bc9f..a037b0280f 100644 --- a/faiss/python/extra_wrappers.py +++ b/faiss/python/extra_wrappers.py @@ -330,7 +330,7 @@ def lookup(self, keys): # KNN function ###################################################### -def knn(xq, xb, k, metric=METRIC_L2): +def knn(xq, xb, k, metric=METRIC_L2, metric_arg=0.0): """ Compute the k nearest neighbors of a vector without constructing an index @@ -374,10 +374,16 @@ def knn(xq, xb, k, metric=METRIC_L2): swig_ptr(xq), swig_ptr(xb), d, nq, nb, k, swig_ptr(D), swig_ptr(I) ) - else: - raise NotImplementedError("only L2 and INNER_PRODUCT are supported") + else: + knn_extra_metrics( + swig_ptr(xq), swig_ptr(xb), + d, nq, nb, metric, metric_arg, k, + swig_ptr(D), swig_ptr(I) + ) + return D, I + def knn_hamming(xq, xb, k, variant="hc"): """ Compute the k nearest neighbors of a set of vectors without constructing an index. diff --git a/faiss/python/python_callbacks.cpp b/faiss/python/python_callbacks.cpp index bfcf883aec..06b5c18cfc 100644 --- a/faiss/python/python_callbacks.cpp +++ b/faiss/python/python_callbacks.cpp @@ -46,7 +46,7 @@ size_t PyCallbackIOWriter::operator()( size_t wi = ws > bs ? bs : ws; PyObject* result = PyObject_CallFunction( callback, "(N)", PyBytes_FromStringAndSize(ptr, wi)); - if (result == NULL) { + if (result == nullptr) { FAISS_THROW_MSG("py err"); } // TODO check nb of bytes written @@ -77,7 +77,7 @@ size_t PyCallbackIOReader::operator()(void* ptrv, size_t size, size_t nitems) { while (rs > 0) { size_t ri = rs > bs ? 
bs : rs;
         PyObject* result = PyObject_CallFunction(callback, "(n)", ri);
-        if (result == NULL) {
+        if (result == nullptr) {
             FAISS_THROW_MSG("propagate py error");
         }
         if (!PyBytes_Check(result)) {
@@ -122,7 +122,7 @@ bool PyCallbackIDSelector::is_member(faiss::idx_t id) const {
     FAISS_THROW_IF_NOT((id >> 32) == 0);
     PyThreadLock gil;
     PyObject* result = PyObject_CallFunction(callback, "(n)", int(id));
-    if (result == NULL) {
+    if (result == nullptr) {
         FAISS_THROW_MSG("propagate py error");
     }
     bool b = PyObject_IsTrue(result);
diff --git a/faiss/python/setup.py b/faiss/python/setup.py
index 3b4f2e9c83..939aeeffbe 100644
--- a/faiss/python/setup.py
+++ b/faiss/python/setup.py
@@ -60,7 +60,7 @@ """
 setup(
     name='faiss',
-    version='1.7.4',
+    version='1.8.0',
     description='A library for efficient similarity search and clustering of dense vectors',
     long_description=long_description,
     url='https://github.com/facebookresearch/faiss',
diff --git a/faiss/python/swigfaiss.swig b/faiss/python/swigfaiss.swig
index fb7f50dd2e..74a371f6cd 100644
--- a/faiss/python/swigfaiss.swig
+++ b/faiss/python/swigfaiss.swig
@@ -304,6 +304,7 @@ void gpu_sync_all_devices();
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -557,6 +558,9 @@ struct faiss::simd16uint16 {};
 %include
 %include
 %include
+#ifdef FAISS_ENABLE_RAFT
+%include
+#endif
 %include
 %include
 %include
@@ -673,6 +677,9 @@ struct faiss::simd16uint16 {};
         DOWNCAST ( IndexRowwiseMinMax )
         DOWNCAST ( IndexRowwiseMinMaxFP16 )
 #ifdef GPU_WRAPPER
+#ifdef FAISS_ENABLE_RAFT
+        DOWNCAST_GPU ( GpuIndexCagra )
+#endif
         DOWNCAST_GPU ( GpuIndexIVFPQ )
         DOWNCAST_GPU ( GpuIndexIVFFlat )
         DOWNCAST_GPU ( GpuIndexIVFScalarQuantizer )
@@ -1022,14 +1029,17 @@ PyObject *swig_ptr (PyObject *a)
         return SWIG_NewPointerObj(data, SWIGTYPE_p_bool, 0);
     }
     if(PyArray_TYPE(ao) == NPY_UINT64) {
-#ifdef SWIGWORDSIZE64
+        // Convert NPY_UINT64 to either unsigned long or unsigned long long,
+        // depending on how the compiler defines int64_t. On 64-bit machines
+        // int64_t is typically long, but that does not hold on Apple macOS,
+        // where it is long long.
+#if __SIZEOF_LONG__ == 8 && !defined(__APPLE__)
         return SWIG_NewPointerObj(data, SWIGTYPE_p_unsigned_long, 0);
 #else
         return SWIG_NewPointerObj(data, SWIGTYPE_p_unsigned_long_long, 0);
 #endif
     }
     if(PyArray_TYPE(ao) == NPY_INT64) {
-#ifdef SWIGWORDSIZE64
+#if __SIZEOF_LONG__ == 8 && !defined(__APPLE__)
         return SWIG_NewPointerObj(data, SWIGTYPE_p_long, 0);
 #else
         return SWIG_NewPointerObj(data, SWIGTYPE_p_long_long, 0);
@@ -1038,7 +1048,9 @@ PyObject *swig_ptr (PyObject *a)
     PyErr_SetString(PyExc_ValueError, "did not recognize array type");
     return NULL;
 }
+%}

+%inline %{

 struct PythonInterruptCallback: faiss::InterruptCallback {

@@ -1053,18 +1065,18 @@ struct PythonInterruptCallback: faiss::InterruptCallback {
         return err == -1;
     }

+    static void reset() {
+        faiss::InterruptCallback::instance.reset(new PythonInterruptCallback());
+    }
 };

-
 %}

-
 %init %{
     /* needed, else crash at runtime */
     import_array();

-    faiss::InterruptCallback::instance.reset(new PythonInterruptCallback());
-
+    PythonInterruptCallback::reset();
 %}

 // return a pointer usable as input for functions that expect pointers
@@ -1121,15 +1133,8 @@ int * cast_integer_to_int_ptr (int64_t x) {
 void * cast_integer_to_void_ptr (int64_t x) {
     return (void*)x;
 }
-
 %}
-
-
-
-
-
-
 %inline %{

 void wait() {
     // in gdb, use return to get out of this function
diff --git a/faiss/utils/bf16.h b/faiss/utils/bf16.h
new file mode 100644
index 0000000000..ff0fbe898b
--- /dev/null
+++ b/faiss/utils/bf16.h
@@ -0,0 +1,36 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include
+
+namespace faiss {
+
+namespace {
+
+union fp32_bits {
+    uint32_t as_u32;
+    float as_f32;
+};
+
+} // namespace
+
+inline uint16_t encode_bf16(const float f) {
+    // round to nearest rather than truncate
+    fp32_bits fp;
+    fp.as_f32 = f;
+    return static_cast<uint16_t>((fp.as_u32 + 0x8000) >> 16);
+}
+
+inline float decode_bf16(const uint16_t v) {
+    fp32_bits fp;
+    fp.as_u32 = (uint32_t(v) << 16);
+    return fp.as_f32;
+}
+
+} // namespace faiss
diff --git a/faiss/utils/distances.cpp b/faiss/utils/distances.cpp
index 13ecc5d661..e00020e205 100644
--- a/faiss/utils/distances.cpp
+++ b/faiss/utils/distances.cpp
@@ -145,7 +145,7 @@ void exhaustive_inner_product_seq(
         const IDSelector* sel = nullptr) {
     using SingleResultHandler =
             typename BlockResultHandler::SingleResultHandler;
-    int nt = std::min(int(nx), omp_get_max_threads());
+    [[maybe_unused]] int nt = std::min(int(nx), omp_get_max_threads());

     FAISS_ASSERT(use_sel == (sel != nullptr));

@@ -216,7 +216,7 @@ void exhaustive_L2sqr_seq(
         const IDSelector* sel = nullptr) {
     using SingleResultHandler =
             typename BlockResultHandler::SingleResultHandler;
-    int nt = std::min(int(nx), omp_get_max_threads());
+    [[maybe_unused]] int nt = std::min(int(nx), omp_get_max_threads());

     FAISS_ASSERT(use_sel == (sel != nullptr));

diff --git a/faiss/utils/extra_distances-inl.h b/faiss/utils/extra_distances-inl.h
index d3768df668..3171580f8c 100644
--- a/faiss/utils/extra_distances-inl.h
+++ b/faiss/utils/extra_distances-inl.h
@@ -10,6 +10,7 @@
 #include
 #include
+#include
 #include

 namespace faiss {

@@ -130,4 +131,35 @@ inline float VectorDistance<METRIC_Jaccard>::operator()(
     return accu_num / accu_den;
 }

+template <>
+inline float VectorDistance<METRIC_NaNEuclidean>::operator()(
+        const float* x,
+        const float* y) const {
+    //
https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise.nan_euclidean_distances.html + float accu = 0; + size_t present = 0; + for (size_t i = 0; i < d; i++) { + if (!std::isnan(x[i]) && !std::isnan(y[i])) { + float diff = x[i] - y[i]; + accu += diff * diff; + present++; + } + } + if (present == 0) { + return NAN; + } + return float(d) / float(present) * accu; +} + +template <> +inline float VectorDistance::operator()( + const float* x, + const float* y) const { + float accu = 0; + for (size_t i = 0; i < d; i++) { + accu += fabs(x[i] * y[i]); + } + return accu; +} + } // namespace faiss diff --git a/faiss/utils/extra_distances.cpp b/faiss/utils/extra_distances.cpp index 8c0699880d..407057e58e 100644 --- a/faiss/utils/extra_distances.cpp +++ b/faiss/utils/extra_distances.cpp @@ -50,16 +50,18 @@ void pairwise_extra_distances_template( } } -template +template void knn_extra_metrics_template( VD vd, const float* x, const float* y, size_t nx, size_t ny, - HeapArray* res) { - size_t k = res->k; + size_t k, + float* distances, + int64_t* labels) { size_t d = vd.d; + using C = typename VD::C; size_t check_period = InterruptCallback::get_period_hint(ny * d); check_period *= omp_get_max_threads(); @@ -71,18 +73,15 @@ void knn_extra_metrics_template( const float* x_i = x + i * d; const float* y_j = y; size_t j; - float* simi = res->get_val(i); - int64_t* idxi = res->get_ids(i); + float* simi = distances + k * i; + int64_t* idxi = labels + k * i; // maxheap_heapify(k, simi, idxi); heap_heapify(k, simi, idxi); for (j = 0; j < ny; j++) { float disij = vd(x_i, y_j); - // if (disij < simi[0]) { - if ((!vd.is_similarity && (disij < simi[0])) || - (vd.is_similarity && (disij > simi[0]))) { - // maxheap_replace_top(k, simi, idxi, disij, j); + if (C::cmp(simi[0], disij)) { heap_replace_top(k, simi, idxi, disij, j); } y_j += d; @@ -164,13 +163,14 @@ void pairwise_extra_distances( HANDLE_VAR(JensenShannon); HANDLE_VAR(Lp); HANDLE_VAR(Jaccard); + HANDLE_VAR(NaNEuclidean); + HANDLE_VAR(ABS_INNER_PRODUCT); #undef HANDLE_VAR default: FAISS_THROW_MSG("metric type not implemented"); } } -template void knn_extra_metrics( const float* x, const float* y, @@ -179,13 +179,15 @@ void knn_extra_metrics( size_t ny, MetricType mt, float metric_arg, - HeapArray* res) { + size_t k, + float* distances, + int64_t* indexes) { switch (mt) { -#define HANDLE_VAR(kw) \ - case METRIC_##kw: { \ - VectorDistance vd = {(size_t)d, metric_arg}; \ - knn_extra_metrics_template(vd, x, y, nx, ny, res); \ - break; \ +#define HANDLE_VAR(kw) \ + case METRIC_##kw: { \ + VectorDistance vd = {(size_t)d, metric_arg}; \ + knn_extra_metrics_template(vd, x, y, nx, ny, k, distances, indexes); \ + break; \ } HANDLE_VAR(L2); HANDLE_VAR(L1); @@ -195,32 +197,14 @@ void knn_extra_metrics( HANDLE_VAR(JensenShannon); HANDLE_VAR(Lp); HANDLE_VAR(Jaccard); + HANDLE_VAR(NaNEuclidean); + HANDLE_VAR(ABS_INNER_PRODUCT); #undef HANDLE_VAR default: FAISS_THROW_MSG("metric type not implemented"); } } -template void knn_extra_metrics>( - const float* x, - const float* y, - size_t d, - size_t nx, - size_t ny, - MetricType mt, - float metric_arg, - HeapArray>* res); - -template void knn_extra_metrics>( - const float* x, - const float* y, - size_t d, - size_t nx, - size_t ny, - MetricType mt, - float metric_arg, - HeapArray>* res); - FlatCodesDistanceComputer* get_extra_distance_computer( size_t d, MetricType mt, @@ -242,6 +226,8 @@ FlatCodesDistanceComputer* get_extra_distance_computer( HANDLE_VAR(JensenShannon); HANDLE_VAR(Lp); HANDLE_VAR(Jaccard); + 
HANDLE_VAR(NaNEuclidean);
+        HANDLE_VAR(ABS_INNER_PRODUCT);
 #undef HANDLE_VAR
         default:
             FAISS_THROW_MSG("metric type not implemented");
diff --git a/faiss/utils/extra_distances.h b/faiss/utils/extra_distances.h
index 79b65bc1e9..f8b47cfba5 100644
--- a/faiss/utils/extra_distances.h
+++ b/faiss/utils/extra_distances.h
@@ -33,7 +33,6 @@ void pairwise_extra_distances(
         int64_t ldb = -1,
         int64_t ldd = -1);

-template
 void knn_extra_metrics(
         const float* x,
         const float* y,
@@ -42,7 +41,9 @@
         size_t ny,
         MetricType mt,
         float metric_arg,
-        HeapArray* res);
+        size_t k,
+        float* distances,
+        int64_t* indexes);

 /** get a DistanceComputer that refers to this type of distance and
  * indexes a flat array of size nb */
diff --git a/faiss/utils/simdlib.h b/faiss/utils/simdlib.h
index 27e9cc59f5..ea5020d719 100644
--- a/faiss/utils/simdlib.h
+++ b/faiss/utils/simdlib.h
@@ -14,7 +14,12 @@
  * functions.
  */

-#ifdef __AVX2__
+#if defined(__AVX512F__)
+
+#include
+#include
+
+#elif defined(__AVX2__)

 #include

@@ -22,6 +27,10 @@

 #include

+#elif defined(__PPC64__)
+
+#include
+
 #else

 // emulated = all operations are implemented as scalars
diff --git a/faiss/utils/simdlib_avx512.h b/faiss/utils/simdlib_avx512.h
new file mode 100644
index 0000000000..9ce0965895
--- /dev/null
+++ b/faiss/utils/simdlib_avx512.h
@@ -0,0 +1,296 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include
+#include
+
+#include
+
+#include
+
+#include
+
+namespace faiss {
+
+/** Simple wrapper around the AVX 512-bit registers
+ *
+ * The objective is to separate the different interpretations of the same
+ * registers (as a vector of uint8, uint16 or uint32), to provide printing
+ * functions, and to give more readable names to the AVX intrinsics. It does
+ * not pretend to be exhaustive; functions are added as needed.
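
The two metrics registered above have simple scalar semantics; here is a numpy model of the `VectorDistance` specializations (a sketch of the math only, not the FAISS implementation — the function names are ours):

```python
import numpy as np

def nan_euclidean(x, y):
    # squared L2 over the dimensions where both values are present,
    # rescaled by d / n_present; NaN when no dimension is present
    mask = ~np.isnan(x) & ~np.isnan(y)
    present = int(mask.sum())
    if present == 0:
        return float("nan")
    diff = x[mask] - y[mask]
    return len(x) / present * float(np.dot(diff, diff))

def abs_inner_product(x, y):
    # sum of |x_i * y_i|, per the specialization in extra_distances-inl.h
    return float(np.abs(x * y).sum())
```

With the extended `knn` wrapper in extra_wrappers.py these become reachable from Python as, e.g., `faiss.knn(xq, xb, k, metric=faiss.METRIC_NaNEuclidean)` — assuming the corresponding `MetricType` enum values are exposed by the build.
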
+ */ + +/// 512-bit representation without interpretation as a vector +struct simd512bit { + union { + __m512i i; + __m512 f; + }; + + simd512bit() {} + + explicit simd512bit(__m512i i) : i(i) {} + + explicit simd512bit(__m512 f) : f(f) {} + + explicit simd512bit(const void* x) + : i(_mm512_loadu_si512((__m512i const*)x)) {} + + // sets up a lower half of the register while keeping upper one as zero + explicit simd512bit(simd256bit lo) + : simd512bit(_mm512_inserti32x8( + _mm512_castsi256_si512(lo.i), + _mm256_setzero_si256(), + 1)) {} + + // constructs from lower and upper halves + explicit simd512bit(simd256bit lo, simd256bit hi) + : simd512bit(_mm512_inserti32x8( + _mm512_castsi256_si512(lo.i), + hi.i, + 1)) {} + + void clear() { + i = _mm512_setzero_si512(); + } + + void storeu(void* ptr) const { + _mm512_storeu_si512((__m512i*)ptr, i); + } + + void loadu(const void* ptr) { + i = _mm512_loadu_si512((__m512i*)ptr); + } + + void store(void* ptr) const { + _mm512_storeu_si512((__m512i*)ptr, i); + } + + void bin(char bits[513]) const { + char bytes[64]; + storeu((void*)bytes); + for (int i = 0; i < 512; i++) { + bits[i] = '0' + ((bytes[i / 8] >> (i % 8)) & 1); + } + bits[512] = 0; + } + + std::string bin() const { + char bits[257]; + bin(bits); + return std::string(bits); + } +}; + +/// vector of 32 elements in uint16 +struct simd32uint16 : simd512bit { + simd32uint16() {} + + explicit simd32uint16(__m512i i) : simd512bit(i) {} + + explicit simd32uint16(int x) : simd512bit(_mm512_set1_epi16(x)) {} + + explicit simd32uint16(uint16_t x) : simd512bit(_mm512_set1_epi16(x)) {} + + explicit simd32uint16(simd512bit x) : simd512bit(x) {} + + explicit simd32uint16(const uint16_t* x) : simd512bit((const void*)x) {} + + // sets up a lower half of the register + explicit simd32uint16(simd256bit lo) : simd512bit(lo) {} + + // constructs from lower and upper halves + explicit simd32uint16(simd256bit lo, simd256bit hi) : simd512bit(lo, hi) {} + + std::string elements_to_string(const char* fmt) const { + uint16_t bytes[32]; + storeu((void*)bytes); + char res[2000]; + char* ptr = res; + for (int i = 0; i < 32; i++) { + ptr += sprintf(ptr, fmt, bytes[i]); + } + // strip last , + ptr[-1] = 0; + return std::string(res); + } + + std::string hex() const { + return elements_to_string("%02x,"); + } + + std::string dec() const { + return elements_to_string("%3d,"); + } + + void set1(uint16_t x) { + i = _mm512_set1_epi16((short)x); + } + + simd32uint16 operator*(const simd32uint16& other) const { + return simd32uint16(_mm512_mullo_epi16(i, other.i)); + } + + // shift must be known at compile time + simd32uint16 operator>>(const int shift) const { + return simd32uint16(_mm512_srli_epi16(i, shift)); + } + + // shift must be known at compile time + simd32uint16 operator<<(const int shift) const { + return simd32uint16(_mm512_slli_epi16(i, shift)); + } + + simd32uint16 operator+=(simd32uint16 other) { + i = _mm512_add_epi16(i, other.i); + return *this; + } + + simd32uint16 operator-=(simd32uint16 other) { + i = _mm512_sub_epi16(i, other.i); + return *this; + } + + simd32uint16 operator+(simd32uint16 other) const { + return simd32uint16(_mm512_add_epi16(i, other.i)); + } + + simd32uint16 operator-(simd32uint16 other) const { + return simd32uint16(_mm512_sub_epi16(i, other.i)); + } + + simd32uint16 operator&(simd512bit other) const { + return simd32uint16(_mm512_and_si512(i, other.i)); + } + + simd32uint16 operator|(simd512bit other) const { + return simd32uint16(_mm512_or_si512(i, other.i)); + } + + simd32uint16 
operator^(simd512bit other) const { + return simd32uint16(_mm512_xor_si512(i, other.i)); + } + + simd32uint16 operator~() const { + return simd32uint16(_mm512_xor_si512(i, _mm512_set1_epi32(-1))); + } + + simd16uint16 low() const { + return simd16uint16(_mm512_castsi512_si256(i)); + } + + simd16uint16 high() const { + return simd16uint16(_mm512_extracti32x8_epi32(i, 1)); + } + + // for debugging only + uint16_t operator[](int i) const { + ALIGNED(64) uint16_t tab[32]; + store(tab); + return tab[i]; + } + + void accu_min(simd32uint16 incoming) { + i = _mm512_min_epu16(i, incoming.i); + } + + void accu_max(simd32uint16 incoming) { + i = _mm512_max_epu16(i, incoming.i); + } +}; + +// decompose in 128-lanes: a = (a0, a1, a2, a3), b = (b0, b1, b2, b3) +// return (a0 + a1 + a2 + a3, b0 + b1 + b2 + b3) +inline simd16uint16 combine4x2(simd32uint16 a, simd32uint16 b) { + return combine2x2(a.low(), b.low()) + combine2x2(a.high(), b.high()); +} + +// vector of 32 unsigned 8-bit integers +struct simd64uint8 : simd512bit { + simd64uint8() {} + + explicit simd64uint8(__m512i i) : simd512bit(i) {} + + explicit simd64uint8(int x) : simd512bit(_mm512_set1_epi8(x)) {} + + explicit simd64uint8(uint8_t x) : simd512bit(_mm512_set1_epi8(x)) {} + + // sets up a lower half of the register + explicit simd64uint8(simd256bit lo) : simd512bit(lo) {} + + // constructs from lower and upper halves + explicit simd64uint8(simd256bit lo, simd256bit hi) : simd512bit(lo, hi) {} + + explicit simd64uint8(simd512bit x) : simd512bit(x) {} + + explicit simd64uint8(const uint8_t* x) : simd512bit((const void*)x) {} + + std::string elements_to_string(const char* fmt) const { + uint8_t bytes[64]; + storeu((void*)bytes); + char res[2000]; + char* ptr = res; + for (int i = 0; i < 64; i++) { + ptr += sprintf(ptr, fmt, bytes[i]); + } + // strip last , + ptr[-1] = 0; + return std::string(res); + } + + std::string hex() const { + return elements_to_string("%02x,"); + } + + std::string dec() const { + return elements_to_string("%3d,"); + } + + void set1(uint8_t x) { + i = _mm512_set1_epi8((char)x); + } + + simd64uint8 operator&(simd512bit other) const { + return simd64uint8(_mm512_and_si512(i, other.i)); + } + + simd64uint8 operator+(simd64uint8 other) const { + return simd64uint8(_mm512_add_epi8(i, other.i)); + } + + simd64uint8 lookup_4_lanes(simd64uint8 idx) const { + return simd64uint8(_mm512_shuffle_epi8(i, idx.i)); + } + + // extract + 0-extend lane + // this operation is slow (3 cycles) + simd32uint16 lane0_as_uint16() const { + __m256i x = _mm512_extracti32x8_epi32(i, 0); + return simd32uint16(_mm512_cvtepu8_epi16(x)); + } + + simd32uint16 lane1_as_uint16() const { + __m256i x = _mm512_extracti32x8_epi32(i, 1); + return simd32uint16(_mm512_cvtepu8_epi16(x)); + } + + simd64uint8 operator+=(simd64uint8 other) { + i = _mm512_add_epi8(i, other.i); + return *this; + } + + // for debugging only + uint8_t operator[](int i) const { + ALIGNED(64) uint8_t tab[64]; + store(tab); + return tab[i]; + } +}; + +} // namespace faiss diff --git a/faiss/utils/simdlib_neon.h b/faiss/utils/simdlib_neon.h index 656a561217..1bdf0ed01e 100644 --- a/faiss/utils/simdlib_neon.h +++ b/faiss/utils/simdlib_neon.h @@ -168,9 +168,12 @@ static inline std::string elements_to_string(const char* fmt, const S& simd) { simd.store(bytes); char res[1000], *ptr = res; for (size_t i = 0; i < N; ++i) { - ptr += sprintf(ptr, fmt, bytes[i]); + int bytesWritten = + snprintf(ptr, sizeof(res) - (ptr - res), fmt, bytes[i]); + ptr += bytesWritten; } - // strip last , + // The 
format usually contains a ',' separator so this is to remove the last + // separator. ptr[-1] = 0; return std::string(res); } diff --git a/faiss/utils/simdlib_ppc64.h b/faiss/utils/simdlib_ppc64.h new file mode 100644 index 0000000000..94b3e42dc7 --- /dev/null +++ b/faiss/utils/simdlib_ppc64.h @@ -0,0 +1,1084 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include +#include + +namespace faiss { + +struct simd256bit { + union { + uint8_t u8[32]; + uint16_t u16[16]; + uint32_t u32[8]; + float f32[8]; + }; + + simd256bit() {} + + explicit simd256bit(const void* x) { + memcpy(u8, x, 32); + } + + void clear() { + memset(u8, 0, 32); + } + + void storeu(void* ptr) const { + memcpy(ptr, u8, 32); + } + + void loadu(const void* ptr) { + memcpy(u8, ptr, 32); + } + + void store(void* ptr) const { + storeu(ptr); + } + + void bin(char bits[257]) const { + const char* bytes = (char*)this->u8; + for (int i = 0; i < 256; i++) { + bits[i] = '0' + ((bytes[i / 8] >> (i % 8)) & 1); + } + bits[256] = 0; + } + + std::string bin() const { + char bits[257]; + bin(bits); + return std::string(bits); + } + + // Checks whether the other holds exactly the same bytes. + bool is_same_as(simd256bit other) const { + for (size_t i = 0; i < 8; i++) { + if (u32[i] != other.u32[i]) { + return false; + } + } + + return true; + } +}; + +/// vector of 16 elements in uint16 +struct simd16uint16 : simd256bit { + simd16uint16() {} + + explicit simd16uint16(int x) { + set1(x); + } + + explicit simd16uint16(uint16_t x) { + set1(x); + } + + explicit simd16uint16(const simd256bit& x) : simd256bit(x) {} + + explicit simd16uint16(const uint16_t* x) : simd256bit((const void*)x) {} + + explicit simd16uint16( + uint16_t u0, + uint16_t u1, + uint16_t u2, + uint16_t u3, + uint16_t u4, + uint16_t u5, + uint16_t u6, + uint16_t u7, + uint16_t u8, + uint16_t u9, + uint16_t u10, + uint16_t u11, + uint16_t u12, + uint16_t u13, + uint16_t u14, + uint16_t u15) { + this->u16[0] = u0; + this->u16[1] = u1; + this->u16[2] = u2; + this->u16[3] = u3; + this->u16[4] = u4; + this->u16[5] = u5; + this->u16[6] = u6; + this->u16[7] = u7; + this->u16[8] = u8; + this->u16[9] = u9; + this->u16[10] = u10; + this->u16[11] = u11; + this->u16[12] = u12; + this->u16[13] = u13; + this->u16[14] = u14; + this->u16[15] = u15; + } + + std::string elements_to_string(const char* fmt) const { + char res[1000], *ptr = res; + for (int i = 0; i < 16; i++) { + ptr += sprintf(ptr, fmt, u16[i]); + } + // strip last , + ptr[-1] = 0; + return std::string(res); + } + + std::string hex() const { + return elements_to_string("%02x,"); + } + + std::string dec() const { + return elements_to_string("%3d,"); + } + + template + static simd16uint16 unary_func(const simd16uint16& a, F&& f) { + simd16uint16 c; + for (int j = 0; j < 16; j++) { + c.u16[j] = f(a.u16[j]); + } + return c; + } + + template + static simd16uint16 binary_func( + const simd16uint16& a, + const simd16uint16& b, + F&& f) { + simd16uint16 c; + for (int j = 0; j < 16; j++) { + c.u16[j] = f(a.u16[j], b.u16[j]); + } + return c; + } + + void set1(uint16_t x) { + for (int i = 0; i < 16; i++) { + u16[i] = x; + } + } + + simd16uint16 operator*(const simd16uint16& other) const { + return binary_func( + *this, other, [](uint16_t a, uint16_t b) { return a * b; }); + } + + // shift must be known at compile time + simd16uint16 operator>>(const int 
shift) const { + return unary_func(*this, [shift](uint16_t a) { return a >> shift; }); + } + + // shift must be known at compile time + simd16uint16 operator<<(const int shift) const { + return unary_func(*this, [shift](uint16_t a) { return a << shift; }); + } + + simd16uint16 operator+=(const simd16uint16& other) { + *this = *this + other; + return *this; + } + + simd16uint16 operator-=(const simd16uint16& other) { + *this = *this - other; + return *this; + } + + simd16uint16 operator+(const simd16uint16& other) const { + return binary_func( + *this, other, [](uint16_t a, uint16_t b) { return a + b; }); + } + + simd16uint16 operator-(const simd16uint16& other) const { + return binary_func( + *this, other, [](uint16_t a, uint16_t b) { return a - b; }); + } + + simd16uint16 operator&(const simd256bit& other) const { + return binary_func( + *this, simd16uint16(other), [](uint16_t a, uint16_t b) { + return a & b; + }); + } + + simd16uint16 operator|(const simd256bit& other) const { + return binary_func( + *this, simd16uint16(other), [](uint16_t a, uint16_t b) { + return a | b; + }); + } + + simd16uint16 operator^(const simd256bit& other) const { + return binary_func( + *this, simd16uint16(other), [](uint16_t a, uint16_t b) { + return a ^ b; + }); + } + + // returns binary masks + simd16uint16 operator==(const simd16uint16& other) const { + return binary_func(*this, other, [](uint16_t a, uint16_t b) { + return a == b ? 0xffff : 0; + }); + } + + simd16uint16 operator~() const { + return unary_func(*this, [](uint16_t a) { return ~a; }); + } + + // get scalar at index 0 + uint16_t get_scalar_0() const { + return u16[0]; + } + + // mask of elements where this >= thresh + // 2 bit per component: 16 * 2 = 32 bit + uint32_t ge_mask(const simd16uint16& thresh) const { + uint32_t gem = 0; + for (int j = 0; j < 16; j++) { + if (u16[j] >= thresh.u16[j]) { + gem |= 3 << (j * 2); + } + } + return gem; + } + + uint32_t le_mask(const simd16uint16& thresh) const { + return thresh.ge_mask(*this); + } + + uint32_t gt_mask(const simd16uint16& thresh) const { + return ~le_mask(thresh); + } + + bool all_gt(const simd16uint16& thresh) const { + return le_mask(thresh) == 0; + } + + // for debugging only + uint16_t operator[](int i) const { + return u16[i]; + } + + void accu_min(const simd16uint16& incoming) { + for (int j = 0; j < 16; j++) { + if (incoming.u16[j] < u16[j]) { + u16[j] = incoming.u16[j]; + } + } + } + + void accu_max(const simd16uint16& incoming) { + for (int j = 0; j < 16; j++) { + if (incoming.u16[j] > u16[j]) { + u16[j] = incoming.u16[j]; + } + } + } +}; + +// not really a std::min because it returns an elementwise min +inline simd16uint16 min(const simd16uint16& av, const simd16uint16& bv) { + return simd16uint16::binary_func( + av, bv, [](uint16_t a, uint16_t b) { return std::min(a, b); }); +} + +inline simd16uint16 max(const simd16uint16& av, const simd16uint16& bv) { + return simd16uint16::binary_func( + av, bv, [](uint16_t a, uint16_t b) { return std::max(a, b); }); +} + +// decompose in 128-lanes: a = (a0, a1), b = (b0, b1) +// return (a0 + a1, b0 + b1) +// TODO find a better name +inline simd16uint16 combine2x2(const simd16uint16& a, const simd16uint16& b) { + simd16uint16 c; + for (int j = 0; j < 8; j++) { + c.u16[j] = a.u16[j] + a.u16[j + 8]; + c.u16[j + 8] = b.u16[j] + b.u16[j + 8]; + } + return c; +} + +// compare d0 and d1 to thr, return 32 bits corresponding to the concatenation +// of d0 and d1 with thr +inline uint32_t cmp_ge32( + const simd16uint16& d0, + const simd16uint16& d1, + 
const simd16uint16& thr) { + uint32_t gem = 0; + for (int j = 0; j < 16; j++) { + if (d0.u16[j] >= thr.u16[j]) { + gem |= 1 << j; + } + if (d1.u16[j] >= thr.u16[j]) { + gem |= 1 << (j + 16); + } + } + return gem; +} + +inline uint32_t cmp_le32( + const simd16uint16& d0, + const simd16uint16& d1, + const simd16uint16& thr) { + uint32_t gem = 0; + for (int j = 0; j < 16; j++) { + if (d0.u16[j] <= thr.u16[j]) { + gem |= 1 << j; + } + if (d1.u16[j] <= thr.u16[j]) { + gem |= 1 << (j + 16); + } + } + return gem; +} + +// hadd does not cross lanes +inline simd16uint16 hadd(const simd16uint16& a, const simd16uint16& b) { + simd16uint16 c; + c.u16[0] = a.u16[0] + a.u16[1]; + c.u16[1] = a.u16[2] + a.u16[3]; + c.u16[2] = a.u16[4] + a.u16[5]; + c.u16[3] = a.u16[6] + a.u16[7]; + c.u16[4] = b.u16[0] + b.u16[1]; + c.u16[5] = b.u16[2] + b.u16[3]; + c.u16[6] = b.u16[4] + b.u16[5]; + c.u16[7] = b.u16[6] + b.u16[7]; + + c.u16[8] = a.u16[8] + a.u16[9]; + c.u16[9] = a.u16[10] + a.u16[11]; + c.u16[10] = a.u16[12] + a.u16[13]; + c.u16[11] = a.u16[14] + a.u16[15]; + c.u16[12] = b.u16[8] + b.u16[9]; + c.u16[13] = b.u16[10] + b.u16[11]; + c.u16[14] = b.u16[12] + b.u16[13]; + c.u16[15] = b.u16[14] + b.u16[15]; + + return c; +} + +// Vectorized version of the following code: +// for (size_t i = 0; i < n; i++) { +// bool flag = (candidateValues[i] < currentValues[i]); +// minValues[i] = flag ? candidateValues[i] : currentValues[i]; +// minIndices[i] = flag ? candidateIndices[i] : currentIndices[i]; +// maxValues[i] = !flag ? candidateValues[i] : currentValues[i]; +// maxIndices[i] = !flag ? candidateIndices[i] : currentIndices[i]; +// } +// Max indices evaluation is inaccurate in case of equal values (the index of +// the last equal value is saved instead of the first one), but this behavior +// saves instructions. +inline void cmplt_min_max_fast( + const simd16uint16 candidateValues, + const simd16uint16 candidateIndices, + const simd16uint16 currentValues, + const simd16uint16 currentIndices, + simd16uint16& minValues, + simd16uint16& minIndices, + simd16uint16& maxValues, + simd16uint16& maxIndices) { + for (size_t i = 0; i < 16; i++) { + bool flag = (candidateValues.u16[i] < currentValues.u16[i]); + minValues.u16[i] = flag ? candidateValues.u16[i] : currentValues.u16[i]; + minIndices.u16[i] = + flag ? candidateIndices.u16[i] : currentIndices.u16[i]; + maxValues.u16[i] = + !flag ? candidateValues.u16[i] : currentValues.u16[i]; + maxIndices.u16[i] = + !flag ? 
candidateIndices.u16[i] : currentIndices.u16[i]; + } +} + +// vector of 32 unsigned 8-bit integers +struct simd32uint8 : simd256bit { + simd32uint8() {} + + explicit simd32uint8(int x) { + set1(x); + } + + explicit simd32uint8(uint8_t x) { + set1(x); + } + template < + uint8_t _0, + uint8_t _1, + uint8_t _2, + uint8_t _3, + uint8_t _4, + uint8_t _5, + uint8_t _6, + uint8_t _7, + uint8_t _8, + uint8_t _9, + uint8_t _10, + uint8_t _11, + uint8_t _12, + uint8_t _13, + uint8_t _14, + uint8_t _15, + uint8_t _16, + uint8_t _17, + uint8_t _18, + uint8_t _19, + uint8_t _20, + uint8_t _21, + uint8_t _22, + uint8_t _23, + uint8_t _24, + uint8_t _25, + uint8_t _26, + uint8_t _27, + uint8_t _28, + uint8_t _29, + uint8_t _30, + uint8_t _31> + static simd32uint8 create() { + simd32uint8 ret; + ret.u8[0] = _0; + ret.u8[1] = _1; + ret.u8[2] = _2; + ret.u8[3] = _3; + ret.u8[4] = _4; + ret.u8[5] = _5; + ret.u8[6] = _6; + ret.u8[7] = _7; + ret.u8[8] = _8; + ret.u8[9] = _9; + ret.u8[10] = _10; + ret.u8[11] = _11; + ret.u8[12] = _12; + ret.u8[13] = _13; + ret.u8[14] = _14; + ret.u8[15] = _15; + ret.u8[16] = _16; + ret.u8[17] = _17; + ret.u8[18] = _18; + ret.u8[19] = _19; + ret.u8[20] = _20; + ret.u8[21] = _21; + ret.u8[22] = _22; + ret.u8[23] = _23; + ret.u8[24] = _24; + ret.u8[25] = _25; + ret.u8[26] = _26; + ret.u8[27] = _27; + ret.u8[28] = _28; + ret.u8[29] = _29; + ret.u8[30] = _30; + ret.u8[31] = _31; + return ret; + } + + explicit simd32uint8(const simd256bit& x) : simd256bit(x) {} + + explicit simd32uint8(const uint8_t* x) : simd256bit((const void*)x) {} + + std::string elements_to_string(const char* fmt) const { + char res[1000], *ptr = res; + for (int i = 0; i < 32; i++) { + ptr += sprintf(ptr, fmt, u8[i]); + } + // strip last , + ptr[-1] = 0; + return std::string(res); + } + + std::string hex() const { + return elements_to_string("%02x,"); + } + + std::string dec() const { + return elements_to_string("%3d,"); + } + + void set1(uint8_t x) { + for (int j = 0; j < 32; j++) { + u8[j] = x; + } + } + + template + static simd32uint8 binary_func( + const simd32uint8& a, + const simd32uint8& b, + F&& f) { + simd32uint8 c; + for (int j = 0; j < 32; j++) { + c.u8[j] = f(a.u8[j], b.u8[j]); + } + return c; + } + + simd32uint8 operator&(const simd256bit& other) const { + return binary_func(*this, simd32uint8(other), [](uint8_t a, uint8_t b) { + return a & b; + }); + } + + simd32uint8 operator+(const simd32uint8& other) const { + return binary_func( + *this, other, [](uint8_t a, uint8_t b) { return a + b; }); + } + + // The very important operation that everything relies on + simd32uint8 lookup_2_lanes(const simd32uint8& idx) const { + simd32uint8 c; + // The original for loop: + // for (int j = 0; j < 32; j++) { + // if (idx.u8[j] & 0x80) { + // c.u8[j] = 0; + // } else { + // uint8_t i = idx.u8[j] & 15; + // if (j < 16) { + // c.u8[j] = u8[i]; + // } else { + // c.u8[j] = u8[16 + i]; + // } + // } + + // The following function was re-written for Power 10 + // The loop was unrolled to remove the if (j < 16) statement by doing + // the j and j + 16 iterations in parallel. The additional unrolling + // for j + 1 and j + 17, reduces the execution time on Power 10 by + // about 50% as the instruction scheduling allows on average 2X more + // instructions to be issued per cycle. 
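
A scalar model of the byte shuffle that the unrolled loop below implements (a numpy sketch mirroring the reference loop quoted in the comment above; the Python function name simply mirrors the C++ method):

```python
import numpy as np

def lookup_2_lanes(table, idx):
    # table, idx: 32-byte uint8 arrays viewed as two independent 16-byte lanes
    out = np.empty(32, dtype=np.uint8)
    for j in range(32):
        if idx[j] & 0x80:                  # high bit set: emit zero
            out[j] = 0
        else:
            lane = 0 if j < 16 else 16     # lookups never cross lanes
            out[j] = table[lane + (idx[j] & 15)]
    return out
```
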
+ + for (int j = 0; j < 16; j = j + 2) { + // j < 16, unrolled to depth of 2 + if (idx.u8[j] & 0x80) { + c.u8[j] = 0; + } else { + uint8_t i = idx.u8[j] & 15; + c.u8[j] = u8[i]; + } + + if (idx.u8[j + 1] & 0x80) { + c.u8[j + 1] = 0; + } else { + uint8_t i = idx.u8[j + 1] & 15; + c.u8[j + 1] = u8[i]; + } + + // j >= 16, unrolled to depth of 2 + if (idx.u8[j + 16] & 0x80) { + c.u8[j + 16] = 0; + } else { + uint8_t i = idx.u8[j + 16] & 15; + c.u8[j + 16] = u8[i + 16]; + } + + if (idx.u8[j + 17] & 0x80) { + c.u8[j + 17] = 0; + } else { + uint8_t i = idx.u8[j + 17] & 15; + c.u8[j + 17] = u8[i + 16]; + } + } + return c; + } + + // extract + 0-extend lane + // this operation is slow (3 cycles) + + simd32uint8 operator+=(const simd32uint8& other) { + *this = *this + other; + return *this; + } + + // for debugging only + uint8_t operator[](int i) const { + return u8[i]; + } +}; + +// convert with saturation +// careful: this does not cross lanes, so the order is weird +inline simd32uint8 uint16_to_uint8_saturate( + const simd16uint16& a, + const simd16uint16& b) { + simd32uint8 c; + + auto saturate_16_to_8 = [](uint16_t x) { return x >= 256 ? 0xff : x; }; + + for (int i = 0; i < 8; i++) { + c.u8[i] = saturate_16_to_8(a.u16[i]); + c.u8[8 + i] = saturate_16_to_8(b.u16[i]); + c.u8[16 + i] = saturate_16_to_8(a.u16[8 + i]); + c.u8[24 + i] = saturate_16_to_8(b.u16[8 + i]); + } + return c; +} + +/// get most significant bit of each byte +inline uint32_t get_MSBs(const simd32uint8& a) { + uint32_t res = 0; + for (int i = 0; i < 32; i++) { + if (a.u8[i] & 0x80) { + res |= 1 << i; + } + } + return res; +} + +/// use MSB of each byte of mask to select a byte between a and b +inline simd32uint8 blendv( + const simd32uint8& a, + const simd32uint8& b, + const simd32uint8& mask) { + simd32uint8 c; + for (int i = 0; i < 32; i++) { + if (mask.u8[i] & 0x80) { + c.u8[i] = b.u8[i]; + } else { + c.u8[i] = a.u8[i]; + } + } + return c; +} + +/// vector of 8 unsigned 32-bit integers +struct simd8uint32 : simd256bit { + simd8uint32() {} + + explicit simd8uint32(uint32_t x) { + set1(x); + } + + explicit simd8uint32(const simd256bit& x) : simd256bit(x) {} + + explicit simd8uint32(const uint32_t* x) : simd256bit((const void*)x) {} + + explicit simd8uint32( + uint32_t u0, + uint32_t u1, + uint32_t u2, + uint32_t u3, + uint32_t u4, + uint32_t u5, + uint32_t u6, + uint32_t u7) { + u32[0] = u0; + u32[1] = u1; + u32[2] = u2; + u32[3] = u3; + u32[4] = u4; + u32[5] = u5; + u32[6] = u6; + u32[7] = u7; + } + + simd8uint32 operator+(simd8uint32 other) const { + simd8uint32 result; + for (int i = 0; i < 8; i++) { + result.u32[i] = u32[i] + other.u32[i]; + } + return result; + } + + simd8uint32 operator-(simd8uint32 other) const { + simd8uint32 result; + for (int i = 0; i < 8; i++) { + result.u32[i] = u32[i] - other.u32[i]; + } + return result; + } + + simd8uint32& operator+=(const simd8uint32& other) { + for (int i = 0; i < 8; i++) { + u32[i] += other.u32[i]; + } + return *this; + } + + bool operator==(simd8uint32 other) const { + for (size_t i = 0; i < 8; i++) { + if (u32[i] != other.u32[i]) { + return false; + } + } + + return true; + } + + bool operator!=(simd8uint32 other) const { + return !(*this == other); + } + + std::string elements_to_string(const char* fmt) const { + char res[1000], *ptr = res; + for (int i = 0; i < 8; i++) { + ptr += sprintf(ptr, fmt, u32[i]); + } + // strip last , + ptr[-1] = 0; + return std::string(res); + } + + std::string hex() const { + return elements_to_string("%08x,"); + } + + std::string dec() const 
{ + return elements_to_string("%10d,"); + } + + void set1(uint32_t x) { + for (int i = 0; i < 8; i++) { + u32[i] = x; + } + } + + simd8uint32 unzip() const { + const uint32_t ret[] = { + u32[0], u32[2], u32[4], u32[6], u32[1], u32[3], u32[5], u32[7]}; + return simd8uint32{ret}; + } +}; + +// Vectorized version of the following code: +// for (size_t i = 0; i < n; i++) { +// bool flag = (candidateValues[i] < currentValues[i]); +// minValues[i] = flag ? candidateValues[i] : currentValues[i]; +// minIndices[i] = flag ? candidateIndices[i] : currentIndices[i]; +// maxValues[i] = !flag ? candidateValues[i] : currentValues[i]; +// maxIndices[i] = !flag ? candidateIndices[i] : currentIndices[i]; +// } +// Max indices evaluation is inaccurate in case of equal values (the index of +// the last equal value is saved instead of the first one), but this behavior +// saves instructions. +inline void cmplt_min_max_fast( + const simd8uint32 candidateValues, + const simd8uint32 candidateIndices, + const simd8uint32 currentValues, + const simd8uint32 currentIndices, + simd8uint32& minValues, + simd8uint32& minIndices, + simd8uint32& maxValues, + simd8uint32& maxIndices) { + for (size_t i = 0; i < 8; i++) { + bool flag = (candidateValues.u32[i] < currentValues.u32[i]); + minValues.u32[i] = flag ? candidateValues.u32[i] : currentValues.u32[i]; + minIndices.u32[i] = + flag ? candidateIndices.u32[i] : currentIndices.u32[i]; + maxValues.u32[i] = + !flag ? candidateValues.u32[i] : currentValues.u32[i]; + maxIndices.u32[i] = + !flag ? candidateIndices.u32[i] : currentIndices.u32[i]; + } +} + +struct simd8float32 : simd256bit { + simd8float32() {} + + explicit simd8float32(const simd256bit& x) : simd256bit(x) {} + + explicit simd8float32(float x) { + set1(x); + } + + explicit simd8float32(const float* x) { + loadu((void*)x); + } + + void set1(float x) { + for (int i = 0; i < 8; i++) { + f32[i] = x; + } + } + + explicit simd8float32( + float f0, + float f1, + float f2, + float f3, + float f4, + float f5, + float f6, + float f7) { + f32[0] = f0; + f32[1] = f1; + f32[2] = f2; + f32[3] = f3; + f32[4] = f4; + f32[5] = f5; + f32[6] = f6; + f32[7] = f7; + } + + template + static simd8float32 binary_func( + const simd8float32& a, + const simd8float32& b, + F&& f) { + simd8float32 c; + for (int j = 0; j < 8; j++) { + c.f32[j] = f(a.f32[j], b.f32[j]); + } + return c; + } + + simd8float32 operator*(const simd8float32& other) const { + return binary_func( + *this, other, [](float a, float b) { return a * b; }); + } + + simd8float32 operator+(const simd8float32& other) const { + return binary_func( + *this, other, [](float a, float b) { return a + b; }); + } + + simd8float32 operator-(const simd8float32& other) const { + return binary_func( + *this, other, [](float a, float b) { return a - b; }); + } + + simd8float32& operator+=(const simd8float32& other) { + for (size_t i = 0; i < 8; i++) { + f32[i] += other.f32[i]; + } + + return *this; + } + + bool operator==(simd8float32 other) const { + for (size_t i = 0; i < 8; i++) { + if (f32[i] != other.f32[i]) { + return false; + } + } + + return true; + } + + bool operator!=(simd8float32 other) const { + return !(*this == other); + } + + std::string tostring() const { + char res[1000], *ptr = res; + for (int i = 0; i < 8; i++) { + ptr += sprintf(ptr, "%g,", f32[i]); + } + // strip last , + ptr[-1] = 0; + return std::string(res); + } +}; + +// hadd does not cross lanes +inline simd8float32 hadd(const simd8float32& a, const simd8float32& b) { + simd8float32 c; + c.f32[0] = a.f32[0] + 
a.f32[1];
+    c.f32[1] = a.f32[2] + a.f32[3];
+    c.f32[2] = b.f32[0] + b.f32[1];
+    c.f32[3] = b.f32[2] + b.f32[3];
+
+    c.f32[4] = a.f32[4] + a.f32[5];
+    c.f32[5] = a.f32[6] + a.f32[7];
+    c.f32[6] = b.f32[4] + b.f32[5];
+    c.f32[7] = b.f32[6] + b.f32[7];
+
+    return c;
+}
+
+inline simd8float32 unpacklo(const simd8float32& a, const simd8float32& b) {
+    simd8float32 c;
+    c.f32[0] = a.f32[0];
+    c.f32[1] = b.f32[0];
+    c.f32[2] = a.f32[1];
+    c.f32[3] = b.f32[1];
+
+    c.f32[4] = a.f32[4];
+    c.f32[5] = b.f32[4];
+    c.f32[6] = a.f32[5];
+    c.f32[7] = b.f32[5];
+
+    return c;
+}
+
+inline simd8float32 unpackhi(const simd8float32& a, const simd8float32& b) {
+    simd8float32 c;
+    c.f32[0] = a.f32[2];
+    c.f32[1] = b.f32[2];
+    c.f32[2] = a.f32[3];
+    c.f32[3] = b.f32[3];
+
+    c.f32[4] = a.f32[6];
+    c.f32[5] = b.f32[6];
+    c.f32[6] = a.f32[7];
+    c.f32[7] = b.f32[7];
+
+    return c;
+}
+
+// compute a * b + c
+inline simd8float32 fmadd(
+        const simd8float32& a,
+        const simd8float32& b,
+        const simd8float32& c) {
+    simd8float32 res;
+    for (int i = 0; i < 8; i++) {
+        res.f32[i] = a.f32[i] * b.f32[i] + c.f32[i];
+    }
+    return res;
+}
+
+namespace {
+
+// get even float32's of a and b, interleaved
+simd8float32 geteven(const simd8float32& a, const simd8float32& b) {
+    simd8float32 c;
+
+    c.f32[0] = a.f32[0];
+    c.f32[1] = a.f32[2];
+    c.f32[2] = b.f32[0];
+    c.f32[3] = b.f32[2];
+
+    c.f32[4] = a.f32[4];
+    c.f32[5] = a.f32[6];
+    c.f32[6] = b.f32[4];
+    c.f32[7] = b.f32[6];
+
+    return c;
+}
+
+// get odd float32's of a and b, interleaved
+simd8float32 getodd(const simd8float32& a, const simd8float32& b) {
+    simd8float32 c;
+
+    c.f32[0] = a.f32[1];
+    c.f32[1] = a.f32[3];
+    c.f32[2] = b.f32[1];
+    c.f32[3] = b.f32[3];
+
+    c.f32[4] = a.f32[5];
+    c.f32[5] = a.f32[7];
+    c.f32[6] = b.f32[5];
+    c.f32[7] = b.f32[7];
+
+    return c;
+}
+
+// 3 cycles
+// if the lanes are a = [a0 a1] and b = [b0 b1], return [a0 b0]
+simd8float32 getlow128(const simd8float32& a, const simd8float32& b) {
+    simd8float32 c;
+
+    c.f32[0] = a.f32[0];
+    c.f32[1] = a.f32[1];
+    c.f32[2] = a.f32[2];
+    c.f32[3] = a.f32[3];
+
+    c.f32[4] = b.f32[0];
+    c.f32[5] = b.f32[1];
+    c.f32[6] = b.f32[2];
+    c.f32[7] = b.f32[3];
+
+    return c;
+}
+
+simd8float32 gethigh128(const simd8float32& a, const simd8float32& b) {
+    simd8float32 c;
+
+    c.f32[0] = a.f32[4];
+    c.f32[1] = a.f32[5];
+    c.f32[2] = a.f32[6];
+    c.f32[3] = a.f32[7];
+
+    c.f32[4] = b.f32[4];
+    c.f32[5] = b.f32[5];
+    c.f32[6] = b.f32[6];
+    c.f32[7] = b.f32[7];
+
+    return c;
+}
+
+// The following primitive is a vectorized version of the following code
+// snippet:
+//   float lowestValue = HUGE_VAL;
+//   uint lowestIndex = 0;
+//   for (size_t i = 0; i < n; i++) {
+//     if (values[i] < lowestValue) {
+//       lowestValue = values[i];
+//       lowestIndex = i;
+//     }
+//   }
+// The vectorized version can be implemented via two operations: cmp and blend
+// with something like this:
+//   lowestValues = [HUGE_VAL; 8];
+//   lowestIndices = {0, 1, 2, 3, 4, 5, 6, 7};
+//   for (size_t i = 0; i < n; i += 8) {
+//     auto comparison = cmp(values + i, lowestValues);
+//     lowestValues = blend(
+//         comparison,
+//         values + i,
+//         lowestValues);
+//     lowestIndices = blend(
+//         comparison,
+//         i + {0, 1, 2, 3, 4, 5, 6, 7},
+//         lowestIndices);
+//     lowestIndices += {8, 8, 8, 8, 8, 8, 8, 8};
+//   }
+// The problem is that the blend primitive needs very different instruction
+// order for AVX and ARM.
+// So, let's introduce a combination of these two in order to avoid
+// confusion for people who write in low-level SIMD instructions.
Additionally, +// these two ops (cmp and blend) are very often used together. +inline void cmplt_and_blend_inplace( + const simd8float32 candidateValues, + const simd8uint32 candidateIndices, + simd8float32& lowestValues, + simd8uint32& lowestIndices) { + for (size_t j = 0; j < 8; j++) { + bool comparison = (candidateValues.f32[j] < lowestValues.f32[j]); + if (comparison) { + lowestValues.f32[j] = candidateValues.f32[j]; + lowestIndices.u32[j] = candidateIndices.u32[j]; + } + } +} + +// Vectorized version of the following code: +// for (size_t i = 0; i < n; i++) { +// bool flag = (candidateValues[i] < currentValues[i]); +// minValues[i] = flag ? candidateValues[i] : currentValues[i]; +// minIndices[i] = flag ? candidateIndices[i] : currentIndices[i]; +// maxValues[i] = !flag ? candidateValues[i] : currentValues[i]; +// maxIndices[i] = !flag ? candidateIndices[i] : currentIndices[i]; +// } +// Max indices evaluation is inaccurate in case of equal values (the index of +// the last equal value is saved instead of the first one), but this behavior +// saves instructions. +inline void cmplt_min_max_fast( + const simd8float32 candidateValues, + const simd8uint32 candidateIndices, + const simd8float32 currentValues, + const simd8uint32 currentIndices, + simd8float32& minValues, + simd8uint32& minIndices, + simd8float32& maxValues, + simd8uint32& maxIndices) { + for (size_t i = 0; i < 8; i++) { + bool flag = (candidateValues.f32[i] < currentValues.f32[i]); + minValues.f32[i] = flag ? candidateValues.f32[i] : currentValues.f32[i]; + minIndices.u32[i] = + flag ? candidateIndices.u32[i] : currentIndices.u32[i]; + maxValues.f32[i] = + !flag ? candidateValues.f32[i] : currentValues.f32[i]; + maxIndices.u32[i] = + !flag ? candidateIndices.u32[i] : currentIndices.u32[i]; + } +} + +} // namespace + +} // namespace faiss diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 10243b9a9c..3980d7dd7c 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -32,6 +32,9 @@ set(FAISS_TEST_SRC test_hnsw.cpp test_partitioning.cpp test_fastscan_perf.cpp + test_disable_pq_sdc_tables.cpp + test_common_ivf_empty_index.cpp + test_callback.cpp ) add_executable(faiss_test ${FAISS_TEST_SRC}) @@ -59,17 +62,39 @@ if(FAISS_OPT_LEVEL STREQUAL "avx512") endif() include(FetchContent) -FetchContent_Declare(googletest - URL "https://github.com/google/googletest/archive/release-1.12.1.tar.gz") +FetchContent_Declare( + googletest + GIT_REPOSITORY https://github.com/google/googletest.git + GIT_TAG 58d77fa8070e8cec2dc1ed015d66b454c8d78850 # release-1.12.1 + OVERRIDE_FIND_PACKAGE) set(BUILD_GMOCK CACHE BOOL OFF) set(INSTALL_GTEST CACHE BOOL OFF) FetchContent_MakeAvailable(googletest) +if(NOT EXISTS ${CMAKE_FIND_PACKAGE_REDIRECTS_DIR}/gtest-config.cmake + AND NOT EXISTS ${CMAKE_FIND_PACKAGE_REDIRECTS_DIR}/GTestConfig.cmake) + file( + WRITE ${CMAKE_FIND_PACKAGE_REDIRECTS_DIR}/gtest-config.cmake + [=[ +include(CMakeFindDependencyMacro) +find_dependency(googletest) +if(NOT TARGET GTest::GTest) + add_library(GTest::GTest INTERFACE IMPORTED) + target_link_libraries(GTest::GTest INTERFACE GTest::gtest) +endif() +if(NOT TARGET GTest::Main) + add_library(GTest::Main INTERFACE IMPORTED) + target_link_libraries(GTest::Main INTERFACE GTest::gtest_main) +endif() +]=]) +endif() + find_package(OpenMP REQUIRED) +find_package(GTest CONFIG REQUIRED) target_link_libraries(faiss_test PRIVATE OpenMP::OpenMP_CXX - gtest_main + GTest::gtest_main $<$:raft::raft> ) diff --git a/tests/common_faiss_tests.py b/tests/common_faiss_tests.py index 
8dc25edec0..a8afe344e4 100644 --- a/tests/common_faiss_tests.py +++ b/tests/common_faiss_tests.py @@ -49,7 +49,6 @@ def evalres(self, DI): for rank in 1, 10, 100: e[rank] = ((I[:, :rank] == self.gt.reshape(-1, 1)).sum() / float(self.nq)) - # print("1-recalls: %s" % e) return e diff --git a/tests/test_RCQ_cropping.cpp b/tests/test_RCQ_cropping.cpp index 4dd3470885..4463c256ed 100644 --- a/tests/test_RCQ_cropping.cpp +++ b/tests/test_RCQ_cropping.cpp @@ -28,7 +28,6 @@ TEST(RCQCropping, test_cropping) { faiss::ResidualCoarseQuantizer rcq(d, nbits); rcq.train(nt, xt); - // fprintf(stderr, "nb centroids: %zd\n", rcq.ntotal); // the test below works only for beam size == nprobe rcq.set_beam_factor(1.0); @@ -44,7 +43,6 @@ TEST(RCQCropping, test_cropping) { nbits.pop_back(); faiss::ResidualCoarseQuantizer rcq_cropped(d, nbits); rcq_cropped.initialize_from(rcq); - // fprintf(stderr, "cropped nb centroids: %zd\n", rcq_cropped.ntotal); EXPECT_EQ(rcq_cropped.ntotal, rcq.ntotal >> last_nbits); diff --git a/tests/test_binary_hashindex.py b/tests/test_binary_hashindex.py index 2d33050571..e9a6eaca49 100644 --- a/tests/test_binary_hashindex.py +++ b/tests/test_binary_hashindex.py @@ -58,8 +58,6 @@ def test_hash(self): Lref, Dref, Iref = index_ref.range_search(xq, radius) - print("nb res: ", Lref[-1]) - index = faiss.IndexBinaryHash(d, 10) index.add(xb) # index.display() @@ -80,8 +78,6 @@ def test_hash(self): self.assertTrue(snew <= set(ref)) nfound.append(Lnew[-1]) ndis.append(stats.ndis) - print('nfound=', nfound) - print('ndis=', ndis) nfound = np.array(nfound) self.assertTrue(nfound[-1] == Lref[-1]) self.assertTrue(np.all(nfound[1:] >= nfound[:-1])) @@ -100,8 +96,6 @@ def test_multihash(self): Lref, Dref, Iref = index_ref.range_search(xq, radius) - print("nb res: ", Lref[-1]) - nfound = [] ndis = [] @@ -123,8 +117,6 @@ def test_multihash(self): self.assertTrue(snew <= set(ref)) nfound.append(Lnew[-1]) ndis.append(stats.ndis) - print('nfound=', nfound) - print('ndis=', ndis) nfound = np.array(nfound) # self.assertTrue(nfound[-1] == Lref[-1]) self.assertTrue(np.all(nfound[1:] >= nfound[:-1])) @@ -163,7 +155,6 @@ def test_hash_and_multihash(self): # no duplicates self.assertTrue(len(new) == len(snew)) nf += len(set(ref) & snew) - print('nfound', nh, nbit, nf) nfound[(nh, nbit)] = nf self.assertGreater(nfound[(nh, 4)], nfound[(nh, 7)]) @@ -175,7 +166,6 @@ def test_hash_and_multihash(self): np.testing.assert_array_equal(Inew, I2) np.testing.assert_array_equal(Dnew, D2) - print('nfound=', nfound) self.assertGreater(3, abs(nfound[(0, 7)] - nfound[(1, 7)])) self.assertGreater(nfound[(3, 7)], nfound[(1, 7)]) self.assertGreater(nfound[(5, 7)], nfound[(3, 7)]) diff --git a/tests/test_build_blocks.py b/tests/test_build_blocks.py index 0a97e63185..fdf9ad8bd7 100644 --- a/tests/test_build_blocks.py +++ b/tests/test_build_blocks.py @@ -189,7 +189,6 @@ def test_l2(self): for d in 1, 2, 4, 8, 12, 16: x = rs.rand(d).astype('float32') for ny in 128, 129, 130: - print("d=%d ny=%d" % (d, ny)) y = rs.rand(ny, d).astype('float32') ref = ((x - y) ** 2).sum(1) new = np.zeros(ny, dtype='float32') @@ -204,7 +203,6 @@ def test_IP(self): for d in 1, 2, 4, 8, 12, 16: x = rs.rand(d).astype('float32') for ny in 128, 129, 130: - print("d=%d ny=%d" % (d, ny)) y = rs.rand(ny, d).astype('float32') ref = (x * y).sum(1) new = np.zeros(ny, dtype='float32') @@ -220,7 +218,6 @@ def test_0s(self): m = rs.rand(40, 20).astype('float32') m[5:10] = 0 comments = faiss.MatrixStats(m).comments - print(comments) assert 'has 5 copies' in comments 
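# (MatrixStats reports its diagnostics as one free-form, human-readable
# string, so the checks here are plain substring matches)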
assert '5 null vectors' in comments @@ -229,7 +226,6 @@ def test_copies(self): m = rs.rand(40, 20).astype('float32') m[::2] = m[1::2] comments = faiss.MatrixStats(m).comments - print(comments) assert '20 vectors are distinct' in comments def test_dead_dims(self): @@ -237,7 +233,6 @@ def test_dead_dims(self): m = rs.rand(40, 20).astype('float32') m[:, 5:10] = 0 comments = faiss.MatrixStats(m).comments - print(comments) assert '5 dimensions are constant' in comments def test_rogue_means(self): @@ -245,7 +240,6 @@ def test_rogue_means(self): m = rs.rand(40, 20).astype('float32') m[:, 5:10] += 12345 comments = faiss.MatrixStats(m).comments - print(comments) assert '5 dimensions are too large wrt. their variance' in comments def test_normalized(self): @@ -253,7 +247,6 @@ def test_normalized(self): m = rs.rand(40, 20).astype('float32') faiss.normalize_L2(m) comments = faiss.MatrixStats(m).comments - print(comments) assert 'vectors are normalized' in comments def test_hash(self): @@ -300,7 +293,6 @@ def test_8bit_equiv(self): D, I = index.search(x[3:], 1) # assert D[0, 0] == Dref[0, 0] - # print(D[0, 0], ((x[3] - x[2]) ** 2).sum()) assert D[0, 0] == ((x[3] - x[2]) ** 2).sum() def test_6bit_equiv(self): @@ -314,8 +306,6 @@ def test_6bit_equiv(self): d, faiss.ScalarQuantizer.QT_6bit) index.train(trainset) - print('cs=', index.code_size) - x = rs.randint(64, size=(100, d)).astype('float32') # verify encoder / decoder @@ -330,7 +320,6 @@ def test_6bit_equiv(self): for i in range(20): for j in range(10): dis = ((y[i] - x2[I[i, j]]) ** 2).sum() - # print(dis, D[i, j]) assert abs(D[i, j] - dis) / dis < 1e-5 def test_reconstruct(self): @@ -371,7 +360,6 @@ def test_randint(self): x = faiss.randint(20000, vmax=100) assert np.all(x >= 0) and np.all(x < 100) c = np.bincount(x, minlength=100) - print(c) assert c.max() - c.min() < 50 * 2 def test_rand_vector(self): @@ -473,7 +461,6 @@ def do_test_array_type(self, dtype): """ tests swig_ptr and rev_swig_ptr for this type of array """ a = np.arange(12).astype(dtype) ptr = faiss.swig_ptr(a) - print(ptr) a2 = faiss.rev_swig_ptr(ptr, 12) np.testing.assert_array_equal(a, a2) @@ -547,7 +534,6 @@ def subtest(self, d, K, metric): recalls += 1 break recall = 1.0 * recalls / (nb * K) - print('Metric: {}, knng accuracy: {}'.format(metric_names[metric], recall)) assert recall > 0.99 def test_small_nndescent(self): @@ -656,7 +642,6 @@ def do_test_bucket_sort_inplace( rows, _ = np.where(tab == b) rows.sort() tab2[lims[b]:lims[b + 1]].sort() - # print(rows, tab2[lims[b] : lims[b + 1]]) rows = set(rows) self.assertEqual(rows, set(tab2[lims[b]:lims[b + 1]])) diff --git a/tests/test_callback.cpp b/tests/test_callback.cpp new file mode 100644 index 0000000000..cdfadf1d39 --- /dev/null +++ b/tests/test_callback.cpp @@ -0,0 +1,37 @@ +/** + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include +#include +#include +#include +#include + +TEST(TestCallback, timeout) { + int n = 1000; + int k = 100; + int d = 128; + int niter = 1000000000; + int seed = 42; + + std::vector vecs(n * d); + faiss::float_rand(vecs.data(), vecs.size(), seed); + + auto index(new faiss::IndexFlat(d)); + + faiss::ClusteringParameters cp; + cp.niter = niter; + cp.verbose = false; + + faiss::Clustering kmeans(d, k, cp); + + faiss::TimeoutCallback::reset(0.010); + EXPECT_THROW(kmeans.train(n, vecs.data(), *index), faiss::FaissException); + delete index; +} diff --git a/tests/test_callback_py.py b/tests/test_callback_py.py new file mode 100644 index 0000000000..0ec176dd86 --- /dev/null +++ b/tests/test_callback_py.py @@ -0,0 +1,32 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import unittest +import numpy as np +import faiss + + +class TestCallbackPy(unittest.TestCase): + def setUp(self) -> None: + super().setUp() + + def test_timeout(self) -> None: + n = 1000 + k = 100 + d = 128 + niter = 1_000_000_000 + + x = np.random.rand(n, d).astype('float32') + index = faiss.IndexFlat(d) + + cp = faiss.ClusteringParameters() + cp.niter = niter + cp.verbose = False + + kmeans = faiss.Clustering(d, k, cp) + + with self.assertRaises(RuntimeError): + with faiss.TimeoutGuard(0.010): + kmeans.train(x, index) diff --git a/tests/test_clustering.py b/tests/test_clustering.py index 2b81fc3e35..b1afc8523f 100644 --- a/tests/test_clustering.py +++ b/tests/test_clustering.py @@ -110,9 +110,6 @@ def test_weighted(self): cdis2_first = cdis2[:5].sum() cdis2_last = cdis2[5:].sum() - print(cdis1_first, cdis1_last) - print(cdis2_first, cdis2_last) - # with the new clustering, the last should be much (*2) closer # to their centroids self.assertGreater(cdis1_last, cdis1_first * 2) diff --git a/tests/test_common_ivf_empty_index.cpp b/tests/test_common_ivf_empty_index.cpp new file mode 100644 index 0000000000..a3e33031bd --- /dev/null +++ b/tests/test_common_ivf_empty_index.cpp @@ -0,0 +1,144 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. + +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +/* This demonstrates how to query several independent IVF indexes with a trained + *index in common. This avoids duplicating the coarse quantizer and metadata + *in memory.
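* The mechanism used below is SearchParametersIVF::inverted_list_context:
* it is forwarded to the InvertedLists object at query time, so a single
* read-only dispatching wrapper can route each search to the lists of the
* right sub-index without mutating the shared index.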
+ **/ + +namespace { + +int d = 64; + +} // namespace + +std::vector get_random_vectors(size_t n, int seed) { + std::vector x(n * d); + faiss::rand_smooth_vectors(n, d, x.data(), seed); + seed++; + return x; +} + +/** InvertedLists implementation that dispatches the search to an InvertedLists + * object that is passed in at query time */ + +struct DispatchingInvertedLists : faiss::ReadOnlyInvertedLists { + DispatchingInvertedLists(size_t nlist, size_t code_size) + : faiss::ReadOnlyInvertedLists(nlist, code_size) { + use_iterator = true; + } + + faiss::InvertedListsIterator* get_iterator( + size_t list_no, + void* inverted_list_context = nullptr) const override { + assert(inverted_list_context); + auto il = + static_cast(inverted_list_context); + return il->get_iterator(list_no); + } + + using idx_t = faiss::idx_t; + + size_t list_size(size_t list_no) const override { + FAISS_THROW_MSG("use iterator interface"); + } + const uint8_t* get_codes(size_t list_no) const override { + FAISS_THROW_MSG("use iterator interface"); + } + const idx_t* get_ids(size_t list_no) const override { + FAISS_THROW_MSG("use iterator interface"); + } +}; + +TEST(COMMON, test_common_trained_index) { + int N = 3; // number of independent indexes + int nt = 500; // training vectors + int nb = 200; // nb database vectors per index + int nq = 10; // nb queries performed on each index + int k = 4; // results requested per query + + // construct and build an "empty index": a trained index that does not + // itself hold any data + std::unique_ptr empty_index(dynamic_cast( + faiss::index_factory(d, "IVF32,PQ8np"))); + auto xt = get_random_vectors(nt, 123); + empty_index->train(nt, xt.data()); + empty_index->nprobe = 4; + + // reference run: build one index for each set of db / queries and record + // results + std::vector> ref_I(N); + + for (int i = 0; i < N; i++) { + // clone the empty index + std::unique_ptr index( + faiss::clone_index(empty_index.get())); + auto xb = get_random_vectors(nb, 1234 + i); + auto xq = get_random_vectors(nq, 12345 + i); + // add vectors and perform a search + index->add(nb, xb.data()); + std::vector D(k * nq); + std::vector I(k * nq); + index->search(nq, xq.data(), k, D.data(), I.data()); + // record result as reference + ref_I[i] = I; + } + + // build a set of inverted lists for each independent index + std::vector sub_invlists; + + for (int i = 0; i < N; i++) { + // swap in other inverted lists + sub_invlists.emplace_back(empty_index->nlist, empty_index->code_size); + faiss::InvertedLists* invlists = &sub_invlists.back(); + + // replace_invlists swaps in a new InvertedLists for an existing index + empty_index->replace_invlists(invlists, false); + empty_index->reset(); // reset id counter to 0 + // populate inverted lists + auto xb = get_random_vectors(nb, 1234 + i); + empty_index->add(nb, xb.data()); + } + + // perform search dispatching to the sub-invlists.
At search time, we don't + // use replace_invlists because that would wreak havoc in a multithreaded + // context + DispatchingInvertedLists di(empty_index->nlist, empty_index->code_size); + empty_index->replace_invlists(&di, false); + + std::vector> new_I(N); + + // run searches in the independent indexes but with a common empty_index +#pragma omp parallel for + for (int i = 0; i < N; i++) { + auto xq = get_random_vectors(nq, 12345 + i); + std::vector D(k * nq); + std::vector I(k * nq); + + // here we set which sub-index the queries should be directed to + faiss::SearchParametersIVF params; + params.nprobe = empty_index->nprobe; + params.inverted_list_context = &sub_invlists[i]; + + empty_index->search(nq, xq.data(), k, D.data(), I.data(), &params); + new_I[i] = I; + } + + // compare with reference result + for (int i = 0; i < N; i++) { + ASSERT_EQ(ref_I[i], new_I[i]); + } +} diff --git a/tests/test_contrib.py b/tests/test_contrib.py index 84b90a4e5f..05a2c4ac8b 100644 --- a/tests/test_contrib.py +++ b/tests/test_contrib.py @@ -9,6 +9,7 @@ import platform import os import random +import shutil import tempfile from faiss.contrib import datasets @@ -17,15 +18,13 @@ from faiss.contrib import ivf_tools from faiss.contrib import clustering from faiss.contrib import big_batch_search +from faiss.contrib.ondisk import merge_ondisk from common_faiss_tests import get_dataset_2 -try: - from faiss.contrib.exhaustive_search import \ - knn_ground_truth, knn, range_ground_truth, \ - range_search_max_results, exponential_query_iterator -except: - pass # Submodule import broken in python 2. - +from faiss.contrib.exhaustive_search import \ + knn_ground_truth, knn, range_ground_truth, \ + range_search_max_results, exponential_query_iterator +from contextlib import contextmanager @unittest.skipIf(platform.python_version_tuple()[0] < '3', 'Submodule import broken in python 2.') @@ -148,7 +147,6 @@ def test_query_iterator(self, metric=faiss.METRIC_L2): xb = ds.get_database() D, I = faiss.knn(xq, xb, 10, metric=metric) threshold = float(D[:, -1].mean()) - print(threshold) index = faiss.IndexFlat(32, metric) index.add(xb) @@ -252,7 +250,6 @@ def test_precision_recall(self): Inew = np.hstack(Inew) precision, recall = evaluation.range_PR(lims_ref, Iref, lims_new, Inew) - print(precision, recall) self.assertEqual(precision, 0.6) self.assertEqual(recall, 0.6) @@ -674,3 +671,63 @@ def test_code_set(self): np.testing.assert_equal( np.sort(np.unique(codes, axis=0), axis=None), np.sort(codes[inserted], axis=None)) + + +@unittest.skipIf(platform.system() == 'Windows', 'OnDiskInvertedLists is unsupported on Windows.') +class TestMerge(unittest.TestCase): + @contextmanager + def temp_directory(self): + temp_dir = tempfile.mkdtemp() + try: + yield temp_dir + finally: + shutil.rmtree(temp_dir) + + def do_test_ondisk_merge(self, shift_ids=False): + with self.temp_directory() as tmpdir: + # only train and add index to disk without adding elements. + # this will create empty inverted lists.
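# (merge_ondisk below writes the actual inverted-list data to the
# .ivfdata file; the index files written afterwards only reference it)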
+ ds = datasets.SyntheticDataset(32, 2000, 200, 20) + index = faiss.index_factory(ds.d, "IVF32,Flat") + index.train(ds.get_train()) + faiss.write_index(index, tmpdir + "/trained.index") + + # create 4 shards and add elements to them + ns = 4 # number of shards + + for bno in range(ns): + index = faiss.read_index(tmpdir + "/trained.index") + i0, i1 = int(bno * ds.nb / ns), int((bno + 1) * ds.nb / ns) + if shift_ids: + index.add_with_ids(ds.xb[i0:i1], np.arange(0, ds.nb / ns)) + else: + index.add_with_ids(ds.xb[i0:i1], np.arange(i0, i1)) + faiss.write_index(index, tmpdir + "/block_%d.index" % bno) + + # construct the output index and merge them on disk + index = faiss.read_index(tmpdir + "/trained.index") + block_fnames = [tmpdir + "/block_%d.index" % bno for bno in range(4)] + + merge_ondisk( + index, block_fnames, tmpdir + "/merged_index.ivfdata", shift_ids + ) + faiss.write_index(index, tmpdir + "/populated.index") + + # perform a search from index on disk + index = faiss.read_index(tmpdir + "/populated.index") + index.nprobe = 5 + D, I = index.search(ds.xq, 5) + + # ground-truth + gtI = ds.get_groundtruth(5) + + recall_at_1 = (I[:, :1] == gtI[:, :1]).sum() / float(ds.xq.shape[0]) + self.assertGreaterEqual(recall_at_1, 0.5) + + def test_ondisk_merge(self): + self.do_test_ondisk_merge() + + def test_ondisk_merge_with_shift_ids(self): + # verified that recall is the same as for test_ondisk_merge + self.do_test_ondisk_merge(True) diff --git a/tests/test_contrib_with_scipy.py b/tests/test_contrib_with_scipy.py index cb81bb623c..4f89e2fc1b 100644 --- a/tests/test_contrib_with_scipy.py +++ b/tests/test_contrib_with_scipy.py @@ -44,7 +44,6 @@ def test_sparse_routines(self): faiss.normalize_L2(xt) mask = np.abs(xt) > 0.045 - # print("fraction:", mask.sum() / mask.size) # around 10% non-zeros xt[np.logical_not(mask)] = 0 centroids = ds.get_queries() @@ -72,7 +71,6 @@ def test_sparse_kmeans(self): faiss.normalize_L2(xt) mask = np.abs(xt) > 0.045 - # print("fraction:", mask.sum() / mask.size) # around 10% non-zeros xt[np.logical_not(mask)] = 0 km = faiss.Kmeans(ds.d, 50) diff --git a/tests/test_disable_pq_sdc_tables.cpp b/tests/test_disable_pq_sdc_tables.cpp new file mode 100644 index 0000000000..b211a5c451 --- /dev/null +++ b/tests/test_disable_pq_sdc_tables.cpp @@ -0,0 +1,61 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include + +#include "faiss/Index.h" +#include "faiss/IndexHNSW.h" +#include "faiss/index_factory.h" +#include "faiss/index_io.h" +#include "test_util.h" + +pthread_mutex_t temp_file_mutex = PTHREAD_MUTEX_INITIALIZER; + +TEST(IO, TestReadHNSWPQ_whenSDCDisabledFlagPassed_thenDisableSDCTable) { + Tempfilename index_filename(&temp_file_mutex, "/tmp/faiss_TestReadHNSWPQ"); + int d = 32, n = 256; + std::default_random_engine rng(123); + std::uniform_real_distribution u(0, 100); + std::vector vectors(n * d); + for (size_t i = 0; i < n * d; i++) { + vectors[i] = u(rng); + } + + // Build the index and write it to the temp file + { + std::unique_ptr index_writer( + faiss::index_factory(d, "HNSW8,PQ4np", faiss::METRIC_L2)); + index_writer->train(n, vectors.data()); + index_writer->add(n, vectors.data()); + + faiss::write_index(index_writer.get(), index_filename.c_str()); + } + + // Load index from disk.
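Read it back twice, once as-is and once with the IO_FLAG_PQ_SKIP_SDC_TABLE flag.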
Confirm that the sdc table is equal to 0 when + // disable sdc is set + { + std::unique_ptr index_reader_read_write( + dynamic_cast( + faiss::read_index(index_filename.c_str()))); + std::unique_ptr index_reader_sdc_disabled( + dynamic_cast(faiss::read_index( + index_filename.c_str(), + faiss::IO_FLAG_PQ_SKIP_SDC_TABLE))); + + ASSERT_NE( + dynamic_cast(index_reader_read_write->storage) + ->pq.sdc_table.size(), + 0); + ASSERT_EQ( + dynamic_cast( + index_reader_sdc_disabled->storage) + ->pq.sdc_table.size(), + 0); + } +} diff --git a/tests/test_extra_distances.py b/tests/test_extra_distances.py index a474dd6ba7..fcaf4d383d 100644 --- a/tests/test_extra_distances.py +++ b/tests/test_extra_distances.py @@ -94,6 +94,33 @@ def test_jaccard(self): new_dis = faiss.pairwise_distances(xq, yb, faiss.METRIC_Jaccard) self.assertTrue(np.allclose(ref_dis, new_dis)) + def test_nan_euclidean(self): + xq, yb = self.make_example() + ref_dis = np.array([ + [scipy.spatial.distance.sqeuclidean(x, y) for y in yb] + for x in xq + ]) + new_dis = faiss.pairwise_distances(xq, yb, faiss.METRIC_NaNEuclidean) + self.assertTrue(np.allclose(ref_dis, new_dis)) + + x = [[3, np.nan, np.nan, 6]] + q = [[1, np.nan, np.nan, 5]] + dis = [(4 / 2 * ((3 - 1)**2 + (6 - 5)**2))] + new_dis = faiss.pairwise_distances(x, q, faiss.METRIC_NaNEuclidean) + self.assertTrue(np.allclose(new_dis, dis)) + + x = [[np.nan] * 4] + q = [[np.nan] * 4] + new_dis = faiss.pairwise_distances(x, q, faiss.METRIC_NaNEuclidean) + self.assertTrue(np.isnan(new_dis[0])) + + def test_abs_inner_product(self): + xq, yb = self.make_example() + dis = faiss.pairwise_distances(xq, yb, faiss.METRIC_ABS_INNER_PRODUCT) + + gt_dis = np.abs(xq @ yb.T) + np.testing.assert_allclose(dis, gt_dis, atol=1e-5) + class TestKNN(unittest.TestCase): """ test that the knn search gives the same as distance matrix + argmin """ diff --git a/tests/test_fast_scan.py b/tests/test_fast_scan.py index b061ee3af0..cfe9636fee 100644 --- a/tests/test_fast_scan.py +++ b/tests/test_fast_scan.py @@ -34,7 +34,6 @@ def test_PQ4_accuracy(self): nq = Iref.shape[0] recall_at_1 = (Iref[:, 0] == Ia[:, 0]).sum() / nq assert recall_at_1 > 0.6 - # print(f'recall@1 = {recall_at_1:.3f}') # This is an experiment to see if we can catch performance @@ -498,7 +497,6 @@ def subtest_accuracy(self, aq, st, implem, metric_type='L2'): recall_ref = (Iref == gt).sum() / nq recall = (Ia == gt).sum() / nq - print(aq, st, implem, metric_type, recall_ref, recall) assert abs(recall_ref - recall) < 0.05 def xx_test_accuracy(self): @@ -531,7 +529,6 @@ def subtest_from_idxaq(self, implem, metric): nq = Iref.shape[0] recall_ref = (Iref == gt).sum() / nq recall1 = (I1 == gt).sum() / nq - print(recall_ref, recall1) assert abs(recall_ref - recall1) < 0.05 def xx_test_from_idxaq(self): diff --git a/tests/test_fast_scan_ivf.py b/tests/test_fast_scan_ivf.py index d6dad8fec3..f48dd2e47a 100644 --- a/tests/test_fast_scan_ivf.py +++ b/tests/test_fast_scan_ivf.py @@ -84,9 +84,7 @@ def sp(x): b = btab[0] dis_new = self.compute_dis_quant(codes, LUTq, biasq, a, b) - # print(a, b, dis_ref.sum()) avg_realtive_error = np.abs(dis_new - dis_ref).sum() / dis_ref.sum() - # print('a=', a, 'avg_relative_error=', avg_realtive_error) self.assertLess(avg_realtive_error, 0.0005) def test_no_residual_ip(self): @@ -228,8 +226,6 @@ def eval_quant_loss(self, by_residual, metric=faiss.METRIC_L2): m3 = three_metrics(Da, Ia, Db, Ib) - - # print(by_residual, metric, recall_at_1, recall_at_10, intersection_at_10) ref_results = { (True, 1): [0.985, 1.0, 9.872], 
(True, 0): [ 0.987, 1.0, 9.914], @@ -261,6 +257,7 @@ class TestEquivPQ(unittest.TestCase): def test_equiv_pq(self): ds = datasets.SyntheticDataset(32, 2000, 200, 4) + xq = ds.get_queries() index = faiss.index_factory(32, "IVF1,PQ16x4np") index.by_residual = False @@ -268,7 +265,7 @@ def test_equiv_pq(self): index.quantizer.add(np.zeros((1, 32), dtype='float32')) index.train(ds.get_train()) index.add(ds.get_database()) - Dref, Iref = index.search(ds.get_queries(), 4) + Dref, Iref = index.search(xq, 4) index_pq = faiss.index_factory(32, "PQ16x4np") index_pq.pq = index.pq @@ -276,21 +273,64 @@ def test_equiv_pq(self): index_pq.codes = faiss. downcast_InvertedLists( index.invlists).codes.at(0) index_pq.ntotal = index.ntotal - Dnew, Inew = index_pq.search(ds.get_queries(), 4) + Dnew, Inew = index_pq.search(xq, 4) np.testing.assert_array_equal(Iref, Inew) np.testing.assert_array_equal(Dref, Dnew) index_pq2 = faiss.IndexPQFastScan(index_pq) index_pq2.implem = 12 - Dref, Iref = index_pq2.search(ds.get_queries(), 4) + Dref, Iref = index_pq2.search(xq, 4) index2 = faiss.IndexIVFPQFastScan(index) index2.implem = 12 - Dnew, Inew = index2.search(ds.get_queries(), 4) + Dnew, Inew = index2.search(xq, 4) np.testing.assert_array_equal(Iref, Inew) np.testing.assert_array_equal(Dref, Dnew) + # test encode and decode + + np.testing.assert_array_equal( + index_pq.sa_encode(xq), + index2.sa_encode(xq) + ) + + np.testing.assert_array_equal( + index_pq.sa_decode(index_pq.sa_encode(xq)), + index2.sa_decode(index2.sa_encode(xq)) + ) + + np.testing.assert_array_equal( + ((index_pq.sa_decode(index_pq.sa_encode(xq)) - xq) ** 2).sum(1), + ((index2.sa_decode(index2.sa_encode(xq)) - xq) ** 2).sum(1) + ) + + def test_equiv_pq_encode_decode(self): + ds = datasets.SyntheticDataset(32, 1000, 200, 10) + xq = ds.get_queries() + + index_ivfpq = faiss.index_factory(ds.d, "IVF10,PQ8x4np") + index_ivfpq.train(ds.get_train()) + + index_ivfpqfs = faiss.IndexIVFPQFastScan(index_ivfpq) + + np.testing.assert_array_equal( + index_ivfpq.sa_encode(xq), + index_ivfpqfs.sa_encode(xq) + ) + + np.testing.assert_array_equal( + index_ivfpq.sa_decode(index_ivfpq.sa_encode(xq)), + index_ivfpqfs.sa_decode(index_ivfpqfs.sa_encode(xq)) + ) + + np.testing.assert_array_equal( + ((index_ivfpq.sa_decode(index_ivfpq.sa_encode(xq)) - xq) ** 2) + .sum(1), + ((index_ivfpqfs.sa_decode(index_ivfpqfs.sa_encode(xq)) - xq) ** 2) + .sum(1) + ) + class TestIVFImplem12(unittest.TestCase): @@ -463,7 +503,6 @@ def do_test(self, by_residual=False, metric=faiss.METRIC_L2, d=32, bbs=32): Dnew, Inew = index2.search(ds.get_queries(), 10) m3 = three_metrics(Dref, Iref, Dnew, Inew) - # print((by_residual, metric, d), ":", m3) ref_m3_tab = { (True, 1, 32): (0.995, 1.0, 9.91), (True, 0, 32): (0.99, 1.0, 9.91), @@ -554,7 +593,6 @@ def subtest_accuracy(self, aq, st, by_residual, implem, metric_type='L2'): recall_ref = (Iref == gt).sum() / nq recall1 = (I1 == gt).sum() / nq - print(aq, st, by_residual, implem, metric_type, recall_ref, recall1) assert abs(recall_ref - recall1) < 0.051 def xx_test_accuracy(self): @@ -599,7 +637,6 @@ def subtest_rescale_accuracy(self, aq, st, by_residual, implem): recall_ref = (Iref == gt).sum() / nq recall1 = (I1 == gt).sum() / nq - print(aq, st, by_residual, implem, recall_ref, recall1) assert abs(recall_ref - recall1) < 0.05 def xx_test_rescale_accuracy(self): @@ -624,7 +661,6 @@ def subtest_from_ivfaq(self, implem): nq = Iref.shape[0] recall_ref = (Iref == gt).sum() / nq recall1 = (I1 == gt).sum() / nq - print(recall_ref, recall1) assert 
abs(recall_ref - recall1) < 0.02 def test_from_ivfaq(self): @@ -763,7 +799,6 @@ def subtest_accuracy(self, paq): recall_ref = (Iref == gt).sum() / nq recall1 = (I1 == gt).sum() / nq - print(paq, recall_ref, recall1) assert abs(recall_ref - recall1) < 0.05 def test_accuracy_PLSQ(self): @@ -847,7 +882,6 @@ def do_test(self, metric=faiss.METRIC_L2): # find a reasonable radius D, I = index.search(ds.get_queries(), 10) radius = np.median(D[:, -1]) - # print("radius=", radius) lims1, D1, I1 = index.range_search(ds.get_queries(), radius) index2 = faiss.IndexIVFPQFastScan(index) @@ -860,7 +894,6 @@ def do_test(self, metric=faiss.METRIC_L2): for i in range(ds.nq): ref = set(I1[lims1[i]: lims1[i + 1]]) new = set(I2[lims2[i]: lims2[i + 1]]) - print(ref, new) nmiss += len(ref - new) nextra += len(new - ref) diff --git a/tests/test_graph_based.py b/tests/test_graph_based.py index 914fac3ff1..d5797186da 100644 --- a/tests/test_graph_based.py +++ b/tests/test_graph_based.py @@ -123,6 +123,67 @@ def test_hnsw_IP(self): mask = Iref[:, 0] == Ihnsw[:, 0] assert np.allclose(Dref[mask, 0], Dhnsw[mask, 0]) + def test_ndis_stats(self): + d = self.xq.shape[1] + + index = faiss.IndexHNSWFlat(d, 16) + index.add(self.xb) + stats = faiss.cvar.hnsw_stats + stats.reset() + Dhnsw, Ihnsw = index.search(self.xq, 1) + self.assertGreater(stats.ndis, len(self.xq) * index.hnsw.efSearch) + + def test_io_no_storage(self): + d = self.xq.shape[1] + index = faiss.IndexHNSWFlat(d, 16) + index.add(self.xb) + + Dref, Iref = index.search(self.xq, 5) + + # test writing without storage + index2 = faiss.deserialize_index( + faiss.serialize_index(index, faiss.IO_FLAG_SKIP_STORAGE) + ) + self.assertEqual(index2.storage, None) + self.assertRaises( + RuntimeError, + index2.search, self.xb, 1) + + # make sure we can store an index with empty storage + index4 = faiss.deserialize_index( + faiss.serialize_index(index2)) + + # add storage afterwards + index.storage = faiss.clone_index(index.storage) + index.own_fields = True + + Dnew, Inew = index.search(self.xq, 5) + np.testing.assert_array_equal(Dnew, Dref) + np.testing.assert_array_equal(Inew, Iref) + + if False: + # test reading without storage + # not implemented because it is hard to skip over an index + index3 = faiss.deserialize_index( + faiss.serialize_index(index), faiss.IO_FLAG_SKIP_STORAGE + ) + self.assertEquals(index3.storage, None) + + def test_abs_inner_product(self): + """Test HNSW with abs inner product (not a real distance, so dubious that triangular inequality works)""" + d = self.xq.shape[1] + xb = self.xb - self.xb.mean(axis=0) # need to be centered to give interesting directions + xq = self.xq - self.xq.mean(axis=0) + Dref, Iref = faiss.knn(xq, xb, 10, faiss.METRIC_ABS_INNER_PRODUCT) + + index = faiss.IndexHNSWFlat(d, 32, faiss.METRIC_ABS_INNER_PRODUCT) + index.add(xb) + Dnew, Inew = index.search(xq, 10) + + inter = faiss.eval_intersection(Iref, Inew) + # 4769 vs. 
500*10 + self.assertGreater(inter, Iref.size * 0.9) + class TestNSG(unittest.TestCase): @@ -199,7 +260,6 @@ def subtest_add(self, build_type, thresh, metric=faiss.METRIC_L2): Dnsg, Insg = index.search(self.xq, 1) recalls = (Iref == Insg).sum() - print('metric: {}, nb equal: {}'.format(metrics[metric], recalls)) self.assertGreaterEqual(recalls, thresh) self.subtest_connectivity(index, self.xb.shape[0]) self.subtest_io_and_clone(index, Dnsg, Insg) @@ -220,7 +280,6 @@ def subtest_build(self, knn_graph, thresh, metric=faiss.METRIC_L2): Dnsg, Insg = index.search(self.xq, 1) recalls = (Iref == Insg).sum() - print('metric: {}, nb equal: {}'.format(metrics[metric], recalls)) self.assertGreaterEqual(recalls, thresh) self.subtest_connectivity(index, self.xb.shape[0]) @@ -276,7 +335,6 @@ def test_reset(self): index.add(self.xb) Dnsg, Insg = index.search(self.xq, 1) recalls = (Iref == Insg).sum() - print('metric: {}, nb equal: {}'.format(metrics[metric], recalls)) self.assertGreaterEqual(recalls, 475) self.subtest_connectivity(index, self.xb.shape[0]) @@ -284,7 +342,6 @@ def test_reset(self): index.add(self.xb) Dnsg, Insg = index.search(self.xq, 1) recalls = (Iref == Insg).sum() - print('metric: {}, nb equal: {}'.format(metrics[metric], recalls)) self.assertGreaterEqual(recalls, 475) self.subtest_connectivity(index, self.xb.shape[0]) @@ -325,7 +382,6 @@ def test_nsg_pq(self): # test accuracy recalls = (Iref == I).sum() - print("IndexNSGPQ", recalls) self.assertGreaterEqual(recalls, 190) # 193 # test I/O @@ -351,7 +407,6 @@ def test_nsg_sq(self): # test accuracy recalls = (Iref == I).sum() - print("IndexNSGSQ", recalls) self.assertGreaterEqual(recalls, 405) # 411 # test I/O @@ -385,7 +440,6 @@ def test_nndescentflat(self): # test accuracy recalls = (Iref == I).sum() - print("IndexNNDescentFlat", recalls) self.assertGreaterEqual(recalls, 450) # 462 # do some IO tests diff --git a/tests/test_index.py b/tests/test_index.py index f46c6a94bf..43db906e47 100644 --- a/tests/test_index.py +++ b/tests/test_index.py @@ -327,7 +327,7 @@ def test_4variants_ivf(self): D, I = index.search(xq, 10) nok['flat'] = (I[:, 0] == I_ref[:, 0]).sum() - for qname in "QT_4bit QT_4bit_uniform QT_8bit QT_8bit_uniform QT_fp16".split(): + for qname in "QT_4bit QT_4bit_uniform QT_8bit QT_8bit_uniform QT_fp16 QT_bf16".split(): qtype = getattr(faiss.ScalarQuantizer, qname) index = faiss.IndexIVFScalarQuantizer(quantizer, d, ncent, qtype, faiss.METRIC_L2) @@ -338,7 +338,6 @@ def test_4variants_ivf(self): D, I = index.search(xq, 10) nok[qname] = (I[:, 0] == I_ref[:, 0]).sum() - print(nok, nq) self.assertGreaterEqual(nok['flat'], nq * 0.6) # The tests below are a bit fragile, it happens that the @@ -350,6 +349,7 @@ def test_4variants_ivf(self): self.assertGreaterEqual(nok['QT_8bit'], nok['QT_8bit_uniform']) self.assertGreaterEqual(nok['QT_4bit'], nok['QT_4bit_uniform']) self.assertGreaterEqual(nok['QT_fp16'], nok['QT_8bit']) + self.assertGreaterEqual(nok['QT_bf16'], nok['QT_8bit']) def test_4variants(self): d = 32 @@ -365,7 +365,7 @@ def test_4variants(self): nok = {} - for qname in "QT_4bit QT_4bit_uniform QT_8bit QT_8bit_uniform QT_fp16".split(): + for qname in "QT_4bit QT_4bit_uniform QT_8bit QT_8bit_uniform QT_fp16 QT_bf16".split(): qtype = getattr(faiss.ScalarQuantizer, qname) index = faiss.IndexScalarQuantizer(d, qtype, faiss.METRIC_L2) index.train(xt) @@ -373,13 +373,12 @@ def test_4variants(self): D, I = index.search(xq, 10) nok[qname] = (I[:, 0] == I_ref[:, 0]).sum() - print(nok, nq) - self.assertGreaterEqual(nok['QT_8bit'], nq 
* 0.9) self.assertGreaterEqual(nok['QT_8bit'], nok['QT_4bit']) self.assertGreaterEqual(nok['QT_8bit'], nok['QT_8bit_uniform']) self.assertGreaterEqual(nok['QT_4bit'], nok['QT_4bit_uniform']) self.assertGreaterEqual(nok['QT_fp16'], nok['QT_8bit']) + self.assertGreaterEqual(nok['QT_bf16'], nq * 0.9) class TestRangeSearch(unittest.TestCase): @@ -442,7 +441,6 @@ def norm1(x): recons_err = np.mean(norm1(R_flat - xb[I_flat])) - print('Reconstruction error = %.3f' % recons_err) if eps is not None: self.assertLessEqual(recons_err, eps) @@ -638,7 +636,6 @@ def test_reconstuct_after_add(self): # should not raise an exception index.reconstruct(5) - print(index.ntotal) index.reconstruct(150) diff --git a/tests/test_index_accuracy.py b/tests/test_index_accuracy.py index 3f7bfbd303..8d8b4a28f6 100644 --- a/tests/test_index_accuracy.py +++ b/tests/test_index_accuracy.py @@ -56,7 +56,6 @@ def test_ivf_kmeans(self): Dref, Iref = ivfk.search(ev.xq, 100) ivfk.parallel_mode = 1 Dnew, Inew = ivfk.search(ev.xq, 100) - print((Iref != Inew).sum(), Iref.size) assert (Iref != Inew).sum() < Iref.size / 5000.0 assert np.all(Dref == Dnew) @@ -136,8 +135,6 @@ def test_polysemous(self): res = ev.launch("Polysemous ht=%d" % index.polysemous_ht, index) e_polysemous = ev.evalres(res) - print(e_baseline, e_polysemous, index.polysemous_ht) - print(stats.n_hamming_pass, stats.ncode) # The randu dataset is difficult, so we are not too picky on # the results. Here we assert that we have < 10 % loss when # computing full PQ on fewer than 20% of the data. @@ -248,7 +245,6 @@ def subtest(self, mt): index.nprobe = 4 # hopefully more robust than 1 D, I = index.search(xq, 10) ninter = faiss.eval_intersection(I, gt_I) - print("(%d, %s): %d, " % (mt, repr(qname), ninter)) assert abs(ninter - self.ref_results[(mt, qname)]) <= 10 if qname == "6bit": @@ -264,7 +260,6 @@ def subtest(self, mt): radius = float(D[:, -1].max()) else: radius = float(D[:, -1].min()) - # print("radius", radius) lims, D3, I3 = index.range_search(xq, radius) ntot = ndiff = 0 @@ -278,14 +273,11 @@ def subtest(self, mt): Iref = set(I2[i, mask]) ndiff += len(Inew ^ Iref) ntot += len(Iref) - # print("ndiff %d / %d" % (ndiff, ntot)) assert ndiff < ntot * 0.01 for pm in 1, 2: - # print("parallel_mode=%d" % pm) index.parallel_mode = pm lims4, D4, I4 = index.range_search(xq, radius) - # print("sizes", lims4[1:] - lims4[:-1]) for qno in range(len(lims) - 1): Iref = I3[lims[qno]: lims[qno + 1]] Inew = I4[lims4[qno]: lims4[qno + 1]] @@ -485,7 +477,6 @@ def subtest(self, mt): D, I = index.search(xq, 10) ninter = faiss.eval_intersection(I, gt_I) - print("(%d, %s): %d, " % (mt, by_residual, ninter)) assert abs(ninter - self.ref_results[mt, by_residual]) <= 3 @@ -499,10 +490,6 @@ def subtest(self, mt): index.polysemous_ht = 20 D, I = index.search(xq, 10) ninter = faiss.eval_intersection(I, gt_I) - print( - "(%d, %s, %d): %d, " - % (mt, by_residual, index.polysemous_ht, ninter) - ) # polysemous behaves bizarrely on ARM assert ( @@ -516,7 +503,6 @@ def subtest(self, mt): radius = float(D[:, -1].max()) else: radius = float(D[:, -1].min()) - print("radius", radius) lims, D3, I3 = index.range_search(xq, radius) ntot = ndiff = 0 @@ -530,7 +516,6 @@ def subtest(self, mt): Iref = set(I2[i, mask]) ndiff += len(Inew ^ Iref) ntot += len(Iref) - print("ndiff %d / %d" % (ndiff, ntot)) assert ndiff < ntot * 0.02 def test_IVFPQ_non8bit(self): @@ -555,7 +540,6 @@ def test_IVFPQ_non8bit(self): D, I = index.search(xq, 10) ninter[v] = faiss.eval_intersection(I, gt_I) - print("ninter=", ninter) # 
this should be the case but we don't observe # that... Probably too few test points # assert ninter['2x8'] > ninter['8x2'] @@ -623,9 +607,6 @@ def test_OPQ(self): res = ev.launch("OPQ", index) e_opq = ev.evalres(res) - print("e_pq=%s" % e_pq) - print("e_opq=%s" % e_opq) - # verify that OPQ better than PQ for r in 1, 10, 100: assert e_opq[r] > e_pq[r] @@ -656,7 +637,6 @@ def test_OIVFPQ(self): # verify same on OIVFPQ for r in 1, 10, 100: - print(e_oivfpq[r], e_ivfpq[r]) assert e_oivfpq[r] >= e_ivfpq[r] @@ -758,9 +738,6 @@ def test_sh(self): ninter = faiss.eval_intersection(I, gt_I) key = (nbit, tt, period) - print("(%d, %s, %g): %d, " % (nbit, repr(tt), period, - ninter)) - print(abs(ninter - self.ref_results[key])) assert abs(ninter - self.ref_results[key]) <= 14 @@ -799,7 +776,6 @@ def do_test(self, metric): # check that with refinement, the recall@10 is the same as # the original recall@100 recall2 = (I2 == Iref[:, :1]).sum() - # print("recalls", recall1, recall2) self.assertEqual(recall1, recall2) def test_IP(self): diff --git a/tests/test_index_binary.py b/tests/test_index_binary.py index 312530ad46..7820cb6627 100644 --- a/tests/test_index_binary.py +++ b/tests/test_index_binary.py @@ -100,6 +100,9 @@ def test_flat(self): index.add(self.xb) D, I = index.search(self.xq, 3) + I2 = index.assign(x=self.xq, k=3, labels=None) + assert np.all(I == I2) + for i in range(nq): for j, dj in zip(I[i], D[i]): ref_dis = binary_dis(self.xq[i], self.xb[j]) @@ -139,10 +142,18 @@ def test_range_search(self): self.assertTrue(set(range_res) <= set(I[i])) nt2 += 1 # in case of equality we have a problem with ties - print('nb tests', nt1, nt2) # nb tests is actually low... self.assertTrue(nt1 > 19 and nt2 > 19) + def test_reconstruct(self): + index = faiss.IndexBinaryFlat(64) + input_vector = np.random.randint(0, 255, size=(10, index.code_size)).astype("uint8") + index.add(input_vector) + + reconstructed_vector = index.reconstruct_n(0, 4) + assert reconstructed_vector.shape == (4, index.code_size) + assert np.all(input_vector[:4] == reconstructed_vector) + class TestBinaryIVF(unittest.TestCase): @@ -275,8 +286,6 @@ def test_ivf_nprobe(self): ref_index.add(xb) ref_D, ref_I = ref_index.search(xq, k) - print(D[0], ref_D[0]) - print(I[0], ref_I[0]) assert np.all(D == ref_D) # assert np.all(I == ref_I) # id may be different diff --git a/tests/test_index_composite.py b/tests/test_index_composite.py index a760c0cf09..8d9b441adc 100644 --- a/tests/test_index_composite.py +++ b/tests/test_index_composite.py @@ -168,8 +168,6 @@ def test_remove_id_map_2(self): index.remove_ids(remove_set) index.add_with_ids(X[5:, :], idx[5:]) - print (index.search(X, 1)) - for i in range(10): _, searchres = index.search(X[i:i + 1, :], 1) if idx[i] in remove_set: @@ -954,7 +952,6 @@ def do_test(self, factory_string): index.nprobe = 10 Dref, Iref = index.search(ds.get_queries(), 10) - #print(index.search_and_return_codes) D, I, codes = index.search_and_return_codes( ds.get_queries(), 10, include_listnos=True) diff --git a/tests/test_io.py b/tests/test_io.py index dc8ac3dcfb..99dfe60847 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -102,7 +102,6 @@ def test_buf_read(self): reader = faiss.BufferedIOReader(reader, bsz) y = np.zeros_like(x) - print('nbytes=', y.nbytes) reader(faiss.swig_ptr(y), y.nbytes, 1) np.testing.assert_array_equal(x, y) diff --git a/tests/test_ivflib.py b/tests/test_ivflib.py index f19c3da45b..0a3fb8c87e 100644 --- a/tests/test_ivflib.py +++ b/tests/test_ivflib.py @@ -125,7 +125,6 @@ def
test_range_search_with_parameters(self): Dpre, _ = index.search(xq, 15) radius = float(np.median(Dpre[:, -1])) - print("Radius=", radius) stats = faiss.cvar.indexIVF_stats stats.reset() Lref, Dref, Iref = index.range_search(xq, radius) diff --git a/tests/test_local_search_quantizer.py b/tests/test_local_search_quantizer.py index 01fec70ccf..7975929811 100644 --- a/tests/test_local_search_quantizer.py +++ b/tests/test_local_search_quantizer.py @@ -196,7 +196,6 @@ def test_update_codebooks_with_double(self): err_float = eval_codec(lsq, xb) # 6533.377 vs 25457.99 - print(err_double, err_float) self.assertLess(err_double, err_float) def test_compute_binary_terms(self): @@ -348,7 +347,6 @@ def test_training(self): pq.train(xt) err_pq = eval_codec(pq, xb) - print(err_lsq, err_pq) self.assertLess(err_lsq, err_pq) @@ -463,7 +461,6 @@ def eval_index_accuracy(self, factory_key): index.nprobe = nprobe D, I = index.search(ds.get_queries(), 10) inter = faiss.eval_intersection(I, ds.get_groundtruth(10)) - # print("nprobe=", nprobe, "inter=", inter) inters.append(inter) inters = np.array(inters) @@ -528,7 +525,6 @@ def test_codec(self): pq.train(xt) err_pq = eval_codec(pq, xb) - print(err_plsq, err_pq) self.assertLess(err_plsq, err_pq) def test_with_lsq(self): @@ -549,7 +545,6 @@ def test_with_lsq(self): lsq.train(xt) err_lsq = eval_codec(lsq, xb) - print(err_plsq, err_lsq) self.assertEqual(err_plsq, err_lsq) def test_lut(self): @@ -664,7 +659,6 @@ def test_index_accuracy2(self): """check that the error is in the same ballpark as LSQ.""" inter1 = self.eval_index_accuracy("IVF32,PLSQ2x2x5_Nqint8") inter2 = self.eval_index_accuracy("IVF32,LSQ4x5_Nqint8") - # print(inter1, inter2) # 381 vs 374 self.assertGreaterEqual(inter1 * 1.1, inter2) def test_factory(self): diff --git a/tests/test_lowlevel_ivf.cpp b/tests/test_lowlevel_ivf.cpp index e28e2a946f..7ce90a1d2d 100644 --- a/tests/test_lowlevel_ivf.cpp +++ b/tests/test_lowlevel_ivf.cpp @@ -364,22 +364,9 @@ void test_lowlevel_access_binary(const char* index_key) { } } - printf("new before reroder: ["); - for (int j = 0; j < k; j++) - printf("%" PRId64 ",%d ", I[j], D[j]); - printf("]\n"); - // re-order heap heap_reorder>(k, D.data(), I.data()); - printf("ref: ["); - for (int j = 0; j < k; j++) - printf("%" PRId64 ",%d ", I_ref[j], D_ref[j]); - printf("]\nnew: ["); - for (int j = 0; j < k; j++) - printf("%" PRId64 ",%d ", I[j], D[j]); - printf("]\n"); - // check that we have the same results as the reference search for (int j = 0; j < k; j++) { // here the order is not guaranteed to be the same diff --git a/tests/test_merge.cpp b/tests/test_merge.cpp index 7e23f15f72..edbe2a03a6 100644 --- a/tests/test_merge.cpp +++ b/tests/test_merge.cpp @@ -6,47 +6,22 @@ */ #include -#include #include -#include - #include #include #include #include -#include #include #include #include -namespace { - -struct Tempfilename { - static pthread_mutex_t mutex; - - std::string filename = "/tmp/faiss_tmp_XXXXXX"; +#include "test_util.h" - Tempfilename() { - pthread_mutex_lock(&mutex); - int fd = mkstemp(&filename[0]); - close(fd); - pthread_mutex_unlock(&mutex); - } - - ~Tempfilename() { - if (access(filename.c_str(), F_OK)) { - unlink(filename.c_str()); - } - } - - const char* c_str() { - return filename.c_str(); - } -}; +namespace { -pthread_mutex_t Tempfilename::mutex = PTHREAD_MUTEX_INITIALIZER; +pthread_mutex_t temp_file_mutex = PTHREAD_MUTEX_INITIALIZER; typedef faiss::idx_t idx_t; @@ -57,6 +32,7 @@ size_t nq = 100; int nindex = 4; int k = 10; int nlist = 40; +int shard_size = 
nb / nindex; struct CommonData { std::vector database; @@ -95,7 +71,7 @@ int compare_merged( std::vector refD(k * nq); index_shards->search(nq, cd.queries.data(), k, refD.data(), refI.data()); - Tempfilename filename; + Tempfilename filename(&temp_file_mutex, "/tmp/faiss_tmp_XXXXXX"); std::vector newI(k * nq); std::vector newD(k * nq); @@ -125,7 +101,7 @@ int compare_merged( auto il = new faiss::OnDiskInvertedLists( index0->nlist, index0->code_size, filename.c_str()); - il->merge_from(lists.data(), lists.size()); + il->merge_from_multiple(lists.data(), lists.size(), shift_ids); index0->replace_invlists(il, true); index0->ntotal = ntotal; @@ -135,11 +111,14 @@ int compare_merged( nq, cd.queries.data(), k, newD.data(), newI.data()); size_t ndiff = 0; + bool adjust_ids = shift_ids && !standard_merge; for (size_t i = 0; i < k * nq; i++) { - if (refI[i] != newI[i]) { + idx_t new_id = adjust_ids ? newI[i] % shard_size : newI[i]; + if (refI[i] != new_id) { ndiff++; } } + return ndiff; } @@ -212,7 +191,7 @@ TEST(MERGE, merge_flat_vt) { TEST(MERGE, merge_flat_ondisk) { faiss::IndexShards index_shards(d, false, false); index_shards.own_indices = true; - Tempfilename filename; + Tempfilename filename(&temp_file_mutex, "/tmp/faiss_tmp_XXXXXX"); for (int i = 0; i < nindex; i++) { auto ivf = new faiss::IndexIVFFlat(&cd.quantizer, d, nlist); @@ -245,3 +224,23 @@ TEST(MERGE, merge_flat_ondisk_2) { int ndiff = compare_merged(&index_shards, false, false); EXPECT_GE(0, ndiff); } + +// now use the ondisk-specific merge with shift_ids +TEST(MERGE, merge_flat_ondisk_3) { + faiss::IndexShards index_shards(d, false, false); + index_shards.own_indices = true; + + std::vector ids; + for (int i = 0; i < nb; ++i) { + int id = i % shard_size; + ids.push_back(id); + } + for (int i = 0; i < nindex; i++) { + index_shards.add_shard( + new faiss::IndexIVFFlat(&cd.quantizer, d, nlist)); + } + EXPECT_TRUE(index_shards.is_trained); + index_shards.add_with_ids(nb, cd.database.data(), ids.data()); + int ndiff = compare_merged(&index_shards, true, false); + EXPECT_GE(0, ndiff); +} diff --git a/tests/test_merge_index.py b/tests/test_merge_index.py index 8c4c1f0912..bdcc813f1c 100644 --- a/tests/test_merge_index.py +++ b/tests/test_merge_index.py @@ -72,7 +72,6 @@ def do_test_merge(self, index_type): index.merge_from(indexes[i], index.ntotal) _D, I = index.search(xq, k) - print(I[:5, :6]) ndiff = (I != Iref).sum() print('%d / %d differences' % (ndiff, nq * k)) @@ -246,19 +245,45 @@ def test_merge_IDMap2(self): class TestRemoveFastScan(unittest.TestCase): - def do_fast_scan_test(self, factory_key, size1): + def do_fast_scan_test(self, + factory_key, + with_ids=False, + direct_map_type=faiss.DirectMap.NoMap): ds = SyntheticDataset(110, 1000, 1000, 100) - index1 = faiss.index_factory(ds.d, factory_key) - index1.train(ds.get_train()) - index1.reset() + index = faiss.index_factory(ds.d, factory_key) + index.train(ds.get_train()) + + index.reset() tokeep = [i % 3 == 0 for i in range(ds.nb)] - index1.add(ds.get_database()[tokeep]) - _, Iref = index1.search(ds.get_queries(), 5) - index1.reset() - index1.add(ds.get_database()) - index1.remove_ids(np.where(np.logical_not(tokeep))[0]) - _, Inew = index1.search(ds.get_queries(), 5) + if with_ids: + index.add_with_ids(ds.get_database()[tokeep], np.arange(ds.nb)[tokeep]) + faiss.extract_index_ivf(index).nprobe = 5 + else: + index.add(ds.get_database()[tokeep]) + _, Iref = index.search(ds.get_queries(), 5) + + index.reset() + if with_ids: + index.add_with_ids(ds.get_database(), np.arange(ds.nb))
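# exercise remove_ids together with a direct map (which maps ids back to
# their (inverted list, offset) entries); the Hashtable variant is
# expected to be rejected for fast-scan indexes, see
# test_remove_IVFPQFastScan_2 below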
+ index.set_direct_map_type(direct_map_type) + faiss.extract_index_ivf(index).nprobe = 5 + else: + index.add(ds.get_database()) + index.remove_ids(np.where(np.logical_not(tokeep))[0]) + _, Inew = index.search(ds.get_queries(), 5) np.testing.assert_array_equal(Inew, Iref) - def test_remove(self): - self.do_fast_scan_test("PQ5x4fs", 320) + def test_remove_PQFastScan(self): + # with_ids is not supported for this type of index + self.do_fast_scan_test("PQ5x4fs", False) + + def test_remove_IVFPQFastScan(self): + self.do_fast_scan_test("IVF20,PQ5x4fs", True) + + def test_remove_IVFPQFastScan_2(self): + self.assertRaisesRegex(Exception, + ".*not supported.*", + self.do_fast_scan_test, + "IVF20,PQ5x4fs", + True, + faiss.DirectMap.Hashtable) diff --git a/tests/test_meta_index.py b/tests/test_meta_index.py index d53cad48f7..d0896e8ba2 100644 --- a/tests/test_meta_index.py +++ b/tests/test_meta_index.py @@ -82,10 +82,8 @@ def test_shards(self): k = 32 ref_index = faiss.IndexFlatL2(d) - print('ref search') ref_index.add(xb) _Dref, Iref = ref_index.search(xq, k) - print(Iref[:5, :6]) shard_index = faiss.IndexShards(d) shard_index_2 = faiss.IndexShards(d, True, False) @@ -109,7 +107,6 @@ def test_shards(self): for test_no in range(3): with_threads = test_no == 1 - print('shard search test_no = %d' % test_no) if with_threads: remember_nt = faiss.omp_get_max_threads() faiss.omp_set_num_threads(1) @@ -122,14 +119,10 @@ def test_shards(self): else: _D, I = shard_index_2.search(xq, k) - print(I[:5, :6]) - if with_threads: faiss.omp_set_num_threads(remember_nt) ndiff = (I != Iref).sum() - - print('%d / %d differences' % (ndiff, nq * k)) assert (ndiff < nq * k / 1000.) def test_shards_ivf(self): diff --git a/tests/test_partition.py b/tests/test_partition.py index 02de7e8c2c..fd41eabe1f 100644 --- a/tests/test_partition.py +++ b/tests/test_partition.py @@ -49,7 +49,6 @@ def do_partition(self, n, q, maxval=None, seed=None): if seed is None: for i in range(50): self.do_partition(n, q, maxval, i + 1234) - # print("seed=", seed) rs = np.random.RandomState(seed) if maxval is None: vals = rs.rand(n).astype('float32') @@ -95,7 +94,6 @@ def do_partition(self, n, q, maxval=None, seed=None): if seed is None: for i in range(50): self.do_partition(n, q, maxval, i + 1234) - # print("seed=", seed) rs = np.random.RandomState(seed) if maxval is None: vals = rs.rand(n).astype('float32') @@ -148,7 +146,6 @@ def do_partition(self, n, q, maxval=65536, seed=None): for i in range(50): self.do_partition(n, q, maxval, i + 1234) - # print("seed=", seed) rs = np.random.RandomState(seed) vals = rs.randint(maxval, size=n).astype('uint16') ids = (rs.permutation(n) + 12345).astype('int64') @@ -160,7 +157,6 @@ def do_partition(self, n, q, maxval=65536, seed=None): tab_a = faiss.AlignedTableUint16() faiss.copy_array_to_AlignedTable(vals, tab_a) - # print("tab a type", tab_a.get()) if type(q) == int: faiss.CMax_uint16_partition_fuzzy( tab_a.get(), sp(ids), n, q, q, None) @@ -196,7 +192,6 @@ def do_partition(self, n, q, maxval=65536, seed=None): if seed is None: for i in range(50): self.do_partition(n, q, maxval, i + 1234) - # print("seed=", seed) rs = np.random.RandomState(seed) vals = rs.randint(maxval, size=n).astype('uint16') ids = (rs.permutation(n) + 12345).astype('int64') @@ -209,7 +204,6 @@ def do_partition(self, n, q, maxval=65536, seed=None): vals_inv = (65535 - vals).astype('uint16') faiss.copy_array_to_AlignedTable(vals_inv, tab_a) - # print("tab a type", tab_a.get()) if type(q) == int: faiss.CMin_uint16_partition_fuzzy( tab_a.get(),
sp(ids), n, q, q, None) diff --git a/tests/test_product_quantizer.py b/tests/test_product_quantizer.py index 1cdee7f144..f531cab2a1 100644 --- a/tests/test_product_quantizer.py +++ b/tests/test_product_quantizer.py @@ -26,7 +26,6 @@ def test_pq(self): x2 = pq.decode(codes) diff = ((x - x2)**2).sum() - # print("diff=", diff) # diff= 4418.0562 self.assertGreater(5000, diff) @@ -71,7 +70,6 @@ def do_test_codec(self, nbit): def test_codec(self): for i in range(16): - print("Testing nbits=%d" % (i + 1)) self.do_test_codec(i + 1) diff --git a/tests/test_residual_quantizer.py b/tests/test_residual_quantizer.py index e37ee3efe2..f4381607e1 100644 --- a/tests/test_residual_quantizer.py +++ b/tests/test_residual_quantizer.py @@ -211,7 +211,6 @@ def test_training(self): # in practice RQ is often better than PQ but that is not the case here, so just check # that we are within some factor. - # print(err_pq, err_rq) self.assertLess(err_rq, err_pq * 1.2) def test_beam_size(self): @@ -321,10 +320,8 @@ def retrain_AQ_codebook(index, xt): x_decoded = index.sa_decode(codes_packed) MSE = ((xt - x_decoded) ** 2).sum() / n - # print(f"Initial MSE on training set: {MSE:g}") codes = unpack_codes(index.rq, codes_packed) - # print("ref codes", codes[0]) codebook_offsets = faiss.vector_to_array(rq.codebook_offsets) # build sparse code matrix (represented as a dense matrix) @@ -343,7 +340,6 @@ def retrain_AQ_codebook(index, xt): B, residuals, rank, singvals = scipy.linalg.lstsq(C, xt, ) MSE = ((C @ B - xt) ** 2).sum() / n - # print(f"MSE after retrainining: {MSE:g}") # replace codebook # faiss.copy_array_to_vector(B.astype('float32').ravel(), index.rq.codebooks) @@ -503,7 +499,6 @@ def test_reestimate_codebook_2(self): xt_decoded = ir.sa_decode(ir.sa_encode(xt)) err_after_refined = ((xt - xt_decoded) ** 2).sum() - # print(err_before, err_after_refined) # ref run 7474.98 / 7006.1777 self.assertGreater(err_before, err_after_refined * 1.06) @@ -781,7 +776,6 @@ def test_search_L2(self): else: inter_2 = faiss.eval_intersection(I2, gt) self.assertGreaterEqual(inter_ref, inter_2) - # print(st, inter_ref, inter_2) ########################################################### @@ -814,7 +808,6 @@ def do_test_accuracy(self, by_residual, st): index.nprobe = nprobe D, I = index.search(ds.get_queries(), 10) inter = faiss.eval_intersection(I, ds.get_groundtruth(10)) - # print(st, "nprobe=", nprobe, "inter=", inter) inters.append(inter) # do a little I/O test @@ -909,18 +902,13 @@ def do_test_accuracy_IP(self, by_residual): D, I = index.search(ds.get_queries(), 10) index.rq.search_type = faiss.AdditiveQuantizer.ST_LUT_nonorm D2, I2 = index.search(ds.get_queries(), 10) - # print(D[:5] - D2[:5]) - # print(I[:5]) np.testing.assert_array_almost_equal(D, D2, decimal=5) # there are many ties because the codes are so short self.assertLess((I != I2).sum(), I.size * 0.1) # D2, I2 = index2.search(ds.get_queries(), 10) - # print(D[:5]) - # print(D2[:5]) inter = faiss.eval_intersection(I, ds.get_groundtruth(10)) - # print("nprobe=", nprobe, "inter=", inter) inters.append(inter) self.assertTrue(np.all(inters[1:4] >= inters[:3])) @@ -979,8 +967,6 @@ def beam_search_encode_step_tab(codes, L, distances, codebook_cross_prods_i, for b in range(beam_size): dotprods[i, b, :] += cb[codes[i, b, j]] - # print("dps", dotprods[:3, :2, :4]) - new_distances += 2 * dotprods cent_distances = new_distances @@ -1166,7 +1152,6 @@ def test_codec(self): pq.train(xt) err_pq = eval_codec(pq, xb) - # print(err_prq, err_pq) self.assertLess(err_prq, err_pq) def
test_with_rq(self): @@ -1187,7 +1172,6 @@ def test_with_rq(self): rq.train(xt) err_rq = eval_codec(rq, xb) - # print(err_prq, err_rq) self.assertEqual(err_prq, err_rq) @@ -1271,7 +1255,6 @@ def test_index_accuracy2(self): """check that the error is in the same ballpark as RQ.""" inter1 = self.eval_index_accuracy("IVF100,PRQ2x2x5_Nqint8") inter2 = self.eval_index_accuracy("IVF100,RQ4x5_Nqint8") - # print(inter1, inter2) # 392 vs 374 self.assertGreaterEqual(inter1 * 1.1, inter2) def test_factory(self): diff --git a/tests/test_rowwise_minmax.py b/tests/test_rowwise_minmax.py index dbd14de388..53e6c00b15 100644 --- a/tests/test_rowwise_minmax.py +++ b/tests/test_rowwise_minmax.py @@ -45,7 +45,6 @@ def compare_train_vs_train_inplace(self, factory_key): # make sure that the reconstruction error is not crazy reconstruction_err = ((x - decoded) ** 2).sum() - print(reconstruction_err) self.assertLess(reconstruction_err, 0.6) diff --git a/tests/test_search_params.py b/tests/test_search_params.py index 954d39cd00..886ffc0c62 100644 --- a/tests/test_search_params.py +++ b/tests/test_search_params.py @@ -22,7 +22,7 @@ class TestSelector(unittest.TestCase): combinations as possible. """ - def do_test_id_selector(self, index_key, id_selector_type="batch", mt=faiss.METRIC_L2): + def do_test_id_selector(self, index_key, id_selector_type="batch", mt=faiss.METRIC_L2, k=10): """ Verify that the id selector returns the subset of results that are members according to the IDSelector. Supports id_selector_type="batch", "bitmap", "range", "range_sorted", "and", "or", "xor" @@ -30,7 +30,6 @@ def do_test_id_selector(self, index_key, id_selector_type="batch", mt=faiss.METR ds = datasets.SyntheticDataset(32, 1000, 100, 20) index = faiss.index_factory(ds.d, index_key, mt) index.train(ds.get_train()) - k = 10 # reference result if "range" in id_selector_type: @@ -145,6 +144,16 @@ def test_IVFFlat_range_sorted(self): def test_IVFPQ(self): self.do_test_id_selector("IVF32,PQ4x4np") + def test_IVFPQfs(self): + self.do_test_id_selector("IVF32,PQ4x4fs") + + def test_IVFPQfs_k1(self): + self.do_test_id_selector("IVF32,PQ4x4fs", k=1) + + def test_IVFPQfs_k40(self): + # test reservoir codepath + self.do_test_id_selector("IVF32,PQ4x4fs", k=40) + def test_IVFSQ(self): self.do_test_id_selector("IVF32,SQ8") @@ -456,7 +465,6 @@ def test_12_92(self): sp = faiss.swig_ptr selr.find_sorted_ids_bounds( len(ids), sp(ids), sp(j01[:1]), sp(j01[1:])) - print(j01) assert j01[0] >= j01[1] diff --git a/tests/test_sliding_ivf.cpp b/tests/test_sliding_ivf.cpp index ea9e53d6b5..0214dd72e8 100644 --- a/tests/test_sliding_ivf.cpp +++ b/tests/test_sliding_ivf.cpp @@ -74,8 +74,6 @@ void make_index_slices( for (int i = 0; i < total_size; i++) { sub_indexes.emplace_back(clone_index(trained_index)); - printf("preparing sub-index # %d\n", i); - Index* index = sub_indexes.back().get(); auto xb = make_data(nb * d); @@ -122,13 +120,10 @@ int test_sliding_window(const char* index_key) { auto xq = make_data(nq * d); for (int i = 0; i < total_size + window_size; i++) { - printf("doing step %d / %d\n", i, total_size + window_size); - // update the index window.step( i < total_size ? 
sub_indexes[i].get() : nullptr, i >= window_size); - printf(" current n_slice = %d\n", window.n_slice); auto new_res = search_index(index.get(), xq.data()); @@ -159,8 +154,6 @@ int test_sliding_invlists(const char* index_key) { auto xq = make_data(nq * d); for (int i = 0; i < total_size + window_size; i++) { - printf("doing step %d / %d\n", i, total_size + window_size); - // update the index std::vector ils; for (int j = i - window_size + 1; j <= i; j++) { @@ -178,8 +171,6 @@ int test_sliding_invlists(const char* index_key) { // will be deleted by the index index_ivf->replace_invlists(ci, true); - printf(" nb invlists = %zd\n", ils.size()); - auto new_res = search_index(index.get(), xq.data()); std::unique_ptr merged_index( @@ -188,13 +179,6 @@ int test_sliding_invlists(const char* index_key) { auto ref_res = search_index(merged_index.get(), xq.data()); EXPECT_EQ(ref_res.size(), new_res.size()); - - size_t ndiff = 0; - for (size_t j = 0; j < ref_res.size(); j++) { - if (ref_res[j] != new_res[j]) - ndiff++; - } - printf(" nb differences: %zd / %zd\n", ndiff, ref_res.size()); EXPECT_EQ(ref_res, new_res); } return 0; diff --git a/tests/test_standalone_codec.py b/tests/test_standalone_codec.py index 7fdcf6849f..391b88b9dd 100644 --- a/tests/test_standalone_codec.py +++ b/tests/test_standalone_codec.py @@ -151,7 +151,6 @@ def compare_accuracy(self, lowac, highac, max_errs=(1e10, 1e10)): err = ((x - x2) ** 2).sum() errs.append(err) - print(errs) self.assertGreater(errs[0], errs[1]) self.assertGreater(max_errs[0], errs[0]) @@ -174,6 +173,9 @@ def test_SQ2(self): def test_SQ3(self): self.compare_accuracy('SQ8', 'SQfp16') + def test_SQ4(self): + self.compare_accuracy('SQ8', 'SQbf16') + def test_PQ(self): self.compare_accuracy('PQ6x8np', 'PQ8x8np') @@ -214,7 +216,6 @@ def test_repeats(self): code = repeats.encode(swig_ptr(vec)) vec2 = np.zeros(dim, dtype='float32') repeats.decode(code, swig_ptr(vec2)) - # print(vec2) assert np.all(vec == vec2) def test_ZnSphereCodec_encode_centroid(self): @@ -222,7 +223,6 @@ def test_ZnSphereCodec_encode_centroid(self): r2 = 5 ref_codec = faiss.ZnSphereCodec(dim, r2) codec = faiss.ZnSphereCodecRec(dim, r2) - # print(ref_codec.nv, codec.nv) assert ref_codec.nv == codec.nv s = set() for i in range(ref_codec.nv): @@ -237,7 +237,6 @@ def test_ZnSphereCodecRec(self): dim = 16 r2 = 6 codec = faiss.ZnSphereCodecRec(dim, r2) - # print("nv=", codec.nv) for i in range(codec.nv): c = np.zeros(dim, dtype='float32') codec.decode(i, swig_ptr(c)) @@ -300,15 +299,10 @@ def test_rw(self): for i in range(nbyte): self.assertTrue(((bignum >> (i * 8)) & 255) == bs[i]) - #for i in range(nbyte): - # print(bin(bs[i] + 256)[3:], end=' ') - # print() - br = faiss.BitstringReader(swig_ptr(bs), nbyte) for nbit, xref in ctrl: xnew = br.read(nbit) - # print('nbit %d xref %x xnew %x' % (nbit, xref, xnew)) self.assertTrue(xnew == xref) def test_arrays(self): diff --git a/tests/test_util.h b/tests/test_util.h new file mode 100644 index 0000000000..3be0e35cff --- /dev/null +++ b/tests/test_util.h @@ -0,0 +1,39 @@ +/** + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
diff --git a/tests/test_util.h b/tests/test_util.h
new file mode 100644
index 0000000000..3be0e35cff
--- /dev/null
+++ b/tests/test_util.h
@@ -0,0 +1,39 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#ifndef FAISS_TEST_UTIL_H
+#define FAISS_TEST_UTIL_H
+
+#include <pthread.h>
+#include <unistd.h>
+#include <cstdlib>
+#include <string>
+
+struct Tempfilename {
+    pthread_mutex_t* mutex;
+    std::string filename;
+
+    Tempfilename(pthread_mutex_t* mutex, std::string filename_template) {
+        this->mutex = mutex;
+        this->filename = filename_template;
+        pthread_mutex_lock(mutex);
+        // mkstemp fills in the XXXXXX placeholder of the member in place
+        int fd = mkstemp(&this->filename[0]);
+        close(fd);
+        pthread_mutex_unlock(mutex);
+    }
+
+    ~Tempfilename() {
+        // access() returns 0 when the file exists
+        if (access(filename.c_str(), F_OK) == 0) {
+            unlink(filename.c_str());
+        }
+    }
+
+    const char* c_str() {
+        return filename.c_str();
+    }
+};
+
+#endif // FAISS_TEST_UTIL_H
diff --git a/tutorial/cpp/1-Flat.cpp b/tutorial/cpp/1-Flat.cpp
index 819e419573..147fa89bc0 100644
--- a/tutorial/cpp/1-Flat.cpp
+++ b/tutorial/cpp/1-Flat.cpp
@@ -83,10 +83,10 @@ int main() {
         printf("\n");
     }
 
-    printf("I (5 last results)=\n");
+    printf("D (5 last results)=\n");
     for (int i = nq - 5; i < nq; i++) {
         for (int j = 0; j < k; j++)
-            printf("%5zd ", I[i * k + j]);
+            printf("%5f ", D[i * k + j]);
         printf("\n");
     }
 
diff --git a/tutorial/cpp/2-IVFFlat.cpp b/tutorial/cpp/2-IVFFlat.cpp
index febd5be049..86530ae985 100644
--- a/tutorial/cpp/2-IVFFlat.cpp
+++ b/tutorial/cpp/2-IVFFlat.cpp
@@ -61,13 +61,10 @@ int main() {
         printf("\n");
     }
 
-    index.nprobe = 10;
-    index.search(nq, xq, k, D, I);
-
-    printf("I=\n");
+    printf("D=\n");
     for (int i = nq - 5; i < nq; i++) {
         for (int j = 0; j < k; j++)
-            printf("%5zd ", I[i * k + j]);
+            printf("%5f ", D[i * k + j]);
         printf("\n");
     }
 
diff --git a/tutorial/cpp/6-HNSW.cpp b/tutorial/cpp/6-HNSW.cpp
new file mode 100644
index 0000000000..9bd8cd3faa
--- /dev/null
+++ b/tutorial/cpp/6-HNSW.cpp
@@ -0,0 +1,73 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <cassert>
+#include <cstdio>
+#include <cstdlib>
+#include <random>
+
+#include <faiss/IndexHNSW.h>
+
+using idx_t = faiss::idx_t;
+
+int main() {
+    int d = 64;      // dimension
+    int nb = 100000; // database size
+    int nq = 10000;  // nb of queries
+
+    std::mt19937 rng;
+    std::uniform_real_distribution<> distrib;
+
+    float* xb = new float[d * nb];
+    float* xq = new float[d * nq];
+
+    for (int i = 0; i < nb; i++) {
+        for (int j = 0; j < d; j++)
+            xb[d * i + j] = distrib(rng);
+        xb[d * i] += i / 1000.;
+    }
+
+    for (int i = 0; i < nq; i++) {
+        for (int j = 0; j < d; j++)
+            xq[d * i + j] = distrib(rng);
+        xq[d * i] += i / 1000.;
+    }
+
+    int k = 4;
+
+    faiss::IndexHNSWFlat index(d, 32);
+    index.add(nb, xb);
+
+    { // search xq
+        idx_t* I = new idx_t[k * nq];
+        float* D = new float[k * nq];
+
+        index.search(nq, xq, k, D, I);
+
+        printf("I=\n");
+        for (int i = nq - 5; i < nq; i++) {
+            for (int j = 0; j < k; j++)
+                printf("%5zd ", I[i * k + j]);
+            printf("\n");
+        }
+
+        printf("D=\n");
+        for (int i = nq - 5; i < nq; i++) {
+            for (int j = 0; j < k; j++)
+                printf("%5f ", D[i * k + j]);
+            printf("\n");
+        }
+
+        delete[] I;
+        delete[] D;
+    }
+
+    delete[] xb;
+    delete[] xq;
+
+    return 0;
+}
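The HNSW tutorial above builds the graph with connectivity 32 (the M parameter) and needs no training stage, since HNSWFlat stores the raw vectors. A rough Python counterpart (the efSearch setting is an illustrative addition, not part of the C++ tutorial; it is the usual query-time speed/recall knob):

    import faiss
    import numpy as np

    d, nb, nq = 64, 10000, 100
    rs = np.random.RandomState(1234)
    xb = rs.rand(nb, d).astype('float32')
    xq = rs.rand(nq, d).astype('float32')

    index = faiss.IndexHNSWFlat(d, 32)  # 32 = graph connectivity (M)
    index.add(xb)                       # no train() needed for HNSWFlat
    index.hnsw.efSearch = 64            # size of the search-time candidate queue
    D, I = index.search(xq, 4)
    print(I[:5])
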
diff --git a/tutorial/cpp/7-PQFastScan.cpp b/tutorial/cpp/7-PQFastScan.cpp
new file mode 100644
index 0000000000..4cdfea052e
--- /dev/null
+++ b/tutorial/cpp/7-PQFastScan.cpp
@@ -0,0 +1,75 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <cassert>
+#include <cstdio>
+#include <cstdlib>
+#include <random>
+
+#include <faiss/IndexPQFastScan.h>
+
+using idx_t = faiss::idx_t;
+
+int main() {
+    int d = 64;      // dimension
+    int nb = 100000; // database size
+    int nq = 10000;  // nb of queries
+
+    std::mt19937 rng;
+    std::uniform_real_distribution<> distrib;
+
+    float* xb = new float[d * nb];
+    float* xq = new float[d * nq];
+
+    for (int i = 0; i < nb; i++) {
+        for (int j = 0; j < d; j++) {
+            xb[d * i + j] = distrib(rng);
+        }
+        xb[d * i] += i / 1000.;
+    }
+
+    for (int i = 0; i < nq; i++) {
+        for (int j = 0; j < d; j++) {
+            xq[d * i + j] = distrib(rng);
+        }
+        xq[d * i] += i / 1000.;
+    }
+
+    int m = 8;     // number of sub-quantizers
+    int n_bit = 4; // bits per sub-vector code
+
+    faiss::IndexPQFastScan index(d, m, n_bit);
+    printf("Index is trained? %s\n", index.is_trained ? "true" : "false");
+    index.train(nb, xb);
+    printf("Index is trained? %s\n", index.is_trained ? "true" : "false");
+    index.add(nb, xb);
+
+    int k = 4;
+
+    { // search xq
+        idx_t* I = new idx_t[k * nq];
+        float* D = new float[k * nq];
+
+        index.search(nq, xq, k, D, I);
+
+        printf("I=\n");
+        for (int i = nq - 5; i < nq; i++) {
+            for (int j = 0; j < k; j++) {
+                printf("%5zd ", I[i * k + j]);
+            }
+            printf("\n");
+        }
+
+        delete[] I;
+        delete[] D;
+    }
+
+    delete[] xb;
+    delete[] xq;
+
+    return 0;
+}
diff --git a/tutorial/cpp/8-PQFastScanRefine.cpp b/tutorial/cpp/8-PQFastScanRefine.cpp
new file mode 100644
index 0000000000..2435d94d2c
--- /dev/null
+++ b/tutorial/cpp/8-PQFastScanRefine.cpp
@@ -0,0 +1,84 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <cassert>
+#include <cstdio>
+#include <cstdlib>
+#include <random>
+
+#include <faiss/IndexPQFastScan.h>
+#include <faiss/IndexRefine.h>
+
+using idx_t = faiss::idx_t;
+
+int main() {
+    int d = 64;      // dimension
+    int nb = 100000; // database size
+    int nq = 10000;  // nb of queries
+
+    std::mt19937 rng;
+    std::uniform_real_distribution<> distrib;
+
+    float* xb = new float[d * nb];
+    float* xq = new float[d * nq];
+
+    for (int i = 0; i < nb; i++) {
+        for (int j = 0; j < d; j++) {
+            xb[d * i + j] = distrib(rng);
+        }
+        xb[d * i] += i / 1000.;
+    }
+
+    for (int i = 0; i < nq; i++) {
+        for (int j = 0; j < d; j++) {
+            xq[d * i + j] = distrib(rng);
+        }
+        xq[d * i] += i / 1000.;
+    }
+
+    int m = 8;     // number of sub-quantizers
+    int n_bit = 4; // bits per sub-vector code
+
+    faiss::IndexPQFastScan index(d, m, n_bit);
+    faiss::IndexRefineFlat index_refine(&index);
+    // re-rank the PQFastScan candidates with exact distances
+
+    printf("Index is trained? %s\n",
+           index_refine.is_trained ? "true" : "false");
+    index_refine.train(nb, xb);
+    printf("Index is trained? %s\n",
+           index_refine.is_trained ? "true" : "false");
+    index_refine.add(nb, xb);
+
+    int k = 4;
+    { // search xq
+        idx_t* I = new idx_t[k * nq];
+        float* D = new float[k * nq];
+        float k_factor = 3;
+        faiss::IndexRefineSearchParameters* params =
+                new faiss::IndexRefineSearchParameters();
+        params->k_factor = k_factor;
+        index_refine.search(nq, xq, k, D, I, params);
+
+        printf("I=\n");
+        for (int i = nq - 5; i < nq; i++) {
+            for (int j = 0; j < k; j++) {
+                printf("%5zd ", I[i * k + j]);
+            }
+            printf("\n");
+        }
+
+        delete[] I;
+        delete[] D;
+        delete params;
+    }
+
+    delete[] xb;
+    delete[] xq;
+
+    return 0;
+}
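In both refine tutorials the k_factor parameter sets the re-ranking budget: with k = 4 and k_factor = 3, the underlying PQ fast-scan index returns 3 × 4 = 12 candidates per query, exact distances are computed on those 12 stored vectors, and the best 4 are kept. Larger k_factor values improve accuracy at the cost of more exact distance computations.
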
diff --git a/tutorial/cpp/9-RefineComparison.cpp b/tutorial/cpp/9-RefineComparison.cpp
new file mode 100644
index 0000000000..d7fbc90aec
--- /dev/null
+++ b/tutorial/cpp/9-RefineComparison.cpp
@@ -0,0 +1,104 @@
+/**
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <cassert>
+#include <cstdio>
+#include <cstdlib>
+#include <random>
+
+#include <faiss/IndexRefine.h>
+#include <faiss/MetricType.h>
+#include <faiss/index_factory.h>
+using idx_t = faiss::idx_t;
+
+int main() {
+    int d = 64;      // dimension
+    int nb = 100000; // database size
+    int nq = 10000;  // nb of queries
+
+    std::mt19937 rng;
+    std::uniform_real_distribution<> distrib;
+
+    float* xb = new float[d * nb];
+    float* xq = new float[d * nq];
+
+    for (int i = 0; i < nb; i++) {
+        for (int j = 0; j < d; j++) {
+            xb[d * i + j] = distrib(rng);
+        }
+        xb[d * i] += i / 1000.;
+    }
+
+    for (int i = 0; i < nq; i++) {
+        for (int j = 0; j < d; j++) {
+            xq[d * i + j] = distrib(rng);
+        }
+        xq[d * i] += i / 1000.;
+    }
+
+    // Construct a PQ fast-scan index with SQfp16 refinement via the factory
+    faiss::Index* index_fp16;
+    index_fp16 = faiss::index_factory(
+            d, "PQ32x4fs,Refine(SQfp16)", faiss::METRIC_L2);
+    index_fp16->train(nb, xb);
+    index_fp16->add(nb, xb);
+
+    // Same index, but with SQ8 refinement
+    faiss::Index* index_sq8;
+    index_sq8 =
+            faiss::index_factory(d, "PQ32x4fs,Refine(SQ8)", faiss::METRIC_L2);
+    index_sq8->train(nb, xb);
+    index_sq8->add(nb, xb);
+
+    int k = 10;
+    { // search xq
+        idx_t* I_fp16 = new idx_t[k * nq];
+        float* D_fp16 = new float[k * nq];
+        idx_t* I_sq8 = new idx_t[k * nq];
+        float* D_sq8 = new float[k * nq];
+
+        // Set the k_factor search-time parameter of the refinement stage
+        float k_factor = 3;
+        faiss::IndexRefineSearchParameters* params =
+                new faiss::IndexRefineSearchParameters();
+        params->k_factor = k_factor;
+
+        // Search with both refinement variants
+        index_fp16->search(nq, xq, k, D_fp16, I_fp16, params);
+        index_sq8->search(nq, xq, k, D_sq8, I_sq8, params);
+
+        printf("I_fp16=\n");
+        for (int i = nq - 5; i < nq; i++) {
+            for (int j = 0; j < k; j++) {
+                printf("%5zd ", I_fp16[i * k + j]);
+            }
+            printf("\n");
+        }
+
+        printf("I_sq8=\n");
+        for (int i = nq - 5; i < nq; i++) {
+            for (int j = 0; j < k; j++) {
+                printf("%5zd ", I_sq8[i * k + j]);
+            }
+            printf("\n");
+        }
+
+        delete[] I_fp16;
+        delete[] D_fp16;
+        delete[] I_sq8;
+        delete[] D_sq8;
+        delete params;
+
+        delete index_fp16;
+        delete index_sq8;
+    }
+
+    delete[] xb;
+    delete[] xq;
+
+    return 0;
+}
diff --git a/tutorial/cpp/CMakeLists.txt b/tutorial/cpp/CMakeLists.txt
index 7361b33a03..f964b3dda9 100644
--- a/tutorial/cpp/CMakeLists.txt
+++ b/tutorial/cpp/CMakeLists.txt
@@ -18,3 +18,15 @@ target_link_libraries(4-GPU PRIVATE faiss)
 
 add_executable(5-Multiple-GPUs EXCLUDE_FROM_ALL 5-Multiple-GPUs.cpp)
 target_link_libraries(5-Multiple-GPUs PRIVATE faiss)
+
+add_executable(6-HNSW EXCLUDE_FROM_ALL 6-HNSW.cpp)
+target_link_libraries(6-HNSW PRIVATE faiss)
+
+add_executable(7-PQFastScan EXCLUDE_FROM_ALL 7-PQFastScan.cpp)
+target_link_libraries(7-PQFastScan PRIVATE faiss)
+
+add_executable(8-PQFastScanRefine EXCLUDE_FROM_ALL 8-PQFastScanRefine.cpp)
+target_link_libraries(8-PQFastScanRefine PRIVATE faiss)
+
+add_executable(9-RefineComparison EXCLUDE_FROM_ALL 9-RefineComparison.cpp)
+target_link_libraries(9-RefineComparison PRIVATE faiss)
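Like the existing tutorial targets, the four new executables are declared EXCLUDE_FROM_ALL, so they are only compiled on request, e.g. with cmake --build build --target 7-PQFastScan.
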
diff --git a/tutorial/python/7-PQFastScan.py b/tutorial/python/7-PQFastScan.py
new file mode 100644
index 0000000000..34d7a34ac1
--- /dev/null
+++ b/tutorial/python/7-PQFastScan.py
@@ -0,0 +1,35 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import faiss
+import numpy as np
+
+d = 64                          # dimension
+nb = 100000                     # database size
+nq = 10000                      # nb of queries
+np.random.seed(1234)            # make reproducible
+xb = np.random.random((nb, d)).astype('float32')  # nb database vectors
+xb[:, 0] += np.arange(nb) / 1000.
+xq = np.random.random((nq, d)).astype('float32')  # nq query vectors
+xq[:, 0] += np.arange(nq) / 1000.
+
+m = 8      # number of sub-quantizers
+k = 4      # number of nearest neighbours to retrieve
+n_bit = 4  # each sub-vector is encoded over 4 bits
+bbs = 32   # build block size (bbs % 32 == 0) for the fast-scan layout
+index = faiss.IndexPQFastScan(d, m, n_bit, faiss.METRIC_L2, bbs)
+# construct FastScan index
+
+assert not index.is_trained
+index.train(xb)  # train the PQ codebooks on the database vectors
+assert index.is_trained
+
+index.add(xb)
+D, I = index.search(xb[:5], k)  # sanity check
+print(I)
+print(D)
+D, I = index.search(xq, k)  # search
+print(I[-5:])  # neighbors of the 5 last queries
diff --git a/tutorial/python/8-PQFastScanRefine.py b/tutorial/python/8-PQFastScanRefine.py
new file mode 100644
index 0000000000..115a036fa7
--- /dev/null
+++ b/tutorial/python/8-PQFastScanRefine.py
@@ -0,0 +1,38 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import faiss
+import numpy as np
+
+d = 64                          # dimension
+nb = 100000                     # database size
+nq = 10000                      # nb of queries
+np.random.seed(1234)            # make reproducible
+xb = np.random.random((nb, d)).astype('float32')  # nb database vectors
+xb[:, 0] += np.arange(nb) / 1000.
+xq = np.random.random((nq, d)).astype('float32')  # nq query vectors
+xq[:, 0] += np.arange(nq) / 1000.
+
+m = 8      # number of sub-quantizers
+k = 4      # number of nearest neighbours to retrieve
+n_bit = 4  # each sub-vector is encoded over 4 bits
+
+index = faiss.IndexPQFastScan(d, m, n_bit, faiss.METRIC_L2)
+index_refine = faiss.IndexRefineFlat(index)
+# wrap the FastScan index with exact re-ranking on the stored vectors
+
+assert not index_refine.is_trained
+index_refine.train(xb)  # train the PQ codebooks on the database vectors
+assert index_refine.is_trained
+
+index_refine.add(xb)
+params = faiss.IndexRefineSearchParameters(k_factor=3)
+D, I = index_refine.search(xq[:5], 10, params=params)
+print(I)
+print(D)
+D, I = index.search(xq[:5], k)  # search the unrefined index for comparison
+print(I)
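To quantify what the refinement step buys, a quick recall check can be appended to 8-PQFastScanRefine.py, reusing its variables (the 100-query sample and the recall metric are illustrative choices, not part of the tutorial):

    ref = faiss.IndexFlatL2(d)
    ref.add(xb)
    _, I_gt = ref.search(xq[:100], 1)     # exact nearest neighbor per query
    _, I_fs = index.search(xq[:100], 10)  # bare fast-scan index
    _, I_rf = index_refine.search(xq[:100], 10, params=params)

    # fraction of queries whose true nearest neighbor appears in the top 10
    recall_at_10 = lambda I: float((I == I_gt).sum()) / 100
    print(recall_at_10(I_fs), recall_at_10(I_rf))  # refined recall should be higher
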
diff --git a/tutorial/python/9-RefineComparison.py b/tutorial/python/9-RefineComparison.py
new file mode 100644
index 0000000000..6fa69f33d9
--- /dev/null
+++ b/tutorial/python/9-RefineComparison.py
@@ -0,0 +1,42 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import faiss
+
+from faiss.contrib.evaluation import knn_intersection_measure
+from faiss.contrib import datasets
+
+# 64-dim vectors, 50000 vectors in the training, 100000 in database,
+# 10000 in queries, dtype ('float32')
+ds = datasets.SyntheticDataset(64, 50000, 100000, 10000)
+d = 64  # dimension
+
+# Construct a PQ fast-scan index with SQfp16 refinement via the index factory
+index_fp16 = faiss.index_factory(d, 'PQ32x4fs,Refine(SQfp16)')
+index_fp16.train(ds.get_train())
+index_fp16.add(ds.get_database())
+
+# Same index, but with SQ8 refinement
+index_sq8 = faiss.index_factory(d, 'PQ32x4fs,Refine(SQ8)')
+index_sq8.train(ds.get_train())
+index_sq8.add(ds.get_database())
+
+# Set the k_factor search-time parameter of the refinement stage
+k_factor = 3.0
+params = faiss.IndexRefineSearchParameters(k_factor=k_factor)
+
+# Search with both refinement variants
+D_fp16, I_fp16 = index_fp16.search(ds.get_queries(), 100, params=params)
+D_sq8, I_sq8 = index_sq8.search(ds.get_queries(), 100, params=params)
+
+# Compare the kNN intersection measure against the ground truth:
+# the coarser SQ8 refinement should not beat SQfp16
+KIM_fp16 = knn_intersection_measure(I_fp16, ds.get_groundtruth())
+KIM_sq8 = knn_intersection_measure(I_sq8, ds.get_groundtruth())
+
+assert KIM_fp16 > KIM_sq8
+
+print(I_sq8[:5])
+print(I_fp16[:5])
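For reference, knn_intersection_measure scores how much the returned id lists overlap with the ground-truth lists. A hand-rolled approximation, ignoring the exact tie and rank handling of the contrib implementation:

    import numpy as np

    def knn_intersection_ratio(I, I_gt):
        # mean fraction of each result list that also appears in the
        # same-length prefix of the ground-truth list
        nq, k = I.shape
        return np.mean(
            [len(set(I[q]) & set(I_gt[q, :k])) / k for q in range(nq)]
        )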