Update for vamana (bulk and fresh) from kann-experiments.

Added support for cosine similarity; CMake files for build on Windows; added gperftools as a submodule, with patches to gperftools; changed CMake to use libtcmalloc from gperftools.
2022-04-12 00:19:41 -07:00 · 2022-04-12 00:19:41 -07:00 · bb5c124853
--- a/.gitmodules
+++ b/.gitmodules
@ -0,0 +1,3 @@
+[submodule "gperftools"]
+	path = gperftools
+	url = https://github.com/gperftools/gperftools.git
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -18,7 +18,7 @@ else()
 endif()

 project(diskann)
-include_directories(${PROJECT_SOURCE_DIR}/include ${PROJECT_SOURCE_DIR}/include/dll)
+include_directories(${PROJECT_SOURCE_DIR}/include ${PROJECT_SOURCE_DIR}/include/dll ${PROJECT_SOURCE_DIR}/gperftools/src)

 #OpenMP
 find_package(OpenMP)
--- a/1
+++ b/1
@ -0,0 +1 @@
+Subproject commit fe85bbdf4cb891a67a8e2109c1c22a33aa958c7e
--- a/include/aux_utils.h
+++ b/include/aux_utils.h
@ -27,12 +27,16 @@ typedef int FileHandle;

 #include "cached_io.h"
 #include "common_includes.h"
+#include "tsl/robin_set.h"
+
 #include "utils.h"
 #include "windows_customizations.h"
 #include "gperftools/malloc_extension.h"

 namespace diskann {
-  const size_t   TRAINING_SET_SIZE = 100000;
+  const size_t   MAX_PQ_TRAINING_SET_SIZE = 256000;
+  const size_t   MAX_SAMPLE_POINTS_FOR_WARMUP = 1000000;
+  const double   PQ_TRAINING_SET_FRACTION = 0.1;
  const double   SPACE_FOR_CACHED_NODES_IN_GB = 0.25;
  const double   THRESHOLD_FOR_CACHING_IN_GB = 1.0;
  const uint32_t NUM_NODES_TO_CACHE = 250000;
@ -42,10 +46,24 @@ namespace diskann {
  template<typename T>
  class PQFlashIndex;

+  DISKANN_DLLEXPORT double get_memory_budget(const std::string &mem_budget_str);
+  DISKANN_DLLEXPORT double get_memory_budget(double search_ram_budget_in_gb);
+  DISKANN_DLLEXPORT void   add_new_file_to_single_index(std::string index_file,
+                                                        std::string new_file);
+
+  DISKANN_DLLEXPORT size_t calculate_num_pq_chunks(double final_index_ram_limit,
+                                                   size_t points_num,
+                                                   uint32_t dim);
+
  DISKANN_DLLEXPORT double calculate_recall(
      unsigned num_queries, unsigned *gold_std, float *gs_dist, unsigned dim_gs,
      unsigned *our_results, unsigned dim_or, unsigned recall_at);

+  DISKANN_DLLEXPORT double calculate_recall(
+      unsigned num_queries, unsigned *gold_std, float *gs_dist, unsigned dim_gs,
+      unsigned *our_results, unsigned dim_or, unsigned recall_at,
+      const tsl::robin_set<unsigned> &active_tags);
+
  DISKANN_DLLEXPORT double calculate_range_search_recall(
      unsigned num_queries, std::vector<std::vector<_u32>> &groundtruth,
      std::vector<std::vector<_u32>> &our_results);
@ -74,6 +92,11 @@ namespace diskann {
                                     const std::string &output_vamana,
                                     const std::string &medoids_file);

+  template<typename T>
+  DISKANN_DLLEXPORT std::string preprocess_base_file(
+      const std::string &infile, const std::string &indexPrefix,
+      diskann::Metric &distMetric);
+
  template<typename T>
  DISKANN_DLLEXPORT int build_merged_vamana_index(
      std::string base_file, diskann::Metric _compareMetric, unsigned L,
--- a/include/cosine_similarity.h
+++ b/include/cosine_similarity.h
@ -2,17 +2,28 @@
 // Licensed under the MIT license.

 #pragma once
+
+#include <immintrin.h>
+#include <smmintrin.h>
+#include <tmmintrin.h>
 #include <cmath>
 #include <cstdint>
 #include <cstdlib>
 #include <vector>
+#include <limits>
+#include <algorithm>
+#include <stdexcept>
+
+#include "simd_utils.h"
+
+extern bool Avx2SupportedCPU;

 namespace diskann {
  template<typename T>
  inline float compute_l2_norm(const T* vector, uint64_t ndims) {
-    float norm = 0.0f;
+    float norm = std::numeric_limits<float>::epsilon();
    for (uint64_t i = 0; i < ndims; i++) {
-      norm += vector[i] * vector[i];
+      norm += (float) (vector[i] * vector[i]);
    }
    return std::sqrt(norm);
  }
@ -24,7 +35,7 @@ namespace diskann {
    float right_norm = compute_l2_norm<T>(right, ndims);
    float dot = 0.0f;
    for (uint64_t i = 0; i < ndims; i++) {
-      dot += left[i] * right[i];
+      dot += (float) (left[i] * right[i]);
    }
    float cos_sim = dot / (left_norm * right_norm);
    return cos_sim;
@ -37,10 +48,266 @@ namespace diskann {
    cos_dists.reserve(npts);

    for (size_t i = 0; i < npts; i++) {
-      const float* point = all_data + (size_t)(indices[i]) * (size_t)(ndims);
+      const float* point = all_data + (size_t) (indices[i]) * (size_t) (ndims);
      cos_dists.push_back(
          compute_cosine_similarity<float>(point, query, ndims));
    }
    return cos_dists;
  }
 }  // namespace diskann
+
+#ifdef _WINDOWS
+// SIMD implementation of Cosine similarity. Taken from hnsw library.
+
+/**
+ * Non-metric Space Library
+ *
+ * Authors: Bilegsaikhan Naidan (https://github.com/bileg), Leonid Boytsov
+ * (http://boytsov.info). With contributions from Lawrence Cayton
+ * (http://lcayton.com/) and others.
+ *
+ * For the complete list of contributors and further details see:
+ * https://github.com/searchivarius/NonMetricSpaceLib
+ *
+ * Copyright (c) 2014
+ *
+ * This code is released under the
+ * Apache License Version 2.0 http://www.apache.org/licenses/.
+ *
+ */
+
+namespace diskann {
+
+  using namespace std;
+
+#define PORTABLE_ALIGN16 __declspec(align(16))
+
+  static float NormScalarProductSIMD2(const int8_t* pVect1,
+                                      const int8_t* pVect2, uint32_t qty) {
+    if (Avx2SupportedCPU) {
+      __m256 cos, p1Len, p2Len;
+      cos = p1Len = p2Len = _mm256_setzero_ps();
+      while (qty >= 32) {
+        __m256i rx = _mm256_load_si256((__m256i*) pVect1),
+                ry = _mm256_load_si256((__m256i*) pVect2);
+        cos = _mm256_add_ps(cos, _mm256_mul_epi8(rx, ry));
+        p1Len = _mm256_add_ps(p1Len, _mm256_mul_epi8(rx, rx));
+        p2Len = _mm256_add_ps(p2Len, _mm256_mul_epi8(ry, ry));
+        pVect1 += 32;
+        pVect2 += 32;
+        qty -= 32;
+      }
+      while (qty > 0) {
+        __m128i rx = _mm_load_si128((__m128i*) pVect1),
+                ry = _mm_load_si128((__m128i*) pVect2);
+        cos = _mm256_add_ps(cos, _mm256_mul32_pi8(rx, ry));
+        p1Len = _mm256_add_ps(p1Len, _mm256_mul32_pi8(rx, rx));
+        p2Len = _mm256_add_ps(p2Len, _mm256_mul32_pi8(ry, ry));
+        pVect1 += 4;
+        pVect2 += 4;
+        qty -= 4;
+      }
+      cos = _mm256_hadd_ps(_mm256_hadd_ps(cos, cos), cos);
+      p1Len = _mm256_hadd_ps(_mm256_hadd_ps(p1Len, p1Len), p1Len);
+      p2Len = _mm256_hadd_ps(_mm256_hadd_ps(p2Len, p2Len), p2Len);
+      float denominator = max(numeric_limits<float>::min() * 2,
+                              sqrt(p1Len.m256_f32[0] + p1Len.m256_f32[4]) *
+                                  sqrt(p2Len.m256_f32[0] + p2Len.m256_f32[4]));
+      float cosine = (cos.m256_f32[0] + cos.m256_f32[4]) / denominator;
+
+      return max(float(-1), min(float(1), cosine));
+    }
+
+    __m128 cos, p1Len, p2Len;
+    cos = p1Len = p2Len = _mm_setzero_ps();
+    __m128i rx, ry;
+    while (qty >= 16) {
+      rx = _mm_load_si128((__m128i*) pVect1);
+      ry = _mm_load_si128((__m128i*) pVect2);
+      cos = _mm_add_ps(cos, _mm_mul_epi8(rx, ry));
+      p1Len = _mm_add_ps(p1Len, _mm_mul_epi8(rx, rx));
+      p2Len = _mm_add_ps(p2Len, _mm_mul_epi8(ry, ry));
+      pVect1 += 16;
+      pVect2 += 16;
+      qty -= 16;
+    }
+    while (qty > 0) {
+      rx = _mm_load_si128((__m128i*) pVect1);
+      ry = _mm_load_si128((__m128i*) pVect2);
+      cos = _mm_add_ps(cos, _mm_mul32_pi8(rx, ry));
+      p1Len = _mm_add_ps(p1Len, _mm_mul32_pi8(rx, rx));
+      p2Len = _mm_add_ps(p2Len, _mm_mul32_pi8(ry, ry));
+      pVect1 += 4;
+      pVect2 += 4;
+      qty -= 4;
+    }
+    cos = _mm_hadd_ps(_mm_hadd_ps(cos, cos), cos);
+    p1Len = _mm_hadd_ps(_mm_hadd_ps(p1Len, p1Len), p1Len);
+    p2Len = _mm_hadd_ps(_mm_hadd_ps(p2Len, p2Len), p2Len);
+    float norm1 = p1Len.m128_f32[0];
+    float norm2 = p2Len.m128_f32[0];
+
+    static const float eps = numeric_limits<float>::min() * 2;
+
+    if (norm1 < eps) { /*
+                        * This shouldn't normally happen for this space, but
+                        * if it does, we don't want to get NANs
+                        */
+      if (norm2 < eps) {
+        return 1;
+      }
+      return 0;
+    }
+    /*
+     * Sometimes due to rounding errors, we get values > 1 or < -1.
+     * This throws off other functions that use scalar product, e.g., acos
+     */
+    return max(float(-1),
+               min(float(1), cos.m128_f32[0] / sqrt(norm1) / sqrt(norm2)));
+  }
+
+  static float NormScalarProductSIMD(const float* pVect1, const float* pVect2,
+                                     uint32_t qty) {
+    // Didn't get significant performance gain compared with 128bit version.
+    static const float eps = numeric_limits<float>::min() * 2;
+
+    if (Avx2SupportedCPU) {
+      uint32_t qty8 = qty / 8;
+
+      const float* pEnd1 = pVect1 + 8 * qty8;
+      const float* pEnd2 = pVect1 + qty;
+
+      __m256 v1, v2;
+      __m256 sum_prod = _mm256_set_ps(0, 0, 0, 0, 0, 0, 0, 0);
+      __m256 sum_square1 = sum_prod;
+      __m256 sum_square2 = sum_prod;
+
+      while (pVect1 < pEnd1) {
+        v1 = _mm256_loadu_ps(pVect1);
+        pVect1 += 8;
+        v2 = _mm256_loadu_ps(pVect2);
+        pVect2 += 8;
+        sum_prod = _mm256_add_ps(sum_prod, _mm256_mul_ps(v1, v2));
+        sum_square1 = _mm256_add_ps(sum_square1, _mm256_mul_ps(v1, v1));
+        sum_square2 = _mm256_add_ps(sum_square2, _mm256_mul_ps(v2, v2));
+      }
+
+      float PORTABLE_ALIGN16 TmpResProd[8];
+      float PORTABLE_ALIGN16 TmpResSquare1[8];
+      float PORTABLE_ALIGN16 TmpResSquare2[8];
+
+      _mm256_store_ps(TmpResProd, sum_prod);
+      _mm256_store_ps(TmpResSquare1, sum_square1);
+      _mm256_store_ps(TmpResSquare2, sum_square2);
+
+      float sum = 0.0f;
+      float norm1 = 0.0f;
+      float norm2 = 0.0f;
+      for (uint32_t i = 0; i < 8; ++i) {
+        sum += TmpResProd[i];
+        norm1 += TmpResSquare1[i];
+        norm2 += TmpResSquare2[i];
+      }
+
+      while (pVect1 < pEnd2) {
+        sum += (*pVect1) * (*pVect2);
+        norm1 += (*pVect1) * (*pVect1);
+        norm2 += (*pVect2) * (*pVect2);
+
+        ++pVect1;
+        ++pVect2;
+      }
+
+      if (norm1 < eps) {
+        return norm2 < eps ? 1.0f : 0.0f;
+      }
+
+      return max(float(-1), min(float(1), sum / sqrt(norm1) / sqrt(norm2)));
+    }
+
+    __m128 v1, v2;
+    __m128 sum_prod = _mm_set1_ps(0);
+    __m128 sum_square1 = sum_prod;
+    __m128 sum_square2 = sum_prod;
+
+    while (qty >= 4) {
+      v1 = _mm_loadu_ps(pVect1);
+      pVect1 += 4;
+      v2 = _mm_loadu_ps(pVect2);
+      pVect2 += 4;
+      sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
+      sum_square1 = _mm_add_ps(sum_square1, _mm_mul_ps(v1, v1));
+      sum_square2 = _mm_add_ps(sum_square2, _mm_mul_ps(v2, v2));
+
+      qty -= 4;
+    }
+
+    float sum = sum_prod.m128_f32[0] + sum_prod.m128_f32[1] +
+                sum_prod.m128_f32[2] + sum_prod.m128_f32[3];
+    float norm1 = sum_square1.m128_f32[0] + sum_square1.m128_f32[1] +
+                  sum_square1.m128_f32[2] + sum_square1.m128_f32[3];
+    float norm2 = sum_square2.m128_f32[0] + sum_square2.m128_f32[1] +
+                  sum_square2.m128_f32[2] + sum_square2.m128_f32[3];
+
+    if (norm1 < eps) {
+      return norm2 < eps ? 1.0f : 0.0f;
+    }
+
+    return max(float(-1), min(float(1), sum / sqrt(norm1) / sqrt(norm2)));
+  }
+
+  static float NormScalarProductSIMD2(const float* pVect1, const float* pVect2,
+                                      uint32_t qty) {
+    return NormScalarProductSIMD(pVect1, pVect2, qty);
+  }
+
+  template<class T>
+  static float CosineSimilarity2(const T* p1, const T* p2, uint32_t qty) {
+    return std::max(0.0f, 1.0f - NormScalarProductSIMD2(p1, p2, qty));
+  }
+
+  // static template float CosineSimilarity2<__int8>(const __int8* pVect1,
+  //                                         const __int8* pVect2, size_t qty);
+
+  // static template float CosineSimilarity2<float>(const float* pVect1,
+  //                                        const float* pVect2, size_t qty);
+
+  template<class T>
+  static void CosineSimilarityNormalize(T* pVector, uint32_t qty) {
+    T sum = 0;
+    for (uint32_t i = 0; i < qty; ++i) {
+      sum += pVector[i] * pVector[i];
+    }
+    sum = 1 / sqrt(sum);
+    if (sum == 0) {
+      sum = numeric_limits<T>::min();
+    }
+    for (uint32_t i = 0; i < qty; ++i) {
+      pVector[i] *= sum;
+    }
+  }
+
+  // template static void CosineSimilarityNormalize<float>(float* pVector,
+  //                                                      size_t qty);
+  // template static void CosineSimilarityNormalize<double>(double* pVector,
+  //                                                       size_t  qty);
+
+  template<>
+  void CosineSimilarityNormalize(__int8* pVector, uint32_t qty) {
+    throw std::runtime_error(
+        "For int8 type vector, you can not use cosine distance!");
+  }
+
+  template<>
+  void CosineSimilarityNormalize(__int16* pVector, uint32_t qty) {
+    throw std::runtime_error(
+        "For int16 type vector, you can not use cosine distance!");
+  }
+
+  template<>
+  void CosineSimilarityNormalize(int* pVector, uint32_t qty) {
+    throw std::runtime_error(
+        "For int type vector, you can not use cosine distance!");
+  }
+}  // namespace diskann
+#endif
--- a/include/distance.h
+++ b/include/distance.h
@ -1,443 +1,65 @@
 #pragma once
-
-#include <utils.h>
-#ifdef _WINDOWS
-#include <immintrin.h>
-#include <smmintrin.h>
-#include <tmmintrin.h>
-#include <intrin.h>
-#else
-#include <immintrin.h>
-#endif
-
-#include <cosine_similarity.h>
-#include <iostream>
-
-namespace {
-  static inline __m128 _mm_mulhi_epi8(__m128i X) {
-    __m128i zero = _mm_setzero_si128();
-    __m128i sign_x = _mm_cmplt_epi8(X, zero);
-    __m128i xhi = _mm_unpackhi_epi8(X, sign_x);
-
-    return _mm_cvtepi32_ps(
-        _mm_add_epi32(_mm_setzero_si128(), _mm_madd_epi16(xhi, xhi)));
-  }
-
-  static inline __m128 _mm_mulhi_epi8_shift32(__m128i X) {
-    __m128i zero = _mm_setzero_si128();
-    X = _mm_srli_epi64(X, 32);
-    __m128i sign_x = _mm_cmplt_epi8(X, zero);
-    __m128i xhi = _mm_unpackhi_epi8(X, sign_x);
-
-    return _mm_cvtepi32_ps(
-        _mm_add_epi32(_mm_setzero_si128(), _mm_madd_epi16(xhi, xhi)));
-  }
-  static inline __m128 _mm_mul_epi8(__m128i X, __m128i Y) {
-    __m128i zero = _mm_setzero_si128();
-
-    __m128i sign_x = _mm_cmplt_epi8(X, zero);
-    __m128i sign_y = _mm_cmplt_epi8(Y, zero);
-
-    __m128i xlo = _mm_unpacklo_epi8(X, sign_x);
-    __m128i xhi = _mm_unpackhi_epi8(X, sign_x);
-    __m128i ylo = _mm_unpacklo_epi8(Y, sign_y);
-    __m128i yhi = _mm_unpackhi_epi8(Y, sign_y);
-
-    return _mm_cvtepi32_ps(
-        _mm_add_epi32(_mm_madd_epi16(xlo, ylo), _mm_madd_epi16(xhi, yhi)));
-  }
-  static inline __m128 _mm_mul_epi8(__m128i X) {
-    __m128i zero = _mm_setzero_si128();
-    __m128i sign_x = _mm_cmplt_epi8(X, zero);
-    __m128i xlo = _mm_unpacklo_epi8(X, sign_x);
-    __m128i xhi = _mm_unpackhi_epi8(X, sign_x);
-
-    return _mm_cvtepi32_ps(
-        _mm_add_epi32(_mm_madd_epi16(xlo, xlo), _mm_madd_epi16(xhi, xhi)));
-  }
-
-  static inline __m128 _mm_mul32_pi8(__m128i X, __m128i Y) {
-    __m128i xlo = _mm_cvtepi8_epi16(X), ylo = _mm_cvtepi8_epi16(Y);
-    return _mm_cvtepi32_ps(
-        _mm_unpacklo_epi32(_mm_madd_epi16(xlo, ylo), _mm_setzero_si128()));
-  }
-
-  static inline __m256 _mm256_mul_epi8(__m256i X, __m256i Y) {
-    __m256i zero = _mm256_setzero_si256();
-
-    __m256i sign_x = _mm256_cmpgt_epi8(zero, X);
-    __m256i sign_y = _mm256_cmpgt_epi8(zero, Y);
-
-    __m256i xlo = _mm256_unpacklo_epi8(X, sign_x);
-    __m256i xhi = _mm256_unpackhi_epi8(X, sign_x);
-    __m256i ylo = _mm256_unpacklo_epi8(Y, sign_y);
-    __m256i yhi = _mm256_unpackhi_epi8(Y, sign_y);
-
-    return _mm256_cvtepi32_ps(_mm256_add_epi32(_mm256_madd_epi16(xlo, ylo),
-                                               _mm256_madd_epi16(xhi, yhi)));
-  }
-
-  static inline __m256 _mm256_mul32_pi8(__m128i X, __m128i Y) {
-    __m256i xlo = _mm256_cvtepi8_epi16(X), ylo = _mm256_cvtepi8_epi16(Y);
-    return _mm256_blend_ps(_mm256_cvtepi32_ps(_mm256_madd_epi16(xlo, ylo)),
-                           _mm256_setzero_ps(), 252);
-  }
-
-  static inline float _mm256_reduce_add_ps(__m256 x) {
-    /* ( x3+x7, x2+x6, x1+x5, x0+x4 ) */
-    const __m128 x128 =
-        _mm_add_ps(_mm256_extractf128_ps(x, 1), _mm256_castps256_ps128(x));
-    /* ( -, -, x1+x3+x5+x7, x0+x2+x4+x6 ) */
-    const __m128 x64 = _mm_add_ps(x128, _mm_movehl_ps(x128, x128));
-    /* ( -, -, -, x0+x1+x2+x3+x4+x5+x6+x7 ) */
-    const __m128 x32 = _mm_add_ss(x64, _mm_shuffle_ps(x64, x64, 0x55));
-    /* Conversion to float is a no-op on x86-64 */
-    return _mm_cvtss_f32(x32);
-  }
-}  // namespace
+#include "windows_customizations.h"

 namespace diskann {
-  //  enum Metric { L2 = 0, INNER_PRODUCT = 1, FAST_L2 = 2, PQ = 3 };
+
  template<typename T>
  class Distance {
   public:
-    virtual float compare(const T *a, const T *b, unsigned length) const = 0;
+    virtual float compare(const T *a, const T *b, uint32_t length) const = 0;
    virtual ~Distance() {
    }
  };

-  template<typename T>
-  class DistanceCosine : public Distance<T> {
-    float compare(const T *a, const T *b, unsigned length) const {
-      return diskann::compute_cosine_similarity<T>(a, b, length);
-    }
+  class DistanceCosineInt8 : public Distance<int8_t> {
+   public:
+    DISKANN_DLLEXPORT virtual float compare(const int8_t *a, const int8_t *b,
+                                            uint32_t length) const;
+  };
+
+  class DistanceCosineFloat : public Distance<float> {
+   public:
+    DISKANN_DLLEXPORT virtual float compare(const float *a, const float *b,
+                                            uint32_t length) const;
+  };
+
+  class SlowDistanceCosineUInt8 : public Distance<uint8_t> {
+   public:
+    DISKANN_DLLEXPORT virtual float compare(const uint8_t *a, const uint8_t *b,
+                                            uint32_t length) const;
  };

  class DistanceL2Int8 : public Distance<int8_t> {
   public:
-    float compare(const int8_t *a, const int8_t *b, unsigned size) const {
-      int32_t result = 0;
-
-#ifdef _WINDOWS
-#ifdef USE_AVX2
-      __m256 r = _mm256_setzero_ps();
-      char * pX = (char *) a, *pY = (char *) b;
-      while (size >= 32) {
-        __m256i r1 = _mm256_subs_epi8(_mm256_loadu_si256((__m256i *) pX),
-                                      _mm256_loadu_si256((__m256i *) pY));
-        r = _mm256_add_ps(r, _mm256_mul_epi8(r1, r1));
-        pX += 32;
-        pY += 32;
-        size -= 32;
-      }
-      while (size > 0) {
-        __m128i r2 = _mm_subs_epi8(_mm_loadu_si128((__m128i *) pX),
-                                   _mm_loadu_si128((__m128i *) pY));
-        r = _mm256_add_ps(r, _mm256_mul32_pi8(r2, r2));
-        pX += 4;
-        pY += 4;
-        size -= 4;
-      }
-      r = _mm256_hadd_ps(_mm256_hadd_ps(r, r), r);
-      return r.m256_f32[0] + r.m256_f32[4];
-#else
-#pragma omp simd reduction(+ : result) aligned(a, b : 8)
-      for (_s32 i = 0; i < (_s32) size; i++) {
-        result += ((int32_t)((int16_t) a[i] - (int16_t) b[i])) *
-                  ((int32_t)((int16_t) a[i] - (int16_t) b[i]));
-      }
-      return (float) result;
-#endif
-#else
-#pragma omp simd reduction(+ : result) aligned(a, b : 8)
-      for (_s32 i = 0; i < (_s32) size; i++) {
-        result += ((int32_t)((int16_t) a[i] - (int16_t) b[i])) *
-                  ((int32_t)((int16_t) a[i] - (int16_t) b[i]));
-      }
-      return (float) result;
-#endif
-    }
+    DISKANN_DLLEXPORT virtual float compare(const int8_t *a, const int8_t *b,
+                                            uint32_t size) const;
  };

  class DistanceL2UInt8 : public Distance<uint8_t> {
   public:
-    float compare(const uint8_t *a, const uint8_t *b, unsigned size) const {
-      uint32_t result = 0;
-#ifndef _WINDOWS
-#pragma omp simd reduction(+ : result) aligned(a, b : 8)
-#endif
-      for (_s32 i = 0; i < (_s32) size; i++) {
-        result += ((int32_t)((int16_t) a[i] - (int16_t) b[i])) *
-                  ((int32_t)((int16_t) a[i] - (int16_t) b[i]));
-      }
-      return (float) result;
-    }
+    DISKANN_DLLEXPORT virtual float compare(const uint8_t *a, const uint8_t *b,
+                                            uint32_t size) const;
  };

-  class DistanceL2 : public Distance<float> {
+  class DistanceL2Float : public Distance<float> {
   public:
-#ifndef _WINDOWS
-    float compare(const float *a, const float *b, unsigned size) const
-        __attribute__((hot)) {
-      a = (const float *) __builtin_assume_aligned(a, 32);
-      b = (const float *) __builtin_assume_aligned(b, 32);
+#ifdef _WINDOWS
+    DISKANN_DLLEXPORT virtual float compare(const float *a, const float *b,
+                                            uint32_t size) const;
 #else
-    float compare(const float *a, const float *b, unsigned size) const {
-#endif
-
-      float result = 0;
-#ifdef USE_AVX2
-      // assume size is divisible by 8
-      _u16   niters = size / 8;
-      __m256 sum = _mm256_setzero_ps();
-      for (_u16 j = 0; j < niters; j++) {
-        // scope is a[8j:8j+7], b[8j:8j+7]
-        // load a_vec
-        if (j < (niters - 1)) {
-          _mm_prefetch((char *) (a + 8 * (j + 1)), _MM_HINT_T0);
-          _mm_prefetch((char *) (b + 8 * (j + 1)), _MM_HINT_T0);
-        }
-        __m256 a_vec = _mm256_load_ps(a + 8 * j);
-        // load b_vec
-        __m256 b_vec = _mm256_load_ps(b + 8 * j);
-        // a_vec - b_vec
-        __m256 tmp_vec = _mm256_sub_ps(a_vec, b_vec);
-        /*
-    // (a_vec - b_vec)**2
-        __m256 tmp_vec2 = _mm256_mul_ps(tmp_vec, tmp_vec);
-    // accumulate sum
-        sum = _mm256_add_ps(sum, tmp_vec2);
-    */
-        // sum = (tmp_vec**2) + sum
-        sum = _mm256_fmadd_ps(tmp_vec, tmp_vec, sum);
-      }
-
-      // horizontal add sum
-      result = _mm256_reduce_add_ps(sum);
-#else
-#ifndef _WINDOWS
-#pragma omp simd reduction(+ : result) aligned(a, b : 32)
-#endif
-      for (_s32 i = 0; i < (_s32) size; i++) {
-        result += (a[i] - b[i]) * (a[i] - b[i]);
-      }
-#endif
-      return result;
-    }
-  };
-
-  //  Slow implementations of the distance functions for machines without AVX2
-  template<typename T>
-  class SlowDistanceL2Int : public Distance<T> {
-    virtual float compare(const T *a, const T *b, unsigned length) const {
-      uint32_t result = 0;
-      for (_u32 i = 0; i < length; i++) {
-        result += ((int32_t)((int16_t) a[i] - (int16_t) b[i])) *
-                  ((int32_t)((int16_t) a[i] - (int16_t) b[i]));
-      }
-      return (float) result;
-    }
-  };
-
-  class SlowDistanceL2Float : public Distance<float> {
-    virtual float compare(const float *a, const float *b,
-                          unsigned length) const {
-      float result = 0.0f;
-      for (_u32 i = 0; i < length; i++) {
-        result += (a[i] - b[i]) * (a[i] - b[i]);
-      }
-      return result;
-    }
-  };
-
-  class AVXDistanceL2Int8 : public Distance<int8_t> {
-   public:
-    virtual float compare(const int8_t *a, const int8_t *b,
-                          unsigned int length) const {
-#ifndef _WINDOWS
-      int32_t result = 0;
-#pragma omp simd reduction(+ : result) aligned(a, b : 8)
-      for (_s32 i = 0; i < (_s32) length; i++) {
-        result += ((int32_t)((int16_t) a[i] - (int16_t) b[i])) *
-                  ((int32_t)((int16_t) a[i] - (int16_t) b[i]));
-      }
-      return (float) result;
-    }
-#else
-      __m128 r = _mm_setzero_ps();
-      __m128i r1;
-      while (length >= 16) {
-        r1 = _mm_subs_epi8(_mm_load_si128((__m128i *) a),
-                           _mm_load_si128((__m128i *) b));
-        r = _mm_add_ps(r, _mm_mul_epi8(r1));
-        a += 16;
-        b += 16;
-        length -= 16;
-      }
-      r = _mm_hadd_ps(_mm_hadd_ps(r, r), r);
-      float res = r.m128_f32[0];
-
-      if (length >= 8) {
-        __m128 r2 = _mm_setzero_ps();
-        __m128i r3 = _mm_subs_epi8(_mm_load_si128((__m128i *) (a - 8)),
-                                   _mm_load_si128((__m128i *) (b - 8)));
-        r2 = _mm_add_ps(r2, _mm_mulhi_epi8(r3));
-        a += 8;
-        b += 8;
-        length -= 8;
-        r2 = _mm_hadd_ps(_mm_hadd_ps(r2, r2), r2);
-        res += r2.m128_f32[0];
-      }
-
-      if (length >= 4) {
-        __m128 r2 = _mm_setzero_ps();
-        __m128i r3 = _mm_subs_epi8(_mm_load_si128((__m128i *) (a - 12)),
-                                   _mm_load_si128((__m128i *) (b - 12)));
-        r2 = _mm_add_ps(r2, _mm_mulhi_epi8_shift32(r3));
-        res += r2.m128_f32[0] + r2.m128_f32[1];
-      }
-
-      return res;
-    }
-#endif
-  };
-
-  class AVXDistanceL2Float : public Distance<float> {
-   public:
-    virtual float compare(const float *a, const float *b,
-                          unsigned int length) const {
-#ifndef _WINDOWS
-      float result = 0;
-#pragma omp simd reduction(+ : result) aligned(a, b : 8)
-      for (_s32 i = 0; i < (_s32) length; i++) {
-        result += (a[i] - b[i]) * (a[i] - b[i]);
-      }
-      return result;
-    }
-#else
-      __m128 diff, v1, v2;
-      __m128 sum = _mm_set1_ps(0);
-
-      while (length >= 4) {
-        v1 = _mm_loadu_ps(a);
-        a += 4;
-        v2 = _mm_loadu_ps(b);
-        b += 4;
-        diff = _mm_sub_ps(v1, v2);
-        sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));
-        length -= 4;
-      }
-
-      return sum.m128_f32[0] + sum.m128_f32[1] + sum.m128_f32[2] +
-             sum.m128_f32[3];
-    }
+    DISKANN_DLLEXPORT virtual float compare(const float *a, const float *b,
+                                            uint32_t size) const
+        __attribute__((hot));
 #endif
  };

  template<typename T>
  class DistanceInnerProduct : public Distance<T> {
   public:
-    float inner_product(const T *a, const T *b, unsigned size) const {
-      float result = 0;
-#ifdef __GNUC__
-#ifdef __AVX__
-#define AVX_DOT(addr1, addr2, dest, tmp1, tmp2) \
-  tmp1 = _mm256_loadu_ps(addr1);                \
-  tmp2 = _mm256_loadu_ps(addr2);                \
-  tmp1 = _mm256_mul_ps(tmp1, tmp2);             \
-  dest = _mm256_add_ps(dest, tmp1);
-
-      __m256       sum;
-      __m256       l0, l1;
-      __m256       r0, r1;
-      unsigned     D = (size + 7) & ~7U;
-      unsigned     DR = D % 16;
-      unsigned     DD = D - DR;
-      const float *l = (float *) a;
-      const float *r = (float *) b;
-      const float *e_l = l + DD;
-      const float *e_r = r + DD;
-      float unpack[8] __attribute__((aligned(32))) = {0, 0, 0, 0, 0, 0, 0, 0};
-
-      sum = _mm256_loadu_ps(unpack);
-      if (DR) {
-        AVX_DOT(e_l, e_r, sum, l0, r0);
-      }
-
-      for (unsigned i = 0; i < DD; i += 16, l += 16, r += 16) {
-        AVX_DOT(l, r, sum, l0, r0);
-        AVX_DOT(l + 8, r + 8, sum, l1, r1);
-      }
-      _mm256_storeu_ps(unpack, sum);
-      result = unpack[0] + unpack[1] + unpack[2] + unpack[3] + unpack[4] +
-               unpack[5] + unpack[6] + unpack[7];
-
-#else
-#ifdef __SSE2__
-#define SSE_DOT(addr1, addr2, dest, tmp1, tmp2) \
-  tmp1 = _mm128_loadu_ps(addr1);                \
-  tmp2 = _mm128_loadu_ps(addr2);                \
-  tmp1 = _mm128_mul_ps(tmp1, tmp2);             \
-  dest = _mm128_add_ps(dest, tmp1);
-      __m128       sum;
-      __m128       l0, l1, l2, l3;
-      __m128       r0, r1, r2, r3;
-      unsigned     D = (size + 3) & ~3U;
-      unsigned     DR = D % 16;
-      unsigned     DD = D - DR;
-      const float *l = a;
-      const float *r = b;
-      const float *e_l = l + DD;
-      const float *e_r = r + DD;
-      float        unpack[4] __attribute__((aligned(16))) = {0, 0, 0, 0};
-
-      sum = _mm_load_ps(unpack);
-      switch (DR) {
-        case 12:
-          SSE_DOT(e_l + 8, e_r + 8, sum, l2, r2);
-        case 8:
-          SSE_DOT(e_l + 4, e_r + 4, sum, l1, r1);
-        case 4:
-          SSE_DOT(e_l, e_r, sum, l0, r0);
-        default:
-          break;
-      }
-      for (unsigned i = 0; i < DD; i += 16, l += 16, r += 16) {
-        SSE_DOT(l, r, sum, l0, r0);
-        SSE_DOT(l + 4, r + 4, sum, l1, r1);
-        SSE_DOT(l + 8, r + 8, sum, l2, r2);
-        SSE_DOT(l + 12, r + 12, sum, l3, r3);
-      }
-      _mm_storeu_ps(unpack, sum);
-      result += unpack[0] + unpack[1] + unpack[2] + unpack[3];
-#else
-
-      float        dot0, dot1, dot2, dot3;
-      const float *last = a + size;
-      const float *unroll_group = last - 3;
-
-      /* Process 4 items with each loop for efficiency. */
-      while (a < unroll_group) {
-        dot0 = a[0] * b[0];
-        dot1 = a[1] * b[1];
-        dot2 = a[2] * b[2];
-        dot3 = a[3] * b[3];
-        result += dot0 + dot1 + dot2 + dot3;
-        a += 4;
-        b += 4;
-      }
-      /* Process last 0-3 pixels.  Not needed for standard vector lengths. */
-      while (a < last) {
-        result += *a++ * *b++;
-      }
-#endif
-#endif
-#endif
-      return result;
-    }
-    float compare(const T *a, const T *b, unsigned size)
-        const {  // since we use normally minimization objective for distance
-                 // comparisons, we are returning 1/x.
+    float inner_product(const T *a, const T *b, unsigned size) const;
+    float compare(const T *a, const T *b, unsigned size) const {
+      // since we use normally minimization objective for distance
+      // comparisons, we are returning 1/x.
      float result = inner_product(a, b, size);
      //      if (result < 0)
      //      return std::numeric_limits<float>::max();
@ -451,100 +73,65 @@ namespace diskann {
      : public DistanceInnerProduct<T> {  // currently defined only for float.
                                          // templated for future use.
   public:
-    float norm(const T *a, unsigned size) const {
-      float result = 0;
-#ifdef __GNUC__
-#ifdef __AVX__
-#define AVX_L2NORM(addr, dest, tmp) \
-  tmp = _mm256_loadu_ps(addr);      \
-  tmp = _mm256_mul_ps(tmp, tmp);    \
-  dest = _mm256_add_ps(dest, tmp);
-
-      __m256       sum;
-      __m256       l0, l1;
-      unsigned     D = (size + 7) & ~7U;
-      unsigned     DR = D % 16;
-      unsigned     DD = D - DR;
-      const float *l = (float *) a;
-      const float *e_l = l + DD;
-      float unpack[8] __attribute__((aligned(32))) = {0, 0, 0, 0, 0, 0, 0, 0};
-
-      sum = _mm256_loadu_ps(unpack);
-      if (DR) {
-        AVX_L2NORM(e_l, sum, l0);
-      }
-      for (unsigned i = 0; i < DD; i += 16, l += 16) {
-        AVX_L2NORM(l, sum, l0);
-        AVX_L2NORM(l + 8, sum, l1);
-      }
-      _mm256_storeu_ps(unpack, sum);
-      result = unpack[0] + unpack[1] + unpack[2] + unpack[3] + unpack[4] +
-               unpack[5] + unpack[6] + unpack[7];
-#else
-#ifdef __SSE2__
-#define SSE_L2NORM(addr, dest, tmp) \
-  tmp = _mm128_loadu_ps(addr);      \
-  tmp = _mm128_mul_ps(tmp, tmp);    \
-  dest = _mm128_add_ps(dest, tmp);
-
-      __m128       sum;
-      __m128       l0, l1, l2, l3;
-      unsigned     D = (size + 3) & ~3U;
-      unsigned     DR = D % 16;
-      unsigned     DD = D - DR;
-      const float *l = a;
-      const float *e_l = l + DD;
-      float        unpack[4] __attribute__((aligned(16))) = {0, 0, 0, 0};
-
-      sum = _mm_load_ps(unpack);
-      switch (DR) {
-        case 12:
-          SSE_L2NORM(e_l + 8, sum, l2);
-        case 8:
-          SSE_L2NORM(e_l + 4, sum, l1);
-        case 4:
-          SSE_L2NORM(e_l, sum, l0);
-        default:
-          break;
-      }
-      for (unsigned i = 0; i < DD; i += 16, l += 16) {
-        SSE_L2NORM(l, sum, l0);
-        SSE_L2NORM(l + 4, sum, l1);
-        SSE_L2NORM(l + 8, sum, l2);
-        SSE_L2NORM(l + 12, sum, l3);
-      }
-      _mm_storeu_ps(unpack, sum);
-      result += unpack[0] + unpack[1] + unpack[2] + unpack[3];
-#else
-      float        dot0, dot1, dot2, dot3;
-      const float *last = a + size;
-      const float *unroll_group = last - 3;
-
-      /* Process 4 items with each loop for efficiency. */
-      while (a < unroll_group) {
-        dot0 = a[0] * a[0];
-        dot1 = a[1] * a[1];
-        dot2 = a[2] * a[2];
-        dot3 = a[3] * a[3];
-        result += dot0 + dot1 + dot2 + dot3;
-        a += 4;
-      }
-      /* Process last 0-3 pixels.  Not needed for standard vector lengths. */
-      while (a < last) {
-        result += (*a) * (*a);
-        a++;
-      }
-#endif
-#endif
-#endif
-      return result;
-    }
-    using DistanceInnerProduct<T>::compare;
+    float norm(const T *a, unsigned size) const;
    float compare(const T *a, const T *b, float norm,
-                  unsigned size) const {  // not implement
-      float result = -2 * DistanceInnerProduct<T>::inner_product(a, b, size);
-      result += norm;
-      return result;
+                  unsigned size) const;
+  };
+
+  // Gopal. Slow implementations of the distance functions to get diskann to
+  // work in pre-AVX machines. Performance here is not a concern, so we are
+  // using the simplest possible implementation.
+  template<typename T>
+  class SlowDistanceL2Int : public Distance<T> {
+   public:
+    // Implementing here because this is a template function
+    DISKANN_DLLEXPORT virtual float compare(const T *a, const T *b,
+                                            uint32_t length) const {
+      uint32_t result = 0;
+      for (uint32_t i = 0; i < length; i++) {
+        result += ((int32_t)((int16_t) a[i] - (int16_t) b[i])) *
+                  ((int32_t)((int16_t) a[i] - (int16_t) b[i]));
+      }
+      return (float) result;
    }
  };
+
+  class SlowDistanceL2Float : public Distance<float> {
+   public:
+    DISKANN_DLLEXPORT virtual float compare(const float *a, const float *b,
+                                            uint32_t length) const;
+  };
+
+  // AVX implementations. Borrowed from HNSW code.
+  class AVXDistanceL2Int8 : public Distance<int8_t> {
+   public:
+    DISKANN_DLLEXPORT virtual float compare(const int8_t *a, const int8_t *b,
+                                            uint32_t length) const;
+  };
+
+  class AVXDistanceL2Float : public Distance<float> {
+   public:
+    DISKANN_DLLEXPORT virtual float compare(const float *a, const float *b,
+                                            uint32_t length) const;
+  };
+
+  class AVXDistanceInnerProductFloat : public Distance<float> {
+   public:
+    DISKANN_DLLEXPORT virtual float compare(const float *a, const float *b,
+                                            uint32_t length) const;
+  };
+
+  class AVXNormalizedCosineDistanceFloat : public Distance<float> {
+   private:
+    AVXDistanceInnerProductFloat _innerProduct;
+
+   public:
+    DISKANN_DLLEXPORT virtual float compare(const float *a, const float *b,
+                                            uint32_t length) const {
+      // Inner product compare() returns negative values to indicate distance,
+      // so for unit-norm inputs this is presumably 1 - cos(a, b), in [0, 2].
+      return 1.0f + _innerProduct.compare(a, b, length);
+    }
+  };
+
 }  // namespace diskann
--- a/include/index.h
+++ b/include/index.h
@ -3,81 +3,173 @@

 #pragma once

+#include <atomic>
 #include <cassert>
 #include <map>
+#include <shared_mutex>
 #include <sstream>
 #include <stack>
 #include <string>
 #include <unordered_map>
 #include "tsl/robin_set.h"
+#include "tsl/robin_map.h"

 #include "distance.h"
 #include "neighbor.h"
 #include "parameters.h"
 #include "utils.h"
+#include "concurrent_queue.h"
 #include "windows_customizations.h"

-#define SLACK_FACTOR 1.3
+#define GRAPH_SLACK_FACTOR 1.3
+#define OVERHEAD_FACTOR 1.1

-#define ESTIMATE_RAM_USAGE(size, dim, datasize, degree) \
-  (1.30 * (((double) size * dim) * datasize +           \
-           ((double) size * degree) * sizeof(unsigned) * SLACK_FACTOR))
+namespace boost {
+#ifndef BOOST_DYNAMIC_BITSET_FWD_HPP
+  template<typename Block = unsigned long,
+           typename Allocator = std::allocator<Block>>
+  class dynamic_bitset;
+#endif
+}  // namespace boost

 namespace diskann {
-  template<typename T, typename TagT = int>
+  inline double estimate_ram_usage(_u64 size, _u32 dim, _u32 datasize,
+                                   _u32 degree) {
+    double size_of_data = ((double) size) * ROUND_UP(dim, 8) * datasize;
+    double size_of_graph =
+        ((double) size) * degree * sizeof(unsigned) * GRAPH_SLACK_FACTOR;
+    double size_of_locks = ((double) size) * sizeof(std::mutex);
+    double size_of_outer_vector = ((double) size) * sizeof(ptrdiff_t);
+
+    return OVERHEAD_FACTOR * (size_of_data + size_of_graph + size_of_locks +
+                              size_of_outer_vector);
+  }
+
+  template<typename T>
+  struct InMemQueryScratch {
+    std::vector<Neighbor> *   _pool = nullptr;
+    tsl::robin_set<unsigned> *_visited = nullptr;
+    std::vector<unsigned> *   _des = nullptr;
+    std::vector<Neighbor> *   _best_l_nodes = nullptr;
+    tsl::robin_set<unsigned> *_inserted_into_pool_rs = nullptr;
+    boost::dynamic_bitset<> * _inserted_into_pool_bs = nullptr;
+
+    T *       aligned_query = nullptr;
+    uint32_t *indices = nullptr;
+    float *   interim_dists = nullptr;
+
+    uint32_t search_l;
+    uint32_t indexing_l;
+    uint32_t r;
+
+    InMemQueryScratch();
+    void setup(uint32_t search_l, uint32_t indexing_l, uint32_t r, size_t dim);
+    void clear();
+    void resize_for_query(uint32_t new_search_l);
+    void destroy();
+
+    std::vector<Neighbor> &pool() {
+      return *_pool;
+    }
+    std::vector<unsigned> &des() {
+      return *_des;
+    }
+    tsl::robin_set<unsigned> &visited() {
+      return *_visited;
+    }
+    std::vector<Neighbor> &best_l_nodes() {
+      return *_best_l_nodes;
+    }
+    tsl::robin_set<unsigned> &inserted_into_pool_rs() {
+      return *_inserted_into_pool_rs;
+    }
+    boost::dynamic_bitset<> &inserted_into_pool_bs() {
+      return *_inserted_into_pool_bs;
+    }
+  };
+
+  template<typename T, typename TagT = uint32_t>
  class Index {
   public:
-    DISKANN_DLLEXPORT Index(Metric m, const char *filename,
-                            const size_t max_points = 0, const size_t nd = 0,
-                            const size_t num_frozen_pts = 0,
-                            const bool   enable_tags = false,
-                            const bool   store_data = true,
-                            const bool   support_eager_delete = false);
+    // Constructor for Bulk operations and for creating the index object solely
+    // for loading a preexisting index.
+    DISKANN_DLLEXPORT Index(Metric m, const size_t dim, const size_t max_points,
+                            const bool dynamic_index,
+                            const bool save_index_in_one_file,
+                            const bool enable_tags = false,
+                            const bool support_eager_delete = false);
+
+    // Constructor for incremental index
+    DISKANN_DLLEXPORT Index(Metric m, const size_t dim, const size_t max_points,
+                            const bool        dynamic_index,
+                            const bool        save_index_in_one_file,
+                            const Parameters &indexParameters,
+                            const Parameters &searchParameters,
+                            const bool        enable_tags = false,
+                            const bool        support_eager_delete = false);
+
    DISKANN_DLLEXPORT ~Index();

+    // Public Functions for Static Support
+
    // checks if data is consolidated, saves graph, metadata and associated
    // tags.
    DISKANN_DLLEXPORT void save(const char *filename);
-    DISKANN_DLLEXPORT void load(const char *filename,
-                                const bool  load_tags = false,
-                                const char *tag_filename = NULL);
-    // generates one or more frozen points that will never get deleted from the
-    // graph
-    DISKANN_DLLEXPORT int generate_random_frozen_points(
-        const char *filename = NULL);
+    DISKANN_DLLEXPORT _u64 save_graph(std::string filename, size_t offset = 0);
+    DISKANN_DLLEXPORT _u64 save_data(std::string filename, size_t offset = 0);
+    DISKANN_DLLEXPORT _u64 save_tags(std::string filename, size_t offset = 0);
+    DISKANN_DLLEXPORT _u64 save_delete_list(const std::string &filename,
+                                            size_t             offset = 0);
+
+    DISKANN_DLLEXPORT void load(const char *index_file, uint32_t num_threads,
+                                uint32_t search_l);
+
+    DISKANN_DLLEXPORT size_t load_graph(const std::string filename,
+                                        size_t            expected_num_points,
+                                        size_t            offset = 0);
+
+    DISKANN_DLLEXPORT size_t load_data(std::string filename, size_t offset = 0);
+
+    DISKANN_DLLEXPORT size_t load_tags(const std::string tag_file_name,
+                                       size_t            offset = 0);
+    DISKANN_DLLEXPORT size_t load_delete_set(const std::string &filename,
+                                             size_t             offset = 0);
+
+    DISKANN_DLLEXPORT size_t get_num_points();
+
+    DISKANN_DLLEXPORT size_t return_max_points();

    DISKANN_DLLEXPORT void build(
+        const char *filename, const size_t num_points_to_load,
        Parameters &             parameters,
        const std::vector<TagT> &tags = std::vector<TagT>());

+    DISKANN_DLLEXPORT void build(const char * filename,
+                                 const size_t num_points_to_load,
+                                 Parameters & parameters,
+                                 const char * tag_filename);
+
    // Gopal. Added search overload that takes L as parameter, so that we
    // can customize L on a per-query basis without tampering with "Parameters"
-    DISKANN_DLLEXPORT std::pair<uint32_t, uint32_t> search(const T *      query,
-                                                           const size_t   K,
-                                                           const unsigned L,
-                                                           unsigned *indices);
-
+    template<typename IDType>
    DISKANN_DLLEXPORT std::pair<uint32_t, uint32_t> search(
-        const T *query, const uint64_t K, const unsigned L,
-        std::vector<unsigned> init_ids, uint64_t *indices, float *distances);
+        const T *query, const size_t K, const unsigned L, IDType *indices,
+        float *distances = nullptr);

-    DISKANN_DLLEXPORT std::pair<uint32_t, uint32_t> search_with_tags(
-        const T *query, const size_t K, const unsigned L, TagT *tags,
-        unsigned *indices_buffer = NULL);
+    DISKANN_DLLEXPORT size_t search_with_tags(const T *query, const uint64_t K,
+                                              const unsigned L, TagT *tags,
+                                              float *           distances,
+                                              std::vector<T *> &res_vectors);

-    // repositions frozen points to the end of _data - if they have been moved
-    // during deletion
-    DISKANN_DLLEXPORT void readjust_data(unsigned _num_frozen_pts);
+    DISKANN_DLLEXPORT void clear_index();

-    /* insertions possible only when id corresponding to tag does not already
-     * exist in the graph */
-    DISKANN_DLLEXPORT int insert_point(const T *                    point,
-                                       const Parameters &           parameter,
-                                       std::vector<Neighbor> &      pool,
-                                       std::vector<Neighbor> &      tmp,
-                                       tsl::robin_set<unsigned> &   visited,
-                                       std::vector<SimpleNeighbor> &cut_graph,
-                                       const TagT                   tag);
+    // Public Functions for Incremental Support
+
+    // insertions possible only when id corresponding to tag does not already
+    // exist in the graph 
+    DISKANN_DLLEXPORT int insert_point(
+        const T *point, const Parameters &parameter,
+        const TagT tag);  // only keep point, tag, parameters

    // call before triggering deleteions - sets important flags required for
    // deletion related operations
@ -91,12 +183,64 @@ namespace diskann {
    // Record deleted point now and restructure graph later. Return -1 if tag
    // not found, 0 if OK. Do not call if _eager_delete was called earlier and
    // data was not consolidated
-    DISKANN_DLLEXPORT int delete_point(const TagT tag);
+    DISKANN_DLLEXPORT int lazy_delete(const TagT &tag);
+
+    // Record deleted points now and restructure graph later. Add to failed_tags
+    // if tag not found. Do not call if _eager_delete was called earlier and
+    // data was not consolidated. TODO: finish documenting the -1 return.
+    DISKANN_DLLEXPORT int lazy_delete(const tsl::robin_set<TagT> &tags,
+                                      std::vector<TagT> &         failed_tags);

    // Delete point from graph and restructure it immediately. Do not call if
    // _lazy_delete was called earlier and data was not consolidated
    DISKANN_DLLEXPORT int eager_delete(const TagT        tag,
-                                       const Parameters &parameters);
+                                       const Parameters &parameters,
+                                       int               delete_mode = 1);
+    // return _data and tag_to_location offset
+    DISKANN_DLLEXPORT int extract_data(
+        T *ret_data, std::unordered_map<TagT, unsigned> &tag_to_location);
+
+    DISKANN_DLLEXPORT void get_location_to_tag(
+        std::unordered_map<unsigned, TagT> &ret_loc_to_tag);
+
+    DISKANN_DLLEXPORT void prune_all_nbrs(const Parameters &parameters);
+
+    DISKANN_DLLEXPORT void compact_data_for_insert();
+
+    DISKANN_DLLEXPORT bool                    hasIndexBeenSaved();
+    const std::vector<std::vector<unsigned>> *get_graph() const {
+      return &this->_final_graph;
+    }
+    T *                                       get_data();
+    const std::unordered_map<unsigned, TagT> *get_tags() const {
+      return &this->_location_to_tag;
+    };
+    // repositions frozen points to the end of _data - if they have been moved
+    // during deletion
+    DISKANN_DLLEXPORT void reposition_frozen_point_to_end();
+    DISKANN_DLLEXPORT void reposition_point(unsigned old_location,
+                                            unsigned new_location);
+
+    DISKANN_DLLEXPORT void compact_frozen_point();
+    DISKANN_DLLEXPORT void compact_data_for_search();
+
+    DISKANN_DLLEXPORT void consolidate(Parameters &parameters);
+
+    // DISKANN_DLLEXPORT void save_index_as_one_file(bool flag);
+
+    DISKANN_DLLEXPORT void get_active_tags(tsl::robin_set<TagT> &active_tags);
+
+    DISKANN_DLLEXPORT int   get_vector_by_tag(TagT &tag, T *vec);
+    DISKANN_DLLEXPORT const T *get_vector_by_tag(const TagT &tag);
+
+    DISKANN_DLLEXPORT void print_status() const;
+
+    // This variable MUST be updated if the number of entries in the metadata
+    // changes.
+    DISKANN_DLLEXPORT static const int METADATA_ROWS = 5;
+
+    DISKANN_DLLEXPORT static bool get_npts_and_dim_from_index(
+        const char *file_name, size_t &npts, size_t &dim);

    DISKANN_DLLEXPORT void optimize_graph();

@ -105,24 +249,49 @@ namespace diskann {

    /*  Internals of the library */
   protected:
-    typedef std::vector<SimpleNeighbor>        vecNgh;
-    typedef std::vector<std::vector<unsigned>> CompactGraph;
-    CompactGraph                               _final_graph;
-    CompactGraph                               _in_graph;
+    // No copy/assign.
+    Index(const Index<T, TagT> &) = delete;
+    Index<T, TagT> &operator=(const Index<T, TagT> &) = delete;
+
+    std::vector<std::vector<unsigned>> _final_graph;
+    std::vector<std::vector<unsigned>> _in_graph;
+
+    // generates one frozen point that will never get deleted from the
+    // graph
+    int generate_frozen_point();

    // determines navigating node of the graph by calculating medoid of data
    unsigned calculate_entry_point();
    // called only when _eager_delete is to be supported
    void update_in_graph();

+    template<typename IDType>
+    std::pair<uint32_t, uint32_t> search_impl(const T *query, const size_t K,
+                                              const unsigned L, IDType *indices,
+                                              float *               distances,
+                                              InMemQueryScratch<T> &scratch);
+
    std::pair<uint32_t, uint32_t> iterate_to_fixed_point(
        const T *node_coords, const unsigned Lindex,
        const std::vector<unsigned> &init_ids,
        std::vector<Neighbor> &      expanded_nodes_info,
        tsl::robin_set<unsigned> &   expanded_nodes_ids,
-        std::vector<Neighbor> &      best_L_nodes);
-
+        std::vector<Neighbor> &best_L_nodes, std::vector<unsigned> &des,
+        tsl::robin_set<unsigned> &inserted_into_pool_rs,
+        boost::dynamic_bitset<> &inserted_into_pool_bs, bool ret_frozen = true,
+        bool search_invocation = false);
    void get_expanded_nodes(const size_t node, const unsigned Lindex,
+                            std::vector<unsigned>     init_ids,
+                            std::vector<Neighbor> &   expanded_nodes_info,
+                            tsl::robin_set<unsigned> &expanded_nodes_ids,
+                            std::vector<unsigned> &   des,
+                            std::vector<Neighbor> &   best_L_nodes,
+                            tsl::robin_set<unsigned> &inserted_into_pool_rs,
+                            boost::dynamic_bitset<> & inserted_into_pool_bs);
+
+    // get_expanded_nodes for insertion. Must investigate to see if perf can
+    // be improved here as well using the same technique as above.
+    void get_expanded_nodes(const size_t node_id, const unsigned Lindex,
                            std::vector<unsigned>     init_ids,
                            std::vector<Neighbor> &   expanded_nodes_info,
                            tsl::robin_set<unsigned> &expanded_nodes_ids);
@ -151,60 +320,93 @@ namespace diskann {
    void link(Parameters &parameters);

    // WARNING: Do not call reserve_location() without acquiring change_lock_
-    unsigned reserve_location();
+    int  reserve_location();
+    void release_location();

-    // get new location corresponding to each undeleted tag after deletions
-    std::vector<unsigned> get_new_location(unsigned &active);
+    // Support for resizing the index
+    // This function must be called ONLY after taking the _change_lock and
+    // _update_lock. Anything else in a MT environment will lead to an
+    // inconsistent index.
+    void resize(size_t new_max_points);
+
+    /*    // get new location corresponding to each undeleted tag after
+       deletions std::vector<unsigned> get_new_location(unsigned &active);*/

    // renumber nodes, update tag and location maps and compact the graph, mode
    // = _consolidated_order in case of lazy deletion and _compacted_order in
    // case of eager deletion
-    void compact_data(std::vector<unsigned> new_location, unsigned active,
-                      bool &mode);
+    void compact_data();

    // WARNING: Do not call consolidate_deletes without acquiring change_lock_
    // Returns number of live points left after consolidation
    size_t consolidate_deletes(const Parameters &parameters);

+    void initialize_query_scratch(uint32_t num_threads, uint32_t search_l,
+                                  uint32_t indexing_l, uint32_t r, size_t dim);
+
   private:
-    Metric       _metric = diskann::L2;
-    size_t       _dim;
-    size_t       _aligned_dim;
-    T *          _data;
-    size_t       _nd;  // number of active points i.e. existing in the graph
-    size_t       _max_points;  // total number of points in given data set
-    size_t       _num_frozen_pts;
-    bool         _has_built;
-    Distance<T> *_distance;
-    unsigned     _width;
-    unsigned     _ep;
+    Metric       _dist_metric = diskann::L2;
+    size_t       _dim = 0;
+    size_t       _aligned_dim = 0;
+    T *          _data = nullptr;
+    size_t       _nd = 0;  // number of active points i.e. existing in the graph
+    size_t       _max_points = 0;  // total number of points in given data set
+    size_t       _num_frozen_pts = 0;
+    bool         _has_built = false;
+    Distance<T> *_distance = nullptr;
+    unsigned     _width = 0;
+    unsigned     _ep = 0;
+    size_t       _max_range_of_loaded_graph = 0;
    bool         _saturate_graph = false;
-    std::vector<std::mutex> _locks;  // Per node lock, cardinality=max_points_
+    bool         _save_as_one_file = false;
+    bool         _dynamic_index = false;
+    bool         _enable_tags = false;
+    // Using normalized L2 for cosine.
+    bool _normalize_vecs = false;

-    char * _opt_graph;
-    size_t _node_size;
-    size_t _data_len;
-    size_t _neighbor_len;
+    // Indexing parameters
+    uint32_t _indexingQueueSize, _indexingRange, _indexingMaxC;
+    float    _indexingAlpha;
+    uint32_t _search_queue_size;

-    bool _can_delete;
-    bool _eager_done;       // true if eager deletions have been made
-    bool _lazy_done;        // true if lazy deletions have been made
-    bool _compacted_order;  // true if after eager deletions, data has been
-                            // consolidated
-    bool _enable_tags;
-    bool _consolidated_order;    // true if after lazy deletions, data has been
-                                 // consolidated
-    bool _support_eager_delete;  //_support_eager_delete = activates extra data
-                                 // structures and functions required for eager
-    // deletion
-    bool _store_data;
+    // Query scratch data structures
+    ConcurrentQueue<InMemQueryScratch<T>> _query_scratch;

+    // flags for dynamic indexing
    std::unordered_map<TagT, unsigned> _tag_to_location;
    std::unordered_map<unsigned, TagT> _location_to_tag;

    tsl::robin_set<unsigned> _delete_set;
    tsl::robin_set<unsigned> _empty_slots;

-    std::mutex _change_lock;  // Allow only 1 thread to insert/delete
+    bool _support_eager_delete =
+        false;  // activates extra data structures for eager deletion
+    // bool _can_delete = false;  // only true if deletes can be done (if
+    // enabled)
+    bool _eager_done = false;     // true if eager deletions have been made
+    bool _lazy_done = false;      // true if lazy deletions have been made
+    bool _data_compacted = true;  // true if data has been consolidated
+    bool _is_saved = false;  // Gopal. Checking if the index is already saved.
+
+    std::vector<std::mutex> _locks;  // Per node lock, cardinality=max_points_
+    std::shared_timed_mutex _tag_lock;  // reader-writer lock on
+                                        // _tag_to_location and _location_to_tag
+    std::mutex _change_lock;  // Lock taken to synchronously modify _nd
+    std::vector<std::mutex> _locks_in;     // Per node lock
+    std::shared_timed_mutex _delete_lock;  // Lock on _delete_set and
+                                           // _empty_slots when reading and
+                                           // writing to them
+
+    // Continuation of the _tag_lock comment above: _tag_lock also guards
+    // _location_to_tag and can be held in shared or exclusive mode
+    // (std::shared_timed_mutex).
+    std::shared_timed_mutex _update_lock;  // coordinate save() and any change
+                                           // being done to the graph.
+    static const float INDEX_GROWTH_FACTOR;
+
+    char * _opt_graph;
+    size_t _node_size;
+    size_t _data_len;
+    size_t _neighbor_len;
  };
 }  // namespace diskann
--- a/include/simd_utils.h
+++ b/include/simd_utils.h
@ -0,0 +1,105 @@
+#pragma once
+
+#ifdef _WINDOWS
+#include <immintrin.h>
+#include <smmintrin.h>
+#include <tmmintrin.h>
+#include <intrin.h>
+#else
+#include <immintrin.h>
+#endif
+
+namespace diskann {
+  static inline __m256 _mm256_mul_epi8(__m256i X) {
+    __m256i zero = _mm256_setzero_si256();
+
+    __m256i sign_x = _mm256_cmpgt_epi8(zero, X);
+
+    __m256i xlo = _mm256_unpacklo_epi8(X, sign_x);
+    __m256i xhi = _mm256_unpackhi_epi8(X, sign_x);
+
+    return _mm256_cvtepi32_ps(_mm256_add_epi32(_mm256_madd_epi16(xlo, xlo),
+                                               _mm256_madd_epi16(xhi, xhi)));
+  }
+
+  static inline __m128 _mm_mulhi_epi8(__m128i X) {
+    __m128i zero = _mm_setzero_si128();
+    __m128i sign_x = _mm_cmplt_epi8(X, zero);
+    __m128i xhi = _mm_unpackhi_epi8(X, sign_x);
+
+    return _mm_cvtepi32_ps(
+        _mm_add_epi32(_mm_setzero_si128(), _mm_madd_epi16(xhi, xhi)));
+  }
+
+  static inline __m128 _mm_mulhi_epi8_shift32(__m128i X) {
+    __m128i zero = _mm_setzero_si128();
+    X = _mm_srli_epi64(X, 32);
+    __m128i sign_x = _mm_cmplt_epi8(X, zero);
+    __m128i xhi = _mm_unpackhi_epi8(X, sign_x);
+
+    return _mm_cvtepi32_ps(
+        _mm_add_epi32(_mm_setzero_si128(), _mm_madd_epi16(xhi, xhi)));
+  }
+  static inline __m128 _mm_mul_epi8(__m128i X, __m128i Y) {
+    __m128i zero = _mm_setzero_si128();
+
+    __m128i sign_x = _mm_cmplt_epi8(X, zero);
+    __m128i sign_y = _mm_cmplt_epi8(Y, zero);
+
+    __m128i xlo = _mm_unpacklo_epi8(X, sign_x);
+    __m128i xhi = _mm_unpackhi_epi8(X, sign_x);
+    __m128i ylo = _mm_unpacklo_epi8(Y, sign_y);
+    __m128i yhi = _mm_unpackhi_epi8(Y, sign_y);
+
+    return _mm_cvtepi32_ps(
+        _mm_add_epi32(_mm_madd_epi16(xlo, ylo), _mm_madd_epi16(xhi, yhi)));
+  }
+  static inline __m128 _mm_mul_epi8(__m128i X) {
+    __m128i zero = _mm_setzero_si128();
+    __m128i sign_x = _mm_cmplt_epi8(X, zero);
+    __m128i xlo = _mm_unpacklo_epi8(X, sign_x);
+    __m128i xhi = _mm_unpackhi_epi8(X, sign_x);
+
+    return _mm_cvtepi32_ps(
+        _mm_add_epi32(_mm_madd_epi16(xlo, xlo), _mm_madd_epi16(xhi, xhi)));
+  }
+
+  static inline __m128 _mm_mul32_pi8(__m128i X, __m128i Y) {
+    __m128i xlo = _mm_cvtepi8_epi16(X), ylo = _mm_cvtepi8_epi16(Y);
+    return _mm_cvtepi32_ps(
+        _mm_unpacklo_epi32(_mm_madd_epi16(xlo, ylo), _mm_setzero_si128()));
+  }
+
+  static inline __m256 _mm256_mul_epi8(__m256i X, __m256i Y) {
+    __m256i zero = _mm256_setzero_si256();
+
+    __m256i sign_x = _mm256_cmpgt_epi8(zero, X);
+    __m256i sign_y = _mm256_cmpgt_epi8(zero, Y);
+
+    __m256i xlo = _mm256_unpacklo_epi8(X, sign_x);
+    __m256i xhi = _mm256_unpackhi_epi8(X, sign_x);
+    __m256i ylo = _mm256_unpacklo_epi8(Y, sign_y);
+    __m256i yhi = _mm256_unpackhi_epi8(Y, sign_y);
+
+    return _mm256_cvtepi32_ps(_mm256_add_epi32(_mm256_madd_epi16(xlo, ylo),
+                                               _mm256_madd_epi16(xhi, yhi)));
+  }
+
+  static inline __m256 _mm256_mul32_pi8(__m128i X, __m128i Y) {
+    __m256i xlo = _mm256_cvtepi8_epi16(X), ylo = _mm256_cvtepi8_epi16(Y);
+    return _mm256_blend_ps(_mm256_cvtepi32_ps(_mm256_madd_epi16(xlo, ylo)),
+                           _mm256_setzero_ps(), 252);
+  }
+
+  static inline float _mm256_reduce_add_ps(__m256 x) {
+    /* ( x3+x7, x2+x6, x1+x5, x0+x4 ) */
+    const __m128 x128 =
+        _mm_add_ps(_mm256_extractf128_ps(x, 1), _mm256_castps256_ps128(x));
+    /* ( -, -, x1+x3+x5+x7, x0+x2+x4+x6 ) */
+    const __m128 x64 = _mm_add_ps(x128, _mm_movehl_ps(x128, x128));
+    /* ( -, -, -, x0+x1+x2+x3+x4+x5+x6+x7 ) */
+    const __m128 x32 = _mm_add_ss(x64, _mm_shuffle_ps(x64, x64, 0x55));
+    /* Conversion to float is a no-op on x86-64 */
+    return _mm_cvtss_f32(x32);
+  }
+}  // namespace diskann
--- a/include/utils.h
+++ b/include/utils.h
@ -4,15 +4,21 @@
 #pragma once
 #include <fcntl.h>
 #include <algorithm>
+#include <errno.h>
+
 #include <cassert>
 #include <cstdlib>
 #include <cstring>
 #include <fstream>
 #include <iostream>
+#include <limits.h>
+
 #include <string>
 #include <memory>
 #include <random>
 #include <set>
+#include <sstream>
+#include <string.h>
 #ifdef __APPLE__
 #else
 #include <malloc.h>
@ -26,8 +32,11 @@ typedef HANDLE FileHandle;
 typedef int FileHandle;
 #endif

+#include "distance.h"
+#include "utils.h"
 #include "logger.h"
 #include "cached_io.h"
+#include "ann_exception.h"
 #include "common_includes.h"
 #include "windows_customizations.h"

@ -51,6 +60,44 @@ typedef int FileHandle;
 #define IS_ALIGNED(X, Y) ((uint64_t)(X) % (uint64_t)(Y) == 0)
 #define IS_512_ALIGNED(X) IS_ALIGNED(X, 512)
 #define IS_4096_ALIGNED(X) IS_ALIGNED(X, 4096)
+#define METADATA_SIZE \
+  4096  // all metadata of individual sub-component files is written in first
+        // 4KB for unified files
+
+inline bool file_exists(const std::string& name, bool dirCheck = false) {
+  int val;
+#ifndef _WINDOWS
+  struct stat buffer;
+  val = stat(name.c_str(), &buffer);
+#else
+  // It is the 21st century but Windows API still thinks in 32-bit terms.
+  // Turns out calling stat() on a file > 4GB results in errno = 132 (OVERFLOW).
+  // How silly is this!? So calling _stat64()
+  struct _stat64 buffer;
+  val = _stat64(name.c_str(), &buffer);
+#endif
+
+  diskann::cout << " Stat(" << name.c_str() << ") returned: " << val
+                << std::endl;
+  if (val != 0) {
+    switch (errno) {
+      case EINVAL:
+        diskann::cout << "Invalid argument passed to stat()" << std::endl;
+        break;
+      case ENOENT:
+        diskann::cout << "File " << name.c_str() << " does not exist"
+                      << std::endl;
+        break;
+      default:
+        diskann::cout << "Unexpected error in stat():" << errno << std::endl;
+        break;
+    }
+    return false;
+  } else {
+    // the file entry exists. If reqd, check if this is a directory.
+    return dirCheck ? buffer.st_mode & S_IFDIR : true;
+  }
+}

 typedef uint64_t _u64;
 typedef int64_t  _s64;
@ -60,11 +107,62 @@ typedef uint16_t _u16;
 typedef int16_t  _s16;
 typedef uint8_t  _u8;
 typedef int8_t   _s8;
+// Opens `filename` for binary writing through `writer`. A file that already
+// exists is opened with in|out; otherwise it is opened with out only.
+// failbit/badbit exceptions are enabled on `writer` before the open call, so a
+// failed open() may already surface as std::ios_base::failure before the
+// explicit fail() check below is reached.
+// Throws diskann::ANNException (code -1) if the stream reports failure.
+inline void open_file_to_write(std::ofstream&     writer,
+                               const std::string& filename) {
+  writer.exceptions(std::ofstream::failbit | std::ofstream::badbit);
+  if (!file_exists(filename))
+    writer.open(filename, std::ios::binary | std::ios::out);
+  else
+    writer.open(filename, std::ios::binary | std::ios::in | std::ios::out);
+
+  if (writer.fail()) {
+    // Zero-initialised so the message is never assembled from indeterminate
+    // bytes when the platform's strerror_r variant leaves the buffer untouched.
+    char buff[1024] = {0};
+#ifdef _WINDOWS
+    strerror_s(buff, 1024, errno);
+#else
+    strerror_r(errno, buff, 1024);
+#endif
+    const std::string reason = std::string("Failed to open file ") + filename +
+                               " for write because: " + buff;
+    diskann::cerr << reason << std::endl;
+    throw diskann::ANNException(reason, -1);
+  }
+}
+
+// Returns the size of `fname` in bytes, obtained by opening the file positioned
+// at its end and reading the stream position. Logs to diskann::cerr and
+// returns 0 when the file cannot be opened.
+inline _u64 get_file_size(const std::string& fname) {
+  std::ifstream in(fname, std::ios::binary | std::ios::ate);
+  if (in.fail() || !in.is_open()) {
+    diskann::cerr << "Could not open file: " << fname << std::endl;
+    return 0;
+  }
+  const _u64 file_bytes = in.tellg();
+  in.close();
+  return file_bytes;
+}
+
+// Removes `fileName` when file_exists() reports it present. Returns 0 if the
+// file was absent or was removed; otherwise logs to diskann::cerr and returns
+// the non-zero result of ::remove.
+inline int delete_file(const std::string& fileName) {
+  if (!file_exists(fileName)) {
+    return 0;
+  }
+
+  const auto status = ::remove(fileName.c_str());
+  if (status != 0) {
+    diskann::cerr
+        << "Could not delete file: " << fileName
+        << " even though it exists. This might indicate a permissions issue. "
+           "If you see this message, please contact the diskann team."
+        << std::endl;
+  }
+  return status;
+}

 namespace diskann {
  static const size_t MAX_SIZE_OF_STREAMBUF = 2LL * 1024 * 1024 * 1024;

-  enum Metric { L2 = 0, INNER_PRODUCT = 1, FAST_L2 = 2, PQ = 3 };
+  // Distance metric identifiers with explicit numeric values.
+  // NOTE(review): FAST_L2 and PQ previously had the values 2 and 3; confirm
+  // that nothing persists or exchanges these numbers.
+  enum Metric { L2 = 0, INNER_PRODUCT = 1, COSINE = 2, FAST_L2 = 3, PQ = 4 };

  inline void alloc_aligned(void** ptr, size_t size, size_t align) {
    *ptr = nullptr;
@ -77,6 +175,24 @@ namespace diskann {
    assert(*ptr != nullptr);
  }

+  // Resizes the aligned allocation at *ptr to `size` bytes; `size` is asserted
+  // to be a multiple of `align`. Only the Windows build actually reallocates
+  // (via _aligned_realloc). Every other build just logs an error to
+  // diskann::cerr and leaves *ptr unchanged.
+  // NOTE(review): off Windows the trailing assert still passes for a non-null
+  // input pointer, so the caller gets no signal that the buffer did not grow.
+  inline void realloc_aligned(void** ptr, size_t size, size_t align) {
+    assert(IS_ALIGNED(size, align));
+#ifdef _WINDOWS
+    *ptr = ::_aligned_realloc(*ptr, size, align);
+#else
+    diskann::cerr << "No aligned realloc on GCC. Must malloc and mem_align, "
+                     "left it out for now."
+                  << std::endl;
+#endif
+    assert(*ptr != nullptr);
+  }
+
+  // Prints `arnd` to diskann::cout, then reads one integer from std::cin and
+  // discards it (presumably a manual pause point for debugging).
+  inline void check_stop(std::string arnd) {
+    diskann::cout << arnd << std::endl;
+
+    int discarded_input;
+    std::cin >> discarded_input;
+  }
+
  inline void aligned_free(void* ptr) {
    // Gopal. Must have a check here if the pointer was actually allocated by
    // _alloc_aligned
@ -384,7 +500,7 @@ namespace diskann {
    _u64 total_res = (_u64) total_u32;

    diskann::cout << "Metadata: #pts = " << gt_num
-                  << ", #total_results = " << total_res << "... " << std::flush;
+                  << ", #total_results = " << total_res << "..." << std::endl;

    size_t expected_file_size =
        2 * sizeof(_u32) + gt_num * sizeof(_u32) + total_res * sizeof(_u32);
@ -440,8 +556,8 @@ namespace diskann {
  }

  template<typename T>
-  inline void save_bin(const std::string& filename, T* data, size_t npts,
-                       size_t ndims) {
+  inline uint64_t save_bin(const std::string& filename, T* data, size_t npts,
+                           size_t ndims) {
    std::ofstream writer(filename, std::ios::binary | std::ios::out);
    diskann::cout << "Writing bin: " << filename.c_str() << std::endl;
    int npts_i32 = (int) npts, ndims_i32 = (int) ndims;
@ -454,7 +570,9 @@ namespace diskann {
    //    data = new T[npts_u64 * ndims_u64];
    writer.write((char*) data, npts * ndims * sizeof(T));
    writer.close();
+    size_t bytes_written = npts * ndims * sizeof(T) + 2 * sizeof(uint32_t);
    diskann::cout << "Finished writing bin." << std::endl;
+    return bytes_written;
  }

  // load_aligned_bin functions START
@ -652,6 +770,67 @@ namespace diskann {
      writer.write((char*) cur_pt, ndims * sizeof(T));
    }
  }
+  // Writes `npts` points of `ndims` elements each to `filename`, starting at
+  // byte `offset`: first npts and ndims as two ints, then the rows. `data` is
+  // read with a row stride of `aligned_dim` elements, but only the first
+  // `ndims` elements of every row are written.
+  // Returns the number of bytes in the header plus the payload.
+  template<typename T>
+  inline uint64_t save_data_in_base_dimensions(const std::string& filename,
+                                               T* data, size_t npts,
+                                               size_t ndims, size_t aligned_dim,
+                                               size_t offset = 0) {
+    std::ofstream out;
+    open_file_to_write(out, filename);
+    out.seekp(offset, out.beg);
+
+    // Header: point count followed by dimension, both narrowed to int.
+    const int header[2] = {(int) npts, (int) ndims};
+    out.write((char*) &header[0], sizeof(int));
+    out.write((char*) &header[1], sizeof(int));
+
+    const size_t row_bytes = ndims * sizeof(T);
+    size_t       row = 0;
+    while (row < npts) {
+      out.write((char*) (data + row * aligned_dim), row_bytes);
+      ++row;
+    }
+    out.close();
+
+    return 2 * sizeof(uint32_t) + npts * row_bytes;
+  }
+
+  // Reads a bin layout (int npts, int dim, then npts * dim values of T) that
+  // starts `offset` bytes into `bin_file` and copies it into the
+  // caller-allocated buffer `data`. Row i is stored at data + i * rounded_dim
+  // and the trailing (rounded_dim - dim) elements of each row are zero-filled.
+  // `npts` and `dim` are outputs taken from the file header.
+  // Throws diskann::ANNException if `data` is null, the file cannot be opened,
+  // the header cannot be read, or `rounded_dim` is smaller than the file's dim.
+  template<typename T>
+  inline void copy_aligned_data_from_file(const std::string bin_file, T*& data,
+                                          size_t& npts, size_t& dim,
+                                          const size_t& rounded_dim,
+                                          size_t        offset = 0) {
+    if (data == nullptr) {
+      diskann::cerr << "Memory was not allocated for " << data
+                    << " before calling the load function. Exiting..."
+                    << std::endl;
+      throw diskann::ANNException(
+          "Null pointer passed to copy_aligned_data_from_file function", -1,
+          __FUNCSIG__, __FILE__, __LINE__);
+    }
+    std::ifstream reader(bin_file, std::ios::binary);
+    if (!reader.is_open()) {
+      // Without this check the header ints below would stay unset and drive
+      // the copy loop with garbage counts.
+      std::stringstream stream;
+      stream << "Could not open file " << bin_file << " for reading.";
+      diskann::cerr << stream.str() << std::endl;
+      throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__,
+                                  __LINE__);
+    }
+    reader.seekg(offset, reader.beg);
+
+    int npts_i32 = 0, dim_i32 = 0;
+    reader.read((char*) &npts_i32, sizeof(int));
+    reader.read((char*) &dim_i32, sizeof(int));
+    if (!reader) {
+      std::stringstream stream;
+      stream << "Could not read the bin header of " << bin_file
+             << " at offset " << offset << ".";
+      diskann::cerr << stream.str() << std::endl;
+      throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__,
+                                  __LINE__);
+    }
+    npts = (unsigned) npts_i32;
+    dim = (unsigned) dim_i32;
+
+    if (rounded_dim < dim) {
+      // (rounded_dim - dim) is unsigned; letting it wrap would turn the memset
+      // below into an enormous out-of-bounds write.
+      std::stringstream stream;
+      stream << "Error. rounded_dim = " << rounded_dim
+             << " is smaller than the file dimension dim = " << dim << ".";
+      diskann::cerr << stream.str() << std::endl;
+      throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__,
+                                  __LINE__);
+    }
+
+    /*
+    size_t expected_actual_file_size =
+        npts * dim * sizeof(T) + 2 * sizeof(uint32_t);
+    if (actual_file_size != expected_actual_file_size) {
+      std::stringstream stream;
+      stream << "Error. File size mismatch. Actual size is " << actual_file_size
+             << " while expected size is  " << expected_actual_file_size
+             << " npts = " << npts << " dim = " << dim
+             << " size of <T>= " << sizeof(T) << std::endl;
+      diskann::cout << stream.str() << std::endl;
+      throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__,
+                                  __LINE__);
+    }
+    */
+
+    for (size_t i = 0; i < npts; i++) {
+      reader.read((char*) (data + i * rounded_dim), dim * sizeof(T));
+      memset(data + i * rounded_dim + dim, 0, (rounded_dim - dim) * sizeof(T));
+    }
+  }

  // NOTE :: good efficiency when total_vec_size is integral multiple of 64
  inline void prefetch_vector(const char* vec, size_t vecsize) {
@ -666,6 +845,16 @@ namespace diskann {
    for (size_t d = 0; d < max_prefetch_size; d += 64)
      _mm_prefetch((const char*) vec + d, _MM_HINT_T1);
  }
+
+  // NOTE: Implementation in utils.cpp.
+  void block_convert(std::ofstream& writr, std::ifstream& readr,
+                     float* read_buf, _u64 npts, _u64 ndims);
+
+  DISKANN_DLLEXPORT void normalize_data_file(const std::string& inFileName,
+                                             const std::string& outFileName);
+
+  template<typename T>
+  Distance<T>* get_distance_function(Metric m);
 };  // namespace diskann

 struct PivotContainer {
@ -687,6 +876,7 @@ struct PivotContainer {
  float  piv_dist;
 };

+/*
 inline bool file_exists(const std::string& name) {
  struct stat buffer;
  auto        val = stat(name.c_str(), &buffer);
@ -694,20 +884,7 @@ inline bool file_exists(const std::string& name) {
                << std::endl;
  return (val == 0);
 }
-
-inline _u64 get_file_size(const std::string& fname) {
-  std::ifstream reader(fname, std::ios::binary | std::ios::ate);
-  if (!reader.fail() && reader.is_open()) {
-    _u64 end_pos = reader.tellg();
-    diskann::cout << " Tellg: " << reader.tellg() << " as u64: " << end_pos
-                  << std::endl;
-    reader.close();
-    return end_pos;
-  } else {
-    diskann::cout << "Could not open file: " << fname << std::endl;
-    return 0;
-  }
-}
+*/

 inline bool validate_index_file_size(std::ifstream& in) {
  if (!in.is_open())
@ -730,18 +907,60 @@ inline bool validate_index_file_size(std::ifstream& in) {
  return true;
 }

+// Scales `arr` (of `dim` elements) in place to unit L2 norm.
+// This function is valid only for float data type: the norm is accumulated and
+// applied in float.
+// A vector whose norm is zero is left unchanged instead of being divided by
+// zero (which would fill it with NaN).
+template<typename T>
+inline void normalize(T* arr, size_t dim) {
+  float sum = 0.0f;
+  // size_t index so the loop bound and the counter have the same width.
+  for (size_t i = 0; i < dim; i++) {
+    sum += (float) arr[i] * (float) arr[i];
+  }
+  sum = sqrt(sum);
+  if (sum == 0.0f) {
+    return;
+  }
+  for (size_t i = 0; i < dim; i++) {
+    arr[i] = (T)(arr[i] / sum);
+  }
+}
+
 #ifdef _WINDOWS
 #include <intrin.h>
 #include <Psapi.h>

+extern bool AvxSupportedCPU;
+extern bool Avx2SupportedCPU;
+
+// Returns the PrivateUsage counter of the current process as reported by
+// GetProcessMemoryInfo, or 0 when the counters cannot be retrieved.
+inline size_t getMemoryUsage() {
+  // Value-initialised so no indeterminate field is ever read if the call fails.
+  PROCESS_MEMORY_COUNTERS_EX pmc = {};
+  pmc.cb = sizeof(pmc);
+  if (!GetProcessMemoryInfo(GetCurrentProcess(),
+                            (PROCESS_MEMORY_COUNTERS*) &pmc, sizeof(pmc))) {
+    return 0;
+  }
+  return pmc.PrivateUsage;
+}
+
+// Translates a Windows error code (e.g. from GetLastError) into the system's
+// message text. Returns an empty string when no message can be produced.
+inline std::string getWindowsErrorMessage(DWORD lastError) {
+  // Must start out null: FormatMessageA leaves it untouched on failure.
+  char*       errorText = nullptr;
+  const DWORD length = FormatMessageA(
+      // use system message tables to retrieve error text
+      FORMAT_MESSAGE_FROM_SYSTEM
+          // allocate buffer on local heap for error text
+          | FORMAT_MESSAGE_ALLOCATE_BUFFER
+          // Important! will fail otherwise, since we're not
+          // (and CANNOT) pass insertion parameters
+          | FORMAT_MESSAGE_IGNORE_INSERTS,
+      NULL,  // unused with FORMAT_MESSAGE_FROM_SYSTEM
+      lastError, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
+      (LPSTR) &errorText,  // output
+      0,                   // minimum size for output buffer
+      NULL);               // arguments - see note
+
+  if (length == 0 || errorText == nullptr) {
+    return std::string();
+  }
+
+  std::string message(errorText);
+  // FORMAT_MESSAGE_ALLOCATE_BUFFER hands ownership of the buffer to the
+  // caller; it has to be released with LocalFree.
+  LocalFree(errorText);
+  return message;
+}
+
 inline void printProcessMemory(const char* message) {
  PROCESS_MEMORY_COUNTERS counters;
  HANDLE                  h = GetCurrentProcess();
  GetProcessMemoryInfo(h, &counters, sizeof(counters));
  diskann::cout << message << " [Peaking Working Set size: "
-                << counters.PeakWorkingSetSize * 1.0 / (1024 * 1024 * 1024)
+                << counters.PeakWorkingSetSize * 1.0 / (1024.0 * 1024 * 1024)
                << "GB Working set size: "
-                << counters.WorkingSetSize * 1.0 / (1024 * 1024 * 1024)
+                << counters.WorkingSetSize * 1.0 / (1024.0 * 1024 * 1024)
                << "GB Private bytes "
                << counters.PagefileUsage * 1.0 / (1024 * 1024 * 1024) << "GB]"
                << std::endl;
@ -752,10 +971,14 @@ inline void printProcessMemory(const char* message) {
+// Unconditionally reports AVX2 as supported; no CPU feature query is made here.
 inline bool avx2Supported() {
  return true;
 }
-
-inline void printProcessMemory(const char* message) {
-  diskann::cout << message << std::endl;
+// No-op variant: the message argument is ignored and nothing is printed.
+inline void printProcessMemory(const char*) {
 }
+
+// For non-windows, this function has not been implemented: it always
+// reports 0.
+inline size_t getMemoryUsage() {
+  return 0;
+}
+
 #endif

 extern bool AvxSupportedCPU;
--- a/patches/gperftools.diskann.patch
+++ b/patches/gperftools.diskann.patch
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@ -7,7 +7,7 @@ if(MSVC)
 	add_subdirectory(dll)
 else()
 	#file(GLOB CPP_SOURCES *.cpp)
-	set(CPP_SOURCES ann_exception.cpp aux_utils.cpp index.cpp
+	set(CPP_SOURCES ann_exception.cpp aux_utils.cpp distance.cpp index.cpp
        linux_aligned_file_reader.cpp math_utils.cpp memory_mapper.cpp
        partition_and_pq.cpp  pq_flash_index.cpp logger.cpp utils.cpp)
 	add_library(${PROJECT_NAME} ${CPP_SOURCES})
--- a/src/ann_exception.cpp
+++ b/src/ann_exception.cpp
@ -31,8 +31,8 @@ namespace diskann {
                               std::system_error& e, const std::string& funcSig,
                               const std::string& fileName,
                               unsigned int       lineNum)
-      : ANNException(std::string(" While opening file ") + filename +
-                         std::string(", error code: ") +
+      : ANNException(std::string(" While opening file \'") + filename +
+                         std::string("\', error code: ") +
                         std::to_string(e.code().value()) + "  " +
                         e.code().message(),
                     e.code().value(), funcSig, fileName, lineNum) {
--- a/src/aux_utils.cpp
+++ b/src/aux_utils.cpp
@ -19,21 +19,118 @@
 #include "partition_and_pq.h"
 #include "percentile_stats.h"
 #include "pq_flash_index.h"
+#include "tsl/robin_set.h"
+
 #include "utils.h"

 namespace diskann {

-  double get_memory_budget(const std::string &mem_budget_str) {
-    double mem_ram_budget = atof(mem_budget_str.c_str());
-    double final_index_ram_limit = mem_ram_budget;
-    if (mem_ram_budget - SPACE_FOR_CACHED_NODES_IN_GB >
+  // Copies the contents of `new_file` through a cached_ofstream opened on
+  // `index_file`, then saves the index's offset metadata again with one extra
+  // entry (previous last entry + size of `new_file`).
+  // The metadata is loaded from `index_file` as a bin of _u64 values and must
+  // have exactly one column and at least one row; its last entry must equal
+  // the current size of `index_file`.
+  // Throws diskann::ANNException (code -1) when the metadata is malformed or
+  // `new_file` is empty.
+  // NOTE(review): `writer` is still open when save_bin() writes to the same
+  // path below -- confirm the intended ordering against cached_ofstream.
+  void add_new_file_to_single_index(std::string index_file,
+                                    std::string new_file) {
+    std::unique_ptr<_u64[]> metadata;
+    _u64                    nr, nc;
+    diskann::load_bin<_u64>(index_file, metadata, nr, nc);
+    // nr == 0 is rejected as well: metadata[nr - 1] below needs a last entry.
+    if (nc != 1 || nr == 0) {
+      std::stringstream stream;
+      stream << "Error, index file specified does not have correct metadata. "
+             << std::endl;
+      throw diskann::ANNException(stream.str(), -1);
+    }
+    size_t          index_ending_offset = metadata[nr - 1];
+    _u64            read_blk_size = 64 * 1024 * 1024;
+    cached_ofstream writer(index_file, read_blk_size);
+    _u64            check_file_size = get_file_size(index_file);
+    if (check_file_size != index_ending_offset) {
+      std::stringstream stream;
+      stream << "Error, index file specified does not have correct metadata "
+                "(last entry must match the filesize). "
+             << std::endl;
+      throw diskann::ANNException(stream.str(), -1);
+    }
+
+    cached_ifstream reader(new_file, read_blk_size);
+    size_t          fsize = reader.get_file_size();
+    if (fsize == 0) {
+      std::stringstream stream;
+      stream << "Error, new file specified is empty. Not appending. "
+             << std::endl;
+      throw diskann::ANNException(stream.str(), -1);
+    }
+
+    size_t num_blocks = DIV_ROUND_UP(fsize, read_blk_size);
+    {
+      // Owned by a smart pointer so the block buffer is released even if a
+      // read or write throws; scoped so it is freed before the metadata save.
+      std::unique_ptr<char[]> dump(new char[read_blk_size]);
+      for (_u64 i = 0; i < num_blocks; i++) {
+        // The last block may be shorter than read_blk_size.
+        size_t cur_block_size = read_blk_size > fsize - (i * read_blk_size)
+                                    ? fsize - (i * read_blk_size)
+                                    : read_blk_size;
+        reader.read(dump.get(), cur_block_size);
+        writer.write(dump.get(), cur_block_size);
+      }
+    }
+    //    reader.close();
+    //    writer.close();
+
+    // Existing offsets plus one new entry marking the end of the copied file.
+    std::vector<_u64> new_meta(metadata.get(), metadata.get() + nr);
+    new_meta.push_back(metadata[nr - 1] + fsize);
+
+    diskann::save_bin<_u64>(index_file, new_meta.data(), new_meta.size(), 1);
+  }
+
+  // Converts a search RAM budget given in GB into bytes. When the budget
+  // exceeds SPACE_FOR_CACHED_NODES_IN_GB by more than
+  // THRESHOLD_FOR_CACHING_IN_GB, the cached-node allowance is subtracted
+  // before the conversion.
+  double get_memory_budget(double search_ram_budget) {
+    double final_index_ram_limit = search_ram_budget;
+    if (search_ram_budget - SPACE_FOR_CACHED_NODES_IN_GB >
        THRESHOLD_FOR_CACHING_IN_GB) {  // slack for space used by cached
                                        // nodes
-      final_index_ram_limit = mem_ram_budget - SPACE_FOR_CACHED_NODES_IN_GB;
+      final_index_ram_limit = search_ram_budget - SPACE_FOR_CACHED_NODES_IN_GB;
    }
    return final_index_ram_limit * 1024 * 1024 * 1024;
  }

+  // String overload: parses `mem_budget_str` with atof and forwards the value
+  // to the double overload of get_memory_budget.
+  double get_memory_budget(const std::string &mem_budget_str) {
+    return get_memory_budget(atof(mem_budget_str.c_str()));
+  }
+
+  // Derives the number of PQ chunks per vector.
+  // The starting value is final_index_ram_limit / points_num, truncated. If
+  // param_list has at least six entries, entry 5 is parsed as a compression
+  // ratio in (0, 1]; floor(ratio * dim) replaces the starting value when it is
+  // positive and smaller. The result is finally clamped to at least 1 and at
+  // most min(dim, MAX_PQ_CHUNKS).
+  size_t calculate_num_pq_chunks(double final_index_ram_limit,
+                                 size_t points_num, uint32_t dim,
+                                 const std::vector<std::string> &param_list) {
+    const double budget_per_point =
+        final_index_ram_limit / (double) points_num;
+    size_t num_pq_chunks = (size_t)(std::floor)(_u64(budget_per_point));
+    diskann::cout << "Calculated num_pq_chunks :" << num_pq_chunks << std::endl;
+
+    const bool has_compress_ratio = param_list.size() >= 6;
+    if (has_compress_ratio) {
+      const float compress_ratio = (float) atof(param_list[5].c_str());
+      const bool  ratio_in_range = compress_ratio > 0 && compress_ratio <= 1;
+
+      if (!ratio_in_range) {
+        diskann::cerr << "Compression ratio: " << compress_ratio
+                      << " should be in (0,1]" << std::endl;
+      } else {
+        const size_t chunks_by_cr = (size_t)(std::floor)(compress_ratio * dim);
+        const bool   ratio_applies =
+            chunks_by_cr > 0 && chunks_by_cr < num_pq_chunks;
+
+        if (ratio_applies) {
+          diskann::cout << "Compress ratio:" << compress_ratio
+                        << " new #pq_chunks:" << chunks_by_cr << std::endl;
+          num_pq_chunks = chunks_by_cr;
+        } else {
+          diskann::cout << "Compress ratio: " << compress_ratio
+                        << " #new pq_chunks: " << chunks_by_cr
+                        << " is either zero or greater than num_pq_chunks: "
+                        << num_pq_chunks << ". num_pq_chunks is unchanged. "
+                        << std::endl;
+        }
+      }
+    }
+
+    // Clamp in the same order as before: lower bound, then dim, then the
+    // global maximum.
+    if (num_pq_chunks <= 0) {
+      num_pq_chunks = 1;
+    }
+    if (num_pq_chunks > dim) {
+      num_pq_chunks = dim;
+    }
+    if (num_pq_chunks > MAX_PQ_CHUNKS) {
+      num_pq_chunks = MAX_PQ_CHUNKS;
+    }
+
+    diskann::cout << "Compressing " << dim << "-dimensional data into "
+                  << num_pq_chunks << " bytes per vector." << std::endl;
+    return num_pq_chunks;
+  }
+
  double calculate_recall(unsigned num_queries, unsigned *gold_std,
                          float *gs_dist, unsigned dim_gs,
                          unsigned *our_results, unsigned dim_or,
@ -70,6 +167,63 @@ namespace diskann {
    return total_recall / (num_queries) * (100.0 / recall_at);
  }

+  // Recall computation that takes a set of active tags into account.
+  // gold_std / gs_dist hold dim_gs entries per query and our_results holds
+  // dim_or entries per query. For every query the first `recall_at` results
+  // are intersected with a prefix of the ground-truth row; the return value is
+  // (sum of intersection sizes / num_queries) * (100 / recall_at).
+  // gs_dist may be nullptr, in which case the ground-truth prefix is simply
+  // the first `recall_at` ids. A warning is printed once if some query has
+  // fewer than `recall_at` active ids in its ground-truth row.
+  // NOTE(review): no guard is visible for num_queries == 0 or recall_at == 0
+  // (both are divisors in the return expression).
+  double calculate_recall(unsigned num_queries, unsigned *gold_std,
+                          float *gs_dist, unsigned dim_gs,
+                          unsigned *our_results, unsigned dim_or,
+                          unsigned                        recall_at,
+                          const tsl::robin_set<unsigned> &active_tags) {
+    double             total_recall = 0;
+    std::set<unsigned> gt, res;
+    bool               printed = false;
+    for (size_t i = 0; i < num_queries; i++) {
+      gt.clear();
+      res.clear();
+      unsigned *gt_vec = gold_std + dim_gs * i;
+      unsigned *res_vec = our_results + dim_or * i;
+      size_t    tie_breaker = recall_at;
+      unsigned  active_points_count = 0;
+      unsigned  cur_counter = 0;
+      // Scan the ground-truth row until `recall_at` ids that are present in
+      // active_tags have been seen, or the row is exhausted. cur_counter ends
+      // up as the number of row entries scanned.
+      while (active_points_count < recall_at && cur_counter < dim_gs) {
+        if (active_tags.find(*(gt_vec + cur_counter)) != active_tags.end()) {
+          active_points_count++;
+        }
+        cur_counter++;
+      }
+      // With no active tags at all, fall back to the plain first-recall_at
+      // prefix.
+      if (active_tags.empty())
+        cur_counter = recall_at;
+
+      if ((active_points_count < recall_at && !active_tags.empty()) &&
+          !printed) {
+        diskann::cout << "Warning: Couldn't find enough closest neighbors "
+                      << active_points_count << "/" << recall_at
+                      << " from "
+                         "truthset for query # "
+                      << i << ". Will result in under-reported value of recall."
+                      << std::endl;
+        printed = true;
+      }
+      if (gs_dist != nullptr) {
+        // Extend the prefix over all following entries whose distance equals
+        // that of the last scanned entry.
+        // NOTE(review): cur_counter - 1 wraps if cur_counter is 0, and it
+        // indexes past the row if active_tags is empty and recall_at > dim_gs
+        // -- confirm callers rule these out.
+        tie_breaker = cur_counter - 1;
+        float *gt_dist_vec = gs_dist + dim_gs * i;
+        while (tie_breaker < dim_gs &&
+               gt_dist_vec[tie_breaker] == gt_dist_vec[cur_counter - 1])
+          tie_breaker++;
+      }
+
+      // Ground truth: first tie_breaker ids; results: first recall_at ids.
+      gt.insert(gt_vec, gt_vec + tie_breaker);
+      res.insert(res_vec, res_vec + recall_at);
+      unsigned cur_recall = 0;
+      for (auto &v : res) {
+        if (gt.find(v) != gt.end()) {
+          cur_recall++;
+        }
+      }
+      total_recall += cur_recall;
+    }
+    return ((double) (total_recall / (num_queries))) *
+           ((double) (100.0 / recall_at));
+  }
+
  double calculate_range_search_recall(
      unsigned num_queries, std::vector<std::vector<_u32>> &groundtruth,
      std::vector<std::vector<_u32>> &our_results) {
@ -130,12 +284,20 @@ namespace diskann {
    if (files.fileExists(cache_warmup_file)) {
      diskann::load_aligned_bin<T>(files, cache_warmup_file, warmup, warmup_num,
                                   file_dim, file_aligned_dim);
+      diskann::cout << "In the warmup file: " << cache_warmup_file
+                    << " File dim: " << file_dim
+                    << " File aligned dim: " << file_aligned_dim
+                    << " Expected dim: " << warmup_dim
+                    << " Expected aligned dim: " << warmup_aligned_dim
+                    << std::endl;
+
      if (file_dim != warmup_dim || file_aligned_dim != warmup_aligned_dim) {
        std::stringstream stream;
        stream << "Mismatched dimensions in sample file. file_dim = "
               << file_dim << " file_aligned_dim: " << file_aligned_dim
               << " index_dim: " << warmup_dim
               << " index_aligned_dim: " << warmup_aligned_dim << std::endl;
+        diskann::cerr << stream.str();
        throw diskann::ANNException(stream.str(), -1);
      }
    } else {
@ -247,18 +409,20 @@ namespace diskann {
    std::vector<cached_ifstream> vamana_readers(nshards);
    for (_u64 i = 0; i < nshards; i++) {
      vamana_readers[i].open(vamana_names[i], 1024 * 1048576);
-      size_t actual_file_size = get_file_size(vamana_names[i]);
+      //      size_t actual_file_size = get_file_size(vamana_names[i]);
      size_t expected_file_size;
      vamana_readers[i].read((char *) &expected_file_size, sizeof(uint64_t));
-      if (actual_file_size != expected_file_size) {
-        std::stringstream stream;
-        stream << "Error in Vamana Index file " << vamana_names[i]
-               << " Actual file size: " << actual_file_size
-               << " does not match expected file size: " << expected_file_size
-               << std::endl;
-        throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__,
-                                    __LINE__);
-      }
+      /*      if (actual_file_size != expected_file_size) {
+              std::stringstream stream;
+              stream << "Error in Vamana Index file " << vamana_names[i]
+                     << " Actual file size: " << actual_file_size
+                     << " does not match expected file size: " <<
+         expected_file_size
+                     << std::endl;
+              throw diskann::ANNException(stream.str(), -1, __FUNCSIG__,
+         __FILE__,
+                                          __LINE__);
+            } */
    }

    size_t merged_index_size = 16;
@ -379,7 +543,7 @@ namespace diskann {
    diskann::get_bin_metadata(base_file, base_num, base_dim);

    double full_index_ram =
-        ESTIMATE_RAM_USAGE(base_num, base_dim, sizeof(T), R);
+        estimate_ram_usage(base_num, base_dim, sizeof(T), R);
    if (full_index_ram < ram_budget * 1024 * 1024 * 1024) {
      diskann::cout << "Full index fits in RAM budget, should consume at most "
                    << full_index_ram / (1024 * 1024 * 1024)
@ -394,9 +558,10 @@ namespace diskann {
      paras.Set<std::string>("save_path", mem_index_path);

      std::unique_ptr<diskann::Index<T>> _pvamanaIndex =
-          std::unique_ptr<diskann::Index<T>>(
-              new diskann::Index<T>(compareMetric, base_file.c_str()));
-      _pvamanaIndex->build(paras);
+          std::unique_ptr<diskann::Index<T>>(new diskann::Index<T>(
+              compareMetric, base_dim, base_num, false, false, false));
+      _pvamanaIndex->build(base_file.c_str(), base_num, paras);
+
      _pvamanaIndex->save(mem_index_path.c_str());
      std::remove(medoids_file.c_str());
      std::remove(centroids_file.c_str());
@ -432,10 +597,13 @@ namespace diskann {
      paras.Set<bool>("saturate_graph", 1);
      paras.Set<std::string>("save_path", shard_index_file);

+      _u64 shard_base_dim, shard_base_pts;
+      get_bin_metadata(shard_base_file, shard_base_pts, shard_base_dim);
      std::unique_ptr<diskann::Index<T>> _pvamanaIndex =
-          std::unique_ptr<diskann::Index<T>>(
-              new diskann::Index<T>(compareMetric, shard_base_file.c_str()));
-      _pvamanaIndex->build(paras);
+          std::unique_ptr<diskann::Index<T>>(new diskann::Index<T>(
+              compareMetric, shard_base_dim, shard_base_pts, false,
+              false));  // TODO: Single?
+      _pvamanaIndex->build(shard_base_file.c_str(), shard_base_pts, paras);
      _pvamanaIndex->save(shard_index_file.c_str());
      std::remove(shard_base_file.c_str());
      //      wait_for_keystroke();
@ -766,7 +934,7 @@ namespace diskann {
    size_t train_size, train_dim;
    float *train_data;

-    double p_val = ((double) TRAINING_SET_SIZE / (double) points_num);
+    double p_val = ((double) MAX_PQ_TRAINING_SET_SIZE / (double) points_num);
    // generates random sample and sets it to train_data and updates
    // train_size
    gen_random_slice<T>(data_file_to_use.c_str(), p_val, train_data, train_size,
--- a/src/distance.cpp
+++ b/src/distance.cpp
@ -0,0 +1,497 @@
+// TODO
+// CHECK COSINE ON LINUX
+
+#ifdef _WINDOWS
+#include <immintrin.h>
+#include <smmintrin.h>
+#include <tmmintrin.h>
+#include <intrin.h>
+#else
+#include <immintrin.h>
+#endif
+
+#include "simd_utils.h"
+#include <cosine_similarity.h>
+#include <iostream>
+
+#include "distance.h"
+#include "logger.h"
+#include "ann_exception.h"
+
+namespace diskann {
+
+  // Cosine similarity.
+  // Compares two int8 vectors of `length` elements by delegating to
+  // diskann::CosineSimilarity2<int8_t> on Windows builds and to
+  // diskann::compute_cosine_similarity on all other builds.
+  // NOTE(review): whether those helpers return a similarity or a distance is
+  // not visible here -- confirm against their definitions.
+  float DistanceCosineInt8::compare(const int8_t *a, const int8_t *b,
+                                    uint32_t length) const {
+#ifdef _WINDOWS
+    return diskann::CosineSimilarity2<int8_t>(a, b, length);
+#else
+    return diskann::compute_cosine_similarity(a, b, length);
+#endif
+  }
+
+  // Compares two float vectors of `length` elements by delegating to
+  // diskann::CosineSimilarity2<float> on Windows builds and to
+  // diskann::compute_cosine_similarity on all other builds.
+  // NOTE(review): whether those helpers return a similarity or a distance is
+  // not visible here -- confirm against their definitions.
+  float DistanceCosineFloat::compare(const float *a, const float *b,
+                                     uint32_t length) const {
+#ifdef _WINDOWS
+    return diskann::CosineSimilarity2<float>(a, b, length);
+#else
+    return diskann::compute_cosine_similarity(a, b, length);
+#endif
+  }
+
+  // Scalar cosine distance for uint8 vectors of `length` elements:
+  // returns 1 - dot(a, b) / (|a| * |b|). Zero-magnitude inputs receive no
+  // special handling.
+  float SlowDistanceCosineUInt8::compare(const uint8_t *a, const uint8_t *b,
+                                         uint32_t length) const {
+    int dotAB = 0, normSqA = 0, normSqB = 0;
+    for (uint32_t idx = 0; idx < length; ++idx) {
+      const uint32_t va = (uint32_t) a[idx];
+      const uint32_t vb = (uint32_t) b[idx];
+      normSqA += va * va;
+      normSqB += vb * vb;
+      dotAB += va * vb;
+    }
+    // distance = 1 - cosine similarity
+    const double denominator = sqrt(normSqA) * sqrt(normSqB);
+    return 1.0f - (float) (dotAB / denominator);
+  }
+
+  // L2 distance functions.
+  // Squared L2 distance between two int8 vectors of `size` elements, returned
+  // as float.
+  // Windows + USE_AVX2: vectorised path built on saturating byte subtraction
+  // and the helpers _mm256_mul_epi8 / _mm256_mul32_pi8 (not defined in this
+  // file). All other configurations: scalar loop that widens each difference
+  // before squaring and accumulates into an int32_t.
+  float DistanceL2Int8::compare(const int8_t *a, const int8_t *b,
+                                uint32_t size) const {
+    int32_t result = 0;
+
+#ifdef _WINDOWS
+#ifdef USE_AVX2
+    __m256 r = _mm256_setzero_ps();
+    char * pX = (char *) a, *pY = (char *) b;
+    // Main loop: 32 bytes per iteration.
+    while (size >= 32) {
+      __m256i r1 = _mm256_subs_epi8(_mm256_loadu_si256((__m256i *) pX),
+                                    _mm256_loadu_si256((__m256i *) pY));
+      r = _mm256_add_ps(r, _mm256_mul_epi8(r1, r1));
+      pX += 32;
+      pY += 32;
+      size -= 32;
+    }
+    // Tail loop: advances 4 bytes per iteration although each load reads 16.
+    // NOTE(review): `size` is unsigned, so `size -= 4` wraps if the remaining
+    // count is not a multiple of 4 -- confirm callers guarantee that.
+    while (size > 0) {
+      __m128i r2 = _mm_subs_epi8(_mm_loadu_si128((__m128i *) pX),
+                                 _mm_loadu_si128((__m128i *) pY));
+      r = _mm256_add_ps(r, _mm256_mul32_pi8(r2, r2));
+      pX += 4;
+      pY += 4;
+      size -= 4;
+    }
+    // Horizontal reduction of the eight partial sums.
+    r = _mm256_hadd_ps(_mm256_hadd_ps(r, r), r);
+    return r.m256_f32[0] + r.m256_f32[4];
+#else
+#pragma omp simd reduction(+ : result) aligned(a, b : 8)
+    for (_s32 i = 0; i < (_s32) size; i++) {
+      result += ((int32_t)((int16_t) a[i] - (int16_t) b[i])) *
+                ((int32_t)((int16_t) a[i] - (int16_t) b[i]));
+    }
+    return (float) result;
+#endif
+#else
+#pragma omp simd reduction(+ : result) aligned(a, b : 8)
+    for (int32_t i = 0; i < (int32_t) size; i++) {
+      result += ((int32_t)((int16_t) a[i] - (int16_t) b[i])) *
+                ((int32_t)((int16_t) a[i] - (int16_t) b[i]));
+    }
+    return (float) result;
+#endif
+  }
+
+  // Squared L2 distance between two uint8 vectors of `size` elements. Each
+  // difference is widened before squaring; the sum is kept in a 32-bit
+  // unsigned accumulator and returned as float.
+  float DistanceL2UInt8::compare(const uint8_t *a, const uint8_t *b,
+                                 uint32_t size) const {
+    uint32_t result = 0;
+#ifndef _WINDOWS
+#pragma omp simd reduction(+ : result) aligned(a, b : 8)
+#endif
+    for (int32_t i = 0; i < (int32_t) size; i++) {
+      const int32_t delta = (int32_t)((int16_t) a[i] - (int16_t) b[i]);
+      result += delta * delta;
+    }
+    return (float) result;
+  }
+
+  // Squared L2 distance between two float vectors of `size` elements.
+  // The signature is duplicated under the preprocessor so that non-Windows
+  // builds can additionally tell the compiler that `a` and `b` are 32-byte
+  // aligned.
+  // With USE_AVX2 the vectors are consumed 8 floats at a time using aligned
+  // loads; elements beyond the last full group of 8 are not visited.
+  // NOTE(review): the iteration count is stored in a uint16_t -- confirm that
+  // size / 8 always fits.
+#ifndef _WINDOWS
+  float DistanceL2Float::compare(const float *a, const float *b,
+                                 uint32_t size) const {
+    a = (const float *) __builtin_assume_aligned(a, 32);
+    b = (const float *) __builtin_assume_aligned(b, 32);
+#else
+  float DistanceL2Float::compare(const float *a, const float *b,
+                                 uint32_t size) const {
+#endif
+
+    float result = 0;
+#ifdef USE_AVX2
+    // assume size is divisible by 8
+    uint16_t niters = (uint16_t)(size / 8);
+    __m256   sum = _mm256_setzero_ps();
+    for (uint16_t j = 0; j < niters; j++) {
+      // scope is a[8j:8j+7], b[8j:8j+7]
+      // load a_vec
+      // Prefetch the next group of 8 while the current one is processed.
+      if (j < (niters - 1)) {
+        _mm_prefetch((char *) (a + 8 * (j + 1)), _MM_HINT_T0);
+        _mm_prefetch((char *) (b + 8 * (j + 1)), _MM_HINT_T0);
+      }
+      __m256 a_vec = _mm256_load_ps(a + 8 * j);
+      // load b_vec
+      __m256 b_vec = _mm256_load_ps(b + 8 * j);
+      // a_vec - b_vec
+      __m256 tmp_vec = _mm256_sub_ps(a_vec, b_vec);
+      
+      // sum += tmp_vec * tmp_vec (fused multiply-add)
+      sum = _mm256_fmadd_ps(tmp_vec, tmp_vec, sum);
+    }
+
+    // horizontal add sum
+    // _mm256_reduce_add_ps is not defined in this file (presumably provided
+    // by simd_utils.h).
+    result = _mm256_reduce_add_ps(sum);
+#else
+#ifndef _WINDOWS
+#pragma omp simd reduction(+ : result) aligned(a, b : 32)
+#endif
+    for (int32_t i = 0; i < (int32_t) size; i++) {
+      result += (a[i] - b[i]) * (a[i] - b[i]);
+    }
+#endif
+    return result;
+  }
+
+  // Plain scalar squared L2 distance between two float vectors of `length`
+  // elements; no SIMD and no alignment assumptions.
+  float SlowDistanceL2Float::compare(const float *a, const float *b,
+                                     uint32_t length) const {
+    float        acc = 0.0f;
+    const float *pa = a;
+    const float *pb = b;
+    for (uint32_t remaining = length; remaining > 0; --remaining) {
+      const float delta = *pa - *pb;
+      acc += delta * delta;
+      ++pa;
+      ++pb;
+    }
+    return acc;
+  }
+
+#ifdef _WINDOWS
+  // Windows-only 128-bit SIMD squared L2 distance for int8 vectors of `length`
+  // elements. Full groups of 16 bytes are handled in the main loop; leftovers
+  // of at least 8 and at least 4 elements are handled by two follow-up steps
+  // that reload 16 bytes ending at the current position and rely on the
+  // helpers _mm_mulhi_epi8 / _mm_mulhi_epi8_shift32 (not defined in this
+  // file). All loads are the aligned variant (_mm_load_si128).
+  // NOTE(review): if the main loop never ran (initial length < 16), the
+  // `a - 8` / `a - 12` loads start before the beginning of the input -- confirm
+  // callers only pass longer vectors.
+  float AVXDistanceL2Int8::compare(const int8_t *a, const int8_t *b,
+                                   uint32_t length) const {
+    __m128  r = _mm_setzero_ps();
+    __m128i r1;
+    while (length >= 16) {
+      r1 = _mm_subs_epi8(_mm_load_si128((__m128i *) a),
+                         _mm_load_si128((__m128i *) b));
+      r = _mm_add_ps(r, _mm_mul_epi8(r1));
+      a += 16;
+      b += 16;
+      length -= 16;
+    }
+    // Horizontal reduction of the main-loop accumulator.
+    r = _mm_hadd_ps(_mm_hadd_ps(r, r), r);
+    float res = r.m128_f32[0];
+
+    if (length >= 8) {
+      __m128  r2 = _mm_setzero_ps();
+      // Loads the 16 bytes that start 8 bytes before the current position.
+      __m128i r3 = _mm_subs_epi8(_mm_load_si128((__m128i *) (a - 8)),
+                                 _mm_load_si128((__m128i *) (b - 8)));
+      r2 = _mm_add_ps(r2, _mm_mulhi_epi8(r3));
+      a += 8;
+      b += 8;
+      length -= 8;
+      r2 = _mm_hadd_ps(_mm_hadd_ps(r2, r2), r2);
+      res += r2.m128_f32[0];
+    }
+
+    if (length >= 4) {
+      __m128  r2 = _mm_setzero_ps();
+      // Loads the 16 bytes that start 12 bytes before the current position.
+      __m128i r3 = _mm_subs_epi8(_mm_load_si128((__m128i *) (a - 12)),
+                                 _mm_load_si128((__m128i *) (b - 12)));
+      r2 = _mm_add_ps(r2, _mm_mulhi_epi8_shift32(r3));
+      res += r2.m128_f32[0] + r2.m128_f32[1];
+    }
+
+    return res;
+  }
+
+  // Windows-only 128-bit SIMD squared L2 distance for float vectors of
+  // `length` elements, using unaligned loads. Elements are consumed four at a
+  // time; a remainder of fewer than four elements is not included in the sum.
+  float AVXDistanceL2Float::compare(const float *a, const float *b,
+                                    uint32_t length) const {
+    __m128 diff, v1, v2;
+    __m128 sum = _mm_set1_ps(0);
+
+    while (length >= 4) {
+      v1 = _mm_loadu_ps(a);
+      a += 4;
+      v2 = _mm_loadu_ps(b);
+      b += 4;
+      diff = _mm_sub_ps(v1, v2);
+      sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));
+      length -= 4;
+    }
+
+    // Add up the four lanes of the accumulator.
+    return sum.m128_f32[0] + sum.m128_f32[1] + sum.m128_f32[2] +
+           sum.m128_f32[3];
+  }
+#else
+  // Placeholder used when _WINDOWS is not defined: ignores all of its
+  // arguments and always reports a distance of 0.
+  // NOTE(review): any caller that selects this class on such a build gets
+  // meaningless distances -- confirm it is never instantiated there.
+  float AVXDistanceL2Int8::compare(const int8_t *, const int8_t *,
+                                   uint32_t) const {
+    return 0.0f;
+  }
+  // Placeholder used when _WINDOWS is not defined: ignores all of its
+  // arguments and always reports a distance of 0.
+  // NOTE(review): any caller that selects this class on such a build gets
+  // meaningless distances -- confirm it is never instantiated there.
+  float AVXDistanceL2Float::compare(const float *, const float *,
+                                    uint32_t) const {
+    return 0.0f;
+  }
+#endif
+
+  // Dot product of two `size`-element vectors. Only defined for floating
+  // point T: for any other T an error is logged and diskann::ANNException is
+  // thrown before any data is touched.
+  //
+  // The kernel is picked at compile time:
+  //   __GNUC__ and USE_AVX2 : 256-bit AVX kernel
+  //   __GNUC__ and __SSE2__ : 128-bit SSE kernel
+  //   __GNUC__ otherwise    : 4-way unrolled scalar loop
+  //   no __GNUC__           : no kernel is compiled; 0 is returned
+  //
+  // The SIMD kernels round `size` up to a multiple of 8 (AVX) or 4 (SSE) and
+  // read that many floats from each input.
+  // NOTE(review): this presumably relies on callers zero-padding their
+  // vectors to that length -- confirm.
+  //
+  // Inputs are viewed through `const float *` casts in every branch so that
+  // the explicit int8_t / uint8_t instantiations still compile; those
+  // instantiations never get past the throw above at run time.
+  template<typename T>
+  float DistanceInnerProduct<T>::inner_product(const T *a, const T *b,
+                                               unsigned size) const {
+    if (!std::is_floating_point<T>::value) {
+      diskann::cerr << "ERROR: Inner Product only defined for float currently."
+                    << std::endl;
+      throw diskann::ANNException(
+          "ERROR: Inner Product only defined for float currently.", -1,
+          __FUNCSIG__, __FILE__, __LINE__);
+    }
+
+    float result = 0;
+
+#ifdef __GNUC__
+#ifdef USE_AVX2
+#define AVX_DOT(addr1, addr2, dest, tmp1, tmp2) \
+  tmp1 = _mm256_loadu_ps(addr1);                \
+  tmp2 = _mm256_loadu_ps(addr2);                \
+  tmp1 = _mm256_mul_ps(tmp1, tmp2);             \
+  dest = _mm256_add_ps(dest, tmp1);
+
+    __m256       sum;
+    __m256       l0, l1;
+    __m256       r0, r1;
+    unsigned     D = (size + 7) & ~7U;  // size rounded up to a multiple of 8
+    unsigned     DR = D % 16;           // 0 or 8: floats left after 16-wide steps
+    unsigned     DD = D - DR;           // floats handled by the 16-wide loop
+    const float *l = (const float *) a;
+    const float *r = (const float *) b;
+    const float *e_l = l + DD;
+    const float *e_r = r + DD;
+    float unpack[8] __attribute__((aligned(32))) = {0, 0, 0, 0, 0, 0, 0, 0};
+
+    sum = _mm256_loadu_ps(unpack);
+    // The trailing group of 8 (if any) is accumulated first.
+    if (DR) {
+      AVX_DOT(e_l, e_r, sum, l0, r0);
+    }
+
+    for (unsigned i = 0; i < DD; i += 16, l += 16, r += 16) {
+      AVX_DOT(l, r, sum, l0, r0);
+      AVX_DOT(l + 8, r + 8, sum, l1, r1);
+    }
+    _mm256_storeu_ps(unpack, sum);
+    result = unpack[0] + unpack[1] + unpack[2] + unpack[3] + unpack[4] +
+             unpack[5] + unpack[6] + unpack[7];
+
+#else
+#ifdef __SSE2__
+// SSE intrinsics use the _mm_ prefix; there is no _mm128_ family.
+#define SSE_DOT(addr1, addr2, dest, tmp1, tmp2) \
+  tmp1 = _mm_loadu_ps(addr1);                   \
+  tmp2 = _mm_loadu_ps(addr2);                   \
+  tmp1 = _mm_mul_ps(tmp1, tmp2);                \
+  dest = _mm_add_ps(dest, tmp1);
+    __m128       sum;
+    __m128       l0, l1, l2, l3;
+    __m128       r0, r1, r2, r3;
+    unsigned     D = (size + 3) & ~3U;  // size rounded up to a multiple of 4
+    unsigned     DR = D % 16;           // 0, 4, 8 or 12 trailing floats
+    unsigned     DD = D - DR;           // floats handled by the 16-wide loop
+    const float *l = (const float *) a;
+    const float *r = (const float *) b;
+    const float *e_l = l + DD;
+    const float *e_r = r + DD;
+    float        unpack[4] __attribute__((aligned(16))) = {0, 0, 0, 0};
+
+    sum = _mm_load_ps(unpack);
+    // Trailing groups of 4 are accumulated first; each case deliberately
+    // falls through to the next one.
+    switch (DR) {
+      case 12:
+        SSE_DOT(e_l + 8, e_r + 8, sum, l2, r2);
+        // fall through
+      case 8:
+        SSE_DOT(e_l + 4, e_r + 4, sum, l1, r1);
+        // fall through
+      case 4:
+        SSE_DOT(e_l, e_r, sum, l0, r0);
+        // fall through
+      default:
+        break;
+    }
+    for (unsigned i = 0; i < DD; i += 16, l += 16, r += 16) {
+      SSE_DOT(l, r, sum, l0, r0);
+      SSE_DOT(l + 4, r + 4, sum, l1, r1);
+      SSE_DOT(l + 8, r + 8, sum, l2, r2);
+      SSE_DOT(l + 12, r + 12, sum, l3, r3);
+    }
+    _mm_storeu_ps(unpack, sum);
+    result += unpack[0] + unpack[1] + unpack[2] + unpack[3];
+#else
+
+    const float *l = (const float *) a;
+    const float *r = (const float *) b;
+    unsigned     i = 0;
+
+    /* Process 4 items with each loop for efficiency. Indexing (rather than
+     * comparing against `end - 3`) avoids forming an out-of-range pointer
+     * when size < 3. */
+    for (; i + 3 < size; i += 4) {
+      const float dot0 = l[i] * r[i];
+      const float dot1 = l[i + 1] * r[i + 1];
+      const float dot2 = l[i + 2] * r[i + 2];
+      const float dot3 = l[i + 3] * r[i + 3];
+      result += dot0 + dot1 + dot2 + dot3;
+    }
+    /* Process last 0-3 items.  Not needed for standard vector lengths. */
+    for (; i < size; i++) {
+      result += l[i] * r[i];
+    }
+#endif
+#endif
+#endif
+    return result;
+  }
+
+  // Returns norm - 2 * <a, b>, where <a, b> is computed by
+  // DistanceInnerProduct<T>::inner_product over `size` components and `norm`
+  // is supplied by the caller.
+  template<typename T>
+  float DistanceFastL2<T>::compare(const T *a, const T *b, float norm,
+                                   unsigned size) const {
+    const float cross_term =
+        -2 * DistanceInnerProduct<T>::inner_product(a, b, size);
+    return cross_term + norm;
+  }
+
+  // Sum of squares of the `size` components of `a` (the squared L2 norm; no
+  // square root is taken). Only defined for floating point T: for any other T
+  // an error is logged and diskann::ANNException is thrown.
+  //
+  // The kernel is picked at compile time:
+  //   __GNUC__ and __AVX__  : 256-bit AVX kernel
+  //   __GNUC__ and __SSE2__ : 128-bit SSE kernel
+  //   __GNUC__ otherwise    : 4-way unrolled scalar loop
+  //   no __GNUC__           : no kernel is compiled; 0 is returned
+  //
+  // The SIMD kernels round `size` up to a multiple of 8 (AVX) or 4 (SSE) and
+  // read that many floats.
+  // NOTE(review): this presumably relies on callers zero-padding their
+  // vectors to that length -- confirm.
+  //
+  // The input is viewed through a `const float *` cast in every branch so
+  // that the explicit int8_t / uint8_t instantiations still compile; those
+  // instantiations never get past the throw at run time.
+  template<typename T>
+  float DistanceFastL2<T>::norm(const T *a, unsigned size) const {
+    if (!std::is_floating_point<T>::value) {
+      diskann::cerr << "ERROR: FastL2 only defined for float currently."
+                    << std::endl;
+      throw diskann::ANNException(
+          "ERROR: FastL2 only defined for float currently.", -1, __FUNCSIG__,
+          __FILE__, __LINE__);
+    }
+    float result = 0;
+#ifdef __GNUC__
+#ifdef __AVX__
+#define AVX_L2NORM(addr, dest, tmp) \
+  tmp = _mm256_loadu_ps(addr);      \
+  tmp = _mm256_mul_ps(tmp, tmp);    \
+  dest = _mm256_add_ps(dest, tmp);
+
+    __m256       sum;
+    __m256       l0, l1;
+    unsigned     D = (size + 7) & ~7U;  // size rounded up to a multiple of 8
+    unsigned     DR = D % 16;           // 0 or 8: floats left after 16-wide steps
+    unsigned     DD = D - DR;           // floats handled by the 16-wide loop
+    const float *l = (const float *) a;
+    const float *e_l = l + DD;
+    float unpack[8] __attribute__((aligned(32))) = {0, 0, 0, 0, 0, 0, 0, 0};
+
+    sum = _mm256_loadu_ps(unpack);
+    // The trailing group of 8 (if any) is accumulated first.
+    if (DR) {
+      AVX_L2NORM(e_l, sum, l0);
+    }
+    for (unsigned i = 0; i < DD; i += 16, l += 16) {
+      AVX_L2NORM(l, sum, l0);
+      AVX_L2NORM(l + 8, sum, l1);
+    }
+    _mm256_storeu_ps(unpack, sum);
+    result = unpack[0] + unpack[1] + unpack[2] + unpack[3] + unpack[4] +
+             unpack[5] + unpack[6] + unpack[7];
+#else
+#ifdef __SSE2__
+// SSE intrinsics use the _mm_ prefix; there is no _mm128_ family.
+#define SSE_L2NORM(addr, dest, tmp) \
+  tmp = _mm_loadu_ps(addr);         \
+  tmp = _mm_mul_ps(tmp, tmp);       \
+  dest = _mm_add_ps(dest, tmp);
+
+    __m128       sum;
+    __m128       l0, l1, l2, l3;
+    unsigned     D = (size + 3) & ~3U;  // size rounded up to a multiple of 4
+    unsigned     DR = D % 16;           // 0, 4, 8 or 12 trailing floats
+    unsigned     DD = D - DR;           // floats handled by the 16-wide loop
+    const float *l = (const float *) a;
+    const float *e_l = l + DD;
+    float        unpack[4] __attribute__((aligned(16))) = {0, 0, 0, 0};
+
+    sum = _mm_load_ps(unpack);
+    // Trailing groups of 4 are accumulated first; each case deliberately
+    // falls through to the next one.
+    switch (DR) {
+      case 12:
+        SSE_L2NORM(e_l + 8, sum, l2);
+        // fall through
+      case 8:
+        SSE_L2NORM(e_l + 4, sum, l1);
+        // fall through
+      case 4:
+        SSE_L2NORM(e_l, sum, l0);
+        // fall through
+      default:
+        break;
+    }
+    for (unsigned i = 0; i < DD; i += 16, l += 16) {
+      SSE_L2NORM(l, sum, l0);
+      SSE_L2NORM(l + 4, sum, l1);
+      SSE_L2NORM(l + 8, sum, l2);
+      SSE_L2NORM(l + 12, sum, l3);
+    }
+    _mm_storeu_ps(unpack, sum);
+    result += unpack[0] + unpack[1] + unpack[2] + unpack[3];
+#else
+    const float *l = (const float *) a;
+    unsigned     i = 0;
+
+    /* Process 4 items with each loop for efficiency. Indexing (rather than
+     * comparing against `end - 3`) avoids forming an out-of-range pointer
+     * when size < 3. */
+    for (; i + 3 < size; i += 4) {
+      const float dot0 = l[i] * l[i];
+      const float dot1 = l[i + 1] * l[i + 1];
+      const float dot2 = l[i + 2] * l[i + 2];
+      const float dot3 = l[i + 3] * l[i + 3];
+      result += dot0 + dot1 + dot2 + dot3;
+    }
+    /* Process last 0-3 items.  Not needed for standard vector lengths. */
+    for (; i < size; i++) {
+      result += l[i] * l[i];
+    }
+#endif
+#endif
+#endif
+    return result;
+  }
+
+  // Negated dot product of two float vectors, computed with 256-bit AVX
+  // unaligned loads. `size` is rounded up to a multiple of 8 and that many
+  // floats are read from each input; the trailing group of 8 (if any) is
+  // accumulated before the 16-wide main loop.
+  // NOTE(review): reading past `size` presumably relies on zero-padded
+  // inputs -- confirm against callers.
+  float AVXDistanceInnerProductFloat::compare(const float *a, const float *b,
+                                              uint32_t size) const {
+#define AVX_DOT(addr1, addr2, dest, tmp1, tmp2) \
+  tmp1 = _mm256_loadu_ps(addr1);                \
+  tmp2 = _mm256_loadu_ps(addr2);                \
+  tmp1 = _mm256_mul_ps(tmp1, tmp2);             \
+  dest = _mm256_add_ps(dest, tmp1);
+
+    // Zero-initialised scratch used both to seed the accumulator and to
+    // receive its eight lanes at the end.
+#ifndef _WINDOWS
+    float lanes[8] __attribute__((aligned(32))) = {0, 0, 0, 0, 0, 0, 0, 0};
+#else
+    __declspec(align(32)) float lanes[8] = {0, 0, 0, 0, 0, 0, 0, 0};
+#endif
+
+    const unsigned padded = (size + 7) & ~7U;
+    const unsigned tail = padded % 16;
+    const unsigned body = padded - tail;
+
+    const float *lhs = a;
+    const float *rhs = b;
+    const float *lhs_tail = lhs + body;
+    const float *rhs_tail = rhs + body;
+
+    __m256 acc = _mm256_loadu_ps(lanes);
+    __m256 lhs_lo, lhs_hi;
+    __m256 rhs_lo, rhs_hi;
+
+    if (tail != 0) {
+      AVX_DOT(lhs_tail, rhs_tail, acc, lhs_lo, rhs_lo);
+    }
+
+    unsigned done = 0;
+    while (done < body) {
+      AVX_DOT(lhs, rhs, acc, lhs_lo, rhs_lo);
+      AVX_DOT(lhs + 8, rhs + 8, acc, lhs_hi, rhs_hi);
+      done += 16;
+      lhs += 16;
+      rhs += 16;
+    }
+
+    _mm256_storeu_ps(lanes, acc);
+    const float dot = lanes[0] + lanes[1] + lanes[2] + lanes[3] + lanes[4] +
+                      lanes[5] + lanes[6] + lanes[7];
+
+    return -dot;
+  }
+
+  template DISKANN_DLLEXPORT class DistanceInnerProduct<float>;
+  template DISKANN_DLLEXPORT class DistanceInnerProduct<int8_t>;
+  template DISKANN_DLLEXPORT class DistanceInnerProduct<uint8_t>;
+
+  template DISKANN_DLLEXPORT class DistanceFastL2<float>;
+  template DISKANN_DLLEXPORT class DistanceFastL2<int8_t>;
+  template DISKANN_DLLEXPORT class DistanceFastL2<uint8_t>;
+
+}  // namespace diskann
--- a/src/dll/CMakeLists.txt
+++ b/src/dll/CMakeLists.txt
@ -2,7 +2,7 @@
 # Licensed under the MIT license.

 add_library(diskann_dll SHARED dllmain.cpp ../partition_and_pq.cpp ../pq_flash_index.cpp ../logger.cpp ../utils.cpp 
-	  ../windows_aligned_file_reader.cpp ../memory_mapper.cpp ../index.cpp ../math_utils.cpp ../aux_utils.cpp ../ann_exception.cpp)
+	  ../windows_aligned_file_reader.cpp ../distance.cpp ../memory_mapper.cpp ../index.cpp ../math_utils.cpp ../aux_utils.cpp ../ann_exception.cpp)
 if (MSVC)
 	add_definitions(-D_USRDLL -D_WINDLL -DDISKANN_DLL)
 	add_compile_options(/MD)
@ -11,8 +11,8 @@ if (MSVC)
 	target_link_options(diskann_dll PRIVATE $<$<CONFIG:Debug>:/IMPLIB:${CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG}/diskann_dll.lib>
 										$<$<CONFIG:Release>:/IMPLIB:${CMAKE_LIBRARY_OUTPUT_DIRECTORY_RELEASE}/diskann_dll.lib>
 					   )
-	target_link_libraries(diskann_dll debug  ${PROJECT_SOURCE_DIR}/dependencies/windows/tcmalloc/libtcmalloc_minimal.lib)
-	target_link_libraries(diskann_dll optimized ${PROJECT_SOURCE_DIR}/dependencies/windows/tcmalloc/libtcmalloc_minimal.lib)
+	target_link_libraries(diskann_dll debug  ${PROJECT_SOURCE_DIR}/gperftools/x64/Release-Patch/libtcmalloc_minimal.lib)
+	target_link_libraries(diskann_dll optimized ${PROJECT_SOURCE_DIR}/gperftools/x64/Release-Patch/libtcmalloc_minimal.lib)
 					

 	add_custom_command(TARGET 
@ -50,7 +50,7 @@ if (MSVC)
 	add_custom_command(TARGET 
 							diskann_dll 
 							POST_BUILD 
-							COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_SOURCE_DIR}/dependencies/windows/tcmalloc/libtcmalloc_minimal.dll "$<$<CONFIG:debug>:\"${CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG}\">$<$<CONFIG:release>:\"${CMAKE_LIBRARY_OUTPUT_DIRECTORY_RELEASE}\">" )
+							COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_SOURCE_DIR}/gperftools/x64/Release-Patch/libtcmalloc_minimal.dll "$<$<CONFIG:debug>:\"${CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG}\">$<$<CONFIG:release>:\"${CMAKE_LIBRARY_OUTPUT_DIRECTORY_RELEASE}\">" )

 endif()

--- a/src/index.cpp
+++ b/src/index.cpp
--- a/src/partition_and_pq.cpp
+++ b/src/partition_and_pq.cpp
@ -995,7 +995,7 @@ int partition_with_ram_budget(const std::string data_file,
                 sampling_rate);  // to account for the fact that p is the size
                                  // of the shard over the testing sample.
      double cur_shard_ram_estimate =
-          ESTIMATE_RAM_USAGE(p, train_dim, sizeof(T), graph_degree);
+          diskann::estimate_ram_usage(p, train_dim, sizeof(T), graph_degree);

      if (cur_shard_ram_estimate > max_ram_usage)
        max_ram_usage = cur_shard_ram_estimate;
--- a/src/pq_flash_index.cpp
+++ b/src/pq_flash_index.cpp
@ -97,7 +97,7 @@ namespace diskann {
    this->dist_cmp = new DistanceL2UInt8();
    if (Avx2SupportedCPU) {
      diskann::cout << "Using AVX2 dist_cmp_float function." << std::endl;
-      this->dist_cmp_float = new DistanceL2();
+      this->dist_cmp_float = new DistanceL2Float();
    } else if (AvxSupportedCPU) {
      diskann::cout << "Using AVX dist_cmp_float function" << std::endl;
      this->dist_cmp_float = new AVXDistanceL2Float();
@ -122,7 +122,7 @@ namespace diskann {
      diskann::cout << "Using AVX2 function for dist_cmp and dist_cmp_float"
                    << std::endl;
      this->dist_cmp = new DistanceL2Int8();
-      this->dist_cmp_float = new DistanceL2();
+      this->dist_cmp_float = new DistanceL2Float();
    } else if (AvxSupportedCPU) {
      diskann::cout << "No AVX2 support. Switching to AVX routines for "
                       "dist_cmp, dist_cmp_float."
@ -152,8 +152,8 @@ namespace diskann {
      if (Avx2SupportedCPU) {
        diskann::cout << "Using AVX2 functions for dist_cmp and dist_cmp_float"
                      << std::endl;
-        this->dist_cmp = new DistanceL2();
-        this->dist_cmp_float = new DistanceL2();
+        this->dist_cmp = new DistanceL2Float();
+        this->dist_cmp_float = new DistanceL2Float();
      } else if (AvxSupportedCPU) {
        diskann::cout << "No AVX2 support. Switching to AVX functions for "
                         "dist_cmp and dist_cmp_float."
@ -170,8 +170,8 @@ namespace diskann {
      }
    } else if (metric == diskann::Metric::INNER_PRODUCT) {
      std::cout << "Using inner product distance function" << std::endl;
-      this->dist_cmp = new DistanceInnerProduct<float>();
-      this->dist_cmp_float = new DistanceInnerProduct<float>();
+      //      this->dist_cmp = new DistanceInnerProduct<float>();
+      //      this->dist_cmp_float = new DistanceInnerProduct<float>();
    } else {
      std::cout << "Unsupported metric type. Reverting to float." << std::endl;
      this->dist_cmp = new AVXDistanceL2Float();
--- a/src/utils.cpp
+++ b/src/utils.cpp
@ -5,6 +5,9 @@

 #include <stdio.h>

+const uint32_t MAX_REQUEST_SIZE = 1024 * 1024 * 1024;  // 1 GiB (1024^3 bytes); was mislabeled 64MB -- TODO confirm intended limit
+const uint32_t MAX_SIMULTANEOUS_READ_REQUESTS = 128;
+
 #ifdef _WINDOWS
 #include <intrin.h>

@ -51,10 +54,167 @@ bool cpuHasAvx2Support() {
 }
 #endif

-#ifndef _WINDOWS
-bool AvxSupportedCPU = false;
-bool Avx2SupportedCPU = true;
-#else
+#ifdef _WINDOWS
 bool AvxSupportedCPU = cpuHasAvxSupport();
 bool Avx2SupportedCPU = cpuHasAvx2Support();
+#else
+bool Avx2SupportedCPU = true;
+bool AvxSupportedCPU = false;
 #endif
+
+namespace diskann {
+  // Builds the distance functor for float vectors.
+  //
+  // Returns a `new`-allocated Distance<float> (raw pointer; presumably owned
+  // by the caller -- confirm) chosen by metric:
+  //   L2            : AVX2, AVX or scalar implementation, depending on the
+  //                   Avx2SupportedCPU / AvxSupportedCPU flags
+  //   COSINE        : DistanceCosineFloat
+  //   INNER_PRODUCT : AVXDistanceInnerProductFloat
+  //   FAST_L2       : DistanceFastL2<float>
+  // Any other metric is logged and reported via diskann::ANNException.
+  template<>
+  diskann::Distance<float>* get_distance_function(diskann::Metric m) {
+    if (m == diskann::Metric::L2) {
+      if (Avx2SupportedCPU) {
+        diskann::cout << "L2: Using AVX2 distance computation" << std::endl;
+        return new diskann::DistanceL2Float();
+      }
+      if (AvxSupportedCPU) {
+        diskann::cout
+            << "L2: AVX2 not supported. Using AVX distance computation"
+            << std::endl;
+        return new diskann::AVXDistanceL2Float();
+      }
+      diskann::cout << "L2: Older CPU. Using slow distance computation"
+                    << std::endl;
+      return new diskann::SlowDistanceL2Float();
+    }
+
+    if (m == diskann::Metric::COSINE) {
+      diskann::cout << "Cosine: Using either AVX or AVX2 implementation"
+                    << std::endl;
+      return new diskann::DistanceCosineFloat();
+    }
+
+    if (m == diskann::Metric::INNER_PRODUCT) {
+      diskann::cout << "Inner product: Using AVX2 implementation" << std::endl;
+      return new diskann::AVXDistanceInnerProductFloat();
+    }
+
+    if (m == diskann::Metric::FAST_L2) {
+      return new diskann::DistanceFastL2<float>();
+    }
+
+    // No matching metric: log the message, then raise it as an exception.
+    std::stringstream stream;
+    stream << "Only L2, cosine, and inner product supported for floating "
+              "point vectors as of now. Email "
+              "{gopalsr, harshasi, rakri}@microsoft.com if you need support "
+              "for any other metric."
+           << std::endl;
+    diskann::cerr << stream.str() << std::endl;
+    throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__,
+                                __LINE__);
+  }
+
+  // Builds the distance functor for signed-byte vectors.
+  //
+  // Returns a `new`-allocated Distance<int8_t> (raw pointer; presumably owned
+  // by the caller -- confirm):
+  //   L2     : AVX2, AVX or scalar implementation, depending on the
+  //            Avx2SupportedCPU / AvxSupportedCPU flags
+  //   COSINE : DistanceCosineInt8
+  // Any other metric is logged and reported via diskann::ANNException.
+  template<>
+  diskann::Distance<int8_t>* get_distance_function(diskann::Metric m) {
+    if (m == diskann::Metric::L2) {
+      if (Avx2SupportedCPU) {
+        diskann::cout << "Using AVX2 distance computation" << std::endl;
+        return new diskann::DistanceL2Int8();
+      }
+      if (AvxSupportedCPU) {
+        diskann::cout << "AVX2 not supported. Using AVX distance computation"
+                      << std::endl;
+        return new diskann::AVXDistanceL2Int8();
+      }
+      diskann::cout << "Older CPU. Using slow distance computation"
+                    << std::endl;
+      return new diskann::SlowDistanceL2Int<int8_t>();
+    }
+
+    if (m == diskann::Metric::COSINE) {
+      diskann::cout << "Using either AVX or AVX2 for Cosine similarity"
+                    << std::endl;
+      return new diskann::DistanceCosineInt8();
+    }
+
+    // No matching metric: log the message, then raise it as an exception.
+    std::stringstream stream;
+    stream << "Only L2 and cosine supported for signed byte vectors as of "
+              "now. Email "
+              "{gopalsr, harshasi, rakri}@microsoft.com if you need support "
+              "for any other metric."
+           << std::endl;
+    diskann::cerr << stream.str() << std::endl;
+    throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__,
+                                __LINE__);
+  }
+
+  // Builds the distance functor for unsigned-byte vectors.
+  //
+  // Returns a `new`-allocated Distance<uint8_t> (raw pointer; presumably
+  // owned by the caller -- confirm):
+  //   L2     : DistanceL2UInt8 (a warning is printed first on _WINDOWS builds)
+  //   COSINE : SlowDistanceCosineUInt8
+  // Any other metric is logged and reported via diskann::ANNException.
+  template<>
+  diskann::Distance<uint8_t>* get_distance_function(diskann::Metric m) {
+    if (m == diskann::Metric::L2) {
+#ifdef _WINDOWS
+      diskann::cout
+          << "WARNING: AVX/AVX2 distance function not defined for Uint8. Using "
+             "slow version. "
+             "Contact gopalsr@microsoft.com if you need AVX/AVX2 support."
+          << std::endl;
+#endif
+      return new diskann::DistanceL2UInt8();
+    }
+
+    if (m == diskann::Metric::COSINE) {
+      diskann::cout
+          << "AVX/AVX2 distance function not defined for Uint8. Using "
+             "slow version. "
+             "Contact gopalsr@microsoft.com if you need AVX/AVX2 support."
+          << std::endl;
+      return new diskann::SlowDistanceCosineUInt8();
+    }
+
+    // No matching metric: log the message, then raise it as an exception.
+    std::stringstream stream;
+    stream << "Only L2 and cosine supported for unsigned byte vectors as of "
+              "now. Email "
+              "{gopalsr, harshasi, rakri}@microsoft.com if you need support "
+              "for any other metric."
+           << std::endl;
+    diskann::cerr << stream.str() << std::endl;
+    throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__,
+                                __LINE__);
+  }
+
+  // Reads npts * ndims floats from `readr` into `read_buf`, divides every
+  // ndims-long row by its L2 norm in place, and writes the buffer to `writr`.
+  // Rows are processed by an OpenMP parallel loop.
+  // `read_buf` must have room for npts * ndims floats. Neither stream is
+  // checked for errors here.
+  void block_convert(std::ofstream& writr, std::ifstream& readr,
+                     float* read_buf, _u64 npts, _u64 ndims) {
+    readr.read((char*) read_buf, npts * ndims * sizeof(float));
+    _u32 ndims_u32 = (_u32) ndims;
+#pragma omp parallel for
+    for (_s64 row = 0; row < (_s64) npts; row++) {
+      float* vec = read_buf + row * ndims;
+
+      // Starting from epsilon keeps the norm strictly positive, so the
+      // division below never divides by zero for an all-zero row.
+      float sq_norm = std::numeric_limits<float>::epsilon();
+      for (_u32 d = 0; d < ndims_u32; d++) {
+        sq_norm += vec[d] * vec[d];
+      }
+
+      const float row_norm = std::sqrt(sq_norm);
+      for (_u32 d = 0; d < ndims_u32; d++) {
+        vec[d] = vec[d] / row_norm;
+      }
+    }
+    writr.write((char*) read_buf, npts * ndims * sizeof(float));
+  }
+
+  // Copies the float dataset in `inFileName` to `outFileName`, passing the
+  // vectors through block_convert() one block at a time.
+  //
+  // File layout as read here: two 32-bit integers (number of points, number
+  // of dimensions) followed by npts * ndims floats. The header is written to
+  // the output unchanged; the payload is processed in blocks of at most
+  // 131072 points.
+  //
+  // NOTE(review): neither stream is checked for open/read/write failure, and
+  // the header counts are cast to unsigned without validation.
+  void normalize_data_file(const std::string& inFileName,
+                           const std::string& outFileName) {
+    std::ifstream readr(inFileName, std::ios::binary);
+    std::ofstream writr(outFileName, std::ios::binary);
+
+    int npts_s32, ndims_s32;
+    readr.read((char*) &npts_s32, sizeof(_s32));
+    readr.read((char*) &ndims_s32, sizeof(_s32));
+
+    writr.write((char*) &npts_s32, sizeof(_s32));
+    writr.write((char*) &ndims_s32, sizeof(_s32));
+
+    _u64 npts = (_u64) npts_s32, ndims = (_u64) ndims_s32;
+    diskann::cout << "Normalizing FLOAT vectors in file: " << inFileName
+                  << std::endl;
+    diskann::cout << "Dataset: #pts = " << npts << ", # dims = " << ndims
+                  << std::endl;
+
+    _u64 blk_size = 131072;
+    _u64 nblks = ROUND_UP(npts, blk_size) / blk_size;
+    diskann::cout << "# blks: " << nblks << std::endl;
+
+    // block_convert() is never asked to hold more than one block, so the
+    // scratch buffer is sized for a single block instead of the whole
+    // dataset (which could be far larger than available memory).
+    float* read_buf = new float[std::min(npts, blk_size) * ndims];
+    for (_u64 i = 0; i < nblks; i++) {
+      _u64 cblk_size = std::min(npts - i * blk_size, blk_size);
+      block_convert(writr, readr, read_buf, cblk_size, ndims);
+    }
+    delete[] read_buf;
+
+    diskann::cout << "Wrote normalized points to file: " << outFileName
+                  << std::endl;
+  }
+
+}  // namespace diskann
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@ -6,8 +6,8 @@ set(CMAKE_CXX_STANDARD 14)
 add_executable(build_memory_index build_memory_index.cpp )
 if(MSVC)
 	target_link_options(build_memory_index PRIVATE /MACHINE:x64 /DEBUG:FULL "/INCLUDE:_tcmalloc")
-	target_link_libraries(build_memory_index debug ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG}/diskann_dll.lib ${PROJECT_SOURCE_DIR}/dependencies/windows/tcmalloc/libtcmalloc_minimal.lib)
-	target_link_libraries(build_memory_index optimized ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_RELEASE}/diskann_dll.lib ${PROJECT_SOURCE_DIR}/dependencies/windows/tcmalloc/libtcmalloc_minimal.lib)
+	target_link_libraries(build_memory_index debug ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG}/diskann_dll.lib ${PROJECT_SOURCE_DIR}/gperftools/x64/Release-Patch/libtcmalloc_minimal.lib)
+	target_link_libraries(build_memory_index optimized ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_RELEASE}/diskann_dll.lib ${PROJECT_SOURCE_DIR}/gperftools/x64/Release-Patch/libtcmalloc_minimal.lib)
 else() 
 	target_link_libraries(build_memory_index ${PROJECT_NAME} -ltcmalloc)
 endif()
@ -15,8 +15,8 @@ endif()
 add_executable(search_memory_index search_memory_index.cpp )
 if(MSVC)
 	target_link_options(search_memory_index PRIVATE /MACHINE:x64 /DEBUG:FULL)
-	target_link_libraries(search_memory_index debug ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG}/diskann_dll.lib)
-	target_link_libraries(search_memory_index optimized ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_RELEASE}/diskann_dll.lib)
+	target_link_libraries(search_memory_index debug ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG}/diskann_dll.lib ${PROJECT_SOURCE_DIR}/gperftools/x64/Release-Patch/libtcmalloc_minimal.lib)
+	target_link_libraries(search_memory_index optimized ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_RELEASE}/diskann_dll.lib ${PROJECT_SOURCE_DIR}/gperftools/x64/Release-Patch/libtcmalloc_minimal.lib)
 else() 
 	target_link_libraries(search_memory_index ${PROJECT_NAME} aio -ltcmalloc)
 endif()
@ -24,8 +24,8 @@ endif()
 add_executable(build_disk_index build_disk_index.cpp )
 if(MSVC)
 	target_link_options(build_disk_index PRIVATE /MACHINE:x64 /DEBUG:FULL "/INCLUDE:_tcmalloc")
-	target_link_libraries(build_disk_index debug ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG}/diskann_dll.lib ${PROJECT_SOURCE_DIR}/dependencies/windows/tcmalloc/libtcmalloc_minimal.lib)
-	target_link_libraries(build_disk_index optimized ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_RELEASE}/diskann_dll.lib ${PROJECT_SOURCE_DIR}/dependencies/windows/tcmalloc/libtcmalloc_minimal.lib)
+	target_link_libraries(build_disk_index debug ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG}/diskann_dll.lib ${PROJECT_SOURCE_DIR}/gperftools/x64/Release-Patch/libtcmalloc_minimal.lib)
+	target_link_libraries(build_disk_index optimized ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_RELEASE}/diskann_dll.lib ${PROJECT_SOURCE_DIR}/gperftools/x64/Release-Patch/libtcmalloc_minimal.lib)
 else() 
 	target_link_libraries(build_disk_index ${PROJECT_NAME} -ltcmalloc aio)
 endif()
--- a/tests/build_memory_index.cpp
+++ b/tests/build_memory_index.cpp
@ -16,7 +16,7 @@
 #include "memory_mapper.h"
 #include "ann_exception.h"

-template<typename T>
+template<typename T, typename TagT = uint32_t>
 int build_in_memory_index(const std::string&     data_path,
                          const diskann::Metric& metric, const unsigned R,
                          const unsigned L, const float alpha,
@ -31,9 +31,14 @@ int build_in_memory_index(const std::string&     data_path,
  paras.Set<bool>("saturate_graph", 0);
  paras.Set<unsigned>("num_threads", num_threads);

-  diskann::Index<T> index(metric, data_path.c_str());
-  auto              s = std::chrono::high_resolution_clock::now();
-  index.build(paras);
+  _u64 data_num, data_dim;
+  diskann::get_bin_metadata(data_path, data_num, data_dim);
+
+  diskann::Index<T, TagT> index(metric, data_dim, data_num, false, false,
+                                false);  
+  auto                    s = std::chrono::high_resolution_clock::now();
+  index.build(data_path.c_str(), data_num, paras);
+
  std::chrono::duration<double> diff =
      std::chrono::high_resolution_clock::now() - s;

@ -56,13 +61,15 @@ int main(int argc, char** argv) {
  _u32 ctr = 2;

  diskann::Metric metric;
-  if (std::string(argv[ctr]) == std::string("mips"))
+  if (std::string(argv[ctr]) == std::string("mips")) {
    metric = diskann::Metric::INNER_PRODUCT;
-  else if (std::string(argv[ctr]) == std::string("l2"))
+  } else if (std::string(argv[ctr]) == std::string("l2")) {
    metric = diskann::Metric::L2;
-  else {
-    std::cerr << "Unsupported distance function. Currently only L2/ Inner "
-                 "Product support."
+  } else if (std::string(argv[ctr]) == std::string("cosine")) {
+    metric = diskann::Metric::COSINE;
+  }else {
+    std::cout << "Unsupported distance function. Currently only L2/ Inner "
+                 "Product/Cosine are supported."
              << std::endl;
    return -1;
  }
--- a/tests/search_disk_index.cpp
+++ b/tests/search_disk_index.cpp
@ -309,14 +309,14 @@ int search_disk_index(int argc, char** argv) {

 int main(int argc, char** argv) {
  if (argc < 12) {
-    std::cerr << "Usage: " << argv[0]
+    std::cout << "Usage: " << argv[0]
              << "   index_type<float/int8/uint8>   dist_fn<l2/mips>   "
                 "index_prefix_path   num_nodes_to_cache   "
                 "T(num_threads)   W(beamwidth)   "
                 "query_file.bin   truthset.bin(\"null\" for none)   "
                 "K   result_output_prefix   L1   L2 ..."
              << std::endl;
-    exit(-1);
+    return -1;
  }
  try {
    if (std::string(argv[1]) == std::string("float"))
--- a/tests/search_memory_index.cpp
+++ b/tests/search_memory_index.cpp
@ -36,9 +36,11 @@ int search_memory_index(int argc, char** argv) {
    metric = diskann::Metric::L2;
  else if (std::string(argv[ctr]) == std::string("fast_l2"))
    metric = diskann::Metric::FAST_L2;
+  else if (std::string(argv[ctr]) == std::string("cosine"))
+    metric = diskann::Metric::COSINE;
  else {
    std::cout << "Unsupported distance function. Currently only L2/ Inner "
-                 "Product/FAST_L2 support."
+                 "Product/FAST_L2/Cosine support."
              << std::endl;
    return -1;
  }
@ -55,6 +57,7 @@ int search_memory_index(int argc, char** argv) {

  std::string data_file(argv[ctr++]);
  std::string memory_index_file(argv[ctr++]);
+  _u64        max_points = std::atoi(argv[ctr++]);
  _u64        num_threads = std::atoi(argv[ctr++]);
  std::string query_bin(argv[ctr++]);
  std::string truthset_bin(argv[ctr++]);
@ -64,10 +67,13 @@ int search_memory_index(int argc, char** argv) {

  bool calc_recall_flag = false;

+  _u32 max_search_L = 0;
  for (; ctr < (_u32) argc; ctr++) {
    _u64 curL = std::atoi(argv[ctr]);
-    if (curL >= recall_at)
+    if (curL >= recall_at) {
      Lvec.push_back(curL);
+      max_search_L = max_search_L > curL ? max_search_L : curL;
+    }
  }

  if (Lvec.size() == 0) {
@ -91,8 +97,11 @@ int search_memory_index(int argc, char** argv) {
  std::cout.setf(std::ios_base::fixed, std::ios_base::floatfield);
  std::cout.precision(2);

-  diskann::Index<T> index(metric, data_file.c_str());
-  index.load(memory_index_file.c_str());  // to load NSG
+  diskann::Index<T, uint32_t> index(metric, query_dim, max_points, false,
+                                    false);
+
+  index.load(memory_index_file.c_str(), num_threads,
+             max_search_L);
  std::cout << "Index loaded" << std::endl;

  if (metric == diskann::FAST_L2)
@ -172,17 +181,17 @@ int search_memory_index(int argc, char** argv) {
 }

 int main(int argc, char** argv) {
-  if (argc < 11) {
+  if (argc < 12) {
    std::cout << "Usage: " << argv[0]
              << "   index_type<float/int8/uint8>   "
                 "dist_fn<l2/mips/fast_l2>   "
-                 "data_file.bin   memory_index_path   "
+                 "data_file.bin   memory_index_path  max_points  "
                 "T(num_threads)   query_file.bin   "
                 "truthset.bin(\"null\" for none)   "
                 "K   result_output_prefix   "
                 "L1   L2 ... \n"
              << std::endl;
-    exit(-1);
+    return -1;
  }

  try {
--- a/tests/utils/int8_to_float_scale.cpp
+++ b/tests/utils/int8_to_float_scale.cpp
@ -12,7 +12,7 @@ void block_convert(std::ofstream& writer, float* write_buf,
  for (_u64 i = 0; i < npts; i++) {
    for (_u64 d = 0; d < ndims; d++) {
      write_buf[d + i * ndims] =
-          (((float)read_buf[d + i * ndims] - bias) * scale);
+          (((float) read_buf[d + i * ndims] - bias) * scale);
    }
  }
  writer.write((char*) write_buf, npts * ndims * sizeof(float));
@ -20,8 +20,8 @@ void block_convert(std::ofstream& writer, float* write_buf,

 int main(int argc, char** argv) {
  if (argc != 5) {
-    std::cout << "Usage: " << argv[0] << "  input-int8.bin  output-float.bin  bias  scale"
-              << std::endl;
+    std::cout << "Usage: " << argv[0]
+              << "  input-int8.bin  output-float.bin  bias  scale" << std::endl;
    exit(-1);
  }

--- a/tests/utils/rand_data_gen.cpp
+++ b/tests/utils/rand_data_gen.cpp
@ -8,12 +8,12 @@
 #include "utils.h"

 int block_write_float(std::ofstream& writer, _u64 ndims, _u64 npts,
-                       float norm) {
+                      float norm) {
  auto vec = new float[ndims];

  std::random_device         rd{};
  std::mt19937               gen{rd()};
-  std::normal_distribution<> normal_rand{0,1};
+  std::normal_distribution<> normal_rand{0, 1};

  for (_u64 i = 0; i < npts; i++) {
    float sum = 0;
@ -23,7 +23,7 @@ int block_write_float(std::ofstream& writer, _u64 ndims, _u64 npts,
      sum += vec[d] * vec[d];
    for (_u64 d = 0; d < ndims; ++d)
      vec[d] = vec[d] * norm / std::sqrt(sum);
-    
+
    writer.write((char*) vec, ndims * sizeof(float));
  }

@ -31,8 +31,7 @@ int block_write_float(std::ofstream& writer, _u64 ndims, _u64 npts,
  return 0;
 }

-int block_write_int8(std::ofstream& writer, _u64 ndims, _u64 npts,
-                       float norm) {
+int block_write_int8(std::ofstream& writer, _u64 ndims, _u64 npts, float norm) {
  auto vec = new float[ndims];
  auto vec_T = new int8_t[ndims];

@ -48,11 +47,11 @@ int block_write_int8(std::ofstream& writer, _u64 ndims, _u64 npts,
      sum += vec[d] * vec[d];
    for (_u64 d = 0; d < ndims; ++d)
      vec[d] = vec[d] * norm / std::sqrt(sum);
-    
+
    for (_u64 d = 0; d < ndims; ++d) {
      vec_T[d] = std::round<int>(vec[d]);
    }
-    
+
    writer.write((char*) vec_T, ndims * sizeof(int8_t));
  }

@ -62,7 +61,7 @@ int block_write_int8(std::ofstream& writer, _u64 ndims, _u64 npts,
 }

 int block_write_uint8(std::ofstream& writer, _u64 ndims, _u64 npts,
-                       float norm) {
+                      float norm) {
  auto vec = new float[ndims];
  auto vec_T = new int8_t[ndims];

@ -78,7 +77,7 @@ int block_write_uint8(std::ofstream& writer, _u64 ndims, _u64 npts,
      sum += vec[d] * vec[d];
    for (_u64 d = 0; d < ndims; ++d)
      vec[d] = vec[d] * norm / std::sqrt(sum);
-    
+
    for (_u64 d = 0; d < ndims; ++d) {
      vec_T[d] = 128 + std::round<int>(vec[d]);
    }
@ -93,8 +92,7 @@ int block_write_uint8(std::ofstream& writer, _u64 ndims, _u64 npts,

 int main(int argc, char** argv) {
  if (argc != 6) {
-    std::cout << argv[0]
-              << " <float/int8/uint8> ndims npts norm output.bin"
+    std::cout << argv[0] << " <float/int8/uint8> ndims npts norm output.bin"
              << std::endl;
    exit(-1);
  }
@ -115,13 +113,15 @@ int main(int argc, char** argv) {
    return -1;
  }

-  if ((std::string(argv[1]) == std::string("int8"))
-   || (std::string(argv[1]) == std::string("uint8"))) {
-     if (norm > 127) {
-       std::cerr << "Error: for in8/uint8 datatypes, L2 norm can not be greater than 127" << std::endl;
-       return -1;
-     }
-   }
+  if ((std::string(argv[1]) == std::string("int8")) ||
+      (std::string(argv[1]) == std::string("uint8"))) {
+    if (norm > 127) {
+      std::cerr << "Error: for in8/uint8 datatypes, L2 norm can not be greater "
+                   "than 127"
+                << std::endl;
+      return -1;
+    }
+  }

  std::ofstream writer(argv[5], std::ios::binary);
  auto          npts_s32 = (_u32) npts;
@ -141,13 +141,13 @@ int main(int argc, char** argv) {
    } else if (std::string(argv[1]) == std::string("int8")) {
      ret = block_write_int8(writer, ndims, cblk_size, norm);
    } else if (std::string(argv[1]) == std::string("uint8")) {
-      ret =  block_write_uint8(writer, ndims, cblk_size, norm);
+      ret = block_write_uint8(writer, ndims, cblk_size, norm);
    }
    if (ret == 0)
      std::cout << "Block #" << i << " written" << std::endl;
    else {
      writer.close();
-      std::cout << "failed to write" <<std::endl;
+      std::cout << "failed to write" << std::endl;
      return -1;
    }
  }
--- a/tests/utils/tsv_to_bin.cpp
+++ b/tests/utils/tsv_to_bin.cpp
@ -4,12 +4,12 @@
 #include <iostream>
 #include "utils.h"

-void block_convert_float(std::ifstream& reader, std::ofstream& writer, _u64 npts,
-                   _u64 ndims) {
+void block_convert_float(std::ifstream& reader, std::ofstream& writer,
+                         _u64 npts, _u64 ndims) {
  auto read_buf = new float[npts * (ndims + 1)];

-  auto cursor = read_buf;
-  float    val;
+  auto  cursor = read_buf;
+  float val;

  for (_u64 i = 0; i < npts; i++) {
    for (_u64 d = 0; d < ndims; ++d) {
@ -23,16 +23,16 @@ void block_convert_float(std::ifstream& reader, std::ofstream& writer, _u64 npts
 }

 void block_convert_int8(std::ifstream& reader, std::ofstream& writer, _u64 npts,
-                   _u64 ndims) {
+                        _u64 ndims) {
  auto read_buf = new int8_t[npts * (ndims + 1)];

  auto cursor = read_buf;
-  int    val;
+  int  val;

  for (_u64 i = 0; i < npts; i++) {
    for (_u64 d = 0; d < ndims; ++d) {
      reader >> val;
-      *cursor = (int8_t)val;
+      *cursor = (int8_t) val;
      cursor++;
    }
  }
@ -40,12 +40,12 @@ void block_convert_int8(std::ifstream& reader, std::ofstream& writer, _u64 npts,
  delete[] read_buf;
 }

-void block_convert_uint8(std::ifstream& reader, std::ofstream& writer, _u64 npts,
-                   _u64 ndims) {
+void block_convert_uint8(std::ifstream& reader, std::ofstream& writer,
+                         _u64 npts, _u64 ndims) {
  auto read_buf = new uint8_t[npts * (ndims + 1)];

  auto cursor = read_buf;
-  int    val;
+  int  val;

  for (_u64 i = 0; i < npts; i++) {
    for (_u64 d = 0; d < ndims; ++d) {
@ -58,7 +58,6 @@ void block_convert_uint8(std::ifstream& reader, std::ofstream& writer, _u64 npts
  delete[] read_buf;
 }

-
 int main(int argc, char** argv) {
  if (argc != 6) {
    std::cout << argv[0]
--- a/tests/utils/vector_analysis.cpp
+++ b/tests/utils/vector_analysis.cpp
@ -127,12 +127,12 @@ int aux_main(char** argv) {

 int main(int argc, char** argv) {
  if (argc < 4) {
-    std::cout
-        << argv[0]
-        << " data_type [float/int8/uint8] base_bin_file "
-           "[option: 1-norm analysis, 2-prep_base_for_mip, "
-           "3-prep_query_for_mip, 4-normalize-vecs] [out_file for options 2/3/4]"
-        << std::endl;
+    std::cout << argv[0]
+              << " data_type [float/int8/uint8] base_bin_file "
+                 "[option: 1-norm analysis, 2-prep_base_for_mip, "
+                 "3-prep_query_for_mip, 4-normalize-vecs] [out_file for "
+                 "options 2/3/4]"
+              << std::endl;
    exit(-1);
  }
				`@ -0,0 +1 @@`
				`Subproject commit fe85bbdf4cb891a67a8e2109c1c22a33aa958c7e`