Update for vamana (bulk and fresh) from kann-experiments.

Added support for cosine similarity

CMake files for build on Windows

Added gperftools as submodule and patches to gperftools

Changed cmake to use libtcmalloc from gperftools
This commit is contained in:
ravishankar 2022-04-12 00:19:41 -07:00 коммит произвёл Harsha Vardhan Simhadri
Родитель fd0b59dde3
Коммит bb5c124853
27 изменённых файлов: 5720 добавлений и 1785 удалений

3
.gitmodules поставляемый Normal file
Просмотреть файл

@ -0,0 +1,3 @@
[submodule "gperftools"]
path = gperftools
url = https://github.com/gperftools/gperftools.git

Просмотреть файл

@ -18,7 +18,7 @@ else()
endif()
project(diskann)
include_directories(${PROJECT_SOURCE_DIR}/include ${PROJECT_SOURCE_DIR}/include/dll)
include_directories(${PROJECT_SOURCE_DIR}/include ${PROJECT_SOURCE_DIR}/include/dll ${PROJECT_SOURCE_DIR}/gperftools/src)
#OpenMP
find_package(OpenMP)

1
gperftools Submodule

@ -0,0 +1 @@
Subproject commit fe85bbdf4cb891a67a8e2109c1c22a33aa958c7e

Просмотреть файл

@ -27,12 +27,16 @@ typedef int FileHandle;
#include "cached_io.h"
#include "common_includes.h"
#include "tsl/robin_set.h"
#include "utils.h"
#include "windows_customizations.h"
#include "gperftools/malloc_extension.h"
namespace diskann {
const size_t TRAINING_SET_SIZE = 100000;
const size_t MAX_PQ_TRAINING_SET_SIZE = 256000;
const size_t MAX_SAMPLE_POINTS_FOR_WARMUP = 1000000;
const double PQ_TRAINING_SET_FRACTION = 0.1;
const double SPACE_FOR_CACHED_NODES_IN_GB = 0.25;
const double THRESHOLD_FOR_CACHING_IN_GB = 1.0;
const uint32_t NUM_NODES_TO_CACHE = 250000;
@ -42,10 +46,24 @@ namespace diskann {
template<typename T>
class PQFlashIndex;
DISKANN_DLLEXPORT double get_memory_budget(const std::string &mem_budget_str);
DISKANN_DLLEXPORT double get_memory_budget(double search_ram_budget_in_gb);
DISKANN_DLLEXPORT void add_new_file_to_single_index(std::string index_file,
std::string new_file);
DISKANN_DLLEXPORT size_t calculate_num_pq_chunks(double final_index_ram_limit,
size_t points_num,
uint32_t dim);
DISKANN_DLLEXPORT double calculate_recall(
unsigned num_queries, unsigned *gold_std, float *gs_dist, unsigned dim_gs,
unsigned *our_results, unsigned dim_or, unsigned recall_at);
DISKANN_DLLEXPORT double calculate_recall(
unsigned num_queries, unsigned *gold_std, float *gs_dist, unsigned dim_gs,
unsigned *our_results, unsigned dim_or, unsigned recall_at,
const tsl::robin_set<unsigned> &active_tags);
DISKANN_DLLEXPORT double calculate_range_search_recall(
unsigned num_queries, std::vector<std::vector<_u32>> &groundtruth,
std::vector<std::vector<_u32>> &our_results);
@ -74,6 +92,11 @@ namespace diskann {
const std::string &output_vamana,
const std::string &medoids_file);
template<typename T>
DISKANN_DLLEXPORT std::string preprocess_base_file(
const std::string &infile, const std::string &indexPrefix,
diskann::Metric &distMetric);
template<typename T>
DISKANN_DLLEXPORT int build_merged_vamana_index(
std::string base_file, diskann::Metric _compareMetric, unsigned L,

Просмотреть файл

@ -2,17 +2,28 @@
// Licensed under the MIT license.
#pragma once
#include <immintrin.h>
#include <smmintrin.h>
#include <tmmintrin.h>
#include <cmath>
#include <cstdint>
#include <cstdlib>
#include <vector>
#include <limits>
#include <algorithm>
#include <stdexcept>
#include "simd_utils.h"
extern bool Avx2SupportedCPU;
namespace diskann {
template<typename T>
inline float compute_l2_norm(const T* vector, uint64_t ndims) {
float norm = 0.0f;
float norm = std::numeric_limits<float>::epsilon();
for (uint64_t i = 0; i < ndims; i++) {
norm += vector[i] * vector[i];
norm += (float) (vector[i] * vector[i]);
}
return std::sqrt(norm);
}
@ -24,7 +35,7 @@ namespace diskann {
float right_norm = compute_l2_norm<T>(right, ndims);
float dot = 0.0f;
for (uint64_t i = 0; i < ndims; i++) {
dot += left[i] * right[i];
dot += (float) (left[i] * right[i]);
}
float cos_sim = dot / (left_norm * right_norm);
return cos_sim;
@ -37,10 +48,266 @@ namespace diskann {
cos_dists.reserve(npts);
for (size_t i = 0; i < npts; i++) {
const float* point = all_data + (size_t)(indices[i]) * (size_t)(ndims);
const float* point = all_data + (size_t) (indices[i]) * (size_t) (ndims);
cos_dists.push_back(
compute_cosine_similarity<float>(point, query, ndims));
}
return cos_dists;
}
} // namespace diskann
#ifdef _WINDOWS
// SIMD implementation of Cosine similarity. Taken from hnsw library.
/**
* Non-metric Space Library
*
* Authors: Bilegsaikhan Naidan (https://github.com/bileg), Leonid Boytsov
* (http://boytsov.info). With contributions from Lawrence Cayton
* (http://lcayton.com/) and others.
*
* For the complete list of contributors and further details see:
* https://github.com/searchivarius/NonMetricSpaceLib
*
* Copyright (c) 2014
*
* This code is released under the
* Apache License Version 2.0 http://www.apache.org/licenses/.
*
*/
namespace diskann {
using namespace std;
#define PORTABLE_ALIGN16 __declspec(align(16))
/**
 * Cosine of the angle between two int8 vectors, clamped to [-1, 1].
 *
 * Uses 256-bit accumulation when Avx2SupportedCPU is set and 128-bit
 * accumulation otherwise. Whole SIMD blocks (32 or 16 elements) are handled by
 * the vector loops; whatever is left over is accumulated with plain scalar
 * code, so `qty` does not have to be a multiple of 4 and the input pointers do
 * not have to be aligned.
 *
 * @param pVect1 first vector, `qty` elements
 * @param pVect2 second vector, `qty` elements
 * @param qty    number of elements in each vector
 * @return cosine similarity in [-1, 1]. In the 128-bit path a (near-)zero norm
 *         yields 1 when both vectors are degenerate and 0 when only one is;
 *         the 256-bit path instead clamps the denominator away from zero.
 */
static float NormScalarProductSIMD2(const int8_t* pVect1,
                                    const int8_t* pVect2, uint32_t qty) {
  // Scalar accumulators for the elements that do not fill a SIMD block.
  // The previous tail loops decremented the unsigned counter by 4 while
  // testing `qty > 0` (wrapping for counts not divisible by 4) and issued
  // 16-byte aligned loads from pointers advanced by only 4 bytes.
  float tailDot = 0.0f;
  float tailLen1 = 0.0f;
  float tailLen2 = 0.0f;
  auto accumulateTail = [&]() {
    for (; qty > 0; --qty, ++pVect1, ++pVect2) {
      const float x = static_cast<float>(*pVect1);
      const float y = static_cast<float>(*pVect2);
      tailDot += x * y;
      tailLen1 += x * x;
      tailLen2 += y * y;
    }
  };

  if (Avx2SupportedCPU) {
    __m256 cos, p1Len, p2Len;
    cos = p1Len = p2Len = _mm256_setzero_ps();
    while (qty >= 32) {
      // Unaligned loads are valid for aligned data too, so callers are not
      // forced to provide 32-byte aligned buffers.
      const __m256i rx = _mm256_loadu_si256((const __m256i*) pVect1);
      const __m256i ry = _mm256_loadu_si256((const __m256i*) pVect2);
      cos = _mm256_add_ps(cos, _mm256_mul_epi8(rx, ry));
      p1Len = _mm256_add_ps(p1Len, _mm256_mul_epi8(rx, rx));
      p2Len = _mm256_add_ps(p2Len, _mm256_mul_epi8(ry, ry));
      pVect1 += 32;
      pVect2 += 32;
      qty -= 32;
    }
    accumulateTail();

    // After the double hadd, element 0 holds the sum of the low 128-bit lane
    // and element 4 the sum of the high lane.
    cos = _mm256_hadd_ps(_mm256_hadd_ps(cos, cos), cos);
    p1Len = _mm256_hadd_ps(_mm256_hadd_ps(p1Len, p1Len), p1Len);
    p2Len = _mm256_hadd_ps(_mm256_hadd_ps(p2Len, p2Len), p2Len);

    const float dot = cos.m256_f32[0] + cos.m256_f32[4] + tailDot;
    const float len1 = p1Len.m256_f32[0] + p1Len.m256_f32[4] + tailLen1;
    const float len2 = p2Len.m256_f32[0] + p2Len.m256_f32[4] + tailLen2;

    // Keep the denominator strictly positive so that we never produce NaN.
    const float denominator =
        std::max(std::numeric_limits<float>::min() * 2,
                 std::sqrt(len1) * std::sqrt(len2));
    const float cosine = dot / denominator;
    return std::max(float(-1), std::min(float(1), cosine));
  }

  __m128 cos, p1Len, p2Len;
  cos = p1Len = p2Len = _mm_setzero_ps();
  while (qty >= 16) {
    const __m128i rx = _mm_loadu_si128((const __m128i*) pVect1);
    const __m128i ry = _mm_loadu_si128((const __m128i*) pVect2);
    cos = _mm_add_ps(cos, _mm_mul_epi8(rx, ry));
    p1Len = _mm_add_ps(p1Len, _mm_mul_epi8(rx, rx));
    p2Len = _mm_add_ps(p2Len, _mm_mul_epi8(ry, ry));
    pVect1 += 16;
    pVect2 += 16;
    qty -= 16;
  }
  accumulateTail();

  cos = _mm_hadd_ps(_mm_hadd_ps(cos, cos), cos);
  p1Len = _mm_hadd_ps(_mm_hadd_ps(p1Len, p1Len), p1Len);
  p2Len = _mm_hadd_ps(_mm_hadd_ps(p2Len, p2Len), p2Len);

  const float dot = cos.m128_f32[0] + tailDot;
  const float norm1 = p1Len.m128_f32[0] + tailLen1;
  const float norm2 = p2Len.m128_f32[0] + tailLen2;

  static const float eps = std::numeric_limits<float>::min() * 2;
  /*
   * This shouldn't normally happen for this space, but if it does, we don't
   * want to get NANs. Both norms are checked: dividing by a zero second norm
   * used to yield NaN, which the clamp below silently turned into 1.
   */
  if (norm1 < eps || norm2 < eps) {
    return (norm1 < eps && norm2 < eps) ? 1.0f : 0.0f;
  }
  /*
   * Sometimes due to rounding errors, we get values > 1 or < -1.
   * This throws off other functions that use scalar product, e.g., acos
   */
  return std::max(
      float(-1),
      std::min(float(1), dot / std::sqrt(norm1) / std::sqrt(norm2)));
}
/**
 * Cosine of the angle between two float vectors, clamped to [-1, 1].
 *
 * Accumulates the dot product and both squared norms with 256-bit registers
 * when Avx2SupportedCPU is set, and with 128-bit registers otherwise. In both
 * paths the elements that do not fill a whole register are accumulated with
 * scalar code (the 128-bit path previously ignored them).
 *
 * @param pVect1 first vector, `qty` elements (no alignment requirement)
 * @param pVect2 second vector, `qty` elements (no alignment requirement)
 * @param qty    number of elements in each vector
 * @return cosine similarity in [-1, 1]; 1 when both norms are (near) zero,
 *         0 when exactly one of them is.
 */
static float NormScalarProductSIMD(const float* pVect1, const float* pVect2,
                                   uint32_t qty) {
  static const float eps = std::numeric_limits<float>::min() * 2;
  const float* const pEnd = pVect1 + qty;

  float sum = 0.0f;
  float norm1 = 0.0f;
  float norm2 = 0.0f;

  // Didn't get significant performance gain compared with 128bit version.
  if (Avx2SupportedCPU) {
    const float* const pEndVec = pVect1 + 8 * (qty / 8);
    __m256 sum_prod = _mm256_setzero_ps();
    __m256 sum_square1 = sum_prod;
    __m256 sum_square2 = sum_prod;

    while (pVect1 < pEndVec) {
      const __m256 v1 = _mm256_loadu_ps(pVect1);
      const __m256 v2 = _mm256_loadu_ps(pVect2);
      pVect1 += 8;
      pVect2 += 8;
      sum_prod = _mm256_add_ps(sum_prod, _mm256_mul_ps(v1, v2));
      sum_square1 = _mm256_add_ps(sum_square1, _mm256_mul_ps(v1, v1));
      sum_square2 = _mm256_add_ps(sum_square2, _mm256_mul_ps(v2, v2));
    }

    // Unaligned stores: the aligned _mm256_store_ps needs 32-byte alignment,
    // which the former 16-byte aligned scratch arrays did not guarantee.
    float lanesProd[8];
    float lanesSquare1[8];
    float lanesSquare2[8];
    _mm256_storeu_ps(lanesProd, sum_prod);
    _mm256_storeu_ps(lanesSquare1, sum_square1);
    _mm256_storeu_ps(lanesSquare2, sum_square2);
    for (uint32_t i = 0; i < 8; ++i) {
      sum += lanesProd[i];
      norm1 += lanesSquare1[i];
      norm2 += lanesSquare2[i];
    }
  } else {
    const float* const pEndVec = pVect1 + 4 * (qty / 4);
    __m128 sum_prod = _mm_setzero_ps();
    __m128 sum_square1 = sum_prod;
    __m128 sum_square2 = sum_prod;

    while (pVect1 < pEndVec) {
      const __m128 v1 = _mm_loadu_ps(pVect1);
      const __m128 v2 = _mm_loadu_ps(pVect2);
      pVect1 += 4;
      pVect2 += 4;
      sum_prod = _mm_add_ps(sum_prod, _mm_mul_ps(v1, v2));
      sum_square1 = _mm_add_ps(sum_square1, _mm_mul_ps(v1, v1));
      sum_square2 = _mm_add_ps(sum_square2, _mm_mul_ps(v2, v2));
    }

    float lanesProd[4];
    float lanesSquare1[4];
    float lanesSquare2[4];
    _mm_storeu_ps(lanesProd, sum_prod);
    _mm_storeu_ps(lanesSquare1, sum_square1);
    _mm_storeu_ps(lanesSquare2, sum_square2);
    for (uint32_t i = 0; i < 4; ++i) {
      sum += lanesProd[i];
      norm1 += lanesSquare1[i];
      norm2 += lanesSquare2[i];
    }
  }

  // Scalar tail shared by both paths.
  for (; pVect1 < pEnd; ++pVect1, ++pVect2) {
    sum += (*pVect1) * (*pVect2);
    norm1 += (*pVect1) * (*pVect1);
    norm2 += (*pVect2) * (*pVect2);
  }

  // Guard both norms: a zero second norm used to divide 0 by 0, and the NaN
  // was then clamped to 1, making the result depend on argument order.
  if (norm1 < eps || norm2 < eps) {
    return (norm1 < eps && norm2 < eps) ? 1.0f : 0.0f;
  }
  // Rounding can push the quotient slightly outside [-1, 1]; clamp it.
  return std::max(
      float(-1),
      std::min(float(1), sum / std::sqrt(norm1) / std::sqrt(norm2)));
}
// Float overload of NormScalarProductSIMD2: simply forwards to
// NormScalarProductSIMD so that templated callers can use one name for both
// element types.
static float NormScalarProductSIMD2(const float* pVect1, const float* pVect2,
                                    uint32_t qty) {
  const float cosine = NormScalarProductSIMD(pVect1, pVect2, qty);
  return cosine;
}
// Returns 1 minus the value reported by NormScalarProductSIMD2 for the two
// vectors, floored at zero (so despite the name, smaller means more similar).
template<class T>
static float CosineSimilarity2(const T* p1, const T* p2, uint32_t qty) {
  const float distance = 1.0f - NormScalarProductSIMD2(p1, p2, qty);
  // Same selection rule as std::max(0.0f, distance).
  return (0.0f < distance) ? distance : 0.0f;
}
// static template float CosineSimilarity2<__int8>(const __int8* pVect1,
// const __int8* pVect2, size_t qty);
// static template float CosineSimilarity2<float>(const float* pVect1,
// const float* pVect2, size_t qty);
/**
 * Scales `pVector` in place so that its Euclidean norm becomes 1.
 *
 * @param pVector vector to normalize, `qty` elements
 * @param qty     number of elements
 *
 * An all-zero vector is left untouched: it has no direction, and the former
 * code multiplied it by 1/sqrt(0) = inf, filling it with NaNs.
 */
template<class T>
static void CosineSimilarityNormalize(T* pVector, uint32_t qty) {
  T sumSquares = 0;
  for (uint32_t i = 0; i < qty; ++i) {
    sumSquares += pVector[i] * pVector[i];
  }
  if (sumSquares == 0) {
    return;  // nothing to scale, and dividing by zero would produce NaNs
  }

  T scale = 1 / std::sqrt(sumSquares);
  if (scale == 0) {
    // The sum of squares overflowed to infinity; fall back to the smallest
    // positive normal value instead of zeroing the whole vector.
    scale = std::numeric_limits<T>::min();
  }
  for (uint32_t i = 0; i < qty; ++i) {
    pVector[i] *= scale;
  }
}
// template static void CosineSimilarityNormalize<float>(float* pVector,
// size_t qty);
// template static void CosineSimilarityNormalize<double>(double* pVector,
// size_t qty);
template<>
void CosineSimilarityNormalize(__int8* pVector, uint32_t qty) {
throw std::runtime_error(
"For int8 type vector, you can not use cosine distance!");
}
template<>
void CosineSimilarityNormalize(__int16* pVector, uint32_t qty) {
throw std::runtime_error(
"For int16 type vector, you can not use cosine distance!");
}
template<>
void CosineSimilarityNormalize(int* pVector, uint32_t qty) {
throw std::runtime_error(
"For int type vector, you can not use cosine distance!");
}
} // namespace diskann
#endif

Просмотреть файл

@ -1,443 +1,65 @@
#pragma once
#include <utils.h>
#ifdef _WINDOWS
#include <immintrin.h>
#include <smmintrin.h>
#include <tmmintrin.h>
#include <intrin.h>
#else
#include <immintrin.h>
#endif
#include <cosine_similarity.h>
#include <iostream>
namespace {
// Squares the upper eight signed bytes of X and returns the sums of adjacent
// pairs as four floats: { x8^2+x9^2, x10^2+x11^2, x12^2+x13^2, x14^2+x15^2 }.
static inline __m128 _mm_mulhi_epi8(__m128i X) {
  // 0xFF for every negative byte; interleaving with it sign-extends to 16 bit.
  const __m128i sign_mask = _mm_cmplt_epi8(X, _mm_setzero_si128());
  const __m128i widened_hi = _mm_unpackhi_epi8(X, sign_mask);
  const __m128i pair_squares = _mm_madd_epi16(widened_hi, widened_hi);
  return _mm_cvtepi32_ps(pair_squares);
}
// Squares the top four signed bytes of X (bytes 12..15) and returns
// { x12^2+x13^2, x14^2+x15^2, 0, 0 } as floats.
static inline __m128 _mm_mulhi_epi8_shift32(__m128i X) {
  // Shifting each 64-bit half right by 32 bits moves bytes 12..15 down to
  // positions 8..11 and clears positions 12..15.
  const __m128i shifted = _mm_srli_epi64(X, 32);
  const __m128i sign_mask = _mm_cmplt_epi8(shifted, _mm_setzero_si128());
  const __m128i widened_hi = _mm_unpackhi_epi8(shifted, sign_mask);
  const __m128i pair_squares = _mm_madd_epi16(widened_hi, widened_hi);
  return _mm_cvtepi32_ps(pair_squares);
}
// Multiplies the sixteen signed bytes of X and Y element-wise and folds the
// products into four floats; lane k holds
//   x[2k]*y[2k] + x[2k+1]*y[2k+1] + x[8+2k]*y[8+2k] + x[9+2k]*y[9+2k].
static inline __m128 _mm_mul_epi8(__m128i X, __m128i Y) {
  const __m128i zeros = _mm_setzero_si128();
  // 0xFF for every negative byte; interleaving with it sign-extends to 16 bit.
  const __m128i x_negative = _mm_cmplt_epi8(X, zeros);
  const __m128i y_negative = _mm_cmplt_epi8(Y, zeros);

  const __m128i lo_products = _mm_madd_epi16(
      _mm_unpacklo_epi8(X, x_negative), _mm_unpacklo_epi8(Y, y_negative));
  const __m128i hi_products = _mm_madd_epi16(
      _mm_unpackhi_epi8(X, x_negative), _mm_unpackhi_epi8(Y, y_negative));

  return _mm_cvtepi32_ps(_mm_add_epi32(lo_products, hi_products));
}
// Squares the sixteen signed bytes of X and folds them into four floats;
// lane k holds x[2k]^2 + x[2k+1]^2 + x[8+2k]^2 + x[9+2k]^2.
static inline __m128 _mm_mul_epi8(__m128i X) {
  // 0xFF for every negative byte; interleaving with it sign-extends to 16 bit.
  const __m128i negative = _mm_cmplt_epi8(X, _mm_setzero_si128());
  const __m128i widened_lo = _mm_unpacklo_epi8(X, negative);
  const __m128i widened_hi = _mm_unpackhi_epi8(X, negative);

  const __m128i lo_squares = _mm_madd_epi16(widened_lo, widened_lo);
  const __m128i hi_squares = _mm_madd_epi16(widened_hi, widened_hi);
  return _mm_cvtepi32_ps(_mm_add_epi32(lo_squares, hi_squares));
}
// Multiplies the first four signed bytes of X and Y and returns
// { x0*y0 + x1*y1, 0, x2*y2 + x3*y3, 0 } as floats; the remaining bytes of
// the inputs are ignored.
//
// Sign extension is done with the SSE2 compare/unpack idiom used by the
// sibling helpers instead of the SSE4.1-only _mm_cvtepi8_epi16, so this
// helper has the same instruction-set requirement as the other 128-bit
// helpers in this file.
static inline __m128 _mm_mul32_pi8(__m128i X, __m128i Y) {
  const __m128i zeros = _mm_setzero_si128();
  const __m128i xlo = _mm_unpacklo_epi8(X, _mm_cmplt_epi8(X, zeros));
  const __m128i ylo = _mm_unpacklo_epi8(Y, _mm_cmplt_epi8(Y, zeros));
  // Keep pair sums 0 and 1 only, interleaved with zeros.
  return _mm_cvtepi32_ps(
      _mm_unpacklo_epi32(_mm_madd_epi16(xlo, ylo), zeros));
}
// Multiplies the 32 signed bytes of X and Y element-wise and folds the
// products into eight floats (four products per output lane).
// The unpack intrinsics work independently on each 128-bit half, so output
// lanes 0..3 only contain products from bytes 0..15 and lanes 4..7 only
// products from bytes 16..31.
static inline __m256 _mm256_mul_epi8(__m256i X, __m256i Y) {
__m256i zero = _mm256_setzero_si256();
// 0xFF for every negative byte; interleaving a byte with its mask
// sign-extends it to 16 bits.
__m256i sign_x = _mm256_cmpgt_epi8(zero, X);
__m256i sign_y = _mm256_cmpgt_epi8(zero, Y);
__m256i xlo = _mm256_unpacklo_epi8(X, sign_x);
__m256i xhi = _mm256_unpackhi_epi8(X, sign_x);
__m256i ylo = _mm256_unpacklo_epi8(Y, sign_y);
__m256i yhi = _mm256_unpackhi_epi8(Y, sign_y);
// madd multiplies 16-bit elements and adds adjacent pairs into 32-bit sums;
// the low-half and high-half sums are then added together and converted.
return _mm256_cvtepi32_ps(_mm256_add_epi32(_mm256_madd_epi16(xlo, ylo),
_mm256_madd_epi16(xhi, yhi)));
}
// Multiplies the first four signed bytes of X and Y and returns
// { x0*y0 + x1*y1, x2*y2 + x3*y3, 0, 0, 0, 0, 0, 0 } as floats.
static inline __m256 _mm256_mul32_pi8(__m128i X, __m128i Y) {
// Sign-extend all sixteen bytes to 16-bit elements.
__m256i xlo = _mm256_cvtepi8_epi16(X), ylo = _mm256_cvtepi8_epi16(Y);
// Blend mask 252 (0b11111100) takes lanes 2..7 from the zero vector, so only
// the first two pair sums survive.
return _mm256_blend_ps(_mm256_cvtepi32_ps(_mm256_madd_epi16(xlo, ylo)),
_mm256_setzero_ps(), 252);
}
// Horizontal sum: returns x0 + x1 + ... + x7 of the eight floats in x.
// The additions are performed pairwise in the order shown by the comments
// below, not left to right.
static inline float _mm256_reduce_add_ps(__m256 x) {
/* ( x3+x7, x2+x6, x1+x5, x0+x4 ) */
const __m128 x128 =
_mm_add_ps(_mm256_extractf128_ps(x, 1), _mm256_castps256_ps128(x));
/* ( -, -, x1+x3+x5+x7, x0+x2+x4+x6 ) */
const __m128 x64 = _mm_add_ps(x128, _mm_movehl_ps(x128, x128));
/* ( -, -, -, x0+x1+x2+x3+x4+x5+x6+x7 ) */
// Shuffle 0x55 broadcasts lane 1 so that add_ss can add it to lane 0.
const __m128 x32 = _mm_add_ss(x64, _mm_shuffle_ps(x64, x64, 0x55));
/* Conversion to float is a no-op on x86-64 */
return _mm_cvtss_f32(x32);
}
} // namespace
#include "windows_customizations.h"
namespace diskann {
// enum Metric { L2 = 0, INNER_PRODUCT = 1, FAST_L2 = 2, PQ = 3 };
template<typename T>
class Distance {
public:
virtual float compare(const T *a, const T *b, unsigned length) const = 0;
virtual float compare(const T *a, const T *b, uint32_t length) const = 0;
virtual ~Distance() {
}
};
template<typename T>
class DistanceCosine : public Distance<T> {
float compare(const T *a, const T *b, unsigned length) const {
return diskann::compute_cosine_similarity<T>(a, b, length);
}
class DistanceCosineInt8 : public Distance<int8_t> {
public:
DISKANN_DLLEXPORT virtual float compare(const int8_t *a, const int8_t *b,
uint32_t length) const;
};
class DistanceCosineFloat : public Distance<float> {
public:
DISKANN_DLLEXPORT virtual float compare(const float *a, const float *b,
uint32_t length) const;
};
class SlowDistanceCosineUInt8 : public Distance<uint8_t> {
public:
DISKANN_DLLEXPORT virtual float compare(const uint8_t *a, const uint8_t *b,
uint32_t length) const;
};
class DistanceL2Int8 : public Distance<int8_t> {
public:
float compare(const int8_t *a, const int8_t *b, unsigned size) const {
int32_t result = 0;
#ifdef _WINDOWS
#ifdef USE_AVX2
__m256 r = _mm256_setzero_ps();
char * pX = (char *) a, *pY = (char *) b;
while (size >= 32) {
__m256i r1 = _mm256_subs_epi8(_mm256_loadu_si256((__m256i *) pX),
_mm256_loadu_si256((__m256i *) pY));
r = _mm256_add_ps(r, _mm256_mul_epi8(r1, r1));
pX += 32;
pY += 32;
size -= 32;
}
while (size > 0) {
__m128i r2 = _mm_subs_epi8(_mm_loadu_si128((__m128i *) pX),
_mm_loadu_si128((__m128i *) pY));
r = _mm256_add_ps(r, _mm256_mul32_pi8(r2, r2));
pX += 4;
pY += 4;
size -= 4;
}
r = _mm256_hadd_ps(_mm256_hadd_ps(r, r), r);
return r.m256_f32[0] + r.m256_f32[4];
#else
#pragma omp simd reduction(+ : result) aligned(a, b : 8)
for (_s32 i = 0; i < (_s32) size; i++) {
result += ((int32_t)((int16_t) a[i] - (int16_t) b[i])) *
((int32_t)((int16_t) a[i] - (int16_t) b[i]));
}
return (float) result;
#endif
#else
#pragma omp simd reduction(+ : result) aligned(a, b : 8)
for (_s32 i = 0; i < (_s32) size; i++) {
result += ((int32_t)((int16_t) a[i] - (int16_t) b[i])) *
((int32_t)((int16_t) a[i] - (int16_t) b[i]));
}
return (float) result;
#endif
}
DISKANN_DLLEXPORT virtual float compare(const int8_t *a, const int8_t *b,
uint32_t size) const;
};
class DistanceL2UInt8 : public Distance<uint8_t> {
public:
float compare(const uint8_t *a, const uint8_t *b, unsigned size) const {
uint32_t result = 0;
#ifndef _WINDOWS
#pragma omp simd reduction(+ : result) aligned(a, b : 8)
#endif
for (_s32 i = 0; i < (_s32) size; i++) {
result += ((int32_t)((int16_t) a[i] - (int16_t) b[i])) *
((int32_t)((int16_t) a[i] - (int16_t) b[i]));
}
return (float) result;
}
DISKANN_DLLEXPORT virtual float compare(const uint8_t *a, const uint8_t *b,
uint32_t size) const;
};
class DistanceL2 : public Distance<float> {
class DistanceL2Float : public Distance<float> {
public:
#ifndef _WINDOWS
float compare(const float *a, const float *b, unsigned size) const
__attribute__((hot)) {
a = (const float *) __builtin_assume_aligned(a, 32);
b = (const float *) __builtin_assume_aligned(b, 32);
#ifdef _WINDOWS
DISKANN_DLLEXPORT virtual float compare(const float *a, const float *b,
uint32_t size) const;
#else
float compare(const float *a, const float *b, unsigned size) const {
#endif
float result = 0;
#ifdef USE_AVX2
// assume size is divisible by 8
_u16 niters = size / 8;
__m256 sum = _mm256_setzero_ps();
for (_u16 j = 0; j < niters; j++) {
// scope is a[8j:8j+7], b[8j:8j+7]
// load a_vec
if (j < (niters - 1)) {
_mm_prefetch((char *) (a + 8 * (j + 1)), _MM_HINT_T0);
_mm_prefetch((char *) (b + 8 * (j + 1)), _MM_HINT_T0);
}
__m256 a_vec = _mm256_load_ps(a + 8 * j);
// load b_vec
__m256 b_vec = _mm256_load_ps(b + 8 * j);
// a_vec - b_vec
__m256 tmp_vec = _mm256_sub_ps(a_vec, b_vec);
/*
// (a_vec - b_vec)**2
__m256 tmp_vec2 = _mm256_mul_ps(tmp_vec, tmp_vec);
// accumulate sum
sum = _mm256_add_ps(sum, tmp_vec2);
*/
// sum = (tmp_vec**2) + sum
sum = _mm256_fmadd_ps(tmp_vec, tmp_vec, sum);
}
// horizontal add sum
result = _mm256_reduce_add_ps(sum);
#else
#ifndef _WINDOWS
#pragma omp simd reduction(+ : result) aligned(a, b : 32)
#endif
for (_s32 i = 0; i < (_s32) size; i++) {
result += (a[i] - b[i]) * (a[i] - b[i]);
}
#endif
return result;
}
};
// Slow implementations of the distance functions for machines without AVX2
template<typename T>
class SlowDistanceL2Int : public Distance<T> {
virtual float compare(const T *a, const T *b, unsigned length) const {
uint32_t result = 0;
for (_u32 i = 0; i < length; i++) {
result += ((int32_t)((int16_t) a[i] - (int16_t) b[i])) *
((int32_t)((int16_t) a[i] - (int16_t) b[i]));
}
return (float) result;
}
};
class SlowDistanceL2Float : public Distance<float> {
virtual float compare(const float *a, const float *b,
unsigned length) const {
float result = 0.0f;
for (_u32 i = 0; i < length; i++) {
result += (a[i] - b[i]) * (a[i] - b[i]);
}
return result;
}
};
class AVXDistanceL2Int8 : public Distance<int8_t> {
public:
virtual float compare(const int8_t *a, const int8_t *b,
unsigned int length) const {
#ifndef _WINDOWS
int32_t result = 0;
#pragma omp simd reduction(+ : result) aligned(a, b : 8)
for (_s32 i = 0; i < (_s32) length; i++) {
result += ((int32_t)((int16_t) a[i] - (int16_t) b[i])) *
((int32_t)((int16_t) a[i] - (int16_t) b[i]));
}
return (float) result;
}
#else
__m128 r = _mm_setzero_ps();
__m128i r1;
while (length >= 16) {
r1 = _mm_subs_epi8(_mm_load_si128((__m128i *) a),
_mm_load_si128((__m128i *) b));
r = _mm_add_ps(r, _mm_mul_epi8(r1));
a += 16;
b += 16;
length -= 16;
}
r = _mm_hadd_ps(_mm_hadd_ps(r, r), r);
float res = r.m128_f32[0];
if (length >= 8) {
__m128 r2 = _mm_setzero_ps();
__m128i r3 = _mm_subs_epi8(_mm_load_si128((__m128i *) (a - 8)),
_mm_load_si128((__m128i *) (b - 8)));
r2 = _mm_add_ps(r2, _mm_mulhi_epi8(r3));
a += 8;
b += 8;
length -= 8;
r2 = _mm_hadd_ps(_mm_hadd_ps(r2, r2), r2);
res += r2.m128_f32[0];
}
if (length >= 4) {
__m128 r2 = _mm_setzero_ps();
__m128i r3 = _mm_subs_epi8(_mm_load_si128((__m128i *) (a - 12)),
_mm_load_si128((__m128i *) (b - 12)));
r2 = _mm_add_ps(r2, _mm_mulhi_epi8_shift32(r3));
res += r2.m128_f32[0] + r2.m128_f32[1];
}
return res;
}
#endif
};
class AVXDistanceL2Float : public Distance<float> {
public:
virtual float compare(const float *a, const float *b,
unsigned int length) const {
#ifndef _WINDOWS
float result = 0;
#pragma omp simd reduction(+ : result) aligned(a, b : 8)
for (_s32 i = 0; i < (_s32) length; i++) {
result += (a[i] - b[i]) * (a[i] - b[i]);
}
return result;
}
#else
__m128 diff, v1, v2;
__m128 sum = _mm_set1_ps(0);
while (length >= 4) {
v1 = _mm_loadu_ps(a);
a += 4;
v2 = _mm_loadu_ps(b);
b += 4;
diff = _mm_sub_ps(v1, v2);
sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));
length -= 4;
}
return sum.m128_f32[0] + sum.m128_f32[1] + sum.m128_f32[2] +
sum.m128_f32[3];
}
DISKANN_DLLEXPORT virtual float compare(const float *a, const float *b,
uint32_t size) const
__attribute__((hot));
#endif
};
template<typename T>
class DistanceInnerProduct : public Distance<T> {
public:
float inner_product(const T *a, const T *b, unsigned size) const {
float result = 0;
#ifdef __GNUC__
#ifdef __AVX__
#define AVX_DOT(addr1, addr2, dest, tmp1, tmp2) \
tmp1 = _mm256_loadu_ps(addr1); \
tmp2 = _mm256_loadu_ps(addr2); \
tmp1 = _mm256_mul_ps(tmp1, tmp2); \
dest = _mm256_add_ps(dest, tmp1);
__m256 sum;
__m256 l0, l1;
__m256 r0, r1;
unsigned D = (size + 7) & ~7U;
unsigned DR = D % 16;
unsigned DD = D - DR;
const float *l = (float *) a;
const float *r = (float *) b;
const float *e_l = l + DD;
const float *e_r = r + DD;
float unpack[8] __attribute__((aligned(32))) = {0, 0, 0, 0, 0, 0, 0, 0};
sum = _mm256_loadu_ps(unpack);
if (DR) {
AVX_DOT(e_l, e_r, sum, l0, r0);
}
for (unsigned i = 0; i < DD; i += 16, l += 16, r += 16) {
AVX_DOT(l, r, sum, l0, r0);
AVX_DOT(l + 8, r + 8, sum, l1, r1);
}
_mm256_storeu_ps(unpack, sum);
result = unpack[0] + unpack[1] + unpack[2] + unpack[3] + unpack[4] +
unpack[5] + unpack[6] + unpack[7];
#else
#ifdef __SSE2__
#define SSE_DOT(addr1, addr2, dest, tmp1, tmp2) \
tmp1 = _mm128_loadu_ps(addr1); \
tmp2 = _mm128_loadu_ps(addr2); \
tmp1 = _mm128_mul_ps(tmp1, tmp2); \
dest = _mm128_add_ps(dest, tmp1);
__m128 sum;
__m128 l0, l1, l2, l3;
__m128 r0, r1, r2, r3;
unsigned D = (size + 3) & ~3U;
unsigned DR = D % 16;
unsigned DD = D - DR;
const float *l = a;
const float *r = b;
const float *e_l = l + DD;
const float *e_r = r + DD;
float unpack[4] __attribute__((aligned(16))) = {0, 0, 0, 0};
sum = _mm_load_ps(unpack);
switch (DR) {
case 12:
SSE_DOT(e_l + 8, e_r + 8, sum, l2, r2);
case 8:
SSE_DOT(e_l + 4, e_r + 4, sum, l1, r1);
case 4:
SSE_DOT(e_l, e_r, sum, l0, r0);
default:
break;
}
for (unsigned i = 0; i < DD; i += 16, l += 16, r += 16) {
SSE_DOT(l, r, sum, l0, r0);
SSE_DOT(l + 4, r + 4, sum, l1, r1);
SSE_DOT(l + 8, r + 8, sum, l2, r2);
SSE_DOT(l + 12, r + 12, sum, l3, r3);
}
_mm_storeu_ps(unpack, sum);
result += unpack[0] + unpack[1] + unpack[2] + unpack[3];
#else
float dot0, dot1, dot2, dot3;
const float *last = a + size;
const float *unroll_group = last - 3;
/* Process 4 items with each loop for efficiency. */
while (a < unroll_group) {
dot0 = a[0] * b[0];
dot1 = a[1] * b[1];
dot2 = a[2] * b[2];
dot3 = a[3] * b[3];
result += dot0 + dot1 + dot2 + dot3;
a += 4;
b += 4;
}
/* Process last 0-3 pixels. Not needed for standard vector lengths. */
while (a < last) {
result += *a++ * *b++;
}
#endif
#endif
#endif
return result;
}
float compare(const T *a, const T *b, unsigned size)
const { // since we use normally minimization objective for distance
// comparisons, we are returning 1/x.
float inner_product(const T *a, const T *b, unsigned size) const;
float compare(const T *a, const T *b, unsigned size) const {
// since we use normally minimization objective for distance
// comparisons, we are returning 1/x.
float result = inner_product(a, b, size);
// if (result < 0)
// return std::numeric_limits<float>::max();
@ -451,100 +73,65 @@ namespace diskann {
: public DistanceInnerProduct<T> { // currently defined only for float.
// templated for future use.
public:
float norm(const T *a, unsigned size) const {
float result = 0;
#ifdef __GNUC__
#ifdef __AVX__
#define AVX_L2NORM(addr, dest, tmp) \
tmp = _mm256_loadu_ps(addr); \
tmp = _mm256_mul_ps(tmp, tmp); \
dest = _mm256_add_ps(dest, tmp);
__m256 sum;
__m256 l0, l1;
unsigned D = (size + 7) & ~7U;
unsigned DR = D % 16;
unsigned DD = D - DR;
const float *l = (float *) a;
const float *e_l = l + DD;
float unpack[8] __attribute__((aligned(32))) = {0, 0, 0, 0, 0, 0, 0, 0};
sum = _mm256_loadu_ps(unpack);
if (DR) {
AVX_L2NORM(e_l, sum, l0);
}
for (unsigned i = 0; i < DD; i += 16, l += 16) {
AVX_L2NORM(l, sum, l0);
AVX_L2NORM(l + 8, sum, l1);
}
_mm256_storeu_ps(unpack, sum);
result = unpack[0] + unpack[1] + unpack[2] + unpack[3] + unpack[4] +
unpack[5] + unpack[6] + unpack[7];
#else
#ifdef __SSE2__
#define SSE_L2NORM(addr, dest, tmp) \
tmp = _mm128_loadu_ps(addr); \
tmp = _mm128_mul_ps(tmp, tmp); \
dest = _mm128_add_ps(dest, tmp);
__m128 sum;
__m128 l0, l1, l2, l3;
unsigned D = (size + 3) & ~3U;
unsigned DR = D % 16;
unsigned DD = D - DR;
const float *l = a;
const float *e_l = l + DD;
float unpack[4] __attribute__((aligned(16))) = {0, 0, 0, 0};
sum = _mm_load_ps(unpack);
switch (DR) {
case 12:
SSE_L2NORM(e_l + 8, sum, l2);
case 8:
SSE_L2NORM(e_l + 4, sum, l1);
case 4:
SSE_L2NORM(e_l, sum, l0);
default:
break;
}
for (unsigned i = 0; i < DD; i += 16, l += 16) {
SSE_L2NORM(l, sum, l0);
SSE_L2NORM(l + 4, sum, l1);
SSE_L2NORM(l + 8, sum, l2);
SSE_L2NORM(l + 12, sum, l3);
}
_mm_storeu_ps(unpack, sum);
result += unpack[0] + unpack[1] + unpack[2] + unpack[3];
#else
float dot0, dot1, dot2, dot3;
const float *last = a + size;
const float *unroll_group = last - 3;
/* Process 4 items with each loop for efficiency. */
while (a < unroll_group) {
dot0 = a[0] * a[0];
dot1 = a[1] * a[1];
dot2 = a[2] * a[2];
dot3 = a[3] * a[3];
result += dot0 + dot1 + dot2 + dot3;
a += 4;
}
/* Process last 0-3 pixels. Not needed for standard vector lengths. */
while (a < last) {
result += (*a) * (*a);
a++;
}
#endif
#endif
#endif
return result;
}
using DistanceInnerProduct<T>::compare;
float norm(const T *a, unsigned size) const;
float compare(const T *a, const T *b, float norm,
unsigned size) const { // not implement
float result = -2 * DistanceInnerProduct<T>::inner_product(a, b, size);
result += norm;
return result;
unsigned size) const;
};
// Gopal. Slow implementations of the distance functions to get diskann to
// work in pre-AVX machines. Performance here is not a concern, so we are
// using the simplest possible implementation.
template<typename T>
class SlowDistanceL2Int : public Distance<T> {
 public:
  // Plain scalar squared-L2 distance. Defined inline because the class is a
  // template. Each element is narrowed to int16_t before subtracting, and the
  // squared differences are summed in an unsigned 32-bit accumulator.
  DISKANN_DLLEXPORT virtual float compare(const T *a, const T *b,
                                          uint32_t length) const {
    uint32_t accumulated = 0;
    for (uint32_t idx = 0; idx != length; ++idx) {
      const int32_t delta = (int32_t)((int16_t) a[idx] - (int16_t) b[idx]);
      accumulated += delta * delta;
    }
    return (float) accumulated;
  }
};
// Distance<float> implementation; only the declaration of compare() lives in
// this header, the definition is not visible here.
// NOTE(review): by analogy with SlowDistanceL2Int this is presumably a plain
// scalar squared-L2 loop - confirm against the definition.
class SlowDistanceL2Float : public Distance<float> {
public:
// Compares two float vectors of `length` elements.
DISKANN_DLLEXPORT virtual float compare(const float *a, const float *b,
uint32_t length) const;
};
// AVX implementations. Borrowed from HNSW code.
// Distance<int8_t> implementation; only the declaration of compare() lives in
// this header, the definition is not visible here.
// NOTE(review): the name suggests a SIMD squared-L2 kernel; any alignment or
// length-multiple requirements must be checked in the definition.
class AVXDistanceL2Int8 : public Distance<int8_t> {
public:
// Compares two int8 vectors of `length` elements.
DISKANN_DLLEXPORT virtual float compare(const int8_t *a, const int8_t *b,
uint32_t length) const;
};
// Distance<float> implementation; only the declaration of compare() lives in
// this header, the definition is not visible here.
// NOTE(review): the name suggests a SIMD squared-L2 kernel; whether trailing
// elements (length not a multiple of the SIMD width) are handled must be
// checked in the definition.
class AVXDistanceL2Float : public Distance<float> {
public:
// Compares two float vectors of `length` elements.
DISKANN_DLLEXPORT virtual float compare(const float *a, const float *b,
uint32_t length) const;
};
// Distance<float> implementation; only the declaration of compare() lives in
// this header, the definition is not visible here.
// NOTE(review): AVXNormalizedCosineDistanceFloat relies on this returning the
// negated inner product (so that smaller means closer) - confirm against the
// definition.
class AVXDistanceInnerProductFloat : public Distance<float> {
public:
// Compares two float vectors of `length` elements.
DISKANN_DLLEXPORT virtual float compare(const float *a, const float *b,
uint32_t length) const;
};
// Cosine distance built on top of the inner-product comparator.
// NOTE(review): as the class name suggests, this presumably expects both
// inputs to be unit-normalized already - confirm with the callers.
class AVXNormalizedCosineDistanceFloat : public Distance<float> {
 private:
  AVXDistanceInnerProductFloat _innerProduct;

 public:
  DISKANN_DLLEXPORT virtual float compare(const float *a, const float *b,
                                          uint32_t length) const {
    // The inner-product comparator reports a negative value to express
    // distance; shifting it by one moves the result onto a non-negative scale
    // when the underlying cosine lies between -1 and 1.
    const float negated_inner_product = _innerProduct.compare(a, b, length);
    return 1.0f + negated_inner_product;
  }
};
} // namespace diskann

Просмотреть файл

@ -3,81 +3,173 @@
#pragma once
#include <atomic>
#include <cassert>
#include <map>
#include <shared_mutex>
#include <sstream>
#include <stack>
#include <string>
#include <unordered_map>
#include "tsl/robin_set.h"
#include "tsl/robin_map.h"
#include "distance.h"
#include "neighbor.h"
#include "parameters.h"
#include "utils.h"
#include "concurrent_queue.h"
#include "windows_customizations.h"
#define SLACK_FACTOR 1.3
#define GRAPH_SLACK_FACTOR 1.3
#define OVERHEAD_FACTOR 1.1
#define ESTIMATE_RAM_USAGE(size, dim, datasize, degree) \
(1.30 * (((double) size * dim) * datasize + \
((double) size * degree) * sizeof(unsigned) * SLACK_FACTOR))
namespace boost {
#ifndef BOOST_DYNAMIC_BITSET_FWD_HPP
template<typename Block = unsigned long,
typename Allocator = std::allocator<Block>>
class dynamic_bitset;
#endif
} // namespace boost
namespace diskann {
template<typename T, typename TagT = int>
// Estimates the memory footprint, in bytes, of an index holding `size` points:
// the vector data (dimension rounded up to a multiple of 8, `datasize` bytes
// per coordinate), the adjacency lists (`degree` unsigned ids per point,
// padded by GRAPH_SLACK_FACTOR), one std::mutex per point and one
// pointer-sized slot per point, all scaled by OVERHEAD_FACTOR.
inline double estimate_ram_usage(_u64 size, _u32 dim, _u32 datasize,
                                 _u32 degree) {
  const double num_points = (double) size;

  const double data_bytes = num_points * ROUND_UP(dim, 8) * datasize;
  const double graph_bytes =
      num_points * degree * sizeof(unsigned) * GRAPH_SLACK_FACTOR;
  const double lock_bytes = num_points * sizeof(std::mutex);
  const double outer_vector_bytes = num_points * sizeof(ptrdiff_t);

  double total_bytes = data_bytes;
  total_bytes += graph_bytes;
  total_bytes += lock_bytes;
  total_bytes += outer_vector_bytes;
  return OVERHEAD_FACTOR * total_bytes;
}
// Bundle of per-query working buffers for in-memory search, templated on the
// vector element type T. All buffers are held through raw pointers that start
// out as nullptr; setup()/destroy() are only declared here.
// NOTE(review): presumably setup() allocates and destroy() releases the
// buffers - confirm in the definitions, since nothing in this struct frees
// them automatically.
template<typename T>
struct InMemQueryScratch {
std::vector<Neighbor> * _pool = nullptr;
tsl::robin_set<unsigned> *_visited = nullptr;
std::vector<unsigned> * _des = nullptr;
std::vector<Neighbor> * _best_l_nodes = nullptr;
// Two alternative "already inserted into pool" trackers: a hash set and a
// bitset.
tsl::robin_set<unsigned> *_inserted_into_pool_rs = nullptr;
boost::dynamic_bitset<> * _inserted_into_pool_bs = nullptr;
T * aligned_query = nullptr;
uint32_t *indices = nullptr;
float * interim_dists = nullptr;
// These three have no in-class initializer; they hold indeterminate values
// unless the constructor or setup() assigns them.
uint32_t search_l;
uint32_t indexing_l;
uint32_t r;
InMemQueryScratch();
void setup(uint32_t search_l, uint32_t indexing_l, uint32_t r, size_t dim);
void clear();
void resize_for_query(uint32_t new_search_l);
void destroy();
// Reference accessors. They dereference the corresponding pointer without a
// null check, so the pointer must have been set before they are called.
std::vector<Neighbor> &pool() {
return *_pool;
}
std::vector<unsigned> &des() {
return *_des;
}
tsl::robin_set<unsigned> &visited() {
return *_visited;
}
std::vector<Neighbor> &best_l_nodes() {
return *_best_l_nodes;
}
tsl::robin_set<unsigned> &inserted_into_pool_rs() {
return *_inserted_into_pool_rs;
}
boost::dynamic_bitset<> &inserted_into_pool_bs() {
return *_inserted_into_pool_bs;
}
};
template<typename T, typename TagT = uint32_t>
class Index {
public:
DISKANN_DLLEXPORT Index(Metric m, const char *filename,
const size_t max_points = 0, const size_t nd = 0,
const size_t num_frozen_pts = 0,
const bool enable_tags = false,
const bool store_data = true,
const bool support_eager_delete = false);
// Constructor for Bulk operations and for creating the index object solely
// for loading a prexisting index.
DISKANN_DLLEXPORT Index(Metric m, const size_t dim, const size_t max_points,
const bool dynamic_index,
const bool save_index_in_one_file,
const bool enable_tags = false,
const bool support_eager_delete = false);
// Constructor for incremental index
DISKANN_DLLEXPORT Index(Metric m, const size_t dim, const size_t max_points,
const bool dynamic_index,
const bool save_index_in_one_file,
const Parameters &indexParameters,
const Parameters &searchParameters,
const bool enable_tags = false,
const bool support_eager_delete = false);
DISKANN_DLLEXPORT ~Index();
// Public Functions for Static Support
// checks if data is consolidated, saves graph, metadata and associated
// tags.
DISKANN_DLLEXPORT void save(const char *filename);
DISKANN_DLLEXPORT void load(const char *filename,
const bool load_tags = false,
const char *tag_filename = NULL);
// generates one or more frozen points that will never get deleted from the
// graph
DISKANN_DLLEXPORT int generate_random_frozen_points(
const char *filename = NULL);
DISKANN_DLLEXPORT _u64 save_graph(std::string filename, size_t offset = 0);
DISKANN_DLLEXPORT _u64 save_data(std::string filename, size_t offset = 0);
DISKANN_DLLEXPORT _u64 save_tags(std::string filename, size_t offset = 0);
DISKANN_DLLEXPORT _u64 save_delete_list(const std::string &filename,
size_t offset = 0);
DISKANN_DLLEXPORT void load(const char *index_file, uint32_t num_threads,
uint32_t search_l);
DISKANN_DLLEXPORT size_t load_graph(const std::string filename,
size_t expected_num_points,
size_t offset = 0);
DISKANN_DLLEXPORT size_t load_data(std::string filename, size_t offset = 0);
DISKANN_DLLEXPORT size_t load_tags(const std::string tag_file_name,
size_t offset = 0);
DISKANN_DLLEXPORT size_t load_delete_set(const std::string &filename,
size_t offset = 0);
DISKANN_DLLEXPORT size_t get_num_points();
DISKANN_DLLEXPORT size_t return_max_points();
DISKANN_DLLEXPORT void build(
const char *filename, const size_t num_points_to_load,
Parameters & parameters,
const std::vector<TagT> &tags = std::vector<TagT>());
DISKANN_DLLEXPORT void build(const char * filename,
const size_t num_points_to_load,
Parameters & parameters,
const char * tag_filename);
// Gopal. Added search overload that takes L as parameter, so that we
// can customize L on a per-query basis without tampering with "Parameters"
DISKANN_DLLEXPORT std::pair<uint32_t, uint32_t> search(const T * query,
const size_t K,
const unsigned L,
unsigned *indices);
template<typename IDType>
DISKANN_DLLEXPORT std::pair<uint32_t, uint32_t> search(
const T *query, const uint64_t K, const unsigned L,
std::vector<unsigned> init_ids, uint64_t *indices, float *distances);
const T *query, const size_t K, const unsigned L, IDType *indices,
float *distances = nullptr);
DISKANN_DLLEXPORT std::pair<uint32_t, uint32_t> search_with_tags(
const T *query, const size_t K, const unsigned L, TagT *tags,
unsigned *indices_buffer = NULL);
DISKANN_DLLEXPORT size_t search_with_tags(const T *query, const uint64_t K,
const unsigned L, TagT *tags,
float * distances,
std::vector<T *> &res_vectors);
// repositions frozen points to the end of _data - if they have been moved
// during deletion
DISKANN_DLLEXPORT void readjust_data(unsigned _num_frozen_pts);
DISKANN_DLLEXPORT void clear_index();
/* insertions possible only when id corresponding to tag does not already
* exist in the graph */
DISKANN_DLLEXPORT int insert_point(const T * point,
const Parameters & parameter,
std::vector<Neighbor> & pool,
std::vector<Neighbor> & tmp,
tsl::robin_set<unsigned> & visited,
std::vector<SimpleNeighbor> &cut_graph,
const TagT tag);
// Public Functions for Incremental Support
// insertions possible only when id corresponding to tag does not already
// exist in the graph
DISKANN_DLLEXPORT int insert_point(
const T *point, const Parameters &parameter,
const TagT tag); // only keep point, tag, parameters
// call before triggering deletions - sets important flags required for
// deletion related operations
@ -91,12 +183,64 @@ namespace diskann {
// Record deleted point now and restructure graph later. Return -1 if tag
// not found, 0 if OK. Do not call if _eager_delete was called earlier and
// data was not consolidated
DISKANN_DLLEXPORT int delete_point(const TagT tag);
DISKANN_DLLEXPORT int lazy_delete(const TagT &tag);
// Record deleted points now and restructure graph later. Add to failed_tags
// if tag not found. Do not call if _eager_delete was called earlier and
// data was not consolidated. Return -1 if
DISKANN_DLLEXPORT int lazy_delete(const tsl::robin_set<TagT> &tags,
std::vector<TagT> & failed_tags);
// Delete point from graph and restructure it immediately. Do not call if
// _lazy_delete was called earlier and data was not consolidated
DISKANN_DLLEXPORT int eager_delete(const TagT tag,
const Parameters &parameters);
const Parameters &parameters,
int delete_mode = 1);
// return _data and tag_to_location offset
DISKANN_DLLEXPORT int extract_data(
T *ret_data, std::unordered_map<TagT, unsigned> &tag_to_location);
DISKANN_DLLEXPORT void get_location_to_tag(
std::unordered_map<unsigned, TagT> &ret_loc_to_tag);
DISKANN_DLLEXPORT void prune_all_nbrs(const Parameters &parameters);
DISKANN_DLLEXPORT void compact_data_for_insert();
DISKANN_DLLEXPORT bool hasIndexBeenSaved();
const std::vector<std::vector<unsigned>> *get_graph() const {
return &this->_final_graph;
}
T * get_data();
const std::unordered_map<unsigned, TagT> *get_tags() const {
return &this->_location_to_tag;
};
// repositions frozen points to the end of _data - if they have been moved
// during deletion
DISKANN_DLLEXPORT void reposition_frozen_point_to_end();
DISKANN_DLLEXPORT void reposition_point(unsigned old_location,
unsigned new_location);
DISKANN_DLLEXPORT void compact_frozen_point();
DISKANN_DLLEXPORT void compact_data_for_search();
DISKANN_DLLEXPORT void consolidate(Parameters &parameters);
// DISKANN_DLLEXPORT void save_index_as_one_file(bool flag);
DISKANN_DLLEXPORT void get_active_tags(tsl::robin_set<TagT> &active_tags);
DISKANN_DLLEXPORT int get_vector_by_tag(TagT &tag, T *vec);
DISKANN_DLLEXPORT const T *get_vector_by_tag(const TagT &tag);
DISKANN_DLLEXPORT void print_status() const;
// This variable MUST be updated if the number of entries in the metadata
// changes.
DISKANN_DLLEXPORT static const int METADATA_ROWS = 5;
DISKANN_DLLEXPORT static bool get_npts_and_dim_from_index(
const char *file_name, size_t &npts, size_t &dim);
DISKANN_DLLEXPORT void optimize_graph();
@ -105,24 +249,49 @@ namespace diskann {
/* Internals of the library */
protected:
typedef std::vector<SimpleNeighbor> vecNgh;
typedef std::vector<std::vector<unsigned>> CompactGraph;
CompactGraph _final_graph;
CompactGraph _in_graph;
// No copy/assign.
Index(const Index<T, TagT> &) = delete;
Index<T, TagT> &operator=(const Index<T, TagT> &) = delete;
std::vector<std::vector<unsigned>> _final_graph;
std::vector<std::vector<unsigned>> _in_graph;
// generates one frozen point that will never get deleted from the
// graph
int generate_frozen_point();
// determines navigating node of the graph by calculating medoid of data
unsigned calculate_entry_point();
// called only when _eager_delete is to be supported
void update_in_graph();
template<typename IDType>
std::pair<uint32_t, uint32_t> search_impl(const T *query, const size_t K,
const unsigned L, IDType *indices,
float * distances,
InMemQueryScratch<T> &scratch);
std::pair<uint32_t, uint32_t> iterate_to_fixed_point(
const T *node_coords, const unsigned Lindex,
const std::vector<unsigned> &init_ids,
std::vector<Neighbor> & expanded_nodes_info,
tsl::robin_set<unsigned> & expanded_nodes_ids,
std::vector<Neighbor> & best_L_nodes);
std::vector<Neighbor> &best_L_nodes, std::vector<unsigned> &des,
tsl::robin_set<unsigned> &inserted_into_pool_rs,
boost::dynamic_bitset<> &inserted_into_pool_bs, bool ret_frozen = true,
bool search_invocation = false);
void get_expanded_nodes(const size_t node, const unsigned Lindex,
std::vector<unsigned> init_ids,
std::vector<Neighbor> & expanded_nodes_info,
tsl::robin_set<unsigned> &expanded_nodes_ids,
std::vector<unsigned> & des,
std::vector<Neighbor> & best_L_nodes,
tsl::robin_set<unsigned> &inserted_into_pool_rs,
boost::dynamic_bitset<> & inserted_into_pool_bs);
// get_expanded_nodes for insertion. Must investigate to see if perf can
// be improved here as well using the same technique as above.
void get_expanded_nodes(const size_t node_id, const unsigned Lindex,
std::vector<unsigned> init_ids,
std::vector<Neighbor> & expanded_nodes_info,
tsl::robin_set<unsigned> &expanded_nodes_ids);
@ -151,60 +320,93 @@ namespace diskann {
void link(Parameters &parameters);
// WARNING: Do not call reserve_location() without acquiring change_lock_
unsigned reserve_location();
int reserve_location();
void release_location();
// get new location corresponding to each undeleted tag after deletions
std::vector<unsigned> get_new_location(unsigned &active);
// Support for resizing the index
// This function must be called ONLY after taking the _change_lock and
// _update_lock. Anything else in a MT environment will lead to an
// inconsistent index.
void resize(size_t new_max_points);
/* // get new location corresponding to each undeleted tag after
deletions std::vector<unsigned> get_new_location(unsigned &active);*/
// renumber nodes, update tag and location maps and compact the graph, mode
// = _consolidated_order in case of lazy deletion and _compacted_order in
// case of eager deletion
void compact_data(std::vector<unsigned> new_location, unsigned active,
bool &mode);
void compact_data();
// WARNING: Do not call consolidate_deletes without acquiring change_lock_
// Returns number of live points left after consolidation
size_t consolidate_deletes(const Parameters &parameters);
void initialize_query_scratch(uint32_t num_threads, uint32_t search_l,
uint32_t indexing_l, uint32_t r, size_t dim);
private:
Metric _metric = diskann::L2;
size_t _dim;
size_t _aligned_dim;
T * _data;
size_t _nd; // number of active points i.e. existing in the graph
size_t _max_points; // total number of points in given data set
size_t _num_frozen_pts;
bool _has_built;
Distance<T> *_distance;
unsigned _width;
unsigned _ep;
Metric _dist_metric = diskann::L2;
size_t _dim = 0;
size_t _aligned_dim = 0;
T * _data = nullptr;
size_t _nd = 0; // number of active points i.e. existing in the graph
size_t _max_points = 0; // total number of points in given data set
size_t _num_frozen_pts = 0;
bool _has_built = false;
Distance<T> *_distance = nullptr;
unsigned _width = 0;
unsigned _ep = 0;
size_t _max_range_of_loaded_graph = 0;
bool _saturate_graph = false;
std::vector<std::mutex> _locks; // Per node lock, cardinality=max_points_
bool _save_as_one_file = false;
bool _dynamic_index = false;
bool _enable_tags = false;
// Using normalized L2 for cosine.
bool _normalize_vecs = false;
char * _opt_graph;
size_t _node_size;
size_t _data_len;
size_t _neighbor_len;
// Indexing parameters
uint32_t _indexingQueueSize, _indexingRange, _indexingMaxC;
float _indexingAlpha;
uint32_t _search_queue_size;
bool _can_delete;
bool _eager_done; // true if eager deletions have been made
bool _lazy_done; // true if lazy deletions have been made
bool _compacted_order; // true if after eager deletions, data has been
// consolidated
bool _enable_tags;
bool _consolidated_order; // true if after lazy deletions, data has been
// consolidated
bool _support_eager_delete; //_support_eager_delete = activates extra data
// structures and functions required for eager
// deletion
bool _store_data;
// Query scratch data structures
ConcurrentQueue<InMemQueryScratch<T>> _query_scratch;
// flags for dynamic indexing
std::unordered_map<TagT, unsigned> _tag_to_location;
std::unordered_map<unsigned, TagT> _location_to_tag;
tsl::robin_set<unsigned> _delete_set;
tsl::robin_set<unsigned> _empty_slots;
std::mutex _change_lock; // Allow only 1 thread to insert/delete
bool _support_eager_delete =
false; //_support_eager_delete = activates extra data
// bool _can_delete = false; // only true if deletes can be done (if
// enabled)
bool _eager_done = false; // true if eager deletions have been made
bool _lazy_done = false; // true if lazy deletions have been made
bool _data_compacted = true; // true if data has been consolidated
bool _is_saved = false; // Gopal. Checking if the index is already saved.
std::vector<std::mutex> _locks; // Per node lock, cardinality=max_points_
std::shared_timed_mutex _tag_lock; // reader-writer lock on
// _tag_to_location and
std::mutex _change_lock; // Lock taken to synchronously modify _nd
std::vector<std::mutex> _locks_in; // Per node lock
std::shared_timed_mutex _delete_lock; // Lock on _delete_set and
// _empty_slots when reading and
// writing to them
// _location_to_tag, has a shared lock
// and exclusive lock associated with
// it.
std::shared_timed_mutex _update_lock; // coordinate save() and any change
// being done to the graph.
static const float INDEX_GROWTH_FACTOR;
char * _opt_graph;
size_t _node_size;
size_t _data_len;
size_t _neighbor_len;
};
} // namespace diskann

105
include/simd_utils.h Normal file
Просмотреть файл

@ -0,0 +1,105 @@
#pragma once
#ifdef _WINDOWS
#include <immintrin.h>
#include <smmintrin.h>
#include <tmmintrin.h>
#include <intrin.h>
#else
#include <immintrin.h>
#endif
namespace diskann {
// Squares the 32 signed bytes of X and returns 8 float partial sums.
// Each output lane is the sum of four squared input bytes (two from the
// low-unpacked half and two from the high-unpacked half), so adding all
// 8 lanes yields the sum of squares of every byte in X.
static inline __m256 _mm256_mul_epi8(__m256i X) {
  // Negative bytes compare as 0xFF, which is exactly the upper byte needed
  // to sign-extend them to 16 bits when interleaved below.
  const __m256i sign_mask = _mm256_cmpgt_epi8(_mm256_setzero_si256(), X);
  const __m256i lo16 = _mm256_unpacklo_epi8(X, sign_mask);
  const __m256i hi16 = _mm256_unpackhi_epi8(X, sign_mask);
  // madd multiplies 16-bit lanes and adds adjacent pairs into 32-bit lanes.
  const __m256i lo_squares = _mm256_madd_epi16(lo16, lo16);
  const __m256i hi_squares = _mm256_madd_epi16(hi16, hi16);
  return _mm256_cvtepi32_ps(_mm256_add_epi32(lo_squares, hi_squares));
}
// Squares the upper 8 signed bytes of X and returns 4 float partial sums;
// each output lane is the sum of two adjacent squared bytes. The lower 8
// bytes of X do not contribute.
static inline __m128 _mm_mulhi_epi8(__m128i X) {
  // 0xFF for negative bytes: the high byte of the sign-extended 16-bit value.
  const __m128i sign_mask = _mm_cmplt_epi8(X, _mm_setzero_si128());
  const __m128i hi16 = _mm_unpackhi_epi8(X, sign_mask);
  // Multiply 16-bit lanes and add adjacent pairs into 32-bit lanes.
  const __m128i hi_squares = _mm_madd_epi16(hi16, hi16);
  return _mm_cvtepi32_ps(hi_squares);
}
// Shifts each 64-bit half of X right by 32 bits (logical shift), then
// squares the bytes of the upper half exactly as _mm_mulhi_epi8 does.
// Net effect: lanes 0 and 1 hold the pairwise sums of squares of bytes
// 12..15 of the original X; lanes 2 and 3 are zero.
static inline __m128 _mm_mulhi_epi8_shift32(__m128i X) {
  const __m128i shifted = _mm_srli_epi64(X, 32);
  // Sign mask is taken after the shift so it lines up with the moved bytes.
  const __m128i sign_mask = _mm_cmplt_epi8(shifted, _mm_setzero_si128());
  const __m128i hi16 = _mm_unpackhi_epi8(shifted, sign_mask);
  const __m128i hi_squares = _mm_madd_epi16(hi16, hi16);
  return _mm_cvtepi32_ps(hi_squares);
}
// Multiplies the 16 signed bytes of X with the matching bytes of Y and
// returns 4 float partial sums. Each output lane is the sum of four
// byte products, so adding the 4 lanes gives the full dot product.
static inline __m128 _mm_mul_epi8(__m128i X, __m128i Y) {
  const __m128i zeros = _mm_setzero_si128();
  // 0xFF where the byte is negative; used as the high byte when widening.
  const __m128i x_sign = _mm_cmplt_epi8(X, zeros);
  const __m128i y_sign = _mm_cmplt_epi8(Y, zeros);

  // Widen to 16 bits, multiply, and add adjacent pairs into 32-bit lanes.
  const __m128i lo_products = _mm_madd_epi16(_mm_unpacklo_epi8(X, x_sign),
                                             _mm_unpacklo_epi8(Y, y_sign));
  const __m128i hi_products = _mm_madd_epi16(_mm_unpackhi_epi8(X, x_sign),
                                             _mm_unpackhi_epi8(Y, y_sign));
  return _mm_cvtepi32_ps(_mm_add_epi32(lo_products, hi_products));
}
// Squares the 16 signed bytes of X and returns 4 float partial sums.
// Each output lane is the sum of four squared bytes, so adding the 4
// lanes gives the sum of squares of every byte in X.
static inline __m128 _mm_mul_epi8(__m128i X) {
  // 0xFF where the byte is negative; used as the high byte when widening.
  const __m128i sign_mask = _mm_cmplt_epi8(X, _mm_setzero_si128());
  const __m128i lo16 = _mm_unpacklo_epi8(X, sign_mask);
  const __m128i hi16 = _mm_unpackhi_epi8(X, sign_mask);
  // Multiply 16-bit lanes and add adjacent pairs into 32-bit lanes.
  const __m128i lo_squares = _mm_madd_epi16(lo16, lo16);
  const __m128i hi_squares = _mm_madd_epi16(hi16, hi16);
  return _mm_cvtepi32_ps(_mm_add_epi32(lo_squares, hi_squares));
}
// Multiplies the low signed bytes of X and Y and keeps only the first
// two pairwise sums: output lanes are (x0*y0 + x1*y1, 0, x2*y2 + x3*y3, 0).
// Only the first four bytes (32 bits) of each input affect the result.
static inline __m128 _mm_mul32_pi8(__m128i X, __m128i Y) {
  // Sign-extend the low 8 bytes of each operand to 16-bit lanes.
  const __m128i x16 = _mm_cvtepi8_epi16(X);
  const __m128i y16 = _mm_cvtepi8_epi16(Y);
  const __m128i pair_sums = _mm_madd_epi16(x16, y16);
  // Interleaving the low two sums with zero discards the upper two sums.
  const __m128i kept = _mm_unpacklo_epi32(pair_sums, _mm_setzero_si128());
  return _mm_cvtepi32_ps(kept);
}
// Multiplies the 32 signed bytes of X with the matching bytes of Y and
// returns 8 float partial sums. Each output lane is the sum of four byte
// products, so adding all 8 lanes gives the full dot product.
static inline __m256 _mm256_mul_epi8(__m256i X, __m256i Y) {
  const __m256i zeros = _mm256_setzero_si256();
  // 0xFF where the byte is negative; used as the high byte when widening.
  const __m256i x_sign = _mm256_cmpgt_epi8(zeros, X);
  const __m256i y_sign = _mm256_cmpgt_epi8(zeros, Y);

  // Widen to 16 bits, multiply, and add adjacent pairs into 32-bit lanes.
  const __m256i lo_products = _mm256_madd_epi16(
      _mm256_unpacklo_epi8(X, x_sign), _mm256_unpacklo_epi8(Y, y_sign));
  const __m256i hi_products = _mm256_madd_epi16(
      _mm256_unpackhi_epi8(X, x_sign), _mm256_unpackhi_epi8(Y, y_sign));
  return _mm256_cvtepi32_ps(_mm256_add_epi32(lo_products, hi_products));
}
// Multiplies the signed bytes of X and Y and keeps only the first two
// pairwise sums: lane 0 = x0*y0 + x1*y1, lane 1 = x2*y2 + x3*y3, and
// lanes 2..7 are forced to zero.
static inline __m256 _mm256_mul32_pi8(__m128i X, __m128i Y) {
  // Sign-extend all 16 bytes of each operand to 16-bit lanes.
  const __m256i x16 = _mm256_cvtepi8_epi16(X);
  const __m256i y16 = _mm256_cvtepi8_epi16(Y);
  const __m256 pair_sums = _mm256_cvtepi32_ps(_mm256_madd_epi16(x16, y16));
  // Mask 0xFC (252) takes lanes 2..7 from the zero vector, lanes 0..1 from
  // the products.
  return _mm256_blend_ps(pair_sums, _mm256_setzero_ps(), 0xFC);
}
// Horizontal sum: returns x0 + x1 + ... + x7 of the 8 floats in x.
static inline float _mm256_reduce_add_ps(__m256 x) {
  // Fold the upper 128 bits onto the lower: ( x3+x7, x2+x6, x1+x5, x0+x4 )
  const __m128 upper_half = _mm256_extractf128_ps(x, 1);
  const __m128 lower_half = _mm256_castps256_ps128(x);
  const __m128 quad = _mm_add_ps(upper_half, lower_half);
  // Fold the upper two lanes onto the lower two:
  // ( -, -, x1+x3+x5+x7, x0+x2+x4+x6 )
  const __m128 pair = _mm_add_ps(quad, _mm_movehl_ps(quad, quad));
  // Add lane 1 into lane 0: ( -, -, -, x0+x1+x2+x3+x4+x5+x6+x7 )
  const __m128 lane1 = _mm_shuffle_ps(pair, pair, 0x55);
  const __m128 total = _mm_add_ss(pair, lane1);
  // Extract the lowest lane as a scalar.
  return _mm_cvtss_f32(total);
}
} // namespace

Просмотреть файл

@ -4,15 +4,21 @@
#pragma once
#include <fcntl.h>
#include <algorithm>
#include <errno.h>
#include <cassert>
#include <cstdlib>
#include <cstring>
#include <fstream>
#include <iostream>
#include <limits.h>
#include <string>
#include <memory>
#include <random>
#include <set>
#include <sstream>
#include <string.h>
#ifdef __APPLE__
#else
#include <malloc.h>
@ -26,8 +32,11 @@ typedef HANDLE FileHandle;
typedef int FileHandle;
#endif
#include "distance.h"
#include "utils.h"
#include "logger.h"
#include "cached_io.h"
#include "ann_exception.h"
#include "common_includes.h"
#include "windows_customizations.h"
@ -51,6 +60,44 @@ typedef int FileHandle;
#define IS_ALIGNED(X, Y) ((uint64_t)(X) % (uint64_t)(Y) == 0)
#define IS_512_ALIGNED(X) IS_ALIGNED(X, 512)
#define IS_4096_ALIGNED(X) IS_ALIGNED(X, 4096)
#define METADATA_SIZE \
4096 // all metadata of individual sub-component files is written in first
// 4KB for unified files
inline bool file_exists(const std::string& name, bool dirCheck = false) {
int val;
#ifndef _WINDOWS
struct stat buffer;
val = stat(name.c_str(), &buffer);
#else
// It is the 21st century but Windows API still thinks in 32-bit terms.
// Turns out calling stat() on a file > 4GB results in errno = 132 (OVERFLOW).
// How silly is this!? So calling _stat64()
struct _stat64 buffer;
val = _stat64(name.c_str(), &buffer);
#endif
diskann::cout << " Stat(" << name.c_str() << ") returned: " << val
<< std::endl;
if (val != 0) {
switch (errno) {
case EINVAL:
diskann::cout << "Invalid argument passed to stat()" << std::endl;
break;
case ENOENT:
diskann::cout << "File " << name.c_str() << " does not exist"
<< std::endl;
break;
default:
diskann::cout << "Unexpected error in stat():" << errno << std::endl;
break;
}
return false;
} else {
// the file entry exists. If reqd, check if this is a directory.
return dirCheck ? buffer.st_mode & S_IFDIR : true;
}
}
typedef uint64_t _u64;
typedef int64_t _s64;
@ -60,11 +107,62 @@ typedef uint16_t _u16;
typedef int16_t _s16;
typedef uint8_t _u8;
typedef int8_t _s8;
#ifndef _WINDOWS
// strerror_r comes in two incompatible flavours: the XSI one returns an int
// and fills the caller's buffer, the GNU one returns a char* that may point
// elsewhere and leave the buffer untouched. These overloads pick the right
// message pointer for whichever flavour the toolchain provides.
inline const char* strerror_r_message(int /*status*/, const char* buffer) {
  return buffer;
}
inline const char* strerror_r_message(const char* message,
                                      const char* /*buffer*/) {
  return message;
}
#endif

// Opens `filename` for binary writing on `writer`. An existing file is
// opened in read/write mode (so its contents are kept); a missing file is
// created. On failure, logs the reason and throws diskann::ANNException.
// On success the stream is left with failbit/badbit exceptions enabled so
// later write errors surface as exceptions.
inline void open_file_to_write(std::ofstream& writer,
                               const std::string& filename) {
  // Keep the stream non-throwing while opening; otherwise a failed open()
  // would throw std::ios_base::failure before the diagnostic below could run.
  writer.exceptions(std::ofstream::goodbit);
  if (!file_exists(filename))
    writer.open(filename, std::ios::binary | std::ios::out);
  else
    writer.open(filename, std::ios::binary | std::ios::in | std::ios::out);
  const int open_errno = errno;

  if (writer.fail()) {
    char        buff[1024] = {0};
    const char* reason = buff;
#ifdef _WINDOWS
    strerror_s(buff, 1024, open_errno);
#else
    reason = strerror_r_message(strerror_r(open_errno, buff, 1024), buff);
#endif
    const std::string message = std::string("Failed to open file ") +
                                filename + " for write because: " + reason;
    diskann::cerr << message << std::endl;
    throw diskann::ANNException(message, -1);
  }

  writer.exceptions(std::ofstream::failbit | std::ofstream::badbit);
}
// Returns the size of `fname` in bytes, determined by opening it positioned
// at the end and reading the stream offset. Logs to diskann::cerr and
// returns 0 if the file cannot be opened.
inline _u64 get_file_size(const std::string& fname) {
  std::ifstream reader(fname, std::ios::binary | std::ios::ate);
  if (reader.fail() || !reader.is_open()) {
    diskann::cerr << "Could not open file: " << fname << std::endl;
    return 0;
  }

  // Opened with ios::ate, so the current get position is the file length.
  const _u64 size_in_bytes = reader.tellg();
  reader.close();
  return size_in_bytes;
}
// Removes `fileName` if file_exists() reports it. Returns 0 when the file
// was absent or was removed, otherwise the non-zero result of ::remove()
// (after logging a diagnostic to diskann::cerr).
inline int delete_file(const std::string& fileName) {
  if (!file_exists(fileName)) {
    return 0;  // nothing to delete
  }

  const auto status = ::remove(fileName.c_str());
  if (status != 0) {
    diskann::cerr
        << "Could not delete file: " << fileName
        << " even though it exists. This might indicate a permissions issue. "
           "If you see this message, please contact the diskann team."
        << std::endl;
  }
  return status;
}
namespace diskann {
static const size_t MAX_SIZE_OF_STREAMBUF = 2LL * 1024 * 1024 * 1024;
enum Metric { L2 = 0, INNER_PRODUCT = 1, FAST_L2 = 2, PQ = 3 };
enum Metric { L2 = 0, INNER_PRODUCT = 1, COSINE = 2, FAST_L2 = 3, PQ = 4 };
inline void alloc_aligned(void** ptr, size_t size, size_t align) {
*ptr = nullptr;
@ -77,6 +175,24 @@ namespace diskann {
assert(*ptr != nullptr);
}
// Resizes the aligned allocation at *ptr to `size` bytes with alignment
// `align`. `size` must be a multiple of `align` (asserted).
// Only implemented on Windows (via _aligned_realloc). On other platforms
// there is no implementation, so the call logs a diagnostic and throws
// diskann::ANNException instead of returning with the buffer unchanged:
// returning normally would let the caller write past the old allocation.
inline void realloc_aligned(void** ptr, size_t size, size_t align) {
  assert(IS_ALIGNED(size, align));
#ifdef _WINDOWS
  *ptr = ::_aligned_realloc(*ptr, size, align);
  assert(*ptr != nullptr);
#else
  // Silence unused-parameter warnings in builds where assert() is compiled
  // out.
  (void) ptr;
  (void) size;
  (void) align;
  diskann::cerr << "No aligned realloc on GCC. Must malloc and mem_align, "
                   "left it out for now."
                << std::endl;
  throw diskann::ANNException(
      "realloc_aligned is not implemented on this platform; the buffer was "
      "not resized",
      -1);
#endif
}
// Debugging aid: prints `arnd` to diskann::cout and then blocks reading an
// integer from std::cin. The value read is discarded.
inline void check_stop(std::string arnd) {
  diskann::cout << arnd << std::endl;
  int discarded_input;
  std::cin >> discarded_input;
}
inline void aligned_free(void* ptr) {
// Gopal. Must have a check here if the pointer was actually allocated by
// _alloc_aligned
@ -384,7 +500,7 @@ namespace diskann {
_u64 total_res = (_u64) total_u32;
diskann::cout << "Metadata: #pts = " << gt_num
<< ", #total_results = " << total_res << "... " << std::flush;
<< ", #total_results = " << total_res << "..." << std::endl;
size_t expected_file_size =
2 * sizeof(_u32) + gt_num * sizeof(_u32) + total_res * sizeof(_u32);
@ -440,8 +556,8 @@ namespace diskann {
}
template<typename T>
inline void save_bin(const std::string& filename, T* data, size_t npts,
size_t ndims) {
inline uint64_t save_bin(const std::string& filename, T* data, size_t npts,
size_t ndims) {
std::ofstream writer(filename, std::ios::binary | std::ios::out);
diskann::cout << "Writing bin: " << filename.c_str() << std::endl;
int npts_i32 = (int) npts, ndims_i32 = (int) ndims;
@ -454,7 +570,9 @@ namespace diskann {
// data = new T[npts_u64 * ndims_u64];
writer.write((char*) data, npts * ndims * sizeof(T));
writer.close();
size_t bytes_written = npts * ndims * sizeof(T) + 2 * sizeof(uint32_t);
diskann::cout << "Finished writing bin." << std::endl;
return bytes_written;
}
// load_aligned_bin functions START
@ -652,6 +770,67 @@ namespace diskann {
writer.write((char*) cur_pt, ndims * sizeof(T));
}
}
// Writes `npts` vectors to `filename` starting at byte `offset`: first the
// point count and dimension as two ints, then `ndims` elements per point.
// `data` is laid out with a stride of `aligned_dim` elements per point; the
// elements beyond `ndims` in each row are not written.
// Returns the number of bytes the header plus payload occupy.
template<typename T>
inline uint64_t save_data_in_base_dimensions(const std::string& filename,
                                             T* data, size_t npts,
                                             size_t ndims, size_t aligned_dim,
                                             size_t offset = 0) {
  std::ofstream writer;
  open_file_to_write(writer, filename);
  writer.seekp(offset, writer.beg);

  // Header: both counts are stored as 32-bit ints.
  const int npts_header = (int) npts;
  const int ndims_header = (int) ndims;
  writer.write((char*) &npts_header, sizeof(int));
  writer.write((char*) &ndims_header, sizeof(int));

  // Payload: copy only the first ndims elements of every aligned row.
  const size_t row_bytes = ndims * sizeof(T);
  T*           row = data;
  for (size_t written = 0; written < npts; written++, row += aligned_dim) {
    writer.write((char*) row, row_bytes);
  }
  writer.close();

  return 2 * sizeof(uint32_t) + npts * ndims * sizeof(T);
}
// Reads a bin file (int npts, int dim, then npts*dim elements of T) starting
// at byte `offset` into the caller-allocated buffer `data`, which uses a row
// stride of `rounded_dim` elements. The tail of each row
// (rounded_dim - dim elements) is zero-filled.
// Outputs: `npts` and `dim` are set from the file header.
// Throws diskann::ANNException if `data` is null, the file cannot be opened,
// the header or payload cannot be read in full, or dim > rounded_dim.
// The caller must have allocated at least npts * rounded_dim elements; that
// cannot be verified here.
template<typename T>
inline void copy_aligned_data_from_file(const std::string bin_file, T*& data,
                                        size_t& npts, size_t& dim,
                                        const size_t& rounded_dim,
                                        size_t        offset = 0) {
  if (data == nullptr) {
    diskann::cerr << "Memory was not allocated for the data of " << bin_file
                  << " before calling the load function. Exiting..."
                  << std::endl;
    throw diskann::ANNException(
        "Null pointer passed to copy_aligned_data_from_file function", -1,
        __FUNCSIG__, __FILE__, __LINE__);
  }

  std::ifstream reader(bin_file, std::ios::binary);
  if (!reader.is_open()) {
    throw diskann::ANNException(
        std::string("Could not open file ") + bin_file + " for reading", -1,
        __FUNCSIG__, __FILE__, __LINE__);
  }
  reader.seekg(offset, reader.beg);

  int npts_i32 = 0, dim_i32 = 0;
  reader.read((char*) &npts_i32, sizeof(int));
  reader.read((char*) &dim_i32, sizeof(int));
  if (!reader) {
    throw diskann::ANNException(
        std::string("Could not read the npts/dim header of ") + bin_file, -1,
        __FUNCSIG__, __FILE__, __LINE__);
  }
  npts = (unsigned) npts_i32;
  dim = (unsigned) dim_i32;

  // Without this check (rounded_dim - dim) would wrap around and the memset
  // below would run far past the end of the row.
  if (dim > rounded_dim) {
    throw diskann::ANNException(
        std::string("Dimension ") + std::to_string(dim) + " stored in " +
            bin_file + " exceeds the aligned dimension " +
            std::to_string(rounded_dim),
        -1, __FUNCSIG__, __FILE__, __LINE__);
  }

  for (size_t i = 0; i < npts; i++) {
    reader.read((char*) (data + i * rounded_dim), dim * sizeof(T));
    memset(data + i * rounded_dim + dim, 0, (rounded_dim - dim) * sizeof(T));
  }

  // A short file leaves the stream in a failed state; report it instead of
  // handing back partially filled rows.
  if (!reader) {
    throw diskann::ANNException(
        std::string("File ") + bin_file +
            " ended before all points could be read",
        -1, __FUNCSIG__, __FILE__, __LINE__);
  }
}
// NOTE :: good efficiency when total_vec_size is integral multiple of 64
inline void prefetch_vector(const char* vec, size_t vecsize) {
@ -666,6 +845,16 @@ namespace diskann {
for (size_t d = 0; d < max_prefetch_size; d += 64)
_mm_prefetch((const char*) vec + d, _MM_HINT_T1);
}
// NOTE: Implementation in utils.cpp.
void block_convert(std::ofstream& writr, std::ifstream& readr,
float* read_buf, _u64 npts, _u64 ndims);
DISKANN_DLLEXPORT void normalize_data_file(const std::string& inFileName,
const std::string& outFileName);
template<typename T>
Distance<T>* get_distance_function(Metric m);
}; // namespace diskann
struct PivotContainer {
@ -687,6 +876,7 @@ struct PivotContainer {
float piv_dist;
};
/*
inline bool file_exists(const std::string& name) {
struct stat buffer;
auto val = stat(name.c_str(), &buffer);
@ -694,20 +884,7 @@ inline bool file_exists(const std::string& name) {
<< std::endl;
return (val == 0);
}
inline _u64 get_file_size(const std::string& fname) {
std::ifstream reader(fname, std::ios::binary | std::ios::ate);
if (!reader.fail() && reader.is_open()) {
_u64 end_pos = reader.tellg();
diskann::cout << " Tellg: " << reader.tellg() << " as u64: " << end_pos
<< std::endl;
reader.close();
return end_pos;
} else {
diskann::cout << "Could not open file: " << fname << std::endl;
return 0;
}
}
*/
inline bool validate_index_file_size(std::ifstream& in) {
if (!in.is_open())
@ -730,18 +907,60 @@ inline bool validate_index_file_size(std::ifstream& in) {
return true;
}
// Scales `arr` (length `dim`) in place to unit L2 norm.
// This function is valid only for float data type.
// A vector whose norm is zero (including dim == 0) is left unchanged, since
// dividing by a zero norm would fill it with NaN/Inf.
template<typename T>
inline void normalize(T* arr, size_t dim) {
  float sum = 0.0f;
  // size_t index so the loop covers the whole range of `dim`.
  for (size_t i = 0; i < dim; i++) {
    sum += arr[i] * arr[i];
  }
  sum = sqrt(sum);
  if (sum == 0.0f) {
    return;
  }
  for (size_t i = 0; i < dim; i++) {
    arr[i] = (T)(arr[i] / sum);
  }
}
#ifdef _WINDOWS
#include <intrin.h>
#include <Psapi.h>
extern bool AvxSupportedCPU;
extern bool Avx2SupportedCPU;
// Returns the PrivateUsage counter reported by GetProcessMemoryInfo for the
// current process, or 0 if the query fails.
inline size_t getMemoryUsage() {
  // Zero-initialise so a failed query can never leak stack garbage.
  PROCESS_MEMORY_COUNTERS_EX pmc = {};
  pmc.cb = sizeof(pmc);
  if (!GetProcessMemoryInfo(GetCurrentProcess(),
                            (PROCESS_MEMORY_COUNTERS*) &pmc, sizeof(pmc))) {
    return 0;
  }
  return pmc.PrivateUsage;
}
// Returns the system message text for the Win32 error code `lastError`,
// or an empty string if no message could be retrieved.
inline std::string getWindowsErrorMessage(DWORD lastError) {
  // Must start as nullptr: FormatMessageA leaves it untouched on failure.
  char*       errorText = nullptr;
  const DWORD length = FormatMessageA(
      // use system message tables to retrieve error text
      FORMAT_MESSAGE_FROM_SYSTEM
          // allocate buffer on local heap for error text
          | FORMAT_MESSAGE_ALLOCATE_BUFFER
          // Important! will fail otherwise, since we're not
          // (and CANNOT) pass insertion parameters
          | FORMAT_MESSAGE_IGNORE_INSERTS,
      NULL,  // unused with FORMAT_MESSAGE_FROM_SYSTEM
      lastError, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT),
      (LPSTR) &errorText,  // output
      0,                   // minimum size for output buffer
      NULL);               // arguments - see note

  if (length == 0 || errorText == nullptr) {
    return std::string();
  }

  // Copy the text out, then release the buffer FormatMessageA allocated
  // (required when FORMAT_MESSAGE_ALLOCATE_BUFFER is used).
  std::string message(errorText);
  LocalFree(errorText);
  return message;
}
inline void printProcessMemory(const char* message) {
PROCESS_MEMORY_COUNTERS counters;
HANDLE h = GetCurrentProcess();
GetProcessMemoryInfo(h, &counters, sizeof(counters));
diskann::cout << message << " [Peaking Working Set size: "
<< counters.PeakWorkingSetSize * 1.0 / (1024 * 1024 * 1024)
<< counters.PeakWorkingSetSize * 1.0 / (1024.0 * 1024 * 1024)
<< "GB Working set size: "
<< counters.WorkingSetSize * 1.0 / (1024 * 1024 * 1024)
<< counters.WorkingSetSize * 1.0 / (1024.0 * 1024 * 1024)
<< "GB Private bytes "
<< counters.PagefileUsage * 1.0 / (1024 * 1024 * 1024) << "GB]"
<< std::endl;
@ -752,10 +971,14 @@ inline void printProcessMemory(const char* message) {
// Non-Windows build: no runtime detection is performed here; AVX2 is
// reported as available unconditionally.
inline bool avx2Supported() { return true; }
inline void printProcessMemory(const char* message) {
diskann::cout << message << std::endl;
inline void printProcessMemory(const char*) {
}
// Memory usage reporting is not implemented for non-Windows builds; this
// placeholder always returns 0.
inline size_t getMemoryUsage() {
  return 0;
}
#endif
extern bool AvxSupportedCPU;

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -7,7 +7,7 @@ if(MSVC)
add_subdirectory(dll)
else()
#file(GLOB CPP_SOURCES *.cpp)
set(CPP_SOURCES ann_exception.cpp aux_utils.cpp index.cpp
set(CPP_SOURCES ann_exception.cpp aux_utils.cpp distance.cpp index.cpp
linux_aligned_file_reader.cpp math_utils.cpp memory_mapper.cpp
partition_and_pq.cpp pq_flash_index.cpp logger.cpp utils.cpp)
add_library(${PROJECT_NAME} ${CPP_SOURCES})

Просмотреть файл

@ -31,8 +31,8 @@ namespace diskann {
std::system_error& e, const std::string& funcSig,
const std::string& fileName,
unsigned int lineNum)
: ANNException(std::string(" While opening file ") + filename +
std::string(", error code: ") +
: ANNException(std::string(" While opening file \'") + filename +
std::string("\', error code: ") +
std::to_string(e.code().value()) + " " +
e.code().message(),
e.code().value(), funcSig, fileName, lineNum) {

Просмотреть файл

@ -19,21 +19,118 @@
#include "partition_and_pq.h"
#include "percentile_stats.h"
#include "pq_flash_index.h"
#include "tsl/robin_set.h"
#include "utils.h"
namespace diskann {
double get_memory_budget(const std::string &mem_budget_str) {
double mem_ram_budget = atof(mem_budget_str.c_str());
double final_index_ram_limit = mem_ram_budget;
if (mem_ram_budget - SPACE_FOR_CACHED_NODES_IN_GB >
void add_new_file_to_single_index(std::string index_file,
std::string new_file) {
std::unique_ptr<_u64[]> metadata;
_u64 nr, nc;
diskann::load_bin<_u64>(index_file, metadata, nr, nc);
if (nc != 1) {
std::stringstream stream;
stream << "Error, index file specified does not have correct metadata. "
<< std::endl;
throw diskann::ANNException(stream.str(), -1);
}
size_t index_ending_offset = metadata[nr - 1];
_u64 read_blk_size = 64 * 1024 * 1024;
cached_ofstream writer(index_file, read_blk_size);
_u64 check_file_size = get_file_size(index_file);
if (check_file_size != index_ending_offset) {
std::stringstream stream;
stream << "Error, index file specified does not have correct metadata "
"(last entry must match the filesize). "
<< std::endl;
throw diskann::ANNException(stream.str(), -1);
}
cached_ifstream reader(new_file, read_blk_size);
size_t fsize = reader.get_file_size();
if (fsize == 0) {
std::stringstream stream;
stream << "Error, new file specified is empty. Not appending. "
<< std::endl;
throw diskann::ANNException(stream.str(), -1);
}
size_t num_blocks = DIV_ROUND_UP(fsize, read_blk_size);
char * dump = new char[read_blk_size];
for (_u64 i = 0; i < num_blocks; i++) {
size_t cur_block_size = read_blk_size > fsize - (i * read_blk_size)
? fsize - (i * read_blk_size)
: read_blk_size;
reader.read(dump, cur_block_size);
writer.write(dump, cur_block_size);
}
// reader.close();
// writer.close();
delete[] dump;
std::vector<_u64> new_meta;
for (_u64 i = 0; i < nr; i++)
new_meta.push_back(metadata[i]);
new_meta.push_back(metadata[nr - 1] + fsize);
diskann::save_bin<_u64>(index_file, new_meta.data(), new_meta.size(), 1);
}
// Converts a search RAM budget given in GB into a byte budget for the
// index. When the budget exceeds THRESHOLD_FOR_CACHING_IN_GB by more than
// SPACE_FOR_CACHED_NODES_IN_GB, that many GB are set aside as slack for
// cached nodes before converting.
double get_memory_budget(double search_ram_budget) {
  double final_index_ram_limit = search_ram_budget;
  if (search_ram_budget - SPACE_FOR_CACHED_NODES_IN_GB >
      THRESHOLD_FOR_CACHING_IN_GB) {  // slack for space used by cached
                                      // nodes
    final_index_ram_limit = search_ram_budget - SPACE_FOR_CACHED_NODES_IN_GB;
  }
  // GB -> bytes
  return final_index_ram_limit * 1024 * 1024 * 1024;
}
// String overload: parses `mem_budget_str` with atof() as a budget in GB
// and forwards it to get_memory_budget(double).
double get_memory_budget(const std::string &mem_budget_str) {
  return get_memory_budget(atof(mem_budget_str.c_str()));
}
// Chooses the number of PQ chunks (bytes per compressed vector).
// Starts from the per-point byte budget final_index_ram_limit / points_num.
// If param_list has a 6th entry, it is parsed as a compression ratio in
// (0,1]; floor(ratio * dim) replaces the budget-derived value when it is
// non-zero and smaller. The result is finally clamped to
// [1, min(dim, MAX_PQ_CHUNKS)].
size_t calculate_num_pq_chunks(double final_index_ram_limit,
                               size_t points_num, uint32_t dim,
                               const std::vector<std::string> &param_list) {
  // With no points or a non-positive budget the quotient would be inf, NaN
  // or negative, none of which can be converted to an unsigned count.
  // Treat those as "no budget"; the clamp below raises it to one chunk.
  size_t num_pq_chunks = 0;
  if (points_num > 0 && final_index_ram_limit > 0) {
    num_pq_chunks =
        (size_t)(std::floor)(_u64(final_index_ram_limit / (double) points_num));
  }
  diskann::cout << "Calculated num_pq_chunks :" << num_pq_chunks << std::endl;
  if (param_list.size() >= 6) {
    float compress_ratio = (float) atof(param_list[5].c_str());
    if (compress_ratio > 0 && compress_ratio <= 1) {
      size_t chunks_by_cr = (size_t)(std::floor)(compress_ratio * dim);

      if (chunks_by_cr > 0 && chunks_by_cr < num_pq_chunks) {
        diskann::cout << "Compress ratio:" << compress_ratio
                      << " new #pq_chunks:" << chunks_by_cr << std::endl;
        num_pq_chunks = chunks_by_cr;
      } else {
        diskann::cout << "Compress ratio: " << compress_ratio
                      << " #new pq_chunks: " << chunks_by_cr
                      << " is either zero or greater than num_pq_chunks: "
                      << num_pq_chunks << ". num_pq_chunks is unchanged. "
                      << std::endl;
      }
    } else {
      diskann::cerr << "Compression ratio: " << compress_ratio
                    << " should be in (0,1]" << std::endl;
    }
  }

  // Clamp to the valid range.
  num_pq_chunks = num_pq_chunks == 0 ? 1 : num_pq_chunks;
  num_pq_chunks = num_pq_chunks > dim ? dim : num_pq_chunks;
  num_pq_chunks =
      num_pq_chunks > MAX_PQ_CHUNKS ? MAX_PQ_CHUNKS : num_pq_chunks;

  diskann::cout << "Compressing " << dim << "-dimensional data into "
                << num_pq_chunks << " bytes per vector." << std::endl;
  return num_pq_chunks;
}
double calculate_recall(unsigned num_queries, unsigned *gold_std,
float *gs_dist, unsigned dim_gs,
unsigned *our_results, unsigned dim_or,
@ -70,6 +167,63 @@ namespace diskann {
return total_recall / (num_queries) * (100.0 / recall_at);
}
// Recall@recall_at, as a percentage, restricted to ground-truth ids that are
// present in `active_tags`.
//
// For each query the ground-truth row is scanned until `recall_at` active ids
// have been seen (or the row ends). When `active_tags` is empty the scan
// length is simply set to `recall_at`. If distances are supplied, ground-truth
// entries tied with the last scanned one are also accepted. The first
// `recall_at` results of the query are then matched against that prefix.
//
// @param num_queries  Number of queries (rows) in both matrices.
// @param gold_std     Ground-truth ids, `dim_gs` per query.
// @param gs_dist      Ground-truth distances (same layout) or nullptr.
// @param dim_gs       Row width of the ground truth.
// @param our_results  Result ids, `dim_or` per query.
// @param dim_or       Row width of the results.
// @param recall_at    Number of results per query that are evaluated.
// @param active_tags  Ids that count as live; empty disables the filtering.
// @return             (sum of per-query hits / num_queries) * (100 / recall_at).
double calculate_recall(unsigned num_queries, unsigned *gold_std,
                        float *gs_dist, unsigned dim_gs,
                        unsigned *our_results, unsigned dim_or,
                        unsigned recall_at,
                        const tsl::robin_set<unsigned> &active_tags) {
  double             hits_sum = 0;
  std::set<unsigned> truth_ids, result_ids;
  bool               warned = false;
  const bool         filtering = !active_tags.empty();

  for (size_t q = 0; q < num_queries; q++) {
    truth_ids.clear();
    result_ids.clear();

    unsigned *truth_row = gold_std + dim_gs * q;
    unsigned *result_row = our_results + dim_or * q;

    // Walk the ground truth until enough active ids have been collected.
    unsigned active_seen = 0;
    unsigned scanned = 0;
    for (; active_seen < recall_at && scanned < dim_gs; scanned++) {
      if (active_tags.find(truth_row[scanned]) != active_tags.end()) {
        active_seen++;
      }
    }
    if (!filtering)
      scanned = recall_at;

    // Only the first shortfall is reported.
    if (!warned && filtering && active_seen < recall_at) {
      diskann::cout << "Warning: Couldn't find enough closest neighbors "
                    << active_seen << "/" << recall_at
                    << " from "
                       "truthset for query # "
                    << q << ". Will result in under-reported value of recall."
                    << std::endl;
      warned = true;
    }

    // Extend the accepted ground-truth prefix over distance ties.
    size_t truth_end = recall_at;
    if (gs_dist != nullptr) {
      truth_end = scanned - 1;
      float *dist_row = gs_dist + dim_gs * q;
      while (truth_end < dim_gs &&
             dist_row[truth_end] == dist_row[scanned - 1])
        truth_end++;
    }

    truth_ids.insert(truth_row, truth_row + truth_end);
    result_ids.insert(result_row, result_row + recall_at);

    unsigned hits = 0;
    for (auto &id : result_ids) {
      if (truth_ids.find(id) != truth_ids.end()) {
        hits++;
      }
    }
    hits_sum += hits;
  }

  return ((double) (hits_sum / (num_queries))) *
         ((double) (100.0 / recall_at));
}
double calculate_range_search_recall(
unsigned num_queries, std::vector<std::vector<_u32>> &groundtruth,
std::vector<std::vector<_u32>> &our_results) {
@ -130,12 +284,20 @@ namespace diskann {
if (files.fileExists(cache_warmup_file)) {
diskann::load_aligned_bin<T>(files, cache_warmup_file, warmup, warmup_num,
file_dim, file_aligned_dim);
diskann::cout << "In the warmup file: " << cache_warmup_file
<< " File dim: " << file_dim
<< " File aligned dim: " << file_aligned_dim
<< " Expected dim: " << warmup_dim
<< " Expected aligned dim: " << warmup_aligned_dim
<< std::endl;
if (file_dim != warmup_dim || file_aligned_dim != warmup_aligned_dim) {
std::stringstream stream;
stream << "Mismatched dimensions in sample file. file_dim = "
<< file_dim << " file_aligned_dim: " << file_aligned_dim
<< " index_dim: " << warmup_dim
<< " index_aligned_dim: " << warmup_aligned_dim << std::endl;
diskann::cerr << stream.str();
throw diskann::ANNException(stream.str(), -1);
}
} else {
@ -247,18 +409,20 @@ namespace diskann {
std::vector<cached_ifstream> vamana_readers(nshards);
for (_u64 i = 0; i < nshards; i++) {
vamana_readers[i].open(vamana_names[i], 1024 * 1048576);
size_t actual_file_size = get_file_size(vamana_names[i]);
// size_t actual_file_size = get_file_size(vamana_names[i]);
size_t expected_file_size;
vamana_readers[i].read((char *) &expected_file_size, sizeof(uint64_t));
if (actual_file_size != expected_file_size) {
std::stringstream stream;
stream << "Error in Vamana Index file " << vamana_names[i]
<< " Actual file size: " << actual_file_size
<< " does not match expected file size: " << expected_file_size
<< std::endl;
throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__,
__LINE__);
}
/* if (actual_file_size != expected_file_size) {
std::stringstream stream;
stream << "Error in Vamana Index file " << vamana_names[i]
<< " Actual file size: " << actual_file_size
<< " does not match expected file size: " <<
expected_file_size
<< std::endl;
throw diskann::ANNException(stream.str(), -1, __FUNCSIG__,
__FILE__,
__LINE__);
} */
}
size_t merged_index_size = 16;
@ -379,7 +543,7 @@ namespace diskann {
diskann::get_bin_metadata(base_file, base_num, base_dim);
double full_index_ram =
ESTIMATE_RAM_USAGE(base_num, base_dim, sizeof(T), R);
estimate_ram_usage(base_num, base_dim, sizeof(T), R);
if (full_index_ram < ram_budget * 1024 * 1024 * 1024) {
diskann::cout << "Full index fits in RAM budget, should consume at most "
<< full_index_ram / (1024 * 1024 * 1024)
@ -394,9 +558,10 @@ namespace diskann {
paras.Set<std::string>("save_path", mem_index_path);
std::unique_ptr<diskann::Index<T>> _pvamanaIndex =
std::unique_ptr<diskann::Index<T>>(
new diskann::Index<T>(compareMetric, base_file.c_str()));
_pvamanaIndex->build(paras);
std::unique_ptr<diskann::Index<T>>(new diskann::Index<T>(
compareMetric, base_dim, base_num, false, false, false));
_pvamanaIndex->build(base_file.c_str(), base_num, paras);
_pvamanaIndex->save(mem_index_path.c_str());
std::remove(medoids_file.c_str());
std::remove(centroids_file.c_str());
@ -432,10 +597,13 @@ namespace diskann {
paras.Set<bool>("saturate_graph", 1);
paras.Set<std::string>("save_path", shard_index_file);
_u64 shard_base_dim, shard_base_pts;
get_bin_metadata(shard_base_file, shard_base_pts, shard_base_dim);
std::unique_ptr<diskann::Index<T>> _pvamanaIndex =
std::unique_ptr<diskann::Index<T>>(
new diskann::Index<T>(compareMetric, shard_base_file.c_str()));
_pvamanaIndex->build(paras);
std::unique_ptr<diskann::Index<T>>(new diskann::Index<T>(
compareMetric, shard_base_dim, shard_base_pts, false,
false)); // TODO: Single?
_pvamanaIndex->build(shard_base_file.c_str(), shard_base_pts, paras);
_pvamanaIndex->save(shard_index_file.c_str());
std::remove(shard_base_file.c_str());
// wait_for_keystroke();
@ -766,7 +934,7 @@ namespace diskann {
size_t train_size, train_dim;
float *train_data;
double p_val = ((double) TRAINING_SET_SIZE / (double) points_num);
double p_val = ((double) MAX_PQ_TRAINING_SET_SIZE / (double) points_num);
// generates random sample and sets it to train_data and updates
// train_size
gen_random_slice<T>(data_file_to_use.c_str(), p_val, train_data, train_size,

497
src/distance.cpp Normal file
Просмотреть файл

@ -0,0 +1,497 @@
// TODO
// CHECK COSINE ON LINUX
#ifdef _WINDOWS
#include <immintrin.h>
#include <smmintrin.h>
#include <tmmintrin.h>
#include <intrin.h>
#else
#include <immintrin.h>
#endif
#include "simd_utils.h"
#include <cosine_similarity.h>
#include <iostream>
#include "distance.h"
#include "logger.h"
#include "ann_exception.h"
namespace diskann {
// Cosine similarity.
// Cosine comparison of two int8 vectors with `length` components each.
// The work is delegated to a platform-specific helper: compute_cosine_similarity
// everywhere except Windows, where CosineSimilarity2<int8_t> is used.
// NOTE(review): whether the helpers return a similarity or a distance is not
// visible here -- confirm against their definitions.
float DistanceCosineInt8::compare(const int8_t *a, const int8_t *b,
                                  uint32_t length) const {
#ifndef _WINDOWS
  return diskann::compute_cosine_similarity(a, b, length);
#else
  return diskann::CosineSimilarity2<int8_t>(a, b, length);
#endif
}
// Cosine comparison of two float vectors with `length` components each.
// The work is delegated to a platform-specific helper: compute_cosine_similarity
// everywhere except Windows, where CosineSimilarity2<float> is used.
// NOTE(review): whether the helpers return a similarity or a distance is not
// visible here -- confirm against their definitions.
float DistanceCosineFloat::compare(const float *a, const float *b,
                                   uint32_t length) const {
#ifndef _WINDOWS
  return diskann::compute_cosine_similarity(a, b, length);
#else
  return diskann::CosineSimilarity2<float>(a, b, length);
#endif
}
// Scalar cosine distance between two uint8 vectors.
//
// @param a, b     Input vectors, `length` components each.
// @param length   Number of components.
// @return         1 - (a . b) / (|a| * |b|). If either vector is all zeros
//                 the quotient is 0/0 and the result is NaN.
float SlowDistanceCosineUInt8::compare(const uint8_t *a, const uint8_t *b,
                                       uint32_t length) const {
  // 64-bit accumulators: each term is at most 255 * 255, so a 32-bit signed
  // sum would wrap (and turn negative) once length exceeds roughly 33,000.
  uint64_t magA = 0, magB = 0, scalarProduct = 0;
  for (uint32_t i = 0; i < length; i++) {
    const uint64_t x = a[i];
    const uint64_t y = b[i];
    magA += x * x;
    magB += y * y;
    scalarProduct += x * y;
  }
  // cosine distance == 1 - cosine similarity
  return 1.0f - (float) ((double) scalarProduct /
                         (sqrt((double) magA) * sqrt((double) magB)));
}
// L2 distance functions.
// Squared L2 distance between two int8 vectors of `size` components,
// returned as a float.
//
// Three code paths are selected at compile time:
//   * _WINDOWS && USE_AVX2 : hand-written SIMD loop (see notes below);
//   * _WINDOWS only        : scalar loop with an OpenMP simd hint;
//   * everything else      : the same scalar loop.
float DistanceL2Int8::compare(const int8_t *a, const int8_t *b,
uint32_t size) const {
int32_t result = 0;
#ifdef _WINDOWS
#ifdef USE_AVX2
__m256 r = _mm256_setzero_ps();
char * pX = (char *) a, *pY = (char *) b;
// Main loop: 32 components per iteration. _mm256_subs_epi8 is a *saturating*
// subtract, so per-component differences are clamped to [-128, 127]; the
// scalar paths below widen to int16 first and do not clamp.
// NOTE(review): _mm256_mul_epi8 is not a standard intrinsic -- presumably a
// helper from simd_utils.h; confirm what it returns.
while (size >= 32) {
__m256i r1 = _mm256_subs_epi8(_mm256_loadu_si256((__m256i *) pX),
_mm256_loadu_si256((__m256i *) pY));
r = _mm256_add_ps(r, _mm256_mul_epi8(r1, r1));
pX += 32;
pY += 32;
size -= 32;
}
// Tail loop: each pass loads 16 bytes from both inputs but advances only 4.
// NOTE(review): `size` is unsigned, so `size -= 4` wraps if the remaining
// count is not a multiple of 4, and the 16-byte loads can read past the
// logical end of the vectors -- presumably callers pass padded buffers and
// _mm256_mul32_pi8 (also non-standard) uses only the first 4 lanes; confirm.
while (size > 0) {
__m128i r2 = _mm_subs_epi8(_mm_loadu_si128((__m128i *) pX),
_mm_loadu_si128((__m128i *) pY));
r = _mm256_add_ps(r, _mm256_mul32_pi8(r2, r2));
pX += 4;
pY += 4;
size -= 4;
}
// Horizontal reduction; the total is read from lane 0 and lane 4.
r = _mm256_hadd_ps(_mm256_hadd_ps(r, r), r);
return r.m256_f32[0] + r.m256_f32[4];
#else
// The `aligned(a, b : 8)` clause asserts to the compiler that both pointers
// are 8-byte aligned.
#pragma omp simd reduction(+ : result) aligned(a, b : 8)
for (_s32 i = 0; i < (_s32) size; i++) {
result += ((int32_t)((int16_t) a[i] - (int16_t) b[i])) *
((int32_t)((int16_t) a[i] - (int16_t) b[i]));
}
return (float) result;
#endif
#else
// Non-Windows: scalar loop; differences are widened to int16 before squaring.
#pragma omp simd reduction(+ : result) aligned(a, b : 8)
for (int32_t i = 0; i < (int32_t) size; i++) {
result += ((int32_t)((int16_t) a[i] - (int16_t) b[i])) *
((int32_t)((int16_t) a[i] - (int16_t) b[i]));
}
return (float) result;
#endif
}
// Squared L2 distance between two uint8 vectors of `size` components.
// Each difference is widened to a signed type before squaring; the squares
// are summed in an unsigned 32-bit accumulator and returned as a float.
float DistanceL2UInt8::compare(const uint8_t *a, const uint8_t *b,
                               uint32_t size) const {
  uint32_t result = 0;
#ifndef _WINDOWS
#pragma omp simd reduction(+ : result) aligned(a, b : 8)
#endif
  for (int32_t i = 0; i < (int32_t) size; i++) {
    const int32_t delta = (int32_t)((int16_t) a[i] - (int16_t) b[i]);
    result += delta * delta;
  }
  return (float) result;
}
// Squared L2 distance between two float vectors of `size` components.
//
// Off Windows the pointers are declared 32-byte aligned to the compiler.
// With USE_AVX2 the vectors are consumed 8 floats at a time using aligned
// loads, so `size` is assumed to be a multiple of 8 (any remainder is not
// visited); otherwise a plain scalar loop is used.
float DistanceL2Float::compare(const float *a, const float *b,
                               uint32_t size) const {
#ifndef _WINDOWS
  a = (const float *) __builtin_assume_aligned(a, 32);
  b = (const float *) __builtin_assume_aligned(b, 32);
#endif
  float result = 0;
#ifdef USE_AVX2
  // assume size is divisible by 8
  const uint16_t niters = (uint16_t)(size / 8);
  __m256         sum = _mm256_setzero_ps();
  for (uint16_t j = 0; j < niters; j++) {
    // Current 8-float window of each vector: a[8j:8j+7], b[8j:8j+7].
    const float *a_win = a + 8 * j;
    const float *b_win = b + 8 * j;
    // Prefetch the next window unless this is the last one.
    if (j < (niters - 1)) {
      _mm_prefetch((char *) (a_win + 8), _MM_HINT_T0);
      _mm_prefetch((char *) (b_win + 8), _MM_HINT_T0);
    }
    const __m256 a_vec = _mm256_load_ps(a_win);
    const __m256 b_vec = _mm256_load_ps(b_win);
    const __m256 delta = _mm256_sub_ps(a_vec, b_vec);
    // sum += delta * delta
    sum = _mm256_fmadd_ps(delta, delta, sum);
  }
  // Collapse the 8 partial sums into one scalar.
  result = _mm256_reduce_add_ps(sum);
#else
#ifndef _WINDOWS
#pragma omp simd reduction(+ : result) aligned(a, b : 32)
#endif
  for (int32_t i = 0; i < (int32_t) size; i++) {
    result += (a[i] - b[i]) * (a[i] - b[i]);
  }
#endif
  return result;
}
// Reference (non-SIMD) squared L2 distance between two float vectors of
// `length` components, accumulated left to right in single precision.
float SlowDistanceL2Float::compare(const float *a, const float *b,
                                   uint32_t length) const {
  float        acc = 0.0f;
  const float *pa = a;
  const float *pb = b;
  for (uint32_t remaining = length; remaining != 0; --remaining, ++pa, ++pb) {
    acc += (*pa - *pb) * (*pa - *pb);
  }
  return acc;
}
#ifdef _WINDOWS
// Windows-only SSE implementation of the squared L2 distance between two
// int8 vectors of `length` components.
//
// Layout: a main loop over 16-component groups, followed by optional 8- and
// 4-component tail steps. _mm_subs_epi8 is a saturating subtract, so each
// per-component difference is clamped to [-128, 127].
// NOTE(review): _mm_mul_epi8, _mm_mulhi_epi8 and _mm_mulhi_epi8_shift32 are
// not standard intrinsics -- presumably helpers from simd_utils.h; their lane
// selection determines whether the tail steps below are correct. Confirm.
float AVXDistanceL2Int8::compare(const int8_t *a, const int8_t *b,
uint32_t length) const {
__m128 r = _mm_setzero_ps();
__m128i r1;
// Main loop. _mm_load_si128 is an aligned load: both pointers must be
// 16-byte aligned on entry.
while (length >= 16) {
r1 = _mm_subs_epi8(_mm_load_si128((__m128i *) a),
_mm_load_si128((__m128i *) b));
r = _mm_add_ps(r, _mm_mul_epi8(r1));
a += 16;
b += 16;
length -= 16;
}
r = _mm_hadd_ps(_mm_hadd_ps(r, r), r);
float res = r.m128_f32[0];
// 8-component tail: loads the 16-byte window starting 8 bytes *before* the
// cursor, i.e. the last 8 already-processed bytes plus the next 8.
// NOTE(review): if `a` was 16-byte aligned, `a - 8` is not, which violates
// the alignment requirement of _mm_load_si128; and if the main loop never
// ran, `a - 8` points before the start of the vector. Confirm intent.
if (length >= 8) {
__m128 r2 = _mm_setzero_ps();
__m128i r3 = _mm_subs_epi8(_mm_load_si128((__m128i *) (a - 8)),
_mm_load_si128((__m128i *) (b - 8)));
r2 = _mm_add_ps(r2, _mm_mulhi_epi8(r3));
a += 8;
b += 8;
length -= 8;
r2 = _mm_hadd_ps(_mm_hadd_ps(r2, r2), r2);
res += r2.m128_f32[0];
}
// 4-component tail: same overlapping-window idea with a 12-byte back-off;
// the same alignment/underflow caveats apply. Components beyond the last
// multiple of 4 are not visited.
if (length >= 4) {
__m128 r2 = _mm_setzero_ps();
__m128i r3 = _mm_subs_epi8(_mm_load_si128((__m128i *) (a - 12)),
_mm_load_si128((__m128i *) (b - 12)));
r2 = _mm_add_ps(r2, _mm_mulhi_epi8_shift32(r3));
res += r2.m128_f32[0] + r2.m128_f32[1];
}
return res;
}
float AVXDistanceL2Float::compare(const float *a, const float *b,
uint32_t length) const {
__m128 diff, v1, v2;
__m128 sum = _mm_set1_ps(0);
while (length >= 4) {
v1 = _mm_loadu_ps(a);
a += 4;
v2 = _mm_loadu_ps(b);
b += 4;
diff = _mm_sub_ps(v1, v2);
sum = _mm_add_ps(sum, _mm_mul_ps(diff, diff));
length -= 4;
}
return sum.m128_f32[0] + sum.m128_f32[1] + sum.m128_f32[2] +
sum.m128_f32[3];
}
#else
// Non-Windows build of AVXDistanceL2Int8::compare.
//
// The SSE implementation is only compiled on Windows. Previously this variant
// returned 0 for every pair of vectors, which silently made all points look
// identical if the class was ever selected; it now computes the exact squared
// L2 distance with a portable scalar loop.
//
// @param a, b     Input vectors, `length` components each.
// @param length   Number of components.
// @return         Sum over i of (a[i] - b[i])^2, as a float.
float AVXDistanceL2Int8::compare(const int8_t *a, const int8_t *b,
                                 uint32_t length) const {
  // 64-bit accumulator: a single term can reach 255^2, so a 32-bit sum could
  // overflow for very long vectors.
  int64_t result = 0;
  for (uint32_t i = 0; i < length; i++) {
    const int32_t diff = (int32_t) a[i] - (int32_t) b[i];
    result += diff * diff;
  }
  return (float) result;
}
// Non-Windows build of AVXDistanceL2Float::compare.
//
// The SSE implementation is only compiled on Windows. Previously this variant
// returned 0 for every pair of vectors, which silently made all points look
// identical if the class was ever selected; it now computes the squared L2
// distance with a portable scalar loop.
//
// @param a, b     Input vectors, `length` components each.
// @param length   Number of components.
// @return         Sum over i of (a[i] - b[i])^2.
float AVXDistanceL2Float::compare(const float *a, const float *b,
                                  uint32_t length) const {
  float result = 0.0f;
  for (uint32_t i = 0; i < length; i++) {
    const float diff = a[i] - b[i];
    result += diff * diff;
  }
  return result;
}
#endif
// Dot product of two vectors of `size` components.
//
// Only floating-point element types are supported: for any other T an error
// is logged and diskann::ANNException is thrown.
//
// Implementation is chosen at compile time:
//   * GCC-compatible compiler + USE_AVX2 : 8-wide AVX loop. It reads
//     (size + 7) & ~7 floats from each input, so buffers must be padded to a
//     multiple of 8 floats and the padding must be zero.
//   * GCC-compatible compiler + __SSE2__ : 4-wide SSE loop; same requirement
//     with a multiple of 4.
//   * anything else                      : portable scalar loop.
//
// @param a, b   Input vectors.
// @param size   Number of components.
// @return       Sum over i of a[i] * b[i].
// @throws diskann::ANNException when T is not a floating-point type.
template<typename T>
float DistanceInnerProduct<T>::inner_product(const T *a, const T *b,
                                             unsigned size) const {
  if (!std::is_floating_point<T>::value) {
    diskann::cerr << "ERROR: Inner Product only defined for float currently."
                  << std::endl;
    throw diskann::ANNException(
        "ERROR: Inner Product only defined for float currently.", -1,
        __FUNCSIG__, __FILE__, __LINE__);
  }

  float result = 0;

#if defined(__GNUC__) && defined(USE_AVX2)
#define AVX_DOT(addr1, addr2, dest, tmp1, tmp2) \
  tmp1 = _mm256_loadu_ps(addr1);                \
  tmp2 = _mm256_loadu_ps(addr2);                \
  tmp1 = _mm256_mul_ps(tmp1, tmp2);             \
  dest = _mm256_add_ps(dest, tmp1);

  __m256   sum;
  __m256   l0, l1;
  __m256   r0, r1;
  unsigned D = (size + 7) & ~7U;  // size rounded up to a multiple of 8
  unsigned DR = D % 16;           // 0 or 8: odd trailing group of 8
  unsigned DD = D - DR;           // part handled 16 floats at a time
  const float *l = (float *) a;
  const float *r = (float *) b;
  const float *e_l = l + DD;
  const float *e_r = r + DD;
  float unpack[8] __attribute__((aligned(32))) = {0, 0, 0, 0, 0, 0, 0, 0};

  sum = _mm256_loadu_ps(unpack);
  if (DR) {
    AVX_DOT(e_l, e_r, sum, l0, r0);
  }

  for (unsigned i = 0; i < DD; i += 16, l += 16, r += 16) {
    AVX_DOT(l, r, sum, l0, r0);
    AVX_DOT(l + 8, r + 8, sum, l1, r1);
  }
  _mm256_storeu_ps(unpack, sum);
  result = unpack[0] + unpack[1] + unpack[2] + unpack[3] + unpack[4] +
           unpack[5] + unpack[6] + unpack[7];

#elif defined(__GNUC__) && defined(__SSE2__)
// The 128-bit float intrinsics are spelled _mm_*; there is no _mm128_* family.
#define SSE_DOT(addr1, addr2, dest, tmp1, tmp2) \
  tmp1 = _mm_loadu_ps(addr1);                   \
  tmp2 = _mm_loadu_ps(addr2);                   \
  tmp1 = _mm_mul_ps(tmp1, tmp2);                \
  dest = _mm_add_ps(dest, tmp1);

  __m128   sum;
  __m128   l0, l1, l2, l3;
  __m128   r0, r1, r2, r3;
  unsigned D = (size + 3) & ~3U;  // size rounded up to a multiple of 4
  unsigned DR = D % 16;           // 0, 4, 8 or 12 trailing floats
  unsigned DD = D - DR;           // part handled 16 floats at a time
  // Explicit casts so the template also compiles for non-float T (those
  // instantiations throw above and never reach this code).
  const float *l = (const float *) a;
  const float *r = (const float *) b;
  const float *e_l = l + DD;
  const float *e_r = r + DD;
  float unpack[4] __attribute__((aligned(16))) = {0, 0, 0, 0};

  sum = _mm_load_ps(unpack);
  switch (DR) {
    case 12:
      SSE_DOT(e_l + 8, e_r + 8, sum, l2, r2);
      // fall through
    case 8:
      SSE_DOT(e_l + 4, e_r + 4, sum, l1, r1);
      // fall through
    case 4:
      SSE_DOT(e_l, e_r, sum, l0, r0);
      break;
    default:
      break;
  }
  for (unsigned i = 0; i < DD; i += 16, l += 16, r += 16) {
    SSE_DOT(l, r, sum, l0, r0);
    SSE_DOT(l + 4, r + 4, sum, l1, r1);
    SSE_DOT(l + 8, r + 8, sum, l2, r2);
    SSE_DOT(l + 12, r + 12, sum, l3, r3);
  }
  _mm_storeu_ps(unpack, sum);
  result += unpack[0] + unpack[1] + unpack[2] + unpack[3];

#else
  // Portable fallback (also used by non-GNU compilers, which previously got
  // a constant 0 from this function).
  unsigned i = 0;
  /* Process 4 items with each loop for efficiency. */
  for (; size - i >= 4; i += 4) {
    const float dot0 = (float) a[i] * (float) b[i];
    const float dot1 = (float) a[i + 1] * (float) b[i + 1];
    const float dot2 = (float) a[i + 2] * (float) b[i + 2];
    const float dot3 = (float) a[i + 3] * (float) b[i + 3];
    result += dot0 + dot1 + dot2 + dot3;
  }
  /* Process last 0-3 items. Not needed for standard vector lengths. */
  for (; i < size; i++) {
    result += (float) a[i] * (float) b[i];
  }
#endif

  return result;
}
// "Fast L2" comparison: returns norm - 2 * <a, b>, where <a, b> is obtained
// from DistanceInnerProduct<T>::inner_product and `norm` is supplied by the
// caller.
template<typename T>
float DistanceFastL2<T>::compare(const T *a, const T *b, float norm,
                                 unsigned size) const {
  const float dot = DistanceInnerProduct<T>::inner_product(a, b, size);
  return -2 * dot + norm;
}
// Squared Euclidean norm of a vector of `size` components.
//
// Only floating-point element types are supported: for any other T an error
// is logged and diskann::ANNException is thrown.
//
// Implementation is chosen at compile time:
//   * GCC-compatible compiler + __AVX__  : 8-wide AVX loop. It reads
//     (size + 7) & ~7 floats, so the buffer must be padded to a multiple of
//     8 floats and the padding must be zero.
//   * GCC-compatible compiler + __SSE2__ : 4-wide SSE loop; same requirement
//     with a multiple of 4.
//   * anything else                      : portable scalar loop.
//
// @param a      Input vector.
// @param size   Number of components.
// @return       Sum over i of a[i]^2.
// @throws diskann::ANNException when T is not a floating-point type.
template<typename T>
float DistanceFastL2<T>::norm(const T *a, unsigned size) const {
  if (!std::is_floating_point<T>::value) {
    diskann::cerr << "ERROR: FastL2 only defined for float currently."
                  << std::endl;
    throw diskann::ANNException(
        "ERROR: FastL2 only defined for float currently.", -1, __FUNCSIG__,
        __FILE__, __LINE__);
  }

  float result = 0;

#if defined(__GNUC__) && defined(__AVX__)
#define AVX_L2NORM(addr, dest, tmp) \
  tmp = _mm256_loadu_ps(addr);      \
  tmp = _mm256_mul_ps(tmp, tmp);    \
  dest = _mm256_add_ps(dest, tmp);

  __m256   sum;
  __m256   l0, l1;
  unsigned D = (size + 7) & ~7U;  // size rounded up to a multiple of 8
  unsigned DR = D % 16;           // 0 or 8: odd trailing group of 8
  unsigned DD = D - DR;           // part handled 16 floats at a time
  const float *l = (float *) a;
  const float *e_l = l + DD;
  float unpack[8] __attribute__((aligned(32))) = {0, 0, 0, 0, 0, 0, 0, 0};

  sum = _mm256_loadu_ps(unpack);
  if (DR) {
    AVX_L2NORM(e_l, sum, l0);
  }
  for (unsigned i = 0; i < DD; i += 16, l += 16) {
    AVX_L2NORM(l, sum, l0);
    AVX_L2NORM(l + 8, sum, l1);
  }
  _mm256_storeu_ps(unpack, sum);
  result = unpack[0] + unpack[1] + unpack[2] + unpack[3] + unpack[4] +
           unpack[5] + unpack[6] + unpack[7];

#elif defined(__GNUC__) && defined(__SSE2__)
// The 128-bit float intrinsics are spelled _mm_*; there is no _mm128_* family.
#define SSE_L2NORM(addr, dest, tmp) \
  tmp = _mm_loadu_ps(addr);         \
  tmp = _mm_mul_ps(tmp, tmp);       \
  dest = _mm_add_ps(dest, tmp);

  __m128   sum;
  __m128   l0, l1, l2, l3;
  unsigned D = (size + 3) & ~3U;  // size rounded up to a multiple of 4
  unsigned DR = D % 16;           // 0, 4, 8 or 12 trailing floats
  unsigned DD = D - DR;           // part handled 16 floats at a time
  // Explicit cast so the template also compiles for non-float T (those
  // instantiations throw above and never reach this code).
  const float *l = (const float *) a;
  const float *e_l = l + DD;
  float unpack[4] __attribute__((aligned(16))) = {0, 0, 0, 0};

  sum = _mm_load_ps(unpack);
  switch (DR) {
    case 12:
      SSE_L2NORM(e_l + 8, sum, l2);
      // fall through
    case 8:
      SSE_L2NORM(e_l + 4, sum, l1);
      // fall through
    case 4:
      SSE_L2NORM(e_l, sum, l0);
      break;
    default:
      break;
  }
  for (unsigned i = 0; i < DD; i += 16, l += 16) {
    SSE_L2NORM(l, sum, l0);
    SSE_L2NORM(l + 4, sum, l1);
    SSE_L2NORM(l + 8, sum, l2);
    SSE_L2NORM(l + 12, sum, l3);
  }
  _mm_storeu_ps(unpack, sum);
  result += unpack[0] + unpack[1] + unpack[2] + unpack[3];

#else
  // Portable fallback (also used by non-GNU compilers, which previously got
  // a constant 0 from this function).
  unsigned i = 0;
  /* Process 4 items with each loop for efficiency. */
  for (; size - i >= 4; i += 4) {
    const float dot0 = (float) a[i] * (float) a[i];
    const float dot1 = (float) a[i + 1] * (float) a[i + 1];
    const float dot2 = (float) a[i + 2] * (float) a[i + 2];
    const float dot3 = (float) a[i + 3] * (float) a[i + 3];
    result += dot0 + dot1 + dot2 + dot3;
  }
  /* Process last 0-3 items. Not needed for standard vector lengths. */
  for (; i < size; i++) {
    result += (float) a[i] * (float) a[i];
  }
#endif

  return result;
}
// Negated dot product of two float vectors, computed with 8-wide AVX
// multiplies and adds on unaligned loads.
//
// The loops read (size + 7) & ~7 floats from each input, so both buffers
// must be padded to a multiple of 8 floats with zeros.
float AVXDistanceInnerProductFloat::compare(const float *a, const float *b,
                                            uint32_t size) const {
#define AVX_DOT(addr1, addr2, dest, tmp1, tmp2) \
  tmp1 = _mm256_loadu_ps(addr1);                \
  tmp2 = _mm256_loadu_ps(addr2);                \
  tmp1 = _mm256_mul_ps(tmp1, tmp2);             \
  dest = _mm256_add_ps(dest, tmp1);

  const unsigned padded = (size + 7) & ~7U;  // size rounded up to 8
  const unsigned odd_tail = padded % 16;     // 0 or 8
  const unsigned paired = padded - odd_tail;  // handled 16 floats at a time

  const float *lhs = (float *) a;
  const float *rhs = (float *) b;
  const float *lhs_tail = lhs + paired;
  const float *rhs_tail = rhs + paired;

  __m256 acc = _mm256_setzero_ps();
  __m256 lt0, lt1;
  __m256 rt0, rt1;

  // The odd trailing group of 8 (if any) is accumulated first.
  if (odd_tail) {
    AVX_DOT(lhs_tail, rhs_tail, acc, lt0, rt0);
  }

  unsigned done = 0;
  while (done < paired) {
    AVX_DOT(lhs, rhs, acc, lt0, rt0);
    AVX_DOT(lhs + 8, rhs + 8, acc, lt1, rt1);
    done += 16;
    lhs += 16;
    rhs += 16;
  }

  alignas(32) float lanes[8] = {0, 0, 0, 0, 0, 0, 0, 0};
  _mm256_storeu_ps(lanes, acc);
  const float dot = lanes[0] + lanes[1] + lanes[2] + lanes[3] + lanes[4] +
                    lanes[5] + lanes[6] + lanes[7];

  // Negated so that a larger inner product yields a smaller value.
  return -dot;
}
// Explicit instantiations of the class templates whose members are defined in
// this translation unit, one per supported element type (float, int8_t,
// uint8_t), so that their code is generated here.
// NOTE(review): DISKANN_DLLEXPORT presumably marks them for export from the
// DLL build -- confirm against windows_customizations.h.
template DISKANN_DLLEXPORT class DistanceInnerProduct<float>;
template DISKANN_DLLEXPORT class DistanceInnerProduct<int8_t>;
template DISKANN_DLLEXPORT class DistanceInnerProduct<uint8_t>;
template DISKANN_DLLEXPORT class DistanceFastL2<float>;
template DISKANN_DLLEXPORT class DistanceFastL2<int8_t>;
template DISKANN_DLLEXPORT class DistanceFastL2<uint8_t>;
} // namespace diskann

Просмотреть файл

@ -2,7 +2,7 @@
# Licensed under the MIT license.
add_library(diskann_dll SHARED dllmain.cpp ../partition_and_pq.cpp ../pq_flash_index.cpp ../logger.cpp ../utils.cpp
../windows_aligned_file_reader.cpp ../memory_mapper.cpp ../index.cpp ../math_utils.cpp ../aux_utils.cpp ../ann_exception.cpp)
../windows_aligned_file_reader.cpp ../distance.cpp ../memory_mapper.cpp ../index.cpp ../math_utils.cpp ../aux_utils.cpp ../ann_exception.cpp)
if (MSVC)
add_definitions(-D_USRDLL -D_WINDLL -DDISKANN_DLL)
add_compile_options(/MD)
@ -11,8 +11,8 @@ if (MSVC)
target_link_options(diskann_dll PRIVATE $<$<CONFIG:Debug>:/IMPLIB:${CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG}/diskann_dll.lib>
$<$<CONFIG:Release>:/IMPLIB:${CMAKE_LIBRARY_OUTPUT_DIRECTORY_RELEASE}/diskann_dll.lib>
)
target_link_libraries(diskann_dll debug ${PROJECT_SOURCE_DIR}/dependencies/windows/tcmalloc/libtcmalloc_minimal.lib)
target_link_libraries(diskann_dll optimized ${PROJECT_SOURCE_DIR}/dependencies/windows/tcmalloc/libtcmalloc_minimal.lib)
target_link_libraries(diskann_dll debug ${PROJECT_SOURCE_DIR}/gperftools/x64/Release-Patch/libtcmalloc_minimal.lib)
target_link_libraries(diskann_dll optimized ${PROJECT_SOURCE_DIR}/gperftools/x64/Release-Patch/libtcmalloc_minimal.lib)
add_custom_command(TARGET
@ -50,7 +50,7 @@ if (MSVC)
add_custom_command(TARGET
diskann_dll
POST_BUILD
COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_SOURCE_DIR}/dependencies/windows/tcmalloc/libtcmalloc_minimal.dll "$<$<CONFIG:debug>:\"${CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG}\">$<$<CONFIG:release>:\"${CMAKE_LIBRARY_OUTPUT_DIRECTORY_RELEASE}\">" )
COMMAND ${CMAKE_COMMAND} -E copy ${PROJECT_SOURCE_DIR}/gperftools/x64/Release-Patch/libtcmalloc_minimal.dll "$<$<CONFIG:debug>:\"${CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG}\">$<$<CONFIG:release>:\"${CMAKE_LIBRARY_OUTPUT_DIRECTORY_RELEASE}\">" )
endif()

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -995,7 +995,7 @@ int partition_with_ram_budget(const std::string data_file,
sampling_rate); // to account for the fact that p is the size
// of the shard over the testing sample.
double cur_shard_ram_estimate =
ESTIMATE_RAM_USAGE(p, train_dim, sizeof(T), graph_degree);
diskann::estimate_ram_usage(p, train_dim, sizeof(T), graph_degree);
if (cur_shard_ram_estimate > max_ram_usage)
max_ram_usage = cur_shard_ram_estimate;

Просмотреть файл

@ -97,7 +97,7 @@ namespace diskann {
this->dist_cmp = new DistanceL2UInt8();
if (Avx2SupportedCPU) {
diskann::cout << "Using AVX2 dist_cmp_float function." << std::endl;
this->dist_cmp_float = new DistanceL2();
this->dist_cmp_float = new DistanceL2Float();
} else if (AvxSupportedCPU) {
diskann::cout << "Using AVX dist_cmp_float function" << std::endl;
this->dist_cmp_float = new AVXDistanceL2Float();
@ -122,7 +122,7 @@ namespace diskann {
diskann::cout << "Using AVX2 function for dist_cmp and dist_cmp_float"
<< std::endl;
this->dist_cmp = new DistanceL2Int8();
this->dist_cmp_float = new DistanceL2();
this->dist_cmp_float = new DistanceL2Float();
} else if (AvxSupportedCPU) {
diskann::cout << "No AVX2 support. Switching to AVX routines for "
"dist_cmp, dist_cmp_float."
@ -152,8 +152,8 @@ namespace diskann {
if (Avx2SupportedCPU) {
diskann::cout << "Using AVX2 functions for dist_cmp and dist_cmp_float"
<< std::endl;
this->dist_cmp = new DistanceL2();
this->dist_cmp_float = new DistanceL2();
this->dist_cmp = new DistanceL2Float();
this->dist_cmp_float = new DistanceL2Float();
} else if (AvxSupportedCPU) {
diskann::cout << "No AVX2 support. Switching to AVX functions for "
"dist_cmp and dist_cmp_float."
@ -170,8 +170,8 @@ namespace diskann {
}
} else if (metric == diskann::Metric::INNER_PRODUCT) {
std::cout << "Using inner product distance function" << std::endl;
this->dist_cmp = new DistanceInnerProduct<float>();
this->dist_cmp_float = new DistanceInnerProduct<float>();
// this->dist_cmp = new DistanceInnerProduct<float>();
// this->dist_cmp_float = new DistanceInnerProduct<float>();
} else {
std::cout << "Unsupported metric type. Reverting to float." << std::endl;
this->dist_cmp = new AVXDistanceL2Float();

Просмотреть файл

@ -5,6 +5,9 @@
#include <stdio.h>
// The expression below evaluates to 2^30 bytes = 1 GiB; an earlier trailing
// remark on this line said "64MB", which does not match the value.
// NOTE(review): confirm whether 1 GiB or 64 MB is the intended request limit.
const uint32_t MAX_REQUEST_SIZE = 1024 * 1024 * 1024;
// NOTE(review): presumably the cap on concurrently outstanding read
// requests -- confirm at the point of use.
const uint32_t MAX_SIMULTANEOUS_READ_REQUESTS = 128;
#ifdef _WINDOWS
#include <intrin.h>
@ -51,10 +54,167 @@ bool cpuHasAvx2Support() {
}
#endif
// Global CPU-capability flags consulted when a distance implementation is
// selected.
//
// On Windows they are probed at start-up via cpuHasAvxSupport() /
// cpuHasAvx2Support(). On every other platform no probing is done: AVX2 is
// assumed to be available and the separate AVX-only path is disabled.
//
// (The previous text carried both sides of a diff: each flag was defined in
// two places and the opening #ifndef had no matching #endif.)
#ifdef _WINDOWS
bool AvxSupportedCPU = cpuHasAvxSupport();
bool Avx2SupportedCPU = cpuHasAvx2Support();
#else
bool Avx2SupportedCPU = true;
bool AvxSupportedCPU = false;
#endif
namespace diskann {
// Get the right distance function for the given metric.
// Factory for float vectors: returns a newly allocated distance object for
// metric `m`; the caller receives the raw pointer.
//
// L2 picks an implementation from the Avx2SupportedCPU / AvxSupportedCPU
// flags; COSINE, INNER_PRODUCT and FAST_L2 each map to one fixed class. Any
// other metric is reported on diskann::cerr and raised as ANNException.
template<>
diskann::Distance<float>* get_distance_function(diskann::Metric m) {
  if (m == diskann::Metric::L2) {
    if (Avx2SupportedCPU) {
      diskann::cout << "L2: Using AVX2 distance computation" << std::endl;
      return new diskann::DistanceL2Float();
    }
    if (AvxSupportedCPU) {
      diskann::cout
          << "L2: AVX2 not supported. Using AVX distance computation"
          << std::endl;
      return new diskann::AVXDistanceL2Float();
    }
    diskann::cout << "L2: Older CPU. Using slow distance computation"
                  << std::endl;
    return new diskann::SlowDistanceL2Float();
  }

  if (m == diskann::Metric::COSINE) {
    diskann::cout << "Cosine: Using either AVX or AVX2 implementation"
                  << std::endl;
    return new diskann::DistanceCosineFloat();
  }

  if (m == diskann::Metric::INNER_PRODUCT) {
    diskann::cout << "Inner product: Using AVX2 implementation" << std::endl;
    return new diskann::AVXDistanceInnerProductFloat();
  }

  if (m == diskann::Metric::FAST_L2) {
    return new diskann::DistanceFastL2<float>();
  }

  // Nothing matched: report and fail.
  std::stringstream stream;
  stream << "Only L2, cosine, and inner product supported for floating "
            "point vectors as of now. Email "
            "{gopalsr, harshasi, rakri}@microsoft.com if you need support "
            "for any other metric."
         << std::endl;
  diskann::cerr << stream.str() << std::endl;
  throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__,
                              __LINE__);
}
// Factory for int8 vectors: returns a newly allocated distance object for
// metric `m`; the caller receives the raw pointer.
//
// L2 picks an implementation from the Avx2SupportedCPU / AvxSupportedCPU
// flags; COSINE maps to DistanceCosineInt8. Any other metric is reported on
// diskann::cerr and raised as ANNException.
template<>
diskann::Distance<int8_t>* get_distance_function(diskann::Metric m) {
  if (m == diskann::Metric::L2) {
    if (Avx2SupportedCPU) {
      diskann::cout << "Using AVX2 distance computation" << std::endl;
      return new diskann::DistanceL2Int8();
    }
    if (AvxSupportedCPU) {
      diskann::cout << "AVX2 not supported. Using AVX distance computation"
                    << std::endl;
      return new diskann::AVXDistanceL2Int8();
    }
    diskann::cout << "Older CPU. Using slow distance computation"
                  << std::endl;
    return new diskann::SlowDistanceL2Int<int8_t>();
  }

  if (m == diskann::Metric::COSINE) {
    diskann::cout << "Using either AVX or AVX2 for Cosine similarity"
                  << std::endl;
    return new diskann::DistanceCosineInt8();
  }

  // Nothing matched: report and fail.
  std::stringstream stream;
  stream << "Only L2 and cosine supported for signed byte vectors as of "
            "now. Email "
            "{gopalsr, harshasi, rakri}@microsoft.com if you need support "
            "for any other metric."
         << std::endl;
  diskann::cerr << stream.str() << std::endl;
  throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__,
                              __LINE__);
}
// Factory for uint8 vectors: returns a newly allocated distance object for
// metric `m`; the caller receives the raw pointer.
//
// L2 maps to DistanceL2UInt8 (with a warning printed on Windows builds only)
// and COSINE to SlowDistanceCosineUInt8. Any other metric is reported on
// diskann::cerr and raised as ANNException.
template<>
diskann::Distance<uint8_t>* get_distance_function(diskann::Metric m) {
  if (m == diskann::Metric::L2) {
#ifdef _WINDOWS
    diskann::cout
        << "WARNING: AVX/AVX2 distance function not defined for Uint8. Using "
           "slow version. "
           "Contact gopalsr@microsoft.com if you need AVX/AVX2 support."
        << std::endl;
#endif
    return new diskann::DistanceL2UInt8();
  }

  if (m == diskann::Metric::COSINE) {
    diskann::cout
        << "AVX/AVX2 distance function not defined for Uint8. Using "
           "slow version. "
           "Contact gopalsr@microsoft.com if you need AVX/AVX2 support."
        << std::endl;
    return new diskann::SlowDistanceCosineUInt8();
  }

  // Nothing matched: report and fail.
  std::stringstream stream;
  stream << "Only L2 and cosine supported for unsigned byte vectors as of "
            "now. Email "
            "{gopalsr, harshasi, rakri}@microsoft.com if you need support "
            "for any other metric."
         << std::endl;
  diskann::cerr << stream.str() << std::endl;
  throw diskann::ANNException(stream.str(), -1, __FUNCSIG__, __FILE__,
                              __LINE__);
}
// Reads `npts` vectors of `ndims` floats from `readr` into `read_buf`,
// rescales every vector to (approximately) unit L2 norm in place, and writes
// the block to `writr`.
//
// The squared norm is seeded with float epsilon, so an all-zero vector is
// divided by a tiny positive number rather than by zero. Vectors are
// processed in parallel; each iteration touches only its own row.
//
// @param writr     Output stream, already positioned.
// @param readr     Input stream, already positioned.
// @param read_buf  Scratch buffer holding at least npts * ndims floats.
// @param npts      Number of vectors in this block.
// @param ndims     Components per vector.
void block_convert(std::ofstream& writr, std::ifstream& readr,
                   float* read_buf, _u64 npts, _u64 ndims) {
  readr.read((char*) read_buf, npts * ndims * sizeof(float));
  _u32 ndims_u32 = (_u32) ndims;
#pragma omp parallel for
  for (_s64 i = 0; i < (_s64) npts; i++) {
    float* row = read_buf + i * ndims;

    float sq_norm = std::numeric_limits<float>::epsilon();
    for (_u32 d = 0; d < ndims_u32; d++) {
      sq_norm += row[d] * row[d];
    }

    const float length = std::sqrt(sq_norm);
    for (_u32 d = 0; d < ndims_u32; d++) {
      row[d] = row[d] / length;
    }
  }
  writr.write((char*) read_buf, npts * ndims * sizeof(float));
}
void normalize_data_file(const std::string& inFileName,
const std::string& outFileName) {
std::ifstream readr(inFileName, std::ios::binary);
std::ofstream writr(outFileName, std::ios::binary);
int npts_s32, ndims_s32;
readr.read((char*) &npts_s32, sizeof(_s32));
readr.read((char*) &ndims_s32, sizeof(_s32));
writr.write((char*) &npts_s32, sizeof(_s32));
writr.write((char*) &ndims_s32, sizeof(_s32));
_u64 npts = (_u64) npts_s32, ndims = (_u64) ndims_s32;
diskann::cout << "Normalizing FLOAT vectors in file: " << inFileName
<< std::endl;
diskann::cout << "Dataset: #pts = " << npts << ", # dims = " << ndims
<< std::endl;
_u64 blk_size = 131072;
_u64 nblks = ROUND_UP(npts, blk_size) / blk_size;
diskann::cout << "# blks: " << nblks << std::endl;
float* read_buf = new float[npts * ndims];
for (_u64 i = 0; i < nblks; i++) {
_u64 cblk_size = std::min(npts - i * blk_size, blk_size);
block_convert(writr, readr, read_buf, cblk_size, ndims);
}
delete[] read_buf;
diskann::cout << "Wrote normalized points to file: " << outFileName
<< std::endl;
}
} // namespace diskann

Просмотреть файл

@ -6,8 +6,8 @@ set(CMAKE_CXX_STANDARD 14)
add_executable(build_memory_index build_memory_index.cpp )
if(MSVC)
target_link_options(build_memory_index PRIVATE /MACHINE:x64 /DEBUG:FULL "/INCLUDE:_tcmalloc")
target_link_libraries(build_memory_index debug ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG}/diskann_dll.lib ${PROJECT_SOURCE_DIR}/dependencies/windows/tcmalloc/libtcmalloc_minimal.lib)
target_link_libraries(build_memory_index optimized ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_RELEASE}/diskann_dll.lib ${PROJECT_SOURCE_DIR}/dependencies/windows/tcmalloc/libtcmalloc_minimal.lib)
target_link_libraries(build_memory_index debug ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG}/diskann_dll.lib ${PROJECT_SOURCE_DIR}/gperftools/x64/Release-Patch/libtcmalloc_minimal.lib)
target_link_libraries(build_memory_index optimized ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_RELEASE}/diskann_dll.lib ${PROJECT_SOURCE_DIR}/gperftools/x64/Release-Patch/libtcmalloc_minimal.lib)
else()
target_link_libraries(build_memory_index ${PROJECT_NAME} -ltcmalloc)
endif()
@ -15,8 +15,8 @@ endif()
add_executable(search_memory_index search_memory_index.cpp )
if(MSVC)
target_link_options(search_memory_index PRIVATE /MACHINE:x64 /DEBUG:FULL)
target_link_libraries(search_memory_index debug ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG}/diskann_dll.lib)
target_link_libraries(search_memory_index optimized ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_RELEASE}/diskann_dll.lib)
target_link_libraries(search_memory_index debug ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG}/diskann_dll.lib ${PROJECT_SOURCE_DIR}/gperftools/x64/Release-Patch/libtcmalloc_minimal.lib)
target_link_libraries(search_memory_index optimized ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_RELEASE}/diskann_dll.lib ${PROJECT_SOURCE_DIR}/gperftools/x64/Release-Patch/libtcmalloc_minimal.lib)
else()
target_link_libraries(search_memory_index ${PROJECT_NAME} aio -ltcmalloc)
endif()
@ -24,8 +24,8 @@ endif()
add_executable(build_disk_index build_disk_index.cpp )
if(MSVC)
target_link_options(build_disk_index PRIVATE /MACHINE:x64 /DEBUG:FULL "/INCLUDE:_tcmalloc")
target_link_libraries(build_disk_index debug ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG}/diskann_dll.lib ${PROJECT_SOURCE_DIR}/dependencies/windows/tcmalloc/libtcmalloc_minimal.lib)
target_link_libraries(build_disk_index optimized ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_RELEASE}/diskann_dll.lib ${PROJECT_SOURCE_DIR}/dependencies/windows/tcmalloc/libtcmalloc_minimal.lib)
target_link_libraries(build_disk_index debug ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG}/diskann_dll.lib ${PROJECT_SOURCE_DIR}/gperftools/x64/Release-Patch/libtcmalloc_minimal.lib)
target_link_libraries(build_disk_index optimized ${CMAKE_LIBRARY_OUTPUT_DIRECTORY_RELEASE}/diskann_dll.lib ${PROJECT_SOURCE_DIR}/gperftools/x64/Release-Patch/libtcmalloc_minimal.lib)
else()
target_link_libraries(build_disk_index ${PROJECT_NAME} -ltcmalloc aio)
endif()

Просмотреть файл

@ -16,7 +16,7 @@
#include "memory_mapper.h"
#include "ann_exception.h"
template<typename T>
template<typename T, typename TagT = uint32_t>
int build_in_memory_index(const std::string& data_path,
const diskann::Metric& metric, const unsigned R,
const unsigned L, const float alpha,
@ -31,9 +31,14 @@ int build_in_memory_index(const std::string& data_path,
paras.Set<bool>("saturate_graph", 0);
paras.Set<unsigned>("num_threads", num_threads);
diskann::Index<T> index(metric, data_path.c_str());
auto s = std::chrono::high_resolution_clock::now();
index.build(paras);
_u64 data_num, data_dim;
diskann::get_bin_metadata(data_path, data_num, data_dim);
diskann::Index<T, TagT> index(metric, data_dim, data_num, false, false,
false);
auto s = std::chrono::high_resolution_clock::now();
index.build(data_path.c_str(), data_num, paras);
std::chrono::duration<double> diff =
std::chrono::high_resolution_clock::now() - s;
@ -56,13 +61,15 @@ int main(int argc, char** argv) {
_u32 ctr = 2;
diskann::Metric metric;
if (std::string(argv[ctr]) == std::string("mips"))
if (std::string(argv[ctr]) == std::string("mips")) {
metric = diskann::Metric::INNER_PRODUCT;
else if (std::string(argv[ctr]) == std::string("l2"))
} else if (std::string(argv[ctr]) == std::string("l2")) {
metric = diskann::Metric::L2;
else {
std::cerr << "Unsupported distance function. Currently only L2/ Inner "
"Product support."
} else if (std::string(argv[ctr]) == std::string("cosine")) {
metric = diskann::Metric::COSINE;
}else {
std::cout << "Unsupported distance function. Currently only L2/ Inner "
"Product/Cosine are supported."
<< std::endl;
return -1;
}

Просмотреть файл

@ -309,14 +309,14 @@ int search_disk_index(int argc, char** argv) {
int main(int argc, char** argv) {
if (argc < 12) {
std::cerr << "Usage: " << argv[0]
std::cout << "Usage: " << argv[0]
<< " index_type<float/int8/uint8> dist_fn<l2/mips> "
"index_prefix_path num_nodes_to_cache "
"T(num_threads) W(beamwidth) "
"query_file.bin truthset.bin(\"null\" for none) "
"K result_output_prefix L1 L2 ..."
<< std::endl;
exit(-1);
return -1;
}
try {
if (std::string(argv[1]) == std::string("float"))

Просмотреть файл

@ -36,9 +36,11 @@ int search_memory_index(int argc, char** argv) {
metric = diskann::Metric::L2;
else if (std::string(argv[ctr]) == std::string("fast_l2"))
metric = diskann::Metric::FAST_L2;
else if (std::string(argv[ctr]) == std::string("cosine"))
metric = diskann::Metric::COSINE;
else {
std::cout << "Unsupported distance function. Currently only L2/ Inner "
"Product/FAST_L2 support."
"Product/FAST_L2/Cosine support."
<< std::endl;
return -1;
}
@ -55,6 +57,7 @@ int search_memory_index(int argc, char** argv) {
std::string data_file(argv[ctr++]);
std::string memory_index_file(argv[ctr++]);
_u64 max_points = std::atoi(argv[ctr++]);
_u64 num_threads = std::atoi(argv[ctr++]);
std::string query_bin(argv[ctr++]);
std::string truthset_bin(argv[ctr++]);
@ -64,10 +67,13 @@ int search_memory_index(int argc, char** argv) {
bool calc_recall_flag = false;
_u32 max_search_L = 0;
for (; ctr < (_u32) argc; ctr++) {
_u64 curL = std::atoi(argv[ctr]);
if (curL >= recall_at)
if (curL >= recall_at) {
Lvec.push_back(curL);
max_search_L = max_search_L > curL ? max_search_L : curL;
}
}
if (Lvec.size() == 0) {
@ -91,8 +97,11 @@ int search_memory_index(int argc, char** argv) {
std::cout.setf(std::ios_base::fixed, std::ios_base::floatfield);
std::cout.precision(2);
diskann::Index<T> index(metric, data_file.c_str());
index.load(memory_index_file.c_str()); // to load NSG
diskann::Index<T, uint32_t> index(metric, query_dim, max_points, false,
false);
index.load(memory_index_file.c_str(), num_threads,
max_search_L);
std::cout << "Index loaded" << std::endl;
if (metric == diskann::FAST_L2)
@ -172,17 +181,17 @@ int search_memory_index(int argc, char** argv) {
}
int main(int argc, char** argv) {
if (argc < 11) {
if (argc < 12) {
std::cout << "Usage: " << argv[0]
<< " index_type<float/int8/uint8> "
"dist_fn<l2/mips/fast_l2> "
"data_file.bin memory_index_path "
"data_file.bin memory_index_path max_points "
"T(num_threads) query_file.bin "
"truthset.bin(\"null\" for none) "
"K result_output_prefix "
"L1 L2 ... \n"
<< std::endl;
exit(-1);
return -1;
}
try {

Просмотреть файл

@ -12,7 +12,7 @@ void block_convert(std::ofstream& writer, float* write_buf,
for (_u64 i = 0; i < npts; i++) {
for (_u64 d = 0; d < ndims; d++) {
write_buf[d + i * ndims] =
(((float)read_buf[d + i * ndims] - bias) * scale);
(((float) read_buf[d + i * ndims] - bias) * scale);
}
}
writer.write((char*) write_buf, npts * ndims * sizeof(float));
@ -20,8 +20,8 @@ void block_convert(std::ofstream& writer, float* write_buf,
int main(int argc, char** argv) {
if (argc != 5) {
std::cout << "Usage: " << argv[0] << " input-int8.bin output-float.bin bias scale"
<< std::endl;
std::cout << "Usage: " << argv[0]
<< " input-int8.bin output-float.bin bias scale" << std::endl;
exit(-1);
}

40
tests/utils/rand_data_gen.cpp Executable file → Normal file
Просмотреть файл

@ -8,12 +8,12 @@
#include "utils.h"
int block_write_float(std::ofstream& writer, _u64 ndims, _u64 npts,
float norm) {
float norm) {
auto vec = new float[ndims];
std::random_device rd{};
std::mt19937 gen{rd()};
std::normal_distribution<> normal_rand{0,1};
std::normal_distribution<> normal_rand{0, 1};
for (_u64 i = 0; i < npts; i++) {
float sum = 0;
@ -23,7 +23,7 @@ int block_write_float(std::ofstream& writer, _u64 ndims, _u64 npts,
sum += vec[d] * vec[d];
for (_u64 d = 0; d < ndims; ++d)
vec[d] = vec[d] * norm / std::sqrt(sum);
writer.write((char*) vec, ndims * sizeof(float));
}
@ -31,8 +31,7 @@ int block_write_float(std::ofstream& writer, _u64 ndims, _u64 npts,
return 0;
}
int block_write_int8(std::ofstream& writer, _u64 ndims, _u64 npts,
float norm) {
int block_write_int8(std::ofstream& writer, _u64 ndims, _u64 npts, float norm) {
auto vec = new float[ndims];
auto vec_T = new int8_t[ndims];
@ -48,11 +47,11 @@ int block_write_int8(std::ofstream& writer, _u64 ndims, _u64 npts,
sum += vec[d] * vec[d];
for (_u64 d = 0; d < ndims; ++d)
vec[d] = vec[d] * norm / std::sqrt(sum);
for (_u64 d = 0; d < ndims; ++d) {
vec_T[d] = std::round<int>(vec[d]);
}
writer.write((char*) vec_T, ndims * sizeof(int8_t));
}
@ -62,7 +61,7 @@ int block_write_int8(std::ofstream& writer, _u64 ndims, _u64 npts,
}
int block_write_uint8(std::ofstream& writer, _u64 ndims, _u64 npts,
float norm) {
float norm) {
auto vec = new float[ndims];
auto vec_T = new int8_t[ndims];
@ -78,7 +77,7 @@ int block_write_uint8(std::ofstream& writer, _u64 ndims, _u64 npts,
sum += vec[d] * vec[d];
for (_u64 d = 0; d < ndims; ++d)
vec[d] = vec[d] * norm / std::sqrt(sum);
for (_u64 d = 0; d < ndims; ++d) {
vec_T[d] = 128 + std::round<int>(vec[d]);
}
@ -93,8 +92,7 @@ int block_write_uint8(std::ofstream& writer, _u64 ndims, _u64 npts,
int main(int argc, char** argv) {
if (argc != 6) {
std::cout << argv[0]
<< " <float/int8/uint8> ndims npts norm output.bin"
std::cout << argv[0] << " <float/int8/uint8> ndims npts norm output.bin"
<< std::endl;
exit(-1);
}
@ -115,13 +113,15 @@ int main(int argc, char** argv) {
return -1;
}
if ((std::string(argv[1]) == std::string("int8"))
|| (std::string(argv[1]) == std::string("uint8"))) {
if (norm > 127) {
std::cerr << "Error: for in8/uint8 datatypes, L2 norm can not be greater than 127" << std::endl;
return -1;
}
}
if ((std::string(argv[1]) == std::string("int8")) ||
(std::string(argv[1]) == std::string("uint8"))) {
if (norm > 127) {
std::cerr << "Error: for in8/uint8 datatypes, L2 norm can not be greater "
"than 127"
<< std::endl;
return -1;
}
}
std::ofstream writer(argv[5], std::ios::binary);
auto npts_s32 = (_u32) npts;
@ -141,13 +141,13 @@ int main(int argc, char** argv) {
} else if (std::string(argv[1]) == std::string("int8")) {
ret = block_write_int8(writer, ndims, cblk_size, norm);
} else if (std::string(argv[1]) == std::string("uint8")) {
ret = block_write_uint8(writer, ndims, cblk_size, norm);
ret = block_write_uint8(writer, ndims, cblk_size, norm);
}
if (ret == 0)
std::cout << "Block #" << i << " written" << std::endl;
else {
writer.close();
std::cout << "failed to write" <<std::endl;
std::cout << "failed to write" << std::endl;
return -1;
}
}

Просмотреть файл

@ -4,12 +4,12 @@
#include <iostream>
#include "utils.h"
void block_convert_float(std::ifstream& reader, std::ofstream& writer, _u64 npts,
_u64 ndims) {
void block_convert_float(std::ifstream& reader, std::ofstream& writer,
_u64 npts, _u64 ndims) {
auto read_buf = new float[npts * (ndims + 1)];
auto cursor = read_buf;
float val;
auto cursor = read_buf;
float val;
for (_u64 i = 0; i < npts; i++) {
for (_u64 d = 0; d < ndims; ++d) {
@ -23,16 +23,16 @@ void block_convert_float(std::ifstream& reader, std::ofstream& writer, _u64 npts
}
void block_convert_int8(std::ifstream& reader, std::ofstream& writer, _u64 npts,
_u64 ndims) {
_u64 ndims) {
auto read_buf = new int8_t[npts * (ndims + 1)];
auto cursor = read_buf;
int val;
int val;
for (_u64 i = 0; i < npts; i++) {
for (_u64 d = 0; d < ndims; ++d) {
reader >> val;
*cursor = (int8_t)val;
*cursor = (int8_t) val;
cursor++;
}
}
@ -40,12 +40,12 @@ void block_convert_int8(std::ifstream& reader, std::ofstream& writer, _u64 npts,
delete[] read_buf;
}
void block_convert_uint8(std::ifstream& reader, std::ofstream& writer, _u64 npts,
_u64 ndims) {
void block_convert_uint8(std::ifstream& reader, std::ofstream& writer,
_u64 npts, _u64 ndims) {
auto read_buf = new uint8_t[npts * (ndims + 1)];
auto cursor = read_buf;
int val;
int val;
for (_u64 i = 0; i < npts; i++) {
for (_u64 d = 0; d < ndims; ++d) {
@ -58,7 +58,6 @@ void block_convert_uint8(std::ifstream& reader, std::ofstream& writer, _u64 npts
delete[] read_buf;
}
int main(int argc, char** argv) {
if (argc != 6) {
std::cout << argv[0]

Просмотреть файл

@ -127,12 +127,12 @@ int aux_main(char** argv) {
int main(int argc, char** argv) {
if (argc < 4) {
std::cout
<< argv[0]
<< " data_type [float/int8/uint8] base_bin_file "
"[option: 1-norm analysis, 2-prep_base_for_mip, "
"3-prep_query_for_mip, 4-normalize-vecs] [out_file for options 2/3/4]"
<< std::endl;
std::cout << argv[0]
<< " data_type [float/int8/uint8] base_bin_file "
"[option: 1-norm analysis, 2-prep_base_for_mip, "
"3-prep_query_for_mip, 4-normalize-vecs] [out_file for "
"options 2/3/4]"
<< std::endl;
exit(-1);
}