File distances.h

namespace faiss

Implementation of k-means clustering with many variants.

Copyright (c) Facebook, Inc. and its affiliates.

This source code is licensed under the MIT license found in the LICENSE file in the root directory of this source tree.

IDSelector is intended to define a subset of vectors to handle (for removal or as subset to search)

PQ4 SIMD packing and accumulation functions

The basic kernel accumulates nq query vectors with bbs = nb * 2 * 16 vectors and produces an output matrix for that. It is interesting for nq * nb <= 4, otherwise register spilling becomes too large.

The implementation of these functions is spread over 3 cpp files to reduce parallel compile times. Templates are instantiated explicitly.

This file contains callbacks for kernels that compute distances.

Throughout the library, vectors are provided as float * pointers. Most algorithms can be optimized when several vectors are processed (added/searched) together in a batch. In this case, they are passed in as a matrix. When n vectors of size d are provided as float * x, component j of vector i is

x[ i * d + j ]

where 0 <= i < n and 0 <= j < d. In other words, matrices are always compact. When specifying the size of the matrix, we call it an n*d matrix, which implies a row-major storage.

I/O functions can read/write to a filename, a file handle or to an object that abstracts the medium.

The read functions return objects that should be deallocated with delete. All references within these objects are owned by the object.

Definition of inverted lists + a few common classes that implement the interface.

Since IVF (inverted file) indexes are of so much use for large-scale use cases, we group a few functions related to them in this small library. Most functions work both on IndexIVFs and IndexIVFs embedded within an IndexPreTransform.

In this file are the implementations of extra metrics beyond L2 and inner product

Implements a few neural net layers, mainly to support QINCo

Defines a few objects that apply transformations to a set of vectors. Often these are pre-processing steps.

Functions

float fvec_L2sqr(const float *x, const float *y, size_t d): Squared L2 distance between two vectors.

float fvec_inner_product(const float *x, const float *y, size_t d): inner product

float fvec_L1(const float *x, const float *y, size_t d): L1 distance.

float fvec_Linf(const float *x, const float *y, size_t d): infinity distance

void fvec_inner_product_batch_4(const float *x, const float *y0, const float *y1, const float *y2, const float *y3, const size_t d, float &dis0, float &dis1, float &dis2, float &dis3): Special version of inner product that computes 4 distances between x and yi, which is performance oriented.

void fvec_L2sqr_batch_4(const float *x, const float *y0, const float *y1, const float *y2, const float *y3, const size_t d, float &dis0, float &dis1, float &dis2, float &dis3): Special version of L2sqr that computes 4 distances between x and yi, which is performance oriented.

void pairwise_L2sqr(int64_t d, int64_t nq, const float *xq, int64_t nb, const float *xb, float *dis, int64_t ldq = -1, int64_t ldb = -1, int64_t ldd = -1)

Compute pairwise distances between sets of vectors

Parameters:

d – dimension of the vectors
nq – number of query vectors
nb – number of database vectors
xq – query vectors (size nq * d)
xb – database vectors (size nb * d)
dis – output distances (size nq * nb)
ldq, ldb, ldd – strides for the matrices

void fvec_inner_products_ny(float *ip, const float *x, const float *y, size_t d, size_t ny)

void fvec_L2sqr_ny(float *dis, const float *x, const float *y, size_t d, size_t ny)

void fvec_L2sqr_ny_transposed(float *dis, const float *x, const float *y, const float *y_sqlen, size_t d, size_t d_offset, size_t ny)

size_t fvec_L2sqr_ny_nearest(float *distances_tmp_buffer, const float *x, const float *y, size_t d, size_t ny)

size_t fvec_L2sqr_ny_nearest_y_transposed(float *distances_tmp_buffer, const float *x, const float *y, const float *y_sqlen, size_t d, size_t d_offset, size_t ny)

float fvec_norm_L2sqr(const float *x, size_t d): squared norm of a vector

void fvec_norms_L2(float *norms, const float *x, size_t d, size_t nx)

compute the L2 norms for a set of vectors

Parameters:

norms – output norms, size nx
x – set of vectors, size nx * d

void fvec_norms_L2sqr(float *norms, const float *x, size_t d, size_t nx): same as fvec_norms_L2, but computes squared norms

void fvec_renorm_L2(size_t d, size_t nx, float *x)

void inner_product_to_L2sqr(float *dis, const float *nr1, const float *nr2, size_t n1, size_t n2)

void fvec_add(size_t d, const float *a, const float *b, float *c)

compute c := a + b for vectors

c and a can overlap, c and b can overlap

Parameters:

a – size d
b – size d
c – size d

void fvec_add(size_t d, const float *a, float b, float *c)

compute c := a + b for a, c vectors and b a scalar

c and a can overlap

Parameters:

a – size d
c – size d

void fvec_sub(size_t d, const float *a, const float *b, float *c)

compute c := a - b for vectors

c and a can overlap, c and b can overlap

Parameters:

a – size d
b – size d
c – size d

void fvec_inner_products_by_idx(float *ip, const float *x, const float *y, const int64_t *ids, size_t d, size_t nx, size_t ny)

compute the inner product between x and a subset y of ny vectors defined by ids

ip(i, j) = inner_product(x(i, :), y(ids(i, j), :))

Parameters:

ip – output array, size nx * ny
x – first-term vector, size nx * d
y – second-term vector, size (max(ids) + 1) * d
ids – ids to sample from y, size nx * ny

void fvec_L2sqr_by_idx(float *dis, const float *x, const float *y, const int64_t *ids, size_t d, size_t nx, size_t ny)

compute the squared L2 distances between x and a subset y of ny vectors defined by ids

dis(i, j) = L2sqr(x(i, :), y(ids(i, j), :))

Parameters:

dis – output array, size nx * ny
x – first-term vector, size nx * d
y – second-term vector, size (max(ids) + 1) * d
ids – ids to sample from y, size nx * ny

void pairwise_indexed_L2sqr(size_t d, size_t n, const float *x, const int64_t *ix, const float *y, const int64_t *iy, float *dis)

compute dis[j] = L2sqr(x[ix[j]], y[iy[j]]) forall j=0..n-1

Parameters:

x – size (max(ix) + 1, d)
y – size (max(iy) + 1, d)
ix – size n
iy – size n
dis – size n

void pairwise_indexed_inner_product(size_t d, size_t n, const float *x, const int64_t *ix, const float *y, const int64_t *iy, float *dis)

compute dis[j] = inner_product(x[ix[j]], y[iy[j]]) forall j=0..n-1

Parameters:

x – size (max(ix) + 1, d)
y – size (max(iy) + 1, d)
ix – size n
iy – size n
dis – size n

void knn_inner_product(const float *x, const float *y, size_t d, size_t nx, size_t ny, float_minheap_array_t *res, const IDSelector *sel = nullptr)

Return the k nearest neighbors of each of the nx vectors x among the ny vectors y, w.r.t. max inner product.

Parameters:

x – query vectors, size nx * d
y – database vectors, size ny * d
res – result heap structure, which also provides k. Sorted on output

void knn_inner_product(const float *x, const float *y, size_t d, size_t nx, size_t ny, size_t k, float *distances, int64_t *indexes, const IDSelector *sel = nullptr)

Return the k nearest neighbors of each of the nx vectors x among the ny vectors y, for the inner product metric.

Parameters:

x – query vectors, size nx * d
y – database vectors, size ny * d
distances – output distances, size nx * k
indexes – output vector ids, size nx * k

void knn_L2sqr(const float *x, const float *y, size_t d, size_t nx, size_t ny, float_maxheap_array_t *res, const float *y_norm2 = nullptr, const IDSelector *sel = nullptr)

Return the k nearest neighbors of each of the nx vectors x among the ny vectors y, for the L2 distance

Parameters:

x – query vectors, size nx * d
y – database vectors, size ny * d
res – result heap structure, which also provides k. Sorted on output
y_norm2 – (optional) norms for the y vectors (nullptr or size ny)
sel – search in this subset of vectors

void knn_L2sqr(const float *x, const float *y, size_t d, size_t nx, size_t ny, size_t k, float *distances, int64_t *indexes, const float *y_norm2 = nullptr, const IDSelector *sel = nullptr)

Return the k nearest neighbors of each of the nx vectors x among the ny vectors y, for the L2 distance

Parameters:

x – query vectors, size nx * d
y – database vectors, size ny * d
distances – output distances, size nx * k
indexes – output vector ids, size nx * k
y_norm2 – (optional) norms for the y vectors (nullptr or size ny)
sel – search in this subset of vectors

void knn_inner_products_by_idx(const float *x, const float *y, const int64_t *subset, size_t d, size_t nx, size_t ny, size_t nsubset, size_t k, float *vals, int64_t *ids, int64_t ld_ids = -1)

Find the max inner product neighbors for nx queries in a set of ny vectors indexed by ids. May be useful for re-ranking a pre-selected vector list

Parameters:

x – query vectors, size nx * d
y – database vectors, size (max(subset) + 1) * d
subset – subset of database vectors to consider, size (nx, nsubset)
res – result structure
ld_ids – stride for the subset array. -1: use nsubset, 0: all queries process the same subset

void knn_L2sqr_by_idx(const float *x, const float *y, const int64_t *subset, size_t d, size_t nx, size_t ny, size_t nsubset, size_t k, float *vals, int64_t *ids, int64_t ld_subset = -1)

Find the nearest neighbors for nx queries in a set of ny vectors indexed by ids. May be useful for re-ranking a pre-selected vector list

Parameters:

x – query vectors, size nx * d
y – database vectors, size (max(subset) + 1) * d
subset – subset of database vectors to consider, size (nx, nsubset)
res – result structure
ld_subset – stride for the subset array. -1: use nsubset, 0: all queries process the same subset

void range_search_L2sqr(const float *x, const float *y, size_t d, size_t nx, size_t ny, float radius, RangeSearchResult *result, const IDSelector *sel = nullptr)

Return, for each of the nx vectors x, all the vectors among the ny vectors y whose squared L2 distance to it is within radius

Parameters:

x – query vectors, size nx * d
y – database vectors, size ny * d
radius – search radius around the x vectors
result – result structure

void range_search_inner_product(const float *x, const float *y, size_t d, size_t nx, size_t ny, float radius, RangeSearchResult *result, const IDSelector *sel = nullptr): same as range_search_L2sqr for the inner product similarity

void compute_PQ_dis_tables_dsub2(size_t d, size_t ksub, const float *centroids, size_t nx, const float *x, bool is_inner_product, float *dis_tables): specialized function for PQ2

void fvec_madd(size_t n, const float *a, float bf, const float *b, float *c)

compute c := a + bf * b for a, b and c tables

Parameters:

n – size of the tables
a – size n
b – size n
c – result table, size n

int fvec_madd_and_argmin(size_t n, const float *a, float bf, const float *b, float *c)

same as fvec_madd, also return index of the min of the result table

Returns: index of the min of table c

Variables

FAISS_API int distance_compute_blas_threshold

FAISS_API int distance_compute_blas_query_bs

FAISS_API int distance_compute_blas_database_bs