Skip to content

Commit

Permalink
Eliminate blockmap (#318)
Browse files Browse the repository at this point in the history
* wip reset

* Use lantern_slot_t for lantern usage

* Update WAL version and validate_index strategy to new neighbor length

* Fix test regressions caused by neighbor id length change

* Rename lantern_slot_t to ldb_lantern_slot_union_t

* Move to using an anonymous namespace in usearch_storage.cpp

* Pass an item_pointers array to populate rewrite info during index build

* Add API to interface with usearch node neighbors

* Implement node neighbor rewriting logic (rewriting disabled for now)

* Use 64bit ID for node retriever

* Update entry_slot after updating all graph neighbors. (last step for converting built index into new blockmap-less format)

* WIP: fix all retrievers to assume itemPointerData neighbors

* Fix remaining version handling info for scans

* Rename HnswIndexTuple id to seqid

* Pass slot externally to make inserts work

* Add slot type scalar kind in usearch

* remove blockmap test

* Delete some of blockmap handling

* Update regression tests assuming there is no blockmap in index structure

* Delete a lot of the blockmap code

* Remove continue_blockmap_initialization SQL interface

* Add update test to 0.3.0

* Delete more unused code

* Do not copy blockmap to scan retriever cache

* Disable validate_index

* Simplify node retrievers

* Add usearch performance patches

* Add perf installation instructions (tested on gcp)

* Update binary file tests to the new format

* In CMake files, add an option to avoid omitting the frame pointer and get rid of robin-map includes

* Update Dockerfile.dev with postgres GUCs

- set wal_level to logical
- set shared_buffers
- set listen_addresses to '*' so host can connect

* Remove blockmap-related and failure-points tests

* Add level distribution snippet

* Remove remnants of backward compatible upgrade support

* Add a note on handling of pq pages in deletes and WAL rewriting

* Remove extra lines from update script

* update usearch

* remove failure point test

* disable failure points by default

* update usearch (uint48 hasher to work for index creation from pg)

* Use unsigned long long in place of uint64 for pg<->c++ compat

* Fix a similar compat issue that arises with the last fix in gcc

* Add 0.3.0 release to CMake (do not create release yet)

* Disable all existing indexes so they can be safely reindexed

* fix indent in usearch

* fix update script for disabling indexes

* remove disable_lantern_indexes from relocation

* Switch to memcpy in all misaligned neighbor list manipulations

* Update cost estimates

* Remove level log stuff and make tests consistently pass on mac and linux

It seems mac and linux generate random numbers somewhat differently,
and for some values of M, hnsw level distributions end up affecting
overall index size. This commit addresses that.

* Remove unused members of the index header

* Fix unaligned memory write

* Add alignment of 2 for slot type

* simplify matters and get rid of unaligned stack variable

* Remove unused argument to wal_area_reset

* Use uint32 cast for entry slot

* debug sanitizer

* do palloc zero to see why reading zero'th element might cause a panic

* debug

* Fix bug found by address sanitizer

* Suppress unaligned warning and make slot type explicitly unaligned

* suppress unknown pragma warnings and delete duplicate flag

* Error out when inserting into a wal-incompatible index

* suppress compiler warnings in clang

* fix pragma formatting on mac
  • Loading branch information
Ngalstyan4 authored May 15, 2024
1 parent cd7f187 commit 6309aab
Show file tree
Hide file tree
Showing 43 changed files with 463 additions and 1,372 deletions.
11 changes: 7 additions & 4 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
cmake_minimum_required(VERSION 3.3)

set(LANTERN_VERSION 0.2.7)
set(LANTERN_VERSION 0.3.0)

project(
LanternDB
Expand Down Expand Up @@ -28,7 +28,7 @@ option(POSTGRES_UBSAN "Turn this on when building with a postgres that has UBSAN
option(DEV "Developer mode: provide code formatting, get postgres source, etc." OFF)
option(CODECOVERAGE "Enable code coverage for the build" OFF)
option(BENCH "Enable benchmarking" OFF)
option(FAILURE_POINTS "Enable failure points" ON)
option(FAILURE_POINTS "Enable failure points" OFF)
option(BUILD_C_TESTS "Build C client tests" OFF)

if (${BUILD_FOR_DISTRIBUTING})
Expand Down Expand Up @@ -137,18 +137,18 @@ target_include_directories(
PRIVATE "./third_party/usearch/c"
PRIVATE "./third_party/usearch/include/"
PRIVATE "./third_party/usearch/fp16/include/"
PRIVATE "./third_party/usearch/robin-map/include/"
)

target_compile_options(lantern PRIVATE
-g
-fstack-protector-strong
-Wformat
-Werror=format-security
-Wno-unknown-pragmas
-ftree-vectorize
-fassociative-math
-fno-signed-zeros
-fno-trapping-math
# -fno-omit-frame-pointer
-fPIC
)
if(${MARCH_NATIVE})
Expand Down Expand Up @@ -225,6 +225,8 @@ endif()
if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
target_link_options(lantern PRIVATE -static-libstdc++)
elseif (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
# it seems -Wno-unknown-pragmas is not enough for clang
target_compile_options(lantern PRIVATE "-Wno-unknown-warning-option")
target_link_options(lantern PRIVATE -static)
endif ()
# not sure why, but the above is not enough to force pulling cpp runtime into lantern.so
Expand Down Expand Up @@ -276,6 +278,7 @@ set (_update_files
sql/updates/0.2.4--0.2.5.sql
sql/updates/0.2.5--0.2.6.sql
sql/updates/0.2.6--0.2.7.sql
sql/updates/0.2.7--0.3.0.sql
)

# Generate version information for the binary
Expand Down
13 changes: 11 additions & 2 deletions docker/Dockerfile.dev
Original file line number Diff line number Diff line change
Expand Up @@ -64,14 +64,23 @@ RUN git clone https://github.com/lanterndata/benchmark build/benchmark \
&& pip install -r core/requirements.txt \
&& pip install -r external/requirements.txt

# Install perf
RUN sudo apt update && sudo apt install -y linux-tools-common linux-tools-generic linux-tools-`uname -r`
# On the host, lower the perf_event_paranoid restriction via:
# echo -1 | sudo tee /proc/sys/kernel/perf_event_paranoid

ENV DATABASE_URL=postgres://postgres@localhost:5432/postgres
ENV LANTERN_DATABASE_URL=postgres://postgres@localhost:5432/postgres

# Uncomment the following to change the data directory of the running postgres
# RUN /usr/local/pgsql/bin/initdb -D /var/lib/postgresql/data
# RUN echo "listen_addresses '*' " >> /var/lib/postgresql/data/postgresql.conf
RUN /usr/local/pgsql/bin/initdb -D /var/lib/postgresql/data
RUN echo "shared_preload_libraries = 'pg_cron,pg_stat_statements' " >> /var/lib/postgresql/data/postgresql.conf
# RUN echo "host all all 0.0.0.0/0 trust" >> /var/lib/postgresql/data/pg_hba.conf
RUN echo "shared_preload_libraries = 'pg_cron,pg_stat_statements' " >> /var/lib/postgresql/data/postgresql.conf && \
echo "wal_level = 'logical' " >> /var/lib/postgresql/data/postgresql.conf && \
echo "listen_addresses = '*' " >> /var/lib/postgresql/data/postgresql.conf && \
echo "shared_buffers = '4GB' " >> /var/lib/postgresql/data/postgresql.conf

RUN echo "host all all 0.0.0.0/0 trust" >> /var/lib/postgresql/data/pg_hba.conf
# CMD ["/usr/local/pgsql/bin/postgres", "-D", "/var/lib/postgresql/data"]
CMD ["/usr/local/pgsql/bin/postgres", "-D", "/var/lib/postgresql/data"]
7 changes: 6 additions & 1 deletion scripts/run_all_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -65,9 +65,14 @@ then
curl -sSo index-sift1k-l2-0.0.0.usearch https://storage.googleapis.com/lanterndata/lanterndb_binary_indexes/index-sift1k-l2-v2.usearch
curl -sSo index-sift1k-cos.usearch https://storage.googleapis.com/lanterndata/lanterndb_binary_indexes/index-sift1k-cos-v3.usearch
curl -sSo index-sift1k-l2.usearch https://storage.googleapis.com/lanterndata/lanterndb_binary_indexes/index-sift1k-l2-v3.usearch
# Actual index files

# Outdated index files
curl -sSo index-sift1k-cos-0.0.13.usearch https://storage.googleapis.com/lanterndata/lanterndb_binary_indexes/index-sift1k-cos-0.0.13.usearch
curl -sSo index-sift1k-l2sq-0.0.13.usearch https://storage.googleapis.com/lanterndata/lanterndb_binary_indexes/index-sift1k-l2sq-0.0.13.usearch

# Actual index files
curl -sSo index-sift1k-cos-0.3.0.usearch https://storage.googleapis.com/lanterndata/lanterndb_binary_indexes/index-sift1k-cos-0.3.0.usearch
curl -sSo index-sift1k-l2sq-0.3.0.usearch https://storage.googleapis.com/lanterndata/lanterndb_binary_indexes/index-sift1k-l2sq-0.3.0.usearch
# Corrupted index file for test
tail -c +100 index-sift1k-l2.usearch > index-sift1k-l2-corrupted.usearch
popd
Expand Down
3 changes: 0 additions & 3 deletions sql/lantern.sql
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,6 @@ CREATE FUNCTION _lantern_internal.validate_index(index regclass, print_info bool
CREATE FUNCTION _lantern_internal.failure_point_enable(func TEXT, name TEXT, dont_trigger_first_nr INTEGER DEFAULT 0) RETURNS VOID
AS 'MODULE_PATHNAME', 'lantern_internal_failure_point_enable' LANGUAGE C STABLE STRICT PARALLEL UNSAFE;

CREATE FUNCTION _lantern_internal.continue_blockmap_group_initialization(index regclass) RETURNS VOID
AS 'MODULE_PATHNAME', 'lantern_internal_continue_blockmap_group_initialization' LANGUAGE C STABLE STRICT PARALLEL UNSAFE;

CREATE FUNCTION _lantern_internal.create_pq_codebook(REGCLASS, NAME, INT, INT, TEXT, INT) RETURNS REAL[][][]
AS 'MODULE_PATHNAME', 'create_pq_codebook' LANGUAGE C STABLE STRICT PARALLEL UNSAFE;
-- operator classes
Expand Down
15 changes: 15 additions & 0 deletions sql/updates/0.2.7--0.3.0.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
-- Upgrade script 0.2.7 -> 0.3.0.
--
-- 0.3.0 removes the blockmap from the index structure, so indexes built by
-- earlier versions must be rebuilt. Mark every existing lantern_hnsw index
-- as invalid (indisvalid = false: not used for queries) and not ready
-- (indisready = false: ignored by INSERT/UPDATE) so it can be safely
-- reindexed.
--
-- Only pg_class (for the index relation) and pg_am are needed to identify
-- lantern_hnsw indexes; the owning table and namespace do not affect which
-- rows are updated.
UPDATE pg_index ix
SET indisvalid = false, indisready = false
FROM pg_class i
JOIN pg_am a ON a.oid = i.relam
WHERE i.oid = ix.indexrelid
  AND a.amname = 'lantern_hnsw';

-- The blockmap group initialization SQL interface was removed in 0.3.0.
DROP FUNCTION IF EXISTS _lantern_internal.continue_blockmap_group_initialization;
7 changes: 3 additions & 4 deletions src/hnsw.c
Original file line number Diff line number Diff line change
Expand Up @@ -432,13 +432,12 @@ Datum lantern_internal_failure_point_enable(PG_FUNCTION_ARGS)
PG_RETURN_VOID();
}

// todo:: remove in 0.3.1
PGDLLEXPORT PG_FUNCTION_INFO_V1(lantern_internal_continue_blockmap_group_initialization);
Datum lantern_internal_continue_blockmap_group_initialization(PG_FUNCTION_ARGS)
{
Oid indrelid = PG_GETARG_OID(0);

ldb_continue_blockmap_group_initialization(indrelid);
PG_RETURN_VOID();
LDB_UNUSED(fcinfo);
PG_RETURN_NULL();
}

PGDLLEXPORT PG_FUNCTION_INFO_V1(lantern_reindex_external_index);
Expand Down
24 changes: 24 additions & 0 deletions src/hnsw.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@

#include <postgres.h>

#include <assert.h>
#include <fmgr.h>
#include <storage/itemptr.h>
#include <utils/relcache.h>

#define LANTERN_INTERNAL_SCHEMA_NAME "_lantern_internal"
Expand All @@ -27,6 +29,28 @@ typedef enum
UNKNOWN
} HnswColumnType;

// Compilers warn about potential UB when members of this packed union are accessed
// directly. The union is always accessed via memcpy, so the warning does not apply
// and -Wpacked-not-aligned is suppressed for this declaration only (push/pop below).
// Both the GCC and the clang spelling of the pragma are used.
// NOTE(review): a compiler that does not recognize one of these pragmas or the
// warning name may emit its own diagnostic; the build flags are presumably expected
// to silence that - confirm against CMakeLists.txt.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wpacked-not-aligned"

#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wpacked-not-aligned"

// C version of uint48_t in c++/usearch.
// A packed union that holds either an ItemPointerData or a 32-bit sequence id
// in the same storage. Because of the packed attribute the type carries no
// alignment guarantee, hence "unaligned" in its name.
typedef union __attribute__((__packed__))
{
// slot viewed as an ItemPointerData
ItemPointerData itemPointerData;
// slot viewed as a 32-bit sequence id
uint32 seqid;
} ldb_unaligned_slot_union_t;

#pragma clang diagnostic pop

#pragma GCC diagnostic pop

// Compile-time check that the union can hold a full ItemPointerData.
// NOTE(review): this only enforces a lower bound; if the union must match the
// size of usearch's uint48_t exactly, an equality check may be intended - confirm.
static_assert(sizeof(ldb_unaligned_slot_union_t) >= sizeof(ItemPointerData),
"ldb_unaligned_slot_union_t must be large enough for ItemPointerData");

/* Exported functions */
PGDLLEXPORT void _PG_init(void);
PGDLLEXPORT void _PG_fini(void);
Expand Down
3 changes: 1 addition & 2 deletions src/hnsw/build.c
Original file line number Diff line number Diff line change
Expand Up @@ -553,10 +553,9 @@ static void BuildIndex(Relation heap, Relation index, IndexInfo *indexInfo, ldb_
assert(result_buf != MAP_FAILED);
//****************************** mmap index to memory END ******************************//

//****************************** saving to WAL BEGIN ******************************//
// save the index to WAL
UpdateProgress(PROGRESS_CREATEIDX_PHASE, LDB_PROGRESS_HNSW_PHASE_LOAD);
StoreExternalIndex(index, &metadata, MAIN_FORKNUM, result_buf, &opts, num_added_vectors);
//****************************** saving to WAL END ******************************//

munmap_ret = munmap(result_buf, index_file_stat.st_size);
assert(munmap_ret == 0);
Expand Down
26 changes: 11 additions & 15 deletions src/hnsw/delete.c
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ IndexBulkDeleteResult *ldb_ambulkdelete(IndexVacuumInfo *info,
elog(WARNING,
"LanternDB: hnsw index deletes are currently not implemented. This is a no-op. No memory will be reclaimed");
// traverse through the index and call the callback for all elements
BlockNumber blockno;
Buffer buf;
HnswIndexHeaderPage header;
Page page;
Expand All @@ -46,21 +45,18 @@ IndexBulkDeleteResult *ldb_ambulkdelete(IndexVacuumInfo *info,
gxlogState = GenericXLogStart(info->index);
page = GenericXLogRegisterBuffer(gxlogState, buf, LDB_GENERIC_XLOG_DELTA_IMAGE);
maxoffset = PageGetMaxOffsetNumber(page);
// When the index is a pq-index, there will be pq header pages that are currently empty,
// so the loop below will skip those. In the future, when those pages are filled up,
// we need to add a branch here and skip those pages explicitly.

if(isBlockMapBlock(header.blockmap_groups, header.blockmap_groups_nr, blockno)) {
ldb_invariant(1 == maxoffset, "expected blockmap page with single item");
HnswBlockmapPage *blockmap_page
= (HnswBlockmapPage *)PageGetItem(page, PageGetItemId(page, FirstOffsetNumber));
} else {
for(offset = FirstOffsetNumber; offset <= maxoffset; offset = OffsetNumberNext(offset)) {
HnswIndexTuple *nodepage = (HnswIndexTuple *)PageGetItem(page, PageGetItemId(page, offset));
unsigned long label = label_from_node(nodepage->node);
label2ItemPointer(label, &tid_data);
if(callback(&tid_data, callback_state)) {
block_modified = true;
reset_node_label(nodepage->node);
stats->tuples_removed += 1;
}
for(offset = FirstOffsetNumber; offset <= maxoffset; offset = OffsetNumberNext(offset)) {
HnswIndexTuple *nodepage = (HnswIndexTuple *)PageGetItem(page, PageGetItemId(page, offset));
unsigned long label = label_from_node(nodepage->node);
label2ItemPointer(label, &tid_data);
if(callback(&tid_data, callback_state)) {
block_modified = true;
reset_node_label(nodepage->node);
stats->tuples_removed += 1;
}
}

Expand Down
Loading

0 comments on commit 6309aab

Please sign in to comment.