Skip to content

Commit

Permalink
Release 0.2.3
Browse files Browse the repository at this point in the history
  • Loading branch information
Ngalstyan4 committed Apr 9, 2024
1 parent 12a0136 commit 9b4889b
Show file tree
Hide file tree
Showing 3 changed files with 297 additions and 8 deletions.
13 changes: 7 additions & 6 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
cmake_minimum_required(VERSION 3.3)

set(LANTERN_VERSION 0.2.2)
set(LANTERN_VERSION 0.2.3)

project(
LanternDB
Expand Down Expand Up @@ -110,7 +110,7 @@ if (BUILD_C_TESTS)
target_link_directories(lantern_c_tests PRIVATE ${PostgreSQL_LIBRARY_DIRS})
# Link libpq
target_link_libraries(lantern_c_tests "-lpq")

add_custom_target(
test-client
COMMAND ${CMAKE_SOURCE_DIR}/scripts/run_all_tests.sh --client
Expand Down Expand Up @@ -251,9 +251,9 @@ set(_script_file "lantern--${RELEASE_ID}.sql")
# ============== Use clang compiler to emit llvm bytecode =================
find_program(LLVM_LTO NAMES llvm-lto)
if(
NOT LLVM_LTO STREQUAL "LLVM_LTO-NOTFOUND"
AND PostgreSQL_WITH_LLVM
AND CMAKE_C_COMPILER_ID MATCHES "Clang"
NOT LLVM_LTO STREQUAL "LLVM_LTO-NOTFOUND"
AND PostgreSQL_WITH_LLVM
AND CMAKE_C_COMPILER_ID MATCHES "Clang"
AND CMAKE_CXX_COMPILER_ID MATCHES "Clang"
)
target_link_options(lantern PRIVATE -flto)
Expand All @@ -270,6 +270,7 @@ endif()
set (_update_files
sql/updates/0.2.0--0.2.1.sql
sql/updates/0.2.1--0.2.2.sql
sql/updates/0.2.2--0.2.3.sql
)

# Generate version information for the binary
Expand Down Expand Up @@ -400,7 +401,7 @@ if (CLANG_FORMAT)
# Add format check target
add_custom_target(
format_check
COMMAND ${CLANG_FORMAT} --dry-run -Werror ${CLANG_FORMATTABLE_FILES}
COMMAND ${CLANG_FORMAT} --dry-run -Werror ${CLANG_FORMATTABLE_FILES}
COMMENT "Checking code formatting with clang-format"
VERBATIM
)
Expand Down
2 changes: 1 addition & 1 deletion sql/lantern.sql
Original file line number Diff line number Diff line change
Expand Up @@ -475,6 +475,7 @@ BEGIN
END;
$$ LANGUAGE plpgsql;

-- Asynchronous task scheduling BEGIN
CREATE OR REPLACE FUNCTION _lantern_internal.maybe_setup_lantern_tasks() RETURNS VOID AS
$async_tasks_related$
BEGIN
Expand All @@ -484,7 +485,6 @@ BEGIN
RETURN;
END IF;

-- Asynchronous task scheduling BEGIN
CREATE TABLE lantern.tasks (
jobid bigserial primary key,
query text not null,
Expand Down
290 changes: 289 additions & 1 deletion sql/updates/0.2.2--0.2.3.sql
Original file line number Diff line number Diff line change
Expand Up @@ -115,4 +115,292 @@ $async_tasks_related$ LANGUAGE plpgsql;
SELECT _lantern_internal.maybe_setup_lantern_tasks();
DROP FUNCTION _lantern_internal.maybe_setup_lantern_tasks();

-- Asynchronous task scheduling BEGIN
-- ^^^^
-- Asynchronous task scheduling END

-- Weighted vector search

CREATE OR REPLACE FUNCTION _lantern_internal.mask_arrays(arr text)
RETURNS text AS $$
BEGIN
-- match:
-- single quote (escaped by doubling it)
-- opening square bracket (escaped with a backslash)
-- any character (as few as possible, via *?)
-- closing square bracket (escaped with a backslash)
-- single quote (escaped by doubling it)
-- the string ::vector literally
arr := regexp_replace(arr, '''\[.*?\]''::vector', '''[MASKED_VECTOR]''::vector','g');
-- same as above, but for non-explain context where the explicit cast is missing
arr := regexp_replace(arr, '''\[.*?\]''', '''[MASKED_VECTOR]''','g');

RETURN arr;
END
$$ LANGUAGE plpgsql;


-- Helper function that takes in the output of EXPLAIN (FORMAT JSON) and masks long vectors in ORDER BY clauses
CREATE OR REPLACE FUNCTION _lantern_internal.mask_order_by_in_plan(json_data jsonb)
RETURNS jsonb AS $$
DECLARE
key TEXT;
value JSONB;
BEGIN
-- Check if the input is null
IF json_data IS NULL THEN
RETURN NULL;
END IF;

-- Check if the input is a JSON object
IF jsonb_typeof(json_data) = 'object' THEN
-- Loop through each key-value pair in the JSON object
FOR key, value IN SELECT * FROM jsonb_each(json_data) LOOP
-- If the key is "Order By", set the value to null
IF key = 'Order By' OR key = 'Filter' OR key = 'Sort Key' THEN
value = _lantern_internal.mask_arrays(value::text);
json_data = jsonb_set(json_data, ARRAY[key], value);
ELSE
-- Recursively call the function for nested JSON objects or arrays
json_data = jsonb_set(json_data, ARRAY[key], _lantern_internal.mask_order_by_in_plan(value));
END IF;
END LOOP;
-- Check if the input is a JSON array
ELSIF jsonb_typeof(json_data) = 'array' THEN
-- Loop through each element in the JSON array
FOR idx IN 0 .. jsonb_array_length(json_data) - 1 LOOP
-- Recursively call the function for elements of the array
json_data = jsonb_set(json_data, ARRAY[idx::text], _lantern_internal.mask_order_by_in_plan(json_data->idx));
END LOOP;
END IF;

RETURN json_data;
END;
$$ LANGUAGE plpgsql;

CREATE OR REPLACE FUNCTION _lantern_internal.maybe_setup_weighted_vector_search() RETURNS VOID AS
$weighted_vector_search$
DECLARE
pgvector_exists boolean;
BEGIN
-- Check if the vector type from pgvector exists
SELECT EXISTS (
SELECT 1
FROM pg_type
WHERE typname = 'vector'
) INTO pgvector_exists;

IF NOT pgvector_exists THEN
RAISE NOTICE 'pgvector extension not found. Skipping lantern weighted vector search setup';
RETURN;
END IF;

CREATE OR REPLACE FUNCTION lantern.weighted_vector_search(
relation_type anyelement,
w1 numeric,
col1 text,
vec1 vector,
w2 numeric= 0,
col2 text = NULL,
vec2 vector = NULL,
w3 numeric = 0,
col3 text = NULL,
vec3 vector = NULL,
ef integer = 100,
max_dist numeric = NULL,
-- set l2 (pgvector) and l2sq (lantern) as default, as we do for lantern index.
distance_operator text = '<->',
id_col text = 'id',
exact boolean = false,
debug_output boolean = false,
analyze_output boolean = false
)
-- N.B. Something seems strange about PL/pgSQL functions that return table with anyelement
-- when there is single "anylement column" being returned (e.g. returns table ("row" anylement))
-- then that single "column" is properly spread with source table's column names
-- but, when returning ("row" anyelement, "anothercol" integer), things fall all oaver the place
-- now, the returned table always has 2 columns one row that is a record of sorts, and one "anothercol"
RETURNS TABLE ("row" anyelement) AS
$$
DECLARE
joint_condition text;
query_base text;
query_final_where text = '';
query1 text;
query2 text;
query3 text;
-- variables for weighted columns
wc1 text = NULL;
wc2 text = NULL;
wc3 text = NULL;
cte_query text;
maybe_unions_query text;
final_query text;
explain_query text;
explain_output jsonb;
old_hnsw_ef_search numeric;
debug_count integer;
maybe_analyze text = '';
BEGIN
-- TODO:: better validate inputs to throw nicer errors in case of wrong input:
-- 1. only allow valid distance_operator stirngs (<->, <=>, but not abracadabra)
-- 2. only allow valid column names
-- 3. throw an error on negative weights
-- 4. check that id_col column exists before proceeding

IF analyze_output THEN
maybe_analyze := 'ANALYZE,';
END IF;
-- Joint similarity metric condition
wc1 := format('(%s * (%I %s %L))', w1, col1, distance_operator, vec1);
IF w2 > 0 AND col2 IS NOT NULL AND vec2 IS NOT NULL THEN
wc2 := format(' (%s * (%I %s %L))', w2, col2, distance_operator, vec2);
END IF;
IF w3 > 0 AND col3 IS NOT NULL AND vec3 IS NOT NULL THEN
wc3 := format(' (%s * (%I %s %L))', w3, col3, distance_operator, vec3);
END IF;

joint_condition := wc1 || COALESCE('+' || wc2, '') || COALESCE('+' || wc3, '');

-- Base query with joint similarity metric
query_base := format('SELECT * FROM %s ', pg_typeof(relation_type));
IF max_dist IS NOT NULL THEN
query_final_where := format(' WHERE %s < %L', joint_condition, max_dist);
END IF;

IF exact THEN
final_query := query_base || query_final_where || format(' ORDER BY %s', joint_condition);
IF debug_output THEN
explain_query := format('EXPLAIN (%s COSTS FALSE, FORMAT JSON) %s', maybe_analyze, final_query);
EXECUTE explain_query INTO explain_output;

RAISE WARNING 'Query: %', _lantern_internal.mask_arrays(final_query);

explain_output := _lantern_internal.mask_order_by_in_plan(explain_output);
RAISE WARNING 'weighted vector search explain(exact=true): %', jsonb_pretty(explain_output);
END IF;
RETURN QUERY EXECUTE final_query;
-- the empty return below is crucial, to make sure the rest of the function is not executed after the return query above
RETURN;
END IF;

EXECUTE format('SET LOCAL hnsw.ef_search TO %L', ef);
-- UNION ALL.. part of the final query that aggregates results from individual vector search queries
maybe_unions_query := '';

-- Query 1: Order by first condition's weighted similarity
query1 := format('%s ORDER BY %I %s %L LIMIT %L', query_base || query_final_where, col1, distance_operator, vec1, ef);

IF debug_output THEN
EXECUTE format('SELECT count(*) FROM (%s) t', query1) INTO debug_count;
RAISE WARNING 'col1 yielded % rows', debug_count;
END IF;

cte_query = format('WITH query1 AS (%s) ', query1);

-- Query 2: Order by other conditions' weighted similarity, if applicable
IF w2 > 0 AND col2 IS NOT NULL AND vec2 IS NOT NULL THEN
query2 := format('%s ORDER BY %I %s %L LIMIT %L', query_base || query_final_where, col2, distance_operator, vec2, ef);
cte_query := cte_query || format(', query2 AS (%s)', query2);
maybe_unions_query := maybe_unions_query || format(' UNION ALL (SELECT * FROM query2) ');
IF debug_output THEN
EXECUTE format('SELECT count(*) FROM (%s) t', query2) INTO debug_count;
RAISE WARNING 'col2 yielded % rows', debug_count;
END IF;
END IF;

IF w3 > 0 AND col3 IS NOT NULL AND vec3 IS NOT NULL THEN
query3 := format('%s ORDER BY %I %s %L LIMIT %L', query_base || query_final_where, col3, distance_operator, vec3, ef);
cte_query := cte_query || format(', query3 AS (%s)', query3);
maybe_unions_query := maybe_unions_query || format(' UNION ALL (SELECT * FROM query3) ');
IF debug_output THEN
EXECUTE format('SELECT count(*) FROM (%s) t', query3) INTO debug_count;
RAISE WARNING 'col3 yielded % rows', debug_count;
END IF;
END IF;

final_query := cte_query || format($final_cte_query$SELECT * FROM (
SELECT DISTINCT ON (%I) * FROM (
(SELECT * FROM query1)
%s
) t
)
tt %s ORDER BY %s$final_cte_query$,
id_col, maybe_unions_query, query_final_where, joint_condition);

IF debug_output THEN
explain_query := format('EXPLAIN (%s COSTS FALSE, FORMAT JSON) %s', maybe_analyze, final_query);
EXECUTE explain_query INTO explain_output;

RAISE WARNING 'Query: %', _lantern_internal.mask_arrays(final_query);

explain_output := _lantern_internal.mask_order_by_in_plan(explain_output);
RAISE WARNING ' weighted vector search explain: %', jsonb_pretty(explain_output);
END IF;
RETURN QUERY EXECUTE final_query;
END
$$ LANGUAGE plpgsql;

-- setup API shortcuts
CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_cos(
relation_type anyelement,
w1 numeric,
col1 text,
vec1 vector,
w2 numeric= 0,
col2 text = NULL,
vec2 vector = NULL,
w3 numeric = 0,
col3 text = NULL,
vec3 vector = NULL,
ef integer = 100,
max_dist numeric = NULL,
id_col text = 'id',
exact boolean = false,
debug_output boolean = false
)
-- N.B. Something seems strange about PL/pgSQL functions that return table with anyelement
-- when there is single "anylement column" being returned (e.g. returns table ("row" anylement))
-- then that single "column" is properly spread with source table's column names
-- but, when returning ("row" anyelement, "anothercol" integer), things fall all oaver the place
-- now, the returned table always has 2 columns one row that is a record of sorts, and one "anothercol"
RETURNS TABLE ("row" anyelement) AS $$

BEGIN
RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<=>', id_col, exact, debug_output);
END $$ LANGUAGE plpgsql;

CREATE OR REPLACE FUNCTION lantern.weighted_vector_search_l2sq(
relation_type anyelement,
w1 numeric,
col1 text,
vec1 vector,
w2 numeric= 0,
col2 text = NULL,
vec2 vector = NULL,
w3 numeric = 0,
col3 text = NULL,
vec3 vector = NULL,
ef integer = 100,
max_dist numeric = NULL,
id_col text = 'id',
exact boolean = false,
debug_output boolean = false
)
-- N.B. Something seems strange about PL/pgSQL functions that return table with anyelement
-- when there is single "anylement column" being returned (e.g. returns table ("row" anylement))
-- then that single "column" is properly spread with source table's column names
-- but, when returning ("row" anyelement, "anothercol" integer), things fall all oaver the place
-- now, the returned table always has 2 columns one row that is a record of sorts, and one "anothercol"
RETURNS TABLE ("row" anyelement) AS $$

BEGIN
RETURN QUERY SELECT * FROM lantern.weighted_vector_search(relation_type, w1, col1, vec1, w2, col2, vec2, w3, col3, vec3, ef, max_dist, '<->', id_col, exact, debug_output);
END $$ LANGUAGE plpgsql;


END
$weighted_vector_search$ LANGUAGE plpgsql;

SELECT _lantern_internal.maybe_setup_weighted_vector_search();
DROP FUNCTION _lantern_internal.maybe_setup_weighted_vector_search;

0 comments on commit 9b4889b

Please sign in to comment.