erikbern · Balwancia · Apr 21, 2025
diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml
@@ -32,6 +32,7 @@ jobs:
           - annoy
           - balltree
           - bruteforce
+          - cassandra
           - ckdtree
           - descartes
           - diskann

diff --git a/ann_benchmarks/algorithms/cassandra/Dockerfile b/ann_benchmarks/algorithms/cassandra/Dockerfile
@@ -0,0 +1,22 @@
+FROM cassandra:5.0.4
+
+RUN apt-get update && \
+    DEBIAN_FRONTEND=noninteractive \
+    apt-get install -y --no-install-recommends \
+        python3 python3-pip && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+
+WORKDIR /home/app
+COPY requirements.txt .
+
+RUN pip install --no-cache-dir -r requirements.txt \
+    && pip install --no-cache-dir cassandra-driver
+
+COPY run_algorithm.py .
+COPY ann_benchmarks/algorithms/cassandra/entrypoint.sh ann_benchmarks/algorithms/cassandra/usr/local/bin/entrypoint.sh
+RUN chmod +x ann_benchmarks/algorithms/cassandra/usr/local/bin/entrypoint.sh
+
+EXPOSE 9042
+
+ENTRYPOINT ["entrypoint.sh"]
diff --git a/ann_benchmarks/algorithms/cassandra/config.yml b/ann_benchmarks/algorithms/cassandra/config.yml
@@ -0,0 +1,25 @@
+float:
+  angular:
+  - base_args: ['@metric', '@dimension']
+    constructor: Cassandra
+    disabled: false
+    docker_tag: ann-benchmarks-cassandra
+    module: ann_benchmarks.algorithms.cassandra
+    name: cassandra
+    run_groups:
+      m-16-ef-100:
+        arg_groups: [{ef_construction: 100, m: 16}]
+        args: {}
+        query_args: [[10, 20, 40, 80, 160]]
+  euclidean:
+  - base_args: ['@metric', '@dimension']
+    constructor: Cassandra
+    disabled: false
+    docker_tag: ann-benchmarks-cassandra
+    module: ann_benchmarks.algorithms.cassandra
+    name: cassandra
+    run_groups:
+      m-16-ef-100:
+        arg_groups: [{ef_construction: 100, m: 16}]
+        args: {}
+        query_args: [[10, 20, 40, 80, 160]]
diff --git a/ann_benchmarks/algorithms/cassandra/entrypoint.sh b/ann_benchmarks/algorithms/cassandra/entrypoint.sh
@@ -0,0 +1,17 @@
+#!/usr/bin/env bash
+set -e
+trap 'kill -TERM $CASS_PID' EXIT
+
+docker-entrypoint.sh cassandra -R &
+CASS_PID=$!
+
+echo "⏳ Waiting for Cassandra to come up …"
+until cqlsh -u cassandra -p cassandra 127.0.0.1 9042 -e "DESCRIBE CLUSTER" >/dev/null 2>&1
+do
+  sleep 3
+done
+echo "✅ Cassandra is up!"
+
+python3 -u /home/app/run_algorithm.py "$@"
+
+wait $CASS_PID
diff --git a/ann_benchmarks/algorithms/cassandra/module.py b/ann_benchmarks/algorithms/cassandra/module.py
@@ -0,0 +1,84 @@
+import uuid
+from cassandra.cluster import Cluster
+from cassandra.query import BatchStatement, BatchType
+
+from ann_benchmarks.algorithms.base.module import BaseANN
+
+class Cassandra(BaseANN):
+    def __init__(self, metric, dimension, method_param):
+        self.metric = metric
+        self.dimension = dimension
+        self.method_param = method_param
+        self.keyspace = "ann_benchmarks"
+        self.param_string = "-".join(k + "-" + str(v) for k, v in self.method_param.items()).lower()
+        self.index_name = f"os-{self.param_string}"
+        self.table_name = f"vector_items-{self.param_string}"
+
+        self.cluster = Cluster(['localhost'])
+        self.conn = self.cluster.connect()
+        self._setup_keyspace()
+
+    def _setup_keyspace(self):
+        self.conn.execute(f"""
+        CREATE KEYSPACE IF NOT EXISTS {self.keyspace}
+        WITH replication = {{'class': 'SimpleStrategy', 'replication_factor': 1}};
+        """)
+        self.conn.set_keyspace(self.keyspace)
+
+    def _create_table(self):
+        self.conn.execute(f"""
+            CREATE TABLE IF NOT EXISTS {self.table_name} (
+                id BIGINT PRIMARY KEY,
+                embedding VECTOR<FLOAT, {self.dimension}>,
+            ) WITH compaction = {{ 'class': 'LeveledCompactionStrategy' }};
+        """)
+
+    def _create_index(self, ):
+        DISTANCE_MAPPING = {
+            "L2": "EUCLIDEAN",
+            "COSINE": "COSINE",
+            "DOT": "DOT_PRODUCT",
+        }
+
+        hnsw_distance_type = DISTANCE_MAPPING.get(self.metric, "EUCLIDEAN") 
+        self.conn.execute(f"""
+            CREATE INDEX IF NOT EXISTS {self.index_name}
+                ON {self.keyspace}.{self.table_name}(embedding) USING 'sai'
+                WITH OPTIONS = {{ 'similarity_function': '{hnsw_distance_type}' }};
+        """)
+
+    def fit(self, X, batch_size=100):
+        self.vector_dim = X.shape[1]
+        self._create_table()
+        self._create_index()
+
+        insert_query = f"INSERT INTO {self.table_name} (id, embedding) VALUES (?, ?)"
+        prepared = self.conn.prepare(insert_query)
+        batch = BatchStatement(batch_type=BatchType.UNLOGGED)
+
+        for i, vec in enumerate(X):
+            batch.add(prepared, (i, vec.tolist()))
+
+            if len(batch) >= batch_size:
+                self.conn.execute(batch)
+                batch.clear()
+        if batch:
+            self.conn.execute(batch)
+
+    def set_query_arguments(self, params):
+        self._ef_search = params
+
+    def query(self, v, n):
+        query = f"""
+        SELECT id FROM {self.table_name}
+        ORDER BY embedding ANN OF %s
+        LIMIT %s
+        """
+        results = self.conn.execute(query, (v.tolist(), n))
+        return [row.id for row in results]
+
+    def batch_query(self, X, n): 
+        self.batch_res = [self.query(q, n) for q in X]
+
+    def get_batch_results(self):
+        return self.batch_res
diff --git a/entrypoint.sh b/entrypoint.sh
@@ -0,0 +1,16 @@
+set -e
+trap 'kill -TERM $CASS_PID' EXIT
+
+docker-entrypoint.sh cassandra -R &
+CASS_PID=$!
+
+echo "⏳ Waiting for Cassandra to come up …"
+until cqlsh -u cassandra -p cassandra 127.0.0.1 9042 -e "DESCRIBE CLUSTER" >/dev/null 2>&1
+do
+  sleep 3
+done
+echo "✅ Cassandra is up!"
+
+python3 -u /home/app/run.py "$@"
+
+wait $CASS_PID