Skip to content

[DB-40533] Add support for new type VECTOR(<dim>,DOUBLE) #182

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions pynuodb/datatype.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@
from datetime import datetime as Timestamp, date as Date, time as Time
from datetime import timedelta as TimeDelta

from pynuodb import protocol

try:
from typing import Tuple, Union # pylint: disable=unused-import
except ImportError:
Expand Down Expand Up @@ -149,10 +151,37 @@ def __cmp__(self, other):
return -1


class Vector(list):
"""A specific type for SQL VECTOR(<dim>, DOUBLE)
Copy link
Contributor

@madscientist madscientist Jun 27, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this is not quite right, is it? A Vector element doesn't have to be a DOUBLE (I mean, it does now but it might not in the future)?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right. It is supposed to be usable with other element types as well. Will fix that.

to be able to detect the desired type when binding parameters.
Apart from creating the value as a Vector with subtype
this can be used as a list."""
DOUBLE = protocol.VECTOR_DOUBLE

def __init__(self, subtype, *args, **kwargs):
if args:
if subtype != Vector.DOUBLE:
raise TypeError("Vector type only supported for subtype DOUBLE")

self.subtype = subtype

# forward the remaining arguments to the list __init__
super(Vector, self).__init__(*args, **kwargs)
else:
raise TypeError("Vector needs to be initialized with a subtype like Vector.DOUBLE as"
" first argument")

def getSubtype(self):
# type: () -> int
"""Returns the subtype of vector this instance holds data for"""
return self.subtype


STRING = TypeObject(str)
BINARY = TypeObject(str)
NUMBER = TypeObject(int, decimal.Decimal)
DATETIME = TypeObject(Timestamp, Date, Time)
VECTOR_DOUBLE = TypeObject(list)
ROWID = TypeObject()
NULL = TypeObject(None)

Expand All @@ -179,6 +208,7 @@ def __cmp__(self, other):
"timestamp without time zone": DATETIME,
"timestamp with time zone": DATETIME,
"time without time zone": DATETIME,
"vector double": VECTOR_DOUBLE,
# Old types used by NuoDB <2.0.3
"binarystring": BINARY,
"binaryvaryingstring": BINARY,
Expand Down
71 changes: 71 additions & 0 deletions pynuodb/encodedsession.py
Original file line number Diff line number Diff line change
Expand Up @@ -775,6 +775,39 @@ def putScaledCount2(self, value):
self.__output += data
return self

def putVectorDouble(self, value):
# type: (datatype.Vector) -> EncodedSession
"""Append a Vector with subtype Vector.DOUBLE to the message.

:type value: datatype.Vector
"""
self.__output.append(protocol.VECTOR)
# subtype
self.__output.append(protocol.VECTOR_DOUBLE)
# length in bytes in count notation, i.e. first
# number of bytes needed for the length, then the
# encoded length
lengthStr = crypt.toByteString(len(value) * 8)
self.__output.append(len(lengthStr))
self.__output += lengthStr

# the actual vector: Each value as double in little endian encoding
for val in value:
self.__output += struct.pack('<d', float(val))

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does this work if the value is NULL?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A NULL should be set via putNull, not putVectorDouble.

But assuming someone would put some kind of NULL-value in here: What NULL representation would you expect there? I'll try to outline a few:

  • If value is None this path will not be taken (we only reach here through putValue below which calls putVector nly for instances of Vector
  • If value is an empty list this would encode an empty list. I did not include that in the tests, but I am pretty sure that would trigger a server side error with a VECTOR(0) not being supported or the like. Could add a test and ensure it works correctly.


return self

def putVector(self, value):
# type: (datatype.Vector) -> EncodedSession
"""Append a Vector type to the message.

:type value: datatype.Vector
"""
if value.getSubtype() == datatype.Vector.DOUBLE:
return self.putVectorDouble(value)

raise DataError("unsupported value for VECTOR subtype: %d" % (value.getSubtype()))

def putValue(self, value): # pylint: disable=too-many-return-statements
# type: (Any) -> EncodedSession
"""Call the supporting function based on the type of the value."""
Expand Down Expand Up @@ -806,6 +839,11 @@ def putValue(self, value): # pylint: disable=too-many-return-statements
if isinstance(value, bool):
return self.putBoolean(value)

# we don't want to autodetect lists as being VECTOR, so we
# only bind double if it is the explicit type
if isinstance(value, datatype.Vector):
return self.putVector(value)

# I find it pretty bogus that we pass str(value) here: why not value?
return self.putString(str(value))

Expand Down Expand Up @@ -1035,6 +1073,36 @@ def getUUID(self):

raise DataError('Not a UUID')

def getVector(self):
# type: () -> datatype.Vector
"""Read the next vector off the session.

:rtype datatype.Vector
"""
if self._getTypeCode() == protocol.VECTOR:
subtype = crypt.fromByteString(self._takeBytes(1))
if subtype == protocol.VECTOR_DOUBLE:
# VECTOR(<dim>, DOUBLE)
lengthBytes = crypt.fromByteString(self._takeBytes(1))
length = crypt.fromByteString(self._takeBytes(lengthBytes))

if length % 8 != 0:
raise DataError("Invalid size for VECTOR DOUBLE data: %d" % (length))

dimension = length // 8

# VECTOR DOUBLE stores the data as little endian
vector = datatype.Vector(datatype.Vector.DOUBLE,
[struct.unpack('<d', self._takeBytes(8))[0]
for _ in range(dimension)])

return vector
else:
raise DataError("Unknown VECTOR type: %d" % (subtype))
return 1

raise DataError('Not a VECTOR')

def getScaledCount2(self):
# type: () -> decimal.Decimal
"""Read a scaled and signed decimal from the session.
Expand Down Expand Up @@ -1110,6 +1178,9 @@ def getValue(self):
if code == protocol.UUID:
return self.getUUID()

if code == protocol.VECTOR:
return self.getVector()

if code == protocol.SCALEDCOUNT2:
return self.getScaledCount2()

Expand Down
5 changes: 4 additions & 1 deletion pynuodb/protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@
BLOBLEN4 = 193
CLOBLEN0 = 194
CLOBLEN4 = 198
SCALEDCOUNT1 = 199
VECTOR = 199
UUID = 200
SCALEDDATELEN0 = 200
SCALEDDATELEN1 = 201
Expand All @@ -66,6 +66,9 @@
DEBUGBARRIER = 240
SCALEDTIMESTAMPNOTZ = 241

# subtypes of the VECTOR type
VECTOR_DOUBLE = 0

# Protocol Messages
FAILURE = 0
OPENDATABASE = 3
Expand Down
86 changes: 86 additions & 0 deletions tests/nuodb_types_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
import decimal
import datetime

from pynuodb import datatype

from . import nuodb_base


Expand Down Expand Up @@ -125,3 +127,87 @@ def test_null_type(self):
assert len(row) == 1
assert cursor.description[0][1] == null_type
assert row[0] is None

def test_vector_type(self):
con = self._connect()
cursor = con.cursor()

# only activate this tests if tested against version 8 or above
cursor.execute("select cast(substring_index(release_ver, '.', 1) as int)"
" from system.nodes limit 1")
Comment on lines +136 to +137
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Wouldn't it be safer and easier to use geteffectiveversion() and compare based on that integer value, rather than parsing the version string?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just didn't know we have that function. Much better idea to use that.

row = cursor.fetchone()
database_major_version = row[0]
if database_major_version < 8:
return
Comment on lines +135 to +141
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't have a problem with this, per se.

I think it would be cleaner to obtain this information once somewhere in conftest.py and make it available there, for example maybe in the DATABASE_FIXTURE since that already creates a connection.

The current DATABASE_FIXTURE is the connection arguments but you could add this there and then extract it again in the nuodb_base.NuoBase._setup().

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll look into that. If it isn't extremely hard (which I doubt) I'll do that.


cursor.execute("CREATE TEMPORARY TABLE tmp ("
" vec3 VECTOR(3, DOUBLE),"
" vec5 VECTOR(5, DOUBLE))")

cursor.execute("INSERT INTO tmp VALUES ("
" '[1.1,2.2,33.33]',"
" '[-1,2,-3,4,-5]')")

cursor.execute("SELECT * FROM tmp")

# check metadata
[name, type, _, _, precision, scale, _] = cursor.description[0]
assert name == "VEC3"
assert type == datatype.VECTOR_DOUBLE
assert precision == 3
assert scale == 0

[name, type, _, _, precision, scale, _] = cursor.description[1]
assert name == "VEC5"
assert type == datatype.VECTOR_DOUBLE
assert precision == 5
assert scale == 0

# check content
row = cursor.fetchone()
assert len(row) == 2
assert row[0] == [1.1, 2.2, 33.33]
assert row[1] == [-1, 2, -3, 4, -5]
assert cursor.fetchone() is None

# check this is actually a Vector type, not just a list
assert isinstance(row[0], datatype.Vector)
assert row[0].getSubtype() == datatype.Vector.DOUBLE
assert isinstance(row[1], datatype.Vector)
assert row[1].getSubtype() == datatype.Vector.DOUBLE

# check prepared parameters
parameters = [datatype.Vector(datatype.Vector.DOUBLE, [11.11, -2.2, 3333.333]),
datatype.Vector(datatype.Vector.DOUBLE, [-1.23, 2.345, -0.34, 4, -5678.9])]
cursor.execute("TRUNCATE TABLE tmp")
cursor.execute("INSERT INTO tmp VALUES (?, ?)", parameters)

cursor.execute("SELECT * FROM tmp")

# check content
row = cursor.fetchone()
assert len(row) == 2
assert row[0] == parameters[0]
assert row[1] == parameters[1]
assert cursor.fetchone() is None

# check that the inserted values are interpreted correctly by the database
cursor.execute("SELECT CAST(vec3 AS STRING) || ' - ' || CAST(vec5 AS STRING) AS strRep"
" FROM tmp")

row = cursor.fetchone()
assert len(row) == 1
assert row[0] == "[11.11,-2.2,3333.333] - [-1.23,2.345,-0.34,4,-5678.9]"
assert cursor.fetchone() is None

# currently binding a list also works - this is done via implicit string
# conversion of the passed argument in default bind case
parameters = [[11.11, -2.2, 3333.333]]
cursor.execute("SELECT VEC3 = ? FROM tmp", parameters)

# check content
row = cursor.fetchone()
assert len(row) == 1
assert row[0] is True
assert cursor.fetchone() is None