Skip to content

Commit b751db7

Browse files
Support int8 & uint8 vector dtypes (#279)
This PR includes support for two integer dtypes and also marks the 'dtype' kwarg in embedding methods as deprecated.
1 parent f41581a commit b751db7

File tree

15 files changed

+59
-42
lines changed

15 files changed

+59
-42
lines changed

docs/user_guide/04_vectorizers.ipynb

Lines changed: 8 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -223,12 +223,12 @@
223223
},
224224
{
225225
"cell_type": "code",
226-
"execution_count": 7,
226+
"execution_count": null,
227227
"metadata": {},
228228
"outputs": [],
229229
"source": [
230230
"# You can also create many embeddings at once\n",
231-
"embeddings = hf.embed_many(sentences, as_buffer=True, dtype=\"float32\")\n"
231+
"embeddings = hf.embed_many(sentences, as_buffer=True)\n"
232232
]
233233
},
234234
{
@@ -660,13 +660,13 @@
660660
"metadata": {},
661661
"source": [
662662
"## Selecting your float data type\n",
663-
"When embedding text as byte arrays RedisVL supports 4 different floating point data types, `float16`, `float32`, `float64` and `bfloat16`.\n",
663+
"When embedding text as byte arrays RedisVL supports 4 different floating point data types, `float16`, `float32`, `float64` and `bfloat16`, and 2 integer types, `int8` and `uint8`.\n",
664664
"Your dtype set for your vectorizer must match what is defined in your search index. If one is not explicitly set the default is `float32`."
665665
]
666666
},
667667
{
668668
"cell_type": "code",
669-
"execution_count": 19,
669+
"execution_count": null,
670670
"metadata": {},
671671
"outputs": [],
672672
"source": [
@@ -675,8 +675,9 @@
675675
"# subsequent calls to embed('', as_buffer=True) and embed_many('', as_buffer=True) will now encode as float16\n",
676676
"float16_bytes = vectorizer.embed('test sentence', as_buffer=True)\n",
677677
"\n",
678-
"# you can override this setting on each individual method call\n",
679-
"float64_bytes = vectorizer.embed('test sentence', as_buffer=True, dtype=\"float64\")\n",
678+
"# to generate embeddings with different dtype instantiate a new vectorizer\n",
679+
"vectorizer_64 = HFTextVectorizer(dtype='float64')\n",
680+
"float64_bytes = vectorizer_64.embed('test sentence', as_buffer=True)\n",
680681
"\n",
681682
"float16_bytes != float64_bytes"
682683
]
@@ -690,38 +691,6 @@
690691
"# cleanup\n",
691692
"index.delete()"
692693
]
693-
},
694-
{
695-
"cell_type": "code",
696-
"execution_count": null,
697-
"metadata": {},
698-
"outputs": [],
699-
"source": []
700-
},
701-
{
702-
"cell_type": "code",
703-
"execution_count": 1,
704-
"metadata": {},
705-
"outputs": [],
706-
"source": [
707-
"dist = max(i for i in range(10))"
708-
]
709-
},
710-
{
711-
"cell_type": "code",
712-
"execution_count": null,
713-
"metadata": {},
714-
"outputs": [],
715-
"source": [
716-
"dist"
717-
]
718-
},
719-
{
720-
"cell_type": "code",
721-
"execution_count": null,
722-
"metadata": {},
723-
"outputs": [],
724-
"source": []
725694
}
726695
],
727696
"metadata": {
@@ -746,4 +715,4 @@
746715
},
747716
"nbformat": 4,
748717
"nbformat_minor": 2
749-
}
718+
}

redisvl/redis/utils.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ def array_to_buffer(array: List[float], dtype: str) -> bytes:
4141
raise ValueError(
4242
f"Invalid data type: {dtype}. Supported types are: {[t.lower() for t in VectorDataType]}"
4343
)
44-
return np.array(array).astype(dtype.lower()).tobytes()
44+
return np.array(array, dtype=dtype.lower()).tobytes()
4545

4646

4747
def buffer_to_array(buffer: bytes, dtype: str) -> List[Any]:

redisvl/schema/fields.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ class VectorDataType(str, Enum):
3030
FLOAT16 = "FLOAT16"
3131
FLOAT32 = "FLOAT32"
3232
FLOAT64 = "FLOAT64"
33+
INT8 = "INT8"
34+
UINT8 = "UINT8"
3335

3436

3537
class VectorIndexAlgorithm(str, Enum):

redisvl/utils/vectorize/text/azureopenai.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from tenacity import retry, stop_after_attempt, wait_random_exponential
66
from tenacity.retry import retry_if_not_exception_type
77

8+
from redisvl.utils.utils import deprecated_argument
89
from redisvl.utils.vectorize.base import BaseVectorizer
910

1011
# ignore that openai isn't imported
@@ -167,6 +168,7 @@ def _set_model_dims(self, model) -> int:
167168
stop=stop_after_attempt(6),
168169
retry=retry_if_not_exception_type(TypeError),
169170
)
171+
@deprecated_argument("dtype")
170172
def embed_many(
171173
self,
172174
texts: List[str],
@@ -213,6 +215,7 @@ def embed_many(
213215
stop=stop_after_attempt(6),
214216
retry=retry_if_not_exception_type(TypeError),
215217
)
218+
@deprecated_argument("dtype")
216219
def embed(
217220
self,
218221
text: str,
@@ -251,6 +254,7 @@ def embed(
251254
stop=stop_after_attempt(6),
252255
retry=retry_if_not_exception_type(TypeError),
253256
)
257+
@deprecated_argument("dtype")
254258
async def aembed_many(
255259
self,
256260
texts: List[str],
@@ -299,6 +303,7 @@ async def aembed_many(
299303
stop=stop_after_attempt(6),
300304
retry=retry_if_not_exception_type(TypeError),
301305
)
306+
@deprecated_argument("dtype")
302307
async def aembed(
303308
self,
304309
text: str,

redisvl/utils/vectorize/text/bedrock.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from tenacity import retry, stop_after_attempt, wait_random_exponential
77
from tenacity.retry import retry_if_not_exception_type
88

9+
from redisvl.utils.utils import deprecated_argument
910
from redisvl.utils.vectorize.base import BaseVectorizer
1011

1112

@@ -118,6 +119,7 @@ def _set_model_dims(self, model: str) -> int:
118119
stop=stop_after_attempt(6),
119120
retry=retry_if_not_exception_type(TypeError),
120121
)
122+
@deprecated_argument("dtype")
121123
def embed(
122124
self,
123125
text: str,
@@ -158,6 +160,7 @@ def embed(
158160
stop=stop_after_attempt(6),
159161
retry=retry_if_not_exception_type(TypeError),
160162
)
163+
@deprecated_argument("dtype")
161164
def embed_many(
162165
self,
163166
texts: List[str],

redisvl/utils/vectorize/text/cohere.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from tenacity import retry, stop_after_attempt, wait_random_exponential
66
from tenacity.retry import retry_if_not_exception_type
77

8+
from redisvl.utils.utils import deprecated_argument
89
from redisvl.utils.vectorize.base import BaseVectorizer
910

1011
# ignore that cohere isn't imported
@@ -111,6 +112,7 @@ def _set_model_dims(self, model) -> int:
111112
raise ValueError(f"Error setting embedding model dimensions: {str(e)}")
112113
return len(embedding)
113114

115+
@deprecated_argument("dtype")
114116
def embed(
115117
self,
116118
text: str,
@@ -177,6 +179,7 @@ def embed(
177179
stop=stop_after_attempt(6),
178180
retry=retry_if_not_exception_type(TypeError),
179181
)
182+
@deprecated_argument("dtype")
180183
def embed_many(
181184
self,
182185
texts: List[str],

redisvl/utils/vectorize/text/custom.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
from pydantic.v1 import PrivateAttr
44

5+
from redisvl.utils.utils import deprecated_argument
56
from redisvl.utils.vectorize.base import BaseVectorizer
67

78

@@ -155,6 +156,7 @@ def _validate_sync_callables(self) -> int:
155156

156157
return dims
157158

159+
@deprecated_argument("dtype")
158160
def embed(
159161
self,
160162
text: str,
@@ -191,6 +193,7 @@ def embed(
191193

192194
return self._process_embedding(result, as_buffer, dtype)
193195

196+
@deprecated_argument("dtype")
194197
def embed_many(
195198
self,
196199
texts: List[str],
@@ -239,6 +242,7 @@ def embed_many(
239242
return embeddings
240243

241244
@validate_async
245+
@deprecated_argument("dtype")
242246
async def aembed(
243247
self,
244248
text: str,
@@ -280,6 +284,7 @@ async def aembed(
280284
return self._process_embedding(result, as_buffer, dtype)
281285

282286
@validate_async
287+
@deprecated_argument("dtype")
283288
async def aembed_many(
284289
self,
285290
texts: List[str],

redisvl/utils/vectorize/text/huggingface.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
from pydantic.v1 import PrivateAttr
44

5+
from redisvl.utils.utils import deprecated_argument
56
from redisvl.utils.vectorize.base import BaseVectorizer
67

78

@@ -79,6 +80,7 @@ def _set_model_dims(self):
7980
raise ValueError(f"Error setting embedding model dimensions: {str(e)}")
8081
return len(embedding)
8182

83+
@deprecated_argument("dtype")
8284
def embed(
8385
self,
8486
text: str,
@@ -112,6 +114,7 @@ def embed(
112114
embedding = self._client.encode([text], **kwargs)[0]
113115
return self._process_embedding(embedding.tolist(), as_buffer, dtype)
114116

117+
@deprecated_argument("dtype")
115118
def embed_many(
116119
self,
117120
texts: List[str],

redisvl/utils/vectorize/text/mistral.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from tenacity import retry, stop_after_attempt, wait_random_exponential
66
from tenacity.retry import retry_if_not_exception_type
77

8+
from redisvl.utils.utils import deprecated_argument
89
from redisvl.utils.vectorize.base import BaseVectorizer
910

1011
# ignore that mistralai isn't imported
@@ -116,6 +117,7 @@ def _set_model_dims(self, model) -> int:
116117
stop=stop_after_attempt(6),
117118
retry=retry_if_not_exception_type(TypeError),
118119
)
120+
@deprecated_argument("dtype")
119121
def embed_many(
120122
self,
121123
texts: List[str],
@@ -162,6 +164,7 @@ def embed_many(
162164
stop=stop_after_attempt(6),
163165
retry=retry_if_not_exception_type(TypeError),
164166
)
167+
@deprecated_argument("dtype")
165168
def embed(
166169
self,
167170
text: str,
@@ -200,6 +203,7 @@ def embed(
200203
stop=stop_after_attempt(6),
201204
retry=retry_if_not_exception_type(TypeError),
202205
)
206+
@deprecated_argument("dtype")
203207
async def aembed_many(
204208
self,
205209
texts: List[str],
@@ -248,6 +252,7 @@ async def aembed_many(
248252
stop=stop_after_attempt(6),
249253
retry=retry_if_not_exception_type(TypeError),
250254
)
255+
@deprecated_argument("dtype")
251256
async def aembed(
252257
self,
253258
text: str,

redisvl/utils/vectorize/text/openai.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from tenacity import retry, stop_after_attempt, wait_random_exponential
66
from tenacity.retry import retry_if_not_exception_type
77

8+
from redisvl.utils.utils import deprecated_argument
89
from redisvl.utils.vectorize.base import BaseVectorizer
910

1011
# ignore that openai isn't imported
@@ -121,6 +122,7 @@ def _set_model_dims(self, model) -> int:
121122
stop=stop_after_attempt(6),
122123
retry=retry_if_not_exception_type(TypeError),
123124
)
125+
@deprecated_argument("dtype")
124126
def embed_many(
125127
self,
126128
texts: List[str],
@@ -167,6 +169,7 @@ def embed_many(
167169
stop=stop_after_attempt(6),
168170
retry=retry_if_not_exception_type(TypeError),
169171
)
172+
@deprecated_argument("dtype")
170173
def embed(
171174
self,
172175
text: str,
@@ -205,6 +208,7 @@ def embed(
205208
stop=stop_after_attempt(6),
206209
retry=retry_if_not_exception_type(TypeError),
207210
)
211+
@deprecated_argument("dtype")
208212
async def aembed_many(
209213
self,
210214
texts: List[str],
@@ -253,6 +257,7 @@ async def aembed_many(
253257
stop=stop_after_attempt(6),
254258
retry=retry_if_not_exception_type(TypeError),
255259
)
260+
@deprecated_argument("dtype")
256261
async def aembed(
257262
self,
258263
text: str,

redisvl/utils/vectorize/text/vertexai.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from tenacity import retry, stop_after_attempt, wait_random_exponential
66
from tenacity.retry import retry_if_not_exception_type
77

8+
from redisvl.utils.utils import deprecated_argument
89
from redisvl.utils.vectorize.base import BaseVectorizer
910

1011

@@ -128,6 +129,7 @@ def _set_model_dims(self) -> int:
128129
stop=stop_after_attempt(6),
129130
retry=retry_if_not_exception_type(TypeError),
130131
)
132+
@deprecated_argument("dtype")
131133
def embed_many(
132134
self,
133135
texts: List[str],
@@ -173,6 +175,7 @@ def embed_many(
173175
stop=stop_after_attempt(6),
174176
retry=retry_if_not_exception_type(TypeError),
175177
)
178+
@deprecated_argument("dtype")
176179
def embed(
177180
self,
178181
text: str,

0 commit comments

Comments
 (0)