-
Notifications
You must be signed in to change notification settings - Fork 12
Open
Description
If you pass a string to lembed() whose tokenized length exceeds the model's maximum, a segmentation fault is triggered. The code below will output 'success for 2040' and then segfault.
It would be nice if you could pass a truncate flag or a similar option.
# Reproduction script: calls lembed() on two inputs of slightly different
# length; per the report, the first call succeeds and the second segfaults.

# Reference check with the Hugging Face tokenizer: count the tokens produced
# for the first 2040 characters of the repeated 'abc ' test string.
from transformers import AutoTokenizer
MODEL_ID = "Snowflake/snowflake-arctic-embed-m-v1.5"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
print(len(tokenizer(('abc '*2000)[:2040])['input_ids'])) # 512
import sqlite3
import sqlite_vec
import sqlite_lembed
import time
# Helper: run a SQL statement on the module-level connection `db` and return
# every result row.
def q(sql): return db.execute(sql).fetchall()
# In-memory database; extension loading is enabled only while the two
# extensions are being loaded, then switched off again.
db = sqlite3.connect(":memory:")
db.enable_load_extension(True)
sqlite_vec.load(db)
sqlite_lembed.load(db)
db.enable_load_extension(False)
cursor = db.cursor()
# Register the model under the name 'default' from a file path in the
# reporter's local Hugging Face cache (machine-specific; adjust to reproduce).
cursor.execute("""insert into lembed_models(name, model) values ('default', lembed_model_from_file('/Users/ntaylor/.cache/huggingface/hub/models--yishan-wang--snowflake-arctic-embed-m-v1.5-Q8_0-GGUF/blobs/1073b6711706f55b451efe6c3ecf7398bf93c8c3b9b2df918673df9b77146a34'));""")
cursor.execute("create table foo(mycol text);")
# Case 1: embed a substr() of the stored text with length argument 2040.
# NOTE(review): SQLite's substr() is 1-indexed, so a start of 0 likely yields
# one character fewer than the length argument (2039 here, 2040 below), which
# would not match the Python slice [:2040] used above — confirm against the
# SQLite documentation.
q("delete from foo")
cursor.execute("insert into foo values (?)", ['abc '*2000])
q("select lembed(substr(mycol, 0, 2040)) from foo")
print('success for 2040')
# Pause between the two cases — presumably so the message above is visible
# before the crash; TODO confirm.
time.sleep(1)
# Case 2: same query with length argument 2041 (the stored text is longer
# here, 'abc '*5000, but substr() still bounds what is embedded). According to
# the report this call segfaults, so the final print is never reached.
q("delete from foo")
cursor.execute("insert into foo values (?)", ['abc '*5000])
q("select lembed(substr(mycol, 0, 2041)) from foo")
print('success for 2041')
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
No labels