Skip to content

Segfault occurs if lembed() input exceeds the model's maximum tokens #7

@nattaylor

Description

@nattaylor

If you pass a string to lembed() whose tokenized length exceeds the model's maximum, a segmentation fault is triggered. The code below will output 'success for 2040' and then segfault.

It would be nice if you could pass a truncate flag or something similar.

from transformers import AutoTokenizer
# Hugging Face model id whose tokenizer is used to count tokens.
MODEL_ID = "Snowflake/snowflake-arctic-embed-m-v1.5"
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# Tokenize the first 2040 characters of the repeated 'abc ' text and
# print how many input ids the tokenizer produces.
sample_text = ('abc ' * 2000)[:2040]
token_ids = tokenizer(sample_text)['input_ids']
print(len(token_ids))  # reported output: 512

import sqlite3
import sqlite_vec
import sqlite_lembed
import time

def q(sql):
    """Execute *sql* on the module-level ``db`` connection and return all rows."""
    result = db.execute(sql)
    return result.fetchall()

# In-memory database used for the reproduction; nothing is written to disk.
db = sqlite3.connect(":memory:")
# Extension loading is switched on only while sqlite_vec and sqlite_lembed
# are loaded into this connection, then switched off again.
db.enable_load_extension(True)
sqlite_vec.load(db)
sqlite_lembed.load(db)
db.enable_load_extension(False)

# Inserts go through this cursor; q() executes directly on db.
cursor = db.cursor()
# Insert a row named 'default' into lembed_models, with the model produced by
# lembed_model_from_file() from a path in the reporter's local Hugging Face cache.
cursor.execute("""insert into lembed_models(name, model) values ('default', lembed_model_from_file('/Users/ntaylor/.cache/huggingface/hub/models--yishan-wang--snowflake-arctic-embed-m-v1.5-Q8_0-GGUF/blobs/1073b6711706f55b451efe6c3ecf7398bf93c8c3b9b2df918673df9b77146a34'));""")

# Single-column table holding the text that is passed to lembed().
cursor.execute("create table foo(mycol text);")

# Case 1: embed a substr() of the stored text with length argument 2040.
# Per the report this succeeds.
# NOTE(review): SQLite's substr() is 1-based, so a start of 0 presumably yields
# one character fewer than the length argument (2039 here, 2040 in case 2) —
# confirm against the SQLite documentation before relying on the exact numbers.
q("delete from foo")
cursor.execute("insert into foo values (?)", ['abc '*2000])
q("select lembed(substr(mycol, 0, 2040)) from foo")
print('success for 2040')
# Brief pause before the second case (the report does not state why).
time.sleep(1)
# Case 2: same query with length argument 2041. Per the report this call
# segfaults, so the final print is never reached.
q("delete from foo")
cursor.execute("insert into foo values (?)", ['abc '*5000])
q("select lembed(substr(mycol, 0, 2041)) from foo")
print('success for 2041')

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions