Skip to content
This repository was archived by the owner on Aug 13, 2024. It is now read-only.

Commit e41a463

Browse files
committed
Types, tokenizer model support.
1 parent 8d51e58 commit e41a463

File tree

4 files changed

+21
-10
lines changed

4 files changed

+21
-10
lines changed

code_indexer_loop/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.1.0"
1+
__version__ = "0.2.0"

code_indexer_loop/api.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from langchain.embeddings.openai import OpenAIEmbeddings
77
from llama_index import ServiceContext, VectorStoreIndex
88
from llama_index.embeddings import LangchainEmbedding
9-
from llama_index.schema import TextNode
9+
from llama_index.schema import NodeWithScore, TextNode
1010
from llama_index.vector_stores import ChromaVectorStore
1111
from watchdog.events import FileSystemEventHandler
1212
from watchdog.observers import Observer
@@ -54,7 +54,7 @@ def query(self, query: str, k=10) -> str:
5454
[node_with_score.node.text for node_with_score in self.index.as_retriever(k=k).retrieve(query)]
5555
)
5656

57-
def query_nodes(self, query: str, k=10) -> list[TextNode]:
57+
def query_nodes(self, query: str, k=10) -> list[NodeWithScore]:
5858
return self.index.as_retriever(k=k).retrieve(query)
5959

6060
def query_documents(self, query: str, k=10) -> list[dict[str, str]]:

code_indexer_loop/code_splitter.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@
1212
from typing import List, Optional, Union
1313

1414
import tiktoken
15-
from tiktoken.model import MODEL_TO_ENCODING
1615
from tree_sitter import Node
1716

1817

@@ -71,11 +70,11 @@ def count(self, text: str, model: Optional[str] = None):
7170
if model is None:
7271
model = self.default_model
7372

74-
if model not in MODEL_TO_ENCODING.keys():
75-
raise ValueError(f"Model {model} not supported.")
76-
7773
if model not in self.initialized_models:
78-
self.initialized_models[model] = tiktoken.encoding_for_model(model)
74+
try:
75+
self.initialized_models[model] = tiktoken.encoding_for_model(model)
76+
except KeyError:
77+
raise KeyError(f"Model {model} not supported.")
7978

8079
return len(self.initialized_models[model].encode(text, disallowed_special=()))
8180

code_indexer_loop/test_api.py

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,9 @@
22

33
import pytest
44

5-
from code_indexer_loop.code_splitter import (CodeSplitter, MaxChunkLengthExceededError,
6-
TokenCounter)
5+
from code_indexer_loop.code_splitter import (CodeSplitter,
6+
MaxChunkLengthExceededError,
7+
TokenCounter)
78

89
THIS_FILE_DIR = os.path.dirname(os.path.realpath(__file__))
910

@@ -19,6 +20,17 @@ def create_code_splitter(language="python", target_chunk_tokens=5, max_chunk_tok
1920
)
2021

2122

23+
def test_code_splitter_prefix_model():
24+
CodeSplitter(
25+
language="python",
26+
target_chunk_tokens=10,
27+
max_chunk_tokens=10,
28+
enforce_max_chunk_tokens=True,
29+
token_model="gpt-4-32k-0613",
30+
coalesce=50,
31+
)
32+
33+
2234
def test_code_splitter():
2335
python_code_splitter = create_code_splitter()
2436
chunks = python_code_splitter.split_text(

0 commit comments

Comments (0)