perf: replace tiktoken with wc; feat: jsonl schemas
bionicles committed Jul 11, 2024
1 parent 7ffc843 commit eec2b27
Showing 12 changed files with 404 additions and 52 deletions.
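Context for the `perf` half of the title: tiktoken must run byte-pair-encoding merges over every file, while a `wc`-style count is a single linear scan, so the default counter gets much cheaper. The names `TokenizerName`, `TokenLineCount`, and `count_tokens_lines` in the sketch below are taken from the diffs that follow; the function body is only an illustrative guess at the idea, not the library's implementation, and the WC default is inferred from `test_units_token_counting_wc`, which calls `count_tokens_lines` with no tokenizer argument.

```python
# Sketch only (not the library's implementation): mirrors the TokenizerName /
# TokenLineCount / count_tokens_lines names used in the diffs below.
from dataclasses import dataclass
from enum import Enum


class TokenizerName(str, Enum):
    WC = "wc"        # fast, approximate: whitespace words, like `wc -w`
    GPT4 = "gpt4"    # tiktoken cl100k_base
    GPT4O = "gpt4o"  # tiktoken o200k_base


@dataclass
class TokenLineCount:
    n_tokens: int
    n_lines: int


def count_tokens_lines(
    path: str,
    tokenizer_name: TokenizerName = TokenizerName.WC,  # assumed default
) -> TokenLineCount:
    with open(path, "r", encoding="utf-8") as f:
        text = f.read()
    n_lines = text.count("\n")  # assumption: newline count, like `wc -l`
    if tokenizer_name is TokenizerName.WC:
        # single linear scan over the text: this is the `perf` win
        n_tokens = len(text.split())
    else:
        import tiktoken  # slow path: full BPE merge over the whole file

        name = "o200k_base" if tokenizer_name is TokenizerName.GPT4O else "cl100k_base"
        n_tokens = len(tiktoken.get_encoding(name).encode(text))
    return TokenLineCount(n_tokens=n_tokens, n_lines=n_lines)
```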
3 changes: 3 additions & 0 deletions .gitignore
@@ -41,6 +41,9 @@ detritus/
tests/.pytest_cache
.pytest_cache

# ignore the absurdly huge jsonl file
absurdly_huge.jsonl

# deployment stuff
tests/version_increments/test_sink_version.py
tests/version_increments/dry_run_version.py
10 changes: 7 additions & 3 deletions Makefile
@@ -19,14 +19,20 @@ debug:
	nodemon -L -V

.PHONY: debug_command
debug_command: html_demo
# debug_command: test
debug_command: test_group7

html_demo:
	tree_plus https://en.wikipedia.org/wiki/Zero_ring
	# tree_plus --yc

# test data for the jsonl tokenization
absurdly-huge-jsonl:
	python tests/build_absurdly_huge_jsonl.py

# TESTS
test: test_sequential test_tp_dotdot test_e2e test_cli test_programs test_deploy

N_WORKERS=12
# parallel unit tests (for dev rig)
test_parallel:
@@ -44,8 +50,6 @@ test_more_languages:
test_group7:
	pytest tests/test_more_language_units.py -vv -k group7

# test: test_sequential
test: test_sequential test_tp_dotdot test_e2e test_cli test_programs test_deploy

# first we'll do our unit tests (most likely to need fast debug)
test_units:
65 changes: 65 additions & 0 deletions tests/build_absurdly_huge_jsonl.py
@@ -0,0 +1,65 @@
import pathlib
import json


import polars as pl

N_REPEATS = 42_000


def generate_huge_jsonl(
    df: pl.DataFrame,
    output_file: pathlib.Path,
    repeat: int,
) -> bool:
    """
    Generate an absurdly huge JSONL file from a Polars DataFrame.

    Parameters:
        df (polars.DataFrame): The input DataFrame.
        output_file (pathlib.Path): The path to the output JSONL file.
        repeat (int): The number of times to repeat the DataFrame in the output file.

    Returns:
        bool: True once the file has been written.
    """
    with open(output_file, "w+") as f:
        for _ in range(repeat):
            for row in df.iter_rows():
                # keep the column names as JSON keys; dumping the bare row
                # tuple would emit only the values
                row_dict = dict(zip(df.columns, row))
                f.write(json.dumps(row_dict) + "\n")

    return True


def main():
    df = pl.DataFrame(
        {
            "SMILES": ["CCO", "CC(=O)O", "CC(=O)OC1=CC=CC=C1"],
            "Yield": [0.85, 0.72, 0.63],
            "Temperature": [25, 30, 35],
            "Pressure": [1.0, 1.5, 2.0],
            "Solvent": ["Ethanol", "Water", "Methanol"],
            "Success": [True, False, True],
            "Reaction_Conditions": [
                {"Temperature": 25, "Pressure": 1.0, "Solvent": "Ethanol"},
                {"Temperature": 30, "Pressure": 1.5, "Solvent": "Water"},
                {"Temperature": 35, "Pressure": 2.0, "Solvent": "Methanol"},
            ],
            "Products": [
                ["Ethane", "Carbon Dioxide"],
                ["Ethanol", "Carbon Dioxide"],
                ["Methanol", "Carbon Dioxide"],
            ],
            "EdgeCasesMissed": [None, None, None],
        }
    )

    huge_jsonl_path = (
        pathlib.Path() / "tests" / "more_languages" / "group7" / "absurdly_huge.jsonl"
    )
    # Generate a huge JSONL file by repeating the DataFrame N_REPEATS (42,000) times
    assert generate_huge_jsonl(df, huge_jsonl_path, N_REPEATS)
    print(f"Ok(huge_jsonl_path = {huge_jsonl_path})")


if __name__ == "__main__":
    main()
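For reference, the first record the script emits keeps the column names as keys, so nested structures and nulls survive intact on every line (this line is derived directly from the DataFrame above):

```json
{"SMILES": "CCO", "Yield": 0.85, "Temperature": 25, "Pressure": 1.0, "Solvent": "Ethanol", "Success": true, "Reaction_Conditions": {"Temperature": 25, "Pressure": 1.0, "Solvent": "Ethanol"}, "Products": ["Ethane", "Carbon Dioxide"], "EdgeCasesMissed": null}
```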
136 changes: 125 additions & 11 deletions tests/test_e2e.py
@@ -52,7 +52,10 @@ def unify_tree_symbols(tree_string)


def test_e2e_single_file():
    result = tree_plus.from_seed(f"{test_directory}/file.py")
    result = tree_plus.from_seed(
        f"{test_directory}/file.py",
        tokenizer_name=tree_plus.TokenizerName.GPT4,
    )
    assert isinstance(result, tree_plus.TreePlus)
    result.render()
    result_str = result.into_str()
@@ -79,7 +82,93 @@ def test_e2e_empty_folder()
    assert unify_tree_symbols(result_str) == EXPECTATION_EMPTY


EXPECTATION_1 = """📁 path_to_test (1 folder, 6 files)
EXPECTATION_1_WC = """📁 path_to_test (1 folder, 6 files)
├── 📄 class_method_type.py (530 tokens, 102 lines)
│ ├── T = TypeVar("T")
│ ├── def parse_py(contents: str) -> List[str]
│ ├── class MyClass
│ ├── @staticmethod
│ │ def physical_element_aval(dtype) -> core.ShapedArray
│ ├── def my_method(self)
│ ├── @staticmethod
│ │ def my_typed_method(obj: dict) -> int
│ ├── def my_multiline_signature_method(
│ │ self,
│ │ alice: str = None,
│ │ bob: int = None,
│ │ ) -> tuple
│ ├── @lru_cache(maxsize=None)
│ │ def my_multiline_signature_function(
│ │ tree: tuple = (),
│ │ plus: str = "+",
│ │ ) -> tuple
│ ├── class LogLevelEnum(str, Enum)
│ ├── class Algo(BaseModel)
│ ├── @dataclass
│ │ class TestDataclass
│ ├── A = TypeVar("A", str, bytes)
│ ├── def omega_yikes(file: str, expected: List[str]) -> bool
│ ├── def ice[T](args: Iterable[T] = ())
│ ├── class list[T]
│ ├── def __getitem__(self, index: int, /) -> T
│ ├── @classmethod
│ │ def from_code(cls, toolbox, code: bytes, score=None) -> "Algo"
│ ├── @classmethod
│ │ def from_str(cls, toolbox, string: str, score=None) -> 'Algo'
│ └── class Router(hk.Module)
├── 📄 empty.py (0 tokens, 0 lines)
├── 📄 file.md (11 tokens, 2 lines)
│ └── # Hello, world!
├── 📄 file.py (18 tokens, 3 lines)
│ └── def hello_world()
├── 📄 file.txt (10 tokens, 2 lines)
└── 📄 version.py (13 tokens, 2 lines)
└── __version__ = "1.2.3"
"""
EXPECTATION_1_GPT4 = """📁 path_to_test (1 folder, 6 files)
├── 📄 class_method_type.py (541 tokens, 103 lines)
│ ├── T = TypeVar("T")
│ ├── def parse_py(contents: str) -> List[str]
│ ├── class MyClass
│ ├── @staticmethod
│ │ def physical_element_aval(dtype) -> core.ShapedArray
│ ├── def my_method(self)
│ ├── @staticmethod
│ │ def my_typed_method(obj: dict) -> int
│ ├── def my_multiline_signature_method(
│ │ self,
│ │ alice: str = None,
│ │ bob: int = None,
│ │ ) -> tuple
│ ├── @lru_cache(maxsize=None)
│ │ def my_multiline_signature_function(
│ │ tree: tuple = (),
│ │ plus: str = "+",
│ │ ) -> tuple
│ ├── class LogLevelEnum(str, Enum)
│ ├── class Algo(BaseModel)
│ ├── @dataclass
│ │ class TestDataclass
│ ├── A = TypeVar("A", str, bytes)
│ ├── def omega_yikes(file: str, expected: List[str]) -> bool
│ ├── def ice[T](args: Iterable[T] = ())
│ ├── class list[T]
│ ├── def __getitem__(self, index: int, /) -> T
│ ├── @classmethod
│ │ def from_code(cls, toolbox, code: bytes, score=None) -> "Algo"
│ ├── @classmethod
│ │ def from_str(cls, toolbox, string: str, score=None) -> 'Algo'
│ └── class Router(hk.Module)
├── 📄 empty.py (0 tokens, 0 lines)
├── 📄 file.md (12 tokens, 2 lines)
│ └── # Hello, world!
├── 📄 file.py (19 tokens, 3 lines)
│ └── def hello_world()
├── 📄 file.txt (11 tokens, 2 lines)
└── 📄 version.py (19 tokens, 2 lines)
└── __version__ = "1.2.3"
"""
EXPECTATION_1_GPT4O = """📁 path_to_test (1 folder, 6 files)
├── 📄 class_method_type.py (541 tokens, 103 lines)
│ ├── T = TypeVar("T")
│ ├── def parse_py(contents: str) -> List[str]
@@ -124,32 +213,55 @@ def test_e2e_empty_folder()
"""


def test_e2e_single_directory():
    result = tree_plus.from_seeds((test_directory,))
@pytest.mark.parametrize(
    "tokenizer_name,expectation",
    [
        (tree_plus.TokenizerName.WC, EXPECTATION_1_WC),
        (tree_plus.TokenizerName.GPT4, EXPECTATION_1_GPT4),
        (tree_plus.TokenizerName.GPT4O, EXPECTATION_1_GPT4O),
    ],
)
def test_e2e_single_directory(tokenizer_name, expectation):
    result = tree_plus.from_seeds((test_directory,), tokenizer_name=tokenizer_name)
    assert isinstance(result, tree_plus.TreePlus)
    print("test_e2e_single_directory tree\n")
    result.render()
    result_str = result.into_str()
    print("test_e2e_single_directory result_str\n", result_str)
    assert unify_tree_symbols(result_str) == unify_tree_symbols(EXPECTATION_1)


def test_e2e_multiple_directories():
    assert unify_tree_symbols(result_str) == unify_tree_symbols(expectation)


@pytest.mark.parametrize(
    "tokenizer_name,expectation",
    [
        (tree_plus.TokenizerName.WC, EXPECTATION_1_WC),
        (tree_plus.TokenizerName.GPT4, EXPECTATION_1_GPT4),
        (tree_plus.TokenizerName.GPT4O, EXPECTATION_1_GPT4O),
    ],
)
def test_e2e_multiple_directories(tokenizer_name, expectation):
    test_directory2 = "tests/path_to_test"
    with tree_plus.debug_disabled():
        result = tree_plus.from_seeds((test_directory, test_directory2))
        result = tree_plus.from_seeds(
            (test_directory, test_directory2),
            tokenizer_name=tokenizer_name,
        )
    assert isinstance(result, tree_plus.TreePlus)
    print("test_e2e_multiple_directories result")
    result.render()
    result_str = result.into_str()
    print("test_e2e_multiple_directories result_str\n", result_str)
    unified_tree_symbols = unify_tree_symbols(result_str)
    assert unified_tree_symbols == unify_tree_symbols(EXPECTATION_1)
    assert unified_tree_symbols == unify_tree_symbols(expectation)
    assert unified_tree_symbols.count("📁 path_to_test") == 1


def test_e2e_glob():
    result = tree_plus.from_seed("tests/more_languages", maybe_globs=("*.*s",))
    result = tree_plus.from_seed(
        "tests/more_languages",
        maybe_globs=("*.*s",),
        tokenizer_name=tree_plus.TokenizerName.GPT4,
    )
    assert isinstance(result, tree_plus.TreePlus)
    result_str = result.into_str()
    print(result_str)
@@ -171,6 +283,7 @@ def test_e2e_ignore_parameter_filetype():
    result = tree_plus.from_seed(
        "tests/more_languages/group1",
        maybe_ignore=("*.kt",),
        tokenizer_name=tree_plus.TokenizerName.GPT4,
    )
    assert isinstance(result, tree_plus.TreePlus)
    result.render()
@@ -182,6 +295,7 @@ def test_e2e_ignore_parameter_directory():
    result = tree_plus.from_seed(
        "tests/more_languages",
        maybe_ignore=("group2",),
        tokenizer_name=tree_plus.TokenizerName.GPT4,
    )
    assert isinstance(result, tree_plus.TreePlus)
    result_str = result.into_str()
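Taken together, the e2e changes above thread an explicit tokenizer choice through the public entry points. A usage sketch built only from calls that appear in this diff (`from_seed`, `TokenizerName`, `render`, `into_str`); treating WC as the implicit default is an assumption, as noted earlier:

```python
import tree_plus_src as tree_plus

# wc-style counting (assumed to be the default after this commit)
tree = tree_plus.from_seed("tests/path_to_test")
tree.render()

# opt back into tiktoken's GPT-4 encoding when exact LLM token budgets matter
tree_gpt4 = tree_plus.from_seed(
    "tests/path_to_test",
    tokenizer_name=tree_plus.TokenizerName.GPT4,
)
print(tree_gpt4.into_str())
```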
13 changes: 13 additions & 0 deletions tests/test_more_language_units.py
@@ -2084,6 +2084,18 @@ def edge_case(self) -> str""",
"""@fragment
fn frag_main() -> @location(0) vec4f""",
]

JSONL_EXPECTATION = [
    "SMILES: str",
    "Yield: float",
    "Temperature: int",
    "Pressure: float",
    "Solvent: str",
    "Success: bool",
    "Reaction_Conditions: dict",
    "Products: list",
    "EdgeCasesMissed: None",
]
# (
# "tests/more_languages/group6/yc.html",
# [
@@ -2128,6 +2140,7 @@ def edge_case(self) -> str""",
    [
        ("tests/more_languages/group7/angular_crud.ts", ANGULAR_CRUD_EXPECTATION),
        ("tests/more_languages/group7/dataclass.py", DATACLASS_EXPECTATION),
        ("tests/more_languages/group7/absurdly_huge.jsonl", JSONL_EXPECTATION),
        # ("tests/more_languages/group7/wgsl_test.wgsl", WGSL_EXPECTATION),
        # ("tests/more_languages/group7/AAPLShaders.metal", METAL_EXPECTATION),
    ],
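The `feat: jsonl schemas` half of the title shows up in `JSONL_EXPECTATION` above: each top-level key maps to the Python type name of its value, with JSON null reported as `None`. A minimal sketch of that inference, assuming only the first record is read (`infer_jsonl_schema` is a hypothetical helper; the real parser may sample more lines):

```python
import json


def infer_jsonl_schema(path: str) -> list[str]:
    """Map each top-level key of the first JSONL record to its Python type name."""
    with open(path, "r", encoding="utf-8") as f:
        first_record = json.loads(f.readline())
    schema = []
    for key, value in first_record.items():
        # JSON null loads as None, which has no informative type name, so report
        # it literally -- matching the "EdgeCasesMissed: None" entry above
        type_name = "None" if value is None else type(value).__name__
        schema.append(f"{key}: {type_name}")
    return schema


# infer_jsonl_schema("tests/more_languages/group7/absurdly_huge.jsonl") would
# reproduce JSONL_EXPECTATION exactly for the fixture built in this commit
```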
46 changes: 45 additions & 1 deletion tests/test_units.py
@@ -16,6 +16,7 @@
# )

import tree_plus_src as tree_plus
from tree_plus_src.count_tokens_lines import TokenizerName

# TODO: test debug_disabled

@@ -201,7 +202,50 @@ def test_units_parse_markers():
        ),
    ],
)
def test_units_token_counting(file, expected):
def test_units_token_counting_gpt4(file, expected):
    result = tree_plus.count_tokens_lines(file, tokenizer_name=TokenizerName.GPT4)
    assert isinstance(result, tree_plus.TokenLineCount)
    assert result == expected


@pytest.mark.parametrize(
    "file,expected",
    [
        (
            "tests/path_to_test/file.py",
            tree_plus.TokenLineCount(n_tokens=19, n_lines=3),
        ),
        (
            "tests/path_to_test/empty.py",
            tree_plus.TokenLineCount(n_tokens=0, n_lines=0),
        ),
    ],
)
def test_units_token_counting_gpt4o(file, expected):
    result = tree_plus.count_tokens_lines(file, tokenizer_name=TokenizerName.GPT4O)
    assert isinstance(result, tree_plus.TokenLineCount)
    assert result == expected


@pytest.mark.parametrize(
    "file,expected",
    [
        (
            "tests/more_languages/group7/absurdly_huge.jsonl",
            tree_plus.TokenLineCount(n_tokens=2782500, n_lines=42000),
        ),
        (
            "tests/path_to_test/file.py",
            tree_plus.TokenLineCount(n_tokens=18, n_lines=3),
        ),
        (
            "tests/path_to_test/empty.py",
            tree_plus.TokenLineCount(n_tokens=0, n_lines=0),
        ),
    ],
)
def test_units_token_counting_wc(file, expected):
    result = tree_plus.count_tokens_lines(file)
    print(result)
    assert isinstance(result, tree_plus.TokenLineCount)
    assert result == expected
