fix (ci): replace match w if/else for py38
bionicles committed Jul 11, 2024
1 parent b1ae035 commit ab93aa2
Showing 3 changed files with 61 additions and 17 deletions.
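
Background (not part of the commit itself): the `match` statement (structural pattern matching, PEP 634) was added in Python 3.10, so a module containing one fails to import on Python 3.8 with a SyntaxError, which is why the CI testing matrix caught it. Below is a minimal standalone sketch of the translation this commit applies, with made-up names, not code from the repo:

# Minimal sketch, hypothetical names.
# Python >= 3.10 form (a SyntaxError on 3.8):
#
#     match (enabled, name):
#         case (False, None) | (True, "wc"):
#             choice = "wc"
#         case (True, None) | (_, "gpt4o"):
#             choice = "gpt4o"
#
# Python 3.8-compatible equivalent: each case becomes a boolean test,
# or-patterns turn into `or`, and `_` wildcards drop out.
def choose(enabled: bool, name):
    if (not enabled and name is None) or (enabled and name == "wc"):
        return "wc"
    if (enabled and name is None) or name == "gpt4o":
        return "gpt4o"
    raise ValueError(f"unsupported {enabled=} {name=}")

assert choose(False, None) == "wc"
assert choose(True, None) == "gpt4o"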
21 changes: 20 additions & 1 deletion tests/test_units.py
@@ -227,12 +227,31 @@ def test_units_token_counting_gpt4(file, expected):
     assert result == expected
 
 
+@pytest.mark.parametrize(
+    "file,expected",
+    [
+        (
+            "tests/path_to_test/file.py",
+            tree_plus.TokenLineCount(n_tokens=19, n_lines=3),
+        ),
+        (
+            "tests/path_to_test/empty.py",
+            tree_plus.TokenLineCount(n_tokens=0, n_lines=0),
+        ),
+    ],
+)
+def test_units_token_counting_gpt4o(file, expected):
+    result = tree_plus.count_tokens_lines(file, tokenizer_name=TokenizerName.GPT4O)
+    assert isinstance(result, tree_plus.TokenLineCount)
+    assert result == expected
+
+
 @pytest.mark.parametrize(
     "file,expected",
     [
         (
             "tests/more_languages/group7/absurdly_huge.jsonl",
-            tree_plus.TokenLineCount(n_tokens=2782500, n_lines=42000),
+            tree_plus.TokenLineCount(n_tokens=8_347, n_lines=126),
        ),
         (
             "tests/path_to_test/file.py",
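
To run just the new test locally, standard pytest selection should work (an assumption about the workflow, not something shown in this commit):

pytest tests/test_units.py -k gpt4o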
27 changes: 20 additions & 7 deletions tree_plus_cli.py
@@ -224,14 +224,27 @@ def main(
         article_comment_tree = tree_plus.from_hacker_news_articles(hacker_news_articles)
         _paths += (article_comment_tree,)
 
+    # TOO SOON! need to support py38, thanks CI/CD testing matrix!
+    # _tokenizer_name = TokenizerName.WC
+    # match (tiktoken, tokenizer_name):
+    #     case (False, None) | (True, "wc"):
+    #         pass
+    #     case (True, None) | (_, "gpt4o"):
+    #         _tokenizer_name = TokenizerName.GPT4O
+    #     case (_, "gpt4"):
+    #         _tokenizer_name = TokenizerName.GPT4O
+
     _tokenizer_name = TokenizerName.WC
-    match (tiktoken, tokenizer_name):
-        case (False, None) | (True, "wc"):
-            pass
-        case (True, None) | (_, "gpt4o"):
-            _tokenizer_name = TokenizerName.GPT4O
-        case (_, "gpt4"):
-            _tokenizer_name = TokenizerName.GPT4O
+    if (not tiktoken and tokenizer_name is None) or (
+        tiktoken and tokenizer_name == "wc"
+    ):
+        pass
+    elif (tiktoken and tokenizer_name is None) or (tokenizer_name == "gpt4o"):
+        _tokenizer_name = TokenizerName.GPT4O
+    elif tokenizer_name == "gpt4":
+        _tokenizer_name = TokenizerName.GPT4
+    else:
+        raise ValueError(f"unsupported {tiktoken=} {tokenizer_name=}")
 
     root = tree_plus.from_seeds(
         _paths,
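
Two things are worth noting about the hunk above. First, the rewrite is not a pure transliteration: the old match sent "gpt4" to TokenizerName.GPT4O, while the new chain sends it to TokenizerName.GPT4 and adds an explicit error for unsupported combinations, so the commit appears to fix a copy-paste slip in passing. Second, the truth table is easier to check in isolation; here is a hypothetical standalone sketch (names assumed, not code from the repo):

from enum import Enum

class TokenizerName(Enum):
    WC = "wc"
    GPT4 = "gpt4"
    GPT4O = "gpt4o"

def select_tokenizer(tiktoken: bool, tokenizer_name):
    # mirrors the if/elif chain added to main() above
    if (not tiktoken and tokenizer_name is None) or (
        tiktoken and tokenizer_name == "wc"
    ):
        return TokenizerName.WC
    elif (tiktoken and tokenizer_name is None) or tokenizer_name == "gpt4o":
        return TokenizerName.GPT4O
    elif tokenizer_name == "gpt4":
        return TokenizerName.GPT4
    raise ValueError(f"unsupported {tiktoken=} {tokenizer_name=}")

assert select_tokenizer(False, None) is TokenizerName.WC    # no tiktoken: wc default
assert select_tokenizer(True, None) is TokenizerName.GPT4O  # tiktoken: gpt4o default
assert select_tokenizer(True, "gpt4") is TokenizerName.GPT4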
30 changes: 21 additions & 9 deletions tree_plus_src/count_tokens_lines.py
@@ -136,15 +136,27 @@ def count_tokens_lines(
         return None
     debug_print(f"count_tokens_lines counting {file_path=}")
 
-    match tokenizer_name:
-        case TokenizerName.GPT4O | TokenizerName.GPT4:
-            contents = read_file(file_path)
-            count = count_openai_tokens_lines_from_contents(
-                contents,
-                tokenizer_name=tokenizer_name,
-            )
-        case TokenizerName.WC:
-            count = count_wc_tokens_lines_from_path(file_path)
+    # TOO SOON! py38 failed on this
+    # match tokenizer_name:
+    #     case TokenizerName.GPT4O | TokenizerName.GPT4:
+    #         contents = read_file(file_path)
+    #         count = count_openai_tokens_lines_from_contents(
+    #             contents,
+    #             tokenizer_name=tokenizer_name,
+    #         )
+    #     case TokenizerName.WC:
+    #         count = count_wc_tokens_lines_from_path(file_path)
+
+    if tokenizer_name is TokenizerName.GPT4O or tokenizer_name is TokenizerName.GPT4:
+        contents = read_file(file_path)
+        count = count_openai_tokens_lines_from_contents(
+            contents,
+            tokenizer_name=tokenizer_name,
+        )
+    elif tokenizer_name == TokenizerName.WC:
+        count = count_wc_tokens_lines_from_path(file_path)
+    else:
+        raise ValueError(f"unsupported {tokenizer_name=}")
     debug_print(f"count_tokens_lines {count=}")
     return count
 
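
A side note on comparison style in the hunk above: TokenizerName members are Enum singletons, so the `is` checks in the GPT4O/GPT4 branch and the `==` check in the WC branch behave identically here. A dict-based dispatch is another py38-safe way to retire a match statement; a sketch under assumed names, not what the commit does:

from enum import Enum
from typing import Callable, Dict

class TokenizerName(Enum):
    WC = "wc"
    GPT4 = "gpt4"
    GPT4O = "gpt4o"

def _count_openai(path: str) -> str:
    return f"openai-tokenized {path}"  # stand-in for the real counting helper

def _count_wc(path: str) -> str:
    return f"wc-counted {path}"  # stand-in for the real counting helper

_DISPATCH: Dict[TokenizerName, Callable[[str], str]] = {
    TokenizerName.GPT4O: _count_openai,
    TokenizerName.GPT4: _count_openai,
    TokenizerName.WC: _count_wc,
}

def count_tokens(path: str, tokenizer_name: TokenizerName) -> str:
    try:
        return _DISPATCH[tokenizer_name](path)
    except KeyError:
        raise ValueError(f"unsupported {tokenizer_name=}") from None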
