Skip to content

Commit

Permalink
fix: catastrophic backtracking in some rust function signatures; docs…
Browse files Browse the repository at this point in the history
…: move help above example output in README
  • Loading branch information
bionicles committed Jul 22, 2024
1 parent 3992adb commit 1ff1fb0
Show file tree
Hide file tree
Showing 4 changed files with 120 additions and 84 deletions.
141 changes: 72 additions & 69 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,78 @@

`pip install -U tree_plus`

## Usage
Here's how `tree_plus --help` looks (`-h` and `-H` both also work)
<!-- t5-start -->
```sh
tree_plus -h
Usage: tree_plus [OPTIONS] [PATHS]...

A `tree` util enhanced with tokens, lines, and components.

Wrap patterns in quotes: -i "*.py" / -g "*.rs"

Example Invocations (These are not subcommands, you idiot):

Show tree_plus_src and tests simultaneously
> tree_plus tree_plus_src tests

Show files matching "*.*s" within tests/more_languages
> tree_plus -g "*.*s" tests/more_languages

Ignore Java files
> tree_plus -i "*.java" tests

Override DEFAULT_IGNORE: Only ignore .ini files.
> tree_plus -o -i "*.ini" tests/dot_dot

Syntax Highlight python files in src and tests
> tree_plus -s tree_plus_src/*.py tests/*.py

Concise Mode (No Parsing)
> tree_plus -c

URL + Tag Categories for a website
> tree_plus example.com

URL + Tag Categories for multiple websites with a link tree
> tree_plus example.com example.org -l

Hacker News Mode (3 articles, max depth 3)
> tree_plus --yc

Hacker News Mode (6 articles, max depth 6, warning, slow!)
> tree_plus --yc -n 6 -m 6

Use the Tiktoken gpt4o Model Tokenizer to tokenize Rust files
> tree_plus -t -g '*.rs'

Options:
-i, -I, --ignore TEXT Patterns to ignore, in quotes: -i "*.java"
-o, -O, --override Override DEFAULT_IGNORE (includes ignored
content): -o -i "*.java"
-g, -G, --glob TEXT Patterns to find, in quotes: -g "*.rs"
-v, -V, --version Print the version and exit.
-d, -D, --debug Enables $DEBUG_TREE_PLUS.
-s, -S, --syntax Enables Syntax Highlighting (WIP).
-c, -C, --concise Omit module components. (False)
--yc, --hn Include ycombinator (False)
-n, -N, --number INTEGER number of results (--yc mode only, default 3)
-m, -M, --max-depth INTEGER maximum number of steps (depth / level) from
root (--yc mode only, default 3)
-l, -L, --links include links (web mode only, default False)
-t, --tiktoken a shorthand for tiktoken with the gpt4o
tokenizer
-T, --tokenizer-name TEXT name of the tokenizer to use, for now only
'gpt4o' works
-h, -H, --help Show this message and exit.

v(1.0.52) --- https://github.com/bionicles/tree_plus/blob/main/README.md

```
<!-- t5-end -->


## Example Output:
- [ ] Demonstrate Parsed Checkboxes
<!-- t1-start -->
Expand Down Expand Up @@ -679,75 +751,6 @@ tree_plus v(1.0.52) ignore=('tests',) globs=() syntax=False paths=()
<!-- t1-end -->
- [x] Demonstrate Parsed Checkboxes
Here's how `tree_plus --help` looks (`-h` and `-H` both also work)
<!-- t5-start -->
```sh
tree_plus -h
Usage: tree_plus [OPTIONS] [PATHS]...
A `tree` util enhanced with tokens, lines, and components.
Wrap patterns in quotes: -i "*.py" / -g "*.rs"
Example Invocations (These are not subcommands, you idiot):
Show tree_plus_src and tests simultaneously
> tree_plus tree_plus_src tests
Show files matching "*.*s" within tests/more_languages
> tree_plus -g "*.*s" tests/more_languages
Ignore Java files
> tree_plus -i "*.java" tests
Override DEFAULT_IGNORE: Only ignore .ini files.
> tree_plus -o -i "*.ini" tests/dot_dot
Syntax Highlight python files in src and tests
> tree_plus -s tree_plus_src/*.py tests/*.py
Concise Mode (No Parsing)
> tree_plus -c
URL + Tag Categories for a website
> tree_plus example.com
URL + Tag Categories for multiple websites with a link tree
> tree_plus example.com example.org -l
Hacker News Mode (3 articles, max depth 3)
> tree_plus --yc
Hacker News Mode (6 articles, max depth 6, warning, slow!)
> tree_plus --yc -n 6 -m 6
Use the Tiktoken gpt4o Model Tokenizer to tokenize Rust files
> tree_plus -t -g '*.rs'
Options:
-i, -I, --ignore TEXT Patterns to ignore, in quotes: -i "*.java"
-o, -O, --override Override DEFAULT_IGNORE (includes ignored
content): -o -i "*.java"
-g, -G, --glob TEXT Patterns to find, in quotes: -g "*.rs"
-v, -V, --version Print the version and exit.
-d, -D, --debug Enables $DEBUG_TREE_PLUS.
-s, -S, --syntax Enables Syntax Highlighting (WIP).
-c, -C, --concise Omit module components. (False)
--yc, --hn Include ycombinator (False)
-n, -N, --number INTEGER number of results (--yc mode only, default 3)
-m, -M, --max-depth INTEGER maximum number of steps (depth / level) from
root (--yc mode only, default 3)
-l, -L, --links include links (web mode only, default False)
-t, --tiktoken a shorthand for tiktoken with the gpt4o
tokenizer
-T, --tokenizer-name TEXT name of the tokenizer to use, for now only
'gpt4o' works
-h, -H, --help Show this message and exit.
v(1.0.52) --- https://github.com/bionicles/tree_plus/blob/main/README.md
```
<!-- t5-end -->
## Start Quick!
Expand Down
35 changes: 29 additions & 6 deletions tests/more_languages/group4/rust_test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -97,11 +97,11 @@ pub mod lib {
// Define a flow function
pub fn flow<S1, S2, S3, S4, E, T, L>(
source: S1, // edge
extractor: E,
inbox: S2,
transformer: T,
outbox: S3,
loader: L,
extractor: E,
inbox: S2,
transformer: T,
outbox: S3,
loader: L,
sink: &mut S4,
) -> Result<(), Box<dyn Error>>
where
Expand Down Expand Up @@ -163,4 +163,27 @@ where
Bion: Cool
{
println!("Bion is cool!");
}
}

// reproduces a catastrophic backtracking error
#[macro_export]
macro_rules! unit {
impl crate::lib::Lensable<(), $unit_dtype> for $unit_name {
fn insert(
&mut self,
key: (),
value: $unit_dtype,
) -> Result<Option<$unit_dtype>, ETLError> {
if key == () {
let old_value = self.0 as $unit_dtype;
self.0 = value as $unit_dtype;
Ok(Some(old_value))
} else {
Err(ETLError::KeyError)
}
}
// other methods omitted as they did not trigger the issue
}
// other impls omitted, as they did not trigger the issue
}
};
21 changes: 14 additions & 7 deletions tests/test_more_language_units.py
Original file line number Diff line number Diff line change
Expand Up @@ -1148,11 +1148,11 @@ def test_more_languages_group3(file: str, expected: List[str]):
" mod engine",
"""pub fn flow<S1, S2, S3, S4, E, T, L>(
source: S1,
extractor: E,
inbox: S2,
transformer: T,
outbox: S3,
loader: L,
extractor: E,
inbox: S2,
transformer: T,
outbox: S3,
loader: L,
sink: &mut S4,
) -> Result<(), Box<dyn Error>>
where
Expand All @@ -1176,6 +1176,13 @@ def test_more_languages_group3(file: str, expected: List[str]):
"""async fn handle_get(State(pool): State<PgPool>) -> Result<Html<String>, (StatusCode, String)>
where
Bion: Cool""",
"""#[macro_export]
macro_rules! unit""",
""" fn insert(
&mut self,
key: (),
value: $unit_dtype,
) -> Result<Option<$unit_dtype>, ETLError>""",
],
),
(
Expand Down Expand Up @@ -1314,8 +1321,8 @@ def test_more_languages_group4(
):
print(f"{file=}")
result = parse_file(file)
print(f"{result=}")
print(f"{expected=}")
print("expectation", expected)
print("reality", result)
assert result == expected
# assert 0

Expand Down
7 changes: 5 additions & 2 deletions tree_plus_src/parse_file.py
Original file line number Diff line number Diff line change
Expand Up @@ -1238,6 +1238,9 @@ def parse_erl(contents: str) -> List[str]:
# re.MULTILINE,
# )

# The old function pattern below caused catastrophic backtracking in the macro impl Lensable for ND; kept for reference.
# r"\n(?P<function>\s*(?:pub\s+)?(?:async\s+)?fn\s+(?:[\w_]+)(?:<.*>)?\((?P<function_params>(?:[^()]+|\((?P<function_nested>[^()]+)\))*)\)(?P<function_return_type>\s+->\s+(?:[^;{]*))?(?P<function_where_clause>\s*(?:where\n(?P<function_where>.*)?)?)?)|"


def parse_rs(contents: str) -> List[str]:
debug_print("parse_rs")
Expand All @@ -1246,7 +1249,7 @@ def parse_rs(contents: str) -> List[str]:

combined_pattern = re.compile(
# functions
r"\n(?P<function>\s*(?:pub\s+)?(?:async\s+)?fn\s+(?:[\w_]+)(?:<.*>)?\((?P<function_params>(?:[^()]+|\((?P<function_nested>[^()]+)\))*)\)(?P<function_return_type>\s+->\s+(?:[^;{]*))?(?P<function_where_clause>\s*(?:where\n(?P<function_where>.*)?)?)?)|"
r"^(?P<function>\s*(?:pub\s+)?(?:async\s+)?fn\s+(?:[\w_]+)(?:<[^>]*>)?\((?P<function_params>[^;{]*))|"
# structs and impls with generics
r"\n(?P<struct_impl>(?: *((?:pub\s+)?struct)|impl)[^{;]*?) ?[{;]|"
# trait, enum, or mod
Expand All @@ -1262,7 +1265,7 @@ def parse_rs(contents: str) -> List[str]:
component = None
# functions
if groups.get("function"):
component = groups["function"].rstrip().rstrip(",")
component = groups["function"].rstrip().rstrip(",").rstrip("\n").rstrip(";")
# struct or impl
elif groups.get("struct_impl"):
component = groups["struct_impl"].rstrip()
Expand Down

0 comments on commit 1ff1fb0

Please sign in to comment.