Skip to content

Commit

Permalink
fix: ignore deep leaves when getting structure for C/CPP sources to…
Browse files Browse the repository at this point in the history
… increase the accuracy of the structure comparison algorithm.

Refs: #192.
  • Loading branch information
Artanias authored Jul 6, 2024
1 parent e48c414 commit 731996a
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 35 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
UTIL_VERSION := 0.4.9
UTIL_VERSION := 0.4.10
UTIL_NAME := codeplag
PWD := $(shell pwd)

Expand Down
39 changes: 17 additions & 22 deletions src/codeplag/cplag/tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,37 +19,20 @@ def get_not_ignored(tree: Cursor, src: Path | str) -> list[Cursor]:
return parsed_nodes


def generic_visit(node, features: ASTFeatures, curr_depth: int = 0) -> None:
def generic_visit(node: Cursor, features: ASTFeatures, curr_depth: int = 0) -> None:
if curr_depth == 0:
children = get_not_ignored(node, features.filepath)
else:
node_name = repr(node.kind)
if node_name not in features.unodes:
features.unodes[node_name] = features.count_unodes
features.from_num[features.count_unodes] = node_name
features.count_unodes += 1
features.structure.append(
NodeStructurePlace(curr_depth, features.unodes[node_name])
)
__add_node_to_structure(features, repr(node.kind), curr_depth)
children = list(node.get_children())

if curr_depth == 1:
features.head_nodes.append(node.spelling)

if len(children) == 0:
if len(children) == 0 and curr_depth == 1:
for token in node.get_tokens():
token_name = repr(token.kind)
if token_name not in features.unodes:
features.unodes[token_name] = features.count_unodes
features.from_num[features.count_unodes] = token_name
features.count_unodes += 1
features.structure.append(
NodeStructurePlace(curr_depth, features.unodes[token_name])
)

if curr_depth == 1:
features.head_nodes.append(token_name)

__add_node_to_structure(features, token_name, curr_depth)
features.head_nodes.append(token_name)
else:
for child in children:
features.tokens.append(child.kind.value)
Expand All @@ -69,3 +52,15 @@ def get_features(tree: Cursor, filepath: Path | str = "") -> ASTFeatures:
generic_visit(tree, features)

return features


def __add_node_to_structure(
    features: ASTFeatures, node_name: str, curr_depth: int
) -> None:
    """Append one node to ``features.structure``, numbering new node names.

    If ``node_name`` has not been seen before, it is assigned the current
    value of ``features.count_unodes`` as its id, the reverse mapping is
    stored in ``features.from_num`` and the counter is incremented.
    A ``NodeStructurePlace`` built from ``curr_depth`` and the node id is
    then appended to ``features.structure``.

    Args:
        features: Feature container that is updated in place.
        node_name: Name under which the node is registered in
            ``features.unodes``.
        curr_depth: Depth value stored alongside the node id.
    """
    known_nodes = features.unodes
    if node_name in known_nodes:
        node_id = known_nodes[node_name]
    else:
        # First occurrence: allocate the next free id and keep both the
        # name -> id and id -> name mappings in sync.
        node_id = features.count_unodes
        known_nodes[node_name] = node_id
        features.from_num[node_id] = node_name
        features.count_unodes += 1

    features.structure.append(NodeStructurePlace(curr_depth, node_id))
24 changes: 12 additions & 12 deletions test/unit/codeplag/cplag/test_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,10 +93,10 @@ def test_generic_visit(first_cursor):
assert features.operators == {}
assert features.keywords == {}
assert features.literals == {}
assert len(features.unodes) == 13
assert len(features.from_num) == 13
assert features.count_unodes == 13
assert len(features.structure) == 34
assert len(features.unodes) == 10
assert len(features.from_num) == 10
assert features.count_unodes == 10
assert len(features.structure) == 23
assert features.tokens == [8, 10, 10, 202, 205, 114, 100,
101, 106, 214, 100, 101, 214,
103, 100, 101, 100, 101, 114,
Expand All @@ -112,10 +112,10 @@ def test_get_features(second_cursor):
assert features.operators == {'==': 1, '%': 1}
assert features.keywords == {'int': 1, 'if': 1, 'return': 2, 'long': 2}
assert features.literals == {'0L': 1}
assert len(features.unodes) == 13
assert len(features.from_num) == 13
assert features.count_unodes == 13
assert len(features.structure) == 36
assert len(features.unodes) == 10
assert len(features.from_num) == 10
assert features.count_unodes == 10
assert len(features.structure) == 25
assert features.tokens == [8, 10, 10, 202, 205, 114, 100,
101, 106, 202, 214, 100, 100,
101, 214, 103, 100, 101, 100,
Expand All @@ -134,8 +134,8 @@ def test_bad_encoding_syms(third_cursor):
# Ignored bad symbols
assert '" .\\n"' in features.literals.keys() # noqa
assert len(features.literals.keys()) == 12
assert len(features.unodes) == 20
assert len(features.from_num) == 20
assert features.count_unodes == 20
assert len(features.structure) == 225
assert len(features.unodes) == 18
assert len(features.from_num) == 18
assert features.count_unodes == 18
assert len(features.structure) == 167
assert len(features.tokens) == 167

0 comments on commit 731996a

Please sign in to comment.