Skip to content

Commit ca21376

Browse files
authored
Merge pull request #19 from aboutcode-org/code-stemming
Add support for code stemming with tree-sitter
2 parents 8e23645 + 4f99bc2 commit ca21376

23 files changed

+7301
-13
lines changed

Makefile

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,11 +19,11 @@ dev:
1919

2020
isort:
2121
@echo "-> Apply isort changes to ensure proper imports ordering"
22-
${VENV}/bin/isort --sl -l 100 src tests setup.py
22+
${VENV}/bin/isort --sl -l 100 src tests setup.py --skip="tests/testfiles/"
2323

2424
black:
2525
@echo "-> Apply black code formatter"
26-
${VENV}/bin/black -l 100 src tests setup.py
26+
${VENV}/bin/black -l 100 src tests setup.py --exclude="tests/testfiles/"
2727

2828
doc8:
2929
@echo "-> Run doc8 validation"
@@ -33,11 +33,11 @@ valid: isort black
3333

3434
check:
3535
@echo "-> Run pycodestyle (PEP8) validation"
36-
@${ACTIVATE} pycodestyle --max-line-length=100 --exclude=.eggs,venv,lib,thirdparty,docs,migrations,settings.py,.cache .
36+
@${ACTIVATE} pycodestyle --max-line-length=100 --exclude=.eggs,venv,lib,thirdparty,docs,migrations,settings.py,.cache,tests/testfiles/stemming/ .
3737
@echo "-> Run isort imports ordering validation"
38-
@${ACTIVATE} isort --sl --check-only -l 100 setup.py src tests .
38+
@${ACTIVATE} isort --sl --check-only -l 100 setup.py src tests . --skip="tests/testfiles/"
3939
@echo "-> Run black validation"
40-
@${ACTIVATE} black --check --check -l 100 src tests setup.py
40+
@${ACTIVATE} black --check --check -l 100 src tests setup.py --exclude="tests/testfiles/"
4141

4242
clean:
4343
@echo "-> Clean the Python env"

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ norecursedirs = [
3737
"tests/data",
3838
".eggs",
3939
"src/*/data",
40-
"tests/*/data"
40+
"tests/testfiles/*"
4141
]
4242

4343
python_files = "*.py"

requirements.txt

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,3 +19,11 @@ soupsieve==2.6
1919
text-unidecode==1.3
2020
urllib3==2.2.3
2121
wheel==0.45.1
22+
tree-sitter==0.23.0
23+
tree-sitter-c==0.21.1
24+
tree-sitter-cpp==0.22.0
25+
tree-sitter-go==0.21.0
26+
tree-sitter-java==0.21.0
27+
tree-sitter-javascript==0.21.2
28+
tree-sitter-python==0.21.0
29+
tree-sitter-rust==0.21.2

setup.cfg

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,15 @@ install_requires =
4848
commoncode
4949
plugincode
5050
samecode
51+
typecode
52+
tree-sitter
53+
tree-sitter-c
54+
tree-sitter-cpp
55+
tree-sitter-go
56+
tree-sitter-java
57+
tree-sitter-javascript
58+
tree-sitter-python
59+
tree-sitter-rust
5160

5261

5362
[options.packages.find]

src/matchcode_toolkit/fingerprinting.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -194,7 +194,9 @@ def tokenizer(text):
194194
return _tokenizer(text.lower())
195195

196196

197-
def get_file_fingerprint_hashes(location, ngram_length=5, window_length=16, include_ngrams=False, **kwargs):
197+
def get_file_fingerprint_hashes(
198+
location, ngram_length=5, window_length=16, include_ngrams=False, **kwargs
199+
):
198200
"""
199201
Return a mapping of fingerprint hashes for the file at `location`
200202

src/matchcode_toolkit/plugin_fingerprint.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,6 @@
88
#
99

1010
import attr
11-
1211
from commoncode.cliutils import SCAN_GROUP
1312
from commoncode.cliutils import PluggableCommandLineOption
1413
from plugincode.scan import ScanPlugin

src/matchcode_toolkit/stemming.py

Lines changed: 166 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,166 @@
1+
#
2+
# Copyright (c) nexB Inc. and others. All rights reserved.
3+
# ScanCode is a trademark of nexB Inc.
4+
# SPDX-License-Identifier: Apache-2.0
5+
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
6+
# See https://github.com/aboutcode-org/matchcode-toolkit for support or download.
7+
# See https://aboutcode.org for more information about nexB OSS projects.
8+
#
9+
10+
import importlib
11+
12+
from tree_sitter import Language
13+
from tree_sitter import Parser
14+
from typecode.contenttype import Type
15+
16+
17+
class TreeSitterWheelNotInstalled(Exception):
    """Raised when the tree-sitter grammar package for a language cannot be imported."""
19+
20+
21+
# Per-language tree-sitter settings, keyed by programming language name:
# - "wheel": importable module name of the grammar package
# - "identifiers": parse tree node types handled as identifiers
# - "comments": parse tree node types handled as comments
# Each entry gets its own list objects so that entries never share state.
TS_LANGUAGE_CONF = {
    language: {
        "wheel": wheel,
        "identifiers": ["identifier"],
        "comments": list(comment_types),
    }
    for language, wheel, comment_types in (
        ("C", "tree_sitter_c", ("comment",)),
        ("C++", "tree_sitter_cpp", ("comment",)),
        ("Go", "tree_sitter_go", ("comment",)),
        ("Java", "tree_sitter_java", ("comment", "block_comment", "line_comment")),
        ("JavaScript", "tree_sitter_javascript", ("comment",)),
        ("Python", "tree_sitter_python", ("comment",)),
        ("Rust", "tree_sitter_rust", ("comment", "block_comment", "line_comment")),
    )
}
58+
59+
60+
def get_parser(location):
    """
    Return a (parser, language_info) tuple for the file at ``location``.

    ``parser`` is a tree-sitter Parser built with the grammar of the
    programming language detected for the file and ``language_info`` is the
    TS_LANGUAGE_CONF entry for that language.

    Return None when no programming language is detected or when the detected
    language has no entry in TS_LANGUAGE_CONF.

    Raise TreeSitterWheelNotInstalled when the grammar package listed for the
    language cannot be imported.
    """
    language = Type(location).programming_language
    language_info = TS_LANGUAGE_CONF.get(language) if language else None
    if language_info is None:
        return None

    wheel = language_info["wheel"]
    try:
        grammar = importlib.import_module(wheel)
    except ModuleNotFoundError:
        raise TreeSitterWheelNotInstalled(f"{wheel} package is not installed")

    ts_language = Language(grammar.language())
    return Parser(language=ts_language), language_info
83+
84+
def add_to_mutation_index(node, mutation_index):
    """
    Record ``node`` in ``mutation_index`` under its (row, column) end coordinates.

    The stored entry holds the node type, its decoded text and its start and
    end coordinates as (row, column) tuples. A node whose text is empty is not
    recorded.
    """
    content = node.text.decode()
    if not content:
        return

    end = (node.end_point.row, node.end_point.column)
    start = (node.start_point.row, node.start_point.column)
    mutation_index[end] = {
        "type": node.type,
        "content": content,
        "start_point": start,
        "end_point": end,
    }
94+
95+
96+
def traverse(node, language_info, mutation_index):
    """
    Walk the parse tree rooted at ``node`` and fill in ``mutation_index``.

    Every node whose type is listed under the "identifiers" or "comments" keys
    of ``language_info`` is passed to add_to_mutation_index(), which records
    the start and end coordinates where a mutation is to be applied along with
    the node type. Each mutation entry is keyed by a tuple containing the end
    coordinates.

    The tree is walked with an explicit stack rather than with recursive calls
    so that a deeply nested parse tree cannot exceed the interpreter recursion
    limit. Nodes are still visited in pre-order: a node first, then its
    children from first to last.
    """
    # Node types to record. A missing or empty key means "no such node types".
    target_types = set(language_info.get("identifiers") or ())
    target_types.update(language_info.get("comments") or ())

    pending = [node]
    while pending:
        current = pending.pop()
        if current.type in target_types:
            add_to_mutation_index(node=current, mutation_index=mutation_index)
        # Push children in reverse so that the first child is popped first.
        pending.extend(reversed(current.children))
109+
110+
111+
def apply_mutation(text, start_point, end_point, replacement, successive_line_count):
    """
    Return ``text`` with the span between ``start_point`` and ``end_point``
    replaced by the ``replacement`` string.

    ``start_point`` and ``end_point`` are (row, column) tuples.
    ``successive_line_count`` is a sequence giving, for each row, the offset in
    ``text`` of the first character of that row.

    When ``replacement`` is empty and the line at the start row is left with
    only whitespace, that line is removed entirely.
    """
    start_row, start_col = start_point
    end_row, end_col = end_point

    # Compute 1D mutation position from 2D coordinates
    start_index = successive_line_count[start_row] + start_col
    end_index = successive_line_count[end_row] + end_col

    modified_text = text[:start_index] + replacement + text[end_index:]
    if replacement:
        # Empty lines are only cleaned up after a removal.
        return modified_text

    modified_lines = modified_text.splitlines(keepends=True)

    # Remove empty comment lines. The start row no longer exists when the
    # removed span was the whole tail of the text (for instance a comment on
    # the last line of a text without a trailing newline): nothing to remove.
    if start_row < len(modified_lines) and not modified_lines[start_row].strip():
        del modified_lines[start_row]

    return "".join(modified_lines)
129+
130+
131+
def get_stem_code(location):
    """
    Return the stemmed code for the code file at the specified `location`.
    Return None when get_parser() does not return a parser for this file.

    Parse the code using tree-sitter, create a mutation index for tokens that
    need to be replaced or removed, and apply these mutations bottom-up to
    generate the stemmed code: identifiers are replaced by "idf" and the other
    indexed tokens (comments) are removed.
    """
    parser_result = get_parser(location)
    if not parser_result:
        return
    parser, language_info = parser_result

    with open(location, "rb") as f:
        source = f.read()

    mutations = {}
    tree = parser.parse(source)
    traverse(tree.root_node, language_info, mutations)

    text = source.decode()

    # tree-sitter delimits rows with "\n" only and reports columns as byte
    # offsets within the row, while mutations are applied to the decoded text.
    # Keep the raw rows to translate byte columns into character columns, so
    # that multi-byte characters do not shift the mutation positions.
    byte_rows = source.split(b"\n")

    def to_char_point(point):
        row, byte_column = point
        return row, len(byte_rows[row][:byte_column].decode())

    # Character offset of the start of each row, using the same "\n"-only row
    # definition as tree-sitter.
    successive_line_count = [0]
    for line in text.split("\n"):
        successive_line_count.append(successive_line_count[-1] + len(line) + 1)

    identifier_types = language_info.get("identifiers") or ()

    # Apply mutations bottom-up so that pending mutations, which are all located
    # earlier in the text, keep valid offsets.
    for end_point in sorted(mutations, reverse=True):
        mutation = mutations[end_point]
        text = apply_mutation(
            text=text,
            start_point=to_char_point(mutation["start_point"]),
            end_point=to_char_point(mutation["end_point"]),
            replacement=("idf" if mutation["type"] in identifier_types else ""),
            successive_line_count=successive_line_count,
        )
    return text

tests/test_fingerprinting.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from commoncode.resource import VirtualCodebase
1414
from commoncode.testcase import FileBasedTesting
1515
from commoncode.testcase import check_against_expected_json_file
16+
from samecode.halohash import byte_hamming_distance
1617

1718
from matchcode_toolkit.fingerprinting import _create_directory_fingerprint
1819
from matchcode_toolkit.fingerprinting import _get_resource_subpath
@@ -22,7 +23,6 @@
2223
from matchcode_toolkit.fingerprinting import create_structure_fingerprint
2324
from matchcode_toolkit.fingerprinting import get_file_fingerprint_hashes
2425
from matchcode_toolkit.fingerprinting import split_fingerprint
25-
from samecode.halohash import byte_hamming_distance
2626

2727

2828
class Resource:
@@ -193,10 +193,13 @@ def test_snippets_similarity(self, regen=False):
193193
results1_snippet_mappings_by_snippets = self._create_snippet_mappings_by_snippets(
194194
results1_snippets
195195
)
196-
results2_snippet_mappings_by_snippets = self._create_snippet_mappings_by_snippets(results2_snippets)
196+
results2_snippet_mappings_by_snippets = self._create_snippet_mappings_by_snippets(
197+
results2_snippets
198+
)
197199

198200
matching_snippets = (
199-
results1_snippet_mappings_by_snippets.keys() & results2_snippet_mappings_by_snippets.keys()
201+
results1_snippet_mappings_by_snippets.keys()
202+
& results2_snippet_mappings_by_snippets.keys()
200203
)
201204
expected_matching_snippets = {
202205
"33b1d50de7e1701bd4beb706bf25970e",
@@ -247,10 +250,13 @@ def test_snippets_similarity_2(self, regen=False):
247250
results1_snippet_mappings_by_snippets = self._create_snippet_mappings_by_snippets(
248251
results1_snippets
249252
)
250-
results2_snippet_mappings_by_snippets = self._create_snippet_mappings_by_snippets(results2_snippets)
253+
results2_snippet_mappings_by_snippets = self._create_snippet_mappings_by_snippets(
254+
results2_snippets
255+
)
251256

252257
matching_snippets = (
253-
results1_snippet_mappings_by_snippets.keys() & results2_snippet_mappings_by_snippets.keys()
258+
results1_snippet_mappings_by_snippets.keys()
259+
& results2_snippet_mappings_by_snippets.keys()
254260
)
255261

256262
# jaccard coefficient

tests/test_stemming.py

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
#
2+
# Copyright (c) nexB Inc. and others. All rights reserved.
3+
# ScanCode is a trademark of nexB Inc.
4+
# SPDX-License-Identifier: Apache-2.0
5+
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
6+
# See https://github.com/aboutcode-org/matchcode-toolkit for support or download.
7+
# See https://aboutcode.org for more information about nexB OSS projects.
8+
#
9+
10+
11+
from pathlib import Path
12+
13+
from commoncode.testcase import FileBasedTesting
14+
15+
from matchcode_toolkit import stemming
16+
17+
18+
def check_against_expected_code_file(results, expected_file, regen=False):
    """
    Check that the ``results`` text is the same as the content of the
    ``expected_file``.

    If ``regen`` is True the expected_file will be overwritten with the
    ``results``. This is convenient for updating tests expectations. But use
    with caution.

    Raise an AssertionError when ``results`` and the expected content differ.
    """
    # Use an explicit encoding and disable newline translation so that the
    # comparison does not depend on the platform defaults and keeps the line
    # endings of ``results`` and of the expected file exactly as they are.
    if regen:
        with open(expected_file, "w", encoding="utf-8", newline="") as reg:
            reg.write(results)
        expected = results
    else:
        with open(expected_file, encoding="utf-8", newline="") as exp:
            expected = exp.read()

    assert results == expected
35+
36+
37+
class TestFingerprintingFunctions(FileBasedTesting):
    """
    Check the output of stemming.get_stem_code() against stored expected
    files, with one test per programming language.
    """

    test_data_dir = Path(__file__).parent / "testfiles/stemming"

    def _assert_stemmed(self, source_name, expected_name):
        # Stem the test file and compare the result with the expected file.
        # Both names are relative to ``test_data_dir``.
        source_path = self.test_data_dir / source_name
        stemmed = stemming.get_stem_code(location=str(source_path))
        check_against_expected_code_file(stemmed, self.test_data_dir / expected_name)

    def test_java_code_stemming(self):
        self._assert_stemmed("java/contenttype.java", "java/contenttype-stemmed.java")

    def test_cpp_code_stemming(self):
        self._assert_stemmed("cpp/string.cpp", "cpp/string-stemmed.cpp")

    def test_c_code_stemming(self):
        self._assert_stemmed("c/main.c", "c/main-stemmed.c")

    def test_golang_code_stemming(self):
        self._assert_stemmed("golang/utils.go", "golang/utils-stemmed.go")

    def test_python_code_stemming(self):
        self._assert_stemmed(
            "python/sync_scancode_scans.py",
            "python/sync_scancode_scans-stemmed.py",
        )

    def test_javascript_code_stemming(self):
        self._assert_stemmed("javascript/utils.js", "javascript/utils-stemmed.js")

    def test_rust_code_stemming(self):
        # NOTE(review): "metrics-stemmeds.rs" does not follow the "-stemmed"
        # naming used by the other expected files; confirm the name on disk.
        self._assert_stemmed("rust/metrics.rs", "rust/metrics-stemmeds.rs")

0 commit comments

Comments
 (0)