Skip to content

Commit adc5daf

Browse files
authored
Merge pull request #20 from aboutcode-org/update-fingerprinting-plugin
Update fingerprinting plugin
2 parents ca21376 + fde3008 commit adc5daf

File tree

2 files changed

+33
-3
lines changed

2 files changed

+33
-3
lines changed

CHANGELOG.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,11 @@
11
Changelog
22
=========
33

4+
v7.1.0
5+
------
6+
7+
*2025-02-20* -- Add code stemming functionality. Add ``get_line_by_pos`` function to help with displaying snippet matches. Expose ``SNIPPET_WINDOW_LENGTH`` for use in other places.
8+
49
v7.0.0
510
------
611

src/matchcode_toolkit/fingerprinting.py

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import binascii
1111
import re
1212

13+
from licensedcode.tokenize import query_lines
1314
from samecode.halohash import BitAverageHaloHash
1415

1516
# A collection of directory fingerprints that we want to avoid
@@ -19,6 +20,8 @@
1920
"0000000000000000000000000000000000000000",
2021
]
2122

23+
SNIPPET_WINDOW_LENGTH = 16
24+
2225

2326
def _create_directory_fingerprint(inputs):
2427
"""
@@ -165,6 +168,7 @@ def create_halohash_chunks(bah128):
165168
word_splitter = re.compile(query_pattern, re.UNICODE).findall
166169

167170

171+
# TODO: return line numbers from where the token was taken
168172
def _tokenizer(text):
169173
"""
170174
Return a list of tokens from a unicode text.
@@ -195,7 +199,7 @@ def tokenizer(text):
195199

196200

197201
def get_file_fingerprint_hashes(
198-
location, ngram_length=5, window_length=16, include_ngrams=False, **kwargs
202+
location, ngram_length=5, window_length=SNIPPET_WINDOW_LENGTH, include_ngrams=False, **kwargs
199203
):
200204
"""
201205
Return a mapping of fingerprint hashes for the file at `location`
@@ -227,7 +231,9 @@ def get_file_fingerprint_hashes(
227231
)
228232

229233

230-
def create_file_fingerprints(content, ngram_length=5, window_length=16, include_ngrams=False):
234+
def create_file_fingerprints(
235+
content, ngram_length=5, window_length=SNIPPET_WINDOW_LENGTH, include_ngrams=False
236+
):
231237
"""
232238
Return a mapping of halo1 and snippet hashes from content string
233239
"""
@@ -260,7 +266,7 @@ def create_file_fingerprints(content, ngram_length=5, window_length=16, include_
260266
selected_windows = list(select_ngrams(windows, with_pos=True))
261267
# TODO: consider using itertools.chain.from_iterable()
262268
selected_windows_bytes = [
263-
(pos, [g.encode("utf-8") for g in window]) for pos, window in selected_windows
269+
(int(pos), [g.encode("utf-8") for g in window]) for pos, window in selected_windows
264270
]
265271
selected_windows_bytes = [(pos, b"".join(window)) for pos, window in selected_windows_bytes]
266272
snippets = []
@@ -276,3 +282,22 @@ def create_file_fingerprints(content, ngram_length=5, window_length=16, include_
276282
fingerprints["snippets"] = snippets
277283

278284
return fingerprints
285+
286+
287+
def get_line_by_pos(content):
288+
"""
289+
Return a mapping of line numbers whose keys correspond to token positions
290+
in `content`.
291+
292+
For example, given line_by_pos[0] = 1, this means that the token at position
293+
0 in `content` is on line 1.
294+
"""
295+
line_number_and_lines = query_lines(query_string=content)
296+
line_by_pos = {}
297+
pos = 0
298+
for line_number, line in line_number_and_lines:
299+
tokens = tokenizer(line)
300+
for _ in tokens:
301+
line_by_pos[pos] = line_number
302+
pos += 1
303+
return line_by_pos

0 commit comments

Comments
 (0)