Skip to content

Commit 8e0405e

Browse files
committed
Create fingerprint for stem code
Signed-off-by: Keshav Priyadarshi <git@keshav.space>
1 parent adc5daf commit 8e0405e

File tree

1 file changed

+38
-0
lines changed

1 file changed

+38
-0
lines changed

src/matchcode_toolkit/fingerprinting.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313
from licensedcode.tokenize import query_lines
1414
from samecode.halohash import BitAverageHaloHash
1515

16+
from matchcode_toolkit.stemming import get_stem_code
17+
1618
# A collection of directory fingerprints that we want to avoid
1719
IGNORED_DIRECTORY_FINGERPRINTS = [
1820
# This is both the directory content and directory structure fingerprint for
@@ -231,6 +233,42 @@ def get_file_fingerprint_hashes(
231233
)
232234

233235

236+
def get_stem_file_fingerprint_hashes(
    location,
    ngram_length=5,
    window_length=16,
    include_ngrams=False,
    **kwargs,
):
    """
    Return a mapping of stem code fingerprint hashes for the file at `location`.

    The file at `location` is first converted to stemmed content with
    `get_stem_code`; that stemmed content is then fingerprinted by
    `create_file_fingerprints`:

    - We start by breaking the stemmed content into words (tokens)
    - We compute ngrams over the list of tokens

    The `halo1` hash is the hex digest of the fingerprint of the file.
    `halo1` is empty if the file is empty.

    `ngram_length`, `window_length` and `include_ngrams` are passed through
    unchanged to `create_file_fingerprints`. Any extra keyword arguments are
    accepted and ignored.

    Return an empty mapping if `location` is not a text file.
    """
    from commoncode import filetype
    from typecode.contenttype import get_type

    # Check for a regular file first: it is the cheap test, and it keeps
    # content type detection from being run on something that is not a file.
    if not filetype.is_file(location):
        return {}

    # Do not process `location` if it's not a text file
    if not get_type(location).is_text:
        return {}

    stemmed_content = get_stem_code(location=location)

    return create_file_fingerprints(
        stemmed_content,
        ngram_length=ngram_length,
        window_length=window_length,
        include_ngrams=include_ngrams,
    )
270+
271+
234272
def create_file_fingerprints(
235273
content, ngram_length=5, window_length=SNIPPET_WINDOW_LENGTH, include_ngrams=False
236274
):

0 commit comments

Comments
 (0)