Create fingerprint for stem code

keshav-space · keshav-space · commit 8e0405ecd40d · 2025-02-25T12:16:10.000+05:30
Signed-off-by: Keshav Priyadarshi <git@keshav.space>
diff --git a/src/matchcode_toolkit/fingerprinting.py b/src/matchcode_toolkit/fingerprinting.py
@@ -13,6 +13,8 @@
 from licensedcode.tokenize import query_lines
 from samecode.halohash import BitAverageHaloHash
 
+from matchcode_toolkit.stemming import get_stem_code
+
 # A collection of directory fingerprints that we want to avoid
 IGNORED_DIRECTORY_FINGERPRINTS = [
     # This is both the directory content and directory structure fingerprint for
@@ -231,6 +233,42 @@ def get_file_fingerprint_hashes(
     )
 
 
+def get_stem_file_fingerprint_hashes(
+    location,
+    ngram_length=5,
+    window_length=16,
+    include_ngrams=False,
+    **kwargs,
+):
+    """
+    Return a mapping of stem code fingerprint hashes for the file at `location`.
+
+    The content returned by `get_stem_code` for `location` is passed to
+    `create_file_fingerprints` together with `ngram_length`, `window_length` and
+    `include_ngrams`, and that result is returned. Extra `kwargs` are accepted
+    but ignored. As for regular file fingerprints, `halo1` is expected to be the
+    hex digest of the fingerprint, and to be empty if the file is empty.
+
+    Return an empty mapping if `location` is not a text file.
+    """
+    from commoncode import filetype
+    from typecode.contenttype import get_type
+
+    # Do not process `location` if it's not a text file
+    ft = get_type(location)
+    if not (filetype.is_file(location) and ft.is_text):
+        return {}
+
+    stemmed_content = get_stem_code(location=location)
+
+    return create_file_fingerprints(
+        stemmed_content,
+        ngram_length=ngram_length,
+        window_length=window_length,
+        include_ngrams=include_ngrams,
+    )
+
+
 def create_file_fingerprints(
     content, ngram_length=5, window_length=SNIPPET_WINDOW_LENGTH, include_ngrams=False
 ):