|
13 | 13 | from licensedcode.tokenize import query_lines |
14 | 14 | from samecode.halohash import BitAverageHaloHash |
15 | 15 |
|
| 16 | +from matchcode_toolkit.stemming import get_stem_code |
| 17 | + |
16 | 18 | # A collection of directory fingerprints that we want to avoid |
17 | 19 | IGNORED_DIRECTORY_FINGERPRINTS = [ |
18 | 20 | # This is both the directory content and directory structure fingerprint for |
@@ -231,6 +233,42 @@ def get_file_fingerprint_hashes( |
231 | 233 | ) |
232 | 234 |
|
233 | 235 |
|
def get_stem_file_fingerprint_hashes(
    location,
    ngram_length=5,
    window_length=16,
    include_ngrams=False,
    **kwargs,
):
    """
    Return a mapping of fingerprint hashes computed from the stemmed code of
    the file at `location`.

    The stemmed content is obtained from `get_stem_code` and then passed to
    `create_file_fingerprints` along with `ngram_length`, `window_length` and
    `include_ngrams`; the value returned by that call is returned unchanged.

    Return an empty mapping if `location` is not a file or is not detected as
    text.

    Any extra keyword arguments collected in `kwargs` are accepted and not
    used by this function.
    """
    from commoncode import filetype
    from typecode.contenttype import get_type

    # Only text files are fingerprinted. The content type is detected before
    # the file check, and anything that fails either test yields no hashes.
    detected_type = get_type(location)
    is_text_file = filetype.is_file(location) and detected_type.is_text
    if not is_text_file:
        return {}

    stemmed = get_stem_code(location=location)

    fingerprint_options = {
        "ngram_length": ngram_length,
        "window_length": window_length,
        "include_ngrams": include_ngrams,
    }
    return create_file_fingerprints(stemmed, **fingerprint_options)
| 270 | + |
| 271 | + |
234 | 272 | def create_file_fingerprints( |
235 | 273 | content, ngram_length=5, window_length=SNIPPET_WINDOW_LENGTH, include_ngrams=False |
236 | 274 | ): |
|
0 commit comments