Skip to content

Commit a80effe

Browse files
committed
Update key field values for stemmed fingerprints
Signed-off-by: Jono Yang <jyang@nexb.com>
1 parent fc37d16 commit a80effe

File tree

2 files changed

+27
-13
lines changed

2 files changed

+27
-13
lines changed

src/matchcode_toolkit/fingerprinting.py

Lines changed: 26 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -226,14 +226,14 @@ def get_file_fingerprint_hashes(
226226
content = f.read()
227227

228228
return create_file_fingerprints(
229-
content,
229+
content=content,
230230
ngram_length=ngram_length,
231231
window_length=window_length,
232232
include_ngrams=include_ngrams,
233233
)
234234

235235

236-
def get_stem_file_fingerprint_hashes(
236+
def get_stemmed_file_fingerprint_hashes(
237237
location,
238238
ngram_length=5,
239239
window_length=16,
@@ -262,28 +262,45 @@ def get_stem_file_fingerprint_hashes(
262262
stemmed_content = get_stem_code(location=location)
263263

264264
return create_file_fingerprints(
265-
stemmed_content,
265+
stemmed_content=stemmed_content,
266266
ngram_length=ngram_length,
267267
window_length=window_length,
268268
include_ngrams=include_ngrams,
269269
)
270270

271271

272272
def create_file_fingerprints(
273-
content, ngram_length=5, window_length=SNIPPET_WINDOW_LENGTH, include_ngrams=False
273+
content=None,
274+
stemmed_content=None,
275+
ngram_length=5,
276+
window_length=SNIPPET_WINDOW_LENGTH,
277+
include_ngrams=False,
274278
):
275279
"""
276-
Return a mapping of halo1 and snippet hashes from content string
280+
Return a mapping of halo1 and snippet hashes from `content` or `stemmed_content`, not both.
277281
"""
278282
from licensedcode.tokenize import ngrams
279283
from licensedcode.tokenize import select_ngrams
280284

285+
if content and stemmed_content:
286+
raise Exception(
287+
"create_file_fingerprints only accepts an input of `content` or `stemmed_content`, not both."
288+
)
289+
290+
if stemmed_content:
291+
halo1_key = "stemmed_halo1"
292+
snippets_key = "stemmed_snippets"
293+
else:
294+
halo1_key = "halo1"
295+
snippets_key = "snippets"
296+
281297
fingerprints = {
282-
"halo1": "",
283-
"snippets": [],
298+
halo1_key: "",
299+
snippets_key: [],
284300
}
285301

286302
# tokenize content into words
303+
content = content or stemmed_content
287304
words = list(tokenizer(content))
288305

289306
# Create a file fingerprint from the number of elements in the content hash
@@ -297,7 +314,7 @@ def create_file_fingerprints(
297314
content_fingerprint = content_hash.hexdigest().decode("utf-8")
298315
ngs_count_hex_str = "%08x" % ngs_count
299316
file_fingerprint = ngs_count_hex_str + content_fingerprint
300-
fingerprints["halo1"] = file_fingerprint
317+
fingerprints[halo1_key] = file_fingerprint
301318

302319
# Select windows from the content to compute snippet fingerprints
303320
windows = ngrams(words, window_length)
@@ -317,7 +334,7 @@ def create_file_fingerprints(
317334
s["ngrams"] = list(window)
318335
snippets.append(s)
319336
if snippets:
320-
fingerprints["snippets"] = snippets
337+
fingerprints[snippets_key] = snippets
321338

322339
return fingerprints
323340

src/matchcode_toolkit/pipelines/fingerprint_codebase.py

Lines changed: 1 addition & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -45,7 +45,4 @@ def fingerprint_codebase(self):
4545
"""
4646
matchcode.fingerprint_codebase_directories(self.project)
4747
matchcode.fingerprint_codebase_resources(self.project)
48-
49-
def fingerprint_stem_codebase_resources(self):
50-
"""Compute stem code fingerprint for resources"""
51-
matchcode.fingerprint_stem_codebase_resources(self.project)
48+
matchcode.fingerprint_stemmed_codebase_resources(self.project)

0 commit comments

Comments
 (0)