@@ -226,14 +226,14 @@ def get_file_fingerprint_hashes(
226226 content = f .read ()
227227
228228 return create_file_fingerprints (
229- content ,
229+ content = content ,
230230 ngram_length = ngram_length ,
231231 window_length = window_length ,
232232 include_ngrams = include_ngrams ,
233233 )
234234
235235
236- def get_stem_file_fingerprint_hashes (
236+ def get_stemmed_file_fingerprint_hashes (
237237 location ,
238238 ngram_length = 5 ,
239239 window_length = 16 ,
@@ -262,28 +262,45 @@ def get_stem_file_fingerprint_hashes(
262262 stemmed_content = get_stem_code (location = location )
263263
264264 return create_file_fingerprints (
265- stemmed_content ,
265+ stemmed_content = stemmed_content ,
266266 ngram_length = ngram_length ,
267267 window_length = window_length ,
268268 include_ngrams = include_ngrams ,
269269 )
270270
271271
272272def create_file_fingerprints (
273- content , ngram_length = 5 , window_length = SNIPPET_WINDOW_LENGTH , include_ngrams = False
273+ content = None ,
274+ stemmed_content = None ,
275+ ngram_length = 5 ,
276+ window_length = SNIPPET_WINDOW_LENGTH ,
277+ include_ngrams = False ,
274278):
275279 """
276- Return a mapping of halo1 and snippet hashes from content string
280+ Return a mapping of halo1 and snippet hashes from `content` or `stemmed_content`, not both.
277281 """
278282 from licensedcode .tokenize import ngrams
279283 from licensedcode .tokenize import select_ngrams
280284
285+ if content and stemmed_content :
286+ raise Exception (
287+ "create_file_fingerprints only accepts an input of `content` or `stemmed_content`, not both."
288+ )
289+
290+ if stemmed_content :
291+ halo1_key = "stemmed_halo1"
292+ snippets_key = "stemmed_snippets"
293+ else :
294+ halo1_key = "halo1"
295+ snippets_key = "snippets"
296+
281297 fingerprints = {
282- "halo1" : "" ,
283- "snippets" : [],
298+ halo1_key : "" ,
299+ snippets_key : [],
284300 }
285301
286302 # tokenize content into words
303+ content = content or stemmed_content
287304 words = list (tokenizer (content ))
288305
289306 # Create a file fingerprint from the number of elements in the content hash
@@ -297,7 +314,7 @@ def create_file_fingerprints(
297314 content_fingerprint = content_hash .hexdigest ().decode ("utf-8" )
298315 ngs_count_hex_str = "%08x" % ngs_count
299316 file_fingerprint = ngs_count_hex_str + content_fingerprint
300- fingerprints ["halo1" ] = file_fingerprint
317+ fingerprints [halo1_key ] = file_fingerprint
301318
302319 # Select windows from the content to compute snippet fingerprints
303320 windows = ngrams (words , window_length )
@@ -317,7 +334,7 @@ def create_file_fingerprints(
317334 s ["ngrams" ] = list (window )
318335 snippets .append (s )
319336 if snippets :
320- fingerprints ["snippets" ] = snippets
337+ fingerprints [snippets_key ] = snippets
321338
322339 return fingerprints
323340
0 commit comments