@@ -226,14 +226,14 @@ def get_file_fingerprint_hashes(
226226 content = f .read ()
227227
228228 return create_file_fingerprints (
229- content ,
229+ content = content ,
230230 ngram_length = ngram_length ,
231231 window_length = window_length ,
232232 include_ngrams = include_ngrams ,
233233 )
234234
235235
236- def get_stem_file_fingerprint_hashes (
236+ def get_stemmed_file_fingerprint_hashes (
237237 location ,
238238 ngram_length = 5 ,
239239 window_length = 16 ,
@@ -262,28 +262,39 @@ def get_stem_file_fingerprint_hashes(
262262 stemmed_content = get_stem_code (location = location )
263263
264264 return create_file_fingerprints (
265- stemmed_content ,
265+ stemmed_content = stemmed_content ,
266266 ngram_length = ngram_length ,
267267 window_length = window_length ,
268268 include_ngrams = include_ngrams ,
269269 )
270270
271271
272272def create_file_fingerprints (
273- content , ngram_length = 5 , window_length = SNIPPET_WINDOW_LENGTH , include_ngrams = False
273+ content = None , stemmed_content = None , ngram_length = 5 , window_length = SNIPPET_WINDOW_LENGTH , include_ngrams = False
274274):
275275 """
276- Return a mapping of halo1 and snippet hashes from content string
276+ Return a mapping of halo1 and snippet hashes from `content` or `stemmed_content`, not both.
277277 """
278278 from licensedcode .tokenize import ngrams
279279 from licensedcode .tokenize import select_ngrams
280280
281+ if content and stemmed_content :
282+ raise Exception ("create_file_fingerprints only accepts an input of `content` or `stemmed_content`, not both." )
283+
284+ if stemmed_content :
285+ halo1_key = "stemmed_halo1"
286+ snippets_key = "stemmed_snippets"
287+ else :
288+ halo1_key = "halo1"
289+ snippets_key = "snippets"
290+
281291 fingerprints = {
282- "halo1" : "" ,
283- "snippets" : [],
292+ halo1_key : "" ,
293+ snippets_key : [],
284294 }
285295
286296 # tokenize content into words
297+ content = content or stemmed_content
287298 words = list (tokenizer (content ))
288299
289300 # Create a file fingerprint from the number of elements in the content hash
@@ -297,7 +308,7 @@ def create_file_fingerprints(
297308 content_fingerprint = content_hash .hexdigest ().decode ("utf-8" )
298309 ngs_count_hex_str = "%08x" % ngs_count
299310 file_fingerprint = ngs_count_hex_str + content_fingerprint
300- fingerprints ["halo1" ] = file_fingerprint
311+ fingerprints [halo1_key ] = file_fingerprint
301312
302313 # Select windows from the content to compute snippet fingerprints
303314 windows = ngrams (words , window_length )
@@ -317,7 +328,7 @@ def create_file_fingerprints(
317328 s ["ngrams" ] = list (window )
318329 snippets .append (s )
319330 if snippets :
320- fingerprints ["snippets" ] = snippets
331+ fingerprints [snippets_key ] = snippets
321332
322333 return fingerprints
323334
0 commit comments