Skip to content

Commit 5063350

Browse files
committed
Set window length constant
Signed-off-by: Jono Yang <jyang@nexb.com>
1 parent 007ca57 commit 5063350

File tree

3 files changed: 21 additions and 10 deletions.

src/matchcode_toolkit/fingerprinting.py

Lines changed: 10 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -10,9 +10,8 @@
 import binascii
 import re

-from samecode.halohash import BitAverageHaloHash
 from licensedcode.tokenize import query_lines
-
+from samecode.halohash import BitAverageHaloHash

 # A collection of directory fingerprints that we want to avoid
 IGNORED_DIRECTORY_FINGERPRINTS = [
@@ -21,6 +20,8 @@
     "0000000000000000000000000000000000000000",
 ]

+SNIPPET_WINDOW_LENGTH = 16
+

 def _create_directory_fingerprint(inputs):
     """
@@ -166,6 +167,7 @@ def create_halohash_chunks(bah128):
 query_pattern = "[^_\\W]+"
 word_splitter = re.compile(query_pattern, re.UNICODE).findall

+
 # TODO: return line numbers from where the token was taken
 def _tokenizer(text):
     """
@@ -196,7 +198,9 @@ def tokenizer(text):
     return _tokenizer(text.lower())


-def get_file_fingerprint_hashes(location, ngram_length=5, window_length=16, include_ngrams=False, **kwargs):
+def get_file_fingerprint_hashes(
+    location, ngram_length=5, window_length=SNIPPET_WINDOW_LENGTH, include_ngrams=False, **kwargs
+):
     """
     Return a mapping of fingerprint hashes for the file at `location`
 
@@ -227,7 +231,9 @@ def get_file_fingerprint_hashes(location, ngram_length=5, window_length=16, incl
     )


-def create_file_fingerprints(content, ngram_length=5, window_length=16, include_ngrams=False):
+def create_file_fingerprints(
+    content, ngram_length=5, window_length=SNIPPET_WINDOW_LENGTH, include_ngrams=False
+):
     """
     Return a mapping of halo1 and snippet hashes from content string
     """

src/matchcode_toolkit/plugin_fingerprint.py

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -8,7 +8,6 @@
 #

 import attr
-
 from commoncode.cliutils import SCAN_GROUP
 from commoncode.cliutils import PluggableCommandLineOption
 from plugincode.scan import ScanPlugin

tests/test_fingerprinting.py

Lines changed: 11 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,7 @@
 from commoncode.resource import VirtualCodebase
 from commoncode.testcase import FileBasedTesting
 from commoncode.testcase import check_against_expected_json_file
+from samecode.halohash import byte_hamming_distance

 from matchcode_toolkit.fingerprinting import _create_directory_fingerprint
 from matchcode_toolkit.fingerprinting import _get_resource_subpath
@@ -22,7 +23,6 @@
 from matchcode_toolkit.fingerprinting import create_structure_fingerprint
 from matchcode_toolkit.fingerprinting import get_file_fingerprint_hashes
 from matchcode_toolkit.fingerprinting import split_fingerprint
-from samecode.halohash import byte_hamming_distance


 class Resource:
@@ -193,10 +193,13 @@ def test_snippets_similarity(self, regen=False):
         results1_snippet_mappings_by_snippets = self._create_snippet_mappings_by_snippets(
             results1_snippets
         )
-        results2_snippet_mappings_by_snippets = self._create_snippet_mappings_by_snippets(results2_snippets)
+        results2_snippet_mappings_by_snippets = self._create_snippet_mappings_by_snippets(
+            results2_snippets
+        )

         matching_snippets = (
-            results1_snippet_mappings_by_snippets.keys() & results2_snippet_mappings_by_snippets.keys()
+            results1_snippet_mappings_by_snippets.keys()
+            & results2_snippet_mappings_by_snippets.keys()
         )
         expected_matching_snippets = {
             "33b1d50de7e1701bd4beb706bf25970e",
@@ -247,10 +250,13 @@ def test_snippets_similarity_2(self, regen=False):
         results1_snippet_mappings_by_snippets = self._create_snippet_mappings_by_snippets(
             results1_snippets
         )
-        results2_snippet_mappings_by_snippets = self._create_snippet_mappings_by_snippets(results2_snippets)
+        results2_snippet_mappings_by_snippets = self._create_snippet_mappings_by_snippets(
+            results2_snippets
+        )

         matching_snippets = (
-            results1_snippet_mappings_by_snippets.keys() & results2_snippet_mappings_by_snippets.keys()
+            results1_snippet_mappings_by_snippets.keys()
+            & results2_snippet_mappings_by_snippets.keys()
         )

         # jaccard coefficient

0 commit comments

Comments
 (0)