Skip to content

Commit e9bc861

Browse files
isasmendiagus authored and agustingroh committed
fix(SP-2610): improve snippet skipping for JSON/XML files
1 parent 3348979 commit e9bc861

File tree

1 file changed

+16
-3
lines changed

1 file changed

+16
-3
lines changed

src/main/java/com/scanoss/Winnowing.java

Lines changed: 16 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -268,8 +268,21 @@ private Boolean skipSnippets(@NonNull String filename, char[] contents) {
268268
log.trace("Skipping snippets as the file is too small: {} - {}", filename, contents.length);
269269
return true;
270270
}
271-
if (contents[0] == '{' || contents[0] == '<') {
272-
log.trace("Skipping snippets as the file appears to be JSON/XML/HTML: {}", filename);
271+
//See https://github.com/scanoss/scanoss.py/blob/ede0477f3ea1b13a0147154b565b1bf6a72a6843/src/scanoss/winnowing.py#L248-L260
272+
//for python implementation reference
273+
274+
// Create prefix from first MIN_FILE_SIZE-1 characters, lowercase and trimmed
275+
String prefix = new String(contents, 0, ScanossConstants.MIN_FILE_SIZE - 1).toLowerCase().strip();
276+
277+
// Check for JSON files (starts with { or [)
278+
if (prefix.charAt(0) == '{' || prefix.charAt(0) == '[') {
279+
log.trace("Skipping snippets as the file appears to be JSON: {}", filename);
280+
return true;
281+
}
282+
// Check for XML/HTML/AC3D files with explicit prefix matching
283+
if (prefix.startsWith("<?xml") || prefix.startsWith("<html") ||
284+
prefix.startsWith("<ac3d") || prefix.startsWith("<!doc")) {
285+
log.trace("Skipping snippets as the file appears to be xml/html/binary: {}", filename);
273286
return true;
274287
}
275288
if (!filename.isEmpty()) {
@@ -290,7 +303,7 @@ private Boolean skipSnippets(@NonNull String filename, char[] contents) {
290303
}
291304
}
292305
if (firstLineEnd == 0) {
293-
firstLineEnd = contents.length; // No newline found, use entire content length
306+
firstLineEnd = contents.length - 1; // No newline found, use length-1 (matching Python)
294307
}
295308
if (snippetLimit > 0 && firstLineEnd > snippetLimit) {
296309
log.trace("Skipping snippets due to first line being too long: {} - {} chars", filename, firstLineEnd);

0 commit comments

Comments
 (0)