@@ -268,8 +268,21 @@ private Boolean skipSnippets(@NonNull String filename, char[] contents) {
268268 log .trace ("Skipping snippets as the file is too small: {} - {}" , filename , contents .length );
269269 return true ;
270270 }
271- if (contents [0 ] == '{' || contents [0 ] == '<' ) {
272- log .trace ("Skipping snippets as the file appears to be JSON/XML/HTML: {}" , filename );
271+ //See https://github.com/scanoss/scanoss.py/blob/ede0477f3ea1b13a0147154b565b1bf6a72a6843/src/scanoss/winnowing.py#L248-L260
272+ //for python implementation reference
273+
274+ // Create prefix from first MIN_FILE_SIZE-1 characters, lowercase and trimmed
275+ String prefix = new String (contents , 0 , ScanossConstants .MIN_FILE_SIZE - 1 ).toLowerCase ().strip ();
276+
277+ // Check for JSON files (starts with { or [)
278+ if (prefix .charAt (0 ) == '{' || prefix .charAt (0 ) == '[' ) {
279+ log .trace ("Skipping snippets as the file appears to be JSON: {}" , filename );
280+ return true ;
281+ }
282+ // Check for XML/HTML/AC3D files with explicit prefix matching
283+ if (prefix .startsWith ("<?xml" ) || prefix .startsWith ("<html" ) ||
284+ prefix .startsWith ("<ac3d" ) || prefix .startsWith ("<!doc" )) {
285+ log .trace ("Skipping snippets as the file appears to be xml/html/binary: {}" , filename );
273286 return true ;
274287 }
275288 if (!filename .isEmpty ()) {
@@ -290,7 +303,7 @@ private Boolean skipSnippets(@NonNull String filename, char[] contents) {
290303 }
291304 }
292305 if (firstLineEnd == 0 ) {
293- firstLineEnd = contents .length ; // No newline found, use entire content length
306+ firstLineEnd = contents .length - 1 ; // No newline found, use length-1 (matching Python)
294307 }
295308 if (snippetLimit > 0 && firstLineEnd > snippetLimit ) {
296309 log .trace ("Skipping snippets due to first line being too long: {} - {} chars" , filename , firstLineEnd );
0 commit comments