cucumber · mpkorstanje · Jul 27, 2025 · Jul 27, 2025 · Jul 27, 2025 · Jul 27, 2025
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -8,7 +8,8 @@ This document is formatted according to the principles of [Keep A CHANGELOG](htt
 
 ## [Unreleased]
 ### Fixed
-[Go] Trim trailing tab characters from title and step lines ([#441](https://github.com/cucumber/gherkin/pull/441))
+- [Go] Trim trailing tab characters from title and step lines ([#441](https://github.com/cucumber/gherkin/pull/441))
+- [Java] Use a more consistent definition of whitespace ([#442](https://github.com/cucumber/gherkin/pull/442))
 
 ## [33.0.0] - 2025-07-07
 ### Changed

diff --git a/java/src/main/java/io/cucumber/gherkin/StringUtils.java b/java/src/main/java/io/cucumber/gherkin/StringUtils.java
@@ -3,56 +3,82 @@
 
 import java.util.AbstractMap.SimpleEntry;
 import java.util.Map.Entry;
+import java.util.function.Predicate;
 
 import static io.cucumber.gherkin.GherkinLanguageConstants.COMMENT_PREFIX_CHAR;
 
-class StringUtils {
+final class StringUtils {
 
     /**
-     * Matches regex pattern for whitespace.
+     * An extended definition of Whitespace minus new lines.
+     * <p>
+     * Character in Unicode general category {@code Zs} and directionality
+     * categories {@code WS}, {@code B}, and {@code S} are considered whitespace
+     * for this definition.
+     *
+     * @param c character to test
+     * @return true iff the {@code c} is whitespace and not new line..
      */
-    private static final char[] WHITESPACE_CHARS = new char[]{' ', '\t', '\n', '\u000B', '\f', '\r'};
+    private static boolean isWhiteSpaceExcludingNewLine(Character c) {
+        return c != '\n' && isWhiteSpace(c);
+    }
 
     /**
-     * Matches regex pattern whitespace + NEL + NBSP.
-     */
-    private static final char[] WHITESPACE_CHARS_EXTENDED = new char[]{' ', '\t', '\n', '\u000B', '\f', '\r', '\u0085', '\u00A0'};
-
-    /**
-     * Matches regex pattern whitespace + NEL + NBSP - new line.
+     * An extended definition of Whitespace.
+     * <p>
+     * Character in Unicode general category {@code Zs} and directionality
+     * categories {@code WS}, {@code B}, and {@code S} are considered whitespace
+     * for this definition.
+     *
+     * @param c character to test
+     * @return true iff the {@code c} is whitespace.
      */
-    private static final char[] WHITESPACE_CHARS_EXTENDED_KEEP_NEW_LINES = new char[]{' ', '\t', '\u000B', '\f', '\r', '\u0085', '\u00A0'};
+    static boolean isWhiteSpace(char c) {
+        // Fast path for the common case
+        return c == ' ' || c == '\t' || isCharacterTypeSpace(c) || isDirectionalitySpace(c);
+    }
+
+    private static boolean isCharacterTypeSpace(char c) {
+        return (((
+                (1 << Character.SPACE_SEPARATOR)
+                        // Not in the definition, but a subset of isDirectionalitySpace
+                        | (1 << Character.LINE_SEPARATOR)
+                        // Not in the definition, but a subset of isDirectionalitySpace
+                        | (1 << Character.PARAGRAPH_SEPARATOR)
+        ) >> Character.getType(c)) & 1) != 0;
+    }
+
+    private static boolean isDirectionalitySpace(char c) {
+        return (
+                (((1 << Character.DIRECTIONALITY_WHITESPACE)
+                        | (1 << Character.DIRECTIONALITY_PARAGRAPH_SEPARATOR)
+                        | (1 << Character.DIRECTIONALITY_SEGMENT_SEPARATOR)
+                ) >> Character.getDirectionality(c)) & 1) != 0;
+    }
 
     static String rtrim(String s) {
         if (s.isEmpty()) {
             return s;
         }
-
-        int length = s.length();
-
-        int end = length - 1;
-        while (end >= 0 && contains(WHITESPACE_CHARS_EXTENDED, s.charAt(end))) {
-            end--;
-        }
-
-        return s.substring(0, end + 1);
+        int end = findLastIndexNotIn(s, 0, StringUtils::isWhiteSpace);
+        return s.substring(0, end);
     }
 
     static Entry<String, Integer> trimAndIndentKeepNewLines(String input) {
-        return trimAndIndent(input, WHITESPACE_CHARS_EXTENDED_KEEP_NEW_LINES);
+        return trimAndIndent(input, StringUtils::isWhiteSpaceExcludingNewLine);
     }
 
     static Entry<String, Integer> trimAndIndent(String input) {
-        return trimAndIndent(input, WHITESPACE_CHARS_EXTENDED);
+        return trimAndIndent(input, StringUtils::isWhiteSpace);
     }
 
-    private static Entry<String, Integer> trimAndIndent(String input, char[] whitespaceChars) {
+    private static Entry<String, Integer> trimAndIndent(String input, Predicate<Character> isSpace) {
         if (input.isEmpty()) {
             return new SimpleEntry<>("", 0);
         }
 
-        int start = findFirstIndexNotIn(input, input.length(), whitespaceChars);
-        int end = findLastIndexNotIn(input, start, whitespaceChars);
+        int start = findFirstIndexNotIn(input, input.length(), isSpace);
+        int end = findLastIndexNotIn(input, start, isSpace);
 
         String trimmed = input.substring(start, end);
         int indent = input.codePointCount(0, start);
@@ -66,8 +92,8 @@ static String removeComments(String input) {
         int start = 0;
         int length = input.length();
 
-        while (start < length - 1 
-                && !(contains(WHITESPACE_CHARS, input.charAt(start)) 
+        while (start < length - 1
+                && !(Character.isSpaceChar(input.charAt(start))
                 && input.charAt(start + 1) == COMMENT_PREFIX_CHAR)
         ) {
             start++;
@@ -76,42 +102,33 @@ static String removeComments(String input) {
     }
 
     static boolean containsWhiteSpace(String input) {
-        return findFirstIndexIn(input, WHITESPACE_CHARS) != -1;
+        return findFirstIndexIn(input, StringUtils::isWhiteSpace) != -1;
     }
 
-    private static int findFirstIndexNotIn(String input, int endIndex, char[] characters) {
+    private static int findFirstIndexNotIn(String input, int endIndex, Predicate<Character> isSpace) {
         int start = 0;
-        while (start < endIndex && contains(characters, input.charAt(start))) {
+        while (start < endIndex && isSpace.test(input.charAt(start))) {
             start++;
         }
         return start;
     }
-    
-    private static int findLastIndexNotIn(String input, int beginIndex, char[] characters) {
+
+    private static int findLastIndexNotIn(String input, int beginIndex, Predicate<Character> isSpace) {
         int end = input.length();
-        while (end > beginIndex && contains(characters, input.charAt(end - 1))) {
+        while (end > beginIndex && isSpace.test(input.charAt(end - 1))) {
             end--;
         }
         return end;
     }
 
-    private static int findFirstIndexIn(String input, char[] characters) {
+    private static int findFirstIndexIn(String input, Predicate<Character> isSpace) {
         int length = input.length();
         for (int i = 0; i < length; i++) {
-            if (contains(characters, input.charAt(i))) {
+            if (isSpace.test(input.charAt(i))) {
                 return i;
             }
         }
         return -1;
     }
-
-    private static boolean contains(char[] characters, char c) {
-        for (char candidate : characters) {
-            if (candidate == c) {
-                return true;
-            }
-        }
-        return false;
-    }
 
 }
diff --git a/java/src/test/java/io/cucumber/gherkin/GherkinLineTest.java b/java/src/test/java/io/cucumber/gherkin/GherkinLineTest.java
@@ -93,6 +93,16 @@ void finds_tags__comment_inside_tag() {
                 new GherkinLineSpan(7, "@is")
         ), gherkinLineSpans);
     }
+    @Test
+    void finds_tags__comment_inside_tag_preceded_by_nbsp() {
+        GherkinLine gherkinLine = new GherkinLine("@this @is\u202F#acomment  ", line);
+        List<GherkinLineSpan> gherkinLineSpans = gherkinLine.parseTags();
+
+        assertEquals(asList(
+                new GherkinLineSpan(1, "@this"),
+                new GherkinLineSpan(7, "@is")
+        ), gherkinLineSpans);
+    }
 
     @Test
     void finds_tags__commented_before_tag() {

diff --git a/java/src/test/java/io/cucumber/gherkin/StringUtilsTest.java b/java/src/test/java/io/cucumber/gherkin/StringUtilsTest.java
@@ -5,6 +5,7 @@
 import java.util.Map.Entry;
 
 import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertTrue;
 
 class StringUtilsTest {
     private static final String WHITESPACE = "\u00A0 \t";
@@ -64,4 +65,40 @@ void removeComments() {
         assertEquals("@issue#1234 @issue#31415", StringUtils.removeComments("@issue#1234 @issue#31415"));
     }
 
+    @Test
+    void isWhiteSpace() {
+        // https://en.wikipedia.org/wiki/Whitespace_character#Unicode
+        char[] whitespace = new char[]{
+                '\t',
+                '\n',
+                '\u000B',
+                '\f',
+                '\r',
+                ' ',
+                '\u0085',
+                '\u00A0',
+                '\u1680',
+                '\u2000',
+                '\u2001',
+                '\u2002',
+                '\u2003',
+                '\u2004',
+                '\u2005',
+                '\u2006',
+                '\u2007',
+                '\u2008',
+                '\u2009',
+                '\u200A',
+                '\u2028',
+                '\u2029',
+                '\u202F',
+                '\u205F',
+                '\u3000'
+        };
+
+        for (char c : whitespace) {
+            assertTrue(StringUtils.isWhiteSpace(c), Character.getName(c) + " was not whitespace");
+        }
+    }
+
 }