apache · rmuir · Mar 21, 2025 · Mar 21, 2025
diff --git a/gradle/generation/icu.gradle b/gradle/generation/icu.gradle
@@ -287,7 +287,7 @@ configure(project(":lucene:analysis:common")) {
 configure(project(":lucene:core")) {
   task generateUnicodePropsInternal() {
     def icuConfig = rootProject.configurations.icu_current
-    def outputFile = file("src/java/org/apache/lucene/util/automaton/CaseFolding.java")
+    def outputFile = file("src/java/org/apache/lucene/util/CaseFolding.java")
 
     description "Regenerate ${outputFile} (with ${icuConfig.name})"
     group "generation"

diff --git a/gradle/generation/icu/GenerateCaseFolding.groovy b/gradle/generation/icu/GenerateCaseFolding.groovy
@@ -11,7 +11,24 @@ def unicodeVersion = UCharacter.getUnicodeVersion().toString()
 
 def outputFile = Paths.get(args[0])
 
-def generateSwitch() {
+def generateFoldSwitch() { // builds the Java switch statement spliced into CaseFolding.fold(int)
+  StringBuilder sb = new StringBuilder()
+  sb.append("switch(c) {\n")
+  for (int c = UCharacter.MIN_CODE_POINT; c <= UCharacter.MAX_CODE_POINT; c++) {
+    int lower = UCharacter.toLowerCase(c)
+    int folded = UCharacter.foldCase(c, true)
+    if (folded != lower) { // emit an explicit case only where folding differs from lowercasing
+      sb.append(String.format(Locale.ROOT, "      case 0x%04X: // %s\n", c, UCharacter.getName(c)))
+      sb.append(String.format(Locale.ROOT, "        return 0x%04X; // %s vs %s\n", folded, UCharacter.getName(folded), UCharacter.getName(lower)))
+    }
+  }
+  sb.append("      default:\n") // every code point without an explicit case takes the lowercase path
+  sb.append("        return Character.toLowerCase(c);\n") // NOTE(review): JDK lowercase here vs ICU lowercase in the filter above - confirm they agree
+  sb.append("    }")
+  return sb.toString()
+}
+
+def generateExpandSwitch() {
   StringBuilder sb = new StringBuilder()
   sb.append("switch(c) {\n")
   for (int c = UCharacter.MIN_CODE_POINT; c <= UCharacter.MAX_CODE_POINT; c++) {
@@ -54,17 +71,24 @@ def code = """
  * limitations under the License.
  */
 
-package org.apache.lucene.util.automaton;
+package org.apache.lucene.util;
 
 import java.util.function.IntConsumer;
 
 /**
- * This file contains unicode properties used by {@code RegExp}.
+ * This file contains unicode properties used by {@code UnicodeUtil}.
  * The data was generated using ICU4J v${icuVersion}, unicode version: ${unicodeVersion}.
  */
 final class CaseFolding {
   private CaseFolding() {}
 
+  /**
+   * Returns the simple case folding of {@code c}
+   */
+  static int fold(int c) {
+    ${generateFoldSwitch()}
+  }
+
   /**
    * Calls {@code fn} consumer with {@code c} itself and its {@code scf} mappings. 
    */
@@ -82,7 +106,7 @@ final class CaseFolding {
       fn.accept(lower);
     }
     // add special casing variants
-    ${generateSwitch()}
+    ${generateExpandSwitch()}
   }
 }
 """

diff --git a/...ne/analysis/common/src/java/org/apache/lucene/analysis/core/CaseFoldingFilterFactory.java b/...ne/analysis/common/src/java/org/apache/lucene/analysis/core/CaseFoldingFilterFactory.java
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis.core;
+
+import java.util.Map;
+import org.apache.lucene.analysis.CaseFoldingFilter;
+import org.apache.lucene.analysis.TokenFilterFactory;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * Factory for {@link CaseFoldingFilter}.
+ *
+ * <pre class="prettyprint">
+ * &lt;fieldType name="text_fold" class="solr.TextField" positionIncrementGap="100"&gt;
+ *   &lt;analyzer&gt;
+ *     &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
+ *     &lt;filter class="solr.CaseFoldingFilterFactory"/&gt;
+ *   &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;</pre>
+ *
+ * @since 10.2.0
+ * @lucene.spi {@value #NAME}
+ */
+public class CaseFoldingFilterFactory extends TokenFilterFactory {
+
+  /** SPI name */
+  public static final String NAME = "caseFolding";
+
+  /** Creates a new CaseFoldingFilterFactory; throws if {@code args} has unknown parameters. */
+  public CaseFoldingFilterFactory(Map<String, String> args) {
+    super(args);
+    if (!args.isEmpty()) {
+      throw new IllegalArgumentException("Unknown parameters: " + args);
+    }
+  }
+
+  /** Default ctor for compatibility with SPI; always throws. */
+  public CaseFoldingFilterFactory() {
+    throw defaultCtorException();
+  }
+
+  @Override
+  public TokenStream create(TokenStream input) {
+    return new CaseFoldingFilter(input);
+  }
+
+  @Override
+  public TokenStream normalize(TokenStream input) {
+    return create(input); // normalization reuses the exact filter built by create
+  }
+}
diff --git a/...ysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory b/...ysis/common/src/resources/META-INF/services/org.apache.lucene.analysis.TokenFilterFactory
@@ -30,6 +30,7 @@ org.apache.lucene.analysis.commongrams.CommonGramsFilterFactory
 org.apache.lucene.analysis.commongrams.CommonGramsQueryFilterFactory
 org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilterFactory
 org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilterFactory
+org.apache.lucene.analysis.core.CaseFoldingFilterFactory
 org.apache.lucene.analysis.core.DecimalDigitFilterFactory
 org.apache.lucene.analysis.core.LowerCaseFilterFactory
 org.apache.lucene.analysis.core.StopFilterFactory

diff --git a/lucene/core/src/generated/checksums/generateUnicodeProps.json b/lucene/core/src/generated/checksums/generateUnicodeProps.json
@@ -1,4 +1,4 @@
 {
-    "lucene/core/src/java/org/apache/lucene/util/automaton/CaseFolding.java": "c6f70e3e92f2b953ea71294b4d5d510d61fc0b90",
+    "lucene/core/src/java/org/apache/lucene/util/CaseFolding.java": "1dae8456a324addd5bb5de932b0d0459d0e4e979",
     "property:icuConfig": "com.ibm.icu:icu4j:77.1"
 }
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/CaseFoldingFilter.java b/lucene/core/src/java/org/apache/lucene/analysis/CaseFoldingFilter.java
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.analysis;
+
+import java.io.IOException;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
+/**
+ * Normalizes token text to erase case differences.
+ *
+ * <p>Text is normalized in place using the Unicode simple case folding.
+ */
+public class CaseFoldingFilter extends TokenFilter {
+  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+
+  /**
+   * Creates a new CaseFoldingFilter that folds the case of each token's term text.
+   *
+   * @param in TokenStream to filter
+   */
+  public CaseFoldingFilter(TokenStream in) {
+    super(in);
+  }
+
+  @Override
+  public final boolean incrementToken() throws IOException {
+    if (input.incrementToken()) { // fold the term chars of the token just produced
+      CharacterUtils.simpleCaseFold(termAtt.buffer(), 0, termAtt.length());
+      return true;
+    } else {
+      return false;
+    }
+  }
+}
diff --git a/lucene/core/src/java/org/apache/lucene/analysis/CharacterUtils.java b/lucene/core/src/java/org/apache/lucene/analysis/CharacterUtils.java
@@ -18,6 +18,7 @@
 
 import java.io.IOException;
 import java.io.Reader;
+import org.apache.lucene.util.UnicodeUtil;
 
 /**
  * Utility class to write tokenizers or token filters.
@@ -42,6 +43,23 @@ public static CharacterBuffer newCharacterBuffer(final int bufferSize) {
     return new CharacterBuffer(new char[bufferSize], 0, 0);
   }
 
+  /**
+   * Converts each unicode codepoint to its simple case folding starting at the given offset.
+   *
+   * @param buffer the char buffer to fold; it is modified in place
+   * @param offset the offset to start at
+   * @param limit the exclusive end index in the buffer at which folding stops
+   */
+  public static void simpleCaseFold(final char[] buffer, final int offset, final int limit) {
+    assert buffer.length >= limit;
+    assert 0 <= offset && offset <= buffer.length;
+    for (int i = offset; i < limit; ) { // i advances by the chars written per code point
+      i +=
+          Character.toChars(
+              UnicodeUtil.foldCase(Character.codePointAt(buffer, i, limit)), buffer, i);
+    }
+  }
+
   /**
    * Converts each unicode codepoint to lowerCase via {@link Character#toLowerCase(int)} starting at
    * the given offset.