Upgrade to unicode 16

ycm-core · Oct 10, 2024 · 32e42c9
1 parent 9cb5a84
commit 32e42c9
Show file tree

Hide file tree

Showing 9 changed files with 1,105 additions and 303 deletions.
diff --git a/cpp/ycm/Character.cpp b/cpp/ycm/Character.cpp
@@ -31,7 +31,7 @@ bool CodePointCompare( const CodePoint *left, const CodePoint *right ) {
 
 
 // Sort the code points according to the Canonical Ordering Algorithm.
-// See https://www.unicode.org/versions/latest/ch03.pdf#G49591
+// See https://www.unicode.org/versions/latest/core-spec/chapter-3/#G49591
 CodePointSequence CanonicalSort( CodePointSequence code_points ) {
   auto code_point_start = code_points.begin();
   auto code_point_end = code_points.end();
@@ -64,7 +64,7 @@ CodePointSequence CanonicalSort( CodePointSequence code_points ) {
 
 // Decompose a UTF-8 encoded string into a sequence of code points according to
 // Canonical Decomposition. See
-// https://www.unicode.org/versions/latest/ch03.pdf#G733
+// https://www.unicode.org/versions/latest/core-spec/chapter-3/#G733
 CodePointSequence CanonicalDecompose( std::string_view text ) {
   assert( NormalizeInput( text ) == text );
   return CanonicalSort( BreakIntoCodePoints( text ) );
@@ -78,7 +78,7 @@ Character::Character( std::string_view character )
     is_punctuation_( false ),
     is_uppercase_( false ) {
   // Normalize the character through NFD (Normalization Form D). See
-  // https://www.unicode.org/versions/latest/ch03.pdf#G49621
+  // https://www.unicode.org/versions/latest/core-spec/chapter-3/#G49621
   CodePointSequence code_points = CanonicalDecompose( character );
 
   for ( const auto &code_point : code_points ) {

diff --git a/cpp/ycm/Character.h b/cpp/ycm/Character.h
@@ -27,7 +27,7 @@ namespace YouCompleteMe {
 // This class represents a UTF-8 character. It takes a UTF-8 encoded string
 // corresponding to a grapheme cluster (see
 // https://www.unicode.org/glossary/#grapheme_cluster), normalizes it through NFD
-// (see https://www.unicode.org/versions/latest/ch03.pdf#G49621), and
+// (see https://www.unicode.org/versions/latest/core-spec/chapter-3/#G49621), and
 // computes the folded and swapped case versions of the normalized character. It
 // also holds some properties, such as whether the character is a letter or a
 // punctuation mark, and whether it is uppercase.

diff --git a/cpp/ycm/CodePoint.h b/cpp/ycm/CodePoint.h
@@ -91,7 +91,7 @@ struct RawCodePoint {
 //  - its breaking property: used to split a word into characters.
 //  - its combining class: used to sort a sequence of code points according to
 //    the Canonical Ordering algorithm (see
-//    https://www.unicode.org/versions/latest/ch03.pdf#G49591).
+//    https://www.unicode.org/versions/latest/core-spec/chapter-3/#G49591).
 class CodePoint {
 public:
   YCM_EXPORT explicit CodePoint( std::string_view code_point );

diff --git a/cpp/ycm/UnicodeTable.inc b/cpp/ycm/UnicodeTable.inc
diff --git a/cpp/ycm/tests/CodePoint_test.cpp b/cpp/ycm/tests/CodePoint_test.cpp
@@ -87,16 +87,21 @@ const TextCodePointPair tests[] = {
                 GraphemeBreakProperty::EXTEND,
                 IndicConjunctBreakProperty::EXTEND } },
   // Bengali vowel sign Aa
-  { "া", { "া", "া", "া", false, false, false, GraphemeBreakProperty::EXTEND } },
+  { "া", { "া", "া", "া", false, false, false,
+                 GraphemeBreakProperty::EXTEND,
+                 IndicConjunctBreakProperty::EXTEND } },
   // Zero-width non-joiner
   { "‌", { "‌", "‌", "‌", false, false, false,
                 GraphemeBreakProperty::EXTEND } },
   // Combining cyrillic millions sign
-  { "҈", { "҈", "҈", "҈", false, false, false, GraphemeBreakProperty::EXTEND } },
+  { "҈", { "҈", "҈", "҈", false, false, false,
+                GraphemeBreakProperty::EXTEND,
+                IndicConjunctBreakProperty::EXTEND } },
 
   // Zero-width joiner
   { "‍", { "‍", "‍", "‍", false, false, false,
-                GraphemeBreakProperty::ZWJ, IndicConjunctBreakProperty::EXTEND } },
+                GraphemeBreakProperty::ZWJ,
+                IndicConjunctBreakProperty::EXTEND } },
 
   // Regional indicator symbol letter b
   { "🇧", { "🇧", "🇧", "🇧", false, false, false,