Skip to content

Commit

Permalink
Upgrade to unicode 16
Browse files Browse the repository at this point in the history
  • Loading branch information
bstaletic committed Oct 10, 2024
1 parent 9cb5a84 commit 32e42c9
Show file tree
Hide file tree
Showing 9 changed files with 1,105 additions and 303 deletions.
6 changes: 3 additions & 3 deletions cpp/ycm/Character.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ bool CodePointCompare( const CodePoint *left, const CodePoint *right ) {


// Sort the code points according to the Canonical Ordering Algorithm.
// See https://www.unicode.org/versions/latest/ch03.pdf#G49591
// See https://www.unicode.org/versions/latest/core-spec/chapter-3/#G49591
CodePointSequence CanonicalSort( CodePointSequence code_points ) {
auto code_point_start = code_points.begin();
auto code_point_end = code_points.end();
Expand Down Expand Up @@ -64,7 +64,7 @@ CodePointSequence CanonicalSort( CodePointSequence code_points ) {

// Decompose a UTF-8 encoded string into a sequence of code points according to
// Canonical Decomposition. See
// https://www.unicode.org/versions/latest/ch03.pdf#G733
// https://www.unicode.org/versions/latest/core-spec/chapter-3/#G733
CodePointSequence CanonicalDecompose( std::string_view text ) {
assert( NormalizeInput( text ) == text );
return CanonicalSort( BreakIntoCodePoints( text ) );
Expand All @@ -78,7 +78,7 @@ Character::Character( std::string_view character )
is_punctuation_( false ),
is_uppercase_( false ) {
// Normalize the character through NFD (Normalization Form D). See
// https://www.unicode.org/versions/latest/ch03.pdf#G49621
// https://www.unicode.org/versions/latest/core-spec/chapter-3/#G49621
CodePointSequence code_points = CanonicalDecompose( character );

for ( const auto &code_point : code_points ) {
Expand Down
2 changes: 1 addition & 1 deletion cpp/ycm/Character.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ namespace YouCompleteMe {
// This class represents a UTF-8 character. It takes a UTF-8 encoded string
// corresponding to a grapheme cluster (see
// https://www.unicode.org/glossary/#grapheme_cluster), normalizes it through NFD
// (see https://www.unicode.org/versions/latest/ch03.pdf#G49621), and
// (see https://www.unicode.org/versions/latest/core-spec/chapter-3/#G49621), and
// computes the folded and swapped case versions of the normalized character. It
// also holds some properties, such as whether the character is a letter or
// punctuation, and whether it is uppercase.
Expand Down
2 changes: 1 addition & 1 deletion cpp/ycm/CodePoint.h
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ struct RawCodePoint {
// - its breaking property: used to split a word into characters.
// - its combining class: used to sort a sequence of code points according to
// the Canonical Ordering algorithm (see
// https://www.unicode.org/versions/latest/ch03.pdf#G49591).
// https://www.unicode.org/versions/latest/core-spec/chapter-3/#G49591).
class CodePoint {
public:
YCM_EXPORT explicit CodePoint( std::string_view code_point );
Expand Down
42 changes: 21 additions & 21 deletions cpp/ycm/UnicodeTable.inc

Large diffs are not rendered by default.

11 changes: 8 additions & 3 deletions cpp/ycm/tests/CodePoint_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,16 +87,21 @@ const TextCodePointPair tests[] = {
GraphemeBreakProperty::EXTEND,
IndicConjunctBreakProperty::EXTEND } },
// Bengali vowel sign Aa
{ "", { "", "", "", false, false, false, GraphemeBreakProperty::EXTEND } },
{ "", { "", "", "", false, false, false,
GraphemeBreakProperty::EXTEND,
IndicConjunctBreakProperty::EXTEND } },
// Zero-width non-joiner
{ "", { "", "", "", false, false, false,
GraphemeBreakProperty::EXTEND } },
// Combining cyrillic millions sign
{ "҈", { "҈", "҈", "҈", false, false, false, GraphemeBreakProperty::EXTEND } },
{ "҈", { "҈", "҈", "҈", false, false, false,
GraphemeBreakProperty::EXTEND,
IndicConjunctBreakProperty::EXTEND } },

// Zero-width joiner
{ "", { "", "", "", false, false, false,
GraphemeBreakProperty::ZWJ, IndicConjunctBreakProperty::EXTEND } },
GraphemeBreakProperty::ZWJ,
IndicConjunctBreakProperty::EXTEND } },

// Regional indicator symbol letter b
{ "🇧", { "🇧", "🇧", "🇧", false, false, false,
Expand Down
Loading

0 comments on commit 32e42c9

Please sign in to comment.