Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions examples/simple.roc
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,6 @@ maybeLength = word |> Str.toUtf8 |> CodePoint.parseUtf8 |> Result.map List.len

main =
when maybeLength is
Ok count -> Stdout.line "\n\nThere are a total of \(Num.toStr count) code points in \(word)\n\n"
Err _ -> crash "ERROR: Unable to parse \(word)!"
Ok count -> Stdout.line "\n\nThere are a total of $(Num.toStr count) code points in $(word)\n\n"
Err _ -> crash "ERROR: Unable to parse $(word)!"

59 changes: 25 additions & 34 deletions package/CodePoint.roc
Original file line number Diff line number Diff line change
Expand Up @@ -29,40 +29,34 @@ toU32 = InternalCP.toU32
## (that is, it's between `0` and `0x10FFFF`).
fromU32 : U32 -> Result CodePoint [InvalidCodePoint]
fromU32 = \u32 ->
# Definition: http://www.unicode.org/glossary/#code_point
if u32 <= 0x10FFFF then
Ok (fromU32Unchecked u32)
else
Err InvalidCodePoint

## Returns false if this is either a [high-surrogate code point](http://www.unicode.org/glossary/#high_surrogate_code_point)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Did you mean to remove these links? I think we should keep these.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was thinking external links should remain unique for code maintainability. Internal links can be repeated for internal navigation, and they could ultimately lead back to the original external link.
I'm open to putting them back if you find it necessary.

## or a [low-surrogate code point](http://www.unicode.org/glossary/#high_surrogate_code_point).
##
## To check for either of those individually, use [isHighSurrogate] or [isLowSurrogate]
## Returns false if this is [isHighSurrogate] or [isLowSurrogate]
isValidScalar : CodePoint -> Bool
isValidScalar = \codePoint -> !(isHighSurrogate codePoint || isLowSurrogate codePoint)

## Returns true if this is a [high-surrogate code point](http://www.unicode.org/glossary/#high_surrogate_code_point)
## (`0xD800` to `0xDBFF`)
## from U+D800 to U+DBFF
isHighSurrogate : CodePoint -> Bool
isHighSurrogate = \codePoint ->
u32 = InternalCP.toU32 codePoint

u32 >= 0xDC00 && u32 <= 0xDFFF
u32 = toU32 codePoint
u32 >= 0xD800 && u32 <= 0xDBFF

## Returns true if this is a [low-surrogate code point](https://www.unicode.org/glossary/#low_surrogate_code_point)
## U+DC00 to U+DFFF
## from U+DC00 to U+DFFF
isLowSurrogate : CodePoint -> Bool
isLowSurrogate = \codePoint ->
u32 = InternalCP.toU32 codePoint

u32 = toU32 codePoint
u32 >= 0xDC00 && u32 <= 0xDFFF

## Zig docs: bytes the UTF-8 representation would require
## for the given codepoint.
utf8Len : CodePoint -> Result U64 [InvalidCodePoint]
utf8Len : CodePoint -> Result U8 [InvalidCodePoint]
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We used U64 here as it was recently changed from Nat. I don't really know myself, but just wondering why you changed it to U8?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was one of my doubts and should've asked before. But utf8Len and countUtf8Bytes return values in the range of 1 to 4.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Lets give it a try. It's always easy to cast if needed

utf8Len = \codePoint ->
u32 = InternalCP.toU32 codePoint
u32 = toU32 codePoint

if u32 < 0x80 then
Ok 1
Expand All @@ -78,8 +72,7 @@ utf8Len = \codePoint ->
## Encode a Scalar as UTF-8 bytes and append those bytes to an existing list of UTF-8 bytes.
appendUtf8 : List U8, CodePoint -> List U8
appendUtf8 = \bytes, codePoint ->
u32 = InternalCP.toU32 codePoint

u32 = toU32 codePoint
if u32 < 0x80 then
List.append bytes (Num.toU8 u32)
else if u32 < 0x800 then
Expand Down Expand Up @@ -173,9 +166,9 @@ addContinuation = \original, continuationByte ->
|> Num.bitwiseOr (Num.toU32 (Num.bitwiseAnd continuationByte 0b00111111))

## The number of UTF-8 bytes it takes to represent this Scalar.
countUtf8Bytes : CodePoint -> U64
countUtf8Bytes : CodePoint -> U8
countUtf8Bytes = \codePoint ->
u32 = InternalCP.toU32 codePoint
u32 = toU32 codePoint

if u32 < 0x80 then
1
Expand Down Expand Up @@ -252,11 +245,9 @@ Utf8ParseErr : [OverlongEncoding, ExpectedContinuation, EncodesSurrogateHalf, In

parseUtf8 : List U8 -> Result (List CodePoint) Utf8ParseErr
parseUtf8 = \bytes ->

# we will have at most List.len bytes code points
listWithCapacity : List CodePoint
listWithCapacity = List.withCapacity (List.len bytes)

parseUtf8Help bytes listWithCapacity

parseUtf8Help : List U8, List CodePoint -> Result (List CodePoint) Utf8ParseErr
Expand Down Expand Up @@ -339,34 +330,34 @@ parsePartialUtf8 = \bytes ->
else
Err InvalidUtf8


toStr : List CP -> Result Str [BadUtf8]
toStr : List CodePoint -> Result Str [BadUtf8]
toStr = \cps ->

# allocated extra space for the extra bytes as some CPs expand into
# allocated extra space for the extra bytes as some CPs expand into
# multiple U8s, so this minimises extra allocations
capacity = List.withCapacity (50 + List.len cps)

cps
|> cpsToStrHelp capacity
cps
|> cpsToStrHelp capacity
|> Str.fromUtf8
|> Result.onErr \_ -> Err BadUtf8

cpsToStrHelp : List CP, List U8 -> List U8
cpsToStrHelp : List CodePoint, List U8 -> List U8
cpsToStrHelp = \cps, bytes ->
when cps is
when cps is
[] -> bytes
[cp,..] ->
cpsToStrHelp
[cp, ..] ->
cpsToStrHelp
(List.dropFirst cps 1)
(CodePoint.appendUtf8 bytes cp)

expect # test toStr
cr = (fromU32Unchecked 13)
lf = (fromU32Unchecked 10)

expect
# test toStr
cr = fromU32Unchecked 13
lf = fromU32Unchecked 10

toStr [cr, lf] == Ok "\r\n"

## Empty input
expect [] |> parsePartialUtf8 == Err ListWasEmpty

Expand Down
71 changes: 29 additions & 42 deletions package/Grapheme.roc
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,11 @@ Grapheme : InternalGBP.GBP
# Note GB13 is not used, it has been merged with GB12 as they are identical as far as I can tell
Rule : [GB1, GB2, GB3, GB4, GB5, GB6, GB7, GB8, GB9, GB9a, GB9b, GB9c, GB11, GB12, GB999]

# User internally to represent the text segmentation algorithm. We include the
# User internally to represent the text segmentation algorithm. We include the
# Rules here so that it is feasible to debug this and ensure algorithm correctness
# We could remove these and reduce the number of allocations, however it is very
# We could remove these and reduce the number of allocations, however it is very
# difficult then to understand if the implementation is applying each rule correctly
Tokens : List [BR Rule,NB Rule,CP CodePoint]
Tokens : List [BR Rule, NB Rule, CP CodePoint]

## Split a string into extended grapheme clusters
split : Str -> Result (List Str) Utf8ParseErr
Expand Down Expand Up @@ -63,19 +63,17 @@ splitHelp = \state, codePoints, breakPoints, acc ->
nextBPs = List.dropFirst breakPoints 1

when (state, codePoints, breakPoints) is

# Special handling for last codepoint
(Next, [cp], _) -> List.concat acc [CP cp, BR GB2]
(AfterHungulL prev, [cp], [bp]) if bp == L || bp == V || bp == LV || bp == LVT -> List.concat acc [CP prev, NB GB6, CP cp, BR GB2]
(AfterHungulLVorV prev, [cp], [bp]) if bp == V || bp == T -> List.concat acc [CP prev, NB GB7, CP cp, BR GB2]
(AfterHungulLVTorT prev, [cp], [bp]) if bp == T -> List.concat acc [CP prev, NB GB8, CP cp, BR GB2]
(AfterHungulL prev, [_], [_]) -> splitHelp (LastWithPrev prev) codePoints breakPoints acc
(AfterHungulLVorV prev, [_], [_]) -> splitHelp (LastWithPrev prev) codePoints breakPoints acc
(AfterHungulLVTorT prev, [_], [_]) -> splitHelp (LastWithPrev prev) codePoints breakPoints acc
(AfterHangulL prev, [cp], [bp]) if bp == L || bp == V || bp == LV || bp == LVT -> List.concat acc [CP prev, NB GB6, CP cp, BR GB2]
(AfterHangulLVorV prev, [cp], [bp]) if bp == V || bp == T -> List.concat acc [CP prev, NB GB7, CP cp, BR GB2]
(AfterHangulLVTorT prev, [cp], [bp]) if bp == T -> List.concat acc [CP prev, NB GB8, CP cp, BR GB2]
(AfterHangulL prev, [_], [_]) -> splitHelp (LastWithPrev prev) codePoints breakPoints acc
(AfterHangulLVorV prev, [_], [_]) -> splitHelp (LastWithPrev prev) codePoints breakPoints acc
(AfterHangulLVTorT prev, [_], [_]) -> splitHelp (LastWithPrev prev) codePoints breakPoints acc
(LastWithPrev prev, [cp], [bp]) if bp == Control || bp == CR || bp == LF -> List.concat acc [CP prev, BR GB5, CP cp, BR GB2]
(LastWithPrev prev, [cp], [bp]) if bp == Extend -> List.concat acc [CP prev, NB GB9, CP cp, BR GB2]
(LastWithPrev prev, [cp], [bp]) if bp == ZWJ ->

if prev |> CodePoint.toU32 |> InternalEmoji.isPictographic then
List.concat acc [CP prev, NB GB11, CP cp, BR GB2]
else
Expand All @@ -87,26 +85,21 @@ splitHelp = \state, codePoints, breakPoints, acc ->
(EmojiSeqNext prev, [], []) -> List.concat acc [CP prev, BR GB2]
(EmojiSeqNext prev, [cp], [_]) -> List.concat acc [CP prev, NB GB11, CP cp, BR GB2]
(EmojiSeqZWJ prev, [_], [_]) -> splitHelp (LastWithPrev prev) codePoints breakPoints acc

(AfterEvenRI prev, [], []) -> List.concat acc [CP prev, BR GB2]
(AfterOddRI prev, [], []) -> List.concat acc [CP prev, BR GB2]

# Looking at current breakpoint property
(Next, [cp, ..], [bp, ..]) if bp == CR -> splitHelp (AfterCR cp) nextCPs nextBPs acc
(Next, [cp, ..], [bp, ..]) if bp == Control || bp == LF -> splitHelp Next nextCPs nextBPs (List.concat acc [CP cp, BR GB4])
(Next, [cp, ..], [bp, ..]) if bp == L -> splitHelp (AfterHungulL cp) nextCPs nextBPs acc
(Next, [cp, ..], [bp, ..]) if bp == V || bp == LV -> splitHelp (AfterHungulLVorV cp) nextCPs nextBPs acc
(Next, [cp, ..], [bp, ..]) if bp == LVT || bp == T -> splitHelp (AfterHungulLVTorT cp) nextCPs nextBPs acc
(Next, [cp, ..], [bp, ..]) if bp == L -> splitHelp (AfterHangulL cp) nextCPs nextBPs acc
(Next, [cp, ..], [bp, ..]) if bp == V || bp == LV -> splitHelp (AfterHangulLVorV cp) nextCPs nextBPs acc
(Next, [cp, ..], [bp, ..]) if bp == LVT || bp == T -> splitHelp (AfterHangulLVTorT cp) nextCPs nextBPs acc
(Next, [cp, ..], [bp, ..]) if bp == RI -> splitHelp (AfterOddRI cp) nextCPs nextBPs acc

# Advance to next, this is requred so that we can apply rules which break before
# Advance to next, this is required so that we can apply rules which break before
(Next, [cp, ..], _) -> splitHelp (LookAtNext cp) nextCPs nextBPs acc

# Looking ahead at next, given previous
(LookAtNext prev, _, [bp, ..]) if bp == Control || bp == CR || bp == LF -> splitHelp Next codePoints breakPoints (List.concat acc [CP prev, BR GB5])
(LookAtNext prev, [cp, ..], [bp, ..]) if bp == Extend -> splitHelp (AfterExtend cp) nextCPs nextBPs (List.concat acc [CP prev, NB GB9])
(LookAtNext prev, [cp, ..], [bp, ..]) if bp == ZWJ ->

if prev |> CodePoint.toU32 |> InternalEmoji.isPictographic then
# enter emoji sequence
splitHelp (EmojiSeqNext cp) nextCPs nextBPs (List.concat acc [CP prev, NB GB9])
Expand All @@ -115,15 +108,13 @@ splitHelp = \state, codePoints, breakPoints, acc ->

# Look ahead, given previous was Emoji related
(EmojiSeqZWJ prev, [cp, ..], [bp, ..]) ->

if bp == ZWJ then
# got another ZWJ continue the sequence
splitHelp (EmojiSeqNext cp) nextCPs nextBPs (List.concat acc [CP prev, NB GB11])
else
splitHelp Next codePoints breakPoints acc

(EmojiSeqNext prev, [cp, ..], [_, ..]) ->

if cp |> CodePoint.toU32 |> InternalEmoji.isPictographic then
# got another emoji, continue the sequence
splitHelp (EmojiSeqZWJ cp) nextCPs nextBPs (List.concat acc [CP prev, NB GB11])
Expand All @@ -134,31 +125,27 @@ splitHelp = \state, codePoints, breakPoints, acc ->
(AfterExtend prev, [cp, ..], [bp, ..]) if bp == Extend -> splitHelp (AfterExtend cp) nextCPs nextBPs (List.concat acc [CP prev, NB GB9])
(AfterExtend prev, [_, ..], [_, ..]) -> splitHelp Next codePoints breakPoints (List.concat acc [CP prev, BR GB999])
(LookAtNext prev, _, _) -> splitHelp Next codePoints breakPoints (List.concat acc [CP prev, BR GB999])

# Looking ahead, given previous was a Regional Indicator
(AfterOddRI prev, [cp, ..], [bp, ..]) if bp == RI -> splitHelp (AfterEvenRI cp) nextCPs nextBPs (List.concat acc [CP prev, NB GB12])
(AfterOddRI prev, [_, ..], [_, ..]) -> splitHelp (LookAtNext prev) codePoints breakPoints acc
(AfterEvenRI prev, [cp, ..], [bp, ..]) if bp == RI -> splitHelp (AfterOddRI cp) nextCPs nextBPs (List.concat acc [CP prev, BR GB999])
(AfterEvenRI prev, [_, ..], [_, ..]) -> splitHelp (LookAtNext prev) codePoints breakPoints acc

# Looking ahead, given previous was CR
(AfterCR prev, _, [bp, ..]) if bp == LF -> splitHelp Next codePoints breakPoints (List.concat acc [CP prev, NB GB3])
(AfterCR prev, _, _) -> splitHelp Next codePoints breakPoints (List.concat acc [CP prev, BR GB4])

# Looking ahead, given previous was Hangul
(AfterHungulL prev, [cp, ..], [bp, ..]) if bp == L -> splitHelp (AfterHungulL cp) nextCPs nextBPs (List.concat acc [CP prev, NB GB6])
(AfterHungulL prev, [cp, ..], [bp, ..]) if bp == V || bp == LV -> splitHelp (AfterHungulLVorV cp) nextCPs nextBPs (List.concat acc [CP prev, NB GB6])
(AfterHungulL prev, [cp, ..], [bp, ..]) if bp == LVT -> splitHelp (AfterHungulLVTorT cp) nextCPs nextBPs (List.concat acc [CP prev, NB GB6])
(AfterHungulL prev, _, [bp, ..]) if bp == ZWJ -> splitHelp (AfterZWJ prev) codePoints breakPoints acc
(AfterHungulL prev, _, _) -> splitHelp (LookAtNext prev) codePoints breakPoints acc
(AfterHungulLVorV prev, [cp, ..], [bp, ..]) if bp == V -> splitHelp (AfterHungulLVorV cp) nextCPs nextBPs (List.concat acc [CP prev, NB GB7])
(AfterHungulLVorV prev, [cp, ..], [bp, ..]) if bp == T -> splitHelp (AfterHungulLVTorT cp) nextCPs nextBPs (List.concat acc [CP prev, NB GB7])
(AfterHungulLVorV prev, _, [bp, ..]) if bp == ZWJ -> splitHelp (AfterZWJ prev) codePoints breakPoints acc
(AfterHungulLVorV prev, _, _) -> splitHelp (LookAtNext prev) codePoints breakPoints acc
(AfterHungulLVTorT prev, [cp, ..], [bp, ..]) if bp == T -> splitHelp (AfterHungulLVTorT cp) nextCPs nextBPs (List.concat acc [CP prev, NB GB8])
(AfterHungulLVTorT prev, _, [bp, ..]) if bp == ZWJ -> splitHelp (AfterZWJ prev) codePoints breakPoints acc
(AfterHungulLVTorT prev, _, _) -> splitHelp (LookAtNext prev) codePoints breakPoints acc

(AfterHangulL prev, [cp, ..], [bp, ..]) if bp == L -> splitHelp (AfterHangulL cp) nextCPs nextBPs (List.concat acc [CP prev, NB GB6])
(AfterHangulL prev, [cp, ..], [bp, ..]) if bp == V || bp == LV -> splitHelp (AfterHangulLVorV cp) nextCPs nextBPs (List.concat acc [CP prev, NB GB6])
(AfterHangulL prev, [cp, ..], [bp, ..]) if bp == LVT -> splitHelp (AfterHangulLVTorT cp) nextCPs nextBPs (List.concat acc [CP prev, NB GB6])
(AfterHangulL prev, _, [bp, ..]) if bp == ZWJ -> splitHelp (AfterZWJ prev) codePoints breakPoints acc
(AfterHangulL prev, _, _) -> splitHelp (LookAtNext prev) codePoints breakPoints acc
(AfterHangulLVorV prev, [cp, ..], [bp, ..]) if bp == V -> splitHelp (AfterHangulLVorV cp) nextCPs nextBPs (List.concat acc [CP prev, NB GB7])
(AfterHangulLVorV prev, [cp, ..], [bp, ..]) if bp == T -> splitHelp (AfterHangulLVTorT cp) nextCPs nextBPs (List.concat acc [CP prev, NB GB7])
(AfterHangulLVorV prev, _, [bp, ..]) if bp == ZWJ -> splitHelp (AfterZWJ prev) codePoints breakPoints acc
(AfterHangulLVorV prev, _, _) -> splitHelp (LookAtNext prev) codePoints breakPoints acc
(AfterHangulLVTorT prev, [cp, ..], [bp, ..]) if bp == T -> splitHelp (AfterHangulLVTorT cp) nextCPs nextBPs (List.concat acc [CP prev, NB GB8])
(AfterHangulLVTorT prev, _, [bp, ..]) if bp == ZWJ -> splitHelp (AfterZWJ prev) codePoints breakPoints acc
(AfterHangulLVTorT prev, _, _) -> splitHelp (LookAtNext prev) codePoints breakPoints acc
# Print out a helpful error message requesting users report the unhandled case.
_ ->
crash
Expand All @@ -168,10 +155,10 @@ splitHelp = \state, codePoints, breakPoints, acc ->
It is difficult to track down and catch every possible combination, so it would be helpful if you could log this as an issue with a reproduction.

Grapheme.split state machine state at the time was:
\(Inspect.toStr (state, List.map codePoints CodePoint.toU32, breakPoints))
$(Inspect.toStr (state, List.map codePoints CodePoint.toU32, breakPoints))
"""

# Used internally as a test helper to generate the expected answer for a given
# Used internally as a test helper to generate the expected answer for a given
# input. Most of the test inputs come from the test data, some are manually developed
# to cover additional edge cases not found in the test data file.
testHelp : List (List U32) -> Tokens
Expand Down Expand Up @@ -342,7 +329,7 @@ expect
]
a == b

# GB11 emoji another complicated example
# GB11 emoji another complicated example
# % [0.2] LATIN SMALL LETTER A (Other) x [9.0] EMOJI MODIFIER FITZPATRICK TYPE-6 (Extend) % [999.0] BABY (ExtPict) x [9.0] ZERO WIDTH JOINER (ZWJ_ExtCccZwj) x [11.0] OCTAGONAL SIGN (ExtPict) % [0.3]
expect
a = testHelp [[97, 127999], [128118, 8205, 128721]]
Expand Down Expand Up @@ -383,4 +370,4 @@ expect
a == b

expect split "🥷🏼" == Ok ["🥷🏼"]
expect split "🇦🇺🦘🪃" == Ok ["🇦🇺", "🦘", "🪃"]
expect split "🇦🇺🦘🪃" == Ok ["🇦🇺", "🦘", "🪃"]
Loading