|
| 1 | +# Various Unicode functionality from the utf8proc library |
| 2 | +module UTF8proc |
| 3 | + |
| 4 | +import Base: show, showcompact, ==, string, symbol, isless, hash |
| 5 | + |
| 6 | +# also exported by Base: |
| 7 | +export normalize_string, is_valid_char, is_assigned_char |
| 8 | + |
| 9 | +# whether codepoints are valid Unicode |
function is_valid_char(c)
    # utf8proc_codepoint_valid hands back a C char; convert that to a Bool
    status = ccall(:utf8proc_codepoint_valid, Cchar, (Int32,), c)
    return bool(status)
end
| 11 | + |
# Option bits for the `flags` argument of utf8proc_map.  Each one is a
# distinct power of two so that several can be combined with `|`.
const UTF8PROC_NULLTERM  = 1 << 0
const UTF8PROC_STABLE    = 1 << 1
const UTF8PROC_COMPAT    = 1 << 2
const UTF8PROC_COMPOSE   = 1 << 3
const UTF8PROC_DECOMPOSE = 1 << 4
const UTF8PROC_IGNORE    = 1 << 5
const UTF8PROC_REJECTNA  = 1 << 6
const UTF8PROC_NLF2LS    = 1 << 7
const UTF8PROC_NLF2PS    = 1 << 8
# NLF2LF is not a bit of its own: it is the LS and PS bits set together
const UTF8PROC_NLF2LF    = UTF8PROC_NLF2LS | UTF8PROC_NLF2PS
const UTF8PROC_STRIPCC   = 1 << 9
const UTF8PROC_CASEFOLD  = 1 << 10
const UTF8PROC_CHARBOUND = 1 << 11
const UTF8PROC_LUMP      = 1 << 12
const UTF8PROC_STRIPMARK = 1 << 13
| 27 | + |
let
    # One-element scratch array that receives the address of the output
    # buffer produced by utf8proc; it is shared by every call.
    const p = Array(Ptr{Uint8}, 1)
    global utf8proc_map
    # Run the C function utf8proc_map over `s` with the option bits in
    # `flags` and return the transformed text as a ByteString.
    # Raises an error carrying the utf8proc_errmsg text when the library
    # reports a negative result.
    function utf8proc_map(s::String, flags::Integer)
        bytes = bytestring(s)
        # Pass the byte length explicitly rather than relying on
        # UTF8PROC_NULLTERM: treating the input as a NUL-terminated C string
        # would silently drop everything after an embedded "\0".
        result = ccall(:utf8proc_map, Cssize_t,
                       (Ptr{Uint8}, Cssize_t, Ptr{Ptr{Uint8}}, Cint),
                       bytes, sizeof(bytes), p, flags)
        if result < 0
            error(bytestring(ccall(:utf8proc_errmsg, Ptr{Uint8},
                                   (Cssize_t,), result)))
        end
        # Wrap the `result` bytes at p[1] in a Vector{Uint8} without copying.
        # NOTE(review): the trailing `true` is presumably the "own" flag that
        # hands the C-allocated buffer to Julia's GC -- confirm against
        # jl_ptr_to_array_1d.
        a = ccall(:jl_ptr_to_array_1d, Vector{Uint8},
                  (Any, Ptr{Uint8}, Csize_t, Cint),
                  Vector{Uint8}, p[1], result, true)
        return ccall(:jl_array_to_string, Any, (Any,), a)::ByteString
    end
end
| 43 | + |
# Keyword-driven normalization: every `true` keyword switches on the
# matching UTF8PROC_* option bit, and the combined mask is handed to
# utf8proc_map.  Throws ArgumentError when both compose and decompose are
# requested, or when more than one newline conversion is requested.
function normalize_string(s::String; stable::Bool=false, compat::Bool=false,
                          compose::Bool=false, decompose::Bool=false,
                          stripignore::Bool=false, rejectna::Bool=false,
                          newline2ls::Bool=false, newline2ps::Bool=false,
                          newline2lf::Bool=false, stripcc::Bool=false,
                          casefold::Bool=false, lump::Bool=false,
                          stripmark::Bool=false)
    # Reject contradictory requests before assembling the option mask
    if compose && decompose
        throw(ArgumentError("compose=true and decompose=true cannot both be specified"))
    end
    if newline2ls + newline2ps + newline2lf > 1
        throw(ArgumentError("only one newline conversion may be specified"))
    end

    opts = 0
    if stable;      opts |= UTF8PROC_STABLE;    end
    if compat;      opts |= UTF8PROC_COMPAT;    end
    if compose;     opts |= UTF8PROC_COMPOSE;   end
    if decompose;   opts |= UTF8PROC_DECOMPOSE; end
    if stripignore; opts |= UTF8PROC_IGNORE;    end
    if rejectna;    opts |= UTF8PROC_REJECTNA;  end
    if newline2ls;  opts |= UTF8PROC_NLF2LS;    end
    if newline2ps;  opts |= UTF8PROC_NLF2PS;    end
    if newline2lf;  opts |= UTF8PROC_NLF2LF;    end
    if stripcc;     opts |= UTF8PROC_STRIPCC;   end
    if casefold;    opts |= UTF8PROC_CASEFOLD;  end
    if lump;        opts |= UTF8PROC_LUMP;      end
    if stripmark
        opts |= UTF8PROC_STRIPMARK
        # mark stripping also turns on composition unless decomposition
        # was asked for
        decompose || (opts |= UTF8PROC_COMPOSE)
    end
    return utf8proc_map(s, opts)
end
| 65 | + |
# Normalize `s` to one of the four named forms.  `nf` must be :NFC, :NFD,
# :NFKC or :NFKD; any other symbol raises an ArgumentError.
function normalize_string(s::String, nf::Symbol)
    if nf == :NFC
        opts = UTF8PROC_STABLE | UTF8PROC_COMPOSE
    elseif nf == :NFD
        opts = UTF8PROC_STABLE | UTF8PROC_DECOMPOSE
    elseif nf == :NFKC
        opts = UTF8PROC_STABLE | UTF8PROC_COMPOSE | UTF8PROC_COMPAT
    elseif nf == :NFKD
        opts = UTF8PROC_STABLE | UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT
    else
        throw(ArgumentError(":$nf is not one of :NFC, :NFD, :NFKC, :NFKD"))
    end
    return utf8proc_map(s, opts)
end
| 75 | + |
| 76 | +# returns UTF8PROC_CATEGORY code in 0..30 giving Unicode category |
function category_code(c)
    # utf8proc_get_property must only be called with a codepoint in
    # 0..0x10FFFF (see its docs), and the pointer it returns is dereferenced
    # below.  Anything outside that range -- negative values included -- is
    # answered here with the "unassigned" code instead of reaching the library.
    (c < 0 || c > 0x10FFFF) && return 0x0000
    # note: utf8proc returns 0, not UTF8PROC_CATEGORY_CN, for unassigned c
    # The category is read as the leading Uint16 of the returned property record.
    return unsafe_load(ccall(:utf8proc_get_property, Ptr{Uint16}, (Int32,), c))
end
| 82 | + |
function is_assigned_char(c)
    # category_code reports 0 for unassigned codepoints
    return !(category_code(c) == 0)
end
| 84 | + |
| 85 | +# TODO: use UTF8PROC_CHARBOUND to extract graphemes from a string, e.g. to iterate over graphemes? |
| 86 | + |
| 87 | +end # module |
0 commit comments