Skip to content

Commit 9e5ce63

Browse files
committed
export utf8proc functionality in Julia (followup to #5462 and #5434)
1 parent c19a8e9 commit 9e5ce63

File tree

5 files changed

+126
-2
lines changed

5 files changed

+126
-2
lines changed

base/char.jl

-2
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
11
char(x) = convert(Char, x)
22
char(x::FloatingPoint) = char(iround(x))
33

4-
is_valid_char(c) = !('\ud800' <= c <= '\udfff' || '\U10ffff' < c)
5-
64
integer(x::Char) = int(x)
75
unsigned(x::Char) = uint(x)
86

base/exports.jl

+2
Original file line numberDiff line numberDiff line change
@@ -766,6 +766,7 @@ export
766766
hex2bytes,
767767
ind2chr,
768768
info,
769+
is_assigned_char,
769770
is_valid_ascii,
770771
is_valid_char,
771772
is_valid_utf8,
@@ -793,6 +794,7 @@ export
793794
matchall,
794795
ndigits,
795796
nextind,
797+
normalize_string,
796798
oct,
797799
parsefloat,
798800
parseint,

base/sysimg.jl

+2
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,8 @@ include("utf8.jl")
7575
include("utf16.jl")
7676
include("iobuffer.jl")
7777
include("string.jl")
78+
include("utf8proc.jl")
79+
importall .UTF8proc
7880
include("regex.jl")
7981
include("base64.jl")
8082
importall .Base64

base/utf8proc.jl

+87
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
# Various Unicode functionality from the utf8proc library
2+
module UTF8proc
3+
4+
import Base: show, showcompact, ==, string, symbol, isless, hash
5+
6+
# also exported by Base:
7+
export normalize_string, is_valid_char, is_assigned_char
8+
9+
# whether codepoints are valid Unicode
10+
# Calls utf8proc's `utf8proc_codepoint_valid` with `c` passed as an Int32 and
# converts the C result (declared here as Cchar) to a Julia Bool.
is_valid_char(c) = bool(ccall(:utf8proc_codepoint_valid, Cchar, (Int32,), c))
11+
12+
# Option bits for the utf8proc library.  Each constant is a distinct power of
# two, so several of them can be OR-ed together into the single `flags` value
# handed to `utf8proc_map`.  UTF8PROC_NLF2LF is not a bit of its own: it is
# defined as the combination of the NLF2LS and NLF2PS bits.
const UTF8PROC_NULLTERM = (1<<0)
13+
const UTF8PROC_STABLE = (1<<1)
14+
const UTF8PROC_COMPAT = (1<<2)
15+
const UTF8PROC_COMPOSE = (1<<3)
16+
const UTF8PROC_DECOMPOSE = (1<<4)
17+
const UTF8PROC_IGNORE = (1<<5)
18+
const UTF8PROC_REJECTNA = (1<<6)
19+
const UTF8PROC_NLF2LS = (1<<7)
20+
const UTF8PROC_NLF2PS = (1<<8)
21+
const UTF8PROC_NLF2LF = (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS)
22+
const UTF8PROC_STRIPCC = (1<<9)
23+
const UTF8PROC_CASEFOLD = (1<<10)
24+
const UTF8PROC_CHARBOUND = (1<<11)
25+
const UTF8PROC_LUMP = (1<<12)
26+
const UTF8PROC_STRIPMARK = (1<<13)
27+
28+
# utf8proc_map(s, flags): run the string `s` through utf8proc's `utf8proc_map`
# with the option bits in `flags` and return the transformed text as a
# ByteString.  Raises an error carrying the message from `utf8proc_errmsg`
# when the library reports a negative result code.
let
    # One-element scratch array that receives the pointer to the output
    # buffer produced by utf8proc_map; the same array is reused by every call.
    const p = Array(Ptr{Uint8}, 1)
    global utf8proc_map
    function utf8proc_map(s::String, flags::Integer)
        # Pass the explicit byte length rather than relying on
        # UTF8PROC_NULLTERM: with NUL-terminated input the library stops at
        # the first NUL byte, which would silently drop everything after an
        # embedded '\0' in a Julia string.
        bytes = bytestring(s)
        result = ccall(:utf8proc_map, Cssize_t,
                       (Ptr{Uint8}, Cssize_t, Ptr{Ptr{Uint8}}, Cint),
                       bytes, sizeof(bytes), p, flags)
        result < 0 && error(bytestring(ccall(:utf8proc_errmsg, Ptr{Uint8},
                                             (Cssize_t,), result)))
        # Wrap the `result` output bytes at p[1] in a Vector{Uint8} (the
        # trailing `true` is the own-buffer argument of jl_ptr_to_array_1d),
        # then turn that array into a string.
        a = ccall(:jl_ptr_to_array_1d, Vector{Uint8},
                  (Any, Ptr{Uint8}, Csize_t, Cint),
                  Vector{Uint8}, p[1], result, true)
        ccall(:jl_array_to_string, Any, (Any,), a)::ByteString
    end
end
43+
44+
# normalize_string(s; keywords...): translate the boolean keyword options into
# utf8proc option bits and apply them to `s` via utf8proc_map.  All keywords
# default to false.  Throws ArgumentError when both `compose` and `decompose`
# are set, or when more than one newline conversion is requested.
function normalize_string(s::String;
                          stable::Bool=false, compat::Bool=false,
                          compose::Bool=false, decompose::Bool=false,
                          stripignore::Bool=false, rejectna::Bool=false,
                          newline2ls::Bool=false, newline2ps::Bool=false,
                          newline2lf::Bool=false, stripcc::Bool=false,
                          casefold::Bool=false, lump::Bool=false,
                          stripmark::Bool=false)
    # Reject contradictory requests before assembling any option bits.
    if compose && decompose
        throw(ArgumentError("compose=true and decompose=true cannot both be specified"))
    end
    if newline2ls + newline2ps + newline2lf > 1
        throw(ArgumentError("only one newline conversion may be specified"))
    end

    opts = 0
    if stable;      opts |= UTF8PROC_STABLE;    end
    if compat;      opts |= UTF8PROC_COMPAT;    end
    if compose;     opts |= UTF8PROC_COMPOSE;   end
    if decompose;   opts |= UTF8PROC_DECOMPOSE; end
    if stripignore; opts |= UTF8PROC_IGNORE;    end
    if rejectna;    opts |= UTF8PROC_REJECTNA;  end
    if newline2ls;  opts |= UTF8PROC_NLF2LS;    end
    if newline2ps;  opts |= UTF8PROC_NLF2PS;    end
    if newline2lf;  opts |= UTF8PROC_NLF2LF;    end
    if stripcc;     opts |= UTF8PROC_STRIPCC;   end
    if casefold;    opts |= UTF8PROC_CASEFOLD;  end
    if lump;        opts |= UTF8PROC_LUMP;      end
    if stripmark
        # Mark stripping is always paired with a (de)composition pass:
        # composition is switched on unless decomposition was requested.
        opts |= UTF8PROC_STRIPMARK
        if !decompose
            opts |= UTF8PROC_COMPOSE
        end
    end

    utf8proc_map(s, opts)
end
65+
66+
# normalize_string(s, nf): normalize `s` to one of the four Unicode normal
# forms, selected by the symbol `nf` (:NFC, :NFD, :NFKC or :NFKD).  Any other
# symbol raises an ArgumentError before the string is touched.
function normalize_string(s::String, nf::Symbol)
    if nf == :NFC
        opts = UTF8PROC_STABLE | UTF8PROC_COMPOSE
    elseif nf == :NFD
        opts = UTF8PROC_STABLE | UTF8PROC_DECOMPOSE
    elseif nf == :NFKC
        opts = UTF8PROC_STABLE | UTF8PROC_COMPOSE | UTF8PROC_COMPAT
    elseif nf == :NFKD
        opts = UTF8PROC_STABLE | UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT
    else
        throw(ArgumentError(":$nf is not one of :NFC, :NFD, :NFKC, :NFKD"))
    end
    utf8proc_map(s, opts)
end
75+
76+
# returns UTF8PROC_CATEGORY code in 0..30 giving Unicode category
77+
function category_code(c)
    # Only code points in 0..0x10FFFF are handed to utf8proc_get_property
    # (see the utf8proc_get_property docs for its argument range).  Anything
    # outside that range -- negative values as well as values past the
    # Unicode maximum -- is answered with 0 without calling the library.
    0 <= c <= 0x10FFFF || return 0x0000
    # The category code is read as the leading Uint16 of the property record
    # that the library returns a pointer to.
    # note: utf8proc returns 0, not UTF8PROC_CATEGORY_CN, for unassigned c
    unsafe_load(ccall(:utf8proc_get_property, Ptr{Uint16}, (Int32,), c))
end
82+
83+
# A code point counts as assigned exactly when category_code(c) is nonzero.
is_assigned_char(c) = category_code(c) != 0
84+
85+
# TODO: use UTF8PROC_CHARBOUND to extract graphemes from a string, e.g. to iterate over graphemes?
86+
87+
end # module

doc/stdlib/base.rst

+35
Original file line numberDiff line numberDiff line change
@@ -945,6 +945,37 @@ Strings
945945

946946
Convert a string to a contiguous UTF-8 string (all characters must be valid UTF-8 characters).
947947

948+
.. function:: normalize_string(s, normalform::Symbol)
949+
950+
Normalize the string ``s`` according to one of the four "normal
951+
forms" of the Unicode standard: ``normalform`` can be ``:NFC``,
952+
``:NFD``, ``:NFKC``, or ``:NFKD``. Normal forms C (canonical
953+
composition) and D (canonical decomposition) convert different
954+
visually identical representations of the same abstract string into
955+
a single canonical form, with form C being more compact. Normal
956+
forms KC and KD additionally canonicalize "compatibility
957+
equivalents": they convert characters that are abstractly similar
958+
but visually distinct into a single canonical choice (e.g. they expand
959+
ligatures into the individual characters), with form KC being more compact.
960+
961+
Alternatively, finer control and additional transformations may be
962+
obtained by calling ``normalize_string(s; keywords...)``, where
963+
any number of the following boolean keyword options (which all default
964+
to ``false``) are specified:
965+
966+
* ``compose=true`` or ``decompose=true``: canonical composition or decomposition, respectively
967+
* ``compat=true``: compatibility equivalents are canonicalized
968+
* ``casefold=true``: perform Unicode case folding, e.g. for case-insensitive string comparison
969+
* ``lump=true``: non-standard canonicalization of various similar-looking characters into a single ASCII character, as defined by the utf8proc library (e.g. fraction and division slashes, space characters, dash characters, etcetera)
970+
* ``newline2lf=true``, ``newline2ls=true``, or ``newline2ps=true``: convert various newline sequences (LF, CRLF, CR, NEL) into a linefeed (LF), line-separation (LS), or paragraph-separation (PS) character, respectively
971+
* ``stripmark=true``: strip diacritical marks (e.g. accents)
972+
* ``stripignore=true``: strip Unicode's "default ignorable" characters (e.g. the soft hyphen or the left-to-right mark)
973+
* ``stripcc=true``: strip control characters; horizontal tabs and form feeds are converted to spaces; newlines are also converted to spaces unless a newline-conversion flag was specified
974+
* ``rejectna=true``: throw an error if unassigned code points are found
975+
* ``stable=true``: enforce Unicode Versioning Stability
976+
977+
For example, NFKC corresponds to the options ``compose=true, compat=true, stable=true``.
978+
948979
.. function:: is_valid_ascii(s) -> Bool
949980

950981
Returns true if the string or byte vector is valid ASCII, false otherwise.
@@ -957,6 +988,10 @@ Strings
957988

958989
Returns true if the given char or integer is a valid Unicode code point.
959990

991+
.. function:: is_assigned_char(c) -> Bool
992+
993+
Returns true if the given char or integer is an assigned Unicode code point.
994+
960995
.. function:: ismatch(r::Regex, s::String) -> Bool
961996

962997
Test whether a string contains a match of the given regular expression.

0 commit comments

Comments
 (0)