export utf8proc functionality in Julia (followup to #5462 and #5434)

stevengj · stevengj · commit 9e5ce630e8ed · 2014-02-01T14:54:46.000-05:00
diff --git a/base/char.jl b/base/char.jl
@@ -1,8 +1,6 @@
 char(x) = convert(Char, x)
 char(x::FloatingPoint) = char(iround(x))
 
-is_valid_char(c) = !('\ud800' <= c <= '\udfff' || '\U10ffff' < c)
-
 integer(x::Char) = int(x)
 unsigned(x::Char) = uint(x)
 
diff --git a/base/exports.jl b/base/exports.jl
@@ -766,6 +766,7 @@ export
     hex2bytes,
     ind2chr,
     info,
+    is_assigned_char,
     is_valid_ascii,
     is_valid_char,
     is_valid_utf8,
@@ -793,6 +794,7 @@ export
     matchall,
     ndigits,
     nextind,
+    normalize_string,
     oct,
     parsefloat,
     parseint,
diff --git a/base/sysimg.jl b/base/sysimg.jl
@@ -75,6 +75,8 @@ include("utf8.jl")
 include("utf16.jl")
 include("iobuffer.jl")
 include("string.jl")
+include("utf8proc.jl")
+importall .UTF8proc
 include("regex.jl")
 include("base64.jl")
 importall .Base64
diff --git a/base/utf8proc.jl b/base/utf8proc.jl
@@ -0,0 +1,87 @@
+# Various Unicode functionality from the utf8proc library
+module UTF8proc
+
+import Base: show, showcompact, ==, string, symbol, isless, hash
+
+# also exported by Base:
+export normalize_string, is_valid_char, is_assigned_char
+
+# whether c (a Char or integer, passed to C as an Int32) is a valid Unicode codepoint, per utf8proc
+is_valid_char(c) = bool(ccall(:utf8proc_codepoint_valid, Cchar, (Int32,), c))
+
+const UTF8PROC_NULLTERM  = (1<<0) # always OR-ed into the flags by the Julia utf8proc_map wrapper
+const UTF8PROC_STABLE    = (1<<1)
+const UTF8PROC_COMPAT    = (1<<2)
+const UTF8PROC_COMPOSE   = (1<<3)
+const UTF8PROC_DECOMPOSE = (1<<4)
+const UTF8PROC_IGNORE    = (1<<5)
+const UTF8PROC_REJECTNA  = (1<<6)
+const UTF8PROC_NLF2LS    = (1<<7)
+const UTF8PROC_NLF2PS    = (1<<8)
+const UTF8PROC_NLF2LF    = (UTF8PROC_NLF2LS | UTF8PROC_NLF2PS) # union of the LS and PS bits, not a bit of its own
+const UTF8PROC_STRIPCC   = (1<<9)
+const UTF8PROC_CASEFOLD  = (1<<10)
+const UTF8PROC_CHARBOUND = (1<<11) # not used by this module yet; see the grapheme TODO at the end
+const UTF8PROC_LUMP      = (1<<12)
+const UTF8PROC_STRIPMARK = (1<<13)
+
+let # private scope: p is visible only to utf8proc_map
+    const p = Array(Ptr{Uint8}, 1) # one-element buffer, shared by every call, that receives the output pointer
+    global utf8proc_map # define the function in the enclosing module rather than local to the let
+    function utf8proc_map(s::String, flags::Integer) # apply the transformations selected by flags to s; returns a ByteString
+        result = ccall(:utf8proc_map, Cssize_t,
+                       (Ptr{Uint8}, Cssize_t, Ptr{Ptr{Uint8}}, Cint),
+                       bytestring(s), 0, p, flags | UTF8PROC_NULLTERM) # length 0: NULLTERM is forced on, so the input is presumably read up to its NUL -- TODO confirm
+        result < 0 && error(bytestring(ccall(:utf8proc_errmsg, Ptr{Uint8}, # a negative result is an error code; turn it into a message
+                                             (Cssize_t,), result)))
+        a = ccall(:jl_ptr_to_array_1d, Vector{Uint8}, 
+                  (Any, Ptr{Uint8}, Csize_t, Cint),
+                  Vector{Uint8}, p[1], result, true) # wrap p[1] as a result-byte vector; the final true presumably hands the buffer to Julia -- TODO confirm
+        ccall(:jl_array_to_string, Any, (Any,), a)::ByteString
+    end
+end
+
+function normalize_string(s::String; stable::Bool=false, compat::Bool=false, compose::Bool=false, decompose::Bool=false, stripignore::Bool=false, rejectna::Bool=false, newline2ls::Bool=false, newline2ps::Bool=false, newline2lf::Bool=false, stripcc::Bool=false, casefold::Bool=false, lump::Bool=false, stripmark::Bool=false)
+    # Reject mutually exclusive keywords first, then OR together one UTF8PROC_* bit per keyword.
+    compose && decompose && throw(ArgumentError("compose=true and decompose=true cannot both be specified"))
+    newline2ls + newline2ps + newline2lf > 1 && throw(ArgumentError("only one newline conversion may be specified"))
+    opts = 0
+    stable      && (opts |= UTF8PROC_STABLE)
+    compat      && (opts |= UTF8PROC_COMPAT)
+    compose     && (opts |= UTF8PROC_COMPOSE)
+    decompose   && (opts |= UTF8PROC_DECOMPOSE)
+    stripignore && (opts |= UTF8PROC_IGNORE)
+    rejectna    && (opts |= UTF8PROC_REJECTNA)
+    newline2ls  && (opts |= UTF8PROC_NLF2LS)
+    newline2ps  && (opts |= UTF8PROC_NLF2PS)
+    newline2lf  && (opts |= UTF8PROC_NLF2LF)
+    stripcc     && (opts |= UTF8PROC_STRIPCC)
+    casefold    && (opts |= UTF8PROC_CASEFOLD)
+    lump        && (opts |= UTF8PROC_LUMP)
+    # stripmark also switches on COMPOSE unless decompose was requested
+    stripmark   && (opts |= UTF8PROC_STRIPMARK | (decompose ? 0 : UTF8PROC_COMPOSE))
+    utf8proc_map(s, opts)
+end
+
+function normalize_string(s::String, nf::Symbol)
+    # Every normal form is STABLE plus a compose/decompose bit; the "K" forms also add COMPAT.
+    nf in (:NFC, :NFD, :NFKC, :NFKD) ||
+        throw(ArgumentError(":$nf is not one of :NFC, :NFD, :NFKC, :NFKD"))
+    opts = UTF8PROC_STABLE
+    opts |= (nf == :NFC || nf == :NFKC) ? UTF8PROC_COMPOSE : UTF8PROC_DECOMPOSE
+    (nf == :NFKC || nf == :NFKD) && (opts |= UTF8PROC_COMPAT)
+    utf8proc_map(s, opts)
+end
+    
+# returns UTF8PROC_CATEGORY code in 0..30 giving Unicode category (0 for any c outside 0..0x10FFFF)
+function category_code(c)
+    # note: utf8proc returns 0, not UTF8PROC_CATEGORY_CN, for unassigned c
+    (c < 0 || c > 0x10FFFF) && return 0x0000 # utf8proc_get_property requires 0 <= c <= 0x10FFFF (see its docs)
+    unsafe_load(ccall(:utf8proc_get_property, Ptr{Uint16}, (Int32,), c))
+end
+
+is_assigned_char(c) = category_code(c) != 0 # category code 0 marks an unassigned or out-of-range c
+
+# TODO: use UTF8PROC_CHARBOUND to extract graphemes from a string, e.g. to iterate over graphemes?
+
+end # module
diff --git a/doc/stdlib/base.rst b/doc/stdlib/base.rst
@@ -945,6 +945,37 @@ Strings
 
    Convert a string to a contiguous UTF-8 string (all characters must be valid UTF-8 characters).
 
+.. function:: normalize_string(s, normalform::Symbol)
+
+   Normalize the string ``s`` according to one of the four "normal
+   forms" of the Unicode standard: ``normalform`` can be ``:NFC``,
+   ``:NFD``, ``:NFKC``, or ``:NFKD``.  Normal forms C (canonical
+   composition) and D (canonical decomposition) convert different
+   visually identical representations of the same abstract string into
+   a single canonical form, with form C being more compact.  Normal
+   forms KC and KD additionally canonicalize "compatibility
+   equivalents": they convert characters that are abstractly similar
+   but visually distinct into a single canonical choice (e.g. they expand
+   ligatures into the individual characters), with form KC being more compact.
+
+   Alternatively, finer control and additional transformations may be
+   obtained by calling ``normalize_string(s; keywords...)``, where
+   any number of the following boolean keyword options (which all default
+   to ``false``) are specified:
+
+   * ``compose=true`` or ``decompose=true``: canonical composition or decomposition, respectively
+   * ``compat=true``: compatibility equivalents are canonicalized
+   * ``casefold=true``: perform Unicode case folding, e.g. for case-insensitive string comparison
+   * ``lump=true``: non-standard canonicalization of various similar-looking characters into a single ASCII character, as defined by the utf8proc library (e.g. fraction and division slashes, space characters, dash characters, etc.)
+   * ``newline2lf=true``, ``newline2ls=true``, or ``newline2ps=true``: convert various newline sequences (LF, CRLF, CR, NEL) into a linefeed (LF), line-separation (LS), or paragraph-separation (PS) character, respectively
+   * ``stripmark=true``: strip diacritical marks (e.g. accents)
+   * ``stripignore=true``: strip Unicode's "default ignorable" characters (e.g. the soft hyphen or the left-to-right marker)
+   * ``stripcc=true``: strip control characters; horizontal tabs and form feeds are converted to spaces; newlines are also converted to spaces unless a newline-conversion flag was specified
+   * ``rejectna=true``: throw an error if unassigned code points are found
+   * ``stable=true``: enforce Unicode Versioning Stability
+
+   For example, NFKC corresponds to the options ``compose=true, compat=true, stable=true``.
+
 .. function:: is_valid_ascii(s) -> Bool
 
    Returns true if the string or byte vector is valid ASCII, false otherwise.
@@ -957,6 +988,10 @@ Strings
 
    Returns true if the given char or integer is a valid Unicode code point.
 
+.. function:: is_assigned_char(c) -> Bool
+
+   Returns true if the given char or integer is an assigned Unicode code point.
+
 .. function:: ismatch(r::Regex, s::String) -> Bool
 
    Test whether a string contains a match of the given regular expression.