Skip to content

Commit

Permalink
Merge pull request #5576 from stevengj/utf8proc
Browse files Browse the repository at this point in the history
RFC: export utf8proc Unicode transformation functionality in Julia
  • Loading branch information
stevengj committed Feb 3, 2014
2 parents 793d769 + a500b83 commit 7e5a31d
Show file tree
Hide file tree
Showing 6 changed files with 202 additions and 2 deletions.
2 changes: 0 additions & 2 deletions base/char.jl
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
# Convert x to a Char via `convert`.
char(x) = convert(Char, x)
# Floating-point input is rounded to an integer with `iround` first.
char(x::FloatingPoint) = char(iround(x))

# true unless c lies in the range '\ud800'..'\udfff' or is greater than '\U10ffff'
is_valid_char(c) = !('\ud800' <= c <= '\udfff' || '\U10ffff' < c)

# Numeric value of a Char, obtained with `int` / `uint` respectively.
integer(x::Char) = int(x)
unsigned(x::Char) = uint(x)

Expand Down
2 changes: 2 additions & 0 deletions base/exports.jl
Original file line number Diff line number Diff line change
Expand Up @@ -765,6 +765,7 @@ export
hex2bytes,
ind2chr,
info,
is_assigned_char,
is_valid_ascii,
is_valid_char,
is_valid_utf8,
Expand Down Expand Up @@ -792,6 +793,7 @@ export
matchall,
ndigits,
nextind,
normalize_string,
oct,
parsefloat,
parseint,
Expand Down
2 changes: 2 additions & 0 deletions base/sysimg.jl
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,8 @@ include("utf8.jl")
include("utf16.jl")
include("iobuffer.jl")
include("string.jl")
include("utf8proc.jl")
importall .UTF8proc
include("regex.jl")
include("base64.jl")
importall .Base64
Expand Down
89 changes: 89 additions & 0 deletions base/utf8proc.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
# Various Unicode functionality from the utf8proc library
module UTF8proc

import Base: show, showcompact, ==, string, symbol, isless, hash

# also exported by Base:
export normalize_string, is_valid_char, is_assigned_char

# Report whether the code point `c` is valid Unicode, as judged by utf8proc's
# utf8proc_codepoint_valid; `c` is handed to the library as an Int32 and the
# returned Cchar is converted to a Bool.
function is_valid_char(c)
    status = ccall(:utf8proc_codepoint_valid, Cchar, (Int32,), c)
    return bool(status)
end

# Option bits understood by utf8proc_map.  Every option is a distinct power of
# two so that several of them can be combined with `|`.
const UTF8PROC_NULLTERM  = 1 << 0    # OR-ed into every utf8proc_map call in the original wrapper
const UTF8PROC_STABLE    = 1 << 1    # keyword `stable`
const UTF8PROC_COMPAT    = 1 << 2    # keyword `compat`
const UTF8PROC_COMPOSE   = 1 << 3    # keyword `compose`
const UTF8PROC_DECOMPOSE = 1 << 4    # keyword `decompose`
const UTF8PROC_IGNORE    = 1 << 5    # keyword `stripignore`
const UTF8PROC_REJECTNA  = 1 << 6    # keyword `rejectna`
const UTF8PROC_NLF2LS    = 1 << 7    # keyword `newline2ls`
const UTF8PROC_NLF2PS    = 1 << 8    # keyword `newline2ps`
# keyword `newline2lf`: both newline bits set at once
const UTF8PROC_NLF2LF    = UTF8PROC_NLF2LS | UTF8PROC_NLF2PS
const UTF8PROC_STRIPCC   = 1 << 9    # keyword `stripcc`
const UTF8PROC_CASEFOLD  = 1 << 10   # keyword `casefold`
const UTF8PROC_CHARBOUND = 1 << 11   # not used by any function in this module yet
const UTF8PROC_LUMP      = 1 << 12   # keyword `lump`
const UTF8PROC_STRIPMARK = 1 << 13   # keyword `stripmark`

# Apply the utf8proc transformation selected by `flags` (a bitwise-or of
# utf8proc option bits) to the string `s` and return the result as a new
# ByteString.
#
# Throws an ErrorException carrying the text from utf8proc_errmsg when
# utf8proc_map reports a negative status.
function utf8proc_map(s::String, flags::Integer)
    str = bytestring(s)
    # Per-call slot that receives the address of the output buffer; nothing is
    # shared between calls.
    p = Array(Ptr{Uint8}, 1)
    # The byte length is passed explicitly (instead of length 0 plus the
    # NUL-terminated-input option) so that input containing embedded NUL
    # characters is processed in full rather than cut at the first NUL.
    result = ccall(:utf8proc_map, Cssize_t,
                   (Ptr{Uint8}, Cssize_t, Ptr{Ptr{Uint8}}, Cint),
                   str, length(str.data), p, flags)
    result < 0 && error(bytestring(ccall(:utf8proc_errmsg, Ptr{Uint8},
                                         (Cssize_t,), result)))
    # Wrap the `result` bytes at p[1] as a Vector{Uint8}; the trailing `true`
    # is the "own" argument of jl_ptr_to_array_1d (Julia takes over the buffer).
    a = ccall(:jl_ptr_to_array_1d, Vector{Uint8},
              (Any, Ptr{Uint8}, Csize_t, Cint),
              Vector{Uint8}, p[1], result, true)
    ccall(:jl_array_to_string, Any, (Any,), a)::ByteString
end

# Keyword form of normalize_string: each boolean keyword switches on the
# corresponding utf8proc option, and the combined options are handed to
# utf8proc_map.
#
# Throws ArgumentError when `compat` or `stripmark` is requested while both
# `compose` and `decompose` are false, or when more than one of the
# newline-conversion keywords is set.
function normalize_string(s::String;
                          stable::Bool=false, compat::Bool=false,
                          compose::Bool=true, decompose::Bool=false,
                          stripignore::Bool=false, rejectna::Bool=false,
                          newline2ls::Bool=false, newline2ps::Bool=false,
                          newline2lf::Bool=false, stripcc::Bool=false,
                          casefold::Bool=false, lump::Bool=false,
                          stripmark::Bool=false)
    # Validate first; the two checks are kept in this order so that the same
    # message wins when both conditions are violated.
    if !decompose && !compose && (compat || stripmark)
        throw(ArgumentError("compat=true or stripmark=true require compose=true or decompose=true"))
    end
    if newline2ls + newline2ps + newline2lf > 1
        throw(ArgumentError("only one newline conversion may be specified"))
    end

    # decompose takes precedence over compose
    opts = decompose ? UTF8PROC_DECOMPOSE : (compose ? UTF8PROC_COMPOSE : 0)
    stable      && (opts |= UTF8PROC_STABLE)
    compat      && (opts |= UTF8PROC_COMPAT)
    stripignore && (opts |= UTF8PROC_IGNORE)
    rejectna    && (opts |= UTF8PROC_REJECTNA)
    newline2ls  && (opts |= UTF8PROC_NLF2LS)
    newline2ps  && (opts |= UTF8PROC_NLF2PS)
    newline2lf  && (opts |= UTF8PROC_NLF2LF)
    stripcc     && (opts |= UTF8PROC_STRIPCC)
    casefold    && (opts |= UTF8PROC_CASEFOLD)
    lump        && (opts |= UTF8PROC_LUMP)
    stripmark   && (opts |= UTF8PROC_STRIPMARK)
    return utf8proc_map(s, opts)
end

# Normalize `s` to the normal form named by `nf`, one of :NFC, :NFD, :NFKC or
# :NFKD.  Every form sets the STABLE option; the K forms additionally set
# COMPAT.  Any other symbol raises ArgumentError before utf8proc_map is called.
function normalize_string(s::String, nf::Symbol)
    if nf == :NFC
        opts = UTF8PROC_STABLE | UTF8PROC_COMPOSE
    elseif nf == :NFD
        opts = UTF8PROC_STABLE | UTF8PROC_DECOMPOSE
    elseif nf == :NFKC
        opts = UTF8PROC_STABLE | UTF8PROC_COMPOSE | UTF8PROC_COMPAT
    elseif nf == :NFKD
        opts = UTF8PROC_STABLE | UTF8PROC_DECOMPOSE | UTF8PROC_COMPAT
    else
        throw(ArgumentError(":$nf is not one of :NFC, :NFD, :NFKC, :NFKD"))
    end
    return utf8proc_map(s, opts)
end

# returns UTF8PROC_CATEGORY code in 0..30 giving Unicode category
#
# `c` may be a Char or an integer.  Values outside 0..0x10FFFF yield 0 without
# calling into utf8proc.
function category_code(c)
    # note: utf8proc returns 0, not UTF8PROC_CATEGORY_CN, for unassigned c
    # Guard both ends of the code point range before the lookup: the original
    # checked only the upper bound, so a negative integer reached
    # utf8proc_get_property (see utf8proc_get_property docs).
    (c < 0 || c > 0x10FFFF) && return 0x0000
    # the Uint16 loaded from the start of the returned property record is
    # used as the category code
    unsafe_load(ccall(:utf8proc_get_property, Ptr{Uint16}, (Int32,), c))
end

# true if `c` has a nonzero category code, i.e. is an assigned code point
is_assigned_char(c) = category_code(c) != 0

# TODO: use UTF8PROC_CHARBOUND to extract graphemes from a string, e.g. to iterate over graphemes?

end # module
36 changes: 36 additions & 0 deletions doc/stdlib/base.rst
Original file line number Diff line number Diff line change
Expand Up @@ -951,6 +951,38 @@ Strings

Convert a string to a contiguous UTF-8 string (all characters must be valid UTF-8 characters).

.. function:: normalize_string(s, normalform::Symbol)

Normalize the string ``s`` according to one of the four "normal
forms" of the Unicode standard: ``normalform`` can be ``:NFC``,
``:NFD``, ``:NFKC``, or ``:NFKD``. Normal forms C (canonical
composition) and D (canonical decomposition) convert different
visually identical representations of the same abstract string into
a single canonical form, with form C being more compact. Normal
forms KC and KD additionally canonicalize "compatibility
equivalents": they convert characters that are abstractly similar
but visually distinct into a single canonical choice (e.g. they expand
ligatures into the individual characters), with form KC being more compact.

Alternatively, finer control and additional transformations may
be obtained by calling ``normalize_string(s; keywords...)``, where
any number of the following boolean keyword options (which all default
to ``false`` except for ``compose``) are specified:

* ``compose=false``: do not perform canonical composition
* ``decompose=true``: do canonical decomposition instead of canonical composition (``compose=true`` is ignored if present)
* ``compat=true``: compatibility equivalents are canonicalized
* ``casefold=true``: perform Unicode case folding, e.g. for case-insensitive string comparison
* ``lump=true``: non-standard canonicalization of various similar-looking characters into a single ASCII character, as defined by the utf8proc library (e.g. fraction and division slashes, space characters, dash characters, etcetera)
* ``newline2lf=true``, ``newline2ls=true``, or ``newline2ps=true``: convert various newline sequences (LF, CRLF, CR, NEL) into a linefeed (LF), line-separation (LS), or paragraph-separation (PS) character, respectively
* ``stripmark=true``: strip diacritical marks (e.g. accents)
* ``stripignore=true``: strip Unicode's "default ignorable" characters (e.g. the soft hyphen or the left-to-right marker)
* ``stripcc=true``: strip control characters; horizontal tabs and form feeds are converted to spaces; newlines are also converted to spaces unless a newline-conversion flag was specified
* ``rejectna=true``: throw an error if unassigned code points are found
* ``stable=true``: enforce Unicode Versioning Stability

For example, NFKC corresponds to the options ``compose=true, compat=true, stable=true``.

.. function:: is_valid_ascii(s) -> Bool

Returns true if the string or byte vector is valid ASCII, false otherwise.
Expand All @@ -963,6 +995,10 @@ Strings

Returns true if the given char or integer is a valid Unicode code point.

.. function:: is_assigned_char(c) -> Bool

Returns true if the given char or integer is an assigned Unicode code point.

.. function:: ismatch(r::Regex, s::String) -> Bool

Test whether a string contains a match of the given regular expression.
Expand Down
73 changes: 73 additions & 0 deletions test/strings.jl
Original file line number Diff line number Diff line change
Expand Up @@ -857,3 +857,76 @@ for T = (Uint8,Int8,Uint16,Int16,Uint32,Int32,Uint64,Int64,Uint128,Int128,BigInt
n = T != BigInt ? rand(T) : BigInt(rand(Int128))
@test parseint(T,base(b,n),b) == n
end

# normalize_string (Unicode normalization etc.):
# the four named normal forms
@test normalize_string("\u006e\u0303", :NFC) == "\u00f1"
@test "\u006e\u0303" == normalize_string("\u00f1", :NFD)
@test normalize_string("\ufb00", :NFC) != "ff"
@test normalize_string("\ufb00", :NFKC) == "ff"
@test normalize_string("\u006e\u0303\ufb00", :NFKC) == "\u00f1"*"ff"
@test normalize_string("\u00f1\ufb00", :NFKD) == "\u006e\u0303"*"ff"
# keyword form
@test normalize_string("\u006e\u0303", compose=true) == "\u00f1"
@test "\u006e\u0303" == normalize_string("\u00f1", decompose=true)
@test normalize_string("\u006e\u0303\u00b5",compat=true) == "\u00f1\u03bc"
@test normalize_string("Σσς",casefold=true) == "σσσ"
@test normalize_string("∕⁄", lump=true) == "//"
# the input holds four newline sequences: LF, LF, CR, CR+LF
@test normalize_string("\ua\n\r\r\ua", newline2lf=true) == "\ua\ua\ua\ua"
@test normalize_string("\ua\n\r\r\ua", newline2ls=true) == "\u2028\u2028\u2028\u2028"
@test normalize_string("\ua\n\r\r\ua", newline2ps=true) == "\u2029\u2029\u2029\u2029"
@test normalize_string("\u00f1", stripmark=true) == "n"
@test isempty(normalize_string("\u00ad", stripignore=true))
# stripcc turns the tab into a space, and (with no newline-conversion flag)
# the CR into a space as well, so two spaces are expected
@test normalize_string("\t\r", stripcc=true) == "  "
@test normalize_string("\t\r", stripcc=true, newline2ls=true) == " \u2028"

#Tests from Unicode Standard Annex #15, "Unicode Normalization Forms"
#http://www.unicode.org/reports/tr15/
# Non-ASCII characters are spelled as \u escapes so that the literals survive
# editors and tools that drop or normalize such characters.

#1. Canonical equivalence
# NOTE(review): this adds an == method for Array{Char} that stays in effect for
# the statements that follow (until it is redefined in section 2).
==(a::Array{Char},b::Array{Char}) =
    normalize_string(string(a...), :NFC)==normalize_string(string(b...), :NFC)
# combining sequence: C + combining cedilla vs. precomposed C-cedilla
@test ['C', '\u0327'] == ['\u00c7']
# ordering of combining marks: dot above / dot below in either order
@test ['q', '\u0307', '\u0323'] == ['q', '\u0323', '\u0307']
# Hangul syllable vs. conjoining jamo
@test ['\uac00'] == ['\u1100', '\u1161']
# ohm sign vs. Greek capital omega
@test ['\u2126'] == ['\u03a9']

#2. Compatibility Equivalence
==(a::Array{Char},b::Array{Char}) =
    normalize_string(string(a...), :NFKC)==normalize_string(string(b...), :NFKC)
# font variants of H
@test ['\u210c'] == ['\u210d'] == ['H']
# positional (cursive) forms of Arabic noon
@test ['\ufee8'] == ['\ufee7'] == ['\ufee6'] == ['\ufee5']
# circled digit one
@test ['\u2460'] == ['1']
# halfwidth vs. fullwidth katakana ka
@test ['\uff76'] == ['\u30ab']
# rotated (vertical presentation) left curly bracket
@test ['\ufe37'] == ['{']
# superscript nine vs. subscript nine
@test ['\u2079'] == ['\u2089']
# squared katakana word vs. its four separate characters
@test ['\u3300'] == ['\u30a2', '\u30d1', '\u30fc', '\u30c8']
# vulgar fraction one quarter vs. 1, fraction slash, 4
@test ['\u00bc'] == ['1', '\u2044', '4']
# dz-with-caron digraph vs. d followed by z-with-caron
@test ['\u01c6'] == ['d', '\u017e']

#3. Singletons
# U+212B becomes "A" + U+030A under NFD and U+00C5 under NFC;
# U+2126 becomes U+03A9 under both forms.
@test normalize_string("\U212b", :NFD) == "A\U030a"
@test normalize_string("\U212b", :NFC) == "\U00c5"
@test normalize_string("\U2126", :NFC) == normalize_string("\U2126", :NFD) == "\U03a9"

#4. Canonical Composites
# a precomposed character is unchanged by NFC and split into base + mark by NFD
@test normalize_string("\U00c5", :NFC) == "\U00c5"
@test normalize_string("\U00c5", :NFD) == "A\U030a"
@test normalize_string("\U00f4", :NFC) == "\U00f4"
@test normalize_string("\U00f4", :NFD) == "o\U0302"

#5. Multiple Combining Marks
# the expected outputs always place U+0323 before U+0307
@test normalize_string("\U1e69", :NFD) == "s\U0323\U0307"
@test normalize_string("\U1e69", :NFC) == "\U1e69"
@test normalize_string("\U1e0b\U0323", :NFD) == "d\U0323\U0307"
@test normalize_string("\U1e0b\U0323", :NFC) == "\U1e0d\U0307"
@test normalize_string("q\U0307\U0323", :NFC) == "q\U0323\U0307"
@test normalize_string("q\U0307\U0323", :NFD) == "q\U0323\U0307"

#6. Compatibility Composites
# NFC/NFD leave U+FB01 and U+2075 as they are; only NFKC/NFKD replace them
@test normalize_string("\Ufb01", :NFD) == normalize_string("\Ufb01", :NFC) == "\Ufb01"
@test normalize_string("\Ufb01", :NFKD) == normalize_string("\Ufb01", :NFKC) == "fi"
@test normalize_string("2\U2075", :NFD) == normalize_string("2\U2075", :NFC) == "2\U2075"
@test normalize_string("2\U2075", :NFKD) == normalize_string("2\U2075", :NFKC) == "25"
@test normalize_string("\U1e9b\U0323", :NFD) == "\U017f\U0323\U0307"
@test normalize_string("\U1e9b\U0323", :NFC) == "\U1e9b\U0323"
@test normalize_string("\U1e9b\U0323", :NFKD) == "s\U0323\U0307"
@test normalize_string("\U1e9b\U0323", :NFKC) == "\U1e69"

0 comments on commit 7e5a31d

Please sign in to comment.