-
Notifications
You must be signed in to change notification settings - Fork 139
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
8 changed files
with
10,584 additions
and
10,087 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,6 +9,8 @@ | |
*.dylib | ||
*.dSYM | ||
*.txt | ||
*.ttf | ||
*.sfd | ||
*.out | ||
bench/bench | ||
bench/icu | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,151 @@ | ||
# Following work by @jiahao, we compute character widths using a combination of | ||
# * advance widths from GNU Unifont (advance width 512 = 1 en) | ||
# * UAX 11: East Asian Width | ||
# * a few exceptions as needed | ||
# Adapted from http://nbviewer.ipython.org/gist/jiahao/07e8b08bf6d8671e9734 | ||
# | ||
# Requires Julia (obviously) and FontForge. | ||
|
||
############################################################################# | ||
# Widths from GNU Unifont | ||
|
||
# Make sure both GNU Unifont builds are available locally: fetch each TrueType
# file that is missing, then have FontForge re-save it as SFD text so that the
# glyph metrics can be read with plain line-oriented parsing.
universion="7.0.06"
for fontfile in ["unifont-$universion", "unifont_upper-$universion"]
    if !isfile("$fontfile.ttf")
        download("http://unifoundry.com/pub/unifont-$universion/font-builds/$fontfile.ttf", "$fontfile.ttf")
    end
    if !isfile("$fontfile.sfd")
        run(`fontforge -lang=ff -c "Open(\"$fontfile.ttf\");Save(\"$fontfile.sfd\");Quit(0);"`)
    end
end
|
||
# Read the FontForge SFD file `filename` and record the advance width of every
# glyph that has a non-negative Unicode code point.
#
# Arguments:
#   filename   - path of the .sfd file to read
#   CharWidths - table (code point => advance width) to add to; a fresh one is
#                created when omitted, so several fonts can be merged by
#                passing the result of one call into the next
# Returns the (mutated) CharWidths table.
#
# Within a glyph record the lines of interest look like
#   StartChar: nonmarkingreturn
#   Encoding: 65538 -1 2        (third field = Unicode code point, -1 if none)
#   Width: 1024
function parsesfd(filename::String, CharWidths::Dict{Int,Int}=Dict{Int,Int}())
    # The do-block guarantees the file is closed, even if parsing throws.
    open(filename) do io
        # These must live outside the loop body: the Encoding and Width lines
        # of one glyph are seen in different iterations.
        inglyph = false
        codepoint = nothing
        width = nothing
        for line in eachline(io)
            if contains(line, "StartChar: ")
                # Always start from a clean slate, so data left over from a
                # glyph that was never recorded cannot leak into this one.
                inglyph = true
                codepoint = nothing
                width = nothing
            elseif inglyph
                contains(line, "Encoding:") && (codepoint = parse(Int, split(line)[3]))
                contains(line, "Width:") && (width = parse(Int, split(line)[2]))
                if codepoint !== nothing && width !== nothing
                    # Glyphs without a Unicode code point (negative value) are
                    # skipped rather than recorded.
                    codepoint >= 0 && (CharWidths[codepoint] = width)
                    inglyph = false
                end
            end
        end
    end
    return CharWidths
end
# Parse the base font first, then let the upper-plane font add its glyphs to
# the very same table.
CharWidths=parsesfd("unifont_upper-$universion.sfd", parsesfd("unifont-$universion.sfd"))

# Advance widths are expressed in units of 512 per en; reduce every entry to a
# whole number of character cells.
for c in keys(CharWidths)
    CharWidths[c] = div(CharWidths[c], 512)
end
|
||
############################################################################# | ||
# Widths from UAX #11: East Asian Width | ||
|
||
# Fetch the UAX #11 data file unless a local copy already exists.
isfile("EastAsianWidth.txt") || download("http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt", "EastAsianWidth.txt")

# Each data line reads `XXXX;code` or `XXXX..YYYY;code`, optionally followed by
# a `#` comment. The do-block closes the file even if a line fails to parse.
open("EastAsianWidth.txt") do io
    for line in eachline(io)
        # Strip comments. Working on the stripped text also skips blank and
        # comment-only lines without indexing into a possibly empty string.
        precomment = strip(split(line, '#')[1])
        isempty(precomment) && continue
        # Parse code point range and width code
        tokens = split(precomment, ';')
        length(tokens) >= 2 || continue
        charrange = strip(tokens[1])
        width = strip(tokens[2])
        # Parse code point range into its first and last code point
        rangetokens = split(charrange, "..")
        charstart = parse(UInt32, "0x"*rangetokens[1])
        charend = parse(UInt32, "0x"*rangetokens[length(rangetokens)>1 ? 2 : 1])

        # The width depends only on the code, so decide it once per line
        width=="N" && continue #Ignore neutral characters
        w = (width=="W" || width=="F") ? 2 : #Wide or full
            (width=="Na"|| width=="H" || width=="A") ? 1 : #Narrow or half or ambiguous (default to narrow in non-East-Asian contexts, which we can assume to be the default)
            error("Unknown East Asian width code: $width for code point: $charstart")

        #Assign widths
        for c in charstart:charend
            CharWidths[c] = w
        end
    end
end
|
||
############################################################################# | ||
# A few exceptions to the above cases, found by manual comparison | ||
# to other wcwidth functions. | ||
|
||
# Use ../libutf8proc for category codes, rather than the one in Julia, | ||
# to minimize bootstrapping complexity when a new version of Unicode comes out. | ||
# Return the first UInt16 of the property record that utf8proc_get_property
# yields for `c` (compared against the UTF8PROC_CATEGORY_* codes by the callers
# below), or 0x0000 when `c` lies beyond the last code point 0x10FFFF.
function catcode(c)
    if uint(c) > 0x10FFFF
        return 0x0000 # see utf8proc_get_property docs
    end
    prop = ccall((:utf8proc_get_property,"../libutf8proc"), Ptr{UInt16}, (Int32,), c)
    return unsafe_load(prop)
end
|
||
|
||
# use Base.UTF8proc module to get category codes constants, since | ||
# we aren't going to change these in utf8proc. | ||
import Base.UTF8proc | ||
|
||
# Make sure format control characters (category Cf) have width 0, with the
# exception of the Arabic characters in the 0x06xx block
# (see Unicode standard 6.2, sec. 8.2).
for c in keys(CharWidths)
    catcode(c)==UTF8proc.UTF8PROC_CATEGORY_CF || continue
    if !(c in [0x0601,0x0602,0x0603,0x06dd])
        CharWidths[c]=0
    end
end
|
||
# By definition these take up no width on the current line:
#   0x2028 LINE SEPARATOR (category Zl)
#   0x2029 PARAGRAPH SEPARATOR (category Zp)
for c in (0x2028, 0x2029)
    CharWidths[c]=0
end

# By definition narrow, i.e. the width of one en space:
#   0x202f NARROW NO-BREAK SPACE (category Zs)
CharWidths[0x202f]=1

# By definition wide, i.e. the width of one em space:
#   0x2001 EM QUAD (category Zs)
#   0x2003 EM SPACE (category Zs)
for c in (0x2001, 0x2003)
    CharWidths[c]=2
end
|
||
############################################################################# | ||
# Non-printable control characters will be assigned a width of zero | ||
# (wcwidth returns -1 for such characters) | ||
|
||
# A category counts as printable unless it is one of: unassigned (Cn),
# surrogate (Cs), control (Cc), or the invalid code 0.
isprintable_category(category) =
    category!=UTF8proc.UTF8PROC_CATEGORY_CN &&   # Unassigned
    category!=UTF8proc.UTF8PROC_CATEGORY_CS &&   # Surrogate
    category!=UTF8proc.UTF8PROC_CATEGORY_CC &&   # Control
    category!=0                                  # Invalid

# A code point is printable when it is in range, is a valid character, and its
# category (looked up through catcode) is a printable one.
function isprintable(c::Union(Char,Integer))
    c <= 0x10ffff || return false
    is_valid_char(c) || return false
    return isprintable_category(catcode(c))
end
|
||
# Question: should we just use Julia's isprint algorithm here? It is different, | ||
# though it is also based on the character category. | ||
|
||
############################################################################# | ||
# Output (to a file or pipe) for processing by data_generator.rb: one line per
# maximal run of consecutive code points sharing a width, written as
# `FIRST..LAST; width`, or `FIRST; width` for a run of length one.
# NOTE(review): runs of width zero are printed too, although an earlier note
# here said they would be left out as the default -- confirm what the consumer
# expects.

firstc = 0x000000
lastv = 0
uhex(c) = uppercase(hex(c,4))
# 0x110000 is one past the last code point; it is visited only to flush the
# final run.
for c in 0x0000:0x110000
    v = isprintable(c) ? get(CharWidths, c, 0) : 0
    # Still inside the current run: nothing to print yet
    (v == lastv && c != 0x110000) && continue
    v < 4 || error("invalid charwidth $v for $c")
    span = firstc+1 < c ? string(uhex(firstc), "..", uhex(c-1)) : uhex(firstc)
    println(span, "; ", lastv)
    firstc = c
    lastv = v
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.