Skip to content

Commit

Permalink
Encodings (brucemiller#2435)
Browse files Browse the repository at this point in the history
* Fix mapping for ^,~ which should be accents

* Correct more fontmap entries for ^ and ~ which are actually accents

* Fix ^ and ~ in test cases

* Patch up accents on i,j and dotless i,j

* Careful about missing base letter in \accent; Careful to use pack (not chr) in lcToken,ucToken for 128--256

* Add math properties for &, small triangles

* CharDef has optional encoding to use instead of current font/encoding; have \DeclareTextSymbol use specified encoding

* If input is unicode, FontDecodeString should NOT use upper half of FontMaps but assume already Unicode

* Watch for short FontMaps

* Watch for comment tokens when parsing siunits

* Try for more portable babel/french test across various texlives

* Fix typo in babel test setup

* Fix confusion between inverted breve (U+0311) and double inverted breve (U+0361 for \t tie) and update tests

* double inverted breve in OML encoding as well

* Create a predicate defn->isCharDef for CharDefs

* Improved \accent to accept <optional assignments> and properly recognize the <character>; enhance the test case
  • Loading branch information
brucemiller authored Nov 20, 2024
1 parent f2b2a7e commit 9ec6a41
Show file tree
Hide file tree
Showing 28 changed files with 291 additions and 172 deletions.
3 changes: 3 additions & 0 deletions lib/LaTeXML/Core/Definition.pm
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,9 @@ sub isRegister {
# Default predicate: a generic definition is not a font definition, so this
# returns the empty string (false).
sub isFontDef { # ONLY FontDef handles this!
return ''; }

# Default predicate: a generic definition is not a \chardef-style definition,
# so this returns the empty string (false).
# LaTeXML::Core::Definition::CharDef overrides it to return 1.
sub isCharDef { # ONLY CharDef handles this!
return ''; }

# Default predicate: returns 0 (false). Presumably overridden by definitions
# that act as assignment prefixes -- the override is not visible here.
sub isPrefix {
return 0; }

Expand Down
21 changes: 15 additions & 6 deletions lib/LaTeXML/Core/Definition/CharDef.pm
Original file line number Diff line number Diff line change
Expand Up @@ -18,24 +18,29 @@ use LaTeXML::Common::Error;
use LaTeXML::Core::Token;
use LaTeXML::Core::Tokens;
use LaTeXML::Core::Box;
use LaTeXML::Util::Unicode;
use base qw(LaTeXML::Core::Definition::Register);

# A CharDef is a specialized register;
# You can't assign it; when you invoke the control sequence, it returns
# the result of evaluating the character (more like a regular primitive).
# When $mode is 'math', interprets $value as a (3-part) mathcode, otherwise just index into current font.
# When $mathglyph is provided, it is the unicode corresponding to the \mathchar of $value
# Optionally provide the encoding, otherwise use current encoding when digested.
sub new {
my ($class, $cs, $mode, $value) = @_;
my ($class, $cs, $mode, $value, $encoding) = @_;
return bless { cs => $cs, parameters => undef,
mode => $mode, value => $value,
mode => $mode, value => $value, encoding => $encoding,
registerType => 'Number', readonly => 1,
locator => $STATE->getStomach->getGullet->getMouth->getLocator }, $class; }

# Returns the stored `value` field of this CharDef, unchanged.
sub valueOf {
my ($self) = @_;
return $$self{value}; }

# Identifies this definition as a CharDef (returns 1), so that callers can
# tell it apart from other register definitions.
sub isCharDef {
return 1; }

sub setValue {
my ($self, $value, $scope) = @_;
Error('unexpected', $self, undef, "Can't assign to chardef " . $self->getCSName);
Expand All @@ -50,11 +55,15 @@ sub invoke {
my $src = $$self{locator} && $$self{locator}->toString;
my $local = $src && $src !~ /\.(?:sty|ltxml|ltxmlc)/; # Dumps currently have undefined src!
if ($$self{mode} eq 'text') { # text; but note defered font/encoding till digestion!
# Decode the codepoint using current font & encoding
my ($glyph, $adjfont) = LaTeXML::Package::FontDecode($nvalue);
## Decode the codepoint using requested encoding ELSE current font & encoding
my ($glyph, $adjfont) = LaTeXML::Package::FontDecode($nvalue, $$self{encoding});
my %props = ();
if ($STATE->lookupValue('IN_MATH')) { # Add math properties if IN math (even for text \chardef)
my $charinfo = unicode_math_properties($glyph);
%props = %$charinfo if $charinfo; }
return Box($glyph, $adjfont, undef,
($local ? Tokens(T_CS('\char'), $value->revert, T_CS('\relax')) : $$self{cs})); }
else { # Else math mode, mathDecode!
($local ? Tokens(T_CS('\char'), $value->revert, T_CS('\relax')) : $$self{cs}), %props); }
else { # Else math mode, mathDecode!
my ($glyph, $f, $rev, %props) = LaTeXML::Package::decodeMathChar($nvalue);
if (!defined $props{name}) { # Synthesize name attribute from CS, if needed (Clarify purpose of name!)
my $n = $self->getCSName;
Expand Down
5 changes: 1 addition & 4 deletions lib/LaTeXML/Engine/LaTeX.pool.ltxml
Original file line number Diff line number Diff line change
Expand Up @@ -2765,17 +2765,14 @@ DefMacro('\ProvideTextCommandDefault DefToken', '\ProvideTextCommand{#1}{?}');

DefPrimitive('\DeclareTextSymbol DefToken {}{Number}', sub {
my ($stomach, $cs, $encoding, $code) = @_;
$code = $code->valueOf;
my $css = ToString($cs);
$encoding = ToString(Expand($encoding));
if (isDefinable($cs)) { # If not already defined...
DefMacroI($cs, undef,
'\expandafter\ifx\csname\cf@encoding\string' . $css . '\endcsname\relax\csname?\string' . $css . '\endcsname'
. '\else\csname\cf@encoding\string' . $css . '\endcsname\fi'); }
my $ecs = T_CS('\\' . $encoding . $css);
DefPrimitiveI($ecs, undef, sub {
my ($glyph, $adjfont) = FontDecode($code, $encoding);
Box($glyph, $adjfont, undef, $cs); });
$STATE->installDefinition(LaTeXML::Core::Definition::CharDef->new($ecs, 'text', $code, $encoding));
return; });

DefPrimitive('\DeclareTextSymbolDefault DefToken {}', sub {
Expand Down
53 changes: 43 additions & 10 deletions lib/LaTeXML/Engine/TeX_Character.pool.ltxml
Original file line number Diff line number Diff line change
Expand Up @@ -60,8 +60,14 @@ sub applyAccent {
my $locator = $box->getLocator;
my $font = $box->getFont;
my $string = $box->toString;
$string =~ tr/\x{0131}\x{0237}/ij/;
$string =~ s/\s/ /g;
# In Unicode (but not always (La)TeX), overaccents generally mask the dots of i,j.
# So we replace dotless so NFC can normalize better.
if (my $entry = unicode_accent($standalonechar)) {
if (($$entry{role} || '') eq 'OVERACCENT') {
$string =~ tr/\x{0131}\x{0237}/ij/; } } # Replace dotless i,j with dotted version
if (($string =~ /[ij]/) && ($combiningchar eq "\x{0307}")) { # a dot on i,j Not needed
$combiningchar = ''; }
my @letters = split(//, $string);
return Box(($string =~ /^\s*$/
? $standalonechar
Expand All @@ -81,19 +87,46 @@ sub DefAccent {
protected => 1);
return; }

# This will fail if there really are "assignments" after the number! (See TeX Book)
# We're given a number pointing into the font; the FontMap presumably has the standalone char.
# If there's no letter to be accented, just use the standalone.
# \accent <number> <optional assignments><character>; See TeX Book p.287
# <assignments>: (<prefix>) simple assignment or macro assignment
# <character> : letter, other, \char, \chardef token, \noboundary
# Eventually, we're given a number pointing into the font;
# the FontMap presumably has the standalone char, to use if there is no base letter
# Otherwise, use the Util::Unicode module to find the appropriate combining character
DefPrimitive('\accent Number {}', sub {
my ($stomach, $num, $letter) = @_;
DefPrimitive('\accent Number', sub {
my ($stomach, $num) = @_;
my $gullet = $stomach->getGullet;
# Decode & Fetch the accent BEFORE processing any "assignments"
my $n = $num->valueOf;
my ($glyph, $adjfont) = FontDecode($n);
my @assignments = ();
## Check for (& accumulate) various kinds of <assignments>
my ($token, $cc, $defn);
while (($token = $gullet->readXNonSpace)
&& ($defn = $STATE->lookupDefinition($token))
&& ($defn->isPrefix
|| $defn->isFontDef
|| ($defn->isRegister && !$defn->isCharDef)
|| ($token->getString =~ /^\\(?:def|edef|gdef|xdef)$/))) {
push(@assignments, $stomach->invokeToken($token)); }
## Check for various kinds of <character>
my $letter = Tokens();
if (!$token) { }
elsif ((($cc = $token->getCatcode) == CC_LETTER) || ($cc == CC_OTHER)
|| ($defn && $defn->isCharDef)) {
$letter = $token; }
elsif ($token->equals(T_CS('\char'))) {
$letter = Tokens(Invocation($token, $gullet->readNumber)); }
elsif ($token->equals(T_CS('\noboundary'))) { } # Treat as empty
else {
$gullet->unread($token); } # No appropriate <character> ? Treat as empty
my $result;
if (my $entry = unicode_accent($glyph)) {
applyAccent($stomach, $letter, $$entry{combiner}, $$entry{standalone},
$result = applyAccent($stomach, $letter, $$entry{combiner}, $$entry{standalone},
Invocation(T_CS('\accent'), $num, $letter)); }
else { # Unknown accent ? Attempt to OVERLAY the accent on top of $letter
Digest(Tokens(T_CS('\lx@overlay'), T_BEGIN, $letter, T_END, T_BEGIN, T_OTHER($glyph), T_END)); } });
$result = Digest(Tokens(T_CS('\lx@overlay'), T_BEGIN, $letter, T_END, T_BEGIN, T_OTHER($glyph), T_END)); }
return (@assignments, $result); });

#======================================================================
# \chardef iq provides an alternate way to define a control sequence that returns a character.
Expand All @@ -118,12 +151,12 @@ DefPrimitive('\chardef Token SkipSpaces SkipMatch:=', sub {
sub ucToken {
my ($token) = @_;
my $code = $STATE->lookupUCcode($token->getString);
return ((defined $code) && ($code != 0) ? Token(chr($code), $token->getCatcode) : $token); }
return ((defined $code) && ($code != 0) ? Token(pack('U', $code), $token->getCatcode) : $token); }

sub lcToken {
my ($token) = @_;
my $code = $STATE->lookupLCcode($token->getString);
return ((defined $code) && ($code != 0) ? Token(chr($code), $token->getCatcode) : $token); }
return ((defined $code) && ($code != 0) ? Token(pack('U', $code), $token->getCatcode) : $token); }

# Note that these are NOT expandable, even though the "return" tokens!
DefPrimitive('\uppercase GeneralText', sub {
Expand Down
2 changes: 1 addition & 1 deletion lib/LaTeXML/Engine/TeX_Fonts.pool.ltxml
Original file line number Diff line number Diff line change
Expand Up @@ -224,7 +224,7 @@ DeclareFontMap('OML',
# p q r s t u v w
'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
# x y z dotless i dotless j weier-p arrow acc. inv.breve
'x', 'y', 'z', "\x{0131}", "j", "\x{2118}", "\x{2192}", UTF(0xA0) . "\x{0311}"]);
'x', 'y', 'z', "\x{0131}", "j", "\x{2118}", "\x{2192}", UTF(0xA0) . "\x{0361}"]);
DeclareFontMap('OMS',
[ #minus dot times ast divide diamond plus-minus minus-plus
"-", "\x{22C5}", UTF(0xD7), "\x{2217}", UTF(0xF7), "\x{22C4}", UTF(0xB1), "\x{2213}",
Expand Down
14 changes: 10 additions & 4 deletions lib/LaTeXML/Package.pm
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ use Unicode::Normalize;
use LaTeXML::Util::Unicode;
use Text::Balanced;
use Text::Unidecode;
use Encode;
use base qw(Exporter);
our @EXPORT = (qw(&DefAutoload &DefExpandable
&DefMacro &DefMacroI
Expand Down Expand Up @@ -2794,16 +2795,21 @@ sub FontDecodeString {
my ($string, $encoding, $implicit) = @_;
return if !defined $string;
my ($map, $font);
my $map_max = 256; # Up to 256 chars in FontMap
my $input_enc = $STATE->lookupValue('INPUT_ENCODING');
# BUT, if input was in utf8, we'll assume the upper half 128-256 is ALREADY unicode!
if ($input_enc && ($input_enc eq 'utf8')) {
$map_max = 128; }
if (!$encoding) {
$font = LookupValue('font');
$encoding = $font->getEncoding; }
if ($encoding && ($map = LoadFontMap($encoding))) { # OK got some map.
if ($encoding && ($map = LoadFontMap($encoding))) { # OK got some map.
my ($family, $fmap);
if ($font && ($family = $font->getFamily) && ($fmap = LookupValue($encoding . '_' . $family . '_fontmap'))) {
$map = $fmap; } } # Use the family specific map, if any.

$map = $fmap; } } # Use the family specific map, if any.
$map_max = 128 if $map && !defined($$map[128]); # ALSO for short font maps
return join('', grep { defined $_ }
map { ($implicit ? (($map && ($_ < 128)) ? $$map[$_] : pack('U', $_))
map { ($implicit ? (($map && ($_ < $map_max)) ? $$map[$_] : pack('U', $_))
: ($map ? $$map[$_] : undef)) }
map { ord($_) } split(//, $string)); }

Expand Down
2 changes: 2 additions & 0 deletions lib/LaTeXML/Package/french.ldf.ltxml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ DefMacro('\nombre{}', '\@ifpackageloaded{numprint}{\numprint{#1}}{\ltx@orig@nomb
# Bindings registered through AtBeginDocument (presumably so they take effect
# at \begin{document} rather than when this file is loaded).
AtBeginDocument(sub {
# \degre is bound to \textdegree; \degres sets it in a 0.3em-wide \hbox.
Let('\degre', '\textdegree');
DefMacro('\degres', '\hbox to 0.3em{\degre}');
# \tild and \circonflexe are bound to the standalone ASCII tilde and circumflex.
Let('\tild', '\textasciitilde');
Let('\circonflexe', '\textasciicircum');
});

1;
Expand Down
62 changes: 31 additions & 31 deletions lib/LaTeXML/Package/ly1.fontmap.ltxml
Original file line number Diff line number Diff line change
Expand Up @@ -16,38 +16,38 @@ use warnings;
use LaTeXML::Package;

DeclareFontMap('LY1', [
undef, undef, undef, undef, "\x{2044}", "\x{02D9}", "\x{02DD}", "\x{02DB}",
"\x{FB02}", undef, undef, undef, "\x{FB01}", undef, undef, undef,
"\x{0131}", undef, UTF(0x60), UTF(0xB4), "\x{02C7}", "\x{02D8}", UTF(0xAF), "\x{02DA}",
UTF(0xB8), UTF(0xDF), UTF(0xE6), "\x{0153}", UTF(0xF8), UTF(0xC6), "\x{0152}", UTF(0xD8),
" ", "!", "\"", "#", "\$", "%", "&", "\x{2019}",
"(", ")", "*", "+", ",", "-", ".", "/",
"0", "1", "2", "3", "4", "5", "6", "7",
"8", "9", ":", ";", "<", "=", ">", "?",
"\@", "A", "B", "C", "D", "E", "F", "G",
"H", "I", "J", "K", "L", "M", "N", "O",
"P", "Q", "R", "S", "T", "U", "V", "W",
"X", "Y", "Z", "[", "\\", "]", "^", UTF(0x5F),
"\x{2018}", "a", "b", "c", "d", "e", "f", "g",
"h", "i", "j", "k", "l", "m", "n", "o",
"p", "q", "r", "s", "t", "u", "v", "w",
"x", "y", "z", "{", "|", "}", "~", "\x{2010}",
"\x{0141}", "'", "\x{201A}", "\x{0192}", "\x{201E}", "\x{2026}", "\x{2020}", "\x{2021}",
"^", "\x{2030}", "\x{0160}", "\x{2039}", "\x{0152}", "\x{017D}", UTF(0x5E), "-",
undef, undef, undef, undef, "\x{2044}", "\x{02D9}", "\x{02DD}", "\x{02DB}",
"\x{FB02}", undef, undef, undef, "\x{FB01}", undef, undef, undef,
"\x{0131}", undef, UTF(0x60), UTF(0xB4), "\x{02C7}", "\x{02D8}", UTF(0xAF), "\x{02DA}",
UTF(0xB8), UTF(0xDF), UTF(0xE6), "\x{0153}", UTF(0xF8), UTF(0xC6), "\x{0152}", UTF(0xD8),
" ", "!", "\"", "#", "\$", "%", "&", "\x{2019}",
"(", ")", "*", "+", ",", "-", ".", "/",
"0", "1", "2", "3", "4", "5", "6", "7",
"8", "9", ":", ";", "<", "=", ">", "?",
"\@", "A", "B", "C", "D", "E", "F", "G",
"H", "I", "J", "K", "L", "M", "N", "O",
"P", "Q", "R", "S", "T", "U", "V", "W",
"X", "Y", "Z", "[", "\\", "]", "\x{02C6}", UTF(0x5F),
"\x{2018}", "a", "b", "c", "d", "e", "f", "g",
"h", "i", "j", "k", "l", "m", "n", "o",
"p", "q", "r", "s", "t", "u", "v", "w",
"x", "y", "z", "{", "|", "}", "\x{02DC}", "\x{2010}",
"\x{0141}", "'", "\x{201A}", "\x{0192}", "\x{201E}", "\x{2026}", "\x{2020}", "\x{2021}",
"^", "\x{2030}", "\x{0160}", "\x{2039}", "\x{0152}", "\x{017D}", UTF(0x5E), "-",
"\x{0142}", "\x{2018}", "\x{2019}", "\x{201C}", "\x{201D}", "\x{2022}", "\x{2013}", "\x{2014}",
"~", "\x{2122}", "\x{0161}", "\x{203A}", "\x{0153}", "\x{017E}", UTF(0x7E), "\x{0178}",
undef, UTF(0xA1), UTF(0xA2), UTF(0xA3), UTF(0xA4), UTF(0xA5), UTF(0xA6), UTF(0xA7),
UTF(0xA8), UTF(0xA9), UTF(0xAA), UTF(0xAB), UTF(0xAC), undef, UTF(0xAE), UTF(0xAF),
UTF(0xB0), UTF(0xB1), UTF(0xB2), UTF(0xB3), UTF(0xB4), UTF(0xB5), UTF(0xB6), UTF(0xB7),
UTF(0xB8), UTF(0xB9), UTF(0xBA), UTF(0xBB), UTF(0xBC), UTF(0xBD), UTF(0xBE), UTF(0xBF),
UTF(0xC0), UTF(0xC1), UTF(0xC2), UTF(0xC3), UTF(0xC4), UTF(0xC5), UTF(0xC6), UTF(0xC7),
UTF(0xC8), UTF(0xC9), UTF(0xCA), UTF(0xCB), UTF(0xCC), UTF(0xCD), UTF(0xCE), UTF(0xCF),
UTF(0xD0), UTF(0xD1), UTF(0xD2), UTF(0xD3), UTF(0xD4), UTF(0xD5), UTF(0xD6), UTF(0xD7),
UTF(0xD8), UTF(0xD9), UTF(0xDA), UTF(0xDB), UTF(0xDC), UTF(0xDD), UTF(0xDE), UTF(0xDF),
UTF(0xE0), UTF(0xE1), UTF(0xE2), UTF(0xE3), UTF(0xE4), UTF(0xE5), UTF(0xE6), UTF(0xE7),
UTF(0xE8), UTF(0xE9), UTF(0xEA), UTF(0xEB), UTF(0xEC), UTF(0xED), UTF(0xEE), UTF(0xEF),
UTF(0xF0), UTF(0xF1), UTF(0xF2), UTF(0xF3), UTF(0xF4), UTF(0xF5), UTF(0xF6), UTF(0xF7),
UTF(0xF8), UTF(0xF9), UTF(0xFA), UTF(0xFB), UTF(0xFC), UTF(0xFD), UTF(0xFE), UTF(0xFF),
"\x{02DC}", "\x{2122}", "\x{0161}", "\x{203A}", "\x{0153}", "\x{017E}", UTF(0x7E), "\x{0178}",
undef, UTF(0xA1), UTF(0xA2), UTF(0xA3), UTF(0xA4), UTF(0xA5), UTF(0xA6), UTF(0xA7),
UTF(0xA8), UTF(0xA9), UTF(0xAA), UTF(0xAB), UTF(0xAC), undef, UTF(0xAE), UTF(0xAF),
UTF(0xB0), UTF(0xB1), UTF(0xB2), UTF(0xB3), UTF(0xB4), UTF(0xB5), UTF(0xB6), UTF(0xB7),
UTF(0xB8), UTF(0xB9), UTF(0xBA), UTF(0xBB), UTF(0xBC), UTF(0xBD), UTF(0xBE), UTF(0xBF),
UTF(0xC0), UTF(0xC1), UTF(0xC2), UTF(0xC3), UTF(0xC4), UTF(0xC5), UTF(0xC6), UTF(0xC7),
UTF(0xC8), UTF(0xC9), UTF(0xCA), UTF(0xCB), UTF(0xCC), UTF(0xCD), UTF(0xCE), UTF(0xCF),
UTF(0xD0), UTF(0xD1), UTF(0xD2), UTF(0xD3), UTF(0xD4), UTF(0xD5), UTF(0xD6), UTF(0xD7),
UTF(0xD8), UTF(0xD9), UTF(0xDA), UTF(0xDB), UTF(0xDC), UTF(0xDD), UTF(0xDE), UTF(0xDF),
UTF(0xE0), UTF(0xE1), UTF(0xE2), UTF(0xE3), UTF(0xE4), UTF(0xE5), UTF(0xE6), UTF(0xE7),
UTF(0xE8), UTF(0xE9), UTF(0xEA), UTF(0xEB), UTF(0xEC), UTF(0xED), UTF(0xEE), UTF(0xEF),
UTF(0xF0), UTF(0xF1), UTF(0xF2), UTF(0xF3), UTF(0xF4), UTF(0xF5), UTF(0xF6), UTF(0xF7),
UTF(0xF8), UTF(0xF9), UTF(0xFA), UTF(0xFB), UTF(0xFC), UTF(0xFD), UTF(0xFE), UTF(0xFF),
]);

1;
Expand Down
1 change: 1 addition & 0 deletions lib/LaTeXML/Package/siunitx.sty.ltxml
Original file line number Diff line number Diff line change
Expand Up @@ -389,6 +389,7 @@ sub six_apply_mathligatures {
my $repl;
if (@tokens && ($repl = $six_mathligatures{ $t->getCSName }{ $tokens[0]->getCSName })) {
shift(@tokens); push(@r, $repl); }
elsif ($t->getCatcode == CC_COMMENT) { }
else {
push(@r, $t); } }
return @r; }
Expand Down
Loading

0 comments on commit 9ec6a41

Please sign in to comment.