Skip to content

Commit

Permalink
Rearrangements (brucemiller#2387)
Browse files Browse the repository at this point in the history
* Recognize more TeX font names; note that fonts loaded as 'scaled' are stored as-if 'at'; make mathDefault font use OT1 encoding; make relativeTo also compare encoding; new Font->asFontinfo to return simulated fontinfo for eventually better integration

* Have \chardef revert to \char only for definitions from main source, otherwise use the cmd itself; similar for \mathchardef

* Slightly more robust coding

* Corrections to INITEX initializations of mathcode

* Corrected \sfcode default

* Moved dash & quote ligatures and helpers from plain.pool

* Moved cdots,ldots ligatures from plain.pool

* Moved alignLine helper from plain.pool

* More TeX-like allocation,\newcount, etc; initialized more fonts; initialized \mathcodes; removed helpers and ligatures to core Engine pool files

* accommodate change to allocators

* Retract two changes that are too early

* Fix typo for scaled font

* Assign allocated_boxes globally

* use the utilities that we've just defined

* Give clue when alignment of a line is dropped

* Revert "Slightly more robust coding"
which really was changes to Fontmap that weren't yet ready for inclusion

This reverts commit 8144738.
  • Loading branch information
brucemiller authored Aug 12, 2024
1 parent 40e9fa8 commit bc2d8f6
Show file tree
Hide file tree
Showing 9 changed files with 218 additions and 143 deletions.
109 changes: 67 additions & 42 deletions lib/LaTeXML/Common/Font.pm
Original file line number Diff line number Diff line change
Expand Up @@ -60,44 +60,53 @@ my $FLAG_EMPH = 0x10;
# NOTE: This probably doesn't really belong in here...

my %font_family = (
cmr => { family => 'serif' }, cmss => { family => 'sansserif' },
cmtt => { family => 'typewriter' }, cmvtt => { family => 'typewriter' },
cmt => { family => 'serif' }, # for cmti "text italic"
cmfib => { family => 'serif' }, cmfr => { family => 'serif' },
cmdh => { family => 'serif' }, cm => { family => 'serif' },
ptm => { family => 'serif' }, ppl => { family => 'serif' },
pnc => { family => 'serif' }, pbk => { family => 'serif' },
phv => { family => 'sansserif' }, pag => { family => 'serif' },
pcr => { family => 'typewriter' }, pzc => { family => 'script' },
put => { family => 'serif' }, bch => { family => 'serif' },
psy => { family => 'symbol' }, pzd => { family => 'dingbats' },
ccr => { family => 'serif' }, ccy => { family => 'symbol' },
cmbr => { family => 'sansserif' }, cmtl => { family => 'typewriter' },
cmbrs => { family => 'symbol' }, ul9 => { family => 'typewriter' },
txr => { family => 'serif' }, txss => { family => 'sansserif' },
txtt => { family => 'typewriter' }, txms => { family => 'symbol' },
txsya => { family => 'symbol' }, txsyb => { family => 'symbol' },
pxr => { family => 'serif' }, pxms => { family => 'symbol' },
pxsya => { family => 'symbol' }, pxsyb => { family => 'symbol' },
futs => { family => 'serif' },
uaq => { family => 'serif' }, ugq => { family => 'sansserif' },
eur => { family => 'serif' }, eus => { family => 'script' },
euf => { family => 'fraktur' }, euex => { family => 'symbol' },
cmr => { family => 'serif' },
cmss => { family => 'sansserif' },
cmssq => { family => 'sansserif' }, # quote style?
cmssqi => { family => 'sansserif', shape => 'italic' }, # quote style?
cmtt => { family => 'typewriter' }, cmvtt => { family => 'typewriter' },
cmt => { family => 'serif' }, # for cmti "text italic"
cmfib => { family => 'serif' },
cmfr => { family => 'serif' },
cm => { family => 'serif' },
cmdh => { family => 'serif' },
cmr => { family => 'serif' },
cmdunh => { family => 'serif' }, # like cmr10 but with tall body heights
cmu => { family => 'serif' }, # unslanted italic ??
ptm => { family => 'serif' }, ppl => { family => 'serif' },
pnc => { family => 'serif' }, pbk => { family => 'serif' },
phv => { family => 'sansserif' }, pag => { family => 'serif' },
pcr => { family => 'typewriter' }, pzc => { family => 'script' },
put => { family => 'serif' }, bch => { family => 'serif' },
psy => { family => 'symbol' }, pzd => { family => 'dingbats' },
ccr => { family => 'serif' }, ccy => { family => 'symbol' },
cmbr => { family => 'sansserif' }, cmtl => { family => 'typewriter' },
cmbrs => { family => 'symbol' }, ul9 => { family => 'typewriter' },
txr => { family => 'serif' }, txss => { family => 'sansserif' },
txtt => { family => 'typewriter' }, txms => { family => 'symbol' },
txsya => { family => 'symbol' }, txsyb => { family => 'symbol' },
pxr => { family => 'serif' }, pxms => { family => 'symbol' },
pxsya => { family => 'symbol' }, pxsyb => { family => 'symbol' },
futs => { family => 'serif' },
uaq => { family => 'serif' }, ugq => { family => 'sansserif' },
eur => { family => 'serif' }, eus => { family => 'script' },
euf => { family => 'fraktur' }, euex => { family => 'symbol' },
# The following are actually math fonts.
ms => { family => 'symbol' },
ccm => { family => 'serif', shape => 'italic' },
cmm => { family => 'italic', encoding => 'OML' },
cmex => { family => 'symbol', encoding => 'OMX' }, # Not really symbol, but...
cmsy => { family => 'symbol', encoding => 'OMS' },
ccitt => { family => 'typewriter', shape => 'italic' },
cmbrm => { family => 'sansserif', shape => 'italic' },
futm => { family => 'serif', shape => 'italic' },
futmi => { family => 'serif', shape => 'italic' },
txmi => { family => 'serif', shape => 'italic' },
pxmi => { family => 'serif', shape => 'italic' },
bbm => { family => 'blackboard' },
bbold => { family => 'blackboard' },
bbmss => { family => 'blackboard' },
ms => { family => 'symbol' },
ccm => { family => 'serif', shape => 'italic' },
cmm => { family => 'math', shape => 'italic', encoding => 'OML' },
cmex => { family => 'symbol', encoding => 'OMX' }, # Not really symbol, but...
cmsy => { family => 'symbol', encoding => 'OMS' },
ccitt => { family => 'typewriter', shape => 'italic' },
cmsltt => { family => 'typewriter', shape => 'slanted' },
cmbrm => { family => 'sansserif', shape => 'italic' },
futm => { family => 'serif', shape => 'italic' },
futmi => { family => 'serif', shape => 'italic' },
txmi => { family => 'serif', shape => 'italic' },
pxmi => { family => 'serif', shape => 'italic' },
bbm => { family => 'blackboard' },
bbold => { family => 'blackboard' },
bbmss => { family => 'blackboard' },
# some ams fonts
cmmib => { family => 'italic', series => 'bold' },
cmbsy => { family => 'symbol', series => 'bold' },
Expand Down Expand Up @@ -177,16 +186,17 @@ sub decodeFontname {
if (my $ffam = lookupFontFamily($fam)) { map { $props{$_} = $$ffam{$_} } keys %$ffam; }
if (my $fser = lookupFontSeries($ser)) { map { $props{$_} = $$fser{$_} } keys %$fser; }
if (my $fsh = lookupFontShape($shp)) { map { $props{$_} = $$fsh{$_} } keys %$fsh; }
$size = 1 unless $size; # Yes, also if 0, "" (from regexp)
$size = $at if defined $at;
$size *= $scaled if defined $scaled;
$size = 1 unless $size; # Yes, also if 0, "" (from regexp)
$size = $at if defined $at;
$size = $size * $scaled if defined $scaled;
$props{name} = $name;
$props{size} = $size;
# Experimental Hack !?!?!?
$props{encoding} = 'OT1' unless defined $props{encoding};
$props{at} = $at . "pt" if defined $at;
return %props; }
else {
return; } }
Info('unrecognized', 'font', undef, "Unrecognized fontname '$name'");
return (family => $name, size => DEFSIZE()); } }

sub lookupTeXFont {
my ($fontname, $seriescode, $shapecode) = @_;
Expand Down Expand Up @@ -266,6 +276,7 @@ sub stringify {
no warnings 'recursion';
my ($self) = @_;
my ($fam, $ser, $shp, $siz, $col, $bkg, $opa, $enc, $lang, $mstyle, $flags) = @$self;
# !!!!!
$fam = 'serif' if $fam && ($fam eq 'math');
return 'Font[' . join(',', map { Stringify($_) } grep { $_ }
(isDiff($fam, $DEFFAMILY) ? ($fam) : ()),
Expand All @@ -280,6 +291,16 @@ sub stringify {
)
. ']'; }

# Build a Fontinfo-style hash reference from this Font object.
# The first ten slots of the underlying array are exposed under the keys
#   family, series, shape, size, color, background, opacity,
#   encoding, language, mathstyle
# (any trailing slot, such as the flags, is not included).
# A false encoding is reported as 'OT1'.
# This is a step towards a more integrated representation of Fonts that
# accommodates both low-level TeX-like commands and higher-level CSS-like ones.
sub asFontinfo {
  my ($self) = @_;
  my @keys = qw(family series shape size color background opacity
    encoding language mathstyle);
  my %info;
  @info{@keys} = @$self[0 .. $#keys];
  $info{encoding} ||= 'OT1';    # default encoding when none was recorded
  return \%info; }

sub equals {
my ($self, $other) = @_;
return (defined $other) && ((ref $self) eq (ref $other))
Expand Down Expand Up @@ -328,6 +349,7 @@ sub relativeTo {
my ($self, $other) = @_;
my ($fam, $ser, $shp, $siz, $col, $bkg, $opa, $enc, $lang, $mstyle, $flags) = @$self;
my ($ofam, $oser, $oshp, $osiz, $ocol, $obkg, $oopa, $oenc, $olang, $omstyle, $oflags) = @$other;
# !!!!
$fam = 'serif' if $fam && ($fam eq 'math');
$ofam = 'serif' if $ofam && ($ofam eq 'math');
## my $emph = 0;
Expand Down Expand Up @@ -358,6 +380,9 @@ sub relativeTo {
(isDiff($opa, $oopa)
? (opacity => { value => $opa, properties => { opacity => $opa } })
: ()),
(isDiff($enc, $oenc)
? (encoding => { value => $enc, properties => { encoding => $enc } })
: ()),
(isDiff($lang, $olang)
? ('xml:lang' => { value => $lang, properties => { language => $lang } })
: ()),
Expand Down
14 changes: 9 additions & 5 deletions lib/LaTeXML/Core/Definition/CharDef.pm
Original file line number Diff line number Diff line change
Expand Up @@ -47,13 +47,17 @@ sub invoke {
my $mathglyph = $$self{mathglyph};
# A dilemma: If the \chardef were in a style file, you're prefer to revert to the $cs
# but if defined in the document source, better to use \char ###\relax, so it still "works"
if (defined $mathglyph) { # Must be a math char
my $src = $$self{locator} && $$self{locator}->toString;
my $local = $src && $src !~ /\.(?:sty|ltxml|ltxmlc)/; # Dumps currently have undefined src!
if (defined $mathglyph) { # Must be a math char
return Box($mathglyph, undef, undef,
Tokens(T_CS('\mathchar'), $value->revert, T_CS('\relax')),
($local ? Tokens(T_CS('\mathchar'), $value->revert, T_CS('\relax')) : $$self{cs}),
role => $$self{role}); }
else { # else text; but note defered font/encoding till digestion!
return Box(LaTeXML::Package::FontDecode($value->valueOf), undef, undef,
Tokens(T_CS('\char'), $value->revert, T_CS('\relax'))); } }
else { # else text; but note defered font/encoding till digestion!
my ($char, %props) = LaTeXML::Package::FontDecode($value->valueOf);
return Box($char, undef, undef,
($local ? Tokens(T_CS('\char'), $value->revert, T_CS('\relax')) : $$self{cs}),
%props); } }

sub equals {
my ($self, $other) = @_;
Expand Down
7 changes: 3 additions & 4 deletions lib/LaTeXML/Core/State.pm
Original file line number Diff line number Diff line change
Expand Up @@ -124,13 +124,12 @@ sub new {
$$self{delcode} = {};
$$self{tracing_definitions} = {};
# Initializations that INITEX would have set.
$$self{mathcode}{'.'} = [0];
for (my $c = ord('0') ; $c <= ord('9') ; $c++) {
$$self{mathcode}{ chr($c) } = [0x7000]; }
$$self{mathcode}{ chr($c) } = [0x7000 + $c]; }
for (my $c = ord('a') ; $c <= ord('z') ; $c++) {
my $C = $c + ord('A') - ord('a');
$$self{mathcode}{ chr($c) } = [0x7100];
$$self{mathcode}{ chr($C) } = [0x7100];
$$self{mathcode}{ chr($c) } = [0x7100 + $c];
$$self{mathcode}{ chr($C) } = [0x7100 + $C];
$$self{uccode}{ chr($c) } = [$C];
$$self{lccode}{ chr($C) } = [$c];
$$self{sfcode}{ chr($C) } = [999]; }
Expand Down
1 change: 1 addition & 0 deletions lib/LaTeXML/Engine/LaTeX.pool.ltxml
Original file line number Diff line number Diff line change
Expand Up @@ -4889,6 +4889,7 @@ DefConstructor('\@framebox[Dimension][]{}',
$document->setAttribute($c[0], $k => $v); } } } }
);

AssignValue(allocated_boxes => 0, 'global');
DefPrimitive('\newsavebox DefToken', sub {
my $n = LookupValue('allocated_boxes') + 1;
AssignValue(allocated_boxes => $n, 'global');
Expand Down
2 changes: 1 addition & 1 deletion lib/LaTeXML/Engine/TeX_Character.pool.ltxml
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ DefRegister('\catcode Number', Number(0),
# Not used anywhere (yet)
DefRegister('\sfcode Number', Number(0),
getter => sub { my $code = $STATE->lookupSFcode(chr($_[0]->valueOf));
Number(defined $code ? $code : 0); },
Number(defined $code ? $code : 1000); },
setter => sub { $STATE->assignSFcode(chr($_[2]->valueOf) => $_[0]->valueOf, $_[1]); });
DefRegister('\lccode Number', Number(0),
getter => sub { my $code = $STATE->lookupLCcode(chr($_[0]->valueOf));
Expand Down
33 changes: 33 additions & 0 deletions lib/LaTeXML/Engine/TeX_Fonts.pool.ltxml
Original file line number Diff line number Diff line change
Expand Up @@ -285,5 +285,38 @@ DeclareFontMap('OMX',
# [missing tips for horizontal curly braces]
"\x{2191}", "\x{2193}", undef, undef, undef, undef, "\x{21D1}", "\x{21D3}"]);

#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# TeX's ligatures handled by rewrite regexps.
# Note: applied in reverse order of definition (latest defined applied first!)
# Note also, these are only applied in text content, not in attributes!
# Font test for ligatures: true for any font whose family is not 'typewriter'.
sub nonTypewriter {
  my $font   = shift;
  my $family = $font->getFamily;
  return !($family eq 'typewriter'); }

# Font test for ligatures: true when the font's family is not 'typewriter'
# AND its encoding is OT1 or T1 (a missing/false encoding is treated as OT1).
sub nonTypewriterT1 {
  my $font     = shift;
  my $not_mono = ($font->getFamily ne 'typewriter');
  return $not_mono unless $not_mono;    # typewriter: encoding is never consulted
  my $encoding = $font->getEncoding || 'OT1';
  return $encoding =~ /^(OT1|T1)$/; }

# EN DASH (NOTE: With digits before & aft => \N{FIGURE DASH})
# Ligatures are applied in reverse order of definition, so the '---' rule
# (defined second) is tried before the '--' rule; presumably this ordering is
# what keeps an em dash from being consumed as an en dash plus a hyphen.
# Both rules are skipped for typewriter fonts (see nonTypewriter).
DefLigature(qr{--}, "\x{2013}", fontTest => \&nonTypewriter); # EN dash
DefLigature(qr{---}, "\x{2014}", fontTest => \&nonTypewriter); # EM dash

# Ligatures for doubled single left & right quotes to convert to double quotes
# [should ligatures be part of a font, in the first place? (it is in TeX!)]
# These match U+2018/U+2019 (curly single quotes) in the text and are
# restricted to non-typewriter fonts with OT1 or T1 encoding (see nonTypewriterT1).
DefLigature(qr{\x{2018}\x{2018}}, "\x{201C}", fontTest => \&nonTypewriterT1); # double left quote
DefLigature(qr{\x{2019}\x{2019}}, "\x{201D}", fontTest => \&nonTypewriterT1); # double right quote
# NOTE(review): UTF() is defined elsewhere; presumably it yields the character
# for the given code point (0xBF inverted question mark, 0xA1 inverted exclamation) -- confirm.
DefLigature(qr{\?\x{2018}}, UTF(0xBF), fontTest => \&nonTypewriterT1); # ? backquote
DefLigature(qr{!\x{2018}}, UTF(0xA1), fontTest => \&nonTypewriterT1); # ! backquote
# These ligatures are also handled by TeX.
# However, it appears that decent modern fonts in modern browsers handle these at that level.
# So it's likely not worth doing it at the conversion level, possibly adversely affecting search.
# DefLigature(qr{ff}, "\x{FB00}", fontTest => \&nonTypewriterT1);
# DefLigature(qr{fi}, "\x{FB01}", fontTest => \&nonTypewriterT1);
# DefLigature(qr{fl}, "\x{FB02}", fontTest => \&nonTypewriterT1);
# DefLigature(qr{ffi}, "\x{FB03}", fontTest => \&nonTypewriterT1);
# DefLigature(qr{ffl}, "\x{FB04}", fontTest => \&nonTypewriterT1);

# Three consecutive periods become U+2026 (horizontal ellipsis), except in typewriter fonts.
DefLigature(qr{\.\.\.}, "\x{2026}", fontTest => \&nonTypewriter); # ldots

#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1;
9 changes: 9 additions & 0 deletions lib/LaTeXML/Engine/TeX_Math.pool.ltxml
Original file line number Diff line number Diff line change
Expand Up @@ -1193,5 +1193,14 @@ DefConstructor('\lx@eqno{}',
"^ <ltx:tags><ltx:tag><ltx:Math><ltx:XMath>#1</ltx:XMath></ltx:Math></ltx:tag></ltx:tags>",
reversion => '');

#======================================================================
# Pretest for XMath to keep from interpreting math that the DOM may not allow!!
# (Earlier rewrite-based approach, kept for reference.)
##DefMathRewrite(xpath=>'descendant-or-self::ltx:XMath',match=>'\cdot\cdot\cdot',replace=>'\cdots');

# Math ligature: three consecutive U+22C5 (dot operator) characters are
# replaced by the single character U+22EF, given the name 'cdots'.
DefMathLigature("\x{22C5}\x{22C5}\x{22C5}" => "\x{22EF}", role => 'ID', name => 'cdots');

# (Earlier rewrite-based approach, kept for reference.)
#DefMathRewrite(xpath=>'descendant-or-self::ltx:XMath',match=>'...',replace=>'\ldots');
# Math ligature: three consecutive periods are replaced by U+2026
# (horizontal ellipsis), given the name 'ldots'.
DefMathLigature("..." => "\x{2026}", role => 'ID', name => 'ldots');

#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1;
13 changes: 13 additions & 0 deletions lib/LaTeXML/Engine/TeX_Paragraph.pool.ltxml
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,19 @@ DefConstructorI('\noindent', undef, sub {
# Otherwise ignore.
return; });

# Insert $line into $document, tagged with the CSS class 'ltx_align_<alignment>'.
#   * If an ltx:p can be opened, the line is wrapped in an ltx:p carrying the class.
#   * Otherwise, if an ltx:text can be opened, the line is wrapped in an ltx:text
#     carrying the class, followed by an ltx:break.
#   * Otherwise the alignment is dropped: an Info message is issued and the line
#     is simply absorbed into the document.
# Always returns nothing.
sub alignLine {
  my ($document, $line, $alignment) = @_;
  my $class = 'ltx_align_' . $alignment;
  if ($document->isOpenable('ltx:p')) {
    $document->insertElement('ltx:p', $line, class => $class);
    return; }
  if ($document->isOpenable('ltx:text')) {
    $document->insertElement('ltx:text', $line, class => $class);
    $document->insertElement('ltx:break');
    return; }
  # Neither wrapper is allowed here; report the loss but keep the content.
  Info('unexpected', 'alignment', $document,
    "Lost requested alignment '$alignment'; no suitable element");
  $document->absorb($line);
  return; }

# <ltx:para> represents a Logical Paragraph, whereas <ltx:p> is a `physical paragraph'.
# A para can contain both p and displayed equations and such.

Expand Down
Loading

0 comments on commit bc2d8f6

Please sign in to comment.