Skip to content

Commit e4b102e

Browse files
committed
improve displaystyle math recognition
1 parent 2d0dc90 commit e4b102e

File tree

1 file changed

+11
-6
lines changed

1 file changed

+11
-6
lines changed

pix2tex/dataset/extract_latex.py

+11 -6
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
import argparse
2+
import html
23
import os
34
import re
45
import numpy as np
@@ -10,7 +11,7 @@
1011
inline = re.compile(r'(\\\((.*?)(?<!\\)\\\))|(\\\[(.{%i,%i}?)(?<!\\)\\\])' % (1, MAX_CHARS))
1112
equation = re.compile(r'\\begin\{(equation|math|displaymath)\*?\}(.{%i,%i}?)\\end\{\1\*?\}' % (1, MAX_CHARS), re.S)
1213
align = re.compile(r'(\\begin\{(align|alignedat|alignat|flalign|eqnarray|aligned|split|gather)\*?\}(.{%i,%i}?)\\end\{\2\*?\})' % (1, MAX_CHARS), re.S)
13-
displaymath = re.compile(r'(\\displaystyle)(.{%i,%i}?)(\}(?:<|"))' % (1, MAX_CHARS))
14+
displaymath = re.compile(r'(?:\\displaystyle)(.{%i,%i}?)((?<!\\)\}?(?:\"|<))' % (1, MAX_CHARS), re.S)
1415
outer_whitespace = re.compile(
1516
r'^\\,|\\,$|^~|~$|^\\ |\\ $|^\\thinspace|\\thinspace$|^\\!|\\!$|^\\:|\\:$|^\\;|\\;$|^\\enspace|\\enspace$|^\\quad|\\quad$|^\\qquad|\\qquad$|^\\hspace{[a-zA-Z0-9]+}|\\hspace{[a-zA-Z0-9]+}$|^\\hfill|\\hfill$')
1617
label_names = [re.compile(r'\\%s\s?\{(.*?)\}' % s) for s in ['ref', 'cite', 'label', 'eqref']]
@@ -78,7 +79,7 @@ def find_math(s: str, wiki=False) -> List[str]:
7879
7980
Args:
8081
s (str): String to search
81-
wiki (bool, optional): Search for `\displaymath` as it can be found in the wikipedia page source code. Defaults to False.
82+
wiki (bool, optional): Search for `\displaystyle` as it can be found in the wikipedia page source code. Defaults to False.
8283
8384
Returns:
8485
List[str]: List of all found mathematical expressions
@@ -91,7 +92,7 @@ def find_math(s: str, wiki=False) -> List[str]:
9192
groups = [1, 1, 0]
9293
else:
9394
patterns = [displaymath]
94-
groups = [1]
95+
groups = [0]
9596
for i, pattern in zip(groups, patterns):
9697
x = re.findall(pattern, s)
9798
matches.extend([g[i] for g in x])
@@ -102,14 +103,18 @@ def find_math(s: str, wiki=False) -> List[str]:
102103
parser = argparse.ArgumentParser()
103104
parser.add_argument(dest='file', type=str, help='file to find equations in')
104105
parser.add_argument('--out','-o', type=str, default=None, help='file to save equations to. If none provided, print all equations.')
105-
parser.add_argument('--wiki', action='store_true', help='only look for math starting with \\displaymath')
106+
parser.add_argument('--wiki', action='store_true', help='only look for math starting with \\displaystyle')
107+
parser.add_argument('--unescape', action='store_true', help='call `html.unescape` on input')
106108
args = parser.parse_args()
107109

108110
if not os.path.exists(args.file):
109111
raise ValueError('File can not be found. %s' % args.file)
110112

111-
s = open(args.file, 'r').read()
112-
math = '\n'.join(find_math(s, args.wiki))
113+
from pix2tex.dataset.demacro import pydemacro
114+
s = pydemacro(open(args.file, 'r', encoding='utf-8').read())
115+
if args.unescape:
116+
s = html.unescape(s)
117+
math = '\n'.join(sorted(find_math(s, args.wiki)))
113118
if args.out is None:
114119
print(math)
115120
else:

0 commit comments

Comments
 (0)