1
1
import argparse
2
+ import html
2
3
import os
3
4
import re
4
5
import numpy as np
10
11
inline = re .compile (r'(\\\((.*?)(?<!\\)\\\))|(\\\[(.{%i,%i}?)(?<!\\)\\\])' % (1 , MAX_CHARS ))
11
12
equation = re .compile (r'\\begin\{(equation|math|displaymath)\*?\}(.{%i,%i}?)\\end\{\1\*?\}' % (1 , MAX_CHARS ), re .S )
12
13
align = re .compile (r'(\\begin\{(align|alignedat|alignat|flalign|eqnarray|aligned|split|gather)\*?\}(.{%i,%i}?)\\end\{\2\*?\})' % (1 , MAX_CHARS ), re .S )
13
- displaymath = re .compile (r'(\\displaystyle)(.{%i,%i}?)(\} (?:<|" ))' % (1 , MAX_CHARS ))
14
+ displaymath = re .compile (r'(?: \\displaystyle)(.{%i,%i}?)((?<!\\)\}? (?:\"|< ))' % (1 , MAX_CHARS ), re . S )
14
15
outer_whitespace = re .compile (
15
16
r'^\\,|\\,$|^~|~$|^\\ |\\ $|^\\thinspace|\\thinspace$|^\\!|\\!$|^\\:|\\:$|^\\;|\\;$|^\\enspace|\\enspace$|^\\quad|\\quad$|^\\qquad|\\qquad$|^\\hspace{[a-zA-Z0-9]+}|\\hspace{[a-zA-Z0-9]+}$|^\\hfill|\\hfill$' )
16
17
label_names = [re .compile (r'\\%s\s?\{(.*?)\}' % s ) for s in ['ref' , 'cite' , 'label' , 'eqref' ]]
@@ -78,7 +79,7 @@ def find_math(s: str, wiki=False) -> List[str]:
78
79
79
80
Args:
80
81
s (str): String to search
81
- wiki (bool, optional): Search for `\displaymath ` as it can be found in the wikipedia page source code. Defaults to False.
82
+ wiki (bool, optional): If True, search only for math introduced by `\displaystyle`, as it appears in Wikipedia page source code. Defaults to False.
82
83
83
84
Returns:
84
85
List[str]: List of all found mathematical expressions
@@ -91,7 +92,7 @@ def find_math(s: str, wiki=False) -> List[str]:
91
92
groups = [1 , 1 , 0 ]
92
93
else :
93
94
patterns = [displaymath ]
94
- groups = [1 ]
95
+ groups = [0 ]
95
96
for i , pattern in zip (groups , patterns ):
96
97
x = re .findall (pattern , s )
97
98
matches .extend ([g [i ] for g in x ])
@@ -102,14 +103,18 @@ def find_math(s: str, wiki=False) -> List[str]:
102
103
parser = argparse .ArgumentParser ()
103
104
parser .add_argument (dest = 'file' , type = str , help = 'file to find equations in' )
104
105
parser .add_argument ('--out' ,'-o' , type = str , default = None , help = 'file to save equations to. If none provided, print all equations.' )
105
- parser .add_argument ('--wiki' , action = 'store_true' , help = 'only look for math starting with \\ displaymath' )
106
+ parser .add_argument ('--wiki' , action = 'store_true' , help = 'only look for math starting with \\ displaystyle' )
107
+ parser .add_argument ('--unescape' , action = 'store_true' , help = 'call `html.unescape` on input' )
106
108
args = parser .parse_args ()
107
109
108
110
if not os .path .exists (args .file ):
109
111
raise ValueError ('File can not be found. %s' % args .file )
110
112
111
- s = open (args .file , 'r' ).read ()
112
- math = '\n ' .join (find_math (s , args .wiki ))
113
+ from pix2tex .dataset .demacro import pydemacro
114
+ s = pydemacro (open (args .file , 'r' , encoding = 'utf-8' ).read ())
115
+ if args .unescape :
116
+ s = html .unescape (s )
117
+ math = '\n ' .join (sorted (find_math (s , args .wiki )))
113
118
if args .out is None :
114
119
print (math )
115
120
else :
0 commit comments