8
8
import sys
9
9
import argparse
10
10
import logging
11
- import shutil
12
- import subprocess
13
11
import tarfile
14
12
import tempfile
15
13
import chardet
16
14
import logging
17
15
import requests
18
16
import urllib .request
17
+ from tqdm import tqdm
19
18
from urllib .error import HTTPError
20
- from pix2tex .dataset .extract_latex import *
21
- from pix2tex .dataset .scraping import *
19
+ from pix2tex .dataset .extract_latex import find_math
20
+ from pix2tex .dataset .scraping import recursive_search
22
21
from pix2tex .dataset .demacro import *
23
22
24
23
# logging.getLogger().setLevel(logging.INFO)
@@ -49,7 +48,7 @@ def download(url, dir_path='./'):
49
48
return 0
50
49
51
50
52
- def read_tex_files (file_path , demacro = True ):
51
+ def read_tex_files (file_path ):
53
52
tex = ''
54
53
try :
55
54
with tempfile .TemporaryDirectory () as tempdir :
@@ -58,18 +57,11 @@ def read_tex_files(file_path, demacro=True):
58
57
tf .extractall (tempdir )
59
58
tf .close ()
60
59
texfiles = [os .path .abspath (x ) for x in glob .glob (os .path .join (tempdir , '**' , '*.tex' ), recursive = True )]
61
- # de-macro
62
- if demacro :
63
- ret = subprocess .run (['de-macro' , * texfiles ], cwd = tempdir , capture_output = True )
64
- if ret .returncode == 0 :
65
- texfiles = glob .glob (os .path .join (tempdir , '**' , '*-clean.tex' ), recursive = True )
66
60
except tarfile .ReadError as e :
67
61
texfiles = [file_path ] # [os.path.join(tempdir, file_path+'.tex')]
68
- #shutil.move(file_path, texfiles[0])
69
-
70
62
for texfile in texfiles :
71
63
try :
72
- tex += open (texfile , 'r' , encoding = chardet .detect (open (texfile , 'br' ).readline ())['encoding' ]).read ()
64
+ tex += open (texfile , 'r' , encoding = chardet .detect (open (texfile , 'br' ).readline ())['encoding' ]).read ()
73
65
except UnicodeDecodeError :
74
66
pass
75
67
tex = unfold (convert (tex ))
@@ -85,32 +77,32 @@ def download_paper(arxiv_id, dir_path='./'):
85
77
return download (url , dir_path )
86
78
87
79
88
def read_paper(targz_path, delete=True):
    """Return the LaTeX text of a downloaded paper archive.

    Args:
        targz_path: Path handed to ``read_tex_files``. The integer ``0`` is
            treated as "no file": nothing is read or deleted.
        delete: When true, ``targz_path`` is removed from disk once it has
            been read.

    Returns:
        Whatever ``read_tex_files(targz_path)`` returns, or ``''`` when
        ``targz_path`` is ``0``.

    Raises:
        Any exception raised by ``read_tex_files`` or ``os.remove``.
    """
    if targz_path == 0:
        return ''
    try:
        return read_tex_files(targz_path)
    finally:
        # Clean up even when reading fails, so an aborted run does not leave
        # the downloaded archive behind.
        if delete:
            os.remove(targz_path)
95
87
96
88
97
def parse_arxiv(id):
    """Download one arXiv paper and extract the math found in its source.

    Args:
        id: Identifier passed straight to ``download_paper``. The name
            shadows the builtin ``id``; it is kept so existing keyword
            callers keep working.

    Returns:
        A 2-tuple ``(math, linked)``: ``math`` is the result of
        ``find_math(text, wiki=False)`` and ``linked`` is always an empty
        list.
    """
    download_dir = tempfile.gettempdir()
    archive = download_paper(id, download_dir)
    # read_paper is called with its default delete=True, so the downloaded
    # archive is removed after reading.
    source = read_paper(archive)
    # NOTE(review): the empty list is presumably the "linked ids" slot that
    # recursive_search would follow; confirm against that helper.
    return find_math(source, wiki=False), []
104
96
105
97
106
98
if __name__ == '__main__' :
99
+ # logging.getLogger().setLevel(logging.DEBUG)
107
100
parser = argparse .ArgumentParser (description = 'Extract math from arxiv' )
108
- parser .add_argument ('-m' , '--mode' , default = 'top100' , choices = ['top100' , 'id ' , 'dir' ],
101
+ parser .add_argument ('-m' , '--mode' , default = 'top100' , choices = ['top100' , 'ids ' , 'dir' ],
109
102
help = 'Where to extract code from. top100: current 100 arxiv papers, id: specific arxiv ids. \
110
103
Usage: `python arxiv.py -m id id001 id002`, dir: a folder full of .tar.gz files. Usage: `python arxiv.py -m dir directory`' )
111
- parser .add_argument (nargs = '+ ' , dest = 'args' , default = [])
104
+ parser .add_argument (nargs = '* ' , dest = 'args' , default = [])
112
105
parser .add_argument ('-o' , '--out' , default = os .path .join (os .path .dirname (os .path .realpath (__file__ )), 'data' ), help = 'output directory' )
113
- parser .add_argument ('-d' , '--no-demacro' , dest = 'demacro' , action = 'store_false' , help = 'Use de-macro (Slows down extraction but improves quality)' )
114
106
args = parser .parse_args ()
115
107
if '.' in args .out :
116
108
args .out = os .path .dirname (args .out )
@@ -122,7 +114,7 @@ def parse_arxiv(id, demacro=True):
122
114
if args .mode == 'ids' :
123
115
visited , math = recursive_search (parse_arxiv , args .args , skip = skip , unit = 'paper' )
124
116
elif args .mode == 'top100' :
125
- url = 'https://arxiv.org/list/hep-th/2012 ?skip=0&show=100' # https://arxiv.org/list/hep-th/2012 ?skip=0&show=100
117
+ url = 'https://arxiv.org/list/physics/pastweek ?skip=0&show=100' #' https://arxiv.org/list/hep-th/2203 ?skip=0&show=100'
126
118
ids = get_all_arxiv_ids (requests .get (url ).text )
127
119
math , visited = [], ids
128
120
for id in tqdm (ids ):
@@ -133,15 +125,16 @@ def parse_arxiv(id, demacro=True):
133
125
math , visited = [], []
134
126
for f in tqdm (dirs ):
135
127
try :
136
- text = read_paper (os .path .join (args .args [0 ], f ), False , args . demacro )
128
+ text = read_paper (os .path .join (args .args [0 ], f ), False )
137
129
math .extend (find_math (text , wiki = False ))
138
- visited .append (os .path .basename (f ))
130
+ visited .append (os .path .basename (f ))
139
131
except Exception as e :
140
132
logging .debug (e )
141
133
pass
142
134
else :
143
135
raise NotImplementedError
144
-
136
+ print ('\n ' .join (math ))
137
+ sys .exit (0 )
145
138
for l , name in zip ([visited , math ], ['visited_arxiv.txt' , 'math_arxiv.txt' ]):
146
139
f = os .path .join (args .out , name )
147
140
if not os .path .exists (f ):
0 commit comments