8
8
import sys
9
9
import argparse
10
10
import logging
11
- import shutil
12
- import subprocess
13
11
import tarfile
14
12
import tempfile
15
13
import chardet
@@ -50,7 +48,7 @@ def download(url, dir_path='./'):
50
48
return 0
51
49
52
50
53
- def read_tex_files (file_path , demacro = True ):
51
+ def read_tex_files (file_path ):
54
52
tex = ''
55
53
try :
56
54
with tempfile .TemporaryDirectory () as tempdir :
@@ -59,18 +57,11 @@ def read_tex_files(file_path, demacro=True):
59
57
tf .extractall (tempdir )
60
58
tf .close ()
61
59
texfiles = [os .path .abspath (x ) for x in glob .glob (os .path .join (tempdir , '**' , '*.tex' ), recursive = True )]
62
- # de-macro
63
- if demacro :
64
- ret = subprocess .run (['de-macro' , * texfiles ], cwd = tempdir , capture_output = True )
65
- if ret .returncode == 0 :
66
- texfiles = glob .glob (os .path .join (tempdir , '**' , '*-clean.tex' ), recursive = True )
67
60
except tarfile .ReadError as e :
68
61
texfiles = [file_path ] # [os.path.join(tempdir, file_path+'.tex')]
69
- #shutil.move(file_path, texfiles[0])
70
-
71
62
for texfile in texfiles :
72
63
try :
73
- tex += open (texfile , 'r' , encoding = chardet .detect (open (texfile , 'br' ).readline ())['encoding' ]).read ()
64
+ tex += open (texfile , 'r' , encoding = chardet .detect (open (texfile , 'br' ).readline ())['encoding' ]).read ()
74
65
except UnicodeDecodeError :
75
66
pass
76
67
tex = unfold (convert (tex ))
@@ -86,32 +77,32 @@ def download_paper(arxiv_id, dir_path='./'):
86
77
return download (url , dir_path )
87
78
88
79
89
def read_paper(targz_path, delete=True):
    """Return the concatenated LaTeX source of a downloaded paper.

    Args:
        targz_path: Path of the downloaded archive (or plain .tex file) that
            is handed to ``read_tex_files``. A falsy value (the ``0`` that the
            download step appears to return on failure, or ``None``/``''``)
            means there is nothing to read.
        delete: If true, remove ``targz_path`` from disk once it has been
            processed.

    Returns:
        The text produced by ``read_tex_files``, or ``''`` when no file was
        downloaded.
    """
    # Nothing was downloaded: there is nothing to read and nothing to delete.
    if not targz_path:
        return ''
    try:
        return read_tex_files(targz_path)
    finally:
        # Clean up even when parsing fails, so a broken archive does not
        # leave the downloaded file behind in the temp directory.
        if delete:
            os.remove(targz_path)
96
87
97
88
98
def parse_arxiv(id):
    """Download one arXiv paper and extract the math found in its source.

    Args:
        id: arXiv identifier passed on to ``download_paper``.

    Returns:
        A 2-tuple: the result of ``find_math(text, wiki=False)`` for the
        paper's text, and a list that is always empty here (no linked ids
        are collected).
    """
    # The archive is fetched into the system temp directory; read_paper is
    # called with its default arguments.
    download_dir = tempfile.gettempdir()
    archive = download_paper(id, download_dir)
    paper_text = read_paper(archive)

    linked = []
    return find_math(paper_text, wiki=False), linked
105
96
106
97
107
98
if __name__ == '__main__' :
99
+ # logging.getLogger().setLevel(logging.DEBUG)
108
100
parser = argparse .ArgumentParser (description = 'Extract math from arxiv' )
109
- parser .add_argument ('-m' , '--mode' , default = 'top100' , choices = ['top100' , 'id ' , 'dir' ],
101
+ parser .add_argument ('-m' , '--mode' , default = 'top100' , choices = ['top100' , 'ids ' , 'dir' ],
110
102
help = 'Where to extract code from. top100: current 100 arxiv papers, id: specific arxiv ids. \
111
103
Usage: `python arxiv.py -m id id001 id002`, dir: a folder full of .tar.gz files. Usage: `python arxiv.py -m dir directory`' )
112
- parser .add_argument (nargs = '+ ' , dest = 'args' , default = [])
104
+ parser .add_argument (nargs = '* ' , dest = 'args' , default = [])
113
105
parser .add_argument ('-o' , '--out' , default = os .path .join (os .path .dirname (os .path .realpath (__file__ )), 'data' ), help = 'output directory' )
114
- parser .add_argument ('-d' , '--no-demacro' , dest = 'demacro' , action = 'store_false' , help = 'Use de-macro (Slows down extraction but improves quality)' )
115
106
args = parser .parse_args ()
116
107
if '.' in args .out :
117
108
args .out = os .path .dirname (args .out )
@@ -123,7 +114,7 @@ def parse_arxiv(id, demacro=True):
123
114
if args .mode == 'ids' :
124
115
visited , math = recursive_search (parse_arxiv , args .args , skip = skip , unit = 'paper' )
125
116
elif args .mode == 'top100' :
126
- url = 'https://arxiv.org/list/hep-th/2012 ?skip=0&show=100' # https://arxiv.org/list/hep-th/2012 ?skip=0&show=100
117
+ url = 'https://arxiv.org/list/physics/pastweek ?skip=0&show=100' #' https://arxiv.org/list/hep-th/2203 ?skip=0&show=100'
127
118
ids = get_all_arxiv_ids (requests .get (url ).text )
128
119
math , visited = [], ids
129
120
for id in tqdm (ids ):
@@ -134,15 +125,16 @@ def parse_arxiv(id, demacro=True):
134
125
math , visited = [], []
135
126
for f in tqdm (dirs ):
136
127
try :
137
- text = read_paper (os .path .join (args .args [0 ], f ), False , args . demacro )
128
+ text = read_paper (os .path .join (args .args [0 ], f ), False )
138
129
math .extend (find_math (text , wiki = False ))
139
- visited .append (os .path .basename (f ))
130
+ visited .append (os .path .basename (f ))
140
131
except Exception as e :
141
132
logging .debug (e )
142
133
pass
143
134
else :
144
135
raise NotImplementedError
145
-
136
+ print ('\n ' .join (math ))
137
+ sys .exit (0 )
146
138
for l , name in zip ([visited , math ], ['visited_arxiv.txt' , 'math_arxiv.txt' ]):
147
139
f = os .path .join (args .out , name )
148
140
if not os .path .exists (f ):
0 commit comments