1
1
# modified from https://github.com/soskek/arxiv_leaks
2
2
3
3
import argparse
4
- import json
4
+ import subprocess
5
5
import os
6
6
import glob
7
7
import re
10
10
import logging
11
11
import tarfile
12
12
import tempfile
13
- import chardet
14
13
import logging
15
14
import requests
16
15
import urllib .request
22
21
23
22
# logging.getLogger().setLevel(logging.INFO)
24
23
# Matches new-style arXiv identifiers (four digits, a dot, five digits) that
# are not embedded in a longer run of digits; the identifier is group 1.
arxiv_id = re.compile(r'(?<!\d)(\d{4}\.\d{5})(?!\d)')
# Base URL for source ("e-print") downloads; an identifier is appended to it.
# The host name must not contain whitespace, otherwise every request fails.
arxiv_base = 'https://export.arxiv.org/e-print/'
26
25
27
26
28
27
def get_all_arxiv_ids (text ):
@@ -48,7 +47,7 @@ def download(url, dir_path='./'):
48
47
return 0
49
48
50
49
51
- def read_tex_files (file_path ):
50
+ def read_tex_files (file_path , demacro = False ):
52
51
tex = ''
53
52
try :
54
53
with tempfile .TemporaryDirectory () as tempdir :
@@ -59,50 +58,59 @@ def read_tex_files(file_path):
59
58
texfiles = [os .path .abspath (x ) for x in glob .glob (os .path .join (tempdir , '**' , '*.tex' ), recursive = True )]
60
59
except tarfile .ReadError as e :
61
60
texfiles = [file_path ] # [os.path.join(tempdir, file_path+'.tex')]
61
+ if demacro :
62
+ ret = subprocess .run (['de-macro' , * texfiles ], cwd = tempdir , capture_output = True )
63
+ if ret .returncode == 0 :
64
+ texfiles = glob .glob (os .path .join (tempdir , '**' , '*-clean.tex' ), recursive = True )
62
65
for texfile in texfiles :
63
66
try :
64
- tex += open (texfile , 'r' , encoding = chardet .detect (open (texfile , 'br' ).readline ())['encoding' ]).read ()
65
- except UnicodeDecodeError :
67
+ ct = open (texfile , 'r' , encoding = 'utf-8' ).read ()
68
+ tex += ct
69
+ except UnicodeDecodeError as e :
70
+ logging .debug (e )
66
71
pass
67
- tex = unfold (convert (tex ))
68
72
except Exception as e :
69
73
logging .debug ('Could not read %s: %s' % (file_path , str (e )))
70
- pass
71
- # remove comments
72
- return re . sub ( r'(?<!\\)%.*\n' , '' , tex )
74
+ raise e
75
+ tex = pydemacro ( tex )
76
+ return tex
73
77
74
78
75
79
def download_paper(arxiv_id, dir_path='./'):
    """Fetch the source archive of a single arXiv paper.

    The request URL is the module-level ``arxiv_base`` with ``arxiv_id``
    appended; the actual transfer is delegated to ``download``.

    Args:
        arxiv_id: identifier string appended to ``arxiv_base``.
        dir_path: directory forwarded unchanged to ``download``.

    Returns:
        Whatever ``download`` returns for the constructed URL.
    """
    return download(arxiv_base + arxiv_id, dir_path)
78
82
79
83
80
def read_paper(targz_path, delete=False, demacro=False):
    """Read the LaTeX text of a downloaded paper and optionally remove the file.

    Args:
        targz_path: path handed to ``read_tex_files``. The value ``0`` is
            treated as "nothing to read" (presumably the failure value of
            ``download`` -- verify against that function).
        delete: when true, ``targz_path`` is removed from disk after the read
            attempt, whether or not the read succeeded.
        demacro: forwarded to ``read_tex_files``.

    Returns:
        The text returned by ``read_tex_files``, or ``''`` when
        ``targz_path`` is ``0``.

    Raises:
        Any exception raised by ``read_tex_files`` propagates to the caller;
        ``os.remove`` may raise ``OSError`` if the file cannot be removed.
    """
    if targz_path == 0:
        return ''
    try:
        return read_tex_files(targz_path, demacro=demacro)
    finally:
        # read_tex_files re-raises on failure; clean up in ``finally`` so a
        # broken archive does not linger on disk when deletion was requested.
        if delete:
            os.remove(targz_path)
87
91
88
92
89
def parse_arxiv(id, save=None, demacro=True):
    """Download one arXiv paper and extract the math found in its source.

    Args:
        id: arXiv identifier passed to ``download_paper``.
        save: directory in which to keep the downloaded file. When ``None``
            the system temp directory is used and the file is removed after
            reading.
        demacro: forwarded to ``read_paper``.

    Returns:
        A 2-tuple ``(find_math(text, wiki=False), [])``; the second element
        is always an empty list.
    """
    keep_archive = save is not None
    target_dir = save if keep_archive else tempfile.gettempdir()
    archive = download_paper(id, target_dir)
    text = read_paper(archive, delete=not keep_archive, demacro=demacro)

    return find_math(text, wiki=False), []
96
101
97
102
98
103
if __name__ == '__main__' :
99
104
# logging.getLogger().setLevel(logging.DEBUG)
100
105
parser = argparse .ArgumentParser (description = 'Extract math from arxiv' )
101
- parser .add_argument ('-m' , '--mode' , default = 'top100' , choices = ['top100 ' , 'ids' , 'dir ' ],
102
- help = 'Where to extract code from. top100 : current 100 arxiv papers, id: specific arxiv ids. \
103
- Usage: `python arxiv.py -m id id001 id002`, dir : a folder full of .tar.gz files. Usage: `python arxiv.py -m dir directory`' )
106
+ parser .add_argument ('-m' , '--mode' , default = 'top100' , choices = ['top ' , 'ids' , 'dirs ' ],
107
+ help = 'Where to extract code from. top : current 100 arxiv papers (-m top int for any other number of papers) , id: specific arxiv ids. \
108
+ Usage: `python arxiv.py -m id id001 id002`, dirs : a folder full of .tar.gz files. Usage: `python arxiv.py -m dir directory`' )
104
109
parser .add_argument (nargs = '*' , dest = 'args' , default = [])
105
110
parser .add_argument ('-o' , '--out' , default = os .path .join (os .path .dirname (os .path .realpath (__file__ )), 'data' ), help = 'output directory' )
111
+ parser .add_argument ('-d' , '--demacro' , dest = 'demacro' , action = 'store_true' ,
112
+ help = 'Deprecated - Use de-macro (Slows down extraction, may but improves quality). Install https://www.ctan.org/pkg/de-macro' )
113
+ parser .add_argument ('-s' , '--save' , default = None , type = str , help = 'When downloading files from arxiv. Where to save the .tar.gz files. Default: Only temporary' )
106
114
args = parser .parse_args ()
107
115
if '.' in args .out :
108
116
args .out = os .path .dirname (args .out )
@@ -111,30 +119,47 @@ def parse_arxiv(id):
111
119
skip = open (skips , 'r' , encoding = 'utf-8' ).read ().split ('\n ' )
112
120
else :
113
121
skip = []
114
- if args .mode == 'ids' :
115
- visited , math = recursive_search (parse_arxiv , args .args , skip = skip , unit = 'paper' )
116
- elif args .mode == 'top100' :
117
- url = 'https://arxiv.org/list/physics/pastweek?skip=0&show=100' #'https://arxiv.org/list/hep-th/2203?skip=0&show=100'
118
- ids = get_all_arxiv_ids (requests .get (url ).text )
119
- math , visited = [], ids
120
- for id in tqdm (ids ):
121
- m , _ = parse_arxiv (id )
122
- math .extend (m )
123
- elif args .mode == 'dir' :
124
- dirs = os .listdir (args .args [0 ])
125
- math , visited = [], []
126
- for f in tqdm (dirs ):
127
- try :
128
- text = read_paper (os .path .join (args .args [0 ], f ), False )
129
- math .extend (find_math (text , wiki = False ))
130
- visited .append (os .path .basename (f ))
131
- except Exception as e :
132
- logging .debug (e )
133
- pass
134
- else :
135
- raise NotImplementedError
136
- print ('\n ' .join (math ))
137
- sys .exit (0 )
122
+ if args .save is not None :
123
+ os .makedirs (args .save , exist_ok = True )
124
+ try :
125
+ if args .mode == 'ids' :
126
+ visited , math = recursive_search (parse_arxiv , args .args , skip = skip , unit = 'paper' , save = args .save , demacro = args .demacro )
127
+ elif args .mode == 'top' :
128
+ num = 100 if len (args .args ) == 0 else int (args .args [0 ])
129
+ url = 'https://arxiv.org/list/physics/pastweek?skip=0&show=%i' % num # 'https://arxiv.org/list/hep-th/2203?skip=0&show=100'
130
+ ids = get_all_arxiv_ids (requests .get (url ).text )
131
+ math , visited = [], ids
132
+ for id in tqdm (ids ):
133
+ try :
134
+ m , _ = parse_arxiv (id , save = args .save , demacro = args .demacro )
135
+ math .extend (m )
136
+ except ValueError :
137
+ pass
138
+ elif args .mode == 'dirs' :
139
+ files = []
140
+ for folder in args .args :
141
+ files .extend ([os .path .join (folder , p ) for p in os .listdir (folder )])
142
+ math , visited = [], []
143
+ for f in tqdm (files ):
144
+ try :
145
+ text = read_paper (f , delete = False , demacro = args .demacro )
146
+ math .extend (find_math (text , wiki = False ))
147
+ visited .append (os .path .basename (f ))
148
+ except DemacroError as e :
149
+ logging .debug (f + str (e ))
150
+ pass
151
+ except KeyboardInterrupt :
152
+ break
153
+ except Exception as e :
154
+ logging .debug (e )
155
+ raise e
156
+ else :
157
+ raise NotImplementedError
158
+ except KeyboardInterrupt :
159
+ pass
160
+ print ('Found %i instances of math latex code' % len (math ))
161
+ # print('\n'.join(math))
162
+ # sys.exit(0)
138
163
for l , name in zip ([visited , math ], ['visited_arxiv.txt' , 'math_arxiv.txt' ]):
139
164
f = os .path .join (args .out , name )
140
165
if not os .path .exists (f ):
0 commit comments