-
Notifications
You must be signed in to change notification settings - Fork 2
/
Mine_Tool.py
85 lines (70 loc) · 2.65 KB
/
Mine_Tool.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import urllib.request as libreq
import pandas as pd
import os
import pdftotext
import urllib.request
# Tools designed to work with ArXiv_Miner.Miner_base and ArXiv_Miner.Arxiv
# Mostly for converting file types and querying files
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def get_arxiv_pdf(pdf_id, target='Target.pdf'):
    """Download an arXiv PDF to ``target``, retrying on failure.

    Each round first requests ``https://export.arxiv.org/pdf/<pdf_id>.pdf``
    and, if that fails, requests the same address with any ``hep-ex/``
    prefix removed from the id.  Two rounds are made, so at most four
    requests are issued; the repeat round is there to ride out transient
    server-side failures.

    Args:
        pdf_id: arXiv identifier, with or without a ``hep-ex/`` prefix.
        target: Path the downloaded PDF is written to.

    Returns:
        Always ``None``.  Success is not reported through the return value;
        failures are only reported via printed messages.
    """
    base_url = 'https://export.arxiv.org/pdf/'
    max_rounds = 2

    for _ in range(max_rounds):
        # First attempt: the id exactly as given.
        try:
            urllib.request.urlretrieve(base_url + pdf_id + '.pdf', target)
            return None
        except Exception:
            # Deliberately best-effort: any download problem falls through
            # to the next attempt (but Ctrl-C / SystemExit still propagate).
            print('ArXiv PDF not present at address')

        # Second attempt: the id with the "hep-ex/" prefix stripped.
        try:
            # str.replace returns a new string; the result must be kept,
            # otherwise this attempt would just re-request the same URL.
            stripped_id = pdf_id.replace("hep-ex/", "")
            urllib.request.urlretrieve(base_url + stripped_id + '.pdf', target)
            return None
        except Exception:
            print('still failed to find PDF with replaced hep-ex prefix')

    return None
def convert_pdf_query_to_text(target='Target.pdf'):
    """Parse the PDF saved at ``target`` and delete the file afterwards.

    Warning: the current version is meant to be used directly after
    ``get_arxiv_pdf``, because it relies on the PDF having been saved to
    disk first.

    Args:
        target: Path of the PDF file to read.

    Returns:
        The ``pdftotext.PDF`` object built from the file on success.
        The string ``'nullpdf'`` if the file exists but could not be parsed.
        The string ``'Dummy string'`` if ``target`` does not exist.
    """
    try:
        with open(target, "rb") as f:
            try:
                pdf = pdftotext.PDF(f)
            except Exception:
                # Best-effort: an unreadable PDF yields a sentinel rather
                # than aborting the caller.  The file is left in place.
                print('PDF not found')
                return 'nullpdf'
    except FileNotFoundError:
        return 'Dummy string'

    # Remove the PDF only after the handle is closed: deleting a file that
    # is still open fails on Windows, which previously turned a successful
    # parse into the 'nullpdf' sentinel.
    try:
        os.remove(target)
    except FileNotFoundError:
        pass  # Already gone; nothing to clean up.
    except OSError as err:
        # A failed cleanup must not discard the text that was parsed.
        print('Could not remove ' + str(target) + ': ' + str(err))
    return pdf
def load_plaintext_file(target='Target.pdf'):
    """Return the raw contents of the file at ``target``.

    The file is opened in binary mode, so a successful read returns
    ``bytes``.  If the file does not exist, the string ``'Dummy string'``
    is returned instead.
    """
    try:
        with open(target, "rb") as handle:
            return handle.read()
    except FileNotFoundError:
        return 'Dummy string'
def pdf_contains_word(pdf_text, wordlist, lower=True):
    """Report whether any word from ``wordlist`` occurs in ``pdf_text``.

    Args:
        pdf_text: Iterable of strings (e.g. one per page); the pieces are
            joined with blank lines before searching.
        wordlist: Iterable of strings to look for as substrings.
        lower: When true (default) the match is case-insensitive: both the
            text and the words are lower-cased.  When false the match is
            case-sensitive.

    Returns:
        ``1`` if at least one word is found, otherwise ``0``.
    """
    sentence = "\n\n".join(pdf_text)
    if lower:
        sentence = sentence.lower()
        # Lower-case the search terms as well; otherwise a term containing
        # any upper-case letter could never match the lower-cased text.
        wordlist = (word.lower() for word in wordlist)
    return 1 if any(word in sentence for word in wordlist) else 0
def prep_plotting(target_pdf=''):
    """Load a CSV and return only its ``year`` and ``mention`` columns.

    Despite the parameter name, ``target_pdf`` is the path handed to
    ``pandas.read_csv``.  The trimmed frame is printed before it is
    returned.
    """
    # TODO this is just a notebook shortcut. Delete later
    wanted_columns = ['year', 'mention']
    frame = pd.read_csv(target_pdf)[wanted_columns]
    print(frame)
    return frame