This repository has been archived by the owner on Mar 1, 2023. It is now read-only.

Accommodating missing URLs
rabdill committed May 23, 2019
1 parent 6443353 commit 5a26612
Showing 1 changed file with 27 additions and 1 deletion.
spider/spider.py: 27 additions & 1 deletion
@@ -83,6 +83,29 @@ def __init__(self):
     self.session.headers['User-Agent'] = config.user_agent
     self.log = Logger()

+  def get_urls(self):
+    # fills in URLs for papers that are for some reason missing them. Determines URLs
+    # by resolving the DOI.
+    to_save = []
+    with self.connection.db.cursor() as cursor:
+      # find articles that are missing a URL
+      cursor.execute(f"SELECT id, doi FROM {config.db['schema']}.articles WHERE url IS NULL OR url='';")
+      for x in cursor:
+        print(f'{x[0]}: {x[1]}')
+        try:
+          r = requests.get(f"https://doi.org/{x[1]}")
+        except Exception as e:
+          self.log.record(f'Problem resolving DOI: {e}', 'error')
+          continue
+        if r.status_code != 200:
+          self.log.record(f"Got weird status code resolving DOI: {r.status_code}", "error")
+          continue
+        to_save.append((r.url, x[0]))
+    with self.connection.db.cursor() as cursor:
+      # save the resolved URL for each article
+      cursor.executemany(f"UPDATE {config.db['schema']}.articles SET url=%s WHERE id=%s;", to_save)
+
+
   def _pull_crossref_data_date(self, datestring, retry=True):
     # Datestring should be format YYYY-MM-DD
     self.log.record(f"Beginning retrieval of Crossref data for {datestring}", "info")
@@ -315,7 +338,7 @@ def refresh_article_stats(self, collection=None, cap=10000, id=None, get_authors
         elif collection is None:
           cursor.execute("SELECT id, url, doi FROM articles WHERE collection IS NULL AND last_crawled < now() - interval %s;", (config.refresh_interval,))
         else:
-          cursor.execute("SELECT id, url, doi FROM articles WHERE collection=%s AND last_crawled < now() - interval %s ORDER BY last_crawled ASC;", (collection, config.refresh_interval))
+          cursor.execute(f"SELECT id, url, doi FROM {config.db['schema']}.articles WHERE collection=%s AND last_crawled < now() - interval %s ORDER BY last_crawled ASC;", (collection, config.refresh_interval))
       else:
         cursor.execute("SELECT id, url, doi FROM articles WHERE id=%s;", (id,))
       updated = 0
@@ -324,6 +347,9 @@ for article in cursor:
         url = article[1]
         doi = article[2]
         self.log.record(f"\nRefreshing article {article_id}", "debug")
+        if url is None:
+          self.log.record(f'No URL for article {article_id}. Skipping.', 'warn')
+          continue
         if config.polite:
           time.sleep(1)
         stat_table, authors = self.get_article_stats(url)
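For context, a minimal sketch of how the new backfill step might be run ahead of a normal refresh pass. The `Spider` class name, the import path, and the standalone invocation are assumptions for illustration and are not part of this commit:

# hypothetical usage sketch (not from this commit): backfill missing URLs by
# resolving each article's DOI, then run a refresh pass, which now logs and
# skips any article that still has no URL
from spider import Spider  # assumes spider.py exposes the class shown in this diff

spider = Spider()
spider.get_urls()               # resolve DOIs for rows where url IS NULL or ''
spider.refresh_article_stats()  # articles still missing a URL are logged and skipped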
