This repository has been archived by the owner on Mar 1, 2023. It is now read-only.

Accommodating missing URLs
rabdill committed May 23, 2019
1 parent 6443353 commit 5a26612
Showing 1 changed file with 27 additions and 1 deletion.
spider/spider.py: 27 additions & 1 deletion
@@ -83,6 +83,29 @@ def __init__(self):
     self.session.headers['User-Agent'] = config.user_agent
     self.log = Logger()

+  def get_urls(self):
+    # fills in URLs for papers that are for some reason missing them. Determines URLs
+    # by resolving the DOI.
+    to_save = []
+    with self.connection.db.cursor() as cursor:
+      # find articles that are missing a URL
+      cursor.execute(f"SELECT id, doi FROM {config.db['schema']}.articles WHERE url IS NULL OR url='';")
+      for x in cursor:
+        print(f'{x[0]}: {x[1]}')
+        try:
+          r = requests.get(f"https://doi.org/{x[1]}")
+        except Exception as e:
+          self.log.record(f'Problem resolving DOI: {e}', 'error')
+          continue
+        if r.status_code != 200:
+          self.log.record(f"Got weird status code resolving DOI: {r.status_code}", "error")
+          continue
+        to_save.append((r.url, x[0]))
+    with self.connection.db.cursor() as cursor:
+      # save the resolved URL for each article
+      cursor.executemany(f"UPDATE {config.db['schema']}.articles SET url=%s WHERE id=%s;", to_save)
+
+
   def _pull_crossref_data_date(self, datestring, retry=True):
     # Datestring should be format YYYY-MM-DD
     self.log.record(f"Beginning retrieval of Crossref data for {datestring}", "info")
@@ -315,7 +338,7 @@ def refresh_article_stats(self, collection=None, cap=10000, id=None, get_authors
         elif collection is None:
           cursor.execute("SELECT id, url, doi FROM articles WHERE collection IS NULL AND last_crawled < now() - interval %s;", (config.refresh_interval,))
         else:
-          cursor.execute("SELECT id, url, doi FROM articles WHERE collection=%s AND last_crawled < now() - interval %s ORDER BY last_crawled ASC;", (collection, config.refresh_interval))
+          cursor.execute(f"SELECT id, url, doi FROM {config.db['schema']}.articles WHERE collection=%s AND last_crawled < now() - interval %s ORDER BY last_crawled ASC;", (collection, config.refresh_interval))
       else:
         cursor.execute("SELECT id, url, doi FROM articles WHERE id=%s;", (id,))
       updated = 0
@@ -324,6 +347,9 @@ for article in cursor:
         url = article[1]
         doi = article[2]
         self.log.record(f"\nRefreshing article {article_id}", "debug")
+        if url is None:
+          self.log.record(f'No URL for article {article_id}. Skipping.', 'warn')
+          continue
         if config.polite:
           time.sleep(1)
         stat_table, authors = self.get_article_stats(url)
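For context, a minimal sketch of how the new backfill step might be run ahead of a normal refresh pass. The `Spider` class name, the import path, and the standalone invocation are assumptions for illustration and are not part of this commit:

# hypothetical usage sketch (not from this commit): backfill missing URLs by
# resolving each article's DOI, then run a refresh pass, which now logs and
# skips any article that still has no URL
from spider import Spider  # assumes spider.py exposes the class shown in this diff

spider = Spider()
spider.get_urls()               # resolve DOIs for rows where url IS NULL or ''
spider.refresh_article_stats()  # articles still missing a URL are logged and skipped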
