Comparing search and meta

Comparing the articles found by searching with the meta articles referenced in each article
elainewlin · Nov 4, 2015 · 7089a5b
1 parent bbb0fab
commit 7089a5b
Show file tree

Hide file tree

Showing 3 changed files with 29 additions and 16 deletions.
diff --git a/parseReference.py b/parseReference.py
@@ -103,11 +103,13 @@ def getMetaNums(fileName):
     meta = [article for article in allArticles if allArticles[article] > 1]
     return meta
 
-folder = 'meta'
-test = 'A Systematic Review and Meta-Analysis of the Pharmacological Treatment of Cancer-Related Fatigue.txt'
-fileName = '{}/{}'.format(folder, test)
-metaNums = getMetaNums(fileName)
-allReferences = findEndRef(fileName)
-metaReferences = [allReferences[num-1] for num in metaNums] #metaNums is 1 indexed, arrays are 0 indexed
-print metaNums
-print metaReferences
+'''
+Given a file name, returns the list of all meta articles referenced
+'''
+def getMetaReferences(fileName):
+    folder = 'meta'
+    fileName = '{}/{}'.format(folder, fileName)
+    metaNums = getMetaNums(fileName)
+    allReferences = findEndRef(fileName)
+    metaReferences = [allReferences[num-1] for num in metaNums] #metaNums is 1 indexed, arrays are 0 indexed
+    return metaReferences
diff --git a/parseReference.pyc b/parseReference.pyc
diff --git a/search.py b/search.py
@@ -1,8 +1,7 @@
 from bs4 import BeautifulSoup
 import os
 import reader
-from parseReference import removeTags
-
+import parseReference
 '''
 annotatedMeta.tsv contains article titles and the human-annotated key words
 Given a title of an article, returns the key words to search for
@@ -19,17 +18,29 @@ def generateURL(query):
     A url in the required format is generated.
     """
     query = '+'.join(query.split())
-    url = 'http://jnci.oxfordjournals.org/search?fulltext=' + query + '&hits=25&submit=yes'
+   # url = 'http://jnci.oxfordjournals.org/search?fulltext=' + query + '&hits=25&submit=yes' #search oxford journals
+    url = 'http://www.ncbi.nlm.nih.gov/pmc/?term='+query #search pubmed
     return url
 
 #Gets a list of all article titles
 def getTitles(url):
     source = reader.readURL(url)
     soup = BeautifulSoup(source)
-    links = soup.findAll("span", {"class":"cit-title"})
+    #links = soup.findAll("span", {"class":"cit-title"}) #oxford journals
+    links = soup.findAll("div", {"class":"title"}) #pubmed
+    links = map(parseReference.removeTags, links)
     return links
 
-query = 'breast cancer'
-url = generateURL(query)
-print url
-print map(removeTags,getTitles(url))
+#test = 'Meta-Analysis of Soy Intake and Breast Cancer Risk.txt'
+for fileName in os.listdir(os.getcwd()+"/meta"):
+    if ".txt" in fileName and len(fileName) > 8: #HACKY, FIX
+        query = findKeyWord(fileName) 
+        url = generateURL(query)
+        print url
+        searchResults = getTitles(url)
+        metaReferences = parseReference.getMetaReferences(fileName)
+        for meta in metaReferences:
+            for result in searchResults:
+                if result in meta:
+                    print result
+#print getTitles(testURL)