Skip to content

Commit

Permalink
Comparing search and meta
Browse files Browse the repository at this point in the history
Comparing articles found by searching to the meta articles referenced
in the article
  • Loading branch information
Elaine Lin committed Nov 4, 2015
1 parent bbb0fab commit 7089a5b
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 16 deletions.
18 changes: 10 additions & 8 deletions parseReference.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,11 +103,13 @@ def getMetaNums(fileName):
meta = [article for article in allArticles if allArticles[article] > 1]
return meta

# Ad-hoc run: extract the meta-article references of one sample article
# and print them.
folder = 'meta'
test = 'A Systematic Review and Meta-Analysis of the Pharmacological Treatment of Cancer-Related Fatigue.txt'
# Relative path of the sample article inside `folder`.
fileName = '{}/{}'.format(folder, test)
# Reference numbers returned by getMetaNums for this article.
metaNums = getMetaNums(fileName)
# Reference list returned by findEndRef for the same file.
allReferences = findEndRef(fileName)
# Pick the reference entry for each number in metaNums.
metaReferences = [allReferences[num-1] for num in metaNums] #metaNums is 1 indexed, arrays are 0 indexed
print metaNums
print metaReferences
def getMetaReferences(fileName, folder='meta'):
    '''
    Given a file name, returns the list of all meta articles referenced.

    fileName -- name of the article text file inside `folder`
    folder   -- directory containing the article files; defaults to
                'meta', the location that used to be hard-coded

    The numbers returned by getMetaNums are used to pick entries out of
    the reference list returned by findEndRef for the same file.
    Raises IndexError if a number is larger than the reference list.
    '''
    path = '{}/{}'.format(folder, fileName)
    metaNums = getMetaNums(path)
    allReferences = findEndRef(path)
    # metaNums is 1 indexed, lists are 0 indexed
    return [allReferences[num - 1] for num in metaNums]
Binary file modified parseReference.pyc
Binary file not shown.
27 changes: 19 additions & 8 deletions search.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
from bs4 import BeautifulSoup
import os
import reader
from parseReference import removeTags

import parseReference
'''
annotatedMeta.tsv contains article titles and the human-annotated key words
Given a title of an article, returns the key words to search for
Expand All @@ -19,17 +18,29 @@ def generateURL(query):
A url in the required format is generated.
"""
query = '+'.join(query.split())
url = 'http://jnci.oxfordjournals.org/search?fulltext=' + query + '&hits=25&submit=yes'
# url = 'http://jnci.oxfordjournals.org/search?fulltext=' + query + '&hits=25&submit=yes' #search oxford journals
url = 'http://www.ncbi.nlm.nih.gov/pmc/?term='+query #search pubmed
return url

def getTitles(url):
    '''
    Gets a list of all article titles on the search-results page at url.

    The page source is read with reader.readURL and parsed with
    BeautifulSoup; every matching title element is passed through
    parseReference.removeTags.

    Returns a list (not a lazy iterator) so that callers can loop over
    the titles more than once.
    '''
    source = reader.readURL(url)
    soup = BeautifulSoup(source)
    # Selector for the pubmed results page. The oxford journals page used
    # soup.findAll("span", {"class":"cit-title"}) instead; that lookup was
    # dropped here because its result was immediately overwritten.
    titleTags = soup.findAll("div", {"class": "title"})
    return [parseReference.removeTags(tag) for tag in titleTags]

# Ad-hoc check: build the search URL for one fixed query and print the
# titles found on the results page.
query = 'breast cancer'
url = generateURL(query)
print url
# NOTE(review): getTitles already maps removeTags over its results, so
# removeTags appears to be applied a second time here -- confirm intended.
print map(removeTags,getTitles(url))
#test = 'Meta-Analysis of Soy Intake and Breast Cancer Risk.txt'
# For every article text file in ./meta: search for the article's key
# words, then print each search-result title that occurs inside one of
# the meta references extracted from that article.
for fileName in os.listdir(os.path.join(os.getcwd(), "meta")):
    # endswith() instead of a substring test, so names that merely contain
    # ".txt" (e.g. "x.txt.bak") are no longer picked up.
    # HACKY, FIX: the length check is kept from the original; presumably it
    # skips short placeholder file names -- confirm and replace.
    if not (fileName.endswith(".txt") and len(fileName) > 8):
        continue
    query = findKeyWord(fileName)
    url = generateURL(query)
    print(url)
    searchResults = getTitles(url)
    metaReferences = parseReference.getMetaReferences(fileName)
    for meta in metaReferences:
        for result in searchResults:
            # Substring match: the result title appears in the reference.
            if result in meta:
                print(result)
#print getTitles(testURL)

0 comments on commit 7089a5b

Please sign in to comment.