forked from Pold87/academic-keyword-occurrence
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 3e8d2bd
Showing
2 changed files
with
104 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
# Historic word occurrence in academic papers | ||
|
||
## Summary | ||
|
||
This script extracts the historic word occurrence of a search term in | ||
academic papers (from Google Scholar). It allows for spotting trends | ||
in research and analyzing the relevance of a topic over time. | ||
|
||
## Usage | ||
|
||
`python term_frequency.py '<keyword>' <start date> <end date>` | ||
|
||
This command lists, for every year in the given range, the number of
publications that contain the keyword. The search covers articles only;
patents and citations are excluded.
|
||
## Example | ||
|
||
- Search term: 'bitcoin' | ||
- Desired time span: 2000 to 2015 | ||
- Command: `python term_frequency.py 'bitcoin' 2000 2015` | ||
- Output: `out.csv`, with the following contents: | ||
| year | results | | ||
|------+---------| | ||
| 2011 | 6320 | | ||
| 2012 | 7250 | | ||
| 2013 | 8170 | | ||
| 2014 | 8260 | | ||
| 2015 | 8150 | | ||
|
||
|
||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
# By: Volker Strobel | ||
from bs4 import BeautifulSoup | ||
import urllib | ||
from urllib2 import Request, build_opener, HTTPCookieProcessor | ||
from cookielib import MozillaCookieJar | ||
import re | ||
import time | ||
import sys | ||
|
||
def get_num_results(search_term, start_date, end_date):
    """
    Query Google Scholar and return the number of hits it reports.

    Sends one HTTP request searching for `search_term`, limited to the
    years `start_date`..`end_date` (passed as the `as_ylo` / `as_yhi`
    query parameters), and parses the 'About x results (y sec)' line of
    the returned page.

    Returns a tuple `(num_results, success)`:
    - `(0, False)` when the results header (div with id `gs_ab_md`) is
      missing from the page.
    - `(digits, True)` otherwise, where `digits` is the count as a string
      with thousands separators removed (e.g. '8150').
    """

    # Open website and read html
    user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.109 Safari/537.36'
    # Percent-encode the term so spaces and special characters survive in
    # the query string (the query used to be hard-coded, ignoring the argument).
    query = urllib.quote_plus(search_term)
    url = "https://scholar.google.nl/scholar?as_vis=1&q={0}&hl=en&as_sdt=1,5&as_ylo={1}&as_yhi={2}".format(query, start_date, end_date)
    opener = build_opener()
    request = Request(url=url, headers={'User-Agent': user_agent})
    handler = opener.open(request)
    try:
        html = handler.read()
    finally:
        # Release the connection even if reading fails.
        handler.close()

    # Create soup for parsing HTML and extracting the relevant information
    soup = BeautifulSoup(html, 'html.parser')
    div_results = soup.find("div", {"id": "gs_ab_md"})  # find line 'About x results (y sec)'

    if div_results is None:
        return 0, False

    # Accept any number of comma-separated digit groups ('8,150', '1,234,567')
    # and a count at the very start of the text ('1 result (0.03 sec)').
    match = re.search(r'(?:^|\s)(\d+(?:,\d+)*)\s', div_results.text)
    if match is None:
        # NOTE(review): header present but no count found; presumably the
        # search had no hits -- confirm against a live zero-result page.
        return '0', True

    return match.group(1).replace(',', ''), True
|
||
|
||
def get_range(search_term, start_date, end_date):
    """
    Collect the yearly number of results for `search_term`.

    Calls `get_num_results` once per year from `start_date` through
    `end_date` (both inclusive), prints each `year,results` line and writes
    the same lines to `out.csv` in the current directory (overwriting it).
    Stops early, keeping the rows written so far, as soon as a lookup
    reports failure.
    """

    # `with` guarantees the file is closed even if a request raises.
    with open("out.csv", 'w') as fp:
        fp.write("year,results\n")
        print("year,results")

        # + 1 so that the end year itself is queried as well.
        for date in range(start_date, end_date + 1):

            num_results, success = get_num_results(search_term, date, date)
            if not success:
                print("It seems that you made to many requests to Google Scholar. Please wait a couple of hours and try again.")
                break
            year_results = "{0},{1}".format(date, num_results)
            print(year_results)
            fp.write(year_results + '\n')
            # Pause between requests to avoid hammering the server.
            time.sleep(0.8)
|
||
if __name__ == "__main__":

    # Three arguments are required on top of the script name; checking
    # for fewer than 4 entries avoids an IndexError on sys.argv[3].
    if len(sys.argv) < 4:
        print("******")
        print("Academic word relevance")
        print("******")
        print("")
        print("Usage: python term_frequency.py '<search term>' <start date> <end date>")

    else:
        search_term = sys.argv[1]
        start_date = int(sys.argv[2])
        end_date = int(sys.argv[3])
        get_range(search_term, start_date, end_date)