Skip to content

Commit

Permalink
Working version
Browse files Browse the repository at this point in the history
  • Loading branch information
Pold87 committed Feb 16, 2016
0 parents commit 3e8d2bd
Show file tree
Hide file tree
Showing 2 changed files with 104 additions and 0 deletions.
34 changes: 34 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Historic word occurrence in academic papers

## Summary

This script extracts the historic word occurrence of a search term in
academic papers (from Google Scholar). It allows for spotting trends
in research and analyzing the relevance of a topic over time.

## Usage

`python extract_num_results.py '<keyword>' <start date> <end date>`

This command lists the number of publications for every year using
this keyword. The script just searches for articles and excludes
patents and citations.

## Example

- Search term: 'bitcoin'
- Desired time span: 2000 to 2015
- Command: `python extract_num_results.py 'bitcoin' 2000 2015`
- Output: `out.csv`, with the following contents:
| year | results |
|------|---------|
| 2011 | 6320 |
| 2012 | 7250 |
| 2013 | 8170 |
| 2014 | 8260 |
| 2015 | 8150 |





70 changes: 70 additions & 0 deletions extract_num_results.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
# By: Volker Strobel
from bs4 import BeautifulSoup
import urllib
from urllib2 import Request, build_opener, HTTPCookieProcessor
from cookielib import MozillaCookieJar
import re
import time
import sys

def get_num_results(search_term, start_date, end_date):
    """
    Query Google Scholar and return the result count for search_term.

    Sends one HTTP request restricted to the year range
    [start_date, end_date] and scrapes the 'About x results' line.

    Returns a tuple (num_results, success): num_results is the count
    as a string of digits (0 on failure), and success is False when
    the count could not be extracted -- typically because Scholar
    served a captcha page after too many requests.
    """

    # Pretend to be a regular desktop browser; Scholar blocks the
    # default urllib2 user agent.
    user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.109 Safari/537.36'
    # BUG FIX: the query string used to be hard-coded to
    # 'self-disclosure', silently ignoring search_term. URL-encode the
    # caller's term and interpolate it instead.
    query = urllib.quote_plus(search_term)
    url = "https://scholar.google.nl/scholar?as_vis=1&q={0}&hl=en&as_sdt=1,5&as_ylo={1}&as_yhi={2}".format(query, start_date, end_date)
    opener = build_opener()
    request = Request(url=url, headers={'User-Agent': user_agent})
    handler = opener.open(request)
    html = handler.read()

    # Parse the HTML and find the line 'About x results (y sec)'.
    soup = BeautifulSoup(html, 'html.parser')
    div_results = soup.find("div", {"id": "gs_ab_md"})

    success = False
    num_results = 0
    if div_results is not None:
        # Match a comma-grouped integer of any length. The previous
        # pattern only captured up to two digit groups, so counts of
        # 1,000,000 or more were truncated.
        res = re.findall(r'(\d+(?:,\d+)*)', div_results.text)
        if res:
            num_results = res[0].replace(',', '')  # strip thousands separators
            success = True

    return num_results, success


def get_range(search_term, start_date, end_date):
    """
    Fetch the yearly result counts for search_term and write out.csv.

    Queries Google Scholar once per year from start_date through
    end_date (inclusive), echoes each 'year,results' row to stdout,
    and stops early if Scholar appears to be rate-limiting us.
    """

    # 'with' guarantees the CSV is closed even if a request raises.
    with open("out.csv", 'w') as fp:
        fp.write("year,results\n")
        print("year,results")

        # BUG FIX: range() excludes its upper bound, so the final year
        # (end_date) was silently skipped. end_date + 1 makes the span
        # inclusive, matching the README's documented behavior.
        for date in range(start_date, end_date + 1):

            num_results, success = get_num_results(search_term, date, date)
            if not success:
                print("It seems that you made too many requests to Google Scholar. Please wait a couple of hours and try again.")
                break
            year_results = "{0},{1}".format(date, num_results)
            print(year_results)
            fp.write(year_results + '\n')
            # Throttle requests so Scholar is less likely to block us.
            time.sleep(0.8)

if __name__ == "__main__":

    # Need the script name plus three arguments: the (quoted) search
    # term, a start year, and an end year.
    # BUG FIX: the old check (len(sys.argv) < 3) let a two-argument
    # invocation through, which then crashed with an IndexError on
    # sys.argv[3]. Require all three arguments.
    if len(sys.argv) < 4:
        print("******")
        print("Academic word relevance")
        print("******")
        print("")
        print("Usage: python extract_num_results.py '<search term>' <start date> <end date>")

    else:
        search_term = sys.argv[1]
        start_date = int(sys.argv[2])
        end_date = int(sys.argv[3])
        # get_range writes out.csv as a side effect and returns None,
        # so there is nothing useful to bind the result to.
        get_range(search_term, start_date, end_date)

0 comments on commit 3e8d2bd

Please sign in to comment.