From cf892b384753f4c8ac2cb88d460651120476bbf1 Mon Sep 17 00:00:00 2001 From: Volker Strobel Date: Wed, 17 Feb 2016 01:14:12 +0100 Subject: [PATCH] fixed bug in search url --- README.md~ | 18 ++++++++++++++++++ extract_num_results.py~ | 5 +++++ extract_occurrences.py | 2 +- out.csv | 2 ++ table.org | 7 +++++++ table.org~ | 6 ++++++ 6 files changed, 39 insertions(+), 1 deletion(-) create mode 100644 README.md~ create mode 100644 extract_num_results.py~ create mode 100644 out.csv create mode 100644 table.org create mode 100644 table.org~ diff --git a/README.md~ b/README.md~ new file mode 100644 index 0000000..36ee6e1 --- /dev/null +++ b/README.md~ @@ -0,0 +1,18 @@ +# Academic word frequency extractor + +This script extracts the word frequency in papers of a search term. It writes the number of papers containing this +word at a certain year to a CSV file: + +| year | results | +|------+---------| +| 2011 | 6320 | +| 2012 | 7250 | +| 2013 | 8170 | +| 2014 | 8260 | +| 2015 | 8150 | + + +The script excludes patents and citations + + + diff --git a/extract_num_results.py~ b/extract_num_results.py~ new file mode 100644 index 0000000..18dc431 --- /dev/null +++ b/extract_num_results.py~ @@ -0,0 +1,5 @@ +from bs4 import BeautifulSoup +import urllib +r = urllib.urlopen('https://scholar.google.nl/scholar?hl=en&as_sdt=1,5&q=self-disclosure').read() +soup = BeautifulSoup(r) +print type(soup) diff --git a/extract_occurrences.py b/extract_occurrences.py index e3db6dc..0f32c7c 100644 --- a/extract_occurrences.py +++ b/extract_occurrences.py @@ -14,7 +14,7 @@ def get_num_results(search_term, start_date, end_date): # Open website and read html user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.109 Safari/537.36' - url = "https://scholar.google.nl/scholar?as_vis=1&q=self-disclosure&hl=en&as_sdt=1,5&as_ylo={0}&as_yhi={1}".format(start_date, end_date) + url = "https://scholar.google.nl/scholar?as_vis=1&q={0}&hl=en&as_sdt=1,5&as_ylo={1}&as_yhi={2}".format(search_term, start_date, end_date) opener = build_opener() request = Request(url=url, headers={'User-Agent': user_agent}) handler = opener.open(request) diff --git a/out.csv b/out.csv new file mode 100644 index 0000000..e23d746 --- /dev/null +++ b/out.csv @@ -0,0 +1,2 @@ +year,results +2014,8260 diff --git a/table.org b/table.org new file mode 100644 index 0000000..b7eca00 --- /dev/null +++ b/table.org @@ -0,0 +1,7 @@ +| year | results | +|------+---------| +| 2011 | 6320 | +| 2012 | 7250 | +| 2013 | 8170 | +| 2014 | 8260 | +| 2015 | 8150 | diff --git a/table.org~ b/table.org~ new file mode 100644 index 0000000..af101e9 --- /dev/null +++ b/table.org~ @@ -0,0 +1,6 @@ +year,results +2011,6320 +2012,7250 +2013,8170 +2014,8260 +2015,8150