Skip to content

Commit de191d0

Browse files
committed
Merge pull request #6 from alfonsodg/master
Better unicode support, added language processing in queries
2 parents 57e96b5 + 128d572 commit de191d0

File tree

2 files changed

+22
-12
lines changed

2 files changed

+22
-12
lines changed

google/modules/standard_search.py

+11-8
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,6 @@ def __repr__(self):
3232

3333
def _limit_str_size(self, str_element, size_limit):
3434
"""Limit the characters of the string, adding .. at the end."""
35-
3635
if not str_element:
3736
return None
3837

@@ -44,7 +43,7 @@ def _limit_str_size(self, str_element, size_limit):
4443

4544

4645
# PUBLIC
47-
def search(query, pages=1):
46+
def search(query, pages=1, lang='en'):
4847
"""Returns a list of GoogleResult.
4948
5049
Args:
@@ -56,16 +55,17 @@ def search(query, pages=1):
5655

5756
results = []
5857
for i in range(pages):
59-
url = _get_search_url(query, i)
58+
url = _get_search_url(query, i, lang=lang)
6059
html = get_html(url)
6160

6261
if html:
6362
soup = BeautifulSoup(html, "html.parser")
6463
lis = soup.findAll("li", attrs={"class": "g"})
65-
64+
6665
j = 0
6766
for li in lis:
6867
res = GoogleResult()
68+
6969
res.page = i
7070
res.index = j
7171

@@ -86,7 +86,10 @@ def search(query, pages=1):
8686
def _get_name(li):
    """Return the name of a google search.

    Args:
        li: A parsed result element that supports ``find`` (presumably one
            of the ``li`` tags collected by ``search`` -- confirm against
            the caller).

    Returns:
        The text of the first ``a`` element inside ``li`` with surrounding
        whitespace removed, or None when ``li`` contains no ``a`` element.
    """
    anchor = li.find("a")
    if anchor is None:
        # Some result entries carry no link at all; report "no name"
        # instead of failing on the missing element.
        return None
    return anchor.text.strip()
9093

9194

9295
def _get_link(li):
@@ -123,9 +126,9 @@ def _get_description(li):
123126
sdiv = li.find("div", attrs={"class": "s"})
124127
if sdiv:
125128
stspan = sdiv.find("span", attrs={"class": "st"})
126-
127-
return stspan.text.encode("utf-8").strip()
128-
129+
if stspan is not None:
130+
#return stspan.text.encode("utf-8").strip()
131+
return stspan.text.strip()
129132
else:
130133
return None
131134

google/modules/utils.py

+11-4
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,9 @@
44
from selenium import webdriver
55
import urllib2
66
from functools import wraps
7+
#import requests
8+
from urllib import urlencode
9+
710

811

912
def measure_time(fn):
@@ -25,17 +28,21 @@ def normalize_query(query):
2528
return query.strip().replace(":", "%3A").replace("+", "%2B").replace("&", "%26").replace(" ", "+")
2629

2730

28-
def _get_search_url(query, page=0, per_page=10):
31+
def _get_search_url(query, page=0, per_page=10, lang='en'):
    """Build the Google search URL for one page of results.

    Args:
        query: Search terms, as text or as UTF-8 encoded bytes. Surrounding
            whitespace is stripped.
        page: Zero-based index of the results page.
        per_page: Number of results requested per page; also used to
            compute the ``start`` offset (``page * per_page``).
        lang: Language code sent as the ``hl`` query parameter.

    Returns:
        The search URL as a text string.
    """
    # note: num per page might not be supported by google anymore (because of
    # google instant)
    query = query.strip()
    if not isinstance(query, bytes):
        # On Python 2, urlencode() cannot escape non-ASCII unicode values,
        # so text is encoded to UTF-8 before being handed over.
        query = query.encode('utf8')

    # The raw query goes straight to urlencode(), which percent-escapes it.
    # Escaping it by hand first (as normalize_query() does) would make
    # urlencode() escape the escapes, e.g. ":" -> "%3A" -> "%253A".
    # A list of pairs (rather than a dict) keeps the parameter order stable.
    params = [
        ('hl', lang),  # "hl", not "nl": the key the previous URL format used
        ('q', query),
        ('start', page * per_page),
        ('num', per_page),
    ]
    return u"http://www.google.com/search?" + urlencode(params)
3340

3441
def get_html(url):
42+
header = "Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101"
3543
try:
3644
request = urllib2.Request(url)
37-
request.add_header(
38-
"User-Agent", "Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101")
45+
request.add_header("User-Agent", header)
3946
html = urllib2.urlopen(request).read()
4047
return html
4148
except urllib2.HTTPError as e:

0 commit comments

Comments
 (0)