Merge pull request #6 from alfonsodg/master

abenassi · abenassi · commit de191d0b8e0c · 2015-08-28T17:10:10.000-03:00
Better Unicode support; added a language (`lang`) parameter to search queries
diff --git a/google/modules/standard_search.py b/google/modules/standard_search.py
@@ -32,7 +32,6 @@ def __repr__(self):
 
     def _limit_str_size(self, str_element, size_limit):
         """Limit the characters of the string, adding .. at the end."""
-
         if not str_element:
             return None
 
@@ -44,7 +43,7 @@ def _limit_str_size(self, str_element, size_limit):
 
 
 # PUBLIC
-def search(query, pages=1):
+def search(query, pages=1, lang='en'):
     """Returns a list of GoogleResult.
 
     Args:
@@ -56,16 +55,17 @@ def search(query, pages=1):
 
     results = []
     for i in range(pages):
-        url = _get_search_url(query, i)
+        url = _get_search_url(query, i, lang=lang)
         html = get_html(url)
 
         if html:
             soup = BeautifulSoup(html, "html.parser")
             lis = soup.findAll("li", attrs={"class": "g"})
-
+            
             j = 0
             for li in lis:
                 res = GoogleResult()
+
                 res.page = i
                 res.index = j
 
@@ -86,7 +86,10 @@ def search(query, pages=1):
 def _get_name(li):
     """Return the name of a google search."""
     a = li.find("a")
-    return a.text.strip()
+    #return a.text.encode("utf-8").strip()
+    if a is not None:
+        return a.text.strip()
+    return None
 
 
 def _get_link(li):
@@ -123,9 +126,9 @@ def _get_description(li):
     sdiv = li.find("div", attrs={"class": "s"})
     if sdiv:
         stspan = sdiv.find("span", attrs={"class": "st"})
-
-        return stspan.text.encode("utf-8").strip()
-
+        if stspan is not None:
+        #return stspan.text.encode("utf-8").strip()
+            return stspan.text.strip()
     else:
         return None
 
diff --git a/google/modules/utils.py b/google/modules/utils.py
@@ -4,6 +4,9 @@
 from selenium import webdriver
 import urllib2
 from functools import wraps
+#import requests
+from urllib import urlencode
+
 
 
 def measure_time(fn):
@@ -25,17 +28,21 @@ def normalize_query(query):
     return query.strip().replace(":", "%3A").replace("+", "%2B").replace("&", "%26").replace(" ", "+")
 
 
-def _get_search_url(query, page=0, per_page=10):
+def _get_search_url(query, page=0, per_page=10, lang='en'):
     # note: num per page might not be supported by google anymore (because of
     # google instant)
-    return "http://www.google.com/search?hl=en&q=%s&start=%i&num=%i" % (normalize_query(query), page * per_page, per_page)
 
+    params = {'nl': lang, 'q': normalize_query(query).encode('utf8'), 'start':page * per_page, 'num':per_page}
+    params = urlencode(params)
+    url = u"http://www.google.com/search?" + params
+    #return u"http://www.google.com/search?hl=%s&q=%s&start=%i&num=%i" % (lang, normalize_query(query), page * per_page, per_page)
+    return url
 
 def get_html(url):
+    header = "Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101"
     try:
         request = urllib2.Request(url)
-        request.add_header(
-            "User-Agent", "Mozilla/5.001 (windows; U; NT4.0; en-US; rv:1.0) Gecko/25250101")
+        request.add_header("User-Agent", header)
         html = urllib2.urlopen(request).read()
         return html
     except urllib2.HTTPError as e: