Skip to content

Commit

Permalink
the script now returns list of absolute paths of the images downloaded
Browse files Browse the repository at this point in the history
added more clarification on providing path of chromedriver on windows OS, in the documentation
  • Loading branch information
Vasa committed May 12, 2018
1 parent c9dc970 commit 66b5fff
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 18 deletions.
16 changes: 13 additions & 3 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ If you would want to use this library from another python file, you could use it
from google_images_download import google_images_download
response = google_images_download.googleimagesdownload()
response.download({<Arguments...>})
absolute_image_paths = response.download({<Arguments...>})
Arguments
Expand Down Expand Up @@ -226,6 +226,12 @@ Arguments
| | | |
| | | This argument does not take any value. Just add '--print_size' or '-ps' in your query. |
+-------------------+-------------+-------------------------------------------------------------------------------------------------------------------------------+
| print_paths | pp | Prints the list of all the absolute paths of the downloaded images |
| | | |
| | | When calling the script from another python file, this list will be saved in a variable (as shown in the example below) |
| | | |
| | | This argument also allows you to print the list on the console |
+-------------------+-------------+-------------------------------------------------------------------------------------------------------------------------------+
| metadata | m | Prints the metadata of the image on the console. |
| | | |
| | | This includes image size, origin, image attributes, description, image URL, etc. |
Expand Down Expand Up @@ -308,8 +314,8 @@ Examples
response = google_images_download.googleimagesdownload() #class instantiation
arguments = {"keywords":"Polar bears,baloons,Beaches","limit":20,"print_urls":True} #creating list of arguments
response.download(arguments) #passing the arguments to the function
paths = response.download(arguments) #passing the arguments to the function
print(paths) #printing absolute paths of the downloaded images
- If you are passing arguments from a config file, simply pass the config_file argument with name of your JSON file

Expand Down Expand Up @@ -475,6 +481,10 @@ If you have pip installed the library or run the setup.py file, Selenium would h

On **Windows** or **MAC** if for some reason the chromedriver gives you trouble, download it under the current directory and run the command.

On **Windows**, however, the path to chromedriver has to be given in the following format:

'C:\\complete\\path\\to\\chromedriver.exe'

On **Linux** if you are having issues installing google chrome browser, refer to this `CentOS or Amazon Linux Guide <https://intoli.com/blog/installing-google-chrome-on-centos/>`__
or `Ubuntu Guide <https://askubuntu.com/questions/510056/how-to-install-google-chrome>`__

Expand Down
49 changes: 35 additions & 14 deletions google_images_download/google_images_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,15 @@
from urllib.request import Request, urlopen
from urllib.request import URLError, HTTPError
from urllib.parse import quote
import html
import http.client
http.client._MAXHEADERS = 1000
else: # If the Current Version of Python is 2.x
import urllib2
from urllib2 import Request, urlopen
from urllib2 import URLError, HTTPError
from urllib import quote
import httplib
httplib._MAXHEADERS = 1000
import time # Importing the time library to check the time of code execution
import os
import argparse
Expand All @@ -27,12 +30,14 @@
import json
import re
import codecs
import socket

args_list = ["keywords", "keywords_from_file", "prefix_keywords", "suffix_keywords",
"limit", "related_images", "format", "color", "color_type", "usage_rights", "size",
"exact_size", "aspect_ratio", "type", "time", "time_range", "delay", "url", "single_image",
"output_directory", "proxy", "similar_images", "specific_site", "print_urls", "print_size",
"metadata", "extract_metadata", "socket_timeout", "thumbnail", "language", "prefix", "chromedriver"]
"print_paths", "metadata", "extract_metadata", "socket_timeout", "thumbnail", "language",
"prefix", "chromedriver"]


def user_input():
Expand Down Expand Up @@ -86,6 +91,7 @@ def user_input():
parser.add_argument('-ss', '--specific_site', help='downloads images that are indexed from a specific website', type=str, required=False)
parser.add_argument('-p', '--print_urls', default=False, help="Print the URLs of the images", action="store_true")
parser.add_argument('-ps', '--print_size', default=False, help="Print the size of the images on disk", action="store_true")
parser.add_argument('-pp', '--print_paths', default=False, help="Prints the list of absolute paths of the images",action="store_true")
parser.add_argument('-m', '--metadata', default=False, help="Print the metadata of the image", action="store_true")
parser.add_argument('-e', '--extract_metadata', default=False, help="Dumps all the logs into a text file", action="store_true")
parser.add_argument('-st', '--socket_timeout', default=False, help="Connection timeout waiting for the image to download", type=float)
Expand Down Expand Up @@ -279,10 +285,10 @@ def single_image(self,image_url):
output_file = open(file_name, 'wb')
output_file.write(data)
output_file.close()
except OSError as e:
raise e
except IOError as e:
raise e
except OSError as e:
raise e

print("completed ====> " + image_name)
return
Expand Down Expand Up @@ -572,14 +578,12 @@ def download_image(self,image_url,image_format,main_directory,dir_name,count,pri
output_file = open(path, 'wb')
output_file.write(data)
output_file.close()
absolute_path = os.path.abspath(path)
except OSError as e:
download_status = 'fail'
download_message = "OSError on an image...trying next one..." + " Error: " + str(e)
return_image_name = ''
except IOError as e:
download_status = 'fail'
download_message = "IOError on an image...trying next one..." + " Error: " + str(e)
return_image_name = ''
absolute_path = ''

#return image name back to calling method to use it for thumbnail downloads
download_status = 'success'
Expand All @@ -594,28 +598,39 @@ def download_image(self,image_url,image_format,main_directory,dir_name,count,pri
download_status = 'fail'
download_message = "UnicodeEncodeError on an image...trying next one..." + " Error: " + str(e)
return_image_name = ''
absolute_path = ''

except URLError as e:
download_status = 'fail'
download_message = "URLError on an image...trying next one..." + " Error: " + str(e)
return_image_name = ''
absolute_path = ''

except HTTPError as e: # If there is any HTTPError
download_status = 'fail'
download_message = "HTTPError on an image...trying next one..." + " Error: " + str(e)
return_image_name = ''
absolute_path = ''

except URLError as e:
download_status = 'fail'
download_message = "URLError on an image...trying next one..." + " Error: " + str(e)
return_image_name = ''
absolute_path = ''

except ssl.CertificateError as e:
download_status = 'fail'
download_message = "CertificateError on an image...trying next one..." + " Error: " + str(e)
return_image_name = ''
absolute_path = ''

except IOError as e: # If there is any IOError
download_status = 'fail'
download_message = "IOError on an image...trying next one..." + " Error: " + str(e)
return_image_name = ''
absolute_path = ''

return download_status,download_message,return_image_name
return download_status,download_message,return_image_name,absolute_path


# Finding 'Next Image' from the given raw page
Expand Down Expand Up @@ -650,6 +665,7 @@ def _get_next_item(self,s):
# Getting all links with the help of '_images_get_next_image'
def _get_all_items(self,page,main_directory,dir_name,limit,arguments):
items = []
abs_path = []
errorCount = 0
i = 0
count = 1
Expand All @@ -668,7 +684,7 @@ def _get_all_items(self,page,main_directory,dir_name,limit,arguments):
items.append(object) # Append all the links in the list named 'Links'

#download the images
download_status,download_message,return_image_name = self.download_image(object['image_link'],object['image_format'],main_directory,dir_name,count,arguments['print_urls'],arguments['socket_timeout'],arguments['prefix'],arguments['print_size'])
download_status,download_message,return_image_name,absolute_path = self.download_image(object['image_link'],object['image_format'],main_directory,dir_name,count,arguments['print_urls'],arguments['socket_timeout'],arguments['prefix'],arguments['print_size'])
print(download_message)
if download_status == "success":

Expand All @@ -678,6 +694,7 @@ def _get_all_items(self,page,main_directory,dir_name,limit,arguments):
print(download_message_thumbnail)

count += 1
abs_path.append(absolute_path)
else:
errorCount += 1

Expand All @@ -691,7 +708,7 @@ def _get_all_items(self,page,main_directory,dir_name,limit,arguments):
print("\n\nUnfortunately all " + str(
limit) + " could not be downloaded because some images were not downloadable. " + str(
count-1) + " is all we got for this search filter!")
return items,errorCount
return items,errorCount,abs_path


# Bulk Download
Expand Down Expand Up @@ -768,6 +785,7 @@ def download(self,arguments):
os.environ["https_proxy"] = arguments['proxy']
######Initialization Complete

paths = {}
for pky in prefix_keywords:
for sky in suffix_keywords: # 1.for every suffix keywords
i = 0
Expand All @@ -790,7 +808,8 @@ def download(self,arguments):
raw_html = self.download_extended_page(url,arguments['chromedriver'])

print("Starting Download...")
items,errorCount = self._get_all_items(raw_html,main_directory,dir_name,limit,arguments) #get all image items and download images
items,errorCount,abs_path = self._get_all_items(raw_html,main_directory,dir_name,limit,arguments) #get all image items and download images
paths[pky + search_keyword[i] + sky] = abs_path

#dumps into a text file
if arguments['extract_metadata']:
Expand Down Expand Up @@ -819,7 +838,9 @@ def download(self,arguments):

i += 1
print("\nErrors: " + str(errorCount) + "\n")
return
if arguments['print_paths']:
print(paths)
return paths

#------------- Main Program -------------#
def main():
Expand All @@ -832,7 +853,7 @@ def main():
else: # or download multiple images based on keywords/keyphrase search
t0 = time.time() # start the timer
response = googleimagesdownload()
response.download(arguments)
paths = response.download(arguments) #wrapping response in a variable just for consistency

print("\nEverything downloaded!")
t1 = time.time() # stop the timer
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from codecs import open
from os import path

__version__ = '2.1.2'
__version__ = '2.2.0'

here = path.abspath(path.dirname(__file__))

Expand Down

0 comments on commit 66b5fff

Please sign in to comment.