Commit
feature to download all the relevant images to the keyword provided
Vasa committed Mar 31, 2018
1 parent a252a2d commit 655a7fa
Showing 3 changed files with 73 additions and 3 deletions.
19 changes: 19 additions & 0 deletions README.rst
@@ -108,6 +108,15 @@ Arguments
| | | |
| | | If this value is not specified, it defaults to 100. |
+-------------------+-------------+-------------------------------------------------------------------------------------------------------------------------------+
| related_images | ri | This argument downloads a large number of additional images related to the keyword you provided. |
| | | |
| | | The Google Images page returns a list of keywords related to the one you mentioned in the query. This tool downloads |
| | | images for each of those related keywords, up to the limit you specified in your query (see the example after this table). |
| | | |
| | | This argument does not take any value. Just add '--related_images' or '-ri' in your query. |
| | | |
| | | **Note:** This argument can download hundreds or thousands of additional images so please use this carefully. |
+-------------------+-------------+-------------------------------------------------------------------------------------------------------------------------------+
| format | f | Denotes the format/extension of the image that you want to download. |
| | | |
| | | `Possible values: jpg, gif, png, bmp, svg, webp, ico` |
@@ -176,18 +185,26 @@ Arguments
| specific_site | ss | Allows you to download images with keywords only from a specific website/domain name you mention. |
+-------------------+-------------+-------------------------------------------------------------------------------------------------------------------------------+
| print_urls | p | Prints the URLs of the images on the console. These image URLs can be used for debugging purposes. |
| | | |
| | | This argument does not take any value. Just add '--print_urls' or '-p' in your query. |
+-------------------+-------------+-------------------------------------------------------------------------------------------------------------------------------+
| print_size | ps | Prints the size of the images on the console. |
| | | |
| | | The size denotes the actual size of the image, not the size of the image on disk. |
| | | |
| | | This argument does not take any value. Just add '--print_size' or '-ps' in your query. |
+-------------------+-------------+-------------------------------------------------------------------------------------------------------------------------------+
| metadata | m | Prints the metadata of the image on the console. |
| | | |
| | | This includes image size, origin, image attributes, description, image URL, etc. |
| | | |
| | | This argument does not take any value. Just add '--metadata' or '-m' in your query. |
+-------------------+-------------+-------------------------------------------------------------------------------------------------------------------------------+
| extract_metadata | e | This option allows you to save metadata of all the downloaded images in a text file. |
| | | |
| | | This file can be found in the ``logs/`` directory. The name of the file will be the same as the keyword name. |
| | | |
| | | This argument does not take any value. Just add '--extract_metadata' or '-e' in your query. |
+-------------------+-------------+-------------------------------------------------------------------------------------------------------------------------------+
| socket_timeout | st | Allows you to specify the time to wait for a socket connection. |
| | | |
@@ -196,6 +213,8 @@
| thumbnail | th | Downloads image thumbnails corresponding to each image downloaded. |
| | | |
| | | Thumbnails are saved in their own sub-directories inside of the main directory. |
| | | |
| | | This argument does not take any value. Just add '--thumbnail' or '-th' in your query. |
+-------------------+-------------+-------------------------------------------------------------------------------------------------------------------------------+
| language | la | Defines the language filter. The search results are automatically returned in that language. |
| | | |
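As a quick illustration of the new flag, here is a minimal sketch of an invocation; the keyword and limit are made up, and it assumes the script is run directly from the repository (the installed console command, if any, accepts the same flags):

    python google_images_download.py --keywords "cats" --limit 20 --related_images --print_urls

With ``--related_images`` set, the tool first downloads up to 20 images for "cats" and then repeats the download for every related keyword Google suggests, so the total image count can be several times the stated limit.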
55 changes: 53 additions & 2 deletions google_images_download/google_images_download.py
@@ -85,6 +85,7 @@
parser.add_argument('-pr', '--prefix', default=False, help="A word that you would want to prefix in front of each image name", type=str, required=False)
parser.add_argument('-px', '--proxy', help='specify a proxy address and port', type=str, required=False)
parser.add_argument('-cd', '--chromedriver', help='specify the path to chromedriver executable in your local machine', type=str, required=False)
parser.add_argument('-ri', '--related_images', default=False, help="Downloads images that are similar to the keyword provided", action="store_true")

args = parser.parse_args()
arguments = vars(args)
@@ -135,8 +136,9 @@ def download_extended_page(url):
    try:
        browser = webdriver.Chrome(arguments['chromedriver'], chrome_options=options)
    except:
        print("Looks like we cannot locate the path the 'chromedriver'. Please use the '--chromedriver' "
              "argument to specify the path to the executable.")
        print("Looks like we cannot locate the path to 'chromedriver' (use the '--chromedriver' "
              "argument to specify the path to the executable) or the Google Chrome browser is not "
              "installed on your machine")
        sys.exit()
    browser.set_window_size(1024, 768)

@@ -178,6 +180,41 @@ def repair(brokenjson):
    invalid_escape = re.compile(r'\\[0-7]{1,3}')  # up to 3 digits for byte values up to FF
    return invalid_escape.sub(replace_with_byte, brokenjson)

# Finding the next related-keyword tab in the given raw page
def get_next_tab(s):
    start_line = s.find('class="ZO5Spb"')
    if start_line == -1:  # If no more tabs are found, return a sentinel value
        end_quote = 0
        link = "no_tabs"
        return link, '', end_quote
    else:
        start_line = s.find('class="ZO5Spb"')
        start_content = s.find('href="', start_line + 1)
        end_content = s.find('">', start_content + 1)
        url_item = "https://www.google.com" + str(s[start_content+6:end_content])
        url_item = url_item.replace('&amp;', '&')

        start_line_2 = s.find('class="ZO5Spb"')
        start_content_2 = s.find(':', start_line_2 + 1)
        end_content_2 = s.find('"', start_content_2 + 1)
        url_item_name = str(s[start_content_2 + 1:end_content_2])

        #print(url_item,url_item_name)
        return url_item, url_item_name, end_content


# Getting all the related-keyword tabs with the help of 'get_next_tab'
def get_all_tabs(page):
    tabs = {}
    while True:
        item, item_name, end_content = get_next_tab(page)
        if item == "no_tabs":
            break
        else:
            tabs[item_name] = item  # Store each tab's link in the 'tabs' dictionary, keyed by its name
            time.sleep(0.1)  # Short pause between iterations while scanning the page
            page = page[end_content:]
    return tabs

#Format the object in readable format
def format_object(object):
@@ -613,6 +650,20 @@ def bulk_download(search_keyword,suffix_keywords,limit,main_directory):
            text_file.write(json.dumps(items, indent=4, sort_keys=True))
            text_file.close()

        # Related images
        if arguments['related_images']:
            print("\nGetting list of related keywords...this may take a few moments")
            tabs = get_all_tabs(raw_html)
            for key, value in tabs.items():
                final_search_term = (search_term + " - " + key)
                print("\nNow Downloading - " + final_search_term)
                if limit < 101:
                    new_raw_html = download_page(value)  # download page
                else:
                    new_raw_html = download_extended_page(value)
                create_directories(main_directory, final_search_term)
                _get_all_items(new_raw_html, main_directory, search_term + " - " + key, limit)

        i += 1
    return errorCount

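The tail of repair() visible in one of the hunks above handles octal byte escapes (such as \051) that the results page sometimes embeds and that json.loads() rejects. The sketch below illustrates the idea; replace_with_byte here is an assumed stand-in for the helper defined earlier in the file, which this diff does not show.

import json
import re

invalid_escape = re.compile(r'\\[0-7]{1,3}')   # up to 3 octal digits per escape

def replace_with_byte(match):
    # Assumed stand-in: turn an octal escape such as "\051" into its character ")"
    return chr(int(match.group(0)[1:], 8))

broken = '{"image": "photo\\051.jpg"}'         # the escape \051 is invalid JSON
repaired = invalid_escape.sub(replace_with_byte, broken)
print(json.loads(repaired))                    # {'image': 'photo).jpg'}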
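To make the string-scanning approach in the new get_next_tab/get_all_tabs helpers easier to follow, here is a minimal, self-contained sketch of the same idea. It is not the commit's code: scrape_related_tabs and the sample markup are invented for illustration, and the keyword-text extraction is simplified compared with the real parser.

# Scan raw HTML for anchors carrying class="ZO5Spb", pull out each href and
# its keyword text, and collect them into {related keyword: Google Images URL}.
def scrape_related_tabs(page):
    tabs = {}
    while True:
        start = page.find('class="ZO5Spb"')
        if start == -1:                      # no more related-keyword tabs
            break
        href_start = page.find('href="', start) + len('href="')
        href_end = page.find('"', href_start)
        url = "https://www.google.com" + page[href_start:href_end].replace('&amp;', '&')
        name_start = page.find('>', href_end) + 1
        name_end = page.find('<', name_start)
        tabs[page[name_start:name_end]] = url
        page = page[name_end:]               # continue scanning the rest of the page
    return tabs

# Made-up sample markup, just to show the shape of the output
sample = ('<a class="ZO5Spb" href="/search?q=siamese+cat&amp;tbm=isch">siamese</a>'
          '<a class="ZO5Spb" href="/search?q=persian+cat&amp;tbm=isch">persian</a>')
print(scrape_related_tabs(sample))
# {'siamese': 'https://www.google.com/search?q=siamese+cat&tbm=isch',
#  'persian': 'https://www.google.com/search?q=persian+cat&tbm=isch'}

In bulk_download, each of these URLs is then fetched with download_page when the limit is under 101 and with the selenium-driven download_extended_page otherwise, presumably because a non-scrolled results page only exposes roughly the first hundred images.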
2 changes: 1 addition & 1 deletion setup.py
@@ -2,7 +2,7 @@
from codecs import open
from os import path

__version__ = '1.3.2'
__version__ = '1.4.2'

here = path.abspath(path.dirname(__file__))

