Commit

Add documentation for crawl_images
eliasdabbas committed Mar 22, 2024
1 parent a54a075 commit 036e925
Showing 3 changed files with 109 additions and 30 deletions.
1 change: 1 addition & 0 deletions advertools/crawlytics.py
@@ -10,6 +10,7 @@
<iframe width="560" height="315" src="https://www.youtube.com/embed/rt0LhxNW8GM?si=Pm5v7JKUK5CiS-Lo" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" allowfullscreen></iframe>
|
There are certain columns in the crawl DataFrame that can be analyzed separately and
independently, like page size and status codes. They can, of course, also be analyzed
together with other columns, like URL and title, to put their data in context.
137 changes: 107 additions & 30 deletions advertools/image_spider.py
@@ -1,3 +1,70 @@
"""
Image Crawler and Downloader
============================
**Experimental feature - expect changes**

This is a crawler that downloads all images on a given list of URLs. Using
:func:`crawl_images` is straightforward:

>>> import advertools as adv
>>> adv.crawl_images([URL_1, URL_2, URL_3, ...], "output_dir")

This goes to the supplied URLs, downloads all images found on them, and places
them in ``output_dir``.
You can set a few conditions to modify this behavior:

* ``min_width``: The minimum width in pixels for an image to be downloaded. This is
  mainly to avoid downloading logos, tracking pixels, navigational elements rendered
  as images, and so on.
* ``min_height``: The minimum height in pixels for an image to be downloaded.
* ``include_img_regex``: A regular expression that the image path needs to match in
  order for it to be downloaded. After checking the patterns of your image URLs, for
  example, you might want to download only images whose path contains "sports", or
  only images under the /economy/ folder.
* ``custom_settings``: Just like the other crawl functions, you can set any custom
  settings you want to control the crawler's behavior, for example changing the
  User-Agent or (dis)obeying robots.txt rules. More options and code details can be
  found in the :ref:`crawling strategies <crawl_strategies>` page. A combined example
  follows this list.
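
A hypothetical call combining these options (the URLs, dimensions, and pattern are
illustrative only, not recommendations):

>>> adv.crawl_images(
...     ["https://example.com/economy/", "https://example.com/sports/"],
...     "output_dir",
...     min_width=300,
...     min_height=200,
...     include_img_regex="sports|economy",
...     custom_settings={"ROBOTSTXT_OBEY": False},
... )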

To run the :func:`crawl_images` function you need to set an ``output_dir``. This is
where all the images will be downloaded, together with a summary file containing
details about them. You can read this file through the special function
:func:`summarize_crawled_imgs` to get a few more details about those images:

>>> adv.summarize_crawled_imgs("path/to/output_dir")
==== ============================================================================================== ==========================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================
.. image_location image_urls
==== ============================================================================================== ==========================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================
0 https://www.buzzfeed.com/hannahdobro/dirty-little-industry-secrets?origin=tuh https://img.buzzfeed.com/buzzfeed-static/static/user_images/6r1oxXOpC_large.jpg?downsize=120:*&output-format=jpg&output-quality=auto
0 https://www.buzzfeed.com/hannahdobro/dirty-little-industry-secrets?origin=tuh https://img.buzzfeed.com/buzzfeed-static/static/2024-03/18/16/asset/fce856744ed8/sub-buzz-1303-1710779249-1.jpg
0 https://www.buzzfeed.com/hannahdobro/dirty-little-industry-secrets?origin=tuh data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7
0 https://www.buzzfeed.com/hannahdobro/dirty-little-industry-secrets?origin=tuh https://img.buzzfeed.com/buzzfeed-static/static/2024-03/18/16/asset/245ecfa321e9/sub-buzz-894-1710779358-1.jpg
1 https://www.buzzfeed.com/chelseastewart/josh-peck-statement-drake-bell-abuse-claims?origin=tuh https://img.buzzfeed.com/buzzfeed-static/static/2017-12/12/13/user_images/buzzfeed-prod-web-03/chelseastewart-v2-5590-1513102854-0_large.jpg?downsize=120:*&output-format=jpg&output-quality=auto
1 https://www.buzzfeed.com/chelseastewart/josh-peck-statement-drake-bell-abuse-claims?origin=tuh https://img.buzzfeed.com/buzzfeed-static/static/2024-03/21/19/asset/ea6298160040/sub-buzz-1093-1711048323-1.jpg?downsize=700%3A%2A&output-quality=auto&output-format=auto
1 https://www.buzzfeed.com/chelseastewart/josh-peck-statement-drake-bell-abuse-claims?origin=tuh data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7
1 https://www.buzzfeed.com/chelseastewart/josh-peck-statement-drake-bell-abuse-claims?origin=tuh data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAFQAAAA7CAMAAADSF118AAAAP1BMVEUAAADIGxPOHBK5EwDFGhi5Fwi8GRTEGhe7EQDMHR7////vyMfddnm5Hx334+Py8fHdj5DLVVXnq6zJOTzVbG1s8SkwAAAACXRSTlMAv4Eo10JnqA8IHfydAAABJUlEQVRYw93Y64rCMBCG4czk5FSzdav3f63bDaxfV4Qm+AXR96/wMNj0kLhtPib9LcutYA8K+F1rKXqH4KmIPZVIOvwnszEqumFjMVLB3+YsRiv8zRqMWHa1ZNQiBuUV3Jo3cn5FlY3qimY2KitajB3+UmLRxRGovgmqTj4HXc69aN5Hj9PcyYqzfXSavk58tJMNTWgv24pW9kpE0fGbioKlomCZKNgLEUXLhYiiMx+dT+xJ8SxgoCDZ6EJcp7jsPBQLlIbiVmpEwy7aS1poeZ30PvqlAQVJRGeQtLfp1dBLPyb0bdDER+OYL2nHR7E34yUjtjw6ZMc3am/KXlSpoodCHrQWiWbxI85Q6Kc9pneHSCmHJ0VJGPPuAC3LWqO/OURL0aEfg76m8Izrt6EAAAAASUVORK5CYII=
2 https://www.buzzfeed.com/josephlongo/celebs-wearing-rewearing-same-dress?origin=tuh https://img.buzzfeed.com/buzzfeed-static/static/2021-06/3/16/user_images/a824550933a9/tomiobaro-v2-2174-1622738336-41_large.jpg?downsize=120:*&output-format=jpg&output-quality=auto
2 https://www.buzzfeed.com/josephlongo/celebs-wearing-rewearing-same-dress?origin=tuh https://img.buzzfeed.com/buzzfeed-static/static/2024-03/19/13/asset/6634db63f453/sub-buzz-576-1710855734-6.jpg?downsize=700%3A%2A&output-quality=auto&output-format=auto
2 https://www.buzzfeed.com/josephlongo/celebs-wearing-rewearing-same-dress?origin=tuh https://img.buzzfeed.com/buzzfeed-static/static/2024-03/19/13/asset/cb8db05df7e7/sub-buzz-1743-1710855790-4.jpg
2 https://www.buzzfeed.com/josephlongo/celebs-wearing-rewearing-same-dress?origin=tuh data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7
==== ============================================================================================== ==========================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================================
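
Since this function returns a regular DataFrame, further analysis is ordinary pandas
work. A small sketch (using the column names shown above) that counts how many images
were found per crawled page:

>>> img_df = adv.summarize_crawled_imgs("path/to/output_dir")
>>> img_df.groupby("image_location")["image_urls"].count()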

Image file names
----------------

The downloaded images naturally need to be given names, and each name is taken from
the slug of the image URL, excluding any query parameters or slashes. The full URLs
of those images, together with the pages they were found on, are stored in the
summary file, which you can access through :func:`summarize_crawled_imgs`, as shown
in the table above.
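
The naming rule can be illustrated with one of the URLs from the table above; the
query parameters are dropped because only the URL's path is used:

>>> from urllib.parse import urlsplit
>>> url = "https://img.buzzfeed.com/buzzfeed-static/static/user_images/6r1oxXOpC_large.jpg?downsize=120:*&output-format=jpg&output-quality=auto"
>>> urlsplit(url).path.split("/")[-1]
'6r1oxXOpC_large.jpg'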
"""

import json
import re
import subprocess
@@ -9,9 +76,9 @@

import advertools as adv

-image_spider_path = adv.__path__[0] + '/image_spider.py'
+image_spider_path = adv.__path__[0] + "/image_spider.py"

-user_agent = f'advertools/{adv.__version__}'
+user_agent = f"advertools/{adv.__version__}"


class ImgItem(Item):
@@ -23,24 +23,90 @@ class ImgItem(Item):
class AdvImagesPipeline(ImagesPipeline):
    def file_path(self, request, response=None, info=None, *, item=None):
        img_url = request.url
-        return urlsplit(img_url).path.split('/')[-1]
+        return urlsplit(img_url).path.split("/")[-1]


class ImageSpider(Spider):
-    name = 'image_spider'
+    name = "image_spider"
    include_img_regex = None
    custom_settings = {
-        'USER_AGENT': user_agent,
-        'ROBOTSTXT_OBEY': True,
-        'HTTPERROR_ALLOW_ALL': True,
-        'ITEM_PIPELINES': {AdvImagesPipeline: 1},
-        'AUTOTHROTTLE_ENABLED': True,
-        'AUTOTHROTTLE_TARGET_CONCURRENCY': 8,
+        "USER_AGENT": user_agent,
+        "ROBOTSTXT_OBEY": True,
+        "HTTPERROR_ALLOW_ALL": True,
+        "ITEM_PIPELINES": {AdvImagesPipeline: 1},
+        "AUTOTHROTTLE_ENABLED": True,
+        "AUTOTHROTTLE_TARGET_CONCURRENCY": 8,
    }

    def __init__(self, start_urls, include_img_regex=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
-        self.start_urls = json.loads(json.dumps(start_urls.split(',')))
+        self.start_urls = json.loads(json.dumps(start_urls.split(",")))
        if include_img_regex is not None:
            self.include_img_regex = include_img_regex

@@ -50,14 +117,17 @@ def start_requests(self):

    def parse(self, response):
        img_item = ImgItem()
-        img_src = response.xpath('//img/@src').getall()
+        img_src = response.xpath("//img/@src").getall()
        if self.include_img_regex is not None:
-            img_src = [response.urljoin(src) for src in img_src
-                       if re.findall(self.include_img_regex, src)]
+            img_src = [
+                response.urljoin(src)
+                for src in img_src
+                if re.findall(self.include_img_regex, src)
+            ]
        else:
            img_src = [response.urljoin(src) for src in img_src]
-        img_item['image_urls'] = img_src
-        img_item['image_location'] = response.request.url
+        img_item["image_urls"] = img_src
+        img_item["image_location"] = response.request.url
        yield img_item


@@ -67,7 +137,7 @@ def crawl_images(
    min_width=0,
    min_height=0,
    include_img_regex=None,
-    custom_settings=None
+    custom_settings=None,
):
    """Download all images available on start_urls and save them to output_dir.
@@ -94,21 +164,28 @@
    if custom_settings is not None:
        for key, val in custom_settings.items():
            if isinstance(val, dict):
-                setting = '='.join([key, json.dumps(val)])
+                setting = "=".join([key, json.dumps(val)])
            else:
-                setting = '='.join([key, str(val)])
-            settings_list.extend(['-s', setting])
+                setting = "=".join([key, str(val)])
+            settings_list.extend(["-s", setting])
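    # Each custom setting becomes a "-s" command-line override for scrapy; e.g. the
    # hypothetical input {"USER_AGENT": "my-bot", "DOWNLOAD_DELAY": 2} produces
    # ["-s", "USER_AGENT=my-bot", "-s", "DOWNLOAD_DELAY=2"]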

    command = [
-        'scrapy', 'runspider', image_spider_path,
-        '-a', 'start_urls=' + ','.join(start_urls),
-        '-s', 'IMAGES_STORE=' + output_dir,
-        '-s', 'IMAGES_MIN_HEIGHT=' + str(min_height),
-        '-s', 'IMAGES_MIN_WIDTH=' + str(min_width),
-        '-o', output_dir + '/image_summary.jl'
-    ] + settings_list
+        "scrapy",
+        "runspider",
+        image_spider_path,
+        "-a",
+        "start_urls=" + ",".join(start_urls),
+        "-s",
+        "IMAGES_STORE=" + output_dir,
+        "-s",
+        "IMAGES_MIN_HEIGHT=" + str(min_height),
+        "-s",
+        "IMAGES_MIN_WIDTH=" + str(min_width),
+        "-o",
+        output_dir + "/image_summary.jl",
+    ] + settings_list
    if include_img_regex is not None:
-        command += ['-a', 'include_img_regex=' + include_img_regex]
+        command += ["-a", "include_img_regex=" + include_img_regex]
    subprocess.run(command)


@@ -128,5 +205,5 @@ def summarize_crawled_imgs(image_dir):
    image_dir : str
        The path to the directory that you provided to crawl_images
    """
-    df = pd.read_json(image_dir.rstrip('/') + '/image_summary.jl', lines=True)
-    return df[['image_location', 'image_urls']].explode('image_urls')
+    df = pd.read_json(image_dir.rstrip("/") + "/image_summary.jl", lines=True)
+    return df[["image_location", "image_urls"]].explode("image_urls")
1 change: 1 addition & 0 deletions docs/index.rst
@@ -89,6 +89,7 @@ To install advertools, run the following from the command line::
   Crawl Strategies <advertools.code_recipes.spider_strategies>
   Crawl Analytics <advertools.crawlytics>
   Crawl headers (HEAD method only) <advertools.header_spider>
+   Crawl images <advertools.image_spider>
   Crawl Logs Analysis <advertools.logs>
   Reverse DNS Lookup <advertools.reverse_dns_lookup>
   Analyze Search Engine Results (SERPs) <advertools.serp>
