testing multiprocessing for faster finds! #63

Merged · 2 commits · Mar 27, 2022
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -12,6 +12,7 @@ and **Merged pull requests**. Critical items to know are:
Referenced versions in headers are tagged on Github, in parentheses are for pypi.

## [vxx](https://github.com/urlstechie/urlschecker-python/tree/master) (master)
- multiprocessing to speed up checks (0.0.26)
- bug fix for verbose option to only print file names that have failures (0.0.25)
- adding option to print a summary that contains file names and urls (0.0.24)
- updating container base to use debian buster and adding certifi (0.0.23)
2 changes: 1 addition & 1 deletion LICENSE
@@ -1,6 +1,6 @@
MIT License

Copyright (c) 2020-2021 Ayoub Malek and Vanessa Sochat
Copyright (c) 2020-2022 Ayoub Malek and Vanessa Sochat

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
4 changes: 3 additions & 1 deletion README.md
@@ -8,7 +8,9 @@
This is a python module to collect urls over static files (code and documentation)
and then test for and report broken links. If you are interested in using
this as a GitHub action, see [urlchecker-action](https://github.com/urlstechie/urlchecker-action). There are also container
bases available on [quay.io/urlstechie/urlchecker](https://quay.io/repository/urlstechie/urlchecker?tab=tags).
bases available on [quay.io/urlstechie/urlchecker](https://quay.io/repository/urlstechie/urlchecker?tab=tags). As of version
0.0.26, we use multiprocessing so the checks run a lot faster, and you can set `URLCHECKER_WORKERS` to change the number of workers
(defaults to 9). If you don't want multiprocessing, use version 0.0.25 or earlier.
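The worker-count behavior the README describes can be sketched in a few lines; `get_worker_count` is a hypothetical helper for illustration, mirroring how the new `Workers` class reads `URLCHECKER_WORKERS`:

```python
import os


def get_worker_count(default=9):
    # URLCHECKER_WORKERS overrides the documented default of 9 workers
    return int(os.environ.get("URLCHECKER_WORKERS", default))


os.environ["URLCHECKER_WORKERS"] = "4"
print(get_worker_count())  # 4
```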

## Module Documentation

2 changes: 1 addition & 1 deletion docs/source/fileproc.rst
@@ -1,5 +1,5 @@
urlchecker.core.fileproc
==========================
========================


.. automodule:: urlchecker.core.fileproc
11 changes: 2 additions & 9 deletions urlchecker/__init__.py
@@ -1,10 +1,3 @@
"""

Copyright (c) 2020-2021 Ayoub Malek and Vanessa Sochat

This source code is licensed under the terms of the MIT license.
For a copy, see <https://opensource.org/licenses/MIT>.

"""

from urlchecker.version import __version__

assert __version__
2 changes: 1 addition & 1 deletion urlchecker/client/__init__.py
@@ -2,7 +2,7 @@

"""

Copyright (c) 2020-2021 Ayoub Malek and Vanessa Sochat
Copyright (c) 2020-2022 Ayoub Malek and Vanessa Sochat

This source code is licensed under the terms of the MIT license.
For a copy, see <https://opensource.org/licenses/MIT>.
6 changes: 3 additions & 3 deletions urlchecker/client/check.py
@@ -1,6 +1,6 @@
"""
client/github.py: entrypoint for interaction with a GitHub repository.
Copyright (c) 2020-2021 Ayoub Malek and Vanessa Sochat
Copyright (c) 2020-2022 Ayoub Malek and Vanessa Sochat
"""

import re
@@ -106,9 +106,9 @@ def main(args, extra):
if args.verbose:
print("\n\U0001F914 Uh oh... The following urls did not pass:")
for file_name, result in checker.checks.items():
if result.failed:
if result["failed"]:
print_failure(file_name + ":")
for url in result.failed:
for url in result["failed"]:
print_failure(" " + url)
else:
print("\n\U0001F914 Uh oh... The following urls did not pass:")
84 changes: 61 additions & 23 deletions urlchecker/core/check.py
@@ -1,6 +1,6 @@
"""

Copyright (c) 2020-2021 Ayoub Malek and Vanessa Sochat
Copyright (c) 2020-2022 Ayoub Malek and Vanessa Sochat

This source code is licensed under the terms of the MIT license.
For a copy, see <https://opensource.org/licenses/MIT>.
@@ -12,6 +12,7 @@
import re
import sys
from urlchecker.core import fileproc
from urlchecker.core.worker import Workers
from urlchecker.core.urlproc import UrlCheckResult


@@ -41,6 +42,8 @@ def __init__(
"""
# Initiate results object, and checks lookup (holds UrlCheck) for each file
self.results = {"passed": set(), "failed": set(), "excluded": set()}

# Results organized by filename
self.checks = {}

# Save run parameters
@@ -123,12 +126,18 @@ def save_results(self, file_path, sep=",", header=None, relative_paths=True):
else:
file_name = os.path.relpath(file_name)

[writer.writerow([url, "failed", file_name]) for url in result.failed]
[
writer.writerow([url, "failed", file_name])
for url in result["failed"]
]
[
writer.writerow([url, "excluded", file_name])
for url in result.excluded
for url in result["excluded"]
]
[
writer.writerow([url, "passed", file_name])
for url in result["passed"]
]
[writer.writerow([url, "passed", file_name]) for url in result.passed]

return file_path

@@ -161,27 +170,56 @@ def run(
exclude_urls = exclude_urls or []
exclude_patterns = exclude_patterns or []

# loop through files files
for file_name in file_paths:

# Instantiate a checker to extract urls
checker = UrlCheckResult(
file_name=file_name,
exclude_patterns=exclude_patterns,
exclude_urls=exclude_urls,
print_all=self.print_all,
)

# Check the urls
checker.check_urls(retry_count=retry_count, timeout=timeout)
# Run with multiprocessing
tasks = {}
funcs = {}
workers = Workers()

# Update flattened results
self.results["failed"].update(checker.failed)
self.results["passed"].update(checker.passed)
self.results["excluded"].update(checker.excluded)
# loop through files
for file_name in file_paths:

# Save the checker in the lookup
self.checks[file_name] = checker
# Export parameters and functions, use the same check task for all
tasks[file_name] = {
"file_name": file_name,
"exclude_patterns": exclude_patterns,
"exclude_urls": exclude_urls,
"print_all": self.print_all,
"retry_count": retry_count,
"timeout": timeout,
}
funcs[file_name] = check_task

results = workers.run(funcs, tasks)
for file_name, result in results.items():
self.checks[file_name] = result
self.results["failed"].update(result["failed"])
self.results["passed"].update(result["passed"])
self.results["excluded"].update(result["excluded"])

# A flattened dict of passed and failed
return self.results


def check_task(*args, **kwargs):
"""
A checking task, the default we use
"""
# Instantiate a checker to extract urls
checker = UrlCheckResult(
file_name=kwargs["file_name"],
exclude_patterns=kwargs.get("exclude_patterns", []),
exclude_urls=kwargs.get("exclude_urls", []),
print_all=kwargs.get("print_all", True),
)

# Check the urls
checker.check_urls(
retry_count=kwargs.get("retry_count", 2), timeout=kwargs.get("timeout", 5)
)

# Update flattened results
return {
"failed": checker.failed,
"passed": checker.passed,
"excluded": checker.excluded,
}
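The fan-out/merge pattern that the new `run` method and `check_task` implement above can be sketched without a real pool; `serial_run` and `check_task_stub` are hypothetical stand-ins for `Workers.run` and `check_task`, kept only to show the task/function dictionaries and the result merge:

```python
def check_task_stub(file_name, urls=(), **kwargs):
    # Hypothetical stand-in for check_task: classify each url for one file
    passed = {u for u in urls if "good" in u}
    return {"passed": passed, "failed": set(urls) - passed, "excluded": set()}


def serial_run(funcs, tasks):
    # Stand-in for Workers.run: same contract, but runs tasks in-process
    return {name: funcs[name](**params) for name, params in tasks.items()}


# One task (a dict of keyword arguments) and one function per file
tasks = {
    "a.md": {"file_name": "a.md", "urls": ["https://good.example", "https://bad.example"]},
    "b.md": {"file_name": "b.md", "urls": ["https://good.example/docs"]},
}
funcs = {name: check_task_stub for name in tasks}

# Merge per-file results into a flattened results dict, as run() does
results = {"passed": set(), "failed": set(), "excluded": set()}
for name, result in serial_run(funcs, tasks).items():
    for key in results:
        results[key].update(result[key])

print(sorted(results["failed"]))  # ['https://bad.example']
```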
2 changes: 1 addition & 1 deletion urlchecker/core/exclude.py
@@ -1,6 +1,6 @@
"""

Copyright (c) 2020-2021 Ayoub Malek and Vanessa Sochat
Copyright (c) 2020-2022 Ayoub Malek and Vanessa Sochat

This source code is licensed under the terms of the MIT license.
For a copy, see <https://opensource.org/licenses/MIT>.
2 changes: 1 addition & 1 deletion urlchecker/core/fileproc.py
@@ -1,6 +1,6 @@
"""

Copyright (c) 2020-2021 Ayoub Malek and Vanessa Sochat
Copyright (c) 2020-2022 Ayoub Malek and Vanessa Sochat

This source code is licensed under the terms of the MIT license.
For a copy, see <https://opensource.org/licenses/MIT>.
2 changes: 1 addition & 1 deletion urlchecker/core/urlmarker.py
@@ -4,7 +4,7 @@
http://daringfireball.net/2010/07/improved_regex_for_matching_urls
https://gist.github.com/gruber/8891611

Copyright (c) 2020-2021 Ayoub Malek and Vanessa Sochat
Copyright (c) 2020-2022 Ayoub Malek and Vanessa Sochat

This source code is licensed under the terms of the MIT license.
For a copy, see <https://opensource.org/licenses/MIT>.
7 changes: 1 addition & 6 deletions urlchecker/core/urlproc.py
@@ -1,6 +1,6 @@
"""

Copyright (c) 2020-2021 Ayoub Malek and Vanessa Sochat
Copyright (c) 2020-2022 Ayoub Malek and Vanessa Sochat

This source code is licensed under the terms of the MIT license.
For a copy, see <https://opensource.org/licenses/MIT>.
@@ -168,14 +168,9 @@ def check_urls(self, urls=None, retry_count=1, timeout=5):
# if no urls are found, mention it if required
if not urls:
if self.print_all:
if self.file_name:
print("\n", self.file_name, "\n", "-" * len(self.file_name))
print("No urls found.")
return

if self.file_name:
print("\n", self.file_name, "\n", "-" * len(self.file_name))

# init seen urls list
seen = set()

109 changes: 109 additions & 0 deletions urlchecker/core/worker.py
@@ -0,0 +1,109 @@
"""

Copyright (c) 2020-2022 Ayoub Malek and Vanessa Sochat

This source code is licensed under the terms of the MIT license.
For a copy, see <https://opensource.org/licenses/MIT>.

"""

import itertools
import multiprocessing
import os
import time
import signal
import sys

from urlchecker.logger import get_logger

logger = get_logger()


class Workers:
def __init__(self, workers=None):

if workers is None:
workers = int(os.environ.get("URLCHECKER_WORKERS", 9))
self.workers = workers
logger.debug(f"Using {self.workers} workers for multiprocess.")

def start(self):
logger.debug("Starting multiprocess")
self.start_time = time.time()

def end(self):
self.end_time = time.time()
self.runtime = self.end_time - self.start_time
logger.debug(f"Ending multiprocess, runtime: {self.runtime} sec")

def run(self, funcs, tasks):
"""run will send each task, a dict of keyword arguments, through its
matching function using the multiprocessing pool.

Parameters
==========
funcs: the functions to run with multiprocessing.pool, a dictionary
with lookup by the task name
tasks: a dict of tasks, each task name (key) mapped to a
dict of keyword arguments to process
"""
# Number of tasks must == number of functions
assert len(funcs) == len(tasks)

# Keep track of some progress for the user
progress = 1

# if we don't have tasks, don't run
if not tasks:
return {}

# results will also have the same key to look up
finished = dict()
results = []

try:
pool = multiprocessing.Pool(self.workers, init_worker)

self.start()
for key, params in tasks.items():
func = funcs[key]
result = pool.apply_async(multi_wrapper, multi_package(func, [params]))

# Store the key with the result
results.append((key, result))

while len(results) > 0:
pair = results.pop()
key, result = pair
result.wait()
progress += 1
finished[key] = result.get()

self.end()
pool.close()
pool.join()

except (KeyboardInterrupt, SystemExit):
logger.error("Keyboard interrupt detected, terminating workers!")
pool.terminate()
sys.exit(1)

except:
logger.exit("Error running task")

return finished


# Supporting functions for MultiProcess Worker
def init_worker():
signal.signal(signal.SIGINT, signal.SIG_IGN)


def multi_wrapper(func_args):
function, kwargs = func_args
return function(**kwargs)


def multi_package(func, kwargs):
zipped = zip(itertools.repeat(func), kwargs)
return zipped
2 changes: 1 addition & 1 deletion urlchecker/logger.py
@@ -1,6 +1,6 @@
"""

Copyright (c) 2020-2021 Ayoub Malek and Vanessa Sochat
Copyright (c) 2020-2022 Ayoub Malek and Vanessa Sochat

This source code is licensed under the terms of the MIT license.
For a copy, see <https://opensource.org/licenses/MIT>.
2 changes: 1 addition & 1 deletion urlchecker/main/github.py
@@ -1,6 +1,6 @@
"""

Copyright (c) 2020-2021 Ayoub Malek and Vanessa Sochat
Copyright (c) 2020-2022 Ayoub Malek and Vanessa Sochat

This source code is licensed under the terms of the MIT license.
For a copy, see <https://opensource.org/licenses/MIT>.
2 changes: 1 addition & 1 deletion urlchecker/main/utils.py
@@ -1,6 +1,6 @@
"""

Copyright (c) 2020-2021 Ayoub Malek and Vanessa Sochat
Copyright (c) 2020-2022 Ayoub Malek and Vanessa Sochat

This source code is licensed under the terms of the MIT license.
For a copy, see <https://opensource.org/licenses/MIT>.
4 changes: 2 additions & 2 deletions urlchecker/version.py
@@ -1,13 +1,13 @@
"""

Copyright (c) 2020-2021 Ayoub Malek and Vanessa Sochat
Copyright (c) 2020-2022 Ayoub Malek and Vanessa Sochat

This source code is licensed under the terms of the MIT license.
For a copy, see <https://opensource.org/licenses/MIT>.

"""

__version__ = "0.0.25"
__version__ = "0.0.26"
AUTHOR = "Ayoub Malek, Vanessa Sochat"
AUTHOR_EMAIL = "superkogito@gmail.com, vsochat@stanford.edu"
NAME = "urlchecker"