Skip to content

Commit 1f9bd5c

Browse files
authored
testing multiprocessing for faster finds! (#63)
* testing multiprocessing for faster finds! * remove extra verbose logging of task info Signed-off-by: vsoch <vsoch@users.noreply.github.com> Co-authored-by: vsoch <vsoch@users.noreply.github.com>
1 parent 7d919bf commit 1f9bd5c

17 files changed

+191
-53
lines changed

CHANGELOG.md

+1
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ and **Merged pull requests**. Critical items to know are:
1212
Referenced versions in headers are tagged on Github, in parentheses are for pypi.
1313

1414
## [vxx](https://github.com/urlstechie/urlschecker-python/tree/master) (master)
15+
- multiprocessing to speed up checks (0.0.26)
1516
- bug fix for verbose option to only print file names that have failures (0.0.25)
1617
- adding option to print a summary that contains file names and urls (0.0.24)
1718
- updating container base to use debian buster and adding certifi (0.0.23)

LICENSE

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
MIT License
22

3-
Copyright (c) 2020-2021 Ayoub Malek and Vanessa Sochat
3+
Copyright (c) 2020-2022 Ayoub Malek and Vanessa Sochat
44

55
Permission is hereby granted, free of charge, to any person obtaining a copy
66
of this software and associated documentation files (the "Software"), to deal

README.md

+3-1
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,9 @@
88
This is a python module to collect urls over static files (code and documentation)
99
and then test for and report broken links. If you are interested in using
1010
this as a GitHub action, see [urlchecker-action](https://github.com/urlstechie/urlchecker-action). There are also container
11-
bases available on [quay.io/urlstechie/urlchecker](https://quay.io/repository/urlstechie/urlchecker?tab=tags).
11+
bases available on [quay.io/urlstechie/urlchecker](https://quay.io/repository/urlstechie/urlchecker?tab=tags). As of version
12+
0.0.26, we use multiprocessing so the checks run a lot faster, and you can set `URLCHECKER_WORKERS` to change the number of workers
13+
(defaults to 9). If you don't want multiprocessing, use version 0.0.25 or earlier.
1214

1315
## Module Documentation
1416

docs/source/fileproc.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
urlchecker.core.fileproc
2-
==========================
2+
========================
33

44

55
.. automodule:: urlchecker.core.fileproc

urlchecker/__init__.py

+2-9
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,3 @@
1-
"""
2-
3-
Copyright (c) 2020-2021 Ayoub Malek and Vanessa Sochat
4-
5-
This source code is licensed under the terms of the MIT license.
6-
For a copy, see <https://opensource.org/licenses/MIT>.
7-
8-
"""
9-
101
from urlchecker.version import __version__
2+
3+
assert __version__

urlchecker/client/__init__.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
"""
44
5-
Copyright (c) 2020-2021 Ayoub Malek and Vanessa Sochat
5+
Copyright (c) 2020-2022 Ayoub Malek and Vanessa Sochat
66
77
This source code is licensed under the terms of the MIT license.
88
For a copy, see <https://opensource.org/licenses/MIT>.

urlchecker/client/check.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
"""
22
client/github.py: entrypoint for interaction with a GitHub repository.
3-
Copyright (c) 2020-2021 Ayoub Malek and Vanessa Sochat
3+
Copyright (c) 2020-2022 Ayoub Malek and Vanessa Sochat
44
"""
55

66
import re
@@ -106,9 +106,9 @@ def main(args, extra):
106106
if args.verbose:
107107
print("\n\U0001F914 Uh oh... The following urls did not pass:")
108108
for file_name, result in checker.checks.items():
109-
if result.failed:
109+
if result["failed"]:
110110
print_failure(file_name + ":")
111-
for url in result.failed:
111+
for url in result["failed"]:
112112
print_failure(" " + url)
113113
else:
114114
print("\n\U0001F914 Uh oh... The following urls did not pass:")

urlchecker/core/check.py

+61-23
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
"""
22
3-
Copyright (c) 2020-2021 Ayoub Malek and Vanessa Sochat
3+
Copyright (c) 2020-2022 Ayoub Malek and Vanessa Sochat
44
55
This source code is licensed under the terms of the MIT license.
66
For a copy, see <https://opensource.org/licenses/MIT>.
@@ -12,6 +12,7 @@
1212
import re
1313
import sys
1414
from urlchecker.core import fileproc
15+
from urlchecker.core.worker import Workers
1516
from urlchecker.core.urlproc import UrlCheckResult
1617

1718

@@ -41,6 +42,8 @@ def __init__(
4142
"""
4243
# Initiate results object, and checks lookup (holds UrlCheck) for each file
4344
self.results = {"passed": set(), "failed": set(), "excluded": set()}
45+
46+
# Results organized by filename
4447
self.checks = {}
4548

4649
# Save run parameters
@@ -123,12 +126,18 @@ def save_results(self, file_path, sep=",", header=None, relative_paths=True):
123126
else:
124127
file_name = os.path.relpath(file_name)
125128

126-
[writer.writerow([url, "failed", file_name]) for url in result.failed]
129+
[
130+
writer.writerow([url, "failed", file_name])
131+
for url in result["failed"]
132+
]
127133
[
128134
writer.writerow([url, "excluded", file_name])
129-
for url in result.excluded
135+
for url in result["excluded"]
136+
]
137+
[
138+
writer.writerow([url, "passed", file_name])
139+
for url in result["passed"]
130140
]
131-
[writer.writerow([url, "passed", file_name]) for url in result.passed]
132141

133142
return file_path
134143

@@ -161,27 +170,56 @@ def run(
161170
exclude_urls = exclude_urls or []
162171
exclude_patterns = exclude_patterns or []
163172

164-
# loop through files files
165-
for file_name in file_paths:
166-
167-
# Instantiate a checker to extract urls
168-
checker = UrlCheckResult(
169-
file_name=file_name,
170-
exclude_patterns=exclude_patterns,
171-
exclude_urls=exclude_urls,
172-
print_all=self.print_all,
173-
)
174-
175-
# Check the urls
176-
checker.check_urls(retry_count=retry_count, timeout=timeout)
173+
# Run with multiprocessing
174+
tasks = {}
175+
funcs = {}
176+
workers = Workers()
177177

178-
# Update flattened results
179-
self.results["failed"].update(checker.failed)
180-
self.results["passed"].update(checker.passed)
181-
self.results["excluded"].update(checker.excluded)
178+
# loop through files
179+
for file_name in file_paths:
182180

183-
# Save the checker in the lookup
184-
self.checks[file_name] = checker
181+
# Export parameters and functions, use the same check task for all
182+
tasks[file_name] = {
183+
"file_name": file_name,
184+
"exclude_patterns": exclude_patterns,
185+
"exclude_urls": exclude_urls,
186+
"print_all": self.print_all,
187+
"retry_count": retry_count,
188+
"timeout": timeout,
189+
}
190+
funcs[file_name] = check_task
191+
192+
results = workers.run(funcs, tasks)
193+
for file_name, result in results.items():
194+
self.checks[file_name] = result
195+
self.results["failed"].update(result["failed"])
196+
self.results["passed"].update(result["passed"])
197+
self.results["excluded"].update(result["excluded"])
185198

186199
# A flattened dict of passed and failed
187200
return self.results
201+
202+
203+
def check_task(*args, **kwargs):
    """
    Default worker task: check the urls of a single file.

    Positional arguments are accepted but not used. The keyword arguments
    carry the task parameters:

    - file_name: file to check (required, a missing key raises KeyError)
    - exclude_patterns: passed to the checker, defaults to []
    - exclude_urls: passed to the checker, defaults to []
    - print_all: passed to the checker, defaults to True
    - retry_count: passed to check_urls, defaults to 2
    - timeout: passed to check_urls, defaults to 5

    Returns a dict with the keys "failed", "passed" and "excluded" holding
    the checker attributes of the same names.
    """
    # Unpack the task parameters up front, applying the defaults
    target_file = kwargs["file_name"]
    patterns = kwargs.get("exclude_patterns", [])
    urls = kwargs.get("exclude_urls", [])
    show_all = kwargs.get("print_all", True)
    retries = kwargs.get("retry_count", 2)
    wait_for = kwargs.get("timeout", 5)

    # The checker extracts the urls from the file, then checks them
    checker = UrlCheckResult(
        file_name=target_file,
        exclude_patterns=patterns,
        exclude_urls=urls,
        print_all=show_all,
    )
    checker.check_urls(retry_count=retries, timeout=wait_for)

    # Hand back only the result collections, keyed by status
    outcome = {}
    for status in ("failed", "passed", "excluded"):
        outcome[status] = getattr(checker, status)
    return outcome

urlchecker/core/exclude.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
"""
22
3-
Copyright (c) 2020-2021 Ayoub Malek and Vanessa Sochat
3+
Copyright (c) 2020-2022 Ayoub Malek and Vanessa Sochat
44
55
This source code is licensed under the terms of the MIT license.
66
For a copy, see <https://opensource.org/licenses/MIT>.

urlchecker/core/fileproc.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
"""
22
3-
Copyright (c) 2020-2021 Ayoub Malek and Vanessa Sochat
3+
Copyright (c) 2020-2022 Ayoub Malek and Vanessa Sochat
44
55
This source code is licensed under the terms of the MIT license.
66
For a copy, see <https://opensource.org/licenses/MIT>.

urlchecker/core/urlmarker.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
http://daringfireball.net/2010/07/improved_regex_for_matching_urls
55
https://gist.github.com/gruber/8891611
66
7-
Copyright (c) 2020-2021 Ayoub Malek and Vanessa Sochat
7+
Copyright (c) 2020-2022 Ayoub Malek and Vanessa Sochat
88
99
This source code is licensed under the terms of the MIT license.
1010
For a copy, see <https://opensource.org/licenses/MIT>.

urlchecker/core/urlproc.py

+1-6
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
"""
22
3-
Copyright (c) 2020-2021 Ayoub Malek and Vanessa Sochat
3+
Copyright (c) 2020-2022 Ayoub Malek and Vanessa Sochat
44
55
This source code is licensed under the terms of the MIT license.
66
For a copy, see <https://opensource.org/licenses/MIT>.
@@ -168,14 +168,9 @@ def check_urls(self, urls=None, retry_count=1, timeout=5):
168168
# if no urls are found, mention it if required
169169
if not urls:
170170
if self.print_all:
171-
if self.file_name:
172-
print("\n", self.file_name, "\n", "-" * len(self.file_name))
173171
print("No urls found.")
174172
return
175173

176-
if self.file_name:
177-
print("\n", self.file_name, "\n", "-" * len(self.file_name))
178-
179174
# init seen urls list
180175
seen = set()
181176

urlchecker/core/worker.py

+109
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,109 @@
1+
"""
2+
3+
Copyright (c) 2020-2022 Ayoub Malek and Vanessa Sochat
4+
5+
This source code is licensed under the terms of the MIT license.
6+
For a copy, see <https://opensource.org/licenses/MIT>.
7+
8+
"""
9+
10+
import itertools
11+
import multiprocessing
12+
import os
13+
import time
14+
import signal
15+
import sys
16+
17+
from urlchecker.logger import get_logger
18+
19+
logger = get_logger()
20+
21+
22+
class Workers:
    """
    Run named tasks in parallel with a multiprocessing pool.

    The pool size is the ``workers`` argument, or, when that is None, the
    integer value of the URLCHECKER_WORKERS environment variable (default 9).
    """

    def __init__(self, workers=None):
        if workers is None:
            workers = int(os.environ.get("URLCHECKER_WORKERS", 9))
        self.workers = workers
        logger.debug(f"Using {self.workers} workers for multiprocess.")

    def start(self):
        """Record the start time of a run."""
        logger.debug("Starting multiprocess")
        self.start_time = time.time()

    def end(self):
        """Record the end time of a run and the elapsed runtime in seconds."""
        self.end_time = time.time()
        self.runtime = self.end_time - self.start_time
        logger.debug(f"Ending multiprocess, runtime: {self.runtime} sec")

    def run(self, funcs, tasks):
        """
        Run every task through its function in a multiprocessing pool.

        Parameters
        ==========
        funcs: the functions to run with multiprocessing.pool, a dictionary
               with lookup by the task name
        tasks: a dict of tasks, each task name (key) with a dict of keyword
               arguments that is passed to the matching function

        Returns
        =======
        A dict with the return value of each task under its task name, in
        the order the tasks were given. An empty dict if there are no tasks.

        Raises
        ======
        Any exception raised while submitting or running a task is logged
        and re-raised after the pool is terminated. A keyboard interrupt
        terminates the pool and exits with status 1.
        """
        # Number of tasks must == number of functions
        assert len(funcs) == len(tasks)

        # Without tasks there is nothing to run. Return an empty mapping
        # (not None) so callers can iterate over the result unconditionally.
        if not tasks:
            return {}

        # results will have the same key to look up
        finished = {}

        # Create the pool before the try block so that the handlers below
        # can always refer to it.
        pool = multiprocessing.Pool(self.workers, init_worker)

        try:
            self.start()

            pending = []
            for key, params in tasks.items():
                # Materialize the packaged arguments into a plain tuple:
                # whatever is handed to apply_async has to be picklable.
                packaged = tuple(multi_package(funcs[key], [params]))
                pending.append((key, pool.apply_async(multi_wrapper, packaged)))

            # Collect in submission order; get() blocks until the task is
            # done and re-raises an exception raised inside the worker.
            for key, result in pending:
                finished[key] = result.get()

            self.end()
            pool.close()
            pool.join()

        except (KeyboardInterrupt, SystemExit):
            logger.error("Keyboard interrupt detected, terminating workers!")
            pool.terminate()
            sys.exit(1)

        except Exception:
            # Do not hide the failure: stop the workers and let the caller
            # see the original exception.
            logger.error("Error running task")
            pool.terminate()
            raise

        return finished
95+
96+
97+
# Supporting functions for MultiProcess Worker
98+
def init_worker():
    """
    Pool initializer: make the worker process ignore SIGINT.

    Workers.run passes this function to multiprocessing.Pool and handles
    KeyboardInterrupt itself by terminating the pool.
    """
    ignore = signal.SIG_IGN
    signal.signal(signal.SIGINT, ignore)
100+
101+
102+
def multi_wrapper(func_args):
    """
    Call a packaged task.

    func_args is a pair of (function, keyword arguments dict). The function
    is called with the dict expanded as keyword arguments and its return
    value is passed through.
    """
    target, params = func_args
    outcome = target(**params)
    return outcome
105+
106+
107+
def multi_package(func, kwargs):
    """
    Pair a function with each set of keyword arguments.

    Parameters
    ==========
    func: the function to package
    kwargs: an iterable of keyword argument dicts, one per call

    Returns a list of (func, params) tuples, one for each entry of kwargs.
    A concrete list is returned instead of a lazy zip over itertools.repeat
    so the result can be pickled for a worker process and iterated more
    than once.
    """
    return [(func, params) for params in kwargs]

urlchecker/logger.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
"""
22
3-
Copyright (c) 2020-2021 Ayoub Malek and Vanessa Sochat
3+
Copyright (c) 2020-2022 Ayoub Malek and Vanessa Sochat
44
55
This source code is licensed under the terms of the MIT license.
66
For a copy, see <https://opensource.org/licenses/MIT>.

urlchecker/main/github.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
"""
22
3-
Copyright (c) 2020-2021 Ayoub Malek and Vanessa Sochat
3+
Copyright (c) 2020-2022 Ayoub Malek and Vanessa Sochat
44
55
This source code is licensed under the terms of the MIT license.
66
For a copy, see <https://opensource.org/licenses/MIT>.

urlchecker/main/utils.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
"""
22
3-
Copyright (c) 2020-2021 Ayoub Malek and Vanessa Sochat
3+
Copyright (c) 2020-2022 Ayoub Malek and Vanessa Sochat
44
55
This source code is licensed under the terms of the MIT license.
66
For a copy, see <https://opensource.org/licenses/MIT>.

urlchecker/version.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
11
"""
22
3-
Copyright (c) 2020-2021 Ayoub Malek and Vanessa Sochat
3+
Copyright (c) 2020-2022 Ayoub Malek and Vanessa Sochat
44
55
This source code is licensed under the terms of the MIT license.
66
For a copy, see <https://opensource.org/licenses/MIT>.
77
88
"""
99

10-
__version__ = "0.0.25"
10+
__version__ = "0.0.26"
1111
AUTHOR = "Ayoub Malek, Vanessa Sochat"
1212
AUTHOR_EMAIL = "superkogito@gmail.com, vsochat@stanford.edu"
1313
NAME = "urlchecker"

0 commit comments

Comments
 (0)