-
Notifications
You must be signed in to change notification settings - Fork 9.6k
/
Copy pathcheck_links.py
executable file
·157 lines (129 loc) · 4.94 KB
/
check_links.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
# Modified from:
# https://github.com/allenai/allennlp/blob/main/scripts/check_links.py
import argparse
import logging
import os
import pathlib
import re
import sys
from multiprocessing.dummy import Pool
from typing import NamedTuple, Optional, Tuple
import requests
from mmcv.utils import get_logger
def parse_args():
parser = argparse.ArgumentParser(
description='Goes through all the inline-links '
'in markdown files and reports the breakages')
parser.add_argument(
'--num-threads',
type=int,
default=100,
help='Number of processes to confirm the link')
parser.add_argument('--https-proxy', type=str, help='https proxy')
parser.add_argument(
'--out',
type=str,
default='link_reports.txt',
help='output path of reports')
args = parser.parse_args()
return args
OK_STATUS_CODES = (
200,
401, # the resource exists but may require some sort of login.
403, # ^ same
405, # HEAD method not allowed.
# the resource exists, but our default 'Accept-' header may not
# match what the server can provide.
406,
)
class MatchTuple(NamedTuple):
source: str
name: str
link: str
def check_link(
match_tuple: MatchTuple,
http_session: requests.Session,
logger: logging = None) -> Tuple[MatchTuple, bool, Optional[str]]:
reason: Optional[str] = None
if match_tuple.link.startswith('http'):
result_ok, reason = check_url(match_tuple, http_session)
else:
result_ok = check_path(match_tuple)
if logger is None:
print(f" {'✓' if result_ok else '✗'} {match_tuple.link}")
else:
logger.info(f" {'✓' if result_ok else '✗'} {match_tuple.link}")
return match_tuple, result_ok, reason
def check_url(match_tuple: MatchTuple,
http_session: requests.Session) -> Tuple[bool, str]:
"""Check if a URL is reachable."""
try:
result = http_session.head(
match_tuple.link, timeout=5, allow_redirects=True)
return (
result.ok or result.status_code in OK_STATUS_CODES,
f'status code = {result.status_code}',
)
except (requests.ConnectionError, requests.Timeout):
return False, 'connection error'
def check_path(match_tuple: MatchTuple) -> bool:
"""Check if a file in this repository exists."""
relative_path = match_tuple.link.split('#')[0]
full_path = os.path.join(
os.path.dirname(str(match_tuple.source)), relative_path)
return os.path.exists(full_path)
def main():
args = parse_args()
# setup logger
logger = get_logger(name='mmdet', log_file=args.out)
# setup https_proxy
if args.https_proxy:
os.environ['https_proxy'] = args.https_proxy
# setup http_session
http_session = requests.Session()
for resource_prefix in ('http://', 'https://'):
http_session.mount(
resource_prefix,
requests.adapters.HTTPAdapter(
max_retries=5,
pool_connections=20,
pool_maxsize=args.num_threads),
)
logger.info('Finding all markdown files in the current directory...')
project_root = (pathlib.Path(__file__).parent / '..').resolve()
markdown_files = project_root.glob('**/*.md')
all_matches = set()
url_regex = re.compile(r'\[([^!][^\]]+)\]\(([^)(]+)\)')
for markdown_file in markdown_files:
with open(markdown_file) as handle:
for line in handle.readlines():
matches = url_regex.findall(line)
for name, link in matches:
if 'localhost' not in link:
all_matches.add(
MatchTuple(
source=str(markdown_file),
name=name,
link=link))
logger.info(f' {len(all_matches)} markdown files found')
logger.info('Checking to make sure we can retrieve each link...')
with Pool(processes=args.num_threads) as pool:
results = pool.starmap(check_link, [(match, http_session, logger)
for match in list(all_matches)])
# collect unreachable results
unreachable_results = [(match_tuple, reason)
for match_tuple, success, reason in results
if not success]
if unreachable_results:
logger.info('================================================')
logger.info(f'Unreachable links ({len(unreachable_results)}):')
for match_tuple, reason in unreachable_results:
logger.info(' > Source: ' + match_tuple.source)
logger.info(' Name: ' + match_tuple.name)
logger.info(' Link: ' + match_tuple.link)
if reason is not None:
logger.info(' Reason: ' + reason)
sys.exit(1)
logger.info('No Unreachable link found.')
if __name__ == '__main__':
main()