# .dev_scripts/check_links.py

# Modified from:
# https://github.com/allenai/allennlp/blob/main/scripts/check_links.py

import argparse
import logging
import os
import pathlib
import re
import sys
from multiprocessing.dummy import Pool
from typing import NamedTuple, Optional, Tuple

import requests
from mmcv.utils import get_logger


def parse_args():
    """Parse the command-line options of the link checker.

    Returns:
        argparse.Namespace: Holds ``num_threads`` (int, default 100),
        ``https_proxy`` (str or None) and ``out`` (str, default
        ``'link_reports.txt'``).
    """
    description = ('Goes through all the inline-links '
                   'in markdown files and reports the breakages')
    cli = argparse.ArgumentParser(description=description)

    # Size of the worker pool used to check links concurrently.
    cli.add_argument(
        '--num-threads',
        default=100,
        type=int,
        help='Number of processes to confirm the link')

    # Optional proxy for outgoing https requests.
    cli.add_argument('--https-proxy', help='https proxy', type=str)

    # Destination of the generated report.
    cli.add_argument(
        '--out',
        default='link_reports.txt',
        type=str,
        help='output path of reports')

    return cli.parse_args()


# HTTP status codes for which a link is still counted as reachable.
OK_STATUS_CODES = (
    200,  # plain success
    401,  # resource exists, but some sort of login may be required
    403,  # same situation as 401
    405,  # the server does not allow the HEAD method
    406,  # resource exists, but our default 'Accept-' header may not
    #       match what the server can provide
)


class MatchTuple(NamedTuple):
    """One inline markdown link together with the file it was found in."""

    # Path of the markdown file that contains the link.
    source: str
    # Text between the square brackets of the link.
    name: str
    # Target between the parentheses: a URL or a relative path.
    link: str


def check_link(
        match_tuple: MatchTuple,
        http_session: requests.Session,
        logger: Optional[logging.Logger] = None
) -> Tuple[MatchTuple, bool, Optional[str]]:
    """Check a single link and report the outcome.

    Links with an ``http://`` or ``https://`` scheme are checked over the
    network with ``check_url``; everything else is treated as a path inside
    the repository and checked with ``check_path``.

    Args:
        match_tuple (MatchTuple): The link to check and where it came from.
        http_session (requests.Session): Session used for URL checks.
        logger (logging.Logger, optional): Receives the per-link status
            line. When None the line is printed to stdout instead.

    Returns:
        tuple: ``(match_tuple, result_ok, reason)`` where ``reason`` is the
        text returned by ``check_url`` for URLs and None for local paths.
    """
    reason: Optional[str] = None
    # Require a full scheme: a bare ``startswith('http')`` would also send
    # relative paths such as ``http_api.md`` to the network check. URL
    # schemes are case-insensitive, hence the ``lower()``.
    if match_tuple.link.lower().startswith(('http://', 'https://')):
        result_ok, reason = check_url(match_tuple, http_session)
    else:
        result_ok = check_path(match_tuple)

    status_line = f"  {'✓' if result_ok else '✗'} {match_tuple.link}"
    if logger is None:
        print(status_line)
    else:
        logger.info(status_line)
    return match_tuple, result_ok, reason


def check_url(match_tuple: MatchTuple,
              http_session: requests.Session) -> Tuple[bool, str]:
    """Check if a URL is reachable.

    A HEAD request (following redirects, 5 second timeout) is sent to
    ``match_tuple.link``. The URL counts as reachable when the response is
    ``ok`` or its status code is listed in ``OK_STATUS_CODES``.

    Args:
        match_tuple (MatchTuple): The link whose ``link`` field is the URL.
        http_session (requests.Session): Session used to send the request.

    Returns:
        tuple: ``(reachable, reason)``. ``reason`` is the status code text
        when a response was received, otherwise a short description of the
        failure.
    """
    try:
        result = http_session.head(
            match_tuple.link, timeout=5, allow_redirects=True)
    except (requests.ConnectionError, requests.Timeout):
        return False, 'connection error'
    except requests.RequestException as err:
        # Any other requests failure (too many redirects, malformed URL,
        # unsupported scheme, ...) marks this single link as unreachable
        # instead of propagating and aborting the whole run.
        return False, f'request error: {type(err).__name__}'

    reachable = result.ok or result.status_code in OK_STATUS_CODES
    return reachable, f'status code = {result.status_code}'


def check_path(match_tuple: MatchTuple) -> bool:
    """Tell whether a non-URL link points at an existing path.

    The link is resolved against the directory of the markdown file it was
    found in; anything from the first ``#`` onwards (the anchor) is dropped
    before the lookup.
    """
    target, _, _ = match_tuple.link.partition('#')
    source_dir = os.path.dirname(str(match_tuple.source))
    return os.path.exists(os.path.join(source_dir, target))


def main():
    """Collect every inline markdown link in the project and check it.

    All ``*.md`` files below the parent directory of this script's folder
    are scanned for ``[name](link)`` patterns. Links containing
    ``localhost`` are skipped. Each remaining link is checked with
    ``check_link`` on a thread pool of ``--num-threads`` workers. The
    outcome is logged through the mmcv logger, which also writes to the
    ``--out`` file.

    Raises:
        SystemExit: With exit code 1 when at least one link is unreachable.
    """
    args = parse_args()

    # setup logger
    logger = get_logger(name='mmdet', log_file=args.out)

    # setup https_proxy
    if args.https_proxy:
        os.environ['https_proxy'] = args.https_proxy

    # setup http_session
    http_session = requests.Session()
    for resource_prefix in ('http://', 'https://'):
        http_session.mount(
            resource_prefix,
            requests.adapters.HTTPAdapter(
                max_retries=5,
                pool_connections=20,
                pool_maxsize=args.num_threads),
        )

    logger.info('Finding all markdown files in the current directory...')

    project_root = (pathlib.Path(__file__).parent / '..').resolve()
    markdown_files = project_root.glob('**/*.md')

    # A set de-duplicates identical (source, name, link) triples.
    all_matches = set()
    url_regex = re.compile(r'\[([^!][^\]]+)\]\(([^)(]+)\)')
    for markdown_file in markdown_files:
        # Decode explicitly as UTF-8 so the scan does not depend on the
        # platform's default encoding.
        with open(markdown_file, encoding='utf-8') as handle:
            # Iterate lazily rather than loading the whole file at once.
            for line in handle:
                for name, link in url_regex.findall(line):
                    if 'localhost' not in link:
                        all_matches.add(
                            MatchTuple(
                                source=str(markdown_file),
                                name=name,
                                link=link))

    # The set holds links, not files, so report it as such.
    logger.info(f'  {len(all_matches)} links found')
    logger.info('Checking to make sure we can retrieve each link...')

    # ``Pool`` comes from multiprocessing.dummy, i.e. it is thread based.
    # Sorting gives the checks and the final report a deterministic order.
    with Pool(processes=args.num_threads) as pool:
        results = pool.starmap(check_link, [(match, http_session, logger)
                                            for match in sorted(all_matches)])

    # collect unreachable results
    unreachable_results = [(match_tuple, reason)
                           for match_tuple, success, reason in results
                           if not success]

    if unreachable_results:
        logger.info('================================================')
        logger.info(f'Unreachable links ({len(unreachable_results)}):')
        for match_tuple, reason in unreachable_results:
            logger.info('  > Source: ' + match_tuple.source)
            logger.info('    Name: ' + match_tuple.name)
            logger.info('    Link: ' + match_tuple.link)
            if reason is not None:
                logger.info('    Reason: ' + reason)
        # A non-zero exit status lets calling tools detect broken links.
        sys.exit(1)
    logger.info('No Unreachable link found.')


# Run the link check only when this file is executed as a script.
if __name__ == '__main__':
    main()