.ci/table_of_content.py

import json
import pathlib
import argparse
import re

# Pattern for the heading line that opens an existing table of contents:
# one or more '#', whitespace, then "Table of content" and an optional ':'.
# It is applied with re.match, which anchors only at the start of the line, so
# headings that continue past the match (e.g. "Table of contents:") match too.
TABLE_OF_CONTENT = r"#+\s+Table of content:?"


def find_tc_in_cell(cell):
    """Look for the table-of-contents heading among the source lines of *cell*.

    Returns:
        ``(cell, position)`` where ``position`` is the index of the first
        line in ``cell["source"]`` that matches ``TABLE_OF_CONTENT``, or
        ``(None, None)`` when no line matches.
    """
    for position, text in enumerate(cell["source"]):
        if re.match(TABLE_OF_CONTENT, text):
            # The first hit wins; later headings in the same cell are ignored.
            return cell, position

    return None, None


def create_title_for_tc(title):
    """Turn a markdown heading line into the text shown in the table of contents.

    The leading ``#`` markers and the whitespace after them are dropped,
    square brackets and newlines are removed, and every ``(http...)`` link
    target is cut out so that only the visible link text remains.

    Args:
        title: heading line, e.g. ``"## [Docs](https://example.com) intro\\n"``.

    Returns:
        The cleaned heading text, e.g. ``"Docs intro"``.
    """
    title_for_tc = title.lstrip("#").lstrip()
    title_for_tc = re.sub(r"[\[\]\n]", "", title_for_tc)
    # Remove each link target on its own. A greedy "\(http.*\)" would swallow
    # everything between the first "(http" and the LAST ")" of the heading,
    # losing text that follows the first link. One level of nested
    # parentheses inside the URL is tolerated.
    title_for_tc = re.sub(r"\(http(?:[^()]|\([^()]*\))*\)", "", title_for_tc)

    return title_for_tc


def create_link_for_tc(title):
    """Build the anchor fragment for a table-of-contents entry.

    Backticks, ``$`` and ``^`` are removed from *title* and every space is
    replaced with a dash.
    """
    without_markup = re.sub(r"[`$^]", "", title)
    return without_markup.replace(" ", "-")


def remove_old_tc(cell, idx):
    """Remove a previously generated table of contents from *cell*, in place.

    Starting at line *idx* of ``cell["source"]``, every line that looks like
    a table-of-contents entry (``- [text](#anchor)``) or like the
    table-of-contents heading itself is dropped. Lines before *idx* are never
    touched.

    Args:
        cell: notebook cell dict with a ``"source"`` list, or ``None``.
        idx: index of the first line that may belong to the old table.

    Returns:
        The same *cell* object (``None`` is passed through unchanged).
    """
    if cell is not None:
        source = cell["source"]
        # Filter the tail positionally. ``list.remove(line)`` deletes the first
        # equal element of the whole list, which could be an identical line
        # located BEFORE idx. Slice assignment keeps the list object itself.
        source[idx:] = [
            line
            for line in source[idx:]
            if not (re.match(r"\s*-\s*\[.*\]\(#.*\).*", line) or re.match(TABLE_OF_CONTENT, line))
        ]
    return cell


def get_tc_line(title, title_for_tc, link, tc_list, titles_list):
    """Format one table-of-contents entry, indented according to heading level.

    The position of the first space in *title* is used as the heading level
    (for ``"### Title"`` that is 3). Levels 1 and 2 are not indented; each
    deeper level adds four spaces. The indentation is then corrected against
    the previous entry so that the generated markdown list stays well-formed.

    Args:
        title: stripped heading line including its leading ``#`` markers.
        title_for_tc: text to display for the entry.
        link: anchor fragment the entry points to.
        tc_list: entries generated so far (each produced by this function).
        titles_list: the heading lines that produced ``tc_list``, same order.

    Returns:
        A line of the form ``"<indent>- [title_for_tc](#link)\\n"``.
    """
    # -1 when the heading has no space at all (e.g. "#Title").
    level_end = title.find(" ")
    # calc indents for Table of content
    indents_num = (level_end - 2) * 4 if level_end != -1 else -1

    if not tc_list or indents_num < 0:
        # when first list item have more than 1 indents the alignment would be broken
        indents_num = 0
    else:
        prev_indent = tc_list[-1].index("-")
        if indents_num - prev_indent > 4:
            # when previous list item have n indents and current have n+4+1 it broke the alignment
            indents_num = prev_indent + 4
        elif (
            indents_num != prev_indent
            and titles_list
            # ``find`` instead of ``index``: a previous title without a space
            # must not abort the run with ValueError; it simply never
            # compares equal to the current level.
            and level_end == titles_list[-1].find(" ")
        ):
            # when we have several titles with same wrong alignments
            indents_num = prev_indent

    indents = " " * indents_num + "-" + " "
    line = f"{indents}[{title_for_tc}](#{link})\n"

    return line


def is_ref_to_top_exists(cell, idx):
    """Tell whether a "back to top" link directly follows line *idx*.

    *cell* is a sequence of source lines. Blank lines after *idx* are skipped;
    the first non-blank line decides the result: ``True`` if it contains the
    back-to-top link, ``False`` otherwise (the block's content has started).
    ``False`` is also returned when only blank lines, or nothing, follow.
    """
    for candidate in cell[idx + 1 :]:
        text = candidate.strip()
        if not text:
            continue
        return "[back to top ⬆️](#Table-of-content" in text
    return False


def is_markdown(cell):
    """Return True when the notebook cell's ``cell_type`` is ``"markdown"``."""
    cell_type = cell["cell_type"]
    return cell_type == "markdown"


def is_title(line):
    """Check whether *line* is a markdown heading with non-empty text.

    The result is meant to be used for its truthiness: ``False`` when the
    stripped line does not start with ``#``; otherwise the heading text with
    the ``#`` markers and following whitespace removed, which is an empty
    (falsy) string for a line made of ``#`` characters only.
    """
    stripped = line.strip()
    if not stripped.startswith("#"):
        return False
    return stripped.lstrip("#").lstrip()


def generate_table_of_content(notebook_path: pathlib.Path):
    """Rebuild the table of contents of one notebook and rewrite the file in place.

    Steps:

    1. Load the notebook JSON; a notebook without cells is left untouched.
    2. Locate the cell holding an existing table of contents: the first cell
       is checked first, then the markdown cells that follow.
    3. For every heading in the markdown cells after the first one (the cell
       holding the table itself is skipped when it is discovered during this
       scan), make sure a "back to top" link follows the heading and collect
       a table-of-contents entry.
    4. Replace the old table in its cell (or append a new one to the first
       cell when none existed) and write the notebook back.

    Args:
        notebook_path: path to the ``.ipynb`` file to update.
    """
    with open(notebook_path, "r", encoding="utf-8") as notebook_file:
        notebook_json = json.load(notebook_file)

    if not notebook_json["cells"]:
        return

    table_of_content = []
    all_titles = []
    # Anchors already emitted; navigation resolves to the first heading that
    # owns an anchor, so any repeat is worth a warning.
    seen_links = set()

    table_of_content_cell, table_of_content_cell_idx = find_tc_in_cell(notebook_json["cells"][0])

    for cell in filter(is_markdown, notebook_json["cells"][1:]):
        if table_of_content_cell is None:
            table_of_content_cell, table_of_content_cell_idx = find_tc_in_cell(cell)
            if table_of_content_cell is not None:
                continue

        source = cell["source"]
        # Walk by position instead of ``source.index(title)``: with two
        # identical heading lines in one cell, ``index`` always resolves to
        # the first one. The lines inserted below are not headings, so
        # stepping over them is harmless.
        idx = 0
        while idx < len(source):
            raw_title = source[idx]
            if not is_title(raw_title):
                idx += 1
                continue

            if not is_ref_to_top_exists(source, idx):
                if not raw_title.endswith("\n"):
                    # Terminate the heading line in place. Inserting a second,
                    # terminated copy would leave the heading duplicated.
                    source[idx] = raw_title + "\n"
                source.insert(idx + 1, "[back to top ⬆️](#Table-of-contents:)\n")
                # NOTE(review): an empty string adds no text to the cell;
                # presumably a blank separator ("\n") was intended - confirm
                # before changing.
                source.insert(idx + 2, "")

            title = raw_title.strip()
            title_for_tc = create_title_for_tc(title)
            link_for_tc = create_link_for_tc(title_for_tc)
            new_line = get_tc_line(title, title_for_tc, link_for_tc, table_of_content, all_titles)

            # Warn on the first repeat already (the previous check,
            # ``count(new_line) > 1`` before appending, fired only on the
            # third occurrence).
            if link_for_tc in seen_links:
                print(
                    f'WARINING: the title "{title_for_tc}" has already used in titles.\n'
                    + "Navigation will work inccorect, the link will only point to "
                    + "the first encountered title"
                )
            seen_links.add(link_for_tc)

            table_of_content.append(new_line)
            all_titles.append(title)
            idx += 1

    table_of_content = ["\n", "#### Table of contents:\n\n"] + table_of_content + ["\n"]

    if table_of_content_cell is not None:
        table_of_content_cell = remove_old_tc(table_of_content_cell, table_of_content_cell_idx)
        target_source = table_of_content_cell["source"]
        # The blank lines that framed the old table are not removed by
        # remove_old_tc; drop trailing blank lines so repeated runs do not
        # keep growing the cell.
        while target_source and not target_source[-1].strip():
            target_source.pop()
    else:
        target_source = notebook_json["cells"][0]["source"]

    target_source.extend(table_of_content)

    with open(notebook_path, "w", encoding="utf-8") as in_f:
        json.dump(notebook_json, in_f, ensure_ascii=False, indent=1)


if __name__ == "__main__":
    # Command-line entry point: update the table of contents of a single
    # notebook, or of every *.ipynb file found recursively under a folder.
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "-s",
        "--source",
        help="Please, specify notebook or folder with notebooks.\
                            Table of content will be added or modified in each.",
        required=True,
    )

    args = parser.parse_args()
    path_to_source = pathlib.Path(args.source)
    if not path_to_source.exists():
        print(f"Incorrect path to notebook(s) {path_to_source}")
        # Fail with a non-zero status so callers (e.g. CI jobs) notice the
        # error. The bare ``exit()`` helper comes from the ``site`` module,
        # is not guaranteed to exist, and reported success (status 0).
        raise SystemExit(1)
    elif path_to_source.is_file():
        generate_table_of_content(path_to_source)
    elif path_to_source.is_dir():
        for notebook in path_to_source.glob("**/*.ipynb"):
            generate_table_of_content(notebook)