Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable checking against mypy with minimal set of fixes #1402

Closed
wants to merge 11 commits into from
2 changes: 1 addition & 1 deletion .github/workflows/tox.yml
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ jobs:
fail-fast: false
max-parallel: 4
matrix:
tox-env: [flake8, pep517check, checkspelling]
tox-env: [mypy, flake8, pep517check, checkspelling]

env:
TOXENV: ${{ matrix.tox-env }}
Expand Down
2 changes: 1 addition & 1 deletion markdown/blockprocessors.py
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ def test(self, parent: etree.Element, block: str) -> bool:
return block.startswith(' '*self.tab_length) and \
not self.parser.state.isstate('detabbed') and \
(parent.tag in self.ITEM_TYPES or
(len(parent) and parent[-1] is not None and
(len(parent) > 0 and parent[-1] is not None and
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This change should be completely unnecessary. The code works just fine and any tool which prevents me from writing code that way is unwanted.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The code can actually sometimes return an instance of int rather than bool. If we have to, I'll follow the usual over-the-top process of proving to you that this is a bug and writing a test.

(parent[-1].tag in self.LIST_TYPES)))

def run(self, parent: etree.Element, blocks: list[str]) -> None:
Expand Down
24 changes: 16 additions & 8 deletions markdown/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,14 @@ class Markdown:
callable which accepts an [`Element`][xml.etree.ElementTree.Element] and returns a `str`.
"""

tab_length: int
ESCAPED_CHARS: list[str]
block_level_elements: list[str]
registeredExtensions: list[Extension]
stripTopLevelTags: bool
references: dict[str, tuple[str, str]]
htmlStash: util.HtmlStash

Comment on lines +88 to +95
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These are instance attributes, not class attributes. Will you please stop changing the code to fit a tool which I have repeatedly stated does not fit our needs.

I am closing this issue as you are clearly either not understanding my direction or intentionally ignoring it.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the correct way to annotate instance attributes according to all tools and documentation

def __init__(self, **kwargs):
"""
Creates a new Markdown instance.
Expand All @@ -106,23 +114,23 @@ def __init__(self, **kwargs):

"""

self.tab_length: int = kwargs.get('tab_length', 4)
self.tab_length = kwargs.get('tab_length', 4)

self.ESCAPED_CHARS: list[str] = [
self.ESCAPED_CHARS = [
'\\', '`', '*', '_', '{', '}', '[', ']', '(', ')', '>', '#', '+', '-', '.', '!'
]
""" List of characters which get the backslash escape treatment. """

self.block_level_elements: list[str] = BLOCK_LEVEL_ELEMENTS.copy()
self.block_level_elements = BLOCK_LEVEL_ELEMENTS.copy()

self.registeredExtensions: list[Extension] = []
self.registeredExtensions = []
self.docType = "" # TODO: Maybe delete this. It does not appear to be used anymore.
self.stripTopLevelTags: bool = True
self.stripTopLevelTags = True

self.build_parser()

self.references: dict[str, tuple[str, str]] = {}
self.htmlStash: util.HtmlStash = util.HtmlStash()
self.references = {}
self.htmlStash = util.HtmlStash()
self.registerExtensions(extensions=kwargs.get('extensions', []),
configs=kwargs.get('extension_configs', {}))
self.set_output_format(kwargs.get('output_format', 'xhtml'))
Expand Down Expand Up @@ -446,7 +454,7 @@ def convertFile(
else:
# Encode manually and write bytes to stdout.
html = html.encode(encoding, "xmlcharrefreplace")
sys.stdout.buffer.write(html)
sys.stdout.buffer.write(html) # type: ignore

return self

Expand Down
2 changes: 1 addition & 1 deletion markdown/extensions/abbr.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ def __init__(self, pattern: str, title: str):
super().__init__(pattern)
self.title = title

def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element, int, int]:
def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element, int, int]: # type: ignore[override]
abbr = etree.Element('abbr')
abbr.text = AtomicString(m.group('abbr'))
abbr.set('title', self.title)
Expand Down
2 changes: 1 addition & 1 deletion markdown/extensions/attr_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def _handle_word(s, t):
return t, t


_scanner = re.Scanner([
_scanner = re.Scanner([ # type: ignore[attr-defined]
(r'[^ =]+=".*?"', _handle_double_quote),
(r"[^ =]+='.*?'", _handle_single_quote),
(r'[^ =]+=[^ =]+', _handle_key_value),
Expand Down
2 changes: 1 addition & 1 deletion markdown/extensions/codehilite.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@
lexer = get_lexer_by_name('text', **self.options)
if not self.lang:
# Use the guessed lexer's language instead
self.lang = lexer.aliases[0]
self.lang = lexer.aliases[0] # type: ignore[attr-defined]

Check warning on line 164 in markdown/extensions/codehilite.py

View check run for this annotation

Codecov / codecov/patch

markdown/extensions/codehilite.py#L164

Added line #L164 was not covered by tests
lang_str = f'{self.lang_prefix}{self.lang}'
if isinstance(self.pygments_formatter, str):
try:
Expand Down
2 changes: 1 addition & 1 deletion markdown/extensions/fenced_code.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,7 @@ def handle_attrs(self, attrs: Iterable[tuple[str, str]]) -> tuple[str, list[str]
""" Return tuple: `(id, [list, of, classes], {configs})` """
id = ''
classes = []
configs = {}
configs: dict[str, Any] = {}
for k, v in attrs:
if k == 'id':
id = v
Expand Down
9 changes: 6 additions & 3 deletions markdown/extensions/footnotes.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@
class FootnoteExtension(Extension):
""" Footnote Extension. """

found_refs: dict[str, int]
used_refs: set[str]

def __init__(self, **kwargs):
""" Setup configs. """

Expand Down Expand Up @@ -68,8 +71,8 @@ def __init__(self, **kwargs):

# In multiple invocations, emit links that don't get tangled.
self.unique_prefix = 0
self.found_refs: dict[str, int] = {}
self.used_refs: set[str] = set()
self.found_refs = {}
self.used_refs = set()

self.reset()

Expand Down Expand Up @@ -290,7 +293,7 @@ def detectTabbed(self, blocks: list[str]) -> list[str]:
break
return fn_blocks

def detab(self, block: str) -> str:
def detab(self, block: str) -> str: # type: ignore[override]
""" Remove one level of indent from a block.

Preserve lazily indented blocks by only removing indent from indented lines.
Expand Down
8 changes: 6 additions & 2 deletions markdown/extensions/md_in_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,10 @@ class HTMLExtractorExtra(HTMLExtractor):
Markdown.
"""

mdstack: list[str] = [] # When markdown=1, stack contains a list of tags
treebuilder: etree.TreeBuilder
mdstate: list[Literal['block', 'span', 'off', None]]

def __init__(self, md: Markdown, *args, **kwargs):
# All block-level tags.
self.block_level_tags = set(md.block_level_elements.copy())
Expand All @@ -58,9 +62,9 @@ def __init__(self, md: Markdown, *args, **kwargs):

def reset(self):
"""Reset this instance. Loses all unprocessed data."""
self.mdstack: list[str] = [] # When markdown=1, stack contains a list of tags
self.mdstack = [] # When markdown=1, stack contains a list of tags
self.treebuilder = etree.TreeBuilder()
self.mdstate: list[Literal['block', 'span', 'off', None]] = []
self.mdstate = []
super().reset()

def close(self):
Expand Down
2 changes: 1 addition & 1 deletion markdown/extensions/meta.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@ def run(self, lines: list[str]) -> list[str]:
else:
lines.insert(0, line)
break # no meta data - done
self.md.Meta = meta
self.md.Meta = meta # type: ignore[attr-defined]
return lines


Expand Down
12 changes: 7 additions & 5 deletions markdown/extensions/smarty.py
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,9 @@ def handleMatch(self, m: re.Match[str], data: str) -> tuple[str, int, int]:

class SmartyExtension(Extension):
""" Add Smarty to Markdown. """

substitutions: dict[str, str]

def __init__(self, **kwargs):
self.config = {
'smart_quotes': [True, 'Educate quotes'],
Expand All @@ -189,7 +192,7 @@ def __init__(self, **kwargs):
}
""" Default configuration options. """
super().__init__(**kwargs)
self.substitutions: dict[str, str] = dict(substitutions)
self.substitutions = dict(substitutions)
self.substitutions.update(self.getConfig('substitutions', default={}))

def _addPatterns(
Expand All @@ -199,9 +202,8 @@ def _addPatterns(
serie: str,
priority: int,
):
for ind, pattern in enumerate(patterns):
pattern += (md,)
pattern = SubstituteTextPattern(*pattern)
for ind, pattern_args in enumerate(patterns):
pattern = SubstituteTextPattern(*pattern_args, md)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Again the dynamic nature of Python is being taken away here.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is indeed a limitation of mypy.
That being said, this change is just a mordernization that could've been done anyway.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To be clear. if the code had been written that way originally when it was submitted in a PR, I would have accepted It. But I would be willing to accept the existing code also (as I did). I don't have an objection to either way, but I do have an objection to a tool forcing one way on me and not allowing the other. That is not a tool I am interested in using.

name = 'smarty-%s-%d' % (serie, ind)
self.inlinePatterns.register(pattern, name, priority-ind)

Expand Down Expand Up @@ -253,7 +255,7 @@ def educateQuotes(self, md: Markdown) -> None:
)
self._addPatterns(md, patterns, 'quotes', 30)

def extendMarkdown(self, md):
def extendMarkdown(self, md: Markdown):
configs = self.getConfigs()
self.inlinePatterns: Registry[inlinepatterns.InlineProcessor] = Registry()
if configs['smart_ellipses']:
Expand Down
6 changes: 3 additions & 3 deletions markdown/extensions/toc.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ def stashedHTML2text(text: str, md: Markdown, strip_entities: bool = True) -> st
def _html_sub(m: re.Match[str]) -> str:
""" Substitute raw html with plain text. """
try:
raw = md.htmlStash.rawHtmlBlocks[int(m.group(1))]
raw: str = md.htmlStash.rawHtmlBlocks[int(m.group(1))]
except (IndexError, TypeError): # pragma: no cover
return m.group(0)
# Strip out tags and/or entities - leaving text
Expand Down Expand Up @@ -335,8 +335,8 @@ def run(self, doc: etree.Element) -> None:
toc = self.md.serializer(div)
for pp in self.md.postprocessors:
toc = pp.run(toc)
self.md.toc_tokens = toc_tokens
self.md.toc = toc
self.md.toc_tokens = toc_tokens # type: ignore[attr-defined]
self.md.toc = toc # type: ignore[attr-defined]


class TocExtension(Extension):
Expand Down
1 change: 1 addition & 0 deletions markdown/extensions/wikilinks.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ def __init__(self, pattern: str, config: dict[str, Any]):
self.config = config

def handleMatch(self, m: re.Match[str], data: str) -> tuple[etree.Element | str, int, int]:
a: etree.Element | str
if m.group(1).strip():
base_url, end_url, html_class = self._getMeta()
label = m.group(1).strip()
Expand Down
13 changes: 8 additions & 5 deletions markdown/htmlparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
import re
import importlib.util
import sys
from typing import TYPE_CHECKING, Sequence
from typing import TYPE_CHECKING, Any, Sequence

if TYPE_CHECKING: # pragma: no cover
from markdown import Markdown
Expand All @@ -37,7 +37,7 @@
# Import a copy of the html.parser lib as `htmlparser` so we can monkeypatch it.
# Users can still do `from html import parser` and get the default behavior.
spec = importlib.util.find_spec('html.parser')
htmlparser = importlib.util.module_from_spec(spec)
htmlparser: Any = importlib.util.module_from_spec(spec)
spec.loader.exec_module(htmlparser)
sys.modules['htmlparser'] = htmlparser

Expand Down Expand Up @@ -80,6 +80,9 @@ class HTMLExtractor(htmlparser.HTMLParser):
is stored in `cleandoc` as a list of strings.
"""

stack: list[str]
cleandoc: list[str]

def __init__(self, md: Markdown, *args, **kwargs):
if 'convert_charrefs' not in kwargs:
kwargs['convert_charrefs'] = False
Expand All @@ -97,9 +100,9 @@ def reset(self):
"""Reset this instance. Loses all unprocessed data."""
self.inraw = False
self.intail = False
self.stack: list[str] = [] # When `inraw==True`, stack contains a list of tags
self._cache: list[str] = []
self.cleandoc: list[str] = []
self.stack = [] # When `inraw==True`, stack contains a list of tags
self._cache = []
self.cleandoc = []
self.lineno_start_cache = [0]

super().reset()
Expand Down
46 changes: 27 additions & 19 deletions markdown/inlinepatterns.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ class EmStrongItem(NamedTuple):
# -----------------------------------------------------------------------------


class Pattern: # pragma: no cover
class _BasePattern:
"""
Base class that inline patterns subclass.

Expand Down Expand Up @@ -238,31 +238,18 @@ def getCompiledRegExp(self) -> re.Pattern:
""" Return a compiled regular expression. """
return self.compiled_re

def handleMatch(self, m: re.Match[str]) -> etree.Element | str:
"""Return a ElementTree element from the given match.

Subclasses should override this method.

Arguments:
m: A match object containing a match of the pattern.

Returns: An ElementTree Element object.

"""
pass # pragma: no cover

def type(self) -> str:
""" Return class name, to define pattern type """
return self.__class__.__name__

def unescape(self, text: str) -> str:
""" Return unescaped text given text with an inline placeholder. """
try:
stash = self.md.treeprocessors['inline'].stashed_nodes
stash = self.md.treeprocessors['inline'].stashed_nodes # type: ignore[attr-defined]
except KeyError: # pragma: no cover
return text

def get_stash(m):
def get_stash(m: re.Match[str]) -> str:
id = m.group(1)
if id in stash:
value = stash.get(id)
Expand All @@ -274,6 +261,27 @@ def get_stash(m):
return util.INLINE_PLACEHOLDER_RE.sub(get_stash, text)


class LegacyPattern(_BasePattern):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have very serious objections to the changes to this file. We have API changes. Under no circumstances am I interested in making API changes for type checking. If that means it's not possible to move forward with mypy, then we don't move forward with mypy.

To be clear, this specific line of code is not any more or less relevant than any other. However, the changes are rather sweeping and it was not possible to easily highlight and comment on all relevant lines. I'm speaking about most of the changes to this file in general.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is no actual change to the API, all usages can remain the same.

But this is still indeed the worst change here. I spent a lot of time thinking of this but it's the best I came up with. We could also solicit ideas regarding this.

I'll comment further on this soon.

Copy link
Contributor Author

@oprypin oprypin Nov 3, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let me explain what the problem here is.

This is the general explanation:
https://stackoverflow.com/questions/6034662/python-method-overriding-does-signature-matter

For the particular situation at hand:
class InlineProcessor(Pattern): violates the Liskov Substitution Principle.

And let's just quote directly from mypy's output:

Signature of "handleMatch" incompatible with supertype "Pattern"  [override]
     Superclass:
         def handleMatch(self, m: Match[str]) -> Element | str
     Subclass:
         def handleMatch(self, m: Match[str], data: str) -> tuple[Element | str | None, int | None, int | None]

The particular violation happens regardless of mypy, it's an actual run-time problem if users expect it to work like is widely accepted:

If you write:

if isinstance(foo, Pattern):
    foo.handleMatch(single_argument)

it must work, because every instance of Pattern must have a single-argument handleMatch method, because it's written so right there!

An instance of InlineProcessor is an instance of Pattern, and every instance of it violates this principle.

More specific code example:

>>> import re
>>> from markdown.inlinepatterns import EscapeInlineProcessor, Pattern

>>> foo = Pattern('a')
>>> foo.handleMatch(re.search('a', 'a'))  # OK

>>> foo = EscapeInlineProcessor('a')
>>> isinstance(foo, Pattern)
True
>>> foo.handleMatch(re.search('a', 'a'))
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
TypeError: EscapeInlineProcessor.handleMatch() missing 1 required positional argument: 'data'

The correct hierarchy would be this:

  graph TD;
      BasePattern["BasePattern<br>defines helper methods only"]-->Pattern["Pattern<br>abstract def handleMatch(m):"];
      BasePattern-->InlineProcessor["InlineProcessor<br>abstract def handleMatch(m, data):"];
      Pattern-->SimpleTextPattern["SimpleTextPattern<br>def handleMatch(m):"]
      Pattern-->SimpleTagPattern["SimpleTagPattern<br>def handleMatch(m):"]
      InlineProcessor-->EscapeInlineProcessor["EscapeInlineProcessor<br>def handleMatch(m, data):"]
      InlineProcessor-->AsteriskProcessor["AsteriskProcessor<br>def handleMatch(m, data):"]
Loading

But the current hierarchy is:

  graph TD;
      Pattern["Pattern<br>defines helper methods<br>abstract def handleMatch(m):"];
      Pattern-->InlineProcessor["InlineProcessor<br>abstract def handleMatch(m, data):"];
      Pattern-->SimpleTextPattern["SimpleTextPattern<br>def handleMatch(m):"]
      Pattern-->SimpleTagPattern["SimpleTagPattern<br>def handleMatch(m):"]
      InlineProcessor-->EscapeInlineProcessor["EscapeInlineProcessor<br>def handleMatch(m, data):"]
      InlineProcessor-->AsteriskProcessor["AsteriskProcessor<br>def handleMatch(m, data):"]
Loading

As such, the entire hierarchy of InlineProcessor is said to have improper overrides.

What my change accomplishes is to construct the correct type hierarchy to mypy and to all typechecking users going forward.
However at run time I am keeping everything just like it was. LegacyPattern is just a new alias to Pattern

Copy link
Contributor Author

@oprypin oprypin Nov 3, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See a code search of occurrences in the wild: currently every subclass of InlineProcessor needs to add an ignore comment for this override.

https://github.com/search?q=%2FInlineProcessor%5C%29%2F+%2FhandleMatch.%2Bignore%2F+NOT+path%3Azerver&type=code

Of course at the moment that's not this repository's fault (as it doesn't have py.typed) but just because the type stubs are written in a way that directly mirrors it:
https://github.com/python/typeshed/blob/9ff393558a6f4fedc086bc9dc4386dbe529a3331/stubs/Markdown/markdown/inlinepatterns.pyi#L48

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's also this issue report (which I found through the above code search because it's directly inspired by this case). Which if accepted would resolve this

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

An equivalent change to the external stubs would be this:
oprypin/typeshed@43593a4
Maybe this more condensed view is clearer to understand.

def handleMatch(self, m: re.Match[str]) -> etree.Element | str:
"""Return a ElementTree element from the given match.

Subclasses should override this method.

Arguments:
m: A match object containing a match of the pattern.

Returns: An ElementTree Element object.

"""
pass # pragma: no cover


if TYPE_CHECKING: # pragma: no cover
Pattern = _BasePattern
else:
Pattern = LegacyPattern


class InlineProcessor(Pattern):
"""
Base class that inline processors subclass.
Expand Down Expand Up @@ -505,13 +513,13 @@ def handleMatch(self, m: re.Match[str], data: str) -> tuple[str, int, int]:
def unescape(self, text: str) -> str:
""" Return unescaped text given text with an inline placeholder. """
try:
stash = self.md.treeprocessors['inline'].stashed_nodes
stash = self.md.treeprocessors['inline'].stashed_nodes # type: ignore[attr-defined]
except KeyError: # pragma: no cover
return text

def get_stash(m: re.Match[str]) -> str:
id = m.group(1)
value = stash.get(id)
value: etree.Element | None = stash.get(id)
if value is not None:
try:
return self.md.serializer(value)
Expand All @@ -523,7 +531,7 @@ def get_stash(m: re.Match[str]) -> str:
def backslash_unescape(self, text: str) -> str:
""" Return text with backslash escapes undone (backslashes are restored). """
try:
RE = self.md.treeprocessors['unescape'].RE
RE = self.md.treeprocessors['unescape'].RE # type: ignore[attr-defined]
except KeyError: # pragma: no cover
return text

Expand Down
2 changes: 1 addition & 1 deletion markdown/postprocessors.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def run(self, text: str) -> str:
""" Iterate over html stash and restore html. """
replacements = OrderedDict()
for i in range(self.md.htmlStash.html_counter):
html = self.stash_to_string(self.md.htmlStash.rawHtmlBlocks[i])
html = self.stash_to_string(self.md.htmlStash.rawHtmlBlocks[i]) # type: ignore[arg-type]
if self.isblocklevel(html):
replacements["<p>{}</p>".format(
self.md.htmlStash.get_placeholder(i))] = html
Expand Down
4 changes: 2 additions & 2 deletions markdown/serializers.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,8 @@

from __future__ import annotations

from xml.etree.ElementTree import ProcessingInstruction
from xml.etree.ElementTree import Comment, ElementTree, Element, QName, HTML_EMPTY
from xml.etree.ElementTree import ProcessingInstruction, Comment, ElementTree, Element, QName
from xml.etree.ElementTree import HTML_EMPTY # type: ignore[attr-defined]
import re
from typing import Callable, Literal, NoReturn

Expand Down
Loading
Loading