Skip to content

Commit f52249e

Browse files
chrisjsewell and hukkin authored
♻️ REFACTOR: Replace character codes with strings (#270)
The use of `StateBase.srcCharCode` is deprecated (with backward-compatibility), and all core uses are replaced by `StateBase.src`. Conversion of source string characters to an integer representing the Unicode character is prevalent in the upstream JavaScript implementation, to improve performance. However, it is unnecessary in Python and leads to harder-to-read code and performance degradations (during the conversion in the `StateBase` initialisation). `StateBase.srcCharCode` is no longer populated on initialisation, but is left as an on-demand, cached property, to allow backward compatibility for plugins (deprecation warnings are emitted to identify where updates are required). `isStrSpace` is supplied as a replacement for `isSpace`, and similarly `StateBlock.skipCharsStr`/`StateBlock.skipCharsStrBack` replace `StateBlock.skipChars`/`StateBlock.skipCharsBack`. Co-authored-by: Taneli Hukkinen <3275109+hukkin@users.noreply.github.com>
1 parent c6754a2 commit f52249e

32 files changed

+321
-282
lines changed

markdown_it/common/utils.py

Lines changed: 28 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,12 @@
44

55
import html
66
import re
7-
from typing import Any, Match, TypeVar
7+
from typing import Match, TypeVar
88

99
from .entities import entities
1010

1111

12-
def charCodeAt(src: str, pos: int) -> Any:
12+
def charCodeAt(src: str, pos: int) -> int | None:
1313
"""
1414
Returns the Unicode value of the character at the specified location.
1515
@@ -24,6 +24,21 @@ def charCodeAt(src: str, pos: int) -> Any:
2424
return None
2525

2626

27+
def charStrAt(src: str, pos: int) -> str | None:
28+
"""
29+
Returns the character at the specified location.
30+
31+
@param - index The zero-based index of the desired character.
32+
If there is no character at the specified index, None is returned.
33+
34+
This was added for compatibility with python
35+
"""
36+
try:
37+
return src[pos]
38+
except IndexError:
39+
return None
40+
41+
2742
_ItemTV = TypeVar("_ItemTV")
2843

2944

@@ -96,7 +111,7 @@ def replaceEntityPattern(match: str, name: str) -> str:
96111
if name in entities:
97112
return entities[name]
98113

99-
if ord(name[0]) == 0x23 and DIGITAL_ENTITY_TEST_RE.search(name):
114+
if name[0] == "#" and DIGITAL_ENTITY_TEST_RE.search(name):
100115
code = int(name[2:], 16) if name[1].lower() == "x" else int(name[1:], 10)
101116
if isValidEntityCode(code):
102117
return fromCodePoint(code)
@@ -178,8 +193,14 @@ def escapeRE(string: str) -> str:
178193
# //////////////////////////////////////////////////////////////////////////////
179194

180195

181-
def isSpace(code: object) -> bool:
182-
return code in {0x09, 0x20}
196+
def isSpace(code: int | None) -> bool:
197+
"""Check if character code is a whitespace."""
198+
return code in (0x09, 0x20)
199+
200+
201+
def isStrSpace(ch: str | None) -> bool:
202+
"""Check if character is a whitespace."""
203+
return ch in ("\t", " ")
183204

184205

185206
MD_WHITESPACE = {
@@ -188,7 +209,7 @@ def isSpace(code: object) -> bool:
188209
0x0B, # \v
189210
0x0C, # \f
190211
0x0D, # \r
191-
0x20,
212+
0x20, # space
192213
0xA0,
193214
0x1680,
194215
0x202F,
@@ -213,6 +234,7 @@ def isWhiteSpace(code: int) -> bool:
213234

214235
# Currently without astral characters support.
215236
def isPunctChar(ch: str) -> bool:
237+
"""Check if character is a punctuation character."""
216238
return UNICODE_PUNCT_RE.search(ch) is not None
217239

218240

markdown_it/helpers/parse_link_destination.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ def parseLinkDestination(string: str, pos: int, maximum: int) -> _Result:
4949
while pos < maximum:
5050
code = charCodeAt(string, pos)
5151

52-
if code == 0x20:
52+
if code is None or code == 0x20:
5353
break
5454

5555
# ascii control characters

markdown_it/helpers/parse_link_label.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,16 +17,16 @@ def parseLinkLabel(state: StateInline, start: int, disableNested: bool = False)
1717
level = 1
1818

1919
while state.pos < state.posMax:
20-
marker = state.srcCharCode[state.pos]
21-
if marker == 0x5D: # /* ] */)
20+
marker = state.src[state.pos]
21+
if marker == "]":
2222
level -= 1
2323
if level == 0:
2424
found = True
2525
break
2626

2727
prevPos = state.pos
2828
state.md.inline.skipToken(state)
29-
if marker == 0x5B: # /* [ */)
29+
if marker == "[":
3030
if prevPos == state.pos - 1:
3131
# increase level if we find text `[`,
3232
# which is not a part of any token

markdown_it/main.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ def __init__(
4646
"""
4747
# add modules
4848
self.utils = utils
49-
self.helpers: Any = helpers
49+
self.helpers = helpers
5050

5151
# initialise classes
5252
self.inline = ParserInline()

markdown_it/parser_block.py

Lines changed: 2 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -97,16 +97,11 @@ def tokenize(
9797
state.line = line
9898

9999
def parse(
100-
self,
101-
src: str,
102-
md: MarkdownIt,
103-
env: EnvType,
104-
outTokens: list[Token],
105-
ords: tuple[int, ...] | None = None,
100+
self, src: str, md: MarkdownIt, env: EnvType, outTokens: list[Token]
106101
) -> list[Token] | None:
107102
"""Process input string and push block tokens into `outTokens`."""
108103
if not src:
109104
return None
110-
state = StateBlock(src, md, env, outTokens, ords)
105+
state = StateBlock(src, md, env, outTokens)
111106
self.tokenize(state, state.line, state.lineMax)
112107
return state.tokens

markdown_it/port.yaml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,8 +23,7 @@
2323
to manipulate `Token.attrs`, which have an identical signature to those upstream.
2424
- Use python version of `charCodeAt`
2525
- |
26-
Reduce use of charCodeAt() by storing char codes in a srcCharCodes attribute for state
27-
objects and sharing those whenever possible
26+
Use `str` units instead of `int`s to represent Unicode codepoints.
2827
This provides a significant performance boost
2928
- |
3029
In markdown_it/rules_block/reference.py,

markdown_it/ruler.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ class Ruler
2020
from collections.abc import Callable, Iterable
2121
from dataclasses import dataclass, field
2222
from typing import TYPE_CHECKING, TypedDict
23+
import warnings
2324

2425
from markdown_it._compat import DATACLASS_KWARGS
2526

@@ -30,8 +31,6 @@ class Ruler
3031

3132

3233
class StateBase:
33-
srcCharCode: tuple[int, ...] # noqa: N815
34-
3534
def __init__(self, src: str, md: MarkdownIt, env: EnvType):
3635
self.src = src
3736
self.env = env
@@ -44,7 +43,18 @@ def src(self) -> str:
4443
@src.setter
4544
def src(self, value: str) -> None:
4645
self._src = value
47-
self.srcCharCode = tuple(ord(c) for c in self.src)
46+
self._srcCharCode: tuple[int, ...] | None = None
47+
48+
@property
49+
def srcCharCode(self) -> tuple[int, ...]:
50+
warnings.warn(
51+
"StateBase.srcCharCode is deprecated. Use StateBase.src instead.",
52+
DeprecationWarning,
53+
stacklevel=2,
54+
)
55+
if self._srcCharCode is None:
56+
self._srcCharCode = tuple(ord(c) for c in self._src)
57+
return self._srcCharCode
4858

4959

5060
# The first positional arg is always a subtype of `StateBase`. Other

markdown_it/rules_block/blockquote.py

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
import logging
55

6-
from ..common.utils import isSpace
6+
from ..common.utils import isStrSpace
77
from .state_block import StateBlock
88

99
LOGGER = logging.getLogger(__name__)
@@ -23,7 +23,7 @@ def blockquote(state: StateBlock, startLine: int, endLine: int, silent: bool) ->
2323

2424
# check the block quote marker
2525
try:
26-
if state.srcCharCode[pos] != 0x3E: # /* > */
26+
if state.src[pos] != ">":
2727
return False
2828
except IndexError:
2929
return False
@@ -38,20 +38,20 @@ def blockquote(state: StateBlock, startLine: int, endLine: int, silent: bool) ->
3838
initial = offset = state.sCount[startLine] + 1
3939

4040
try:
41-
second_char_code: int | None = state.srcCharCode[pos]
41+
second_char: str | None = state.src[pos]
4242
except IndexError:
43-
second_char_code = None
43+
second_char = None
4444

4545
# skip one optional space after '>'
46-
if second_char_code == 0x20: # /* space */
46+
if second_char == " ":
4747
# ' > test '
4848
# ^ -- position start of line here:
4949
pos += 1
5050
initial += 1
5151
offset += 1
5252
adjustTab = False
5353
spaceAfterMarker = True
54-
elif second_char_code == 0x09: # /* tab */
54+
elif second_char == "\t":
5555
spaceAfterMarker = True
5656

5757
if (state.bsCount[startLine] + offset) % 4 == 3:
@@ -74,10 +74,10 @@ def blockquote(state: StateBlock, startLine: int, endLine: int, silent: bool) ->
7474
state.bMarks[startLine] = pos
7575

7676
while pos < max:
77-
ch = state.srcCharCode[pos]
77+
ch = state.src[pos]
7878

79-
if isSpace(ch):
80-
if ch == 0x09: # / tab /
79+
if isStrSpace(ch):
80+
if ch == "\t":
8181
offset += (
8282
4
8383
- (offset + state.bsCount[startLine] + (1 if adjustTab else 0)) % 4
@@ -147,7 +147,7 @@ def blockquote(state: StateBlock, startLine: int, endLine: int, silent: bool) ->
147147
# Case 1: line is not inside the blockquote, and this line is empty.
148148
break
149149

150-
evaluatesTrue = state.srcCharCode[pos] == 0x3E and not isOutdented # /* > */
150+
evaluatesTrue = state.src[pos] == ">" and not isOutdented
151151
pos += 1
152152
if evaluatesTrue:
153153
# This line is inside the blockquote.
@@ -156,20 +156,20 @@ def blockquote(state: StateBlock, startLine: int, endLine: int, silent: bool) ->
156156
initial = offset = state.sCount[nextLine] + 1
157157

158158
try:
159-
next_char: int | None = state.srcCharCode[pos]
159+
next_char: str | None = state.src[pos]
160160
except IndexError:
161161
next_char = None
162162

163163
# skip one optional space after '>'
164-
if next_char == 0x20: # /* space */
164+
if next_char == " ":
165165
# ' > test '
166166
# ^ -- position start of line here:
167167
pos += 1
168168
initial += 1
169169
offset += 1
170170
adjustTab = False
171171
spaceAfterMarker = True
172-
elif next_char == 0x09: # /* tab */
172+
elif next_char == "\t":
173173
spaceAfterMarker = True
174174

175175
if (state.bsCount[nextLine] + offset) % 4 == 3:
@@ -192,10 +192,10 @@ def blockquote(state: StateBlock, startLine: int, endLine: int, silent: bool) ->
192192
state.bMarks[nextLine] = pos
193193

194194
while pos < max:
195-
ch = state.srcCharCode[pos]
195+
ch = state.src[pos]
196196

197-
if isSpace(ch):
198-
if ch == 0x09:
197+
if isStrSpace(ch):
198+
if ch == "\t":
199199
offset += (
200200
4
201201
- (

markdown_it/rules_block/fence.py

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -19,15 +19,14 @@ def fence(state: StateBlock, startLine: int, endLine: int, silent: bool) -> bool
1919
if pos + 3 > maximum:
2020
return False
2121

22-
marker = state.srcCharCode[pos]
22+
marker = state.src[pos]
2323

24-
# /* ~ */ /* ` */
25-
if marker != 0x7E and marker != 0x60:
24+
if marker not in ("~", "`"):
2625
return False
2726

2827
# scan marker length
2928
mem = pos
30-
pos = state.skipChars(pos, marker)
29+
pos = state.skipCharsStr(pos, marker)
3130

3231
length = pos - mem
3332

@@ -37,8 +36,7 @@ def fence(state: StateBlock, startLine: int, endLine: int, silent: bool) -> bool
3736
markup = state.src[mem:pos]
3837
params = state.src[pos:maximum]
3938

40-
# /* ` */
41-
if marker == 0x60 and chr(marker) in params:
39+
if marker == "`" and marker in params:
4240
return False
4341

4442
# Since start is found, we can report success here in validation mode
@@ -65,15 +63,15 @@ def fence(state: StateBlock, startLine: int, endLine: int, silent: bool) -> bool
6563
break
6664

6765
try:
68-
if state.srcCharCode[pos] != marker:
66+
if state.src[pos] != marker:
6967
continue
7068
except IndexError:
7169
break
7270

7371
if state.is_code_block(nextLine):
7472
continue
7573

76-
pos = state.skipChars(pos, marker)
74+
pos = state.skipCharsStr(pos, marker)
7775

7876
# closing code fence must be at least as long as the opening one
7977
if pos - mem < length:

markdown_it/rules_block/heading.py

Lines changed: 9 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
import logging
55

6-
from ..common.utils import isSpace
6+
from ..common.utils import isStrSpace
77
from .state_block import StateBlock
88

99
LOGGER = logging.getLogger(__name__)
@@ -18,29 +18,27 @@ def heading(state: StateBlock, startLine: int, endLine: int, silent: bool) -> bo
1818
if state.is_code_block(startLine):
1919
return False
2020

21-
ch: int | None = state.srcCharCode[pos]
21+
ch: str | None = state.src[pos]
2222

23-
# /* # */
24-
if ch != 0x23 or pos >= maximum:
23+
if ch != "#" or pos >= maximum:
2524
return False
2625

2726
# count heading level
2827
level = 1
2928
pos += 1
3029
try:
31-
ch = state.srcCharCode[pos]
30+
ch = state.src[pos]
3231
except IndexError:
3332
ch = None
34-
# /* # */
35-
while ch == 0x23 and pos < maximum and level <= 6:
33+
while ch == "#" and pos < maximum and level <= 6:
3634
level += 1
3735
pos += 1
3836
try:
39-
ch = state.srcCharCode[pos]
37+
ch = state.src[pos]
4038
except IndexError:
4139
ch = None
4240

43-
if level > 6 or (pos < maximum and not isSpace(ch)):
41+
if level > 6 or (pos < maximum and not isStrSpace(ch)):
4442
return False
4543

4644
if silent:
@@ -49,8 +47,8 @@ def heading(state: StateBlock, startLine: int, endLine: int, silent: bool) -> bo
4947
# Let's cut tails like ' ### ' from the end of string
5048

5149
maximum = state.skipSpacesBack(maximum, pos)
52-
tmp = state.skipCharsBack(maximum, 0x23, pos) # #
53-
if tmp > pos and isSpace(state.srcCharCode[tmp - 1]):
50+
tmp = state.skipCharsStrBack(maximum, "#", pos)
51+
if tmp > pos and isStrSpace(state.src[tmp - 1]):
5452
maximum = tmp
5553

5654
state.line = startLine + 1

0 commit comments

Comments
 (0)