Skip to content

Commit 6958e77

Browse files
authored
[mypyc] Add str.isalnum() primitive (#20852)
Added a `str.isalnum()` primitive, modeled on the existing `str.isspace()` primitive. One interesting observation: the speedup over CPython shrinks as the string gets longer, and for long non-ASCII strings the compiled version is actually slower than CPython:

| All-alphanumeric | mypyc (s) | Python (s) | Speedup |
|--------|----------:|------------:|--------:|
| length 1 (`'a'`) | 0.645 | 2.036 | 3.16x |
| length 10 (`'abcde12345'`) | 1.026 | 2.607 | 2.54x |
| length 100 (`'a' * 100`) | 3.599 | 7.848 | 2.18x |
| length 1 (non-ASCII Latin-1: U+00E9 `é`) | 0.816 | 1.976 | 2.42x |
| length 10 (non-ASCII Latin-1: U+00E9 * 10) | 2.091 | 2.587 | 1.24x |
| length 100 (non-ASCII Latin-1: U+00E9 * 100) | 14.298 | 7.814 | 0.55x |

<br />

| Non-alphanumeric (early exit) | mypyc (s) | Python (s) | Speedup |
|--------|----------:|------------:|--------:|
| length 1 (`' '`) | 0.622 | 2.006 | 3.22x |
| length 100 (`'!' * 100`) | 0.617 | 2.024 | 3.28x |
| length 100 (`'a' * 99 + '!'`) | 3.453 | 10.246 | 2.97x |

<br />

I am not entirely sure how to interpret this. One possible explanation is that [Py_UNICODE_ISALNUM](https://github.com/python/cpython/blob/175ab31377d9e616efb95168099d8c2c9036504a/Include/cpython/unicodeobject.h#L769) makes up to four function calls per character, and those calls are cheaper inside CPython itself because CPython is built with PGO and LTO.
1 parent a20e897 commit 6958e77

File tree

7 files changed

+77
-0
lines changed

7 files changed

+77
-0
lines changed

mypyc/doc/str_operations.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,8 @@ Methods
3838
* ``s1.find(s2: str)``
3939
* ``s1.find(s2: str, start: int)``
4040
* ``s1.find(s2: str, start: int, end: int)``
41+
* ``s.isspace()``
42+
* ``s.isalnum()``
4143
* ``s.join(x: Iterable)``
4244
* ``s.lstrip()``
4345
* ``s.lstrip(chars: str)``

mypyc/lib-rt/CPy.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -781,6 +781,7 @@ Py_ssize_t CPyStr_CountFull(PyObject *unicode, PyObject *substring, CPyTagged st
781781
CPyTagged CPyStr_Ord(PyObject *obj);
782782
PyObject *CPyStr_Multiply(PyObject *str, CPyTagged count);
783783
bool CPyStr_IsSpace(PyObject *str);
784+
bool CPyStr_IsAlnum(PyObject *str);
784785

785786
// Bytes operations
786787

mypyc/lib-rt/str_ops.c

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -654,3 +654,26 @@ bool CPyStr_IsSpace(PyObject *str) {
654654
}
655655
return true;
656656
}
657+
658+
// Runtime helper for str.isalnum(): returns true only when the string is
// non-empty and every code point in it is alphanumeric.
bool CPyStr_IsAlnum(PyObject *str) {
    const Py_ssize_t length = PyUnicode_GET_LENGTH(str);

    // The empty string is never alphanumeric.
    if (length == 0) {
        return false;
    }

    if (PyUnicode_IS_ASCII(str)) {
        // ASCII fast path: walk the 1-byte buffer directly and use the
        // ASCII-only classification macro.
        const Py_UCS1 *cursor = PyUnicode_1BYTE_DATA(str);
        const Py_UCS1 *const stop = cursor + length;
        while (cursor != stop) {
            if (!Py_ISALNUM(*cursor)) {
                return false;
            }
            cursor++;
        }
        return true;
    }

    // General path: read each code point according to the string's storage
    // kind and classify it with the Unicode-aware macro.
    const int kind = PyUnicode_KIND(str);
    const void *buffer = PyUnicode_DATA(str);
    Py_ssize_t pos = 0;
    while (pos < length) {
        const Py_UCS4 code_point = PyUnicode_READ(kind, buffer, pos);
        if (!Py_UNICODE_ISALNUM(code_point)) {
            return false;
        }
        pos++;
    }
    return true;
}

mypyc/primitives/str_ops.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -405,6 +405,15 @@
405405
error_kind=ERR_NEVER,
406406
)
407407

408+
# str.isalnum()
# Registers the "isalnum" method on str_rprimitive; calls lower to the C
# helper CPyStr_IsAlnum, return bool_rprimitive, and are declared ERR_NEVER.
method_op(
409+
name="isalnum",
410+
arg_types=[str_rprimitive],
411+
return_type=bool_rprimitive,
412+
c_function_name="CPyStr_IsAlnum",
413+
error_kind=ERR_NEVER,
414+
)
415+
416+
408417
# obj.decode()
409418
method_op(
410419
name="decode",

mypyc/test-data/fixtures/ir.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,7 @@ def removesuffix(self, suffix: str, /) -> str: ...
132132
def islower(self) -> bool: ...
133133
def count(self, substr: str, start: Optional[int] = None, end: Optional[int] = None) -> int: pass
134134
def isspace(self) -> bool: ...
135+
def isalnum(self) -> bool: ...
135136

136137
class float:
137138
def __init__(self, x: object) -> None: pass

mypyc/test-data/irbuild-str.test

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -983,3 +983,14 @@ def is_space(x):
983983
L0:
984984
r0 = CPyStr_IsSpace(x)
985985
return r0
986+
987+
[case testStrIsAlnum]
988+
def is_alnum(x: str) -> bool:
989+
return x.isalnum()
990+
[out]
991+
def is_alnum(x):
992+
x :: str
993+
r0 :: bool
994+
L0:
995+
r0 = CPyStr_IsAlnum(x)
996+
return r0

mypyc/test-data/run-strings.test

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1269,3 +1269,33 @@ def test_isspace() -> None:
12691269
c = chr(i)
12701270
a: Any = c
12711271
assert c.isspace() == a.isspace()
1272+
1273+
[case testIsAlnum]
1274+
def test_isalnum_basic() -> None:
    """Check str.isalnum() on ASCII-only inputs, including the empty string."""
    # Letters, digits, and mixtures of the two are alphanumeric.
    for accepted in ("abc", "ABC", "abc123", "123"):
        assert accepted.isalnum()

    # The empty string and strings containing whitespace or punctuation are not.
    for rejected in ("", " ", "abc!", "hello world", "abc-123"):
        assert not rejected.isalnum()
1284+
1285+
def test_isalnum_unicode() -> None:
    """Check str.isalnum() on non-ASCII input (UCS-1, UCS-2 and UCS-4 characters)."""
    # Single chars: letters and digits from various scripts
    single_alnum = (
        "\u00E9",  # é (UCS-1 Latin letter)
        "\u0660",  # ٠ (UCS-2 Arabic-Indic digit)
        "\u4E2D",  # 中 (UCS-2 CJK ideograph)
        "\U00010400",  # 𐐀 (UCS-4 Deseret capital letter long I)
    )
    for ch in single_alnum:
        assert ch.isalnum()

    single_not_alnum = (
        "\u2000",  # EN QUAD (whitespace)
        "\u0021",  # !
        "\u00B6",  # ¶ (pilcrow sign, punctuation)
    )
    for ch in single_not_alnum:
        assert not ch.isalnum()

    # Mixed Unicode letters and digits — all alnum
    mixed_alnum = (
        "\u00E9\u0660",  # é٠
        "\u4E2D\u0041\u0660",  # 中A٠
    )
    for text in mixed_alnum:
        assert text.isalnum()

    # Unicode letter/digit mixed with punctuation — not alnum
    mixed_not_alnum = (
        "\u00E9!",
        "\u4E2D\u2000",  # CJK + whitespace
    )
    for text in mixed_not_alnum:
        assert not text.isalnum()

0 commit comments

Comments
 (0)