-
-
Notifications
You must be signed in to change notification settings - Fork 31.5k
Closed
Labels
interpreter-core (Objects, Python, Grammar, and Parser dirs); type-feature (A feature request or enhancement)
Description
Feature or enhancement
Right now PyUnicode_Count, from cpython/Objects/unicodeobject.c (lines 8968 to 9040 in cbdeda8):
/*
 * PyUnicode_Count: count how many times `substr` is found in the slice
 * str[start:end] and return that count.
 *
 * Parameters:
 *   str     object to search in; validated with ensure_unicode().
 *   substr  object to search for; validated with ensure_unicode().
 *   start   slice start, normalised by ADJUST_INDICES against len(str).
 *   end     slice end, normalised by ADJUST_INDICES against len(str).
 *
 * Returns:
 *   the count on success; 0 early when the substring cannot fit or has a
 *   wider kind than `str`; -1 when ensure_unicode() rejects an argument or
 *   when unicode_askind() returns NULL.
 *
 * NOTE(review): ensure_unicode, ADJUST_INDICES, unicode_askind and the
 * *lib_count helpers are defined outside this excerpt; comments about them
 * below are hedged accordingly.
 */
Py_ssize_t | |
PyUnicode_Count(PyObject *str, | |
PyObject *substr, | |
Py_ssize_t start, | |
Py_ssize_t end) | |
{ | |
Py_ssize_t result; | |
int kind1, kind2; | |
const void *buf1 = NULL, *buf2 = NULL; | |
Py_ssize_t len1, len2; | |
/* Both arguments must pass ensure_unicode(); otherwise report failure
   (presumably with an exception already set by the helper - confirm). */
if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0) | |
return -1; | |
kind1 = PyUnicode_KIND(str); | |
kind2 = PyUnicode_KIND(substr); | |
/* A substring whose kind is larger than the haystack's is treated as
   "no match". Presumably a larger kind implies a code point that cannot
   be stored in `str` - TODO confirm against the string representation
   rules. */
if (kind1 < kind2) | |
return 0; | |
len1 = PyUnicode_GET_LENGTH(str); | |
len2 = PyUnicode_GET_LENGTH(substr); | |
/* Macro defined elsewhere; presumably clamps/normalises start and end
   into the range [0, len1] - confirm. */
ADJUST_INDICES(start, end, len1); | |
/* The substring is longer than the slice being searched: no match. */
if (end - start < len2) | |
return 0; | |
buf1 = PyUnicode_DATA(str); | |
buf2 = PyUnicode_DATA(substr); | |
/* When the kinds differ, buf2 is replaced by the result of
   unicode_askind(); that replacement buffer is released with PyMem_Free
   before returning, so this function owns it. */
if (kind2 != kind1) { | |
buf2 = unicode_askind(kind2, buf2, len2, kind1); | |
if (!buf2) | |
goto onError; | |
} | |
/* Dispatch on the haystack's kind. Each helper receives the slice start
   pointer, the slice length, the needle buffer and its length. The final
   PY_SSIZE_T_MAX is presumably an upper bound on the number of matches
   counted - confirm in the *lib_count implementation. */
switch (kind1) { | |
case PyUnicode_1BYTE_KIND: | |
/* ASCII-only inputs take the asciilib variant; other 1-byte strings
   use ucs1lib. */
if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr)) | |
result = asciilib_count( | |
((const Py_UCS1*)buf1) + start, end - start, | |
buf2, len2, PY_SSIZE_T_MAX | |
); | |
else | |
result = ucs1lib_count( | |
((const Py_UCS1*)buf1) + start, end - start, | |
buf2, len2, PY_SSIZE_T_MAX | |
); | |
break; | |
case PyUnicode_2BYTE_KIND: | |
result = ucs2lib_count( | |
((const Py_UCS2*)buf1) + start, end - start, | |
buf2, len2, PY_SSIZE_T_MAX | |
); | |
break; | |
case PyUnicode_4BYTE_KIND: | |
result = ucs4lib_count( | |
((const Py_UCS4*)buf1) + start, end - start, | |
buf2, len2, PY_SSIZE_T_MAX | |
); | |
break; | |
default: | |
Py_UNREACHABLE(); | |
} | |
/* Invariant: buf2 points away from substr's own data exactly when a
   converted copy was made above. */
assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr))); | |
if (kind2 != kind1) | |
PyMem_Free((void *)buf2); | |
return result; | |
/* Only reached from the failed unicode_askind() call, i.e. with
   buf2 == NULL and kind2 != kind1, so the free below is handed NULL
   (documented as a no-op for PyMem_Free - see the C API memory docs). */
onError: | |
assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr))); | |
if (kind2 != kind1) | |
PyMem_Free((void *)buf2); | |
return -1; | |
} |
and unicode_count, from cpython/Objects/unicodeobject.c (lines 10854 to 10916 in cbdeda8):
/*
 * unicode_count: method-level wrapper that counts occurrences of a
 * substring in self[start:end] and returns the count as a Python int.
 *
 * Parameters:
 *   self  the string being searched.
 *   args  argument tuple handed to parse_args_finds_unicode("count", ...),
 *         which fills in `substring`, `start` and `end`; start defaults
 *         to 0 and end to PY_SSIZE_T_MAX when not overwritten.
 *
 * Returns:
 *   the object produced by PyLong_FromSsize_t(count) on success, or by
 *   PyLong_FromLong(0) on the early "cannot match" exits; NULL when
 *   argument parsing fails or unicode_askind() returns NULL.
 *
 * NOTE(review): the body mirrors PyUnicode_Count from the same file, but
 * the 1-byte case here always calls ucs1lib_count - there is no
 * PyUnicode_IS_ASCII check / asciilib_count branch.
 */
static PyObject * | |
unicode_count(PyObject *self, PyObject *args) | |
{ | |
PyObject *substring = NULL; /* initialize to fix a compiler warning */ | |
Py_ssize_t start = 0; | |
Py_ssize_t end = PY_SSIZE_T_MAX; | |
PyObject *result; | |
int kind1, kind2; | |
const void *buf1, *buf2; | |
Py_ssize_t len1, len2, iresult; | |
/* Helper defined elsewhere; a false result aborts the call with NULL
   (presumably with an exception set by the helper - confirm). */
if (!parse_args_finds_unicode("count", args, &substring, &start, &end)) | |
return NULL; | |
kind1 = PyUnicode_KIND(self); | |
kind2 = PyUnicode_KIND(substring); | |
/* A needle of larger kind than the haystack is reported as zero
   matches. Presumably it holds a code point `self` cannot store -
   TODO confirm. */
if (kind1 < kind2) | |
return PyLong_FromLong(0); | |
len1 = PyUnicode_GET_LENGTH(self); | |
len2 = PyUnicode_GET_LENGTH(substring); | |
/* Macro defined elsewhere; presumably normalises start/end against
   len1 - confirm. */
ADJUST_INDICES(start, end, len1); | |
/* Needle longer than the searched slice: zero matches. */
if (end - start < len2) | |
return PyLong_FromLong(0); | |
buf1 = PyUnicode_DATA(self); | |
buf2 = PyUnicode_DATA(substring); | |
/* With differing kinds, buf2 becomes the buffer returned by
   unicode_askind(), which this function later frees with PyMem_Free.
   On failure buf2 is NULL, so there is nothing to release before
   returning. */
if (kind2 != kind1) { | |
buf2 = unicode_askind(kind2, buf2, len2, kind1); | |
if (!buf2) | |
return NULL; | |
} | |
/* Dispatch on the haystack's kind; the trailing PY_SSIZE_T_MAX is
   presumably a cap on the number of matches counted - confirm. */
switch (kind1) { | |
case PyUnicode_1BYTE_KIND: | |
iresult = ucs1lib_count( | |
((const Py_UCS1*)buf1) + start, end - start, | |
buf2, len2, PY_SSIZE_T_MAX | |
); | |
break; | |
case PyUnicode_2BYTE_KIND: | |
iresult = ucs2lib_count( | |
((const Py_UCS2*)buf1) + start, end - start, | |
buf2, len2, PY_SSIZE_T_MAX | |
); | |
break; | |
case PyUnicode_4BYTE_KIND: | |
iresult = ucs4lib_count( | |
((const Py_UCS4*)buf1) + start, end - start, | |
buf2, len2, PY_SSIZE_T_MAX | |
); | |
break; | |
default: | |
Py_UNREACHABLE(); | |
} | |
/* The result object is created before the temporary buffer is released;
   whatever PyLong_FromSsize_t returns is passed straight to the caller. */
result = PyLong_FromSsize_t(iresult); | |
/* Invariant: buf2 still aliases substring's data exactly when no
   converted copy was made. */
assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substring))); | |
if (kind2 != kind1) | |
PyMem_Free((void *)buf2); | |
return result; | |
} |
They can be unified, because they do the same thing.
Pitch
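Apparently unicode_count missed an optimization in 2011 (PyUnicode_Count dispatches to asciilib_count when both strings are ASCII, whereas unicode_count always uses ucs1lib_count for 1-byte strings); otherwise they are equivalent, except for argument parsing and converting the return value. Merging them would bring that optimization to unicode_count.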
If you want to work on that, note that there's also anylib_count that duplicates the main switch.
Previous discussion
Link: #96929
PR in the works.
Metadata
Metadata
Assignees
Labels
interpreter-core (Objects, Python, Grammar, and Parser dirs); type-feature (A feature request or enhancement)