Skip to content

Unify PyUncode_Count and unicode_count #97982

Closed
@sobolevn

Description

@sobolevn

Feature or enhancement

Right now PyUnicode_Count from

Py_ssize_t
PyUnicode_Count(PyObject *str,
PyObject *substr,
Py_ssize_t start,
Py_ssize_t end)
{
Py_ssize_t result;
int kind1, kind2;
const void *buf1 = NULL, *buf2 = NULL;
Py_ssize_t len1, len2;
if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
return -1;
kind1 = PyUnicode_KIND(str);
kind2 = PyUnicode_KIND(substr);
if (kind1 < kind2)
return 0;
len1 = PyUnicode_GET_LENGTH(str);
len2 = PyUnicode_GET_LENGTH(substr);
ADJUST_INDICES(start, end, len1);
if (end - start < len2)
return 0;
buf1 = PyUnicode_DATA(str);
buf2 = PyUnicode_DATA(substr);
if (kind2 != kind1) {
buf2 = unicode_askind(kind2, buf2, len2, kind1);
if (!buf2)
goto onError;
}
switch (kind1) {
case PyUnicode_1BYTE_KIND:
if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
result = asciilib_count(
((const Py_UCS1*)buf1) + start, end - start,
buf2, len2, PY_SSIZE_T_MAX
);
else
result = ucs1lib_count(
((const Py_UCS1*)buf1) + start, end - start,
buf2, len2, PY_SSIZE_T_MAX
);
break;
case PyUnicode_2BYTE_KIND:
result = ucs2lib_count(
((const Py_UCS2*)buf1) + start, end - start,
buf2, len2, PY_SSIZE_T_MAX
);
break;
case PyUnicode_4BYTE_KIND:
result = ucs4lib_count(
((const Py_UCS4*)buf1) + start, end - start,
buf2, len2, PY_SSIZE_T_MAX
);
break;
default:
Py_UNREACHABLE();
}
assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
if (kind2 != kind1)
PyMem_Free((void *)buf2);
return result;
onError:
assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
if (kind2 != kind1)
PyMem_Free((void *)buf2);
return -1;
}
and unicode_count from

cpython/Objects/unicodeobject.c

Lines 10854 to 10916 in cbdeda8

static PyObject *
unicode_count(PyObject *self, PyObject *args)
{
PyObject *substring = NULL; /* initialize to fix a compiler warning */
Py_ssize_t start = 0;
Py_ssize_t end = PY_SSIZE_T_MAX;
PyObject *result;
int kind1, kind2;
const void *buf1, *buf2;
Py_ssize_t len1, len2, iresult;
if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
return NULL;
kind1 = PyUnicode_KIND(self);
kind2 = PyUnicode_KIND(substring);
if (kind1 < kind2)
return PyLong_FromLong(0);
len1 = PyUnicode_GET_LENGTH(self);
len2 = PyUnicode_GET_LENGTH(substring);
ADJUST_INDICES(start, end, len1);
if (end - start < len2)
return PyLong_FromLong(0);
buf1 = PyUnicode_DATA(self);
buf2 = PyUnicode_DATA(substring);
if (kind2 != kind1) {
buf2 = unicode_askind(kind2, buf2, len2, kind1);
if (!buf2)
return NULL;
}
switch (kind1) {
case PyUnicode_1BYTE_KIND:
iresult = ucs1lib_count(
((const Py_UCS1*)buf1) + start, end - start,
buf2, len2, PY_SSIZE_T_MAX
);
break;
case PyUnicode_2BYTE_KIND:
iresult = ucs2lib_count(
((const Py_UCS2*)buf1) + start, end - start,
buf2, len2, PY_SSIZE_T_MAX
);
break;
case PyUnicode_4BYTE_KIND:
iresult = ucs4lib_count(
((const Py_UCS4*)buf1) + start, end - start,
buf2, len2, PY_SSIZE_T_MAX
);
break;
default:
Py_UNREACHABLE();
}
result = PyLong_FromSsize_t(iresult);
assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substring)));
if (kind2 != kind1)
PyMem_Free((void *)buf2);
return result;
}
share a lot of code.

They can be unified, because the do the same thing.

Pitch

Citing @encukou:

Apparently unicode_count missed an optimization in 2011, otherwise they're equivalent (except arg parsing & converting the return value). Merging them could add the optimization to unicode_count.
If you want to work on that, note that there's also anylib_count that duplicates the main switch.

Previous discussion

Link: #96929

PR in the works.

Metadata

Metadata

Assignees

Labels

interpreter-core(Objects, Python, Grammar, and Parser dirs)type-featureA feature request or enhancement

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions