Unify `PyUnicode_Count` and `unicode_count`

# Feature or enhancement

Right now `PyUnicode_Count` from https://github.com/python/cpython/blob/cbdeda8ce7a3543cb3376d70e4cd46fcf24f42a7/Objects/unicodeobject.c#L8968-L9040 and `unicode_count` from https://github.com/python/cpython/blob/cbdeda8ce7a3543cb3376d70e4cd46fcf24f42a7/Objects/unicodeobject.c#L10854-L10916 share a lot of code.

They can be unified, because they do the same thing.

# Pitch

[Citing](https://github.com/python/cpython/pull/96929#issuecomment-1270262538) @encukou:

> Apparently unicode_count missed [an optimization in 2011](https://github.com/python/cpython/commit/c3cec7868bf1019c0987f1e9aadb56d73fa93d61), otherwise they're equivalent (except arg parsing & converting the return value). Merging them could add the optimization to unicode_count.
If you want to work on that, note that there's also anylib_count that duplicates the main switch.

# Previous discussion



Link: https://github.com/python/cpython/pull/96929



PR in the works.

	/*
	 * Count occurrences of `substr` inside the [start, end) slice of `str`.
	 *
	 * Both arguments are validated with ensure_unicode(); the counting itself
	 * is delegated to the `*lib_count` helper that matches `str`'s kind.
	 *
	 * Returns the count, 0 when no match is possible (substr has a larger kind
	 * than str, or is longer than the adjusted slice), or -1 on error
	 * (validation failure or unicode_askind() returning NULL).
	 */
	Py_ssize_t
	PyUnicode_Count(PyObject *str,
	                PyObject *substr,
	                Py_ssize_t start,
	                Py_ssize_t end)
	{
	    Py_ssize_t result;
	    int kind1, kind2;
	    const void *buf1 = NULL, *buf2 = NULL;
	    Py_ssize_t len1, len2;

	    if (ensure_unicode(str) < 0 || ensure_unicode(substr) < 0)
	        return -1;

	    kind1 = PyUnicode_KIND(str);
	    kind2 = PyUnicode_KIND(substr);
	    /* A substring of a larger kind than the haystack is reported as
	       zero occurrences without searching. */
	    if (kind1 < kind2)
	        return 0;

	    len1 = PyUnicode_GET_LENGTH(str);
	    len2 = PyUnicode_GET_LENGTH(substr);
	    /* NOTE(review): ADJUST_INDICES presumably normalizes start/end
	       against len1 (slice semantics) -- confirm against the macro. */
	    ADJUST_INDICES(start, end, len1);
	    if (end - start < len2)
	        return 0;

	    buf1 = PyUnicode_DATA(str);
	    buf2 = PyUnicode_DATA(substr);
	    if (kind2 != kind1) {
	        /* Replace buf2 with a buffer in str's kind; this buffer is owned
	           here and released with PyMem_Free() below. */
	        buf2 = unicode_askind(kind2, buf2, len2, kind1);
	        if (!buf2)
	            goto onError;
	    }

	    switch (kind1) {
	    case PyUnicode_1BYTE_KIND:
	        /* Use the ASCII-specific helper when both strings are ASCII. */
	        if (PyUnicode_IS_ASCII(str) && PyUnicode_IS_ASCII(substr))
	            result = asciilib_count(
	                ((const Py_UCS1*)buf1) + start, end - start,
	                buf2, len2, PY_SSIZE_T_MAX
	                );
	        else
	            result = ucs1lib_count(
	                ((const Py_UCS1*)buf1) + start, end - start,
	                buf2, len2, PY_SSIZE_T_MAX
	                );
	        break;
	    case PyUnicode_2BYTE_KIND:
	        result = ucs2lib_count(
	            ((const Py_UCS2*)buf1) + start, end - start,
	            buf2, len2, PY_SSIZE_T_MAX
	            );
	        break;
	    case PyUnicode_4BYTE_KIND:
	        result = ucs4lib_count(
	            ((const Py_UCS4*)buf1) + start, end - start,
	            buf2, len2, PY_SSIZE_T_MAX
	            );
	        break;
	    default:
	        Py_UNREACHABLE();
	    }

	    /* buf2 differs from substr's own data exactly when it was converted. */
	    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
	    if (kind2 != kind1)
	        PyMem_Free((void *)buf2);

	    return result;
	  onError:
	    /* Reached only with buf2 == NULL after a failed conversion, so the
	       PyMem_Free() call below receives NULL. */
	    assert((kind2 != kind1) == (buf2 != PyUnicode_DATA(substr)));
	    if (kind2 != kind1)
	        PyMem_Free((void *)buf2);
	    return -1;
	}

	/*
	 * Method implementation behind "count": parses (substring[, start[, end]])
	 * from `args` via parse_args_finds_unicode() and counts occurrences of the
	 * substring inside the [start, end) slice of `self`.
	 *
	 * Returns a new integer object holding the count, or NULL when argument
	 * parsing fails, unicode_askind() returns NULL, or the result object
	 * cannot be created.
	 *
	 * Note: unlike PyUnicode_Count(), the 1-byte case here always calls
	 * ucs1lib_count(); there is no asciilib_count() branch.
	 */
	static PyObject *
	unicode_count(PyObject *self, PyObject *args)
	{
	    PyObject *substring = NULL;   /* initialize to fix a compiler warning */
	    Py_ssize_t start = 0;
	    Py_ssize_t end = PY_SSIZE_T_MAX;
	    PyObject *result;
	    int kind1, kind2;
	    const void *buf1, *buf2;
	    Py_ssize_t len1, len2, iresult;

	    if (!parse_args_finds_unicode("count", args, &substring, &start, &end))
	        return NULL;

	    kind1 = PyUnicode_KIND(self);
	    kind2 = PyUnicode_KIND(substring);
	    /* A substring of a larger kind than self is reported as zero
	       occurrences without searching. */
	    if (kind1 < kind2)
	        return PyLong_FromLong(0);

	    len1 = PyUnicode_GET_LENGTH(self);
	    len2 = PyUnicode_GET_LENGTH(substring);
	    /* NOTE(review): ADJUST_INDICES presumably normalizes start/end
	       against len1 (slice semantics) -- confirm against the macro. */
	    ADJUST_INDICES(start, end, len1);
	    if (end - start < len2)
	        return PyLong_FromLong(0);

	    buf1 = PyUnicode_DATA(self);
	    buf2 = PyUnicode_DATA(substring);
	    if (kind2 != kind1) {
	        /* Replace buf2 with a buffer in self's kind; this buffer is owned
	           here and released with PyMem_Free() below. */
	        buf2 = unicode_askind(kind2, buf2, len2, kind1);
	        if (!buf2)
	            return NULL;
	    }
	    switch (kind1) {
	    case PyUnicode_1BYTE_KIND:
	        iresult = ucs1lib_count(
	            ((const Py_UCS1*)buf1) + start, end - start,
	            buf2, len2, PY_SSIZE_T_MAX
	            );
	        break;
	    case PyUnicode_2BYTE_KIND:
	        iresult = ucs2lib_count(
	            ((const Py_UCS2*)buf1) + start, end - start,
	            buf2, len2, PY_SSIZE_T_MAX
	            );
	        break;
	    case PyUnicode_4BYTE_KIND:
	        iresult = ucs4lib_count(
	            ((const Py_UCS4*)buf1) + start, end - start,
	            buf2, len2, PY_SSIZE_T_MAX
	            );
	        break;
	    default:
	        Py_UNREACHABLE();
	    }

	    /* Build the result before cleanup; the temporary buffer is freed on
	       this path whether or not the integer object was created. */
	    result = PyLong_FromSsize_t(iresult);

	    /* buf2 equals substring's own data exactly when no conversion ran. */
	    assert((kind2 == kind1) == (buf2 == PyUnicode_DATA(substring)));
	    if (kind2 != kind1)
	        PyMem_Free((void *)buf2);

	    return result;
	}

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Uh oh!

Unify `PyUnicode_Count` and `unicode_count` #97982

Feature or enhancement

Pitch

Previous discussion

Metadata

Assignees

Labels

Projects

Milestone

Relationships

Development

Uh oh!

Unify PyUnicode_Count and unicode_count #97982

Description

Feature or enhancement

Pitch

Previous discussion

Metadata

Metadata

Assignees

Labels

Projects

Milestone

Relationships

Development

Issue actions

Unify `PyUnicode_Count` and `unicode_count` #97982