Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

gh-89653: PEP 670: Convert unicodeobject.h macros to functions #92648

Merged
merged 1 commit into from
May 11, 2022
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
114 changes: 66 additions & 48 deletions Include/cpython/unicodeobject.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,53 +15,31 @@
# define USE_UNICODE_WCHAR_CACHE 1
#endif /* USE_UNICODE_WCHAR_CACHE */

/* Since splitting on whitespace is an important use case, and
whitespace in most situations is solely ASCII whitespace, we
optimize for the common case by using a quick look-up table
_Py_ascii_whitespace (see below) with an inlined check.

*/
#define Py_UNICODE_ISSPACE(ch) \
((Py_UCS4)(ch) < 128U ? _Py_ascii_whitespace[(ch)] : _PyUnicode_IsWhitespace(ch))

#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)

#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)

#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)
// Static inline functions to work with surrogates
static inline int Py_UNICODE_IS_SURROGATE(Py_UCS4 ch) {
return (0xD800 <= ch && ch <= 0xDFFF);
}
static inline int Py_UNICODE_IS_HIGH_SURROGATE(Py_UCS4 ch) {
return (0xD800 <= ch && ch <= 0xDBFF);
}
static inline int Py_UNICODE_IS_LOW_SURROGATE(Py_UCS4 ch) {
return (0xDC00 <= ch && ch <= 0xDFFF);
}

#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)
// Join two surrogate characters and return a single Py_UCS4 value.
static inline Py_UCS4 Py_UNICODE_JOIN_SURROGATES(Py_UCS4 high, Py_UCS4 low) {
return 0x10000 + (((high & 0x03FF) << 10) | (low & 0x03FF));
}

#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)
// High surrogate = top 10 bits added to D800
static inline Py_UCS4 Py_UNICODE_HIGH_SURROGATE(Py_UCS4 ch) {
return (0xD800 - (0x10000 >> 10) + (ch >> 10));
}

#define Py_UNICODE_ISALNUM(ch) \
(Py_UNICODE_ISALPHA(ch) || \
Py_UNICODE_ISDECIMAL(ch) || \
Py_UNICODE_ISDIGIT(ch) || \
Py_UNICODE_ISNUMERIC(ch))

/* macros to work with surrogates */
#define Py_UNICODE_IS_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDFFF)
#define Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
#define Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
/* Join two surrogate characters and return a single Py_UCS4 value. */
#define Py_UNICODE_JOIN_SURROGATES(high, low) \
(((((Py_UCS4)(high) & 0x03FF) << 10) | \
((Py_UCS4)(low) & 0x03FF)) + 0x10000)
/* high surrogate = top 10 bits added to D800 */
#define Py_UNICODE_HIGH_SURROGATE(ch) (0xD800 - (0x10000 >> 10) + ((ch) >> 10))
/* low surrogate = bottom 10 bits added to DC00 */
#define Py_UNICODE_LOW_SURROGATE(ch) (0xDC00 + ((ch) & 0x3FF))
// Low surrogate = bottom 10 bits added to DC00
static inline Py_UCS4 Py_UNICODE_LOW_SURROGATE(Py_UCS4 ch) {
return (0xDC00 + (ch & 0x3FF));
}

/* --- Unicode Type ------------------------------------------------------- */

Expand Down Expand Up @@ -1013,10 +991,6 @@ PyAPI_FUNC(Py_ssize_t) _PyUnicode_InsertThousandsGrouping(

/* === Characters Type APIs =============================================== */

/* Helper array used by Py_UNICODE_ISSPACE(). */

PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];

/* These should not be used directly. Use the Py_UNICODE_IS* and
Py_UNICODE_TO* macros instead.

Expand Down Expand Up @@ -1124,6 +1098,50 @@ PyAPI_FUNC(int) _PyUnicode_IsAlpha(
Py_UCS4 ch /* Unicode character */
);

// Helper array used by Py_UNICODE_ISSPACE().
PyAPI_DATA(const unsigned char) _Py_ascii_whitespace[];

// Since splitting on whitespace is an important use case, and
// whitespace in most situations is solely ASCII whitespace, we
// optimize for the common case by using a quick look-up table
// _Py_ascii_whitespace (see below) with an inlined check.
static inline int Py_UNICODE_ISSPACE(Py_UCS4 ch) {
if (ch < 128) {
return _Py_ascii_whitespace[ch];
}
return _PyUnicode_IsWhitespace(ch);
}

#define Py_UNICODE_ISLOWER(ch) _PyUnicode_IsLowercase(ch)
#define Py_UNICODE_ISUPPER(ch) _PyUnicode_IsUppercase(ch)
#define Py_UNICODE_ISTITLE(ch) _PyUnicode_IsTitlecase(ch)
#define Py_UNICODE_ISLINEBREAK(ch) _PyUnicode_IsLinebreak(ch)

#define Py_UNICODE_TOLOWER(ch) _PyUnicode_ToLowercase(ch)
#define Py_UNICODE_TOUPPER(ch) _PyUnicode_ToUppercase(ch)
#define Py_UNICODE_TOTITLE(ch) _PyUnicode_ToTitlecase(ch)

#define Py_UNICODE_ISDECIMAL(ch) _PyUnicode_IsDecimalDigit(ch)
#define Py_UNICODE_ISDIGIT(ch) _PyUnicode_IsDigit(ch)
#define Py_UNICODE_ISNUMERIC(ch) _PyUnicode_IsNumeric(ch)
#define Py_UNICODE_ISPRINTABLE(ch) _PyUnicode_IsPrintable(ch)

#define Py_UNICODE_TODECIMAL(ch) _PyUnicode_ToDecimalDigit(ch)
#define Py_UNICODE_TODIGIT(ch) _PyUnicode_ToDigit(ch)
#define Py_UNICODE_TONUMERIC(ch) _PyUnicode_ToNumeric(ch)

#define Py_UNICODE_ISALPHA(ch) _PyUnicode_IsAlpha(ch)

static inline int Py_UNICODE_ISALNUM(Py_UCS4 ch) {
return (Py_UNICODE_ISALPHA(ch)
|| Py_UNICODE_ISDECIMAL(ch)
|| Py_UNICODE_ISDIGIT(ch)
|| Py_UNICODE_ISNUMERIC(ch));
}


/* === Misc functions ===================================================== */

PyAPI_FUNC(PyObject*) _PyUnicode_FormatLong(PyObject *, int, int, int);

/* Return an interned Unicode object for an Identifier; may fail if there is no memory.*/
Expand Down