5858typedef RE_UINT32 RE_CODE;
5959typedef unsigned char BYTE;
6060
61+ /* An unassigned codepoint: the ASCII property checks substitute it for any character above RE_ASCII_MAX. 0x10FFFF is the highest Unicode codepoint. */
62+ #define UNASSIGNED_CODEPOINT 0x10FFFF
63+
6164/* Properties in the General Category. */
6265#define RE_PROP_GC_CN ((RE_PROP_GC << 16) | RE_PROP_CN)
6366#define RE_PROP_GC_LU ((RE_PROP_GC << 16) | RE_PROP_LU)
@@ -157,6 +160,11 @@ typedef RE_UINT32 RE_STATUS_T;
157160/* Various flags stored in a node status member. */
158161#define RE_STATUS_SHIFT 11
159162
163+ #define RE_ENCODING_SHIFT 16 /* Bit offset of the encoding-kind field within a node's status member. */
164+ #define ASCII_ENCODING 1 /* Encoding kind: match this node using ascii_encoding. */
165+ #define UNICODE_ENCODING 2 /* Encoding kind: match this node using unicode_encoding. */
166+ #define ENCODING_KIND(NODE) (((NODE)->status >> RE_ENCODING_SHIFT) & 0x3) /* Extracts the 2-bit encoding kind; callers treat any other value as "use the encoding supplied by the caller/state". */
167+
160168#define RE_STATUS_FUZZY (RE_FUZZY_OP << RE_STATUS_SHIFT)
161169#define RE_STATUS_REVERSE (RE_REVERSE_OP << RE_STATUS_SHIFT)
162170#define RE_STATUS_REQUIRED (RE_REQUIRED_OP << RE_STATUS_SHIFT)
@@ -809,12 +817,8 @@ Py_LOCAL_INLINE(BOOL) unicode_has_property(RE_CODE property, Py_UCS4 ch);
809817/* Checks whether a character has a property. */
810818Py_LOCAL_INLINE(BOOL) ascii_has_property(RE_CODE property, Py_UCS4 ch) {
811819 if (ch > RE_ASCII_MAX) {
812- /* Outside the ASCII range. */
813- RE_UINT32 value;
814-
815- value = property & 0xFFFF;
816-
817- return value == 0;
820+ /* Treat it as an unassigned codepoint. */
821+ ch = UNASSIGNED_CODEPOINT;
818822 }
819823
820824 return unicode_has_property(property, ch);
@@ -824,19 +828,12 @@ Py_LOCAL_INLINE(BOOL) ascii_has_property(RE_CODE property, Py_UCS4 ch) {
824828Py_LOCAL_INLINE(BOOL) ascii_has_property_ign(RE_CODE property, Py_UCS4 ch) {
825829 RE_UINT32 prop;
826830
827- prop = property >> 16;
828-
829- /* We are working with ASCII. */
830- if (property == RE_PROP_GC_LU || property == RE_PROP_GC_LL || property ==
831- RE_PROP_GC_LT) {
832- RE_UINT32 value;
833-
834- value = re_get_general_category(ch);
831+ if (ch > RE_ASCII_MAX) {
832+ /* Treat it as an unassigned codepoint. */
833+ ch = UNASSIGNED_CODEPOINT;
834+ }
835835
836- return value == RE_PROP_LU || value == RE_PROP_LL || value ==
837- RE_PROP_LT;
838- } else if (prop == RE_PROP_UPPERCASE || prop == RE_PROP_LOWERCASE)
839- return (BOOL)re_get_cased(ch);
836+ prop = property >> 16;
840837
841838 /* The property is case-insensitive. */
842839 return ascii_has_property(property, ch);
@@ -2902,7 +2899,14 @@ Py_LOCAL_INLINE(BOOL) matches_CHARACTER_IGN(RE_EncodingTable* encoding,
29022899/* Checks whether a character has a property. The property is node->values[0]. If the node's status carries an encoding kind (ASCII_ENCODING or UNICODE_ENCODING), that encoding's has_property is used instead of the 'encoding' argument. */
29032900Py_LOCAL_INLINE(BOOL) matches_PROPERTY(RE_EncodingTable* encoding,
29042901 RE_LocaleInfo* locale_info, RE_Node* node, Py_UCS4 ch) {
2905- return encoding->has_property(locale_info, node->values[0], ch);
2902+ switch (ENCODING_KIND(node)) {
2903+ case ASCII_ENCODING: /* Node explicitly requests ASCII semantics. */
2904+ return ascii_encoding.has_property(locale_info, node->values[0], ch);
2905+ case UNICODE_ENCODING: /* Node explicitly requests Unicode semantics. */
2906+ return unicode_encoding.has_property(locale_info, node->values[0], ch);
2907+ default: /* No per-node override: use the encoding passed in. */
2908+ return encoding->has_property(locale_info, node->values[0], ch);
2909+ }
29062910}
29072911
29082912/* Checks whether a character has a property, ignoring case. */
@@ -2914,6 +2918,15 @@ Py_LOCAL_INLINE(BOOL) matches_PROPERTY_IGN(RE_EncodingTable* encoding,
29142918 property = node->values[0];
29152919 prop = property >> 16;
29162920
2921+ switch (ENCODING_KIND(node)) {
2922+ case ASCII_ENCODING:
2923+ encoding = &ascii_encoding;
2924+ break;
2925+ case UNICODE_ENCODING:
2926+ encoding = &unicode_encoding;
2927+ break;
2928+ }
2929+
29172930 /* We need to do special handling of case-sensitive properties according to
29182931 * the 'encoding'.
29192932 */
@@ -3000,7 +3013,15 @@ Py_LOCAL_INLINE(BOOL) matches_member(RE_EncodingTable* encoding, RE_LocaleInfo*
30003013 /* values are: property */
30013014 TRACE(("%s %d %d\n", re_op_text[member->op], member->match,
30023015 member->values[0]))
3003- return encoding->has_property(locale_info, member->values[0], ch);
3016+
3017+ switch (ENCODING_KIND(member)) {
3018+ case ASCII_ENCODING:
3019+ return ascii_encoding.has_property(locale_info, member->values[0], ch);
3020+ case UNICODE_ENCODING:
3021+ return unicode_encoding.has_property(locale_info, member->values[0], ch);
3022+ default:
3023+ return encoding->has_property(locale_info, member->values[0], ch);
3024+ }
30043025 case RE_OP_RANGE:
30053026 /* values are: lower, upper */
30063027 TRACE(("%s %d %d %d\n", re_op_text[member->op], member->match,
@@ -4006,7 +4027,19 @@ Py_LOCAL_INLINE(Py_ssize_t) match_many_PROPERTY(RE_State* state, RE_Node* node,
40064027
40074028 text = state->text;
40084029 match = node->match == match;
4009- encoding = state->encoding;
4030+
4031+ switch (ENCODING_KIND(node)) {
4032+ case ASCII_ENCODING:
4033+ encoding = &ascii_encoding;
4034+ break;
4035+ case UNICODE_ENCODING:
4036+ encoding = &unicode_encoding;
4037+ break;
4038+ default:
4039+ encoding = state->encoding;
4040+ break;
4041+ }
4042+
40104043 locale_info = state->locale_info;
40114044 property = node->values[0];
40124045
@@ -4104,7 +4137,19 @@ Py_LOCAL_INLINE(Py_ssize_t) match_many_PROPERTY_IGN(RE_State* state, RE_Node*
41044137
41054138 text = state->text;
41064139 match = node->match == match;
4107- encoding = state->encoding;
4140+
4141+ switch (ENCODING_KIND(node)) {
4142+ case ASCII_ENCODING:
4143+ encoding = &ascii_encoding;
4144+ break;
4145+ case UNICODE_ENCODING:
4146+ encoding = &unicode_encoding;
4147+ break;
4148+ default:
4149+ encoding = state->encoding;
4150+ break;
4151+ }
4152+
41084153 locale_info = state->locale_info;
41094154 property = node->values[0];
41104155
@@ -4202,7 +4247,19 @@ Py_LOCAL_INLINE(Py_ssize_t) match_many_PROPERTY_IGN_REV(RE_State* state,
42024247
42034248 text = state->text;
42044249 match = node->match == match;
4205- encoding = state->encoding;
4250+
4251+ switch (ENCODING_KIND(node)) {
4252+ case ASCII_ENCODING:
4253+ encoding = &ascii_encoding;
4254+ break;
4255+ case UNICODE_ENCODING:
4256+ encoding = &unicode_encoding;
4257+ break;
4258+ default:
4259+ encoding = state->encoding;
4260+ break;
4261+ }
4262+
42064263 locale_info = state->locale_info;
42074264 property = node->values[0];
42084265
@@ -4300,7 +4357,19 @@ Py_LOCAL_INLINE(Py_ssize_t) match_many_PROPERTY_REV(RE_State* state, RE_Node*
43004357
43014358 text = state->text;
43024359 match = node->match == match;
4303- encoding = state->encoding;
4360+
4361+ switch (ENCODING_KIND(node)) {
4362+ case ASCII_ENCODING:
4363+ encoding = &ascii_encoding;
4364+ break;
4365+ case UNICODE_ENCODING:
4366+ encoding = &unicode_encoding;
4367+ break;
4368+ default:
4369+ encoding = state->encoding;
4370+ break;
4371+ }
4372+
43044373 locale_info = state->locale_info;
43054374 property = node->values[0];
43064375
@@ -6882,8 +6951,17 @@ Py_LOCAL_INLINE(int) try_match_ANY_U_REV(RE_State* state, RE_Node* node,
68826951/* Checks whether a position is on a word boundary. The node's encoding kind selects ascii_encoding or unicode_encoding; any other kind uses state->encoding. The at_boundary result is compared with node->match and converted with bool_as_status. */
68836952Py_LOCAL_INLINE(int) try_match_BOUNDARY(RE_State* state, RE_Node* node,
68846953 Py_ssize_t text_pos) {
6885- return bool_as_status(state->encoding->at_boundary(state, text_pos) ==
6886- node->match);
6954+ switch (ENCODING_KIND(node)) {
6955+ case ASCII_ENCODING: /* Node explicitly requests ASCII semantics. */
6956+ return bool_as_status(ascii_encoding.at_boundary(state, text_pos) ==
6957+ node->match);
6958+ case UNICODE_ENCODING: /* Node explicitly requests Unicode semantics. */
6959+ return bool_as_status(unicode_encoding.at_boundary(state, text_pos) ==
6960+ node->match);
6961+ default: /* No per-node override: use the state's encoding. */
6962+ return bool_as_status(state->encoding->at_boundary(state, text_pos) ==
6963+ node->match);
6964+ }
68876965}
68886966
68896967/* Checks whether there's a character at a position. */
@@ -7724,7 +7802,17 @@ Py_LOCAL_INLINE(Py_ssize_t) search_start_BOUNDARY(RE_State* state, RE_Node*
77247802 node, Py_ssize_t text_pos, BOOL* is_partial) {
77257803 BOOL (*at_boundary)(RE_State* state, Py_ssize_t text_pos);
77267804
7727- at_boundary = state->encoding->at_boundary;
7805+ switch (ENCODING_KIND(node)) {
7806+ case ASCII_ENCODING:
7807+ at_boundary = ascii_encoding.at_boundary;
7808+ break;
7809+ case UNICODE_ENCODING:
7810+ at_boundary = unicode_encoding.at_boundary;
7811+ break;
7812+ default:
7813+ at_boundary = state->encoding->at_boundary;
7814+ break;
7815+ }
77287816
77297817 *is_partial = FALSE;
77307818
@@ -7744,7 +7832,17 @@ Py_LOCAL_INLINE(Py_ssize_t) search_start_BOUNDARY_rev(RE_State* state, RE_Node*
77447832 node, Py_ssize_t text_pos, BOOL* is_partial) {
77457833 BOOL (*at_boundary)(RE_State* state, Py_ssize_t text_pos);
77467834
7747- at_boundary = state->encoding->at_boundary;
7835+ switch (ENCODING_KIND(node)) {
7836+ case ASCII_ENCODING:
7837+ at_boundary = ascii_encoding.at_boundary;
7838+ break;
7839+ case UNICODE_ENCODING:
7840+ at_boundary = unicode_encoding.at_boundary;
7841+ break;
7842+ default:
7843+ at_boundary = state->encoding->at_boundary;
7844+ break;
7845+ }
77487846
77497847 *is_partial = FALSE;
77507848
0 commit comments