Skip to content

Commit 73b9584

Browse files
committed
Move \p{user-defined} to core from utf8_heavy.pl
This large commit moves the handling of user-defined properties to C code. This should speed it up, but the main reason to do this is to stop using swashes in this case, leaving only tr/// using them. Once that too is converted, all swash handling can be ripped out of perl. Doing this in perl has caused some nasty interactions that will now be fixed automatically. The change is not entirely transparent, however (besides speed and the possibility of removing these interactions). perldelta in this commit details these.
1 parent dd52e3c commit 73b9584

File tree

8 files changed

+988
-245
lines changed

8 files changed

+988
-245
lines changed

embed.fnc

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2513,10 +2513,23 @@ EnsR |int |edit_distance |NN const UV *src \
25132513
|const STRLEN x \
25142514
|const STRLEN y \
25152515
|const SSize_t maxDistance
2516-
EXp |SV * |parse_uniprop_string|NN const char * const name \
2517-
|const Size_t name_len \
2518-
|const bool to_fold \
2519-
|NN bool * invert
2516+
EpX |SV * |parse_uniprop_string|NN const char * const name \
2517+
|const Size_t name_len \
2518+
|const bool is_utf8 \
2519+
|const bool to_fold \
2520+
|const bool runtime \
2521+
|NN bool * user_defined_ptr \
2522+
|NN SV * msg \
2523+
|const STRLEN level
2524+
EXp |SV * |handle_user_defined_property|NN const char * name \
2525+
|const STRLEN name_len \
2526+
|const bool is_utf8 \
2527+
|const bool to_fold \
2528+
|const bool runtime \
2529+
|NN SV* contents \
2530+
|NN bool *user_defined_ptr \
2531+
|NN SV * msg \
2532+
|const STRLEN level
25202533
# ifdef DEBUGGING
25212534
Ep |int |re_indentf |NN const char *fmt|U32 depth|...
25222535
Es |void |regdump_intflags|NULLOK const char *lead| const U32 flags

embed.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1188,6 +1188,7 @@
11881188
#define handle_named_backref(a,b,c,d) S_handle_named_backref(aTHX_ a,b,c,d)
11891189
#define handle_possible_posix(a,b,c,d,e) S_handle_possible_posix(aTHX_ a,b,c,d,e)
11901190
#define handle_regex_sets(a,b,c,d,e) S_handle_regex_sets(aTHX_ a,b,c,d,e)
1191+
#define handle_user_defined_property(a,b,c,d,e,f,g,h,i) Perl_handle_user_defined_property(aTHX_ a,b,c,d,e,f,g,h,i)
11911192
#define invlist_contents(a,b) S_invlist_contents(aTHX_ a,b)
11921193
#define invlist_highest S_invlist_highest
11931194
#define invlist_is_iterating S_invlist_is_iterating
@@ -1201,7 +1202,7 @@
12011202
#define nextchar(a) S_nextchar(aTHX_ a)
12021203
#define output_posix_warnings(a,b) S_output_posix_warnings(aTHX_ a,b)
12031204
#define parse_lparen_question_flags(a) S_parse_lparen_question_flags(aTHX_ a)
1204-
#define parse_uniprop_string(a,b,c,d) Perl_parse_uniprop_string(aTHX_ a,b,c,d)
1205+
#define parse_uniprop_string(a,b,c,d,e,f,g,h) Perl_parse_uniprop_string(aTHX_ a,b,c,d,e,f,g,h)
12051206
#define populate_ANYOF_from_invlist(a,b) S_populate_ANYOF_from_invlist(aTHX_ a,b)
12061207
#define reg(a,b,c,d) S_reg(aTHX_ a,b,c,d)
12071208
#define reg2Lanode(a,b,c,d) S_reg2Lanode(aTHX_ a,b,c,d)

pod/perldelta.pod

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -316,6 +316,72 @@ trees.
316316

317317
Avoid leak in multiconcat with overloading. [perl #133789]
318318

319+
=item *
320+
321+
The handling of user-defined C<\p{}> properties (see
322+
L<perlunicode/User-Defined Character Properties>) has been rewritten to
323+
be in C (instead of Perl). This speeds things up, but in the process
324+
several inconsistencies are removed and bugs are fixed.
325+
326+
=over
327+
328+
=item 1
329+
330+
A few error messages have minor wording changes. This is essentially
331+
because the new way is integrated into the regex error handling
332+
mechanism that marks the position in the input at which the error
333+
occurred. That was not possible previously. The messages now also
334+
contain additional back-trace-like information in case the error occurs
335+
deep in nested calls.
336+
337+
=item 2
338+
339+
A user-defined property is implemented as a perl subroutine with certain
340+
highly constrained naming conventions. It was documented previously
341+
that the sub would be in the current package if the package was
342+
unspecified. This turned out not to be true in all cases, but now it
343+
is.
344+
345+
=item 3
346+
347+
All recursive calls are treated as infinite recursion. Previously they
348+
would cause the interpreter to panic. Now, they cause the regex pattern
349+
to fail to compile.
350+
351+
=item 4
352+
353+
Similarly, any other error would likely have led to a panic; now it leads just to the
354+
pattern failing to compile.
355+
356+
=item 5
357+
358+
The old mechanism did not detect illegal ranges in the definition of the
359+
property. Now, the range max must not be smaller than the range min.
360+
Otherwise, the pattern fails to compile.
361+
362+
=item 6
363+
364+
The intention was to have each sub called only once during the lifetime
365+
of the program, so that a property's definition is immutable. This was
366+
relaxed so that it could be called once for all /i compilations, and
367+
potentially a second time for non-/i (the sub is passed a parameter
368+
indicating which). However, in practice there were instances when this
369+
was broken, and multiple calls were possible. Those have been fixed.
370+
Now (besides the /i and non-/i cases) the only way a sub can be called
371+
multiple times is if some component of it has not been defined yet. For
372+
example, suppose we have sub IsA() whose definition is known at compile
373+
time, and it in turn calls IsB() whose definition is not yet known.
374+
IsA() will be called each time a pattern it appears in is compiled. If
375+
IsA() also calls IsC() and that definition is known, IsC() will be
376+
called just once.
377+
378+
=item 7
379+
380+
There were some races and very long hangs should one thread be compiling
381+
the same property as another simultaneously. These have now been fixed.
382+
383+
=back
384+
319385
=back
320386

321387
=head1 Acknowledgements

pod/perlunicode.pod

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -965,7 +965,8 @@ A single hexadecimal number denoting a code point to include.
965965
=item *
966966

967967
Two hexadecimal numbers separated by horizontal whitespace (space or
968-
tabular characters) denoting a range of code points to include.
968+
tabular characters) denoting a range of code points to include. The
969+
second number must not be smaller than the first.
969970

970971
=item *
971972

proto.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5446,6 +5446,9 @@ STATIC int S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state, const char*
54465446
STATIC regnode_offset S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV ** return_invlist, I32 *flagp, U32 depth, char * const oregcomp_parse);
54475447
#define PERL_ARGS_ASSERT_HANDLE_REGEX_SETS \
54485448
assert(pRExC_state); assert(flagp); assert(oregcomp_parse)
5449+
PERL_CALLCONV SV * Perl_handle_user_defined_property(pTHX_ const char * name, const STRLEN name_len, const bool is_utf8, const bool to_fold, const bool runtime, SV* contents, bool *user_defined_ptr, SV * msg, const STRLEN level);
5450+
#define PERL_ARGS_ASSERT_HANDLE_USER_DEFINED_PROPERTY \
5451+
assert(name); assert(contents); assert(user_defined_ptr); assert(msg)
54495452
STATIC SV* S_invlist_contents(pTHX_ SV* const invlist, const bool traditional_style)
54505453
__attribute__warn_unused_result__;
54515454
#define PERL_ARGS_ASSERT_INVLIST_CONTENTS \
@@ -5503,9 +5506,9 @@ STATIC void S_output_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_w
55035506
STATIC void S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state);
55045507
#define PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS \
55055508
assert(pRExC_state)
5506-
PERL_CALLCONV SV * Perl_parse_uniprop_string(pTHX_ const char * const name, const Size_t name_len, const bool to_fold, bool * invert);
5509+
PERL_CALLCONV SV * Perl_parse_uniprop_string(pTHX_ const char * const name, const Size_t name_len, const bool is_utf8, const bool to_fold, const bool runtime, bool * user_defined_ptr, SV * msg, const STRLEN level);
55075510
#define PERL_ARGS_ASSERT_PARSE_UNIPROP_STRING \
5508-
assert(name); assert(invert)
5511+
assert(name); assert(user_defined_ptr); assert(msg)
55095512
STATIC void S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr);
55105513
#define PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST \
55115514
assert(node); assert(invlist_ptr)

0 commit comments

Comments
 (0)