PCRE2Project · PhilipHazel · Oct 2, 2024 · Sep 29, 2024
diff --git a/doc/pcre2compat.3 b/doc/pcre2compat.3
@@ -226,9 +226,15 @@ handled by PCRE2, either by the interpreter or the JIT. An example is
 /(?:|(?0)abcd)(?(R)|\ez)/, which matches a sequence of any number of repeated
 "abcd" substrings at the end of the subject.
 .P
-23. From release 10.45, PCRE2 gives an error if \ex is not followed by a 
-hexadecimal digit or a curly bracket. It used to interpret this as the NUL 
-character. Perl still generates NUL, but warns in its warning mode.
+23. Both PCRE2 and Perl reject invalid \ex{ escapes, but when the problem is
+an invalid hexadecimal digit, Perl tries to recover and prints a warning.
+Because PCRE2 has no warnings, it returns an error instead.
+Additionally, unlike PCRE2, Perl accepts \ex{} and generates NUL.
+.P
+24. From release 10.45, PCRE2 gives an error if \ex is not followed by a
+hexadecimal digit or a curly bracket. It used to interpret this as the NUL
+character. Perl still generates NUL, but in most cases issues a warning when
+in warning mode.
 .
 .
 .SH AUTHOR

diff --git a/doc/pcre2test.1 b/doc/pcre2test.1
@@ -516,7 +516,7 @@ this makes it possible to construct invalid UTF-8 sequences for testing
 purposes. On the other hand, \ex{hh} is interpreted as a UTF-8 character in
 UTF-8 mode, generating more than one byte if the value is greater than 127.
 When testing the 8-bit library not in UTF-8 mode, \ex{hh} generates one byte
-for values less than 256, and causes an error for greater values.
+for values that fit in a single byte, and causes an error for greater values.
 .P
 In UTF-16 mode, all 4-digit \ex{hhhh} values are accepted. This makes it
 possible to construct invalid UTF-16 sequences for testing purposes.

diff --git a/perltest.sh b/perltest.sh
@@ -314,7 +314,13 @@ for (;;)
       }
     else
       {
-      $x = eval "\"$_\"";   # To get escapes processed
+      s/(?<!\\)\\$//;     # Remove pcre2test specific trailing backslash
+      $x = eval "\"$_\""; # To get escapes processed
+      if ($interact && $@)
+        {
+        print STDERR "$@";
+        redo;
+        }
       }
 
     # Empty array for holding results, ensure $REGERROR and $REGMARK are

diff --git a/src/pcre2test.c b/src/pcre2test.c
@@ -7174,10 +7174,10 @@ while ((c = *p++) != 0)
     break;
 
     case 'x':
+    c = 0;
     if (*p == '{')
       {
       uint8_t *pt = p;
-      c = 0;
 
       /* We used to have "while (isxdigit(*(++pt)))" here, but it fails
       when isxdigit() is a macro that refers to its argument more than
@@ -7187,36 +7187,41 @@ while ((c = *p++) != 0)
       for (pt++; isxdigit(*pt); pt++)
         {
         if (++i == 9)
+          {
           fprintf(outfile, "** Too many hex digits in \\x{...} item; "
                            "using only the first eight.\n");
-        else c = c * 16 + (tolower(*pt) - ((isdigit(*pt))? '0' : 'a' - 10));
+          while (isxdigit(*pt)) pt++;
+          break;
+          }
+        else c = c * 16 + (tolower(*pt) - (isdigit(*pt)? '0' : 'a' - 10));
         }
-      if (*pt == '}')
+      if (i == 0 || *pt != '}')
         {
-        p = pt + 1;
-        break;
+        fprintf(outfile, "** Malformed \\x{ escape\n");
+        return PR_OK;
         }
-      /* Not correct form for \x{...}; fall through */
+      else p = pt + 1;
       }
-
-    /* \x without {} always defines just one byte in 8-bit mode. This
-    allows UTF-8 characters to be constructed byte by byte, and also allows
-    invalid UTF-8 sequences to be made. Just copy the byte in UTF-8 mode.
-    Otherwise, pass it down as data. */
-
-    c = 0;
-    while (i++ < 2 && isxdigit(*p))
+    else
       {
-      c = c * 16 + (tolower(*p) - ((isdigit(*p))? '0' : 'a' - 10));
-      p++;
-      }
+      /* \x without {} always defines just one byte in 8-bit mode. This
+      allows UTF-8 characters to be constructed byte by byte, and also allows
+      invalid UTF-8 sequences to be made. Just copy the byte in UTF-8 mode.
+      Otherwise, pass it down as data. */
+
+      while (i++ < 2 && isxdigit(*p))
+        {
+        c = c * 16 + (tolower(*p) - (isdigit(*p)? '0' : 'a' - 10));
+        p++;
+        }
 #if defined SUPPORT_PCRE2_8
-    if (utf && (test_mode == PCRE8_MODE))
-      {
-      *q8++ = c;
-      continue;
-      }
+      if (utf && (test_mode == PCRE8_MODE))
+        {
+        *q8++ = c;
+        continue;
+        }
 #endif
+      }
     break;
 
     case 0:     /* \ followed by EOF allows for an empty line */
@@ -7309,10 +7314,7 @@ while ((c = *p++) != 0)
     }
 #endif
 #ifdef SUPPORT_PCRE2_32
-  if (test_mode == PCRE32_MODE)
-    {
-    *q32++ = c;
-    }
+  if (test_mode == PCRE32_MODE) *q32++ = c;
 #endif
   }
 

diff --git a/testdata/testinput10 b/testdata/testinput10
@@ -187,9 +187,6 @@
     \x{c0}
     \x{f0}
 
-/Ā{3,4}/IB,utf
-  \x{100}\x{100}\x{100}\x{100\x{100}
-
 /(\x{100}+|x)/IB,utf
 
 /(\x{100}*a|x)/IB,utf

diff --git a/testdata/testinput12 b/testdata/testinput12
@@ -56,9 +56,6 @@
     \x{c0}
     \x{f0}
 
-/Ā{3,4}/IB,utf
-  \x{100}\x{100}\x{100}\x{100\x{100}
-
 /(\x{100}+|x)/IB,utf
 
 /(\x{100}*a|x)/IB,utf

diff --git a/testdata/testoutput10 b/testdata/testoutput10
@@ -492,22 +492,6 @@ No match
     \x{f0}
 No match
 
-/Ā{3,4}/IB,utf
-------------------------------------------------------------------
-        Bra
-        \x{100}{3}
-        \x{100}?+
-        Ket
-        End
-------------------------------------------------------------------
-Capture group count = 0
-Options: utf
-First code unit = \xc4
-Last code unit = \x80
-Subject length lower bound = 3
-  \x{100}\x{100}\x{100}\x{100\x{100}
- 0: \x{100}\x{100}\x{100}
-
 /(\x{100}+|x)/IB,utf
 ------------------------------------------------------------------
         Bra

diff --git a/testdata/testoutput12-16 b/testdata/testoutput12-16
@@ -273,22 +273,6 @@ No match
     \x{f0}
 No match
 
-/Ā{3,4}/IB,utf
-------------------------------------------------------------------
-        Bra
-        \x{100}{3}
-        \x{100}?+
-        Ket
-        End
-------------------------------------------------------------------
-Capture group count = 0
-Options: utf
-First code unit = \x{100}
-Last code unit = \x{100}
-Subject length lower bound = 3
-  \x{100}\x{100}\x{100}\x{100\x{100}
- 0: \x{100}\x{100}\x{100}
-
 /(\x{100}+|x)/IB,utf
 ------------------------------------------------------------------
         Bra

diff --git a/testdata/testoutput12-32 b/testdata/testoutput12-32
@@ -268,22 +268,6 @@ No match
     \x{f0}
 No match
 
-/Ā{3,4}/IB,utf
-------------------------------------------------------------------
-        Bra
-        \x{100}{3}
-        \x{100}?+
-        Ket
-        End
-------------------------------------------------------------------
-Capture group count = 0
-Options: utf
-First code unit = \x{100}
-Last code unit = \x{100}
-Subject length lower bound = 3
-  \x{100}\x{100}\x{100}\x{100\x{100}
- 0: \x{100}\x{100}\x{100}
-
 /(\x{100}+|x)/IB,utf
 ------------------------------------------------------------------
         Bra