Skip to content

Commit 0426f63

Browse files
committed
Revise \p{nv=float} lookup
The Numeric Value property allows one to find all code points that have a certain numeric value. An example would be to match against any character in any of the world's scripts which is effectively equivalent to the digit zero. It is documented that we accept either integers (like \p{nv=9}) or rationals (like \p{nv=1/2}). But we also accept floating point representations in case a conversion to numeric has happened. I think it is right that we not document these and their vagaries. One reason is that Unicode might someday create a new rational number that, to the precision we currently accept, is indistinguishable from an existing one, so that we would have to increase the precision. But there was a bug I introduced years ago. I thought that in order for a float to be considered to match a close rational, that 3 significant digits of precision would be needed, like .667 to match 2/3. That still seems reasonable. But I didn't implement that concept. Instead, prior to this commit, it was 3 (not necessarily significant) digits, so that for 1/160, it would match .001. This commit corrects that, and makes the lookup simpler. mktables will use sprintf %e to get the number normalized and having the 3 signicant digits required. At runtime, a floating number is normalized using the same format, and the result looked up in a hash. This eliminates the need to worry about matching within some epsilon. Further simplifications in utf8_heavy.pl are achieved by making a more precise definition as to what an acceptable number looks like, so we don't have to check later to see if what matched really was one.
1 parent 7a6f684 commit 0426f63

File tree

5 files changed

+120
-184
lines changed

5 files changed

+120
-184
lines changed

charclass_invlists.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -374323,7 +374323,7 @@ static const U8 WB_table[24][24] = {
374323374323
* be0f129691d479aa38646e4ca0ec1ee576ae7f75b0300a5624a7fa862fa8abba lib/unicore/extracted/DLineBreak.txt
374324374324
* 92449d354d9f6b6f2f97a292ebb59f6344ffdeb83d120d7d23e569c43ba67cd5 lib/unicore/extracted/DNumType.txt
374325374325
* e3a319527153b0c6c0c549b40fc6f3a01a7a0dcd6620784391db25901df3b154 lib/unicore/extracted/DNumValues.txt
374326-
* 6f7e75c46e2c6e4cff53fd9c14a0fbc77611809565d609b15cb98868c5891cdd lib/unicore/mktables
374326+
* c237f9e6bda604db4388693b42a20ee0d5c2cf9c08152beca27aa0e1ee735550 lib/unicore/mktables
374327374327
* 21653d2744fdd071f9ef138c805393901bb9547cf3e777ebf50215a191f986ea lib/unicore/version
374328374328
* 4bb677187a1a64e39d48f2e341b5ecb6c99857e49d7a79cf503bd8a3c709999b regen/charset_translations.pl
374329374329
* 03e51b0f07beebd5da62ab943899aa4934eee1f792fa27c1fb638c33bf4ac6ea regen/mk_PL_charclass.pl

lib/unicore/mktables

Lines changed: 88 additions & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -1436,10 +1436,10 @@ my @missing_early_files; # Generated list of absent files that we need to
14361436
my @files_actually_output; # List of files we generated.
14371437
my @more_Names; # Some code point names are compound; this is used
14381438
# to store the extra components of them.
1439-
my $MIN_FRACTION_LENGTH = 3; # How many digits of a floating point number at
1440-
# the minimum before we consider it equivalent to a
1441-
# candidate rational
1442-
my $MAX_FLOATING_SLOP = 10 ** - $MIN_FRACTION_LENGTH; # And in floating terms
1439+
my $E_FLOAT_PRECISION = 2; # The minimum number of digits after the decimal
1440+
# point of a normalized floating point number
1441+
# needed to match before we consider it equivalent
1442+
# to a candidate rational
14431443

14441444
# These store references to certain commonly used property objects
14451445
my $age;
@@ -12955,6 +12955,7 @@ sub register_fraction($) {
1295512955
my $rational = shift;
1295612956

1295712957
my $float = eval $rational;
12958+
$float = sprintf "%.*e", $E_FLOAT_PRECISION, $float;
1295812959
$nv_floating_to_rational{$float} = $rational;
1295912960
return;
1296012961
}
@@ -17656,10 +17657,10 @@ $loose_to_file_of
1765617657
$nv_floating_to_rational
1765717658
);
1765817659

17659-
# If a floating point number doesn't have enough digits in it to get this
17660-
# close to a fraction, it isn't considered to be that fraction even if all the
17661-
# digits it does have match.
17662-
\$utf8::max_floating_slop = $MAX_FLOATING_SLOP;
17660+
# If a %e floating point number doesn't have this number of digits in it after
17661+
# the decimal point to get this close to a fraction, it isn't considered to be
17662+
# that fraction even if all the digits it does have match.
17663+
\$utf8::e_precision = $E_FLOAT_PRECISION;
1766317664

1766417665
# Deprecated tables to generate a warning for. The key is the file containing
1766517666
# the table, so as to avoid duplication, as many property names can map to the
@@ -18982,21 +18983,12 @@ sub make_property_test_script() {
1898218983

1898318984
$t_path = 'TestProp.pl' unless defined $t_path; # the traditional name
1898418985

18985-
# Keep going down an order of magnitude
18986-
# until find that adding this quantity to
18987-
# 1 remains 1; but put an upper limit on
18988-
# this so in case this algorithm doesn't
18989-
# work properly on some platform, that we
18990-
# won't loop forever.
18991-
my $digits = 0;
18992-
my $min_floating_slop = 1;
18993-
while (1+ $min_floating_slop != 1
18994-
&& $digits++ < 50)
18995-
{
18996-
my $next = $min_floating_slop / 10;
18997-
last if $next == 0; # If underflows,
18998-
# use previous one
18999-
$min_floating_slop = $next;
18986+
# Create a list of what the %f representation is for each rational number.
18987+
# This will be used below.
18988+
my @valid_base_floats = '0.0';
18989+
foreach my $e_representation (keys %nv_floating_to_rational) {
18990+
push @valid_base_floats,
18991+
eval $nv_floating_to_rational{$e_representation};
1900018992
}
1900118993

1900218994
# It doesn't matter whether the elements of this array contain single lines
@@ -19144,70 +19136,82 @@ EOF_CODE
1914419136
# floating point equivalent.
1914519137
if ($table_name =~ qr{/}) {
1914619138

19147-
# Calculate the float, and find just the fraction.
19139+
# Calculate the float, and the %e representation
1914819140
my $float = eval $table_name;
19149-
my ($whole, $fraction)
19150-
= $float =~ / (.*) \. (.*) /x;
19151-
19152-
# Starting with one digit after the decimal point,
19153-
# create a test for each possible precision (number of
19154-
# digits past the decimal point) until well beyond the
19155-
# native number found on this machine. (If we started
19156-
# with 0 digits, it would be an integer, which could
19157-
# well match an unrelated table)
19158-
PLACE:
19159-
for my $i (1 .. $min_floating_slop + 3) {
19160-
my $table_name = sprintf("%.*f", $i, $float);
19161-
if ($i < $MIN_FRACTION_LENGTH) {
19162-
19163-
# If the test case has fewer digits than the
19164-
# minimum acceptable precision, it shouldn't
19165-
# succeed, so we expect an error for it.
19166-
# E.g., 2/3 = .7 at one decimal point, and we
19167-
# shouldn't say it matches .7. We should make
19168-
# it be .667 at least before agreeing that the
19169-
# intent was to match 2/3. But at the
19170-
# less-than- acceptable level of precision, it
19171-
# might actually match an unrelated number.
19172-
# So don't generate a test case if this
19173-
# conflating is possible. In our example, we
19174-
# don't want 2/3 matching 7/10, if there is
19175-
# a 7/10 code point.
19176-
19177-
# First, integers are not in the rationals
19178-
# table. Don't generate an error if this
19179-
# rounds to an integer using the given
19180-
# precision.
19181-
my $round = sprintf "%.0f", $table_name;
19182-
next PLACE if abs($table_name - $round)
19183-
< $MAX_FLOATING_SLOP;
19184-
19185-
# Here, isn't close enough to an integer to be
19186-
# confusable with one. Now, see it it's
19187-
# "close" to a known rational
19188-
for my $existing
19189-
(keys %nv_floating_to_rational)
19141+
my $e_representation = sprintf("%.*e",
19142+
$E_FLOAT_PRECISION, $float);
19143+
# Parse that
19144+
my ($non_zeros, $zeros, $exponent_sign, $exponent)
19145+
= $e_representation
19146+
=~ / -? [1-9] \. (\d*?) (0*) e ([+-]) (\d+) /x;
19147+
my $min_e_precision;
19148+
my $min_f_precision;
19149+
19150+
if ($exponent_sign eq '+' && $exponent != 0) {
19151+
Carp::my_carp_bug("Not yet equipped to handle"
19152+
. " positive exponents");
19153+
return;
19154+
}
19155+
else {
19156+
# We're trying to find the minimum precision that
19157+
# is needed to indicate this particular rational
19158+
# for the given $E_FLOAT_PRECISION. For %e, any
19159+
# trailing zeros, like 1.500e-02 aren't needed, so
19160+
# the correct value is how many non-trailing zeros
19161+
# there are after the decimal point.
19162+
$min_e_precision = length $non_zeros;
19163+
19164+
# For %f, like .01500, we want at least
19165+
# $E_FLOAT_PRECISION digits, but any trailing
19166+
# zeros aren't needed, so we can subtract the
19167+
# length of those. But we also need to include
19168+
# the zeros after the decimal point, but before
19169+
# the first significant digit.
19170+
$min_f_precision = $E_FLOAT_PRECISION
19171+
+ $exponent
19172+
- length $zeros;
19173+
}
19174+
19175+
# Make tests for each possible precision from 1 to
19176+
# just past the worst case.
19177+
my $upper_limit = ($min_e_precision > $min_f_precision)
19178+
? $min_e_precision
19179+
: $min_f_precision;
19180+
19181+
for my $i (1 .. $upper_limit + 1) {
19182+
for my $format ("e", "f") {
19183+
my $this_table
19184+
= sprintf("%.*$format", $i, $float);
19185+
19186+
# If we don't have enough precision digits,
19187+
# make a fail test; otherwise a pass test.
19188+
my $pass = ($format eq "e")
19189+
? $i >= $min_e_precision
19190+
: $i >= $min_f_precision;
19191+
if ($pass) {
19192+
push @output, generate_tests($property_name,
19193+
$this_table,
19194+
$valid,
19195+
$invalid,
19196+
$warning,
19197+
);
19198+
}
19199+
elsif ( $format eq "e"
19200+
19201+
# Here we would fail, but in the %f
19202+
# case, the representation at this
19203+
# precision could actually be a
19204+
# valid one for some other rational
19205+
|| ! grep { $_ eq $this_table }
19206+
@valid_base_floats)
1919019207
{
19191-
next PLACE
19192-
if abs($table_name - $existing)
19193-
< $MAX_FLOATING_SLOP;
19208+
push @output,
19209+
generate_error($property_name,
19210+
$this_table,
19211+
1 # 1 => already an
19212+
# error
19213+
);
1919419214
}
19195-
push @output, generate_error($property_name,
19196-
$table_name,
19197-
1 # 1 => already an error
19198-
);
19199-
}
19200-
else {
19201-
19202-
# Here the number of digits exceeds the
19203-
# minimum we think is needed. So generate a
19204-
# success test case for it.
19205-
push @output, generate_tests($property_name,
19206-
$table_name,
19207-
$valid,
19208-
$invalid,
19209-
$warning,
19210-
);
1921119215
}
1921219216
}
1921319217
}

0 commit comments

Comments
 (0)