Skip to content

Commit 9cfefc2

Browse files
committed
For nested features with N nesting levels (N > 1), the statistics that count the number of values in CommonStatistics and WeightedCommonStatistics are now computed from the innermost nesting level.
PiperOrigin-RevId: 631265288
1 parent a7059ac commit 9cfefc2

File tree

7 files changed

+252
-162
lines changed

7 files changed

+252
-162
lines changed

RELEASE.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,10 +54,13 @@
5454
time, any M1 support for TFDV is experimental and untested.
5555
* Bumps the pybind11 version to 2.11.1.
5656
* Depends on `tensorflow~=2.15.0`.
57-
* Depends on `apache-beam[gcp]>=2.53.0,<3` for Python 3.11 and on
57+
* Depends on `apache-beam[gcp]>=2.53.0,<3` for Python 3.11 and on
5858
`apache-beam[gcp]>=2.47.0,<3` for 3.9 and 3.10.
5959
* Depends on `protobuf>=4.25.2,<5` for Python 3.11 and on `protobuf>3.20.3,<5`
6060
for 3.9 and 3.10.
61+
* For nested features with N nested levels (N > 1), the statistics counting
62+
the number of values in `CommonStatistics` and `WeightedCommonStatistics`
63+
will rely on the innermost level.
6164

6265
## Known Issues
6366

tensorflow_data_validation/anomalies/schema_anomalies_test.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -214,7 +214,7 @@ TEST(SchemaAnomalies, SimpleBadSchemaConfigurations) {
214214
num_non_missing: 4
215215
min_num_values: 1
216216
max_num_values: 3
217-
avg_num_values: 1.5
217+
avg_num_values: 2
218218
presence_and_valency_stats {
219219
num_missing: 10
220220
num_non_missing: 4

tensorflow_data_validation/anomalies/schema_test.cc

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1933,8 +1933,8 @@ TEST(SchemaTest, GetSchemaWithValueCounts) {
19331933
num_missing: 10
19341934
min_num_values: 1
19351935
max_num_values: 1
1936-
avg_num_values: 1.0
1937-
tot_num_values: 10
1936+
avg_num_values: 1.5
1937+
tot_num_values: 15
19381938
presence_and_valency_stats {
19391939
num_non_missing: 10
19401940
num_missing: 10
@@ -2383,7 +2383,7 @@ TEST(SchemaTest, UpdateBadStartingSchema) {
23832383
num_non_missing: 4
23842384
min_num_values: 1
23852385
max_num_values: 3
2386-
avg_num_values: 1.5
2386+
avg_num_values: 2
23872387
presence_and_valency_stats {
23882388
num_missing: 10
23892389
num_non_missing: 4

tensorflow_data_validation/anomalies/statistics_view_test.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -331,7 +331,7 @@ TEST(FeatureStatsView, GetNumMissingNestedMultipleNestednessLevels) {
331331
type: FLOAT
332332
num_stats: {
333333
common_stats {
334-
num_missing: 3
334+
num_missing: 2
335335
weighted_common_stats { num_missing: 2 }
336336
presence_and_valency_stats { num_missing: 2 }
337337
presence_and_valency_stats { num_missing: 0 }

tensorflow_data_validation/integration_tests/sequence_example_e2e_test.py

Lines changed: 36 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -416,28 +416,28 @@
416416
common_stats {
417417
num_non_missing: 10
418418
num_missing: 10
419-
min_num_values: 1
420-
max_num_values: 1
421-
avg_num_values: 1.0
419+
min_num_values: 2
420+
max_num_values: 2
421+
avg_num_values: 2.0
422422
num_values_histogram {
423423
buckets {
424-
low_value: 1.0
425-
high_value: 1.0
424+
low_value: 2.0
425+
high_value: 2.0
426426
sample_count: 3.3333333333333335
427427
}
428428
buckets {
429-
low_value: 1.0
430-
high_value: 1.0
429+
low_value: 2.0
430+
high_value: 2.0
431431
sample_count: 3.3333333333333335
432432
}
433433
buckets {
434-
low_value: 1.0
435-
high_value: 1.0
434+
low_value: 2.0
435+
high_value: 2.0
436436
sample_count: 3.3333333333333335
437437
}
438438
type: QUANTILES
439439
}
440-
tot_num_values: 10
440+
tot_num_values: 20
441441
presence_and_valency_stats {
442442
num_non_missing: 10
443443
num_missing: 10
@@ -511,17 +511,17 @@
511511
num_stats {
512512
common_stats {
513513
num_non_missing: 20
514-
min_num_values: 1
514+
min_num_values: 2
515515
max_num_values: 3
516-
avg_num_values: 2.0
516+
avg_num_values: 2.5
517517
num_values_histogram {
518518
buckets {
519-
low_value: 1.0
520-
high_value: 1.0
519+
low_value: 2.0
520+
high_value: 2.0
521521
sample_count: 10.0
522522
}
523523
buckets {
524-
low_value: 1.0
524+
low_value: 2.0
525525
high_value: 3.0
526526
sample_count: 5.0
527527
}
@@ -532,7 +532,7 @@
532532
}
533533
type: QUANTILES
534534
}
535-
tot_num_values: 40
535+
tot_num_values: 50
536536
presence_and_valency_stats {
537537
num_non_missing: 20
538538
min_num_values: 1
@@ -1026,34 +1026,34 @@
10261026
common_stats {
10271027
num_non_missing: 10
10281028
num_missing: 10
1029-
min_num_values: 1
1030-
max_num_values: 1
1031-
avg_num_values: 1.0
1029+
min_num_values: 2
1030+
max_num_values: 2
1031+
avg_num_values: 2.0
10321032
num_values_histogram {
10331033
buckets {
1034-
low_value: 1.0
1035-
high_value: 1.0
1034+
low_value: 2.0
1035+
high_value: 2.0
10361036
sample_count: 3.3333333333333335
10371037
}
10381038
buckets {
1039-
low_value: 1.0
1040-
high_value: 1.0
1039+
low_value: 2.0
1040+
high_value: 2.0
10411041
sample_count: 3.3333333333333335
10421042
}
10431043
buckets {
1044-
low_value: 1.0
1045-
high_value: 1.0
1044+
low_value: 2.0
1045+
high_value: 2.0
10461046
sample_count: 3.3333333333333335
10471047
}
10481048
type: QUANTILES
10491049
}
10501050
weighted_common_stats {
10511051
num_non_missing: 50.0
10521052
num_missing: 100.0
1053-
avg_num_values: 1.0
1054-
tot_num_values: 50.0
1053+
avg_num_values: 2.0
1054+
tot_num_values: 100.0
10551055
}
1056-
tot_num_values: 10
1056+
tot_num_values: 20
10571057
presence_and_valency_stats {
10581058
num_non_missing: 10
10591059
num_missing: 10
@@ -1157,17 +1157,17 @@
11571157
num_stats {
11581158
common_stats {
11591159
num_non_missing: 20
1160-
min_num_values: 1
1160+
min_num_values: 2
11611161
max_num_values: 3
1162-
avg_num_values: 2.0
1162+
avg_num_values: 2.5
11631163
num_values_histogram {
11641164
buckets {
1165-
low_value: 1.0
1166-
high_value: 1.0
1165+
low_value: 2.0
1166+
high_value: 2.0
11671167
sample_count: 10.0
11681168
}
11691169
buckets {
1170-
low_value: 1.0
1170+
low_value: 2.0
11711171
high_value: 3.0
11721172
sample_count: 5.0
11731173
}
@@ -1180,10 +1180,10 @@
11801180
}
11811181
weighted_common_stats {
11821182
num_non_missing: 150.0
1183-
avg_num_values: 1.6666666666666667
1184-
tot_num_values: 250.0
1183+
avg_num_values: 2.6666666666666667
1184+
tot_num_values: 400.0
11851185
}
1186-
tot_num_values: 40
1186+
tot_num_values: 50
11871187
presence_and_valency_stats {
11881188
num_non_missing: 20
11891189
min_num_values: 1

0 commit comments

Comments
 (0)