@@ -152,24 +152,32 @@ def include_example(filters_line: Dict[str, str]) -> bool:
152
152
153
153
# dry run: estimate the total number of examples and the number of examples in the filtered subset
154
154
with jsonlines .open (cfg .preds_path , "r" ) as reader :
155
- total = sum (1 for _ in reader )
155
+ num_total = sum (1 for _ in reader )
156
156
with jsonlines .open (cfg .filter .path , "r" ) as filters_reader :
157
- included = sum (1 for filters_line in filters_reader if include_example (filters_line ))
157
+ num_included = sum (1 for filters_line in filters_reader if include_example (filters_line ))
158
158
159
159
# TODO: make configurable?
160
160
# when computing metrics on out-of-filters subset, select a random subsample of the same size as filtered subset
161
161
if not cfg .filter .fit_filters :
162
- num_examples_subset = total - included
163
- logging .warning (
164
- f"Total number of examples: { total } , will consider a random subset of { num_examples_subset } examples ({ num_examples_subset / total * 100 :.2f} %)."
165
- )
162
+ with jsonlines .open (cfg .filter .path , "r" ) as filters_reader :
163
+ num_filtered = sum (
164
+ 1
165
+ for filters_line in filters_reader
166
+ if all (filters_line [filter_col ] for filter_col in cfg .filter .filters_to_include )
167
+ )
168
+
169
+ num_examples_subset = num_total - num_included
170
+ if num_included > num_filtered :
171
+ logging .warning (
172
+ f"Total number of examples: { num_total } , number of examples to include: { num_included } , will consider a random subset of { num_examples_subset } examples ({ num_examples_subset / num_total * 100 :.2f} %)."
173
+ )
166
174
with jsonlines .open (cfg .filter .path , "r" ) as filters_reader :
167
175
ids = [i for i , filters_line in enumerate (filters_reader ) if include_example (filters_line )]
168
176
subset_ids = set (random .sample (ids , k = num_examples_subset ))
169
177
else :
170
178
subset_ids = None
171
179
logging .warning (
172
- f"Total number of examples: { total } , will consider { included } examples ({ included / total * 100 :.2f} %)."
180
+ f"Total number of examples: { num_total } , will consider { num_included } examples ({ num_included / num_total * 100 :.2f} %)."
173
181
)
174
182
175
183
with jsonlines .open (cfg .preds_path , "r" ) as reader :
0 commit comments