Skip to content

Commit 4c2d023

Browse files
committed
Refactor and fix the logic for out-of-filters metrics
1 parent 286e814 commit 4c2d023

File tree

1 file changed

+15
-7
lines changed

1 file changed

+15
-7
lines changed

compute_metrics.py

Lines changed: 15 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -152,24 +152,32 @@ def include_example(filters_line: Dict[str, str]) -> bool:
152152

153153
# dry run: estimate the total number of examples and the number of examples in the filtered subset
154154
with jsonlines.open(cfg.preds_path, "r") as reader:
155-
total = sum(1 for _ in reader)
155+
num_total = sum(1 for _ in reader)
156156
with jsonlines.open(cfg.filter.path, "r") as filters_reader:
157-
included = sum(1 for filters_line in filters_reader if include_example(filters_line))
157+
num_included = sum(1 for filters_line in filters_reader if include_example(filters_line))
158158

159159
# TODO: make configurable?
160160
# when computing metrics on out-of-filters subset, select a random subsample of the same size as filtered subset
161161
if not cfg.filter.fit_filters:
162-
num_examples_subset = total - included
163-
logging.warning(
164-
f"Total number of examples: {total}, will consider a random subset of {num_examples_subset} examples ({num_examples_subset / total * 100 :.2f}%)."
165-
)
162+
with jsonlines.open(cfg.filter.path, "r") as filters_reader:
163+
num_filtered = sum(
164+
1
165+
for filters_line in filters_reader
166+
if all(filters_line[filter_col] for filter_col in cfg.filter.filters_to_include)
167+
)
168+
169+
num_examples_subset = num_total - num_included
170+
if num_included > num_filtered:
171+
logging.warning(
172+
f"Total number of examples: {num_total}, number of examples to include: {num_included}, will consider a random subset of {num_examples_subset} examples ({num_examples_subset / num_total * 100 :.2f}%)."
173+
)
166174
with jsonlines.open(cfg.filter.path, "r") as filters_reader:
167175
ids = [i for i, filters_line in enumerate(filters_reader) if include_example(filters_line)]
168176
subset_ids = set(random.sample(ids, k=num_examples_subset))
169177
else:
170178
subset_ids = None
171179
logging.warning(
172-
f"Total number of examples: {total}, will consider {included} examples ({included / total * 100 :.2f}%)."
180+
f"Total number of examples: {num_total}, will consider {num_included} examples ({num_included / num_total * 100 :.2f}%)."
173181
)
174182

175183
with jsonlines.open(cfg.preds_path, "r") as reader:

0 commit comments

Comments
 (0)