migrate inspect_ai to the HF dataset

Zilinghan · Zilinghan · commit c67df4384d69 · 2025-02-17T11:08:07.000-06:00
diff --git a/eval/inspect_ai/README.md b/eval/inspect_ai/README.md
@@ -16,26 +16,26 @@ However, there are some additional command line arguments that could be useful a
 
 - `--max-connections`: Maximum amount of API connections to the evaluated model.
 - `--limit`: Limit of the number of samples to evaluate in the SciCode dataset.
-- `-T input_path=<another_input_json_file>`: This is useful when user wants to change to another json dataset (e.g., the dev set).
+- `-T split=validation/test`: Whether the user wants to run on the small `validation` set (15 samples) or the large `test` set (65 samples).
 - `-T output_dir=<your_output_dir>`: This changes the default output directory (`./tmp`).
 - `-T h5py_file=<your_h5py_file>`: This is used if your h5py file is not downloaded in the recommended directory.
 - `-T with_background=True/False`: Whether to include problem background.
 - `-T mode=normal/gold/dummy`: This provides two additional modes for sanity checks.
     - `normal` mode is the standard mode to evaluate a model
-    - `gold` mode can only be used on the dev set which loads the gold answer
+    - `gold` mode loads the gold answer and can only be used on the validation set
     - `dummy` mode does not call any real LLMs and generates some dummy outputs
 
-For example, user can run five sames on the dev set with background as
+For example, a user can run five samples on the validation set with background as follows:
 
 ```bash
 inspect eval scicode.py \
     --model openai/gpt-4o \
     --temperature 0 \
     --limit 5 \
-    -T input_path=../data/problems_dev.jsonl \
-    -T output_dir=./tmp/dev \
+    -T split=validation \
+    -T output_dir=./tmp/val \
     -T with_background=True \
-    -T mode=gold
+    -T mode=normal
 ```
 
 User can run the evaluation on `Deepseek-v3` using together ai via the following command:
diff --git a/eval/inspect_ai/scicode.py b/eval/inspect_ai/scicode.py
@@ -5,7 +5,7 @@
 from typing import Any
 from pathlib import Path
 from inspect_ai import Task, task
-from inspect_ai.dataset import json_dataset, Sample
+from inspect_ai.dataset import Sample, hf_dataset
 from inspect_ai.solver import solver, TaskState, Generate
 from inspect_ai.scorer import scorer, mean, metric, Metric, Score, Target
 from scicode.parse.parse import extract_function_name, get_function_from_code
@@ -392,26 +392,26 @@ async def score(state: TaskState, target: Target):
 
 @task
 def scicode(
-    input_path: str = '../data/problems_all.jsonl',
+    split: str = 'test',
     output_dir: str = './tmp',
     with_background: bool = False,
     h5py_file: str = '../data/test_data.h5',
     mode: str = 'normal',
 ):
-    dataset = json_dataset(
-        input_path, 
-        record_to_sample
+    
+    dataset =  hf_dataset(
+        'Zilinghan/scicode',
+        split=split,
+        sample_fields=record_to_sample,
     )
     return Task(
         dataset=dataset,
         solver=scicode_solver(
-            input_path=input_path,
             output_dir=output_dir,
             with_background=with_background,
             mode=mode,
         ),
         scorer=scicode_scorer(
-            input_path=input_path,
             output_dir=output_dir,
             with_background=with_background,
             h5py_file=h5py_file,