Skip to content

Commit fdcce7e

Browse files
carlosgjs and lsetiawan authored
feat: Do deterministic inference (#25)
Co-authored-by: Don Setiawan <landungs@uw.edu>
1 parent 89899bb commit fdcce7e

File tree

9 files changed

+117
-9
lines changed

9 files changed

+117
-9
lines changed

azureml/eval.yml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ command: >
55
--model-path ${{inputs.model_path}}
66
--sys-id ${{inputs.sys_id}}
77
--instruc-id ${{inputs.instruc_id}}
8+
--param do_sample=${{inputs.do_sample}}
89
--param temperature=${{inputs.temperature}}
910
--param top_k=${{inputs.top_k}}
1011
--param top_p=${{inputs.top_p}}
@@ -18,9 +19,10 @@ inputs:
1819
# type: uri_folder
1920
# path: azureml://datastores/workspaceblobstore/paths/base_models
2021
model_path: meta-llama/Llama-2-7b-chat-hf
21-
temperature: 0.7
22+
temperature: 0.01
23+
do_sample: 0
2224
top_p: 0.95
23-
top_k: 40
25+
top_k: 1
2426
sys_id: SYS_1
2527
instruc_id: INSTR_SWEETP_1
2628
# using a curated environment doesn't work because we need additional packages

azureml/generate.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ command: >
55
--output ./outputs/output.txt
66
--sys-id ${{inputs.sys_id}}
77
--instruc-id ${{inputs.instruc_id}}
8+
--param do_sample=${{inputs.do_sample}}
89
--param temperature=${{inputs.temperature}}
910
--param top_k=${{inputs.top_k}}
1011
--param top_p=${{inputs.top_p}}
@@ -17,6 +18,7 @@ inputs:
1718
# path: azureml://datastores/workspaceblobstore/paths/base_models
1819
model_path: meta-llama/Llama-2-7b-chat-hf
1920
temperature: 0.7
21+
do_sample: 0
2022
top_p: 0.95
2123
top_k: 40
2224
sys_id: SYS_1

data/autora/code1.txt

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
import numpy as np
2+
import pandas as pd
3+
import sympy as sp
4+
from autora.experiment_runner.synthetic.abstract.equation import equation_experiment
5+
from autora.experimentalist.random import random_pool
6+
from autora.state import StandardState, estimator_on_state, on_state
7+
from autora.theorist.bms import BMSRegressor
8+
from autora.variable import ValueType, Variable, VariableCollection
9+
10+
####################################################################################
11+
## Define initial data
12+
####################################################################################
13+
14+
#### Define variable data ####
15+
iv = Variable(name="x", value_range=(0, 2 * np.pi), allowed_values=np.linspace(0, 2 * np.pi, 30))
16+
dv = Variable(name="y", type=ValueType.REAL)
17+
variables = VariableCollection(independent_variables=[iv], dependent_variables=[dv])
18+
19+
#### Define seed condition data ####
20+
conditions = random_pool(variables, num_samples=10, random_state=0)
21+
22+
####################################################################################
23+
## Define experimentalist
24+
####################################################################################
25+
26+
experimentalist = on_state(random_pool, output=["conditions"])
27+
28+
####################################################################################
29+
## Define experiment runner
30+
####################################################################################
31+
32+
sin_experiment = equation_experiment(
33+
sp.simplify("sin(x)"), variables.independent_variables, variables.dependent_variables[0]
34+
)
35+
sin_runner = sin_experiment.experiment_runner
36+
37+
experiment_runner = on_state(sin_runner, output=["experiment_data"])
38+
39+
####################################################################################
40+
## Define theorist
41+
####################################################################################
42+
43+
theorist = estimator_on_state(BMSRegressor(epochs=100))
44+
45+
####################################################################################
46+
## Define state
47+
####################################################################################
48+
49+
s = StandardState(
50+
variables=variables, conditions=conditions, experiment_data=pd.DataFrame(columns=["x", "y"])
51+
)
52+
53+
####################################################################################
54+
## Cycle through the state
55+
####################################################################################
56+
57+
print("Pre-Defined State:")
58+
print(f"Number of datapoints collected: {len(s['experiment_data'])}")
59+
print(f"Derived models: {s['models']}")
60+
print("\n")
61+
62+
for i in range(5):
63+
s = experimentalist(s, num_samples=10, random_state=42)
64+
s = experiment_runner(s, added_noise=1.0, random_state=42)
65+
s = theorist(s)
66+
print(f"\nCycle {i+1} Results:")
67+
print(f"Number of datapoints collected: {len(s['experiment_data'])}")
68+
print(f"Derived models: {s['models']}")
69+
print("\n")

data/autora/data.jsonl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
{"instruction": "import numpy as np\nimport pandas as pd\nimport sympy as sp\nfrom autora.experiment_runner.synthetic.abstract.equation import equation_experiment\nfrom autora.experimentalist.random import random_pool\nfrom autora.state import StandardState, estimator_on_state, on_state\nfrom autora.theorist.bms import BMSRegressor\nfrom autora.variable import ValueType, Variable, VariableCollection\n\n####################################################################################\n## Define initial data\n####################################################################################\n\n#### Define variable data ####\niv = Variable(name=\"x\", value_range=(0, 2 * np.pi), allowed_values=np.linspace(0, 2 * np.pi, 30))\ndv = Variable(name=\"y\", type=ValueType.REAL)\nvariables = VariableCollection(independent_variables=[iv], dependent_variables=[dv])\n\n#### Define seed condition data ####\nconditions = random_pool(variables, num_samples=10, random_state=0)\n\n####################################################################################\n## Define experimentalist\n####################################################################################\n\nexperimentalist = on_state(random_pool, output=[\"conditions\"])\n\n####################################################################################\n## Define experiment runner\n####################################################################################\n\nsin_experiment = equation_experiment(\n sp.simplify(\"sin(x)\"), variables.independent_variables, variables.dependent_variables[0]\n)\nsin_runner = sin_experiment.experiment_runner\n\nexperiment_runner = on_state(sin_runner, output=[\"experiment_data\"])\n\n####################################################################################\n## Define theorist\n####################################################################################\n\ntheorist = 
estimator_on_state(BMSRegressor(epochs=100))\n\n####################################################################################\n## Define state\n####################################################################################\n\ns = StandardState(\n variables=variables, conditions=conditions, experiment_data=pd.DataFrame(columns=[\"x\", \"y\"])\n)\n\n####################################################################################\n## Cycle through the state\n####################################################################################\n\nprint(\"Pre-Defined State:\")\nprint(f\"Number of datapoints collected: {len(s['experiment_data'])}\")\nprint(f\"Derived models: {s['models']}\")\nprint(\"\\n\")\n\nfor i in range(5):\n s = experimentalist(s, num_samples=10, random_state=42)\n s = experiment_runner(s, added_noise=1.0, random_state=42)\n s = theorist(s)\n print(f\"\\nCycle {i+1} Results:\")\n print(f\"Number of datapoints collected: {len(s['experiment_data'])}\")\n print(f\"Derived models: {s['models']}\")\n print(\"\\n\")\n", "output": "The following example demonstrates how to use AutoRA to automate the process of model discovery, experimental design, and data collection. \n\nThe discovery problem is defined by a single independent variable $x \\in [0, 2 \\pi]$ and dependent variable $y$.\nThe experiment amounts to a simple sine wave, $y = \\sin(x)$, which is the model we are trying to discover.\n\nThe discovery cycle iterates between the experimentalist, experiment runner, and theorist. Here, we use a \"random\" experimentalist, which samples novel experimental conditions for $x$ every cycle. \nThe experiment runner then collects data for the corresponding $y$ values. Finally, the theorist uses a [Bayesian Machine Scientist](https://autoresearch.github.io/autora/user-guide/theorists/bms/) (BMS; Guimerà et al., in Science Advances) to identify a scientific model that explains the data. "}

data/autora/text1.txt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
The following example demonstrates how to use AutoRA to automate the process of model discovery, experimental design, and data collection.
2+
3+
The discovery problem is defined by a single independent variable $x \in [0, 2 \pi]$ and dependent variable $y$.
4+
The experiment amounts to a simple sine wave, $y = \sin(x)$, which is the model we are trying to discover.
5+
6+
The discovery cycle iterates between the experimentalist, experiment runner, and theorist. Here, we use a "random" experimentalist, which samples novel experimental conditions for $x$ every cycle.
7+
The experiment runner then collects data for the corresponding $y$ values. Finally, the theorist uses a [Bayesian Machine Scientist](https://autoresearch.github.io/autora/user-guide/theorists/bms/) (BMS; Guimerà et al., in Science Advances) to identify a scientific model that explains the data.
File renamed without changes.

src/autora/doc/pipelines/main.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,21 @@ def import_model(model_name: str) -> None:
108108
pass
109109

110110

111+
@app.command()
112+
def import_data(code_file: str, text_file: str, output_file: str = "data.jsonl") -> None:
113+
from pathlib import Path
114+
115+
import jsonlines
116+
117+
# alpaca jsonl format:
118+
def read_text(file: str) -> str:
119+
return Path(file).read_text()
120+
121+
d = {"instruction": read_text(code_file), "output": read_text(text_file)}
122+
with jsonlines.open(output_file, "a") as file:
123+
file.write(d)
124+
125+
111126
if __name__ == "__main__":
112127
logger.info(f"Torch version: {torch.__version__} , Cuda available: {torch.cuda.is_available()}")
113128

src/autora/doc/runtime/predict_hf.py

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -32,20 +32,23 @@ def predict(
3232
sys: str,
3333
instr: str,
3434
inputs: List[str],
35-
temperature: float = 0.6,
35+
do_sample: float = 0.0,
36+
temperature: float = 0.01,
3637
top_p: float = 0.95,
37-
top_k: float = 40,
38+
top_k: float = 1,
3839
max_length: float = 2048,
3940
num_ret_seq: float = 1,
4041
) -> List[List[str]]:
42+
# convert to bool in case it came in as a generate float param from the CLI
43+
do_sample = bool(do_sample)
4144
logger.info(
42-
f"Generating {len(inputs)} predictions. Temperature: {temperature}, top_p: {top_p}, top_k: {top_k}, "
43-
f"max_length: {max_length}"
45+
f"Generating {len(inputs)} predictions. do_sample: {do_sample}, temperature: {temperature}, top_p: {top_p},"
46+
f" top_k: {top_k}, max_length: {max_length}"
4447
)
4548
prompts = [TEMP_LLAMA2.format(sys=sys, instr=instr, input=input) for input in inputs]
4649
sequences = self.pipeline(
4750
prompts,
48-
do_sample=True,
51+
do_sample=do_sample,
4952
temperature=temperature,
5053
top_p=top_p,
5154
top_k=int(top_k),

tests/test_main.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,14 @@
11
from pathlib import Path
22

3-
from autora.doc.pipelines.main import eval, generate
3+
from autora.doc.pipelines.main import eval, generate, import_data
44
from autora.doc.runtime.prompts import InstructionPrompts, SystemPrompts
55

66
# dummy HF model for testing
77
TEST_HF_MODEL = "hf-internal-testing/tiny-random-FalconForCausalLM"
88

99

1010
def test_predict() -> None:
11-
data = Path(__file__).parent.joinpath("../data/data.jsonl").resolve()
11+
data = Path(__file__).parent.joinpath("../data/sweetpea/data.jsonl").resolve()
1212
outputs = eval(str(data), TEST_HF_MODEL, SystemPrompts.SYS_1, InstructionPrompts.INSTR_SWEETP_1, [])
1313
assert len(outputs) == 3, "Expected 3 outputs"
1414
for output in outputs:
@@ -25,3 +25,12 @@ def test_generate() -> None:
2525
assert output.exists(), f"Expected output file {output} to exist"
2626
with open(str(output), "r") as f:
2727
assert len(f.read()) > 0, f"Expected non-empty output file {output}"
28+
29+
30+
def test_import(tmp_path: Path) -> None:
31+
data = tmp_path.joinpath("data.jsonl")
32+
code = Path(__file__).parent.joinpath("../data/autora/code1.txt").resolve()
33+
text = Path(__file__).parent.joinpath("../data/autora/text1.txt").resolve()
34+
import_data(str(code), str(text), str(data))
35+
new_lines = data.read_text().splitlines()
36+
assert len(new_lines) == 1, "Expected one new line"

0 commit comments

Comments
 (0)