Skip to content

Commit

Permalink
a high performing meta prompt for the tgd optimizer adding prompt en…
Browse files Browse the repository at this point in the history
…gineering strategy
  • Loading branch information
liyin2015 committed Jan 5, 2025
1 parent f0328b0 commit 0ad2490
Show file tree
Hide file tree
Showing 27 changed files with 1,288 additions and 883 deletions.
2 changes: 0 additions & 2 deletions adalflow/adalflow/core/component.py
Original file line number Diff line number Diff line change
Expand Up @@ -553,8 +553,6 @@ def __call__(self, *args, **kwargs):
# the difference between training vs. inference.
from adalflow.optim.parameter import Parameter

print("has_bicall", self._has_bicall())

if self._has_bicall():
output = self.bicall(*args, **kwargs)

Expand Down
23 changes: 17 additions & 6 deletions adalflow/adalflow/core/generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -699,7 +699,12 @@ def backward(
else:
backward = False
for pred in children_params:
if pred.requires_opt and pred.param_type == ParameterType.PROMPT:
if pred.requires_opt and pred.param_type in [
ParameterType.PROMPT,
ParameterType.GENERATOR_OUTPUT,
ParameterType.RETRIEVER_OUTPUT,
ParameterType.OUTPUT,
]:
backward = True
break
if backward:
Expand Down Expand Up @@ -738,9 +743,12 @@ def _backward_through_all_predecessors(
for k, v in prompt_kwargs.items()
}

print(f"gt: {response.get_gt()}")

conversation_prompt_kwargs = {
"input_value": input_prompt_kwargs,
"llm_output": response.get_prompt_data(),
# "gt": response.get_gt(),
}

conversation_str = Prompt(
Expand All @@ -759,6 +767,9 @@ def _backward_through_all_predecessors(
conv_ins_template = CONVERSATION_START_INSTRUCTION_CHAIN
obj_ins_template = OBJECTIVE_INSTRUCTION_CHAIN
response_gradient = response.get_gradients_str()
# response_gradient = response.get_gradients_component_schema(
# skip_correct_sample=False
# )
if not response_gradient:
raise ValueError(
f"Generator: No gradient found for {response}. Please check the response."
Expand Down Expand Up @@ -791,7 +802,7 @@ def _backward_through_all_predecessors(
backward_engine_prompt_str = backward_engine.get_prompt(
**backward_engine_prompt_kwargs
)
print(f"Backward engine prompt: {backward_engine_prompt_str}")
# print(f"Backward engine prompt: {backward_engine_prompt_str}")

gradient_output: GeneratorOutput = None
response_gradient_list = [""] * len(children_params)
Expand Down Expand Up @@ -830,6 +841,8 @@ def _backward_through_all_predecessors(
response_gradient_list = [failure_message] * len(children_params)
printc(f"failure_message: {failure_message}", color="red")

print(f"gradient list: {response_gradient_list}")

# generate the gradient for each child
for i, pred in enumerate(children_params):
if not pred.requires_opt or pred.param_type == ParameterType.DEMOS:
Expand Down Expand Up @@ -879,9 +892,6 @@ def _backward_through_one_predecessor(
f"Generator: Skipping {pred} as it does not require optimization."
)
return
printc(
f"Generator: Backward through {pred}, is_intermediate_node: {is_intermediate_node}"
)

if pred.check_if_already_computed_gradient_respect_to(response.id):
log.debug(
Expand All @@ -902,6 +912,7 @@ def _backward_through_one_predecessor(
conversation_prompt_kwargs = {
"input_value": input_prompt_kwargs,
"llm_output": response.get_prompt_data(),
"gt": response.get_gt(),
}

conversation_str = Prompt(
Expand Down Expand Up @@ -953,7 +964,7 @@ def _backward_through_one_predecessor(
backward_engine_prompt_str = backward_engine.get_prompt(
**backward_engine_prompt_kwargs
)
print(f"Backward engine prompt: {backward_engine_prompt_str}")
# print(f"Backward engine prompt: {backward_engine_prompt_str}")
gradient_output: GeneratorOutput = None
if (
backward_pass_setup.compute_grad_for_errors_only
Expand Down
185 changes: 166 additions & 19 deletions adalflow/adalflow/datasets/hotpot_qa.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
import random
import os
import csv
from typing import Literal

from adalflow.utils.lazy_import import safe_import, OptionalPackages


from adalflow.utils.data import Dataset
from adalflow.utils.file_io import save_csv
from adalflow.utils.file_io import save_csv, save_json, load_json
from adalflow.datasets.utils import prepare_dataset_path
from adalflow.core.base_data_class import DataClass
from adalflow.datasets.types import HotPotQAData
Expand All @@ -29,6 +28,8 @@ def __init__(
sampled_valset: 3916
test: 7405
All answers are a phrase in the supporting context where we can choose supporting facts from the context.
You can specify the size of the dataset to load by setting the size parameter.
"""
if split not in ["train", "val", "test"]:
Expand All @@ -44,7 +45,7 @@ def __init__(
self.task_name = f"hotpot_qa_{keep_details}"
data_path = prepare_dataset_path(self.root, self.task_name)
# download and save
split_csv_path = os.path.join(data_path, f"{split}.csv")
split_csv_path = os.path.join(data_path, f"{split}.json")
print(f"split_csv_path: {split_csv_path}")
self._check_or_download_dataset(
split_csv_path, split, only_hard_examples, keep_details
Expand All @@ -55,12 +56,20 @@ def __init__(
# created_data_class = DynamicDataClassFactory.from_dict(
# "HotPotQAData", {"id": "str", "question": "str", "answer": "str"}

with open(split_csv_path, newline="") as csvfile:
reader = csv.DictReader(csvfile)
for i, row in enumerate(reader):
if size is not None and i >= size:
break
self.data.append(HotPotQAData.from_dict(row))
# with open(split_csv_path, newline="") as csvfile:
# reader = csv.DictReader(csvfile)
# for i, row in enumerate(reader):
# if size is not None and i >= size:
# break
# self.data.append(HotPotQAData.from_dict(row))

self.data = load_json(split_csv_path)
if size is not None:
# use random seed to make sure the same data is loaded
# random.Random(0).shuffle(self.data)
self.data = self.data[:size]
# convert to dataclass
self.data = [HotPotQAData.from_dict(d) for d in self.data]

def _check_or_download_dataset(
self,
Expand Down Expand Up @@ -99,6 +108,24 @@ def _check_or_download_dataset(
hf_official_dev = load_dataset(
"hotpot_qa", "fullwiki", split="validation", trust_remote_code=True
)
data_path_dir = os.path.dirname(data_path)
# save all the original data
all_original_keys = hf_official_train[0].keys()
for split, examples in zip(
["hf_official_train", "hf_official_dev"],
[hf_official_train, hf_official_dev],
):
target_path = os.path.join(data_path_dir, f"{split}.csv")
save_csv(examples, f=target_path, fieldnames=all_original_keys)
# for example in examples:
# # is answer in the context
# print(f"example: {example}")
# context = str(json.dumps(example["context"]))
# if example["answer"] in context:
# print(f"answer in context")
# else:
# print(f"answer not in context")
print(f"saved {split} to {target_path}")
keys = ["question", "answer"]
if keep_details == "all":
keys = [
Expand All @@ -110,7 +137,7 @@ def _check_or_download_dataset(
"context",
]
elif keep_details == "dev_titles":
keys = ["id", "question", "answer", "supporting_facts"]
keys = ["id", "question", "answer", "supporting_facts", "context"]

official_train = [] # 15661
for raw_example in hf_official_train:
Expand All @@ -119,19 +146,19 @@ def _check_or_download_dataset(

if "supporting_facts" in example:
example["gold_titles"] = set(example["supporting_facts"]["title"])
del example["supporting_facts"]
# del example["supporting_facts"]

official_train.append(example)
print(f"official_train: {len(official_train)}")

rng = random.Random(0)
rng.shuffle(official_train)

sampled_trainset = official_train[: len(official_train) * 75 // 100] # 11745
sampled_trainset = official_train[: len(official_train) * 70 // 100] # 11745
print(f"sampled_trainset: {len(sampled_trainset)}")

sampled_valset = official_train[ # 3916
len(official_train) * 75 // 100 :
len(official_train) * 70 // 100 :
] # this is not the official dev set

print(f"sampled_valset: {len(sampled_valset)}")
Expand All @@ -141,6 +168,8 @@ def _check_or_download_dataset(
# del example["gold_titles"]

test = [] # 7405

print(f"raw_example: {hf_official_dev[0]}")
for raw_example in hf_official_dev:
assert raw_example["level"] == "hard"
example = {
Expand All @@ -149,19 +178,33 @@ def _check_or_download_dataset(
}
if "supporting_facts" in example:
example["gold_titles"] = set(example["supporting_facts"]["title"])
del example["supporting_facts"]

# del example["supporting_facts"]
test.append(example)

keys = ["id", "question", "answer", "gold_titles"]
data_path_dir = os.path.dirname(data_path)
keys = ["id", "question", "answer", "gold_titles", "context"]

# split test into val and test
# randomly shuffle the test set
rng.shuffle(test)
test_split = test[: len(test) * 50 // 100] # 3702
val_split = test[len(test) * 50 // 100 :] # 3703

# save to csv
for split, examples in zip(
["train", "val", "test"],
[sampled_trainset, sampled_valset, test],
[sampled_trainset, val_split, test_split],
):
# target_path = prepare_dataset_path(self.root, task_name, split)
target_path = os.path.join(data_path_dir, f"{split}.csv")
save_csv(examples, f=target_path, fieldnames=keys)
target_path = os.path.join(data_path_dir, f"{split}.json")
# filter the examples with only the keys
save_examples = []
for example in examples:
save_example = {k: example[k] for k in keys if k in example}
save_examples.append(save_example)
save_json(save_examples, f=target_path)
if split == "train":
print(f"train example: {examples[0]}")
print(f"saved {split} to {target_path}")

if split == "train":
Expand Down Expand Up @@ -190,3 +233,107 @@ def __len__(self):
print(len(testdataset))
print(f"valdataset[0]: {valdataset[0]}")
print(f"testdataset[0]: {testdataset[0]}")
# example = {
# "id": "5a8b57f25542995d1e6f1371",
# "question": "Were Scott Derrickson and Ed Wood of the same nationality?",
# "answer": "yes",
# "type": "comparison",
# "level": "hard",
# "supporting_facts": {
# "title": ["Scott Derrickson", "Ed Wood"],
# "sent_id": [0, 0],
# },
# "context": {
# "title": [
# "Adam Collis",
# "Ed Wood (film)",
# "Tyler Bates",
# "Doctor Strange (2016 film)",
# "Hellraiser: Inferno",
# "Sinister (film)",
# "Deliver Us from Evil (2014 film)",
# "Woodson, Arkansas",
# "Conrad Brooks",
# "The Exorcism of Emily Rose",
# ],
# "sentences": [
# [
# "Adam Collis is an American filmmaker and actor.",
# " He attended the Duke University from 1986 to 1990 and the University of California, Los Angeles from 2007 to 2010.",
# " He also studied cinema at the University of Southern California from 1991 to 1997.",
# ' Collis first work was the assistant director for the Scott Derrickson\'s short "Love in the Ruins" (1995).',
# ' In 1998, he played "Crankshaft" in Eric Koyanagi\'s "Hundred Percent".',
# ],
# [
# "Ed Wood is a 1994 American biographical period comedy-drama film directed and produced by Tim Burton, and starring Johnny Depp as cult filmmaker Ed Wood.",
# " The film concerns the period in Wood's life when he made his best-known films as well as his relationship with actor Bela Lugosi, played by Martin Landau.",
# " Sarah Jessica Parker, Patricia Arquette, Jeffrey Jones, Lisa Marie, and Bill Murray are among the supporting cast.",
# ],
# [
# "Tyler Bates (born June 5, 1965) is an American musician, music producer, and composer for films, television, and video games.",
# ' Much of his work is in the action and horror film genres, with films like "Dawn of the Dead, 300, Sucker Punch," and "John Wick."',
# " He has collaborated with directors like Zack Snyder, Rob Zombie, Neil Marshall, William Friedkin, Scott Derrickson, and James Gunn.",
# ' With Gunn, he has scored every one of the director\'s films; including "Guardians of the Galaxy", which became one of the highest grossing domestic movies of 2014, and its 2017 sequel.',
# ' In addition, he is also the lead guitarist of the American rock band Marilyn Manson, and produced its albums "The Pale Emperor" and "Heaven Upside Down".',
# ],
# [
# "Doctor Strange is a 2016 American superhero film based on the Marvel Comics character of the same name, produced by Marvel Studios and distributed by Walt Disney Studios Motion Pictures.",
# " It is the fourteenth film of the Marvel Cinematic Universe (MCU).",
# " The film was directed by Scott Derrickson, who wrote it with Jon Spaihts and C. Robert Cargill, and stars Benedict Cumberbatch as Stephen Strange, along with Chiwetel Ejiofor, Rachel McAdams, Benedict Wong, Michael Stuhlbarg, Benjamin Bratt, Scott Adkins, Mads Mikkelsen, and Tilda Swinton.",
# ' In "Doctor Strange", surgeon Strange learns the mystic arts after a career-ending car accident.',
# ],
# [
# "Hellraiser: Inferno (also known as Hellraiser V: Inferno) is a 2000 American horror film.",
# ' It is the fifth installment in the "Hellraiser" series and the first "Hellraiser" film to go straight-to-DVD.',
# " It was directed by Scott Derrickson and released on October 3, 2000.",
# " The film concerns a corrupt detective who discovers Lemarchand's box at a crime scene.",
# " The film's reviews were mixed.",
# ],
# [
# "Sinister is a 2012 supernatural horror film directed by Scott Derrickson and written by Derrickson and C. Robert Cargill.",
# " It stars Ethan Hawke as fictional true-crime writer Ellison Oswalt who discovers a box of home movies in his attic that puts his family in danger.",
# ],
# [
# "Deliver Us from Evil is a 2014 American supernatural horror film directed by Scott Derrickson and produced by Jerry Bruckheimer.",
# ' The film is officially based on a 2001 non-fiction book entitled "Beware the Night" by Ralph Sarchie and Lisa Collier Cool, and its marketing campaign highlighted that it was "inspired by actual accounts".',
# " The film stars Eric Bana, Édgar Ramírez, Sean Harris, Olivia Munn, and Joel McHale in the main roles and was released on July 2, 2014.",
# ],
# [
# "Woodson is a census-designated place (CDP) in Pulaski County, Arkansas, in the United States.",
# " Its population was 403 at the 2010 census.",
# " It is part of the Little Rock–North Little Rock–Conway Metropolitan Statistical Area.",
# " Woodson and its accompanying Woodson Lake and Wood Hollow are the namesake for Ed Wood Sr., a prominent plantation owner, trader, and businessman at the turn of the 20th century.",
# " Woodson is adjacent to the Wood Plantation, the largest of the plantations own by Ed Wood Sr.",
# ],
# [
# "Conrad Brooks (born Conrad Biedrzycki on January 3, 1931 in Baltimore, Maryland) is an American actor.",
# " He moved to Hollywood, California in 1948 to pursue a career in acting.",
# ' He got his start in movies appearing in Ed Wood films such as "Plan 9 from Outer Space", "Glen or Glenda", and "Jail Bait."',
# " He took a break from acting during the 1960s and 1970s but due to the ongoing interest in the films of Ed Wood, he reemerged in the 1980s and has become a prolific actor.",
# " He also has since gone on to write, produce and direct several films.",
# ],
# [
# "The Exorcism of Emily Rose is a 2005 American legal drama horror film directed by Scott Derrickson and starring Laura Linney and Tom Wilkinson.",
# " The film is loosely based on the story of Anneliese Michel and follows a self-proclaimed agnostic who acts as defense counsel (Linney) representing a parish priest (Wilkinson), accused by the state of negligent homicide after he performed an exorcism.",
# ],
# ],
# },
# }

# # save to csv
# keys = ["id", "question", "answer", "gold_titles", "context"]
# example["gold_titles"] = set(example["supporting_facts"]["title"])

# # test, save to hotpotQA

# data = HotPotQAData.from_dict({k: example[k] for k in keys})
# print(f"data: {data}")

# # save to json
# save_json([data.to_dict()], f="test.json")

# # load from json
# loaded_data = load_json("test.json")
# # convert to dataclass
# data = HotPotQAData.from_dict(loaded_data[0])
# print(f"data: {data}")
Loading

0 comments on commit 0ad2490

Please sign in to comment.