Save pass@k result & use custom tokenizer #20


Merged (5 commits) on Jul 9, 2024

26 changes: 26 additions & 0 deletions bigcodebench/evaluate.py
@@ -277,6 +277,32 @@ def stucking_checker():
if not os.path.isfile(result_path):
    with open(result_path, "w") as f:
        json.dump(results, f, indent=2)

Collaborator

Maybe adding if not os.path.isfile(pass_at_k_path):?

Collaborator

Actually, I think a better way is to check if at least the Pass@1 scores are the same and decide whether we need to rewrite the result_path and pass_at_k_path. Wdyt?
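
For illustration, the suggestion amounts to roughly the following sketch (the helper name is hypothetical and a "pass@1" key is assumed; the code that was eventually merged, shown below, instead compares every saved key and asks before overwriting):

import json
import os

def maybe_save_pass_at_k(pass_at_k: dict, pass_at_k_path: str) -> None:
    # Sketch of the suggestion: only rewrite the cached file when the
    # pass@1 score differs from the previously saved run.
    if os.path.isfile(pass_at_k_path):
        with open(pass_at_k_path, "r") as f:
            saved = json.load(f)
        if saved.get("pass@1") == pass_at_k.get("pass@1"):
            return  # same score as the cached run; keep the existing files
    with open(pass_at_k_path, "w") as f:
        json.dump(pass_at_k, f, indent=2)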

Contributor Author

Yes, sounds good.

Collaborator

Thanks! I'll merge the PR after your update.

Contributor Author

Done

pass_at_k_path = result_path.replace("_eval_results.json", "_pass_at_k.json")
pass_at_k["model"] = flags.samples.split("/")[-1].replace(".jsonl", "")
pass_at_k["subset"] = flags.subset

def save_pass_at_k():
    with open(pass_at_k_path, "w") as f:
        json.dump(pass_at_k, f, indent=2)

if os.path.isfile(pass_at_k_path):
    saved_pass_at_k = json.load(open(pass_at_k_path, "r"))
    # compare saved_pass_at_k with pass_at_k
    for k in saved_pass_at_k.keys():
        if pass_at_k[k] != saved_pass_at_k[k]:
            cprint(f"Warning: {k} is different from the saved one", "yellow")

    # ask user whether to save the pass@k
    decision = ""
    while decision.lower() not in ["y", "n"]:
        print(f"Save pass@k to {pass_at_k_path}? [Y/N]")
        decision = input()
    if decision.lower() == "y":
        save_pass_at_k()

else:
    save_pass_at_k()


def main():
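For context, each *_eval_results.json now gets a small *_pass_at_k.json sibling. A rough way to inspect it (the filename below is hypothetical, and the exact metric keys depend on which k values were evaluated):

import json

# Hypothetical path; the real name mirrors the corresponding *_eval_results.json file.
with open("my_model--complete_pass_at_k.json", "r") as f:
    pass_at_k = json.load(f)

# Expect something like {"pass@1": 0.43, "model": "my_model--complete", "subset": "complete"}
# (illustrative values only).
print(pass_at_k)
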
9 changes: 6 additions & 3 deletions bigcodebench/generate.py
@@ -35,7 +35,7 @@ def codegen(

if model.is_direct_completion() and subset == "instruct":
    raise Exception("Base model does not support direct completion for instruct tasks")

# create save_path if it doesn't exist, e.g., a/b.jsonl
dirname = os.path.dirname(save_path)
if not os.path.exists(dirname) and dirname != "":
@@ -118,6 +118,8 @@ def main():
parser.add_argument("--base_url", default=None, type=str)
parser.add_argument("--tp", default=1, type=int)
parser.add_argument("--trust_remote_code", action="store_true")
parser.add_argument("--tokenizer_name", default=None, type=str)

args = parser.parse_args()


@@ -145,7 +147,8 @@ def main():
temperature=args.temperature,
base_url=args.base_url,
tp=args.tp,
- trust_remote_code=args.trust_remote_code
+ trust_remote_code=args.trust_remote_code,
+ tokenizer_name=args.tokenizer_name
)

if not args.save_path:
@@ -161,7 +164,7 @@ def main():
strip_newlines=args.strip_newlines,
n_samples=args.n_samples,
resume=args.resume,
- id_range=args.id_range,
+ id_range=args.id_range
)


22 changes: 16 additions & 6 deletions bigcodebench/model.py
@@ -92,6 +92,7 @@ def __init__(
max_new_tokens: int = 1280,
dtype: str = "bfloat16", # default
trust_remote_code: bool = False,
tokenizer_name: str = None,
) -> None:
print("Initializing a decoder model: {} ...".format(name))
self.name = name
@@ -102,6 +103,7 @@ def __init__(
self.max_new_tokens = max_new_tokens
self.dtype = dtype
self.trust_remote_code = trust_remote_code
self.tokenizer_name = tokenizer_name

@abstractmethod
def codegen(
@@ -129,11 +131,13 @@ def __init__(self, name: str, dataset: str, tp: int, **kwargs) -> None:
"dtype": self.dtype,
"trust_remote_code": self.trust_remote_code,
}

- self.tokenizer = AutoTokenizer.from_pretrained(self.name, **kwargs)
+ if self.tokenizer_name is None:
+     self.tokenizer_name = self.name
+
+ self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name, **kwargs)
if self.tokenizer.chat_template is None:
self.eos += extra_eos_for_direct_completion(dataset)
- self.llm = LLM(model=name, max_model_len=2048, **kwargs)
+ self.llm = LLM(model=name, max_model_len=2048, tokenizer=self.tokenizer_name, **kwargs)

def is_direct_completion(self) -> bool:
return self.tokenizer.chat_template is None
@@ -185,9 +189,12 @@ def __init__(self, name: str, dataset: str, **kwargs):
kwargs["torch_dtype"] = getattr(torch, self.dtype)
self.skip_special_tokens = True

print(f"{kwargs = }")
print(f"{kwargs = }", self.tokenizer_name)

if self.tokenizer_name is None:
self.tokenizer_name = self.name

self.tokenizer = AutoTokenizer.from_pretrained(name, **kwargs)
self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name, **kwargs)
if self.tokenizer.chat_template is None:
self.eos += extra_eos_for_direct_completion(dataset)

@@ -253,7 +260,7 @@ def __init__(self, name: str, **kwargs):
super().__init__(name=name, **kwargs)
self.eos += ["\n```\n"]
print(f"EOS strings: {self.eos}")
- self.tokenizer = AutoTokenizer.from_pretrained(self.name, **kwargs)
+ self.tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name if self.tokenizer_name else self.name, **kwargs)

def codegen(
self, prompt: str, do_sample: bool = True, num_samples: int = 200
@@ -486,6 +493,7 @@ def make_model(
tp=1,
base_url=None,
trust_remote_code=False,
tokenizer_name=None,
):
if backend == "vllm":
return GeneralVllmDecoder(
@@ -495,6 +503,7 @@ def make_model(
dataset=dataset,
tp=tp,
trust_remote_code=trust_remote_code,
tokenizer_name=tokenizer_name,
)
elif backend == "hf":
return GenenralHfTorchDecoder(
@@ -503,6 +512,7 @@ def make_model(
temperature=temperature,
dataset=dataset,
trust_remote_code=trust_remote_code,
tokenizer_name=tokenizer_name,
)
elif backend == "openai":
return OpenAIChatDecoder(
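
Taken together, the three decoders above now share the same fallback: when no --tokenizer_name is given, the model name itself is used for the tokenizer. A standalone sketch of that pattern (hypothetical helper name; requires the transformers package):

from transformers import AutoTokenizer

def load_tokenizer(model_name: str, tokenizer_name: str = None, **kwargs):
    # Use the override when given, otherwise fall back to the model's own tokenizer.
    resolved = tokenizer_name if tokenizer_name is not None else model_name
    return AutoTokenizer.from_pretrained(resolved, **kwargs)

In the vLLM path the resolved name is also forwarded via LLM(..., tokenizer=self.tokenizer_name, ...), so generation uses the same tokenizer as the chat-template check.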