From 10090f1393799cb064957e8bd41c09da19cdf0ce Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Sun, 1 Sep 2024 01:58:36 +0800 Subject: [PATCH 01/27] feat: add tool-use eval --- bigcodebench/evaluate.py | 34 +++++++++++++++++++++++----------- bigcodebench/generate.py | 13 +++++++++---- 2 files changed, 32 insertions(+), 15 deletions(-) diff --git a/bigcodebench/evaluate.py b/bigcodebench/evaluate.py index 61e2a43f..14580a32 100644 --- a/bigcodebench/evaluate.py +++ b/bigcodebench/evaluate.py @@ -34,7 +34,7 @@ Result = Tuple[str, List[bool]] -def get_groundtruth(n_workers, problems, hashcode, check_gt_only, max_as_limit, max_data_limit, max_stack_limit, min_time_limit): +def get_groundtruth(subset, n_workers, problems, hashcode, check_gt_only, max_as_limit, max_data_limit, max_stack_limit, min_time_limit): cache_file = os.path.join(CACHE_DIR, f"{hashcode}.pkl") if os.path.exists(cache_file): if check_gt_only: @@ -54,8 +54,12 @@ def get_groundtruth(n_workers, problems, hashcode, check_gt_only, max_as_limit, expected_time = dict() for problem in problems.values(): + if subset == "tool": + code = problem["canonical_solution"] + else: + code = problem[f"{split}_prompt"] + "\n" + problem["canonical_solution"] args = ( - problem["complete_prompt"] + "\n" + problem["canonical_solution"], + code, problem["test"], problem["task_id"], max_as_limit, @@ -130,7 +134,7 @@ def evaluate(flags): dataset_hash = get_bigcodebench_hash(subset=flags.subset) if not flags.no_gt: - expected_time = get_groundtruth(n_workers, problems, dataset_hash, flags.check_gt_only, flags.max_as_limit, flags.max_data_limit, flags.max_stack_limit, flags.min_time_limit) + expected_time = get_groundtruth(flags.subset, n_workers, problems, dataset_hash, flags.check_gt_only, flags.max_as_limit, flags.max_data_limit, flags.max_stack_limit, flags.min_time_limit) else: expected_time = {task_id: None for task_id in problems} @@ -177,13 +181,21 @@ def evaluate(flags): f"Task {task_id} is found in the samples but not found in the dataset" ) continue - solution = ( - sample["solution"] + + if flags.subset == "tool": + solution = (problems[task_id][f"{flags.split}_prompt"] + "\n\n" + + problems[task_id]["solution"] + if "solution" in sample + else problems[task_id]["complete_prompt"] + sample["completion"] + + "\n\n" + problems[task_id][f"{flags.split}_tool_implementation"]) + else: + solution = ( + sample["solution"] if "solution" in sample - else problems[task_id]["complete_prompt"] + sample["completion"] - ) - if "sanitized-calibrated" in flags.samples: - solution = problems[task_id]["code_prompt"] + "\n pass\n" + solution + else problems[task_id]["complete_prompt"] + sample["completion"] + ) + if "sanitized-calibrated" in flags.samples: + solution = problems[task_id]["code_prompt"] + "\n pass\n" + solution remainings.add(sample["_identifier"]) args = ( completion_id[task_id], @@ -323,9 +335,9 @@ def save_pass_at_k(): def main(): parser = argparse.ArgumentParser() parser.add_argument( - "--split", required=True, type=str, choices=["complete", "instruct"] + "--split", required=True, type=str, choices=["complete", "instruct", "positive", "negative", "mixed"] ) - parser.add_argument("--subset", default="hard", type=str, choices=["full", "hard"]) + parser.add_argument("--subset", default="hard", type=str, choices=["full", "hard", "tool"]) parser.add_argument("--samples", required=True, type=str) parser.add_argument("--save_pass_rate", action="store_true") parser.add_argument("--parallel", default=None, type=int) diff --git 
a/bigcodebench/generate.py b/bigcodebench/generate.py index 679300cb..e1ecc592 100644 --- a/bigcodebench/generate.py +++ b/bigcodebench/generate.py @@ -37,6 +37,8 @@ def codegen( if model.is_direct_completion() and split == "instruct": raise Exception("Base model does not support direct completion for instruct tasks") + if subset == "tool": + assert split in ["positive", "negative", "mixed"], "Tool subset only supports positive, negative, and mixed split" # create save_path if it doesn't exist, e.g., a/b.jsonl dirname = os.path.dirname(save_path) if not os.path.exists(dirname) and dirname != "": @@ -70,9 +72,12 @@ def codegen( sidx = n_samples - nsamples while sidx < n_samples: try: - prompt = task[f"{split}_prompt"] + if split == "tool": + prompt = task[f"{split}_tool"] + "\n\n" + task["complete_prompt"] + else: + prompt = task[f"{split}_prompt"] except: - raise Exception(f"Invalid split {split}") + raise Exception(f"Invalid split {split} for BigCodeBench-{subset}") if strip_newlines: prompt = prompt.strip("\n") outputs = model.codegen( @@ -105,8 +110,8 @@ def codegen( def main(): parser = argparse.ArgumentParser() parser.add_argument("--model", required=True, type=str) - parser.add_argument("--split", required=True, type=str, choices=["complete", "instruct"]) - parser.add_argument("--subset", default="full", type=str, choices=["full", "hard"]) + parser.add_argument("--split", required=True, type=str, choices=["complete", "instruct", "positive", "negative", "mixed"]) + parser.add_argument("--subset", default="full", type=str, choices=["full", "hard", "tool"]) parser.add_argument("--save_path", default=None, type=str) parser.add_argument("--bs", default=1, type=int) parser.add_argument("--n_samples", default=1, type=int) From 3641cfdcbeb5f0a4143d8f9edf7cf8a62612afba Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Sun, 1 Sep 2024 02:05:01 +0800 Subject: [PATCH 02/27] fix: rm split --- bigcodebench/evaluate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigcodebench/evaluate.py b/bigcodebench/evaluate.py index 14580a32..ab86aab9 100644 --- a/bigcodebench/evaluate.py +++ b/bigcodebench/evaluate.py @@ -57,7 +57,7 @@ def get_groundtruth(subset, n_workers, problems, hashcode, check_gt_only, max_as if subset == "tool": code = problem["canonical_solution"] else: - code = problem[f"{split}_prompt"] + "\n" + problem["canonical_solution"] + code = problem["code_prompt"] + "\n" + problem["canonical_solution"] args = ( code, problem["test"], From d616f0e2fc53c15b9fdbf7ab0e470b24758c0831 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Sun, 1 Sep 2024 02:13:34 +0800 Subject: [PATCH 03/27] fix check --- bigcodebench/data/utils.py | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/bigcodebench/data/utils.py b/bigcodebench/data/utils.py index fa91abe5..01bc0a2c 100644 --- a/bigcodebench/data/utils.py +++ b/bigcodebench/data/utils.py @@ -149,15 +149,30 @@ def write_directory(directory: PathLike, data: Iterable[Dict]): def completeness_check(name, data): for task_id, task in data.items(): - for key in [ - "complete_prompt", - "instruct_prompt", - "canonical_solution", - "code_prompt", - "test", - "entry_point" - ]: - assert key in task, f"{key} not found in {name} #{task_id}!" + try: + for key in [ + "complete_prompt", + "instruct_prompt", + "canonical_solution", + "code_prompt", + "test", + "entry_point" + ]: + assert key in task, f"{key} not found in {name} #{task_id}!" 
+ except Exception as e: + for key in [ + "complete_prompt", + "positive_tool", + "negative_tool", + "mixed_tool", + "positive_tool_implementation", + "negative_tool_implementation", + "mixed_tool_implementation", + "canonical_solution", + "test", + "entry_point" + ]: + assert key in task, f"{key} not found in {name} #{task_id}!" def to_raw(string): From beaec382edae4df5dd89cd1c2afe744e8b910bd6 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Sun, 1 Sep 2024 20:40:34 +0800 Subject: [PATCH 04/27] feat: add tool-use gen prompt --- bigcodebench/model.py | 98 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 82 insertions(+), 16 deletions(-) diff --git a/bigcodebench/model.py b/bigcodebench/model.py index c5093b0f..ff69c254 100644 --- a/bigcodebench/model.py +++ b/bigcodebench/model.py @@ -55,18 +55,31 @@ def extra_eos_for_direct_completion(dataset) -> List[str]: _MAGIC_SPLITTER_ = "-[[]]-this-is-really-our-highest-priority-[[]]-" -def make_chat_prompt(prompt: str, tokenizer: AutoTokenizer) -> str: +def make_chat_prompt(prompt: str, subset: str, split: str, tokenizer: AutoTokenizer) -> str: # directly return prompt if it does not have a tokenizer.chat_template if tokenizer.chat_template is None: return prompt - prompt = f"""\ + if subset == "tool": + prompt = f"""\ +Based on the given customized modules, please provide a self-contained answer (e.g., Python script or refusal) that solves the following problem in a markdown code block without using any imports: +``` +{prompt.strip()} +``` +""" + response = f"""\ +Below is a answer (e.g., Python script or refusal) that solves the problem: +{_MAGIC_SPLITTER_} +""" + else: + prompt = f"""\ Please provide a self-contained Python script that solves the following problem in a markdown code block: ``` {prompt.strip()} ``` """ - response = f"""\ + + response = f"""\ Below is a Python script with a self-contained function that solves the problem and passes corresponding tests: ```python {_MAGIC_SPLITTER_} @@ -86,6 +99,8 @@ class DecoderBase(ABC): def __init__( self, name: str, + subset: str, + split: str, batch_size: int = 1, temperature: float = 0.8, max_new_tokens: int = 1280, @@ -96,6 +111,8 @@ def __init__( ) -> None: print("Initializing a decoder model: {} ...".format(name)) self.name = name + self.subset = subset + self.split = split self.batch_size = batch_size self.temperature = temperature self.eos = EOS @@ -175,7 +192,7 @@ def __init__(self, name: str, **kwargs) -> None: def codegen( self, prompt: str, do_sample: bool = True, num_samples: int = 200 ) -> List[str]: - prompt = make_chat_prompt(prompt, self.tokenizer) + prompt = make_chat_prompt(prompt, self.subset, self.split, self.tokenizer) return VllmDecoder.codegen(self, prompt, do_sample, num_samples) @@ -259,7 +276,7 @@ def __init__(self, name: str, **kwargs): def codegen( self, prompt: str, do_sample: bool = True, num_samples: int = 200 ) -> List[str]: - prompt = make_chat_prompt(prompt, self.tokenizer) + prompt = make_chat_prompt(prompt, self.subset, self.split, self.tokenizer) return HfTorchDecoder.codegen(self, prompt, do_sample, num_samples) @@ -277,10 +294,16 @@ def codegen( # construct prompt fmt = "json_object" if self.name == "gpt-4-1106-preview" else "text" - if fmt == "json_object": - message = r'Please complete the following code snippet by generating JSON like {"code": ""}' + if self.subset == "tool": + if fmt == "json_object": + message = r'Based on the given customized modules, please complete the following code snippet without using any imports by generating JSON 
like {"code": ""}' + else: + message = r"Based on the given customized modules, please provide a self-contained answer (e.g., Python script or refusal) that solves the following problem in a markdown code block without using any imports:" else: - message = r"Please generate self-contained code to complete the following problem:" + if fmt == "json_object": + message = r'Please complete the following code snippet by generating JSON like {"code": ""}' + else: + message = r"Please generate self-contained code to complete the following problem in a markdown code block:" message += f"\n```python\n{prompt.strip()}\n```" @@ -335,14 +358,29 @@ def codegen( batch_size = min(self.batch_size, num_samples) outputs = [] + + if self.subset == "tool": + message = f"""\ +Based on the given customized modules, please provide a self-contained answer (e.g., Python script or refusal) that solves the following problem in a markdown code block without using any imports: +``` +{prompt.strip()} +``` +""" + else: + message = f"""\ +Please provide a self-contained Python script that solves the following problem in a markdown code block: +``` +{prompt.strip()} +``` +""" + for _ in range(batch_size): ret = self.client.chat( model=self.name, messages=[ ChatMessage( role="user", - content="Please generate self-contained code to solve the following problem in a Python markdown block:" - + f"\n```python\n{prompt.strip()}\n```", + content=message, ) ], max_tokens=self.max_new_tokens, @@ -383,15 +421,28 @@ def codegen( assert batch_size == 1, "Sampling only supports batch size of 1" outputs = [] - for _ in range(batch_size): - message = anthropic_request.make_auto_request( + if self.subset == "tool": + message = f"""\ +Based on the given customized modules, please provide a self-contained answer (e.g., Python script or refusal) that solves the following problem in a markdown code block without using any imports: +``` +{prompt.strip()} +``` +""" + else: + message = f"""\ +Please provide a self-contained Python script that solves the following problem in a markdown code block: +``` +{prompt.strip()} +``` +""" + + ret = anthropic_request.make_auto_request( client=self.client, model=self.name, messages=[ { "role": "user", - "content": "Please generate self-contained code to complete the following problem wrapped in a Python markdown block:" - + f"\n```python\n{prompt.strip()}\n```\n", + "content": message, } ], max_tokens=self.max_new_tokens, @@ -459,12 +510,27 @@ def codegen( model = genai.GenerativeModel(model_name=self.name, generation_config=genai_config, safety_settings=safety_settings) outputs = [] + + if self.subset == "tool": + message = f"""\ +Based on the given customized modules, please provide a self-contained answer (e.g., Python script or refusal) that solves the following problem in a markdown code block without using any imports: +``` +{prompt.strip()} +``` +""" + else: + message = f"""\ +Please provide a self-contained Python script that solves the following problem in a markdown code block: +``` +{prompt.strip()} +``` +""" + for _ in range(batch_size): while True: try: response = model.generate_content( - "Please generate self-contained code to complete the following problem wrapped in a Python markdown block:" - + f"\n```python\n{prompt.strip()}\n```", + message, generation_config=genai_config ) output = response.candidates[0].content.parts[0].text From 4ef44cae379f31fc6f9a0b64d281582b499948ad Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Sun, 1 Sep 2024 20:57:12 +0800 Subject: [PATCH 05/27] fix anthropic 
gen --- bigcodebench/model.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bigcodebench/model.py b/bigcodebench/model.py index ff69c254..8d6e9535 100644 --- a/bigcodebench/model.py +++ b/bigcodebench/model.py @@ -435,9 +435,9 @@ def codegen( {prompt.strip()} ``` """ - - ret = anthropic_request.make_auto_request( - client=self.client, + for _ in range(batch_size): + ret = anthropic_request.make_auto_request( + client=self.client, model=self.name, messages=[ { @@ -449,7 +449,7 @@ def codegen( stop_sequences=["\n```\n", "\nif "], **kwargs, ) - outputs.append(message.content[0].text) + outputs.append(ret.content[0].text) return outputs From d001ad48f6869fc0b68d01a554cc8a301eace1e5 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Sun, 1 Sep 2024 20:59:02 +0800 Subject: [PATCH 06/27] fix args --- bigcodebench/generate.py | 2 ++ bigcodebench/model.py | 14 ++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/bigcodebench/generate.py b/bigcodebench/generate.py index e1ecc592..da0dfe39 100644 --- a/bigcodebench/generate.py +++ b/bigcodebench/generate.py @@ -145,6 +145,8 @@ def main(): model_runner = make_model( model=args.model, backend=args.backend, + subset=args.subset, + split=args.split, batch_size=args.bs, temperature=args.temperature, base_url=args.base_url, diff --git a/bigcodebench/model.py b/bigcodebench/model.py index 8d6e9535..f704709c 100644 --- a/bigcodebench/model.py +++ b/bigcodebench/model.py @@ -551,6 +551,8 @@ def codegen( def make_model( model: str, backend: str, + subset: str, + split: str, dataset: str = "bigcodebench", batch_size: int = 1, temperature: float = 0.0, @@ -563,6 +565,8 @@ def make_model( if backend == "vllm": return GeneralVllmDecoder( name=model, + subset=subset, + split=split, batch_size=batch_size, temperature=temperature, dataset=dataset, @@ -574,6 +578,8 @@ def make_model( elif backend == "hf": return GenenralHfTorchDecoder( name=model, + subset=subset, + split=split, batch_size=batch_size, temperature=temperature, dataset=dataset, @@ -584,6 +590,8 @@ def make_model( elif backend == "openai": return OpenAIChatDecoder( name=model, + subset=subset, + split=split, batch_size=batch_size, temperature=temperature, base_url=base_url, @@ -591,18 +599,24 @@ def make_model( elif backend == "mistral": return MistralChatDecoder( name=model, + subset=subset, + split=split, batch_size=batch_size, temperature=temperature, ) elif backend == "anthropic": return AnthropicMessageDecoder( name=model, + subset=subset, + split=split, batch_size=batch_size, temperature=temperature, ) elif backend == "google": return GeminiDecoder( name=model, + subset=subset, + split=split, batch_size=batch_size, temperature=temperature, ) \ No newline at end of file From f5171f6466adb267bf55f3dd118b686d983b00f4 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Sun, 1 Sep 2024 21:00:37 +0800 Subject: [PATCH 07/27] fix subset check --- bigcodebench/generate.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bigcodebench/generate.py b/bigcodebench/generate.py index da0dfe39..729b93ed 100644 --- a/bigcodebench/generate.py +++ b/bigcodebench/generate.py @@ -72,12 +72,12 @@ def codegen( sidx = n_samples - nsamples while sidx < n_samples: try: - if split == "tool": + if subset == "tool": prompt = task[f"{split}_tool"] + "\n\n" + task["complete_prompt"] else: prompt = task[f"{split}_prompt"] except: - raise Exception(f"Invalid split {split} for BigCodeBench-{subset}") + raise Exception(f"Invalid split {split} for bigcodebench-{subset}") if 
strip_newlines: prompt = prompt.strip("\n") outputs = model.codegen( From b3be294c2a325ba3aa7fbf5a981fd1b8ce2f6d69 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Mon, 2 Sep 2024 02:07:15 +0800 Subject: [PATCH 08/27] fix prompts --- bigcodebench/model.py | 26 +++++++++++-------- bigcodebench/sanitize.py | 23 ++++++++++++----- bigcodebench/syncheck.py | 55 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 86 insertions(+), 18 deletions(-) diff --git a/bigcodebench/model.py b/bigcodebench/model.py index f704709c..9830055e 100644 --- a/bigcodebench/model.py +++ b/bigcodebench/model.py @@ -62,15 +62,13 @@ def make_chat_prompt(prompt: str, subset: str, split: str, tokenizer: AutoTokeni if subset == "tool": prompt = f"""\ -Based on the given customized modules, please provide a self-contained answer (e.g., Python script or refusal) that solves the following problem in a markdown code block without using any imports: +You should only utilize the helper modules given in the markdown code block to provide a self-contained Python script that implements `task_func`. Assume all listed helper modules work as expected on the backend but are not visible for their implementation details. Call `refusal_func` if you cannot find proper helper modules in the code block to solve the task. + +Please complete `task_func` in a markdown code block without using any external library APIs or defining any modules: ``` {prompt.strip()} ``` """ - response = f"""\ -Below is a answer (e.g., Python script or refusal) that solves the problem: -{_MAGIC_SPLITTER_} -""" else: prompt = f"""\ Please provide a self-contained Python script that solves the following problem in a markdown code block: @@ -79,7 +77,7 @@ def make_chat_prompt(prompt: str, subset: str, split: str, tokenizer: AutoTokeni ``` """ - response = f"""\ + response = f"""\ Below is a Python script with a self-contained function that solves the problem and passes corresponding tests: ```python {_MAGIC_SPLITTER_} @@ -296,9 +294,9 @@ def codegen( fmt = "json_object" if self.name == "gpt-4-1106-preview" else "text" if self.subset == "tool": if fmt == "json_object": - message = r'Based on the given customized modules, please complete the following code snippet without using any imports by generating JSON like {"code": ""}' + message = r'Based on the given customized modules, please complete the following code snippet by generating JSON like {"code": ""} without using any external library APIs or defining any modules' else: - message = r"Based on the given customized modules, please provide a self-contained answer (e.g., Python script or refusal) that solves the following problem in a markdown code block without using any imports:" + message = r"You should only utilize the helper modules given in the markdown code block to provide a self-contained Python script that implements `task_func`. Assume all listed helper modules work as expected on the backend but are not visible for their implementation details. 
Call `refusal_func` if you cannot find proper helper modules in the code block to solve the task.\n\nPlease complete `task_func` in a markdown code block without using any external library APIs or defining any modules:" else: if fmt == "json_object": message = r'Please complete the following code snippet by generating JSON like {"code": ""}' @@ -361,7 +359,9 @@ def codegen( if self.subset == "tool": message = f"""\ -Based on the given customized modules, please provide a self-contained answer (e.g., Python script or refusal) that solves the following problem in a markdown code block without using any imports: +You should only utilize the helper modules given in the markdown code block to provide a self-contained Python script that implements `task_func`. Assume all listed helper modules work as expected on the backend but are not visible for their implementation details. Call `refusal_func` if you cannot find proper helper modules in the code block to solve the task. + +Please complete `task_func` in a markdown code block without using any external library APIs or defining any modules: ``` {prompt.strip()} ``` @@ -423,7 +423,9 @@ def codegen( outputs = [] if self.subset == "tool": message = f"""\ -Based on the given customized modules, please provide a self-contained answer (e.g., Python script or refusal) that solves the following problem in a markdown code block without using any imports: +You should only utilize the helper modules given in the markdown code block to provide a self-contained Python script that implements `task_func`. Assume all listed helper modules work as expected on the backend but are not visible for their implementation details. Call `refusal_func` if you cannot find proper helper modules in the code block to solve the task. + +Please complete `task_func` in a markdown code block without using any external library APIs or defining any modules: ``` {prompt.strip()} ``` @@ -513,7 +515,9 @@ def codegen( if self.subset == "tool": message = f"""\ -Based on the given customized modules, please provide a self-contained answer (e.g., Python script or refusal) that solves the following problem in a markdown code block without using any imports: +You should only utilize the helper modules given in the markdown code block to provide a self-contained Python script that implements `task_func`. Assume all listed helper modules work as expected on the backend but are not visible for their implementation details. Call `refusal_func` if you cannot find proper helper modules in the code block to solve the task. 
+ +Please complete `task_func` in a markdown code block without using any external library APIs or defining any modules: ``` {prompt.strip()} ``` diff --git a/bigcodebench/sanitize.py b/bigcodebench/sanitize.py index df9ed4eb..f120b887 100644 --- a/bigcodebench/sanitize.py +++ b/bigcodebench/sanitize.py @@ -15,7 +15,7 @@ write_directory, write_jsonl, ) -from bigcodebench.syncheck import syntax_check +from bigcodebench.syncheck import syntax_check, api_check CLASS_TYPE = "class_definition" FUNCTION_TYPE = "function_definition" @@ -183,6 +183,7 @@ def process_solution( sample_solution: Dict, dataset: Dict, entry_point: Dict, + check_lib: bool = False, debug_task: str = None, calibrate: bool = False, is_folder: bool = False, @@ -209,6 +210,9 @@ def process_solution( new_code = sanitize(code=old_code, entrypoint=function_name) + if check_lib: + new_code = "" if api_check(new_code) else new_code + # if old code and new code are different, print msg if new_code != old_code: msg = "Sanitized: " + dbg_identifier @@ -220,7 +224,7 @@ def process_solution( def script( - samples: str, inplace: bool = False, debug_task: str = None, calibrate: bool = False, parallel: int=32 + samples: str, check_lib: bool = False, inplace: bool = False, debug_task: str = None, calibrate: bool = False, parallel: int=32 ): # task_id -> entry_point entry_point = {} @@ -235,15 +239,19 @@ def script( target_path = pathlib.Path(samples) if not inplace: if is_folder: - if calibrate: - new_name = target_path.name + "-sanitized-calibrated" + if check_lib: + new_name = target_path.name + "-skip-lib" + elif calibrate: + new_name = new_name + "-sanitized-calibrated" else: - new_name = target_path.name + "-sanitized" + new_name = new_name + "-sanitized" else: + if check_lib: + new_name = target_path.name.replace(".jsonl", "-skip-lib.jsonl") if calibrate: - new_name = target_path.name.replace(".jsonl", "-sanitized-calibrated.jsonl") + new_name = new_name.replace(".jsonl", "-sanitized-calibrated.jsonl") else: - new_name = target_path.name.replace(".jsonl", "-sanitized.jsonl") + new_name = new_name.replace(".jsonl", "-sanitized.jsonl") target_path = target_path.parent / new_name target_path = str(target_path) @@ -257,6 +265,7 @@ def script( "sample_solution": sample_solution, "dataset": dataset, "entry_point": entry_point, + "check_lib": check_lib, "debug_task": debug_task, "calibrate": calibrate, "is_folder": is_folder, diff --git a/bigcodebench/syncheck.py b/bigcodebench/syncheck.py index 9ea97f4c..566afdbd 100755 --- a/bigcodebench/syncheck.py +++ b/bigcodebench/syncheck.py @@ -10,6 +10,61 @@ from bigcodebench.data import load_solutions +def api_check(code: str) -> bool: + tree = ast.parse(code) + imported_modules = set() + imported_names = {} + + class ApiExtractor(ast.NodeVisitor): + def __init__(self): + self.in_task_func = False + self.uses_library_api = False + + def visit_Import(self, node): + for alias in node.names: + imported_modules.add(alias.name) + if alias.asname: + imported_modules.add(alias.asname) + + def visit_ImportFrom(self, node): + if node.module: + for alias in node.names: + full_name = f'{node.module}.{alias.name}' + imported_names[alias.asname or alias.name] = full_name + + def visit_FunctionDef(self, node): + if node.name == 'task_func': + self.in_task_func = True + self.generic_visit(node) + self.in_task_func = False + else: + self.generic_visit(node) + + def visit_Attribute(self, node): + if self.in_task_func: + attr_chain = [] + current = node + while isinstance(current, ast.Attribute): + 
attr_chain.append(current.attr) + current = current.value + if isinstance(current, ast.Name): + attr_chain.append(current.id) + attr_chain.reverse() + full_name = '.'.join(attr_chain) + if attr_chain[0] in imported_modules or attr_chain[0] in imported_names: + self.uses_library_api = True + self.generic_visit(node) + + def visit_Name(self, node): + if self.in_task_func: + if node.id in imported_modules or node.id in imported_names: + self.uses_library_api = True + self.generic_visit(node) + + extractor = ApiExtractor() + extractor.visit(tree) + + return extractor.uses_library_api def syntax_check(code, verbose=False): try: From a3c8e15119a256470be4b55672455cb92a1e6057 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Mon, 2 Sep 2024 03:00:35 +0800 Subject: [PATCH 09/27] fix tool solution postprocess --- bigcodebench/evaluate.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/bigcodebench/evaluate.py b/bigcodebench/evaluate.py index ab86aab9..5dab748e 100644 --- a/bigcodebench/evaluate.py +++ b/bigcodebench/evaluate.py @@ -183,11 +183,12 @@ def evaluate(flags): continue if flags.subset == "tool": - solution = (problems[task_id][f"{flags.split}_prompt"] + "\n\n" - + problems[task_id]["solution"] + solution = (problems[task_id][f"complete_prompt"] + "\n\n" + + sample["solution"] if "solution" in sample else problems[task_id]["complete_prompt"] + sample["completion"] - + "\n\n" + problems[task_id][f"{flags.split}_tool_implementation"]) + ) + solution += "\n\n" + problems[task_id][f"{flags.split}_tool_implementation"] else: solution = ( sample["solution"] From 37125dcb39b127374a2cb616251a82e5a22ea3f9 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Mon, 2 Sep 2024 03:07:15 +0800 Subject: [PATCH 10/27] change eval print --- bigcodebench/evaluate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigcodebench/evaluate.py b/bigcodebench/evaluate.py index 5dab748e..ced61335 100644 --- a/bigcodebench/evaluate.py +++ b/bigcodebench/evaluate.py @@ -269,7 +269,7 @@ def stucking_checker(): mode = "-calibrated" if "sanitized-calibrated" in flags.samples else "" extra = flags.subset.capitalize() flags.split = flags.split.capitalize() - cprint(f"BigCodeBench-{flags.split}{mode} ({extra})", "green") + cprint(f"BigCodeBench-{extra}{mode} ({flags.split})", "green") if flags.no_gt: cprint(f"Groundtruth is not checked", "yellow") From fddd02a446d16cb8ffca001a0e4fd6080e827cc6 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Mon, 2 Sep 2024 03:17:28 +0800 Subject: [PATCH 11/27] fix skip module --- bigcodebench/sanitize.py | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/bigcodebench/sanitize.py b/bigcodebench/sanitize.py index f120b887..d44dc70a 100644 --- a/bigcodebench/sanitize.py +++ b/bigcodebench/sanitize.py @@ -108,7 +108,7 @@ def has_return_statement(node: Node) -> bool: return False -def sanitize(code: str, entrypoint: Optional[str] = None) -> str: +def sanitize(code: str, entrypoint: Optional[str] = None, skip_module: bool = False) -> str: code = code_extract(code.strip()) code_bytes = bytes(code, "utf8") parser = get_parser("python") @@ -116,11 +116,14 @@ def sanitize(code: str, entrypoint: Optional[str] = None) -> str: class_names = set() function_names = set() variable_names = set() - + reacheable = set() + root_node = tree.root_node import_nodes = [] definition_nodes = [] + task_func_found = not skip_module + for child in root_node.children: if child.type in IMPORT_TYPE: import_nodes.append(child) @@ 
-129,15 +132,19 @@ def sanitize(code: str, entrypoint: Optional[str] = None) -> str: if not ( name in class_names or name in variable_names or name in function_names ): - definition_nodes.append((name, child)) + if task_func_found: + definition_nodes.append((name, child)) class_names.add(name) elif child.type == FUNCTION_TYPE: name = get_definition_name(child) if not ( name in function_names or name in variable_names or name in class_names ): - definition_nodes.append((name, child)) - function_names.add(get_definition_name(child)) + if name == entrypoint: + task_func_found = True + if task_func_found: + definition_nodes.append((name, child)) + function_names.add(name) elif ( child.type == EXPRESSION_TYPE and child.children[0].type == ASSIGNMENT_TYPE ): @@ -146,7 +153,8 @@ def sanitize(code: str, entrypoint: Optional[str] = None) -> str: if not ( name in variable_names or name in function_names or name in class_names ): - definition_nodes.append((name, subchild)) + if task_func_found: + definition_nodes.append((name, subchild)) variable_names.add(name) if entrypoint: @@ -176,14 +184,17 @@ def sanitize(code: str, entrypoint: Optional[str] = None) -> str: outer_lines.append(i) if outer_lines: sanitized_output = "\n".join(lines[: outer_lines[-1]]) - return sanitized_output + if skip_module: + return "" if api_check(sanitized_output) else sanitized_output + else: + return sanitized_output def process_solution( sample_solution: Dict, dataset: Dict, entry_point: Dict, - check_lib: bool = False, + skip_module: bool = False, debug_task: str = None, calibrate: bool = False, is_folder: bool = False, @@ -209,9 +220,6 @@ def process_solution( old_code = old_code.replace("```python\n ", "```python\n"+dataset[task_id]["complete_prompt"]+" ") new_code = sanitize(code=old_code, entrypoint=function_name) - - if check_lib: - new_code = "" if api_check(new_code) else new_code # if old code and new code are different, print msg if new_code != old_code: @@ -224,7 +232,7 @@ def process_solution( def script( - samples: str, check_lib: bool = False, inplace: bool = False, debug_task: str = None, calibrate: bool = False, parallel: int=32 + samples: str, skip_module: bool = False, inplace: bool = False, debug_task: str = None, calibrate: bool = False, parallel: int=32 ): # task_id -> entry_point entry_point = {} @@ -239,14 +247,14 @@ def script( target_path = pathlib.Path(samples) if not inplace: if is_folder: - if check_lib: + if skip_module: new_name = target_path.name + "-skip-lib" elif calibrate: new_name = new_name + "-sanitized-calibrated" else: new_name = new_name + "-sanitized" else: - if check_lib: + if skip_module: new_name = target_path.name.replace(".jsonl", "-skip-lib.jsonl") if calibrate: new_name = new_name.replace(".jsonl", "-sanitized-calibrated.jsonl") @@ -265,7 +273,7 @@ def script( "sample_solution": sample_solution, "dataset": dataset, "entry_point": entry_point, - "check_lib": check_lib, + "skip_module": skip_module, "debug_task": debug_task, "calibrate": calibrate, "is_folder": is_folder, From 316e2968d60d00cd1b3745fcb6f50147f29e6a76 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Mon, 2 Sep 2024 03:21:09 +0800 Subject: [PATCH 12/27] fix skip module --- bigcodebench/sanitize.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/bigcodebench/sanitize.py b/bigcodebench/sanitize.py index d44dc70a..33bc2ca1 100644 --- a/bigcodebench/sanitize.py +++ b/bigcodebench/sanitize.py @@ -116,7 +116,7 @@ def sanitize(code: str, entrypoint: Optional[str] = None, skip_module: 
bool = Fa class_names = set() function_names = set() variable_names = set() - reacheable = set() + reachable = set() root_node = tree.root_node import_nodes = [] @@ -159,7 +159,7 @@ def sanitize(code: str, entrypoint: Optional[str] = None, skip_module: bool = Fa if entrypoint: name2deps = get_deps(definition_nodes) - reacheable = get_function_dependency(entrypoint, name2deps) + reachable = get_function_dependency(entrypoint, name2deps) sanitized_output = b"" @@ -168,7 +168,7 @@ def sanitize(code: str, entrypoint: Optional[str] = None, skip_module: bool = Fa for pair in definition_nodes: name, node = pair - if entrypoint and not (name in reacheable): + if entrypoint and not (name in reachable): continue sanitized_output += code_bytes[node.start_byte : node.end_byte] + b"\n" @@ -248,18 +248,18 @@ def script( if not inplace: if is_folder: if skip_module: - new_name = target_path.name + "-skip-lib" + target_path.name = target_path.name + "-skip-lib" elif calibrate: - new_name = new_name + "-sanitized-calibrated" + new_name = target_path.name + "-sanitized-calibrated" else: - new_name = new_name + "-sanitized" + new_name = target_path.name + "-sanitized" else: if skip_module: - new_name = target_path.name.replace(".jsonl", "-skip-lib.jsonl") + target_path.name = target_path.name.replace(".jsonl", "-skip-lib.jsonl") if calibrate: - new_name = new_name.replace(".jsonl", "-sanitized-calibrated.jsonl") + new_name = target_path.name.replace(".jsonl", "-sanitized-calibrated.jsonl") else: - new_name = new_name.replace(".jsonl", "-sanitized.jsonl") + new_name = target_path.name.replace(".jsonl", "-sanitized.jsonl") target_path = target_path.parent / new_name target_path = str(target_path) From aef1c3a0308e49d4233cffc424a1c233e4a10cd2 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Mon, 2 Sep 2024 03:25:48 +0800 Subject: [PATCH 13/27] fix sanitize naming --- bigcodebench/sanitize.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/bigcodebench/sanitize.py b/bigcodebench/sanitize.py index 33bc2ca1..61e5cb5e 100644 --- a/bigcodebench/sanitize.py +++ b/bigcodebench/sanitize.py @@ -245,22 +245,23 @@ def script( # make a new folder with "-sanitized" suffix is_folder = os.path.isdir(samples) target_path = pathlib.Path(samples) + target_path_name = target_path.name if not inplace: if is_folder: if skip_module: - target_path.name = target_path.name + "-skip-lib" + target_path_name = target_path_name + "-sanitized" elif calibrate: - new_name = target_path.name + "-sanitized-calibrated" + target_path_name = target_path_name + "-sanitized-calibrated" else: - new_name = target_path.name + "-sanitized" + target_path_name = target_path_name + "-sanitized" else: if skip_module: - target_path.name = target_path.name.replace(".jsonl", "-skip-lib.jsonl") + target_path_name = target_path_name.replace(".jsonl", "-skip-lib.jsonl") if calibrate: - new_name = target_path.name.replace(".jsonl", "-sanitized-calibrated.jsonl") + target_path_name = target_path_name.replace(".jsonl", "-sanitized-calibrated.jsonl") else: - new_name = target_path.name.replace(".jsonl", "-sanitized.jsonl") - target_path = target_path.parent / new_name + target_path_name = target_path_name.replace(".jsonl", "-sanitized.jsonl") + target_path = target_path.parent / target_path_name target_path = str(target_path) nsan = 0 From 821cf773c14d7adeaaa325c526ae94cff7882a92 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Mon, 2 Sep 2024 03:37:15 +0800 Subject: [PATCH 14/27] fix evaluate --- bigcodebench/evaluate.py | 5 +++-- 
bigcodebench/sanitize.py | 9 ++++++--- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/bigcodebench/evaluate.py b/bigcodebench/evaluate.py index ced61335..9e560119 100644 --- a/bigcodebench/evaluate.py +++ b/bigcodebench/evaluate.py @@ -183,12 +183,13 @@ def evaluate(flags): continue if flags.subset == "tool": - solution = (problems[task_id][f"complete_prompt"] + "\n\n" - + sample["solution"] + solution = (sample["solution"] if "solution" in sample else problems[task_id]["complete_prompt"] + sample["completion"] ) solution += "\n\n" + problems[task_id][f"{flags.split}_tool_implementation"] + if "sanitized-calibrated" in flags.samples: + solution = problems[task_id]["code_prompt"] + "\n pass\n" + solution else: solution = ( sample["solution"] diff --git a/bigcodebench/sanitize.py b/bigcodebench/sanitize.py index 61e5cb5e..431d462a 100644 --- a/bigcodebench/sanitize.py +++ b/bigcodebench/sanitize.py @@ -126,7 +126,8 @@ def sanitize(code: str, entrypoint: Optional[str] = None, skip_module: bool = Fa for child in root_node.children: if child.type in IMPORT_TYPE: - import_nodes.append(child) + if not skip_module: + import_nodes.append(child) elif child.type == CLASS_TYPE: name = get_definition_name(child) if not ( @@ -219,8 +220,10 @@ def process_solution( if calibrate: old_code = old_code.replace("```python\n ", "```python\n"+dataset[task_id]["complete_prompt"]+" ") - new_code = sanitize(code=old_code, entrypoint=function_name) - + new_code = sanitize(code=old_code, entrypoint=function_name, skip_module=skip_module) + if not new_code.startswith("def task_func") and new_code: + print(new_code) + exit() # if old code and new code are different, print msg if new_code != old_code: msg = "Sanitized: " + dbg_identifier From 5ce2928ed8f02e8478c4a2fbabea0289a0a0ab7d Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Mon, 2 Sep 2024 03:37:49 +0800 Subject: [PATCH 15/27] rm debug --- bigcodebench/sanitize.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/bigcodebench/sanitize.py b/bigcodebench/sanitize.py index 431d462a..98199f8c 100644 --- a/bigcodebench/sanitize.py +++ b/bigcodebench/sanitize.py @@ -221,9 +221,7 @@ def process_solution( old_code = old_code.replace("```python\n ", "```python\n"+dataset[task_id]["complete_prompt"]+" ") new_code = sanitize(code=old_code, entrypoint=function_name, skip_module=skip_module) - if not new_code.startswith("def task_func") and new_code: - print(new_code) - exit() + # if old code and new code are different, print msg if new_code != old_code: msg = "Sanitized: " + dbg_identifier From cabdd5a6153fd5e34e33c9b7c0ef018d73b0416d Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Mon, 2 Sep 2024 03:41:55 +0800 Subject: [PATCH 16/27] fix eval --- bigcodebench/evaluate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigcodebench/evaluate.py b/bigcodebench/evaluate.py index 9e560119..79aaae98 100644 --- a/bigcodebench/evaluate.py +++ b/bigcodebench/evaluate.py @@ -189,7 +189,7 @@ def evaluate(flags): ) solution += "\n\n" + problems[task_id][f"{flags.split}_tool_implementation"] if "sanitized-calibrated" in flags.samples: - solution = problems[task_id]["code_prompt"] + "\n pass\n" + solution + solution = problems[task_id]["complete_prompt"] + "\n pass\n" + solution else: solution = ( sample["solution"] From 8aea23890d50a6bfd4827d279ad3b5e44c6cb999 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Mon, 2 Sep 2024 18:56:13 +0800 Subject: [PATCH 17/27] fix output processing --- bigcodebench/evaluate.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/bigcodebench/evaluate.py b/bigcodebench/evaluate.py index 79aaae98..b0c5a4f0 100644 --- a/bigcodebench/evaluate.py +++ b/bigcodebench/evaluate.py @@ -187,7 +187,7 @@ def evaluate(flags): if "solution" in sample else problems[task_id]["complete_prompt"] + sample["completion"] ) - solution += "\n\n" + problems[task_id][f"{flags.split}_tool_implementation"] + solution = problems[task_id][f"{flags.split}_tool_implementation"] + solution if "sanitized-calibrated" in flags.samples: solution = problems[task_id]["complete_prompt"] + "\n pass\n" + solution else: From 170f4edf811afe8e4f78ed1f936e51dfe87950f4 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Mon, 2 Sep 2024 19:03:41 +0800 Subject: [PATCH 18/27] fix flag --- bigcodebench/sanitize.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/bigcodebench/sanitize.py b/bigcodebench/sanitize.py index 98199f8c..e247fe3f 100644 --- a/bigcodebench/sanitize.py +++ b/bigcodebench/sanitize.py @@ -108,7 +108,7 @@ def has_return_statement(node: Node) -> bool: return False -def sanitize(code: str, entrypoint: Optional[str] = None, skip_module: bool = False) -> str: +def sanitize(code: str, entrypoint: Optional[str] = None, no_module: bool = False) -> str: code = code_extract(code.strip()) code_bytes = bytes(code, "utf8") parser = get_parser("python") @@ -122,11 +122,11 @@ def sanitize(code: str, entrypoint: Optional[str] = None, skip_module: bool = Fa import_nodes = [] definition_nodes = [] - task_func_found = not skip_module + task_func_found = not no_module for child in root_node.children: if child.type in IMPORT_TYPE: - if not skip_module: + if not no_module: import_nodes.append(child) elif child.type == CLASS_TYPE: name = get_definition_name(child) @@ -185,7 +185,7 @@ def sanitize(code: str, entrypoint: Optional[str] = None, skip_module: bool = Fa outer_lines.append(i) if outer_lines: sanitized_output = "\n".join(lines[: outer_lines[-1]]) - if skip_module: + if no_module: return "" if api_check(sanitized_output) else sanitized_output else: return sanitized_output @@ -195,7 +195,7 @@ def process_solution( sample_solution: Dict, dataset: Dict, entry_point: Dict, - skip_module: bool = False, + no_module: bool = False, debug_task: str = None, calibrate: bool = False, is_folder: bool = False, @@ -220,7 +220,7 @@ def process_solution( if calibrate: old_code = old_code.replace("```python\n ", "```python\n"+dataset[task_id]["complete_prompt"]+" ") - new_code = sanitize(code=old_code, entrypoint=function_name, skip_module=skip_module) + new_code = sanitize(code=old_code, entrypoint=function_name, no_module=no_module) # if old code and new code are different, print msg if new_code != old_code: @@ -233,7 +233,7 @@ def process_solution( def script( - samples: str, skip_module: bool = False, inplace: bool = False, debug_task: str = None, calibrate: bool = False, parallel: int=32 + samples: str, no_module: bool = False, inplace: bool = False, debug_task: str = None, calibrate: bool = False, parallel: int=32 ): # task_id -> entry_point entry_point = {} @@ -249,14 +249,14 @@ def script( target_path_name = target_path.name if not inplace: if is_folder: - if skip_module: + if no_module: target_path_name = target_path_name + "-sanitized" elif calibrate: target_path_name = target_path_name + "-sanitized-calibrated" else: target_path_name = target_path_name + "-sanitized" else: - if skip_module: + if no_module: target_path_name = target_path_name.replace(".jsonl", "-skip-lib.jsonl") if 
calibrate: target_path_name = target_path_name.replace(".jsonl", "-sanitized-calibrated.jsonl") @@ -275,7 +275,7 @@ def script( "sample_solution": sample_solution, "dataset": dataset, "entry_point": entry_point, - "skip_module": skip_module, + "no_module": no_module, "debug_task": debug_task, "calibrate": calibrate, "is_folder": is_folder, From 8f436048f4dfae8cda321d08ccb0fd1ae7ccb983 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Mon, 2 Sep 2024 20:02:47 +0800 Subject: [PATCH 19/27] fix output processing --- bigcodebench/evaluate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigcodebench/evaluate.py b/bigcodebench/evaluate.py index b0c5a4f0..8f7edd5c 100644 --- a/bigcodebench/evaluate.py +++ b/bigcodebench/evaluate.py @@ -187,9 +187,9 @@ def evaluate(flags): if "solution" in sample else problems[task_id]["complete_prompt"] + sample["completion"] ) - solution = problems[task_id][f"{flags.split}_tool_implementation"] + solution if "sanitized-calibrated" in flags.samples: solution = problems[task_id]["complete_prompt"] + "\n pass\n" + solution + solution = problems[task_id][f"{flags.split}_tool_implementation"] + solution else: solution = ( sample["solution"] From e2a0e702ed8683897a1814bc063326ceeea62418 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Tue, 3 Sep 2024 20:28:19 +0800 Subject: [PATCH 20/27] fix tools --- bigcodebench/evaluate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigcodebench/evaluate.py b/bigcodebench/evaluate.py index 8f7edd5c..58d0753e 100644 --- a/bigcodebench/evaluate.py +++ b/bigcodebench/evaluate.py @@ -189,7 +189,7 @@ def evaluate(flags): ) if "sanitized-calibrated" in flags.samples: solution = problems[task_id]["complete_prompt"] + "\n pass\n" + solution - solution = problems[task_id][f"{flags.split}_tool_implementation"] + solution + solution = problems[task_id][f"positive_tool_implementation"] + solution else: solution = ( sample["solution"] From 3a00d9eae192ef5b11b363a2126f781c1e5f7989 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Tue, 3 Sep 2024 20:53:22 +0800 Subject: [PATCH 21/27] fix some missing titles --- tools/fix_v0110.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/tools/fix_v0110.py b/tools/fix_v0110.py index d1d1300c..0b7f75ec 100644 --- a/tools/fix_v0110.py +++ b/tools/fix_v0110.py @@ -19,7 +19,21 @@ def map_ds(sample): "Requirements:\n - sklearn.ensemble\n", "Requirements:\n - pandas\n - sklearn.ensemble\n" ) - + if sample["task_id"] in ["BigCodeBench/241"]: + for k in sample.keys(): + if "prompt" in k: + sample[k] = sample[k].replace( + "The function will plot the original and normalized arrays using matplotlib.", + "The function will plot the original and normalized arrays with a title of 'Original vs. Normalized Data'." + ) + if sample["task_id"] in ["BigCodeBench/267"]: + for k in sample.keys(): + if "prompt" in k: + sample[k] = sample[k].replace( + "Plots and returns the FFT of the signal.", + "Plots and returns the FFT of the signal with a title of 'FFT of the signal'." 
+ ) + return sample if __name__ == "__main__": @@ -28,7 +42,7 @@ def map_ds(sample): hard_ds_dict = load_dataset(BIGCODEBENCH_HARD_HF) ds = ds_dict[BIGCODEBENCH_VERSION] hard_ds = hard_ds_dict[BIGCODEBENCH_VERSION] - function_id = [37] + function_id = [37, 267, 241] new_ds = ds.map(map_ds) new_ds.to_json("BigCodeBench.jsonl") From fe3ac569c22b7c9711da5e93e25a3a5a1bf5db87 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Tue, 3 Sep 2024 20:23:45 +0800 Subject: [PATCH 22/27] fix tools to avoid dependency conflicts --- bigcodebench/evaluate.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bigcodebench/evaluate.py b/bigcodebench/evaluate.py index 58d0753e..35d9b55d 100644 --- a/bigcodebench/evaluate.py +++ b/bigcodebench/evaluate.py @@ -187,6 +187,7 @@ def evaluate(flags): if "solution" in sample else problems[task_id]["complete_prompt"] + sample["completion"] ) + solution = problems[task_id]["positive_tool_implementation"] + solution if "sanitized-calibrated" in flags.samples: solution = problems[task_id]["complete_prompt"] + "\n pass\n" + solution solution = problems[task_id][f"positive_tool_implementation"] + solution From be9b0166d6d99386a8a9dd7e707b5d9ed221d4bc Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Wed, 4 Sep 2024 08:47:05 +0000 Subject: [PATCH 23/27] fix sanitize logic and skip empty in eval --- bigcodebench/evaluate.py | 18 ++++++++++++-- bigcodebench/sanitize.py | 52 +++++++++++++++++++--------------------- 2 files changed, 40 insertions(+), 30 deletions(-) diff --git a/bigcodebench/evaluate.py b/bigcodebench/evaluate.py index 35d9b55d..649a53b4 100644 --- a/bigcodebench/evaluate.py +++ b/bigcodebench/evaluate.py @@ -199,6 +199,20 @@ def evaluate(flags): ) if "sanitized-calibrated" in flags.samples: solution = problems[task_id]["code_prompt"] + "\n pass\n" + solution + + # Skip execution for empty solutions + if not solution.strip(): + eval_results[task_id].append({ + "completion_id": completion_id[task_id], + "task_id": task_id, + "_identifier": sample["_identifier"], + "solution": solution, + "base": (FAIL, "Empty solution") + }) + completion_id[task_id] += 1 + n_samples += 1 + continue + remainings.add(sample["_identifier"]) args = ( completion_id[task_id], @@ -215,7 +229,7 @@ def evaluate(flags): completion_id[task_id] += 1 n_samples += 1 - assert n_samples == len(remainings), "Missing problems in unfinished" + assert n_samples == len(remainings) + sum(len(r) for r in eval_results.values()), "Missing problems in unfinished" assert len(completion_id) == len(problems), "Missing problems in samples" def stucking_checker(): @@ -230,7 +244,7 @@ def stucking_checker(): threading.Thread(target=stucking_checker).start() - for future in tqdm(as_completed(futures), total=n_samples): + for future in tqdm(as_completed(futures), total=len(futures)): result = future.result() remainings.remove(result["_identifier"]) eval_results[result["task_id"]].append(result) diff --git a/bigcodebench/sanitize.py b/bigcodebench/sanitize.py index e247fe3f..a915844a 100644 --- a/bigcodebench/sanitize.py +++ b/bigcodebench/sanitize.py @@ -108,7 +108,7 @@ def has_return_statement(node: Node) -> bool: return False -def sanitize(code: str, entrypoint: Optional[str] = None, no_module: bool = False) -> str: +def sanitize(code: str, solution: Dict, entrypoint: Optional[str] = None) -> str: code = code_extract(code.strip()) code_bytes = bytes(code, "utf8") parser = get_parser("python") @@ -122,29 +122,26 @@ def sanitize(code: str, entrypoint: Optional[str] = None, no_module: bool = Fals import_nodes = [] 
definition_nodes = [] - task_func_found = not no_module - for child in root_node.children: if child.type in IMPORT_TYPE: - if not no_module: - import_nodes.append(child) + # if subset != "tool": + import_nodes.append(child) elif child.type == CLASS_TYPE: name = get_definition_name(child) if not ( name in class_names or name in variable_names or name in function_names ): - if task_func_found: - definition_nodes.append((name, child)) + definition_nodes.append((name, child)) class_names.add(name) elif child.type == FUNCTION_TYPE: name = get_definition_name(child) if not ( name in function_names or name in variable_names or name in class_names ): - if name == entrypoint: - task_func_found = True - if task_func_found: - definition_nodes.append((name, child)) + # if name == entrypoint: + # task_func_found = True + # if task_func_found: + definition_nodes.append((name, child)) function_names.add(name) elif ( child.type == EXPRESSION_TYPE and child.children[0].type == ASSIGNMENT_TYPE @@ -154,8 +151,8 @@ def sanitize(code: str, entrypoint: Optional[str] = None, no_module: bool = Fals if not ( name in variable_names or name in function_names or name in class_names ): - if task_func_found: - definition_nodes.append((name, subchild)) + # if task_func_found: + definition_nodes.append((name, subchild)) variable_names.add(name) if entrypoint: @@ -185,17 +182,17 @@ def sanitize(code: str, entrypoint: Optional[str] = None, no_module: bool = Fals outer_lines.append(i) if outer_lines: sanitized_output = "\n".join(lines[: outer_lines[-1]]) - if no_module: - return "" if api_check(sanitized_output) else sanitized_output - else: - return sanitized_output + # if subset == "tool": + # return "" if api_check(solution[f"{split}_tool_"] + "\n" + sanitized_output) else sanitized_output + # else: + return sanitized_output def process_solution( sample_solution: Dict, dataset: Dict, entry_point: Dict, - no_module: bool = False, + subset: str, debug_task: str = None, calibrate: bool = False, is_folder: bool = False, @@ -220,8 +217,10 @@ def process_solution( if calibrate: old_code = old_code.replace("```python\n ", "```python\n"+dataset[task_id]["complete_prompt"]+" ") - new_code = sanitize(code=old_code, entrypoint=function_name, no_module=no_module) - + new_code = sanitize(code=old_code, solution=sample_solution, entrypoint=function_name) + if subset == "tool": + if api_check(new_code): + new_code = "" # if old code and new code are different, print msg if new_code != old_code: msg = "Sanitized: " + dbg_identifier @@ -233,12 +232,12 @@ def process_solution( def script( - samples: str, no_module: bool = False, inplace: bool = False, debug_task: str = None, calibrate: bool = False, parallel: int=32 + samples: str, subset: str, inplace: bool = False, debug_task: str = None, calibrate: bool = False, parallel: int=32 ): # task_id -> entry_point entry_point = {} # merge two datasets - dataset = {**get_bigcodebench()} + dataset = {**get_bigcodebench(subset=subset)} for task_id, problem in dataset.items(): entry_point[task_id] = problem["entry_point"] @@ -249,15 +248,11 @@ def script( target_path_name = target_path.name if not inplace: if is_folder: - if no_module: - target_path_name = target_path_name + "-sanitized" - elif calibrate: + if calibrate: target_path_name = target_path_name + "-sanitized-calibrated" else: target_path_name = target_path_name + "-sanitized" else: - if no_module: - target_path_name = target_path_name.replace(".jsonl", "-skip-lib.jsonl") if calibrate: target_path_name = 
target_path_name.replace(".jsonl", "-sanitized-calibrated.jsonl")
             else:
                 target_path_name = target_path_name.replace(".jsonl", "-sanitized.jsonl")
@@ -275,7 +270,7 @@ def script(
             "sample_solution": sample_solution,
             "dataset": dataset,
             "entry_point": entry_point,
-            "no_module": no_module,
+            "subset": subset,
             "debug_task": debug_task,
             "calibrate": calibrate,
             "is_folder": is_folder,
@@ -288,6 +283,7 @@ def script(
 
     for result in results:
         if result is not None:
+            print(result)
             new_solutions.append(result)
             nsan += 1
         ntotal += 1

From 2e153a66a0fea1b4f9d65ac19c60522ad9427b62 Mon Sep 17 00:00:00 2001
From: Terry Zhuo
Date: Thu, 5 Sep 2024 00:26:30 +0800
Subject: [PATCH 24/27] calculate pass_tool@k

---
 bigcodebench/eval/__init__.py |  78 ++++++++++++++++
 bigcodebench/evaluate.py      | 163 ++++++++++++++++++----------------
 bigcodebench/sanitize.py      |   1 -
 3 files changed, 162 insertions(+), 80 deletions(-)

diff --git a/bigcodebench/eval/__init__.py b/bigcodebench/eval/__init__.py
index 3596f53d..06a3a570 100644
--- a/bigcodebench/eval/__init__.py
+++ b/bigcodebench/eval/__init__.py
@@ -24,6 +24,7 @@
 import multiprocessing
 import os
 import sys
+import ast
 import time
 import types
 import unittest
@@ -240,3 +241,80 @@ def evaluate_files(
         )
         ret.append((stat, det.tolist()))
     return ret
+
+
+def extract_defined_modules(code: str, entry_point: str):
+    tree = ast.parse(code)
+    defined_functions = set()
+    defined_methods = {}
+    used_functions = set()
+    used_methods = set()
+    variable_classes = {}
+
+    class FunctionDefVisitor(ast.NodeVisitor):
+        def visit_FunctionDef(self, node):
+            defined_functions.add(node.name)
+            self.generic_visit(node)
+
+        def visit_ClassDef(self, node):
+            for item in node.body:
+                if isinstance(item, ast.FunctionDef):
+                    if node.name not in defined_methods:
+                        defined_methods[node.name] = set()
+                    defined_methods[node.name].add(item.name)
+            self.generic_visit(node)
+
+    class TaskFuncVisitor(ast.NodeVisitor):
+        def visit_Assign(self, node):
+            if isinstance(node.value, ast.Call) and isinstance(node.value.func, ast.Name):
+                class_name = node.value.func.id
+                for target in node.targets:
+                    if isinstance(target, ast.Name):
+                        variable_classes[target.id] = class_name
+            self.generic_visit(node)
+
+        def visit_Call(self, node):
+            if isinstance(node.func, ast.Name):
+                used_functions.add(node.func.id)
+            elif isinstance(node.func, ast.Attribute):
+                value = node.func.value
+                if isinstance(value, ast.Name):
+                    var_name = value.id
+                    if var_name in variable_classes:
+                        used_methods.add(f"{variable_classes[var_name]}.{node.func.attr}")
+                    else:
+                        used_methods.add(f"{var_name}.{node.func.attr}")
+                elif isinstance(value, ast.Attribute):
+                    # Handle nested attributes (e.g., obj.attr.method())
+                    attr_chain = [node.func.attr]
+                    while isinstance(value, ast.Attribute):
+                        attr_chain.append(value.attr)
+                        value = value.value
+                    if isinstance(value, ast.Name):
+                        var_name = value.id
+                        if var_name in variable_classes:
+                            attr_chain.append(variable_classes[var_name])
+                        else:
+                            attr_chain.append(var_name)
+                    used_methods.add('.'.join(reversed(attr_chain)))
+            self.generic_visit(node)
+
+    # First pass: collect all defined functions and methods
+    FunctionDefVisitor().visit(tree)
+
+    # Second pass: collect used functions and methods within task_func
+    for node in ast.iter_child_nodes(tree):
+        if isinstance(node, ast.FunctionDef) and node.name == 'task_func':
+            TaskFuncVisitor().visit(node)
+            break # Assuming there's only one task_func
+
+    # Filter used functions to include only those defined before task_func
+    result = [func for func in used_functions if func in defined_functions]
+
+    # Filter used methods to include 
only those defined before task_func + for class_name, methods in defined_methods.items(): + for method in methods: + if any(f"{class_name}.{method}" in used_method for used_method in used_methods): + result.append(f"{class_name}.{method}") + + return result diff --git a/bigcodebench/evaluate.py b/bigcodebench/evaluate.py index 649a53b4..db0fbde9 100644 --- a/bigcodebench/evaluate.py +++ b/bigcodebench/evaluate.py @@ -26,6 +26,7 @@ compatible_eval_result, estimate_pass_at_k, untrusted_check, + extract_defined_modules, ) from bigcodebench.gen.util import trusted_check @@ -93,6 +94,7 @@ def check_correctness( identifier=None, min_time_limit: float = 0.1, gt_time_limit: float = 2.0, + used_modules: List[str] = [], ) -> Dict[str, Result]: # {...}, "base" | "plus" -> (status, details) ret = { "completion_id": completion_id, @@ -110,6 +112,10 @@ def check_correctness( min_time_limit, gt_time_limit, ) + if used_modules: + ret["used_modules"] = (set(used_modules) == set(problems["used_modules"])) + else: + ret["used_modules"] = True return ret @@ -165,89 +171,75 @@ def evaluate(flags): "eval": {}, } - with ProcessPoolExecutor(max_workers=n_workers) as executor: - futures = [] - completion_id = Counter() - n_samples = 0 - eval_results = defaultdict(list) # task_id -> - remainings = set() + completion_id = Counter() + n_samples = 0 + eval_results = defaultdict(list) # task_id -> + remainings = set() - print("Reading samples...") - for sample in tqdm(load_solutions(flags.samples)): - task_id = sample["task_id"] - - if task_id not in problems: - warn( - f"Task {task_id} is found in the samples but not found in the dataset" - ) - continue - - if flags.subset == "tool": - solution = (sample["solution"] - if "solution" in sample - else problems[task_id]["complete_prompt"] + sample["completion"] - ) - solution = problems[task_id]["positive_tool_implementation"] + solution - if "sanitized-calibrated" in flags.samples: - solution = problems[task_id]["complete_prompt"] + "\n pass\n" + solution - solution = problems[task_id][f"positive_tool_implementation"] + solution - else: - solution = ( - sample["solution"] - if "solution" in sample - else problems[task_id]["complete_prompt"] + sample["completion"] - ) - if "sanitized-calibrated" in flags.samples: - solution = problems[task_id]["code_prompt"] + "\n pass\n" + solution - - # Skip execution for empty solutions - if not solution.strip(): - eval_results[task_id].append({ - "completion_id": completion_id[task_id], - "task_id": task_id, - "_identifier": sample["_identifier"], - "solution": solution, - "base": (FAIL, "Empty solution") - }) - completion_id[task_id] += 1 - n_samples += 1 - continue - - remainings.add(sample["_identifier"]) - args = ( - completion_id[task_id], - problems[task_id], - solution, - flags.max_as_limit, - flags.max_data_limit, - flags.max_stack_limit, - sample["_identifier"], - flags.min_time_limit, - expected_time[task_id] if expected_time[task_id] else 20 + print("Reading samples...") + samples = list(load_solutions(flags.samples)) + + def process_sample(sample): + task_id = sample["task_id"] + + if task_id not in problems: + warn(f"Task {task_id} is found in the samples but not found in the dataset") + return None + + used_modules = [] + if flags.subset == "tool": + solution = (sample["solution"] + if "solution" in sample + else problems[task_id]["complete_prompt"] + sample["completion"] + ) + solution = problems[task_id]["positive_tool_implementation"] + solution + if "sanitized-calibrated" in flags.samples: + solution = 
problems[task_id]["complete_prompt"] + "\n pass\n" + solution + solution = problems[task_id][f"positive_tool_implementation"] + solution + used_modules = extract_defined_modules(problems[task_id][f"{flags.split}_tool_implementation"] + "\n" + solution, problems[task_id]["entry_point"]) + else: + solution = ( + sample["solution"] + if "solution" in sample + else problems[task_id]["complete_prompt"] + sample["completion"] ) - futures.append(executor.submit(check_correctness, *args)) - completion_id[task_id] += 1 - n_samples += 1 - - assert n_samples == len(remainings) + sum(len(r) for r in eval_results.values()), "Missing problems in unfinished" - assert len(completion_id) == len(problems), "Missing problems in samples" - - def stucking_checker(): - while remainings: - last_size = len(remainings) - time.sleep(240) - if last_size != len(remainings) or len(remainings) == 0: - continue - # Potential stucking - warn("No samples had finished testing in the last 240s") - warn(f"{len(remainings)} samples to be tested: {remainings}") + if "sanitized-calibrated" in flags.samples: + solution = problems[task_id]["code_prompt"] + "\n pass\n" + solution + + # Skip execution for empty solutions + if not solution.strip(): + return { + "completion_id": completion_id[task_id], + "task_id": task_id, + "_identifier": sample["_identifier"], + "solution": solution, + "base": (FAIL, "Empty solution") + } + + args = ( + completion_id[task_id], + problems[task_id], + solution, + flags.max_as_limit, + flags.max_data_limit, + flags.max_stack_limit, + sample["_identifier"], + flags.min_time_limit, + expected_time[task_id] if expected_time[task_id] else 20, + used_modules, + ) + return check_correctness(*args) - threading.Thread(target=stucking_checker).start() + results_list = pqdm(samples, process_sample, n_jobs=n_workers, desc="Processing samples") - for future in tqdm(as_completed(futures), total=len(futures)): - result = future.result() - remainings.remove(result["_identifier"]) + for result in results_list: + if result is not None: eval_results[result["task_id"]].append(result) + completion_id[result["task_id"]] += 1 + n_samples += 1 + + assert n_samples == sum(len(r) for r in eval_results.values()), "Missing problems in unfinished" + assert len(completion_id) == len(problems), "Missing problems in samples" # sort the results for each problem by completion_id for task_id, task_results in eval_results.items(): @@ -255,12 +247,14 @@ def stucking_checker(): results["eval"][task_id] = [] for res in task_results: stat, details = res["base"] + tool_use = res["used_modules"] results["eval"][task_id].append( { "task_id": task_id, "solution": res["solution"], "status": stat, "details": details, + "tool_use": tool_use, } ) @@ -275,13 +269,19 @@ def stucking_checker(): base_correct.append(bc) base_correct = np.array(base_correct) - + tool_correct = np.array([r["tool_use"] for r in results["eval"].values()]) + pass_at_k = { f"pass@{k}": estimate_pass_at_k(total, base_correct, k).mean() for k in [1, 5, 10, 25, 100] if total.min() >= k } + if flags.subset == "tool": + pass_at_k.update({f"pass_tool@{k}": estimate_pass_at_k(total, tool_correct, k).mean() + for k in [1, 5, 10, 25, 100] + if total.min() >= k}) + mode = "-calibrated" if "sanitized-calibrated" in flags.samples else "" extra = flags.subset.capitalize() flags.split = flags.split.capitalize() @@ -300,6 +300,11 @@ def stucking_checker(): for k, v in pass_at_k.items(): cprint(f"{k}:\t{v:.3f}", "green") + + if flags.subset == "tool": + for k, v in pass_at_k.items(): + if 
k.startswith("pass_tool@"): + cprint(f"{k}:\t{v:.3f}", "green") # save results if os.path.isfile(result_path): diff --git a/bigcodebench/sanitize.py b/bigcodebench/sanitize.py index a915844a..ec7f908e 100644 --- a/bigcodebench/sanitize.py +++ b/bigcodebench/sanitize.py @@ -283,7 +283,6 @@ def script( for result in results: if result is not None: - print(result) new_solutions.append(result) nsan += 1 ntotal += 1 From 077da747151cebad49c30953407cefbae7fe239b Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Thu, 5 Sep 2024 03:59:42 +1000 Subject: [PATCH 25/27] fix tool eval --- bigcodebench/eval/__init__.py | 2 +- bigcodebench/evaluate.py | 234 +++++++++++++++++++++------------- 2 files changed, 143 insertions(+), 93 deletions(-) diff --git a/bigcodebench/eval/__init__.py b/bigcodebench/eval/__init__.py index 06a3a570..66f3aabf 100644 --- a/bigcodebench/eval/__init__.py +++ b/bigcodebench/eval/__init__.py @@ -304,7 +304,7 @@ def visit_Call(self, node): # Second pass: collect used functions and methods within task_func for node in ast.iter_child_nodes(tree): - if isinstance(node, ast.FunctionDef) and node.name == 'task_func': + if isinstance(node, ast.FunctionDef) and node.name == entry_point: TaskFuncVisitor().visit(node) break # Assuming there's only one task_func diff --git a/bigcodebench/evaluate.py b/bigcodebench/evaluate.py index db0fbde9..48681deb 100644 --- a/bigcodebench/evaluate.py +++ b/bigcodebench/evaluate.py @@ -5,6 +5,7 @@ import pickle import threading import time +from pqdm.processes import pqdm from collections import Counter, defaultdict from concurrent.futures import ProcessPoolExecutor, as_completed from datetime import datetime @@ -85,6 +86,8 @@ def get_groundtruth(subset, n_workers, problems, hashcode, check_gt_only, max_as return expected_time def check_correctness( + subset: str, + split: str, completion_id: int, problem: Dict[str, Any], solution: str, @@ -94,7 +97,7 @@ def check_correctness( identifier=None, min_time_limit: float = 0.1, gt_time_limit: float = 2.0, - used_modules: List[str] = [], + used_tools: List[str] = [], ) -> Dict[str, Result]: # {...}, "base" | "plus" -> (status, details) ret = { "completion_id": completion_id, @@ -112,10 +115,13 @@ def check_correctness( min_time_limit, gt_time_limit, ) - if used_modules: - ret["used_modules"] = (set(used_modules) == set(problems["used_modules"])) - else: - ret["used_modules"] = True + if subset == "tool": + if split in ["positive", "mixed"]: + ret["used_tools"] = (set(used_tools) == set(problem["used_tools"])) + else: + assert split == "negative" + ret["used_tools"] = (set(used_tools) == {"refusal_func"}) + return ret @@ -171,75 +177,97 @@ def evaluate(flags): "eval": {}, } - completion_id = Counter() - n_samples = 0 - eval_results = defaultdict(list) # task_id -> - remainings = set() + with ProcessPoolExecutor(max_workers=n_workers) as executor: + futures = [] + completion_id = Counter() + n_samples = 0 + eval_results = defaultdict(list) # task_id -> + remainings = set() - print("Reading samples...") - samples = list(load_solutions(flags.samples)) - - def process_sample(sample): - task_id = sample["task_id"] - - if task_id not in problems: - warn(f"Task {task_id} is found in the samples but not found in the dataset") - return None + print("Reading samples...") + for sample in tqdm(load_solutions(flags.samples)): + task_id = sample["task_id"] + + if task_id not in problems: + warn( + f"Task {task_id} is found in the samples but not found in the dataset" + ) + continue + + if flags.subset == "tool": + solution = 
(sample["solution"] + if "solution" in sample + else problems[task_id]["complete_prompt"] + sample["completion"] + ) + try: + used_tools = extract_defined_modules(problems[task_id][f"{flags.split}_tool_implementation"] + "\n" + solution, problems[task_id]["entry_point"]) + except Exception as e: + used_tools = [] + solution = problems[task_id]["positive_tool_implementation"] + solution + if "sanitized-calibrated" in flags.samples: + solution = problems[task_id]["complete_prompt"] + "\n pass\n" + solution + solution = problems[task_id][f"positive_tool_implementation"] + solution + + else: + solution = ( + sample["solution"] + if "solution" in sample + else problems[task_id]["complete_prompt"] + sample["completion"] + ) + if "sanitized-calibrated" in flags.samples: + solution = problems[task_id]["code_prompt"] + "\n pass\n" + solution - used_modules = [] - if flags.subset == "tool": - solution = (sample["solution"] - if "solution" in sample - else problems[task_id]["complete_prompt"] + sample["completion"] - ) - solution = problems[task_id]["positive_tool_implementation"] + solution - if "sanitized-calibrated" in flags.samples: - solution = problems[task_id]["complete_prompt"] + "\n pass\n" + solution - solution = problems[task_id][f"positive_tool_implementation"] + solution - used_modules = extract_defined_modules(problems[task_id][f"{flags.split}_tool_implementation"] + "\n" + solution, problems[task_id]["entry_point"]) - else: - solution = ( - sample["solution"] - if "solution" in sample - else problems[task_id]["complete_prompt"] + sample["completion"] + # Skip execution for empty solutions + if not solution: + eval_results[task_id].append({ + "completion_id": completion_id[task_id], + "task_id": task_id, + "_identifier": sample["_identifier"], + "solution": solution, + "base": (FAIL, "Empty solution") + }) + completion_id[task_id] += 1 + n_samples += 1 + continue + + remainings.add(sample["_identifier"]) + args = ( + flags.subset, + flags.split, + completion_id[task_id], + problems[task_id], + solution, + flags.max_as_limit, + flags.max_data_limit, + flags.max_stack_limit, + sample["_identifier"], + flags.min_time_limit, + expected_time[task_id] if expected_time[task_id] else 20, + used_tools ) - if "sanitized-calibrated" in flags.samples: - solution = problems[task_id]["code_prompt"] + "\n pass\n" + solution - - # Skip execution for empty solutions - if not solution.strip(): - return { - "completion_id": completion_id[task_id], - "task_id": task_id, - "_identifier": sample["_identifier"], - "solution": solution, - "base": (FAIL, "Empty solution") - } - - args = ( - completion_id[task_id], - problems[task_id], - solution, - flags.max_as_limit, - flags.max_data_limit, - flags.max_stack_limit, - sample["_identifier"], - flags.min_time_limit, - expected_time[task_id] if expected_time[task_id] else 20, - used_modules, - ) - return check_correctness(*args) - - results_list = pqdm(samples, process_sample, n_jobs=n_workers, desc="Processing samples") - - for result in results_list: - if result is not None: - eval_results[result["task_id"]].append(result) - completion_id[result["task_id"]] += 1 + futures.append(executor.submit(check_correctness, *args)) + completion_id[task_id] += 1 n_samples += 1 - assert n_samples == sum(len(r) for r in eval_results.values()), "Missing problems in unfinished" - assert len(completion_id) == len(problems), "Missing problems in samples" + assert n_samples == len(remainings) + sum(len(r) for r in eval_results.values()), "Missing problems in unfinished" + assert 
len(completion_id) == len(problems), "Missing problems in samples" + + def stucking_checker(): + while remainings: + last_size = len(remainings) + time.sleep(240) + if last_size != len(remainings) or len(remainings) == 0: + continue + # Potential stucking + warn("No samples had finished testing in the last 240s") + warn(f"{len(remainings)} samples to be tested: {remainings}") + + threading.Thread(target=stucking_checker).start() + + for future in tqdm(as_completed(futures), total=len(futures)): + result = future.result() + remainings.remove(result["_identifier"]) + eval_results[result["task_id"]].append(result) # sort the results for each problem by completion_id for task_id, task_results in eval_results.items(): @@ -247,29 +275,53 @@ def process_sample(sample): results["eval"][task_id] = [] for res in task_results: stat, details = res["base"] - tool_use = res["used_modules"] - results["eval"][task_id].append( - { - "task_id": task_id, - "solution": res["solution"], - "status": stat, - "details": details, - "tool_use": tool_use, - } - ) + if flags.subset == "tool": + tool_use = res["used_tools"] + results["eval"][task_id].append( + { + "task_id": task_id, + "solution": res["solution"], + "status": stat, + "details": details, + "tool_use": tool_use, + } + ) + else: + results["eval"][task_id].append( + { + "task_id": task_id, + "solution": res["solution"], + "status": stat, + "details": details, + } + ) # Calculate pass@k. total = np.array([len(r) for k, r in results["eval"].items() if k in problems]) base_correct = [] - + tool_correct = [] + syntax_correct = [] for key, res in results["eval"].items(): if key not in problems: continue bc = sum([r["status"] == PASS for r in res]) - base_correct.append(bc) - + base_correct.append(bc) + + for key, res in results["eval"].items(): + if key not in problems: + continue + tc = sum([r["tool_use"] for r in res]) + tool_correct.append(tc) + + for key, res in results["eval"].items(): + if key not in problems: + continue + empty_solutions = sum([r["solution"] for r in res]) + syntax_correct.append(empty_solutions) + base_correct = np.array(base_correct) - tool_correct = np.array([r["tool_use"] for r in results["eval"].values()]) + tool_correct = np.array(tool_correct) + syntax_correct = np.array(syntax_correct) pass_at_k = { f"pass@{k}": estimate_pass_at_k(total, base_correct, k).mean() @@ -278,9 +330,12 @@ def process_sample(sample): } if flags.subset == "tool": - pass_at_k.update({f"pass_tool@{k}": estimate_pass_at_k(total, tool_correct, k).mean() + pass_at_k.update({f"tool@{k}": estimate_pass_at_k(total, tool_correct, k).mean() for k in [1, 5, 10, 25, 100] if total.min() >= k}) + pass_at_k.update({f"syntax@{k}": estimate_pass_at_k(total, syntax_correct, k).mean() + for k in [1, 5, 10, 25, 100] + if total.min() >= k}) mode = "-calibrated" if "sanitized-calibrated" in flags.samples else "" extra = flags.subset.capitalize() @@ -291,20 +346,15 @@ def process_sample(sample): cprint(f"Groundtruth is not checked", "yellow") else: if gt_pass_rate > 0.99: - cprint(f"Groundtruth pass rate: {gt_pass_rate:.3f}", "green") + cprint(f"Groundtruth pass rate: {gt_pass_rate*100:.2f}%", "green") else: - cprint(f"Groundtruth pass rate: {gt_pass_rate:.3f}\nPlease be cautious!", "red") + cprint(f"Groundtruth pass rate: {gt_pass_rate*100:.2f}%\nPlease be cautious!", "red") if len(failed_tasks) > 0: cprint(f"Failed tasks: {failed_tasks}", "red") for k, v in pass_at_k.items(): - cprint(f"{k}:\t{v:.3f}", "green") - - if flags.subset == "tool": - for k, v in pass_at_k.items(): 
- if k.startswith("pass_tool@"): - cprint(f"{k}:\t{v:.3f}", "green") + cprint(f"{k}:\t{v*100:.2f}%", "green") # save results if os.path.isfile(result_path): From 2a5c2bb8b44ef8a7ffa4b44d6142369b5a69f052 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Thu, 5 Sep 2024 18:08:46 +1000 Subject: [PATCH 26/27] fix eval --- bigcodebench/evaluate.py | 43 ++++++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/bigcodebench/evaluate.py b/bigcodebench/evaluate.py index 48681deb..6adcc36c 100644 --- a/bigcodebench/evaluate.py +++ b/bigcodebench/evaluate.py @@ -187,6 +187,7 @@ def evaluate(flags): print("Reading samples...") for sample in tqdm(load_solutions(flags.samples)): task_id = sample["task_id"] + used_tools = [] if task_id not in problems: warn( @@ -202,7 +203,7 @@ def evaluate(flags): try: used_tools = extract_defined_modules(problems[task_id][f"{flags.split}_tool_implementation"] + "\n" + solution, problems[task_id]["entry_point"]) except Exception as e: - used_tools = [] + pass solution = problems[task_id]["positive_tool_implementation"] + solution if "sanitized-calibrated" in flags.samples: solution = problems[task_id]["complete_prompt"] + "\n pass\n" + solution @@ -299,29 +300,14 @@ def stucking_checker(): # Calculate pass@k. total = np.array([len(r) for k, r in results["eval"].items() if k in problems]) base_correct = [] - tool_correct = [] - syntax_correct = [] + for key, res in results["eval"].items(): if key not in problems: continue bc = sum([r["status"] == PASS for r in res]) base_correct.append(bc) - for key, res in results["eval"].items(): - if key not in problems: - continue - tc = sum([r["tool_use"] for r in res]) - tool_correct.append(tc) - - for key, res in results["eval"].items(): - if key not in problems: - continue - empty_solutions = sum([r["solution"] for r in res]) - syntax_correct.append(empty_solutions) - base_correct = np.array(base_correct) - tool_correct = np.array(tool_correct) - syntax_correct = np.array(syntax_correct) pass_at_k = { f"pass@{k}": estimate_pass_at_k(total, base_correct, k).mean() @@ -330,9 +316,28 @@ def stucking_checker(): } if flags.subset == "tool": + tool_correct = [] + syntax_correct = [] + + for key, res in results["eval"].items(): + if key not in problems: + continue + tc = sum([r["tool_use"] for r in res]) + tool_correct.append(tc) + + for key, res in results["eval"].items(): + if key not in problems: + continue + empty_solutions = sum([r["solution"] for r in res]) + syntax_correct.append(empty_solutions) + + tool_correct = np.array(tool_correct) + syntax_correct = np.array(syntax_correct) + pass_at_k.update({f"tool@{k}": estimate_pass_at_k(total, tool_correct, k).mean() for k in [1, 5, 10, 25, 100] if total.min() >= k}) + pass_at_k.update({f"syntax@{k}": estimate_pass_at_k(total, syntax_correct, k).mean() for k in [1, 5, 10, 25, 100] if total.min() >= k}) @@ -346,9 +351,9 @@ def stucking_checker(): cprint(f"Groundtruth is not checked", "yellow") else: if gt_pass_rate > 0.99: - cprint(f"Groundtruth pass rate: {gt_pass_rate*100:.2f}%", "green") + cprint(f"Groundtruth pass rate: {gt_pass_rate*100:.1f}%", "green") else: - cprint(f"Groundtruth pass rate: {gt_pass_rate*100:.2f}%\nPlease be cautious!", "red") + cprint(f"Groundtruth pass rate: {gt_pass_rate*100:.1f}%\nPlease be cautious!", "red") if len(failed_tasks) > 0: cprint(f"Failed tasks: {failed_tasks}", "red") From 43207e9cd37c1b4a777e82c30eaf6879fd4ed89e Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Wed, 18 Sep 2024 04:24:33 +1000 Subject: 
[PATCH 27/27] update --- bigcodebench/evaluate.py | 87 ++++++++++++++++--------------- bigcodebench/gen/util/__init__.py | 1 + bigcodebench/sanitize.py | 11 +++- 3 files changed, 55 insertions(+), 44 deletions(-) diff --git a/bigcodebench/evaluate.py b/bigcodebench/evaluate.py index 6adcc36c..464af830 100644 --- a/bigcodebench/evaluate.py +++ b/bigcodebench/evaluate.py @@ -23,6 +23,7 @@ ) from bigcodebench.data.utils import CACHE_DIR from bigcodebench.eval import ( + FAIL, PASS, compatible_eval_result, estimate_pass_at_k, @@ -57,7 +58,7 @@ def get_groundtruth(subset, n_workers, problems, hashcode, check_gt_only, max_as for problem in problems.values(): if subset == "tool": - code = problem["canonical_solution"] + code = problem["positive_tool_implementation"] + "\n" + problem["code_before_entry_point"] + "\n" + problem["canonical_solution"] else: code = problem["code_prompt"] + "\n" + problem["canonical_solution"] args = ( @@ -119,8 +120,9 @@ def check_correctness( if split in ["positive", "mixed"]: ret["used_tools"] = (set(used_tools) == set(problem["used_tools"])) else: - assert split == "negative" - ret["used_tools"] = (set(used_tools) == {"refusal_func"}) + ret["used_tools"] = False + else: + ret["used_tools"] = False return ret @@ -195,6 +197,30 @@ def evaluate(flags): ) continue + # Skip execution for empty solutions + if not sample["solution"]: + + if flags.subset == "tool" and flags.split == "negative": + eval_results[task_id].append({ + "completion_id": completion_id[task_id], + "task_id": task_id, + "_identifier": sample["_identifier"], + "solution": sample["solution"], + "used_tools": False, + "base": (FAIL, "No refusal") if "There is no solution can be found based on the provided helper functions." not in sample["solution"] else (PASS, "") + }) + else: + eval_results[task_id].append({ + "completion_id": completion_id[task_id], + "task_id": task_id, + "_identifier": sample["_identifier"], + "solution": sample["solution"], + "used_tools": False, + "base": (FAIL, "Empty solution") + }) + completion_id[task_id] += 1 + n_samples += 1 + if flags.subset == "tool": solution = (sample["solution"] if "solution" in sample @@ -217,19 +243,6 @@ def evaluate(flags): ) if "sanitized-calibrated" in flags.samples: solution = problems[task_id]["code_prompt"] + "\n pass\n" + solution - - # Skip execution for empty solutions - if not solution: - eval_results[task_id].append({ - "completion_id": completion_id[task_id], - "task_id": task_id, - "_identifier": sample["_identifier"], - "solution": solution, - "base": (FAIL, "Empty solution") - }) - completion_id[task_id] += 1 - n_samples += 1 - continue remainings.add(sample["_identifier"]) args = ( @@ -276,31 +289,20 @@ def stucking_checker(): results["eval"][task_id] = [] for res in task_results: stat, details = res["base"] - if flags.subset == "tool": - tool_use = res["used_tools"] - results["eval"][task_id].append( - { - "task_id": task_id, - "solution": res["solution"], - "status": stat, - "details": details, - "tool_use": tool_use, - } - ) - else: - results["eval"][task_id].append( - { - "task_id": task_id, - "solution": res["solution"], - "status": stat, - "details": details, - } - ) + tool_use = res["used_tools"] + results["eval"][task_id].append( + { + "task_id": task_id, + "solution": res["solution"], + "status": stat, + "details": details, + "tool_use": tool_use, + } + ) # Calculate pass@k. 
total = np.array([len(r) for k, r in results["eval"].items() if k in problems]) base_correct = [] - for key, res in results["eval"].items(): if key not in problems: continue @@ -315,7 +317,7 @@ def stucking_checker(): if total.min() >= k } - if flags.subset == "tool": + if flags.subset == "tool" and flags.split != "negative": tool_correct = [] syntax_correct = [] @@ -325,11 +327,10 @@ def stucking_checker(): tc = sum([r["tool_use"] for r in res]) tool_correct.append(tc) - for key, res in results["eval"].items(): - if key not in problems: + for sample in load_solutions(flags.samples): + if sample["task_id"] not in problems: continue - empty_solutions = sum([r["solution"] for r in res]) - syntax_correct.append(empty_solutions) + syntax_correct.append(sample["solution"] != "") tool_correct = np.array(tool_correct) syntax_correct = np.array(syntax_correct) @@ -359,7 +360,7 @@ def stucking_checker(): cprint(f"Failed tasks: {failed_tasks}", "red") for k, v in pass_at_k.items(): - cprint(f"{k}:\t{v*100:.2f}%", "green") + cprint(f"{k}: {v*100:.2f}%", "green") # save results if os.path.isfile(result_path): diff --git a/bigcodebench/gen/util/__init__.py b/bigcodebench/gen/util/__init__.py index d8088ad5..306431a7 100644 --- a/bigcodebench/gen/util/__init__.py +++ b/bigcodebench/gen/util/__init__.py @@ -57,6 +57,7 @@ def trusted_exec(code, test_code, task_id, max_as_limit, max_data_limit, max_sta errors = test_result.failures + test_result.errors if len(errors) > 0: + print(task_id) print(errors) times.value = -1 else: diff --git a/bigcodebench/sanitize.py b/bigcodebench/sanitize.py index ec7f908e..29d092cb 100644 --- a/bigcodebench/sanitize.py +++ b/bigcodebench/sanitize.py @@ -1,6 +1,7 @@ """Post-processing LLM-generated Python code implemented using tree-sitter.""" import os +import re import pathlib from typing import Dict, Generator, List, Optional, Set, Tuple from pqdm.processes import pqdm @@ -168,7 +169,15 @@ def sanitize(code: str, solution: Dict, entrypoint: Optional[str] = None) -> str name, node = pair if entrypoint and not (name in reachable): continue - sanitized_output += code_bytes[node.start_byte : node.end_byte] + b"\n" + node_code = code_bytes[node.start_byte : node.end_byte].decode("utf8") + if node.type == FUNCTION_TYPE and name == entrypoint: + # Remove return type annotation, including unnecessary spaces + node_code = re.sub(r"->\s*[^:]+:", ":", node_code) + # Ensure there is exactly one space before the colon + node_code = re.sub(r'\s*\)(\s*):', ') :', node_code) + node_code = re.sub(r"\s*:", ":", node_code) + node_code = re.sub(r":", " :", node_code) + sanitized_output += node_code.encode("utf8") + b"\n" sanitized_output = sanitized_output[:-1].decode("utf8")
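
The tool@k metric that these patches add hinges on extract_defined_modules: the evaluator parses the tool implementation concatenated with the generated solution using ast, collects which locally defined helpers (and methods on classes instantiated inside the entry-point function) the entry point actually calls, and compares that set for equality against the problem's used_tools field. Below is a minimal sketch of the same idea, simplified to direct calls on top-level helpers only; the helper names and the called_helpers wrapper are illustrative, not part of the repository.

import ast

def called_helpers(code: str, entry_point: str) -> set:
    """Names of top-level functions defined in `code` that `entry_point` calls directly.
    Simplified sketch: unlike the patched extract_defined_modules, it ignores
    methods on classes instantiated inside the entry point."""
    tree = ast.parse(code)
    defined = {node.name for node in tree.body if isinstance(node, ast.FunctionDef)}
    used = set()
    for node in tree.body:
        if isinstance(node, ast.FunctionDef) and node.name == entry_point:
            for sub in ast.walk(node):
                if isinstance(sub, ast.Call) and isinstance(sub.func, ast.Name):
                    used.add(sub.func.id)
    return used & defined

# Hypothetical tool implementation followed by a generated solution.
sample = """
def lookup_user(uid):
    return {"id": uid}

def send_email(addr, body):
    pass

def task_func(uid):
    user = lookup_user(uid)
    return user["id"]
"""
print(called_helpers(sample, "task_func"))  # {'lookup_user'} -> compared against the problem's used_tools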
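
pass@k, tool@k, and syntax@k are all reported through estimate_pass_at_k over per-task sample counts. Assuming it follows the standard unbiased estimator, a task with n samples of which c succeed contributes 1 - C(n-c, k)/C(n, k), and the reported number is the mean over tasks. The small worked example below reimplements that estimator purely for illustration; the repository's own estimate_pass_at_k remains the source of truth.

from math import comb

def unbiased_pass_at_k(n: int, c: int, k: int) -> float:
    # 1 - C(n-c, k) / C(n, k): chance that at least one of k samples drawn
    # from n total (c of them correct) is correct.
    if n - c < k:
        return 1.0
    return 1.0 - comb(n - c, k) / comb(n, k)

# Two hypothetical tasks, 10 samples each; 3 and 0 of those samples both pass
# the tests and call exactly the expected tools.
totals = [10, 10]
tool_correct = [3, 0]
tool_at_1 = sum(unbiased_pass_at_k(n, c, 1) for n, c in zip(totals, tool_correct)) / len(totals)
print(f"tool@1: {tool_at_1 * 100:.2f}%")  # tool@1: 15.00%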