diff --git a/bigcodebench/data/utils.py b/bigcodebench/data/utils.py index fa91abe5..01bc0a2c 100644 --- a/bigcodebench/data/utils.py +++ b/bigcodebench/data/utils.py @@ -149,15 +149,30 @@ def write_directory(directory: PathLike, data: Iterable[Dict]): def completeness_check(name, data): for task_id, task in data.items(): - for key in [ - "complete_prompt", - "instruct_prompt", - "canonical_solution", - "code_prompt", - "test", - "entry_point" - ]: - assert key in task, f"{key} not found in {name} #{task_id}!" + try: + for key in [ + "complete_prompt", + "instruct_prompt", + "canonical_solution", + "code_prompt", + "test", + "entry_point" + ]: + assert key in task, f"{key} not found in {name} #{task_id}!" + except Exception as e: + for key in [ + "complete_prompt", + "positive_tool", + "negative_tool", + "mixed_tool", + "positive_tool_implementation", + "negative_tool_implementation", + "mixed_tool_implementation", + "canonical_solution", + "test", + "entry_point" + ]: + assert key in task, f"{key} not found in {name} #{task_id}!" def to_raw(string): diff --git a/bigcodebench/eval/__init__.py b/bigcodebench/eval/__init__.py index 3596f53d..66f3aabf 100644 --- a/bigcodebench/eval/__init__.py +++ b/bigcodebench/eval/__init__.py @@ -24,6 +24,7 @@ import multiprocessing import os import sys +import ast import time import types import unittest @@ -240,3 +241,80 @@ def evaluate_files( ) ret.append((stat, det.tolist())) return ret + + +def extract_defined_modules(code: str, entry_point: str): + tree = ast.parse(code) + defined_functions = set() + defined_methods = {} + used_functions = set() + used_methods = set() + variable_classes = {} + + class FunctionDefVisitor(ast.NodeVisitor): + def visit_FunctionDef(self, node): + defined_functions.add(node.name) + self.generic_visit(node) + + def visit_ClassDef(self, node): + for item in node.body: + if isinstance(item, ast.FunctionDef): + if node.name not in defined_methods: + defined_methods[node.name] = set() + defined_methods[node.name].add(item.name) + self.generic_visit(node) + + class TaskFuncVisitor(ast.NodeVisitor): + def visit_Assign(self, node): + if isinstance(node.value, ast.Call) and isinstance(node.value.func, ast.Name): + class_name = node.value.func.id + for target in node.targets: + if isinstance(target, ast.Name): + variable_classes[target.id] = class_name + self.generic_visit(node) + + def visit_Call(self, node): + if isinstance(node.func, ast.Name): + used_functions.add(node.func.id) + elif isinstance(node.func, ast.Attribute): + value = node.func.value + if isinstance(value, ast.Name): + var_name = value.id + if var_name in variable_classes: + used_methods.add(f"{variable_classes[var_name]}.{node.func.attr}") + else: + used_methods.add(f"{var_name}.{node.func.attr}") + elif isinstance(value, ast.Attribute): + # Handle nested attributes (e.g., obj.attr.method()) + attr_chain = [node.func.attr] + while isinstance(value, ast.Attribute): + attr_chain.append(value.attr) + value = value.value + if isinstance(value, ast.Name): + var_name = value.id + if var_name in variable_classes: + attr_chain.append(variable_classes[var_name]) + else: + attr_chain.append(var_name) + used_methods.add('.'.join(reversed(attr_chain))) + self.generic_visit(node) + + # First pass: collect all defined functions and methods + FunctionDefVisitor().visit(tree) + + # Second pass: collect used functions and methods within task_func + for node in ast.iter_child_nodes(tree): + if isinstance(node, ast.FunctionDef) and node.name == entry_point: + 
TaskFuncVisitor().visit(node) + break # Assuming there's only one task_func + + # Filter used functions to include only those defined before task_func + result = [func for func in used_functions if func in defined_functions] + + # Filter used methods to include only those defined before task_func + for class_name, methods in defined_methods.items(): + for method in methods: + if any(f"{class_name}.{method}" in used_method for used_method in used_methods): + result.append(f"{class_name}.{method}") + + return result diff --git a/bigcodebench/evaluate.py b/bigcodebench/evaluate.py index 61e2a43f..464af830 100644 --- a/bigcodebench/evaluate.py +++ b/bigcodebench/evaluate.py @@ -5,6 +5,7 @@ import pickle import threading import time +from pqdm.processes import pqdm from collections import Counter, defaultdict from concurrent.futures import ProcessPoolExecutor, as_completed from datetime import datetime @@ -22,10 +23,12 @@ ) from bigcodebench.data.utils import CACHE_DIR from bigcodebench.eval import ( + FAIL, PASS, compatible_eval_result, estimate_pass_at_k, untrusted_check, + extract_defined_modules, ) from bigcodebench.gen.util import trusted_check @@ -34,7 +37,7 @@ Result = Tuple[str, List[bool]] -def get_groundtruth(n_workers, problems, hashcode, check_gt_only, max_as_limit, max_data_limit, max_stack_limit, min_time_limit): +def get_groundtruth(subset, n_workers, problems, hashcode, check_gt_only, max_as_limit, max_data_limit, max_stack_limit, min_time_limit): cache_file = os.path.join(CACHE_DIR, f"{hashcode}.pkl") if os.path.exists(cache_file): if check_gt_only: @@ -54,8 +57,12 @@ def get_groundtruth(n_workers, problems, hashcode, check_gt_only, max_as_limit, expected_time = dict() for problem in problems.values(): + if subset == "tool": + code = problem["positive_tool_implementation"] + "\n" + problem["code_before_entry_point"] + "\n" + problem["canonical_solution"] + else: + code = problem["code_prompt"] + "\n" + problem["canonical_solution"] args = ( - problem["complete_prompt"] + "\n" + problem["canonical_solution"], + code, problem["test"], problem["task_id"], max_as_limit, @@ -80,6 +87,8 @@ def get_groundtruth(n_workers, problems, hashcode, check_gt_only, max_as_limit, return expected_time def check_correctness( + subset: str, + split: str, completion_id: int, problem: Dict[str, Any], solution: str, @@ -89,6 +98,7 @@ def check_correctness( identifier=None, min_time_limit: float = 0.1, gt_time_limit: float = 2.0, + used_tools: List[str] = [], ) -> Dict[str, Result]: # {...}, "base" | "plus" -> (status, details) ret = { "completion_id": completion_id, @@ -106,6 +116,14 @@ def check_correctness( min_time_limit, gt_time_limit, ) + if subset == "tool": + if split in ["positive", "mixed"]: + ret["used_tools"] = (set(used_tools) == set(problem["used_tools"])) + else: + ret["used_tools"] = False + else: + ret["used_tools"] = False + return ret @@ -130,7 +148,7 @@ def evaluate(flags): dataset_hash = get_bigcodebench_hash(subset=flags.subset) if not flags.no_gt: - expected_time = get_groundtruth(n_workers, problems, dataset_hash, flags.check_gt_only, flags.max_as_limit, flags.max_data_limit, flags.max_stack_limit, flags.min_time_limit) + expected_time = get_groundtruth(flags.subset, n_workers, problems, dataset_hash, flags.check_gt_only, flags.max_as_limit, flags.max_data_limit, flags.max_stack_limit, flags.min_time_limit) else: expected_time = {task_id: None for task_id in problems} @@ -171,21 +189,65 @@ def evaluate(flags): print("Reading samples...") for sample in 
tqdm(load_solutions(flags.samples)): task_id = sample["task_id"] + used_tools = [] if task_id not in problems: warn( f"Task {task_id} is found in the samples but not found in the dataset" ) continue - solution = ( - sample["solution"] + + # Skip execution for empty solutions + if not sample["solution"]: + + if flags.subset == "tool" and flags.split == "negative": + eval_results[task_id].append({ + "completion_id": completion_id[task_id], + "task_id": task_id, + "_identifier": sample["_identifier"], + "solution": sample["solution"], + "used_tools": False, + "base": (FAIL, "No refusal") if "There is no solution can be found based on the provided helper functions." not in sample["solution"] else (PASS, "") + }) + else: + eval_results[task_id].append({ + "completion_id": completion_id[task_id], + "task_id": task_id, + "_identifier": sample["_identifier"], + "solution": sample["solution"], + "used_tools": False, + "base": (FAIL, "Empty solution") + }) + completion_id[task_id] += 1 + n_samples += 1 + + if flags.subset == "tool": + solution = (sample["solution"] + if "solution" in sample + else problems[task_id]["complete_prompt"] + sample["completion"] + ) + try: + used_tools = extract_defined_modules(problems[task_id][f"{flags.split}_tool_implementation"] + "\n" + solution, problems[task_id]["entry_point"]) + except Exception as e: + pass + solution = problems[task_id]["positive_tool_implementation"] + solution + if "sanitized-calibrated" in flags.samples: + solution = problems[task_id]["complete_prompt"] + "\n pass\n" + solution + solution = problems[task_id][f"positive_tool_implementation"] + solution + + else: + solution = ( + sample["solution"] if "solution" in sample - else problems[task_id]["complete_prompt"] + sample["completion"] - ) - if "sanitized-calibrated" in flags.samples: - solution = problems[task_id]["code_prompt"] + "\n pass\n" + solution + else problems[task_id]["complete_prompt"] + sample["completion"] + ) + if "sanitized-calibrated" in flags.samples: + solution = problems[task_id]["code_prompt"] + "\n pass\n" + solution + remainings.add(sample["_identifier"]) args = ( + flags.subset, + flags.split, completion_id[task_id], problems[task_id], solution, @@ -194,15 +256,16 @@ def evaluate(flags): flags.max_stack_limit, sample["_identifier"], flags.min_time_limit, - expected_time[task_id] if expected_time[task_id] else 20 + expected_time[task_id] if expected_time[task_id] else 20, + used_tools ) futures.append(executor.submit(check_correctness, *args)) completion_id[task_id] += 1 n_samples += 1 - assert n_samples == len(remainings), "Missing problems in unfinished" + assert n_samples == len(remainings) + sum(len(r) for r in eval_results.values()), "Missing problems in unfinished" assert len(completion_id) == len(problems), "Missing problems in samples" - + def stucking_checker(): while remainings: last_size = len(remainings) @@ -215,7 +278,7 @@ def stucking_checker(): threading.Thread(target=stucking_checker).start() - for future in tqdm(as_completed(futures), total=n_samples): + for future in tqdm(as_completed(futures), total=len(futures)): result = future.result() remainings.remove(result["_identifier"]) eval_results[result["task_id"]].append(result) @@ -226,51 +289,78 @@ def stucking_checker(): results["eval"][task_id] = [] for res in task_results: stat, details = res["base"] + tool_use = res["used_tools"] results["eval"][task_id].append( { "task_id": task_id, "solution": res["solution"], "status": stat, "details": details, + "tool_use": tool_use, } ) # Calculate pass@k. 
total = np.array([len(r) for k, r in results["eval"].items() if k in problems]) base_correct = [] - for key, res in results["eval"].items(): if key not in problems: continue bc = sum([r["status"] == PASS for r in res]) - base_correct.append(bc) - + base_correct.append(bc) + base_correct = np.array(base_correct) - + pass_at_k = { f"pass@{k}": estimate_pass_at_k(total, base_correct, k).mean() for k in [1, 5, 10, 25, 100] if total.min() >= k } + if flags.subset == "tool" and flags.split != "negative": + tool_correct = [] + syntax_correct = [] + + for key, res in results["eval"].items(): + if key not in problems: + continue + tc = sum([r["tool_use"] for r in res]) + tool_correct.append(tc) + + for sample in load_solutions(flags.samples): + if sample["task_id"] not in problems: + continue + syntax_correct.append(sample["solution"] != "") + + tool_correct = np.array(tool_correct) + syntax_correct = np.array(syntax_correct) + + pass_at_k.update({f"tool@{k}": estimate_pass_at_k(total, tool_correct, k).mean() + for k in [1, 5, 10, 25, 100] + if total.min() >= k}) + + pass_at_k.update({f"syntax@{k}": estimate_pass_at_k(total, syntax_correct, k).mean() + for k in [1, 5, 10, 25, 100] + if total.min() >= k}) + mode = "-calibrated" if "sanitized-calibrated" in flags.samples else "" extra = flags.subset.capitalize() flags.split = flags.split.capitalize() - cprint(f"BigCodeBench-{flags.split}{mode} ({extra})", "green") + cprint(f"BigCodeBench-{extra}{mode} ({flags.split})", "green") if flags.no_gt: cprint(f"Groundtruth is not checked", "yellow") else: if gt_pass_rate > 0.99: - cprint(f"Groundtruth pass rate: {gt_pass_rate:.3f}", "green") + cprint(f"Groundtruth pass rate: {gt_pass_rate*100:.1f}%", "green") else: - cprint(f"Groundtruth pass rate: {gt_pass_rate:.3f}\nPlease be cautious!", "red") + cprint(f"Groundtruth pass rate: {gt_pass_rate*100:.1f}%\nPlease be cautious!", "red") if len(failed_tasks) > 0: cprint(f"Failed tasks: {failed_tasks}", "red") for k, v in pass_at_k.items(): - cprint(f"{k}:\t{v:.3f}", "green") + cprint(f"{k}: {v*100:.2f}%", "green") # save results if os.path.isfile(result_path): @@ -323,9 +413,9 @@ def save_pass_at_k(): def main(): parser = argparse.ArgumentParser() parser.add_argument( - "--split", required=True, type=str, choices=["complete", "instruct"] + "--split", required=True, type=str, choices=["complete", "instruct", "positive", "negative", "mixed"] ) - parser.add_argument("--subset", default="hard", type=str, choices=["full", "hard"]) + parser.add_argument("--subset", default="hard", type=str, choices=["full", "hard", "tool"]) parser.add_argument("--samples", required=True, type=str) parser.add_argument("--save_pass_rate", action="store_true") parser.add_argument("--parallel", default=None, type=int) diff --git a/bigcodebench/gen/util/__init__.py b/bigcodebench/gen/util/__init__.py index d8088ad5..306431a7 100644 --- a/bigcodebench/gen/util/__init__.py +++ b/bigcodebench/gen/util/__init__.py @@ -57,6 +57,7 @@ def trusted_exec(code, test_code, task_id, max_as_limit, max_data_limit, max_sta errors = test_result.failures + test_result.errors if len(errors) > 0: + print(task_id) print(errors) times.value = -1 else: diff --git a/bigcodebench/generate.py b/bigcodebench/generate.py index 679300cb..729b93ed 100644 --- a/bigcodebench/generate.py +++ b/bigcodebench/generate.py @@ -37,6 +37,8 @@ def codegen( if model.is_direct_completion() and split == "instruct": raise Exception("Base model does not support direct completion for instruct tasks") + if subset == "tool": + assert split 
in ["positive", "negative", "mixed"], "Tool subset only supports positive, negative, and mixed split" # create save_path if it doesn't exist, e.g., a/b.jsonl dirname = os.path.dirname(save_path) if not os.path.exists(dirname) and dirname != "": @@ -70,9 +72,12 @@ def codegen( sidx = n_samples - nsamples while sidx < n_samples: try: - prompt = task[f"{split}_prompt"] + if subset == "tool": + prompt = task[f"{split}_tool"] + "\n\n" + task["complete_prompt"] + else: + prompt = task[f"{split}_prompt"] except: - raise Exception(f"Invalid split {split}") + raise Exception(f"Invalid split {split} for bigcodebench-{subset}") if strip_newlines: prompt = prompt.strip("\n") outputs = model.codegen( @@ -105,8 +110,8 @@ def codegen( def main(): parser = argparse.ArgumentParser() parser.add_argument("--model", required=True, type=str) - parser.add_argument("--split", required=True, type=str, choices=["complete", "instruct"]) - parser.add_argument("--subset", default="full", type=str, choices=["full", "hard"]) + parser.add_argument("--split", required=True, type=str, choices=["complete", "instruct", "positive", "negative", "mixed"]) + parser.add_argument("--subset", default="full", type=str, choices=["full", "hard", "tool"]) parser.add_argument("--save_path", default=None, type=str) parser.add_argument("--bs", default=1, type=int) parser.add_argument("--n_samples", default=1, type=int) @@ -140,6 +145,8 @@ def main(): model_runner = make_model( model=args.model, backend=args.backend, + subset=args.subset, + split=args.split, batch_size=args.bs, temperature=args.temperature, base_url=args.base_url, diff --git a/bigcodebench/model.py b/bigcodebench/model.py index c5093b0f..9830055e 100644 --- a/bigcodebench/model.py +++ b/bigcodebench/model.py @@ -55,17 +55,28 @@ def extra_eos_for_direct_completion(dataset) -> List[str]: _MAGIC_SPLITTER_ = "-[[]]-this-is-really-our-highest-priority-[[]]-" -def make_chat_prompt(prompt: str, tokenizer: AutoTokenizer) -> str: +def make_chat_prompt(prompt: str, subset: str, split: str, tokenizer: AutoTokenizer) -> str: # directly return prompt if it does not have a tokenizer.chat_template if tokenizer.chat_template is None: return prompt - prompt = f"""\ + if subset == "tool": + prompt = f"""\ +You should only utilize the helper modules given in the markdown code block to provide a self-contained Python script that implements `task_func`. Assume all listed helper modules work as expected on the backend but are not visible for their implementation details. Call `refusal_func` if you cannot find proper helper modules in the code block to solve the task. 
+ +Please complete `task_func` in a markdown code block without using any external library APIs or defining any modules: +``` +{prompt.strip()} +``` +""" + else: + prompt = f"""\ Please provide a self-contained Python script that solves the following problem in a markdown code block: ``` {prompt.strip()} ``` """ + response = f"""\ Below is a Python script with a self-contained function that solves the problem and passes corresponding tests: ```python @@ -86,6 +97,8 @@ class DecoderBase(ABC): def __init__( self, name: str, + subset: str, + split: str, batch_size: int = 1, temperature: float = 0.8, max_new_tokens: int = 1280, @@ -96,6 +109,8 @@ def __init__( ) -> None: print("Initializing a decoder model: {} ...".format(name)) self.name = name + self.subset = subset + self.split = split self.batch_size = batch_size self.temperature = temperature self.eos = EOS @@ -175,7 +190,7 @@ def __init__(self, name: str, **kwargs) -> None: def codegen( self, prompt: str, do_sample: bool = True, num_samples: int = 200 ) -> List[str]: - prompt = make_chat_prompt(prompt, self.tokenizer) + prompt = make_chat_prompt(prompt, self.subset, self.split, self.tokenizer) return VllmDecoder.codegen(self, prompt, do_sample, num_samples) @@ -259,7 +274,7 @@ def __init__(self, name: str, **kwargs): def codegen( self, prompt: str, do_sample: bool = True, num_samples: int = 200 ) -> List[str]: - prompt = make_chat_prompt(prompt, self.tokenizer) + prompt = make_chat_prompt(prompt, self.subset, self.split, self.tokenizer) return HfTorchDecoder.codegen(self, prompt, do_sample, num_samples) @@ -277,10 +292,16 @@ def codegen( # construct prompt fmt = "json_object" if self.name == "gpt-4-1106-preview" else "text" - if fmt == "json_object": - message = r'Please complete the following code snippet by generating JSON like {"code": ""}' + if self.subset == "tool": + if fmt == "json_object": + message = r'Based on the given customized modules, please complete the following code snippet by generating JSON like {"code": ""} without using any external library APIs or defining any modules' + else: + message = r"You should only utilize the helper modules given in the markdown code block to provide a self-contained Python script that implements `task_func`. Assume all listed helper modules work as expected on the backend but are not visible for their implementation details. Call `refusal_func` if you cannot find proper helper modules in the code block to solve the task.\n\nPlease complete `task_func` in a markdown code block without using any external library APIs or defining any modules:" else: - message = r"Please generate self-contained code to complete the following problem:" + if fmt == "json_object": + message = r'Please complete the following code snippet by generating JSON like {"code": ""}' + else: + message = r"Please generate self-contained code to complete the following problem in a markdown code block:" message += f"\n```python\n{prompt.strip()}\n```" @@ -335,14 +356,31 @@ def codegen( batch_size = min(self.batch_size, num_samples) outputs = [] + + if self.subset == "tool": + message = f"""\ +You should only utilize the helper modules given in the markdown code block to provide a self-contained Python script that implements `task_func`. Assume all listed helper modules work as expected on the backend but are not visible for their implementation details. Call `refusal_func` if you cannot find proper helper modules in the code block to solve the task. 
+ +Please complete `task_func` in a markdown code block without using any external library APIs or defining any modules: +``` +{prompt.strip()} +``` +""" + else: + message = f"""\ +Please provide a self-contained Python script that solves the following problem in a markdown code block: +``` +{prompt.strip()} +``` +""" + for _ in range(batch_size): ret = self.client.chat( model=self.name, messages=[ ChatMessage( role="user", - content="Please generate self-contained code to solve the following problem in a Python markdown block:" - + f"\n```python\n{prompt.strip()}\n```", + content=message, ) ], max_tokens=self.max_new_tokens, @@ -383,22 +421,37 @@ def codegen( assert batch_size == 1, "Sampling only supports batch size of 1" outputs = [] + if self.subset == "tool": + message = f"""\ +You should only utilize the helper modules given in the markdown code block to provide a self-contained Python script that implements `task_func`. Assume all listed helper modules work as expected on the backend but are not visible for their implementation details. Call `refusal_func` if you cannot find proper helper modules in the code block to solve the task. + +Please complete `task_func` in a markdown code block without using any external library APIs or defining any modules: +``` +{prompt.strip()} +``` +""" + else: + message = f"""\ +Please provide a self-contained Python script that solves the following problem in a markdown code block: +``` +{prompt.strip()} +``` +""" for _ in range(batch_size): - message = anthropic_request.make_auto_request( - client=self.client, + ret = anthropic_request.make_auto_request( + client=self.client, model=self.name, messages=[ { "role": "user", - "content": "Please generate self-contained code to complete the following problem wrapped in a Python markdown block:" - + f"\n```python\n{prompt.strip()}\n```\n", + "content": message, } ], max_tokens=self.max_new_tokens, stop_sequences=["\n```\n", "\nif "], **kwargs, ) - outputs.append(message.content[0].text) + outputs.append(ret.content[0].text) return outputs @@ -459,12 +512,29 @@ def codegen( model = genai.GenerativeModel(model_name=self.name, generation_config=genai_config, safety_settings=safety_settings) outputs = [] + + if self.subset == "tool": + message = f"""\ +You should only utilize the helper modules given in the markdown code block to provide a self-contained Python script that implements `task_func`. Assume all listed helper modules work as expected on the backend but are not visible for their implementation details. Call `refusal_func` if you cannot find proper helper modules in the code block to solve the task. 
+ +Please complete `task_func` in a markdown code block without using any external library APIs or defining any modules: +``` +{prompt.strip()} +``` +""" + else: + message = f"""\ +Please provide a self-contained Python script that solves the following problem in a markdown code block: +``` +{prompt.strip()} +``` +""" + for _ in range(batch_size): while True: try: response = model.generate_content( - "Please generate self-contained code to complete the following problem wrapped in a Python markdown block:" - + f"\n```python\n{prompt.strip()}\n```", + message, generation_config=genai_config ) output = response.candidates[0].content.parts[0].text @@ -485,6 +555,8 @@ def codegen( def make_model( model: str, backend: str, + subset: str, + split: str, dataset: str = "bigcodebench", batch_size: int = 1, temperature: float = 0.0, @@ -497,6 +569,8 @@ def make_model( if backend == "vllm": return GeneralVllmDecoder( name=model, + subset=subset, + split=split, batch_size=batch_size, temperature=temperature, dataset=dataset, @@ -508,6 +582,8 @@ def make_model( elif backend == "hf": return GenenralHfTorchDecoder( name=model, + subset=subset, + split=split, batch_size=batch_size, temperature=temperature, dataset=dataset, @@ -518,6 +594,8 @@ def make_model( elif backend == "openai": return OpenAIChatDecoder( name=model, + subset=subset, + split=split, batch_size=batch_size, temperature=temperature, base_url=base_url, @@ -525,18 +603,24 @@ def make_model( elif backend == "mistral": return MistralChatDecoder( name=model, + subset=subset, + split=split, batch_size=batch_size, temperature=temperature, ) elif backend == "anthropic": return AnthropicMessageDecoder( name=model, + subset=subset, + split=split, batch_size=batch_size, temperature=temperature, ) elif backend == "google": return GeminiDecoder( name=model, + subset=subset, + split=split, batch_size=batch_size, temperature=temperature, ) \ No newline at end of file diff --git a/bigcodebench/sanitize.py b/bigcodebench/sanitize.py index df9ed4eb..29d092cb 100644 --- a/bigcodebench/sanitize.py +++ b/bigcodebench/sanitize.py @@ -1,6 +1,7 @@ """Post-processing LLM-generated Python code implemented using tree-sitter.""" import os +import re import pathlib from typing import Dict, Generator, List, Optional, Set, Tuple from pqdm.processes import pqdm @@ -15,7 +16,7 @@ write_directory, write_jsonl, ) -from bigcodebench.syncheck import syntax_check +from bigcodebench.syncheck import syntax_check, api_check CLASS_TYPE = "class_definition" FUNCTION_TYPE = "function_definition" @@ -108,7 +109,7 @@ def has_return_statement(node: Node) -> bool: return False -def sanitize(code: str, entrypoint: Optional[str] = None) -> str: +def sanitize(code: str, solution: Dict, entrypoint: Optional[str] = None) -> str: code = code_extract(code.strip()) code_bytes = bytes(code, "utf8") parser = get_parser("python") @@ -116,13 +117,15 @@ def sanitize(code: str, entrypoint: Optional[str] = None) -> str: class_names = set() function_names = set() variable_names = set() - + reachable = set() + root_node = tree.root_node import_nodes = [] definition_nodes = [] for child in root_node.children: if child.type in IMPORT_TYPE: + # if subset != "tool": import_nodes.append(child) elif child.type == CLASS_TYPE: name = get_definition_name(child) @@ -136,8 +139,11 @@ def sanitize(code: str, entrypoint: Optional[str] = None) -> str: if not ( name in function_names or name in variable_names or name in class_names ): + # if name == entrypoint: + # task_func_found = True + # if task_func_found: 
definition_nodes.append((name, child)) - function_names.add(get_definition_name(child)) + function_names.add(name) elif ( child.type == EXPRESSION_TYPE and child.children[0].type == ASSIGNMENT_TYPE ): @@ -146,12 +152,13 @@ def sanitize(code: str, entrypoint: Optional[str] = None) -> str: if not ( name in variable_names or name in function_names or name in class_names ): + # if task_func_found: definition_nodes.append((name, subchild)) variable_names.add(name) if entrypoint: name2deps = get_deps(definition_nodes) - reacheable = get_function_dependency(entrypoint, name2deps) + reachable = get_function_dependency(entrypoint, name2deps) sanitized_output = b"" @@ -160,9 +167,17 @@ def sanitize(code: str, entrypoint: Optional[str] = None) -> str: for pair in definition_nodes: name, node = pair - if entrypoint and not (name in reacheable): + if entrypoint and not (name in reachable): continue - sanitized_output += code_bytes[node.start_byte : node.end_byte] + b"\n" + node_code = code_bytes[node.start_byte : node.end_byte].decode("utf8") + if node.type == FUNCTION_TYPE and name == entrypoint: + # Remove return type annotation, including unnecessary spaces + node_code = re.sub(r"->\s*[^:]+:", ":", node_code) + # Ensure there is exactly one space before the colon + node_code = re.sub(r'\s*\)(\s*):', ') :', node_code) + node_code = re.sub(r"\s*:", ":", node_code) + node_code = re.sub(r":", " :", node_code) + sanitized_output += node_code.encode("utf8") + b"\n" sanitized_output = sanitized_output[:-1].decode("utf8") @@ -176,6 +191,9 @@ def sanitize(code: str, entrypoint: Optional[str] = None) -> str: outer_lines.append(i) if outer_lines: sanitized_output = "\n".join(lines[: outer_lines[-1]]) + # if subset == "tool": + # return "" if api_check(solution[f"{split}_tool_"] + "\n" + sanitized_output) else sanitized_output + # else: return sanitized_output @@ -183,6 +201,7 @@ def process_solution( sample_solution: Dict, dataset: Dict, entry_point: Dict, + subset: str, debug_task: str = None, calibrate: bool = False, is_folder: bool = False, @@ -207,8 +226,10 @@ def process_solution( if calibrate: old_code = old_code.replace("```python\n ", "```python\n"+dataset[task_id]["complete_prompt"]+" ") - new_code = sanitize(code=old_code, entrypoint=function_name) - + new_code = sanitize(code=old_code, solution=sample_solution, entrypoint=function_name) + if subset == "tool": + if api_check(new_code): + new_code = "" # if old code and new code are different, print msg if new_code != old_code: msg = "Sanitized: " + dbg_identifier @@ -220,12 +241,12 @@ def process_solution( def script( - samples: str, inplace: bool = False, debug_task: str = None, calibrate: bool = False, parallel: int=32 + samples: str, subset: str, inplace: bool = False, debug_task: str = None, calibrate: bool = False, parallel: int=32 ): # task_id -> entry_point entry_point = {} # merge two datasets - dataset = {**get_bigcodebench()} + dataset = {**get_bigcodebench(subset=subset)} for task_id, problem in dataset.items(): entry_point[task_id] = problem["entry_point"] @@ -233,18 +254,19 @@ def script( # make a new folder with "-sanitized" suffix is_folder = os.path.isdir(samples) target_path = pathlib.Path(samples) + target_path_name = target_path.name if not inplace: if is_folder: if calibrate: - new_name = target_path.name + "-sanitized-calibrated" + target_path_name = target_path_name + "-sanitized-calibrated" else: - new_name = target_path.name + "-sanitized" + target_path_name = target_path_name + "-sanitized" else: if calibrate: - new_name = 
target_path.name.replace(".jsonl", "-sanitized-calibrated.jsonl") + target_path_name = target_path_name.replace(".jsonl", "-sanitized-calibrated.jsonl") else: - new_name = target_path.name.replace(".jsonl", "-sanitized.jsonl") - target_path = target_path.parent / new_name + target_path_name = target_path_name.replace(".jsonl", "-sanitized.jsonl") + target_path = target_path.parent / target_path_name target_path = str(target_path) nsan = 0 @@ -257,6 +279,7 @@ def script( "sample_solution": sample_solution, "dataset": dataset, "entry_point": entry_point, + "subset": subset, "debug_task": debug_task, "calibrate": calibrate, "is_folder": is_folder, diff --git a/bigcodebench/syncheck.py b/bigcodebench/syncheck.py index 9ea97f4c..566afdbd 100755 --- a/bigcodebench/syncheck.py +++ b/bigcodebench/syncheck.py @@ -10,6 +10,61 @@ from bigcodebench.data import load_solutions +def api_check(code: str) -> bool: + tree = ast.parse(code) + imported_modules = set() + imported_names = {} + + class ApiExtractor(ast.NodeVisitor): + def __init__(self): + self.in_task_func = False + self.uses_library_api = False + + def visit_Import(self, node): + for alias in node.names: + imported_modules.add(alias.name) + if alias.asname: + imported_modules.add(alias.asname) + + def visit_ImportFrom(self, node): + if node.module: + for alias in node.names: + full_name = f'{node.module}.{alias.name}' + imported_names[alias.asname or alias.name] = full_name + + def visit_FunctionDef(self, node): + if node.name == 'task_func': + self.in_task_func = True + self.generic_visit(node) + self.in_task_func = False + else: + self.generic_visit(node) + + def visit_Attribute(self, node): + if self.in_task_func: + attr_chain = [] + current = node + while isinstance(current, ast.Attribute): + attr_chain.append(current.attr) + current = current.value + if isinstance(current, ast.Name): + attr_chain.append(current.id) + attr_chain.reverse() + full_name = '.'.join(attr_chain) + if attr_chain[0] in imported_modules or attr_chain[0] in imported_names: + self.uses_library_api = True + self.generic_visit(node) + + def visit_Name(self, node): + if self.in_task_func: + if node.id in imported_modules or node.id in imported_names: + self.uses_library_api = True + self.generic_visit(node) + + extractor = ApiExtractor() + extractor.visit(tree) + + return extractor.uses_library_api def syntax_check(code, verbose=False): try: diff --git a/tools/fix_v0110.py b/tools/fix_v0110.py index d1d1300c..0b7f75ec 100644 --- a/tools/fix_v0110.py +++ b/tools/fix_v0110.py @@ -19,7 +19,21 @@ def map_ds(sample): "Requirements:\n - sklearn.ensemble\n", "Requirements:\n - pandas\n - sklearn.ensemble\n" ) - + if sample["task_id"] in ["BigCodeBench/241"]: + for k in sample.keys(): + if "prompt" in k: + sample[k] = sample[k].replace( + "The function will plot the original and normalized arrays using matplotlib.", + "The function will plot the original and normalized arrays with a title of 'Original vs. Normalized Data'." + ) + if sample["task_id"] in ["BigCodeBench/267"]: + for k in sample.keys(): + if "prompt" in k: + sample[k] = sample[k].replace( + "Plots and returns the FFT of the signal.", + "Plots and returns the FFT of the signal with a title of 'FFT of the signal'." 
+ ) + return sample if __name__ == "__main__": @@ -28,7 +42,7 @@ def map_ds(sample): hard_ds_dict = load_dataset(BIGCODEBENCH_HARD_HF) ds = ds_dict[BIGCODEBENCH_VERSION] hard_ds = hard_ds_dict[BIGCODEBENCH_VERSION] - function_id = [37] + function_id = [37, 267, 241] new_ds = ds.map(map_ds) new_ds.to_json("BigCodeBench.jsonl")
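
# ---------------------------------------------------------------------------
# Illustrative sketch (assumptions: the patched bigcodebench package is
# installed and exposes extract_defined_modules from bigcodebench.eval, as the
# evaluate.py import in this patch indicates; the sample code string below is
# hypothetical). extract_defined_modules parses the combined tool
# implementation + solution and reports which locally defined functions and
# Class.method pairs are actually invoked inside the entry-point function;
# evaluate.py compares that list against the task's expected "used_tools"
# for the tool subset.
from bigcodebench.eval import extract_defined_modules

sample_code = '''
def helper_sum(a, b):
    return a + b

class Stats:
    def mean(self, xs):
        return sum(xs) / len(xs)

def task_func(xs):
    s = Stats()
    return helper_sum(s.mean(xs), 0)
'''

used = extract_defined_modules(sample_code, "task_func")
print(used)  # expected to contain "helper_sum" and "Stats.mean" (order may vary)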
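
# ---------------------------------------------------------------------------
# Illustrative sketch (assumptions: api_check is importable from
# bigcodebench.syncheck, matching the sanitize.py import in this patch; the
# two code strings are hypothetical). api_check walks the AST and returns
# True when `task_func` references anything imported at module level, which
# sanitize.py uses to blank out tool-subset solutions that fall back to
# external library APIs instead of the provided helper modules.
from bigcodebench.syncheck import api_check

print(api_check("import os\ndef task_func(p):\n    return os.path.exists(p)"))  # expected: True
print(api_check("def task_func(x):\n    return x + 1"))                         # expected: False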