From 10090f1393799cb064957e8bd41c09da19cdf0ce Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Sun, 1 Sep 2024 01:58:36 +0800 Subject: [PATCH 01/27] feat: add tool-use eval --- bigcodebench/evaluate.py | 34 +++++++++++++++++++++++----------- bigcodebench/generate.py | 13 +++++++++---- 2 files changed, 32 insertions(+), 15 deletions(-) diff --git a/bigcodebench/evaluate.py b/bigcodebench/evaluate.py index 61e2a43f..14580a32 100644 --- a/bigcodebench/evaluate.py +++ b/bigcodebench/evaluate.py @@ -34,7 +34,7 @@ Result = Tuple[str, List[bool]] -def get_groundtruth(n_workers, problems, hashcode, check_gt_only, max_as_limit, max_data_limit, max_stack_limit, min_time_limit): +def get_groundtruth(subset, n_workers, problems, hashcode, check_gt_only, max_as_limit, max_data_limit, max_stack_limit, min_time_limit): cache_file = os.path.join(CACHE_DIR, f"{hashcode}.pkl") if os.path.exists(cache_file): if check_gt_only: @@ -54,8 +54,12 @@ def get_groundtruth(n_workers, problems, hashcode, check_gt_only, max_as_limit, expected_time = dict() for problem in problems.values(): + if subset == "tool": + code = problem["canonical_solution"] + else: + code = problem[f"{split}_prompt"] + "\n" + problem["canonical_solution"] args = ( - problem["complete_prompt"] + "\n" + problem["canonical_solution"], + code, problem["test"], problem["task_id"], max_as_limit, @@ -130,7 +134,7 @@ def evaluate(flags): dataset_hash = get_bigcodebench_hash(subset=flags.subset) if not flags.no_gt: - expected_time = get_groundtruth(n_workers, problems, dataset_hash, flags.check_gt_only, flags.max_as_limit, flags.max_data_limit, flags.max_stack_limit, flags.min_time_limit) + expected_time = get_groundtruth(flags.subset, n_workers, problems, dataset_hash, flags.check_gt_only, flags.max_as_limit, flags.max_data_limit, flags.max_stack_limit, flags.min_time_limit) else: expected_time = {task_id: None for task_id in problems} @@ -177,13 +181,21 @@ def evaluate(flags): f"Task {task_id} is found in the samples but not found in the dataset" ) continue - solution = ( - sample["solution"] + + if flags.subset == "tool": + solution = (problems[task_id][f"{flags.split}_prompt"] + "\n\n" + + problems[task_id]["solution"] + if "solution" in sample + else problems[task_id]["complete_prompt"] + sample["completion"] + + "\n\n" + problems[task_id][f"{flags.split}_tool_implementation"]) + else: + solution = ( + sample["solution"] if "solution" in sample - else problems[task_id]["complete_prompt"] + sample["completion"] - ) - if "sanitized-calibrated" in flags.samples: - solution = problems[task_id]["code_prompt"] + "\n pass\n" + solution + else problems[task_id]["complete_prompt"] + sample["completion"] + ) + if "sanitized-calibrated" in flags.samples: + solution = problems[task_id]["code_prompt"] + "\n pass\n" + solution remainings.add(sample["_identifier"]) args = ( completion_id[task_id], @@ -323,9 +335,9 @@ def save_pass_at_k(): def main(): parser = argparse.ArgumentParser() parser.add_argument( - "--split", required=True, type=str, choices=["complete", "instruct"] + "--split", required=True, type=str, choices=["complete", "instruct", "positive", "negative", "mixed"] ) - parser.add_argument("--subset", default="hard", type=str, choices=["full", "hard"]) + parser.add_argument("--subset", default="hard", type=str, choices=["full", "hard", "tool"]) parser.add_argument("--samples", required=True, type=str) parser.add_argument("--save_pass_rate", action="store_true") parser.add_argument("--parallel", default=None, type=int) diff --git 
a/bigcodebench/generate.py b/bigcodebench/generate.py index 679300cb..e1ecc592 100644 --- a/bigcodebench/generate.py +++ b/bigcodebench/generate.py @@ -37,6 +37,8 @@ def codegen( if model.is_direct_completion() and split == "instruct": raise Exception("Base model does not support direct completion for instruct tasks") + if subset == "tool": + assert split in ["positive", "negative", "mixed"], "Tool subset only supports positive, negative, and mixed split" # create save_path if it doesn't exist, e.g., a/b.jsonl dirname = os.path.dirname(save_path) if not os.path.exists(dirname) and dirname != "": @@ -70,9 +72,12 @@ def codegen( sidx = n_samples - nsamples while sidx < n_samples: try: - prompt = task[f"{split}_prompt"] + if split == "tool": + prompt = task[f"{split}_tool"] + "\n\n" + task["complete_prompt"] + else: + prompt = task[f"{split}_prompt"] except: - raise Exception(f"Invalid split {split}") + raise Exception(f"Invalid split {split} for BigCodeBench-{subset}") if strip_newlines: prompt = prompt.strip("\n") outputs = model.codegen( @@ -105,8 +110,8 @@ def codegen( def main(): parser = argparse.ArgumentParser() parser.add_argument("--model", required=True, type=str) - parser.add_argument("--split", required=True, type=str, choices=["complete", "instruct"]) - parser.add_argument("--subset", default="full", type=str, choices=["full", "hard"]) + parser.add_argument("--split", required=True, type=str, choices=["complete", "instruct", "positive", "negative", "mixed"]) + parser.add_argument("--subset", default="full", type=str, choices=["full", "hard", "tool"]) parser.add_argument("--save_path", default=None, type=str) parser.add_argument("--bs", default=1, type=int) parser.add_argument("--n_samples", default=1, type=int) From 3641cfdcbeb5f0a4143d8f9edf7cf8a62612afba Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Sun, 1 Sep 2024 02:05:01 +0800 Subject: [PATCH 02/27] fix: rm split --- bigcodebench/evaluate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigcodebench/evaluate.py b/bigcodebench/evaluate.py index 14580a32..ab86aab9 100644 --- a/bigcodebench/evaluate.py +++ b/bigcodebench/evaluate.py @@ -57,7 +57,7 @@ def get_groundtruth(subset, n_workers, problems, hashcode, check_gt_only, max_as if subset == "tool": code = problem["canonical_solution"] else: - code = problem[f"{split}_prompt"] + "\n" + problem["canonical_solution"] + code = problem["code_prompt"] + "\n" + problem["canonical_solution"] args = ( code, problem["test"], From d616f0e2fc53c15b9fdbf7ab0e470b24758c0831 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Sun, 1 Sep 2024 02:13:34 +0800 Subject: [PATCH 03/27] fix check --- bigcodebench/data/utils.py | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/bigcodebench/data/utils.py b/bigcodebench/data/utils.py index fa91abe5..01bc0a2c 100644 --- a/bigcodebench/data/utils.py +++ b/bigcodebench/data/utils.py @@ -149,15 +149,30 @@ def write_directory(directory: PathLike, data: Iterable[Dict]): def completeness_check(name, data): for task_id, task in data.items(): - for key in [ - "complete_prompt", - "instruct_prompt", - "canonical_solution", - "code_prompt", - "test", - "entry_point" - ]: - assert key in task, f"{key} not found in {name} #{task_id}!" + try: + for key in [ + "complete_prompt", + "instruct_prompt", + "canonical_solution", + "code_prompt", + "test", + "entry_point" + ]: + assert key in task, f"{key} not found in {name} #{task_id}!" 
+ except Exception as e: + for key in [ + "complete_prompt", + "positive_tool", + "negative_tool", + "mixed_tool", + "positive_tool_implementation", + "negative_tool_implementation", + "mixed_tool_implementation", + "canonical_solution", + "test", + "entry_point" + ]: + assert key in task, f"{key} not found in {name} #{task_id}!" def to_raw(string): From beaec382edae4df5dd89cd1c2afe744e8b910bd6 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Sun, 1 Sep 2024 20:40:34 +0800 Subject: [PATCH 04/27] feat: add tool-use gen prompt --- bigcodebench/model.py | 98 ++++++++++++++++++++++++++++++++++++------- 1 file changed, 82 insertions(+), 16 deletions(-) diff --git a/bigcodebench/model.py b/bigcodebench/model.py index c5093b0f..ff69c254 100644 --- a/bigcodebench/model.py +++ b/bigcodebench/model.py @@ -55,18 +55,31 @@ def extra_eos_for_direct_completion(dataset) -> List[str]: _MAGIC_SPLITTER_ = "-[[]]-this-is-really-our-highest-priority-[[]]-" -def make_chat_prompt(prompt: str, tokenizer: AutoTokenizer) -> str: +def make_chat_prompt(prompt: str, subset: str, split: str, tokenizer: AutoTokenizer) -> str: # directly return prompt if it does not have a tokenizer.chat_template if tokenizer.chat_template is None: return prompt - prompt = f"""\ + if subset == "tool": + prompt = f"""\ +Based on the given customized modules, please provide a self-contained answer (e.g., Python script or refusal) that solves the following problem in a markdown code block without using any imports: +``` +{prompt.strip()} +``` +""" + response = f"""\ +Below is a answer (e.g., Python script or refusal) that solves the problem: +{_MAGIC_SPLITTER_} +""" + else: + prompt = f"""\ Please provide a self-contained Python script that solves the following problem in a markdown code block: ``` {prompt.strip()} ``` """ - response = f"""\ + + response = f"""\ Below is a Python script with a self-contained function that solves the problem and passes corresponding tests: ```python {_MAGIC_SPLITTER_} @@ -86,6 +99,8 @@ class DecoderBase(ABC): def __init__( self, name: str, + subset: str, + split: str, batch_size: int = 1, temperature: float = 0.8, max_new_tokens: int = 1280, @@ -96,6 +111,8 @@ def __init__( ) -> None: print("Initializing a decoder model: {} ...".format(name)) self.name = name + self.subset = subset + self.split = split self.batch_size = batch_size self.temperature = temperature self.eos = EOS @@ -175,7 +192,7 @@ def __init__(self, name: str, **kwargs) -> None: def codegen( self, prompt: str, do_sample: bool = True, num_samples: int = 200 ) -> List[str]: - prompt = make_chat_prompt(prompt, self.tokenizer) + prompt = make_chat_prompt(prompt, self.subset, self.split, self.tokenizer) return VllmDecoder.codegen(self, prompt, do_sample, num_samples) @@ -259,7 +276,7 @@ def __init__(self, name: str, **kwargs): def codegen( self, prompt: str, do_sample: bool = True, num_samples: int = 200 ) -> List[str]: - prompt = make_chat_prompt(prompt, self.tokenizer) + prompt = make_chat_prompt(prompt, self.subset, self.split, self.tokenizer) return HfTorchDecoder.codegen(self, prompt, do_sample, num_samples) @@ -277,10 +294,16 @@ def codegen( # construct prompt fmt = "json_object" if self.name == "gpt-4-1106-preview" else "text" - if fmt == "json_object": - message = r'Please complete the following code snippet by generating JSON like {"code": ""}' + if self.subset == "tool": + if fmt == "json_object": + message = r'Based on the given customized modules, please complete the following code snippet without using any imports by generating JSON 
like {"code": ""}' + else: + message = r"Based on the given customized modules, please provide a self-contained answer (e.g., Python script or refusal) that solves the following problem in a markdown code block without using any imports:" else: - message = r"Please generate self-contained code to complete the following problem:" + if fmt == "json_object": + message = r'Please complete the following code snippet by generating JSON like {"code": ""}' + else: + message = r"Please generate self-contained code to complete the following problem in a markdown code block:" message += f"\n```python\n{prompt.strip()}\n```" @@ -335,14 +358,29 @@ def codegen( batch_size = min(self.batch_size, num_samples) outputs = [] + + if self.subset == "tool": + message = f"""\ +Based on the given customized modules, please provide a self-contained answer (e.g., Python script or refusal) that solves the following problem in a markdown code block without using any imports: +``` +{prompt.strip()} +``` +""" + else: + message = f"""\ +Please provide a self-contained Python script that solves the following problem in a markdown code block: +``` +{prompt.strip()} +``` +""" + for _ in range(batch_size): ret = self.client.chat( model=self.name, messages=[ ChatMessage( role="user", - content="Please generate self-contained code to solve the following problem in a Python markdown block:" - + f"\n```python\n{prompt.strip()}\n```", + content=message, ) ], max_tokens=self.max_new_tokens, @@ -383,15 +421,28 @@ def codegen( assert batch_size == 1, "Sampling only supports batch size of 1" outputs = [] - for _ in range(batch_size): - message = anthropic_request.make_auto_request( + if self.subset == "tool": + message = f"""\ +Based on the given customized modules, please provide a self-contained answer (e.g., Python script or refusal) that solves the following problem in a markdown code block without using any imports: +``` +{prompt.strip()} +``` +""" + else: + message = f"""\ +Please provide a self-contained Python script that solves the following problem in a markdown code block: +``` +{prompt.strip()} +``` +""" + + ret = anthropic_request.make_auto_request( client=self.client, model=self.name, messages=[ { "role": "user", - "content": "Please generate self-contained code to complete the following problem wrapped in a Python markdown block:" - + f"\n```python\n{prompt.strip()}\n```\n", + "content": message, } ], max_tokens=self.max_new_tokens, @@ -459,12 +510,27 @@ def codegen( model = genai.GenerativeModel(model_name=self.name, generation_config=genai_config, safety_settings=safety_settings) outputs = [] + + if self.subset == "tool": + message = f"""\ +Based on the given customized modules, please provide a self-contained answer (e.g., Python script or refusal) that solves the following problem in a markdown code block without using any imports: +``` +{prompt.strip()} +``` +""" + else: + message = f"""\ +Please provide a self-contained Python script that solves the following problem in a markdown code block: +``` +{prompt.strip()} +``` +""" + for _ in range(batch_size): while True: try: response = model.generate_content( - "Please generate self-contained code to complete the following problem wrapped in a Python markdown block:" - + f"\n```python\n{prompt.strip()}\n```", + message, generation_config=genai_config ) output = response.candidates[0].content.parts[0].text From 4ef44cae379f31fc6f9a0b64d281582b499948ad Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Sun, 1 Sep 2024 20:57:12 +0800 Subject: [PATCH 05/27] fix anthropic 
gen --- bigcodebench/model.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bigcodebench/model.py b/bigcodebench/model.py index ff69c254..8d6e9535 100644 --- a/bigcodebench/model.py +++ b/bigcodebench/model.py @@ -435,9 +435,9 @@ def codegen( {prompt.strip()} ``` """ - - ret = anthropic_request.make_auto_request( - client=self.client, + for _ in range(batch_size): + ret = anthropic_request.make_auto_request( + client=self.client, model=self.name, messages=[ { @@ -449,7 +449,7 @@ def codegen( stop_sequences=["\n```\n", "\nif "], **kwargs, ) - outputs.append(message.content[0].text) + outputs.append(ret.content[0].text) return outputs From d001ad48f6869fc0b68d01a554cc8a301eace1e5 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Sun, 1 Sep 2024 20:59:02 +0800 Subject: [PATCH 06/27] fix args --- bigcodebench/generate.py | 2 ++ bigcodebench/model.py | 14 ++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/bigcodebench/generate.py b/bigcodebench/generate.py index e1ecc592..da0dfe39 100644 --- a/bigcodebench/generate.py +++ b/bigcodebench/generate.py @@ -145,6 +145,8 @@ def main(): model_runner = make_model( model=args.model, backend=args.backend, + subset=args.subset, + split=args.split, batch_size=args.bs, temperature=args.temperature, base_url=args.base_url, diff --git a/bigcodebench/model.py b/bigcodebench/model.py index 8d6e9535..f704709c 100644 --- a/bigcodebench/model.py +++ b/bigcodebench/model.py @@ -551,6 +551,8 @@ def codegen( def make_model( model: str, backend: str, + subset: str, + split: str, dataset: str = "bigcodebench", batch_size: int = 1, temperature: float = 0.0, @@ -563,6 +565,8 @@ def make_model( if backend == "vllm": return GeneralVllmDecoder( name=model, + subset=subset, + split=split, batch_size=batch_size, temperature=temperature, dataset=dataset, @@ -574,6 +578,8 @@ def make_model( elif backend == "hf": return GenenralHfTorchDecoder( name=model, + subset=subset, + split=split, batch_size=batch_size, temperature=temperature, dataset=dataset, @@ -584,6 +590,8 @@ def make_model( elif backend == "openai": return OpenAIChatDecoder( name=model, + subset=subset, + split=split, batch_size=batch_size, temperature=temperature, base_url=base_url, @@ -591,18 +599,24 @@ def make_model( elif backend == "mistral": return MistralChatDecoder( name=model, + subset=subset, + split=split, batch_size=batch_size, temperature=temperature, ) elif backend == "anthropic": return AnthropicMessageDecoder( name=model, + subset=subset, + split=split, batch_size=batch_size, temperature=temperature, ) elif backend == "google": return GeminiDecoder( name=model, + subset=subset, + split=split, batch_size=batch_size, temperature=temperature, ) \ No newline at end of file From f5171f6466adb267bf55f3dd118b686d983b00f4 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Sun, 1 Sep 2024 21:00:37 +0800 Subject: [PATCH 07/27] fix subset check --- bigcodebench/generate.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bigcodebench/generate.py b/bigcodebench/generate.py index da0dfe39..729b93ed 100644 --- a/bigcodebench/generate.py +++ b/bigcodebench/generate.py @@ -72,12 +72,12 @@ def codegen( sidx = n_samples - nsamples while sidx < n_samples: try: - if split == "tool": + if subset == "tool": prompt = task[f"{split}_tool"] + "\n\n" + task["complete_prompt"] else: prompt = task[f"{split}_prompt"] except: - raise Exception(f"Invalid split {split} for BigCodeBench-{subset}") + raise Exception(f"Invalid split {split} for bigcodebench-{subset}") if 
strip_newlines: prompt = prompt.strip("\n") outputs = model.codegen( From b3be294c2a325ba3aa7fbf5a981fd1b8ce2f6d69 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Mon, 2 Sep 2024 02:07:15 +0800 Subject: [PATCH 08/27] fix prompts --- bigcodebench/model.py | 26 +++++++++++-------- bigcodebench/sanitize.py | 23 ++++++++++++----- bigcodebench/syncheck.py | 55 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 86 insertions(+), 18 deletions(-) diff --git a/bigcodebench/model.py b/bigcodebench/model.py index f704709c..9830055e 100644 --- a/bigcodebench/model.py +++ b/bigcodebench/model.py @@ -62,15 +62,13 @@ def make_chat_prompt(prompt: str, subset: str, split: str, tokenizer: AutoTokeni if subset == "tool": prompt = f"""\ -Based on the given customized modules, please provide a self-contained answer (e.g., Python script or refusal) that solves the following problem in a markdown code block without using any imports: +You should only utilize the helper modules given in the markdown code block to provide a self-contained Python script that implements `task_func`. Assume all listed helper modules work as expected on the backend but are not visible for their implementation details. Call `refusal_func` if you cannot find proper helper modules in the code block to solve the task. + +Please complete `task_func` in a markdown code block without using any external library APIs or defining any modules: ``` {prompt.strip()} ``` """ - response = f"""\ -Below is a answer (e.g., Python script or refusal) that solves the problem: -{_MAGIC_SPLITTER_} -""" else: prompt = f"""\ Please provide a self-contained Python script that solves the following problem in a markdown code block: @@ -79,7 +77,7 @@ def make_chat_prompt(prompt: str, subset: str, split: str, tokenizer: AutoTokeni ``` """ - response = f"""\ + response = f"""\ Below is a Python script with a self-contained function that solves the problem and passes corresponding tests: ```python {_MAGIC_SPLITTER_} @@ -296,9 +294,9 @@ def codegen( fmt = "json_object" if self.name == "gpt-4-1106-preview" else "text" if self.subset == "tool": if fmt == "json_object": - message = r'Based on the given customized modules, please complete the following code snippet without using any imports by generating JSON like {"code": ""}' + message = r'Based on the given customized modules, please complete the following code snippet by generating JSON like {"code": ""} without using any external library APIs or defining any modules' else: - message = r"Based on the given customized modules, please provide a self-contained answer (e.g., Python script or refusal) that solves the following problem in a markdown code block without using any imports:" + message = r"You should only utilize the helper modules given in the markdown code block to provide a self-contained Python script that implements `task_func`. Assume all listed helper modules work as expected on the backend but are not visible for their implementation details. 
Call `refusal_func` if you cannot find proper helper modules in the code block to solve the task.\n\nPlease complete `task_func` in a markdown code block without using any external library APIs or defining any modules:" else: if fmt == "json_object": message = r'Please complete the following code snippet by generating JSON like {"code": ""}' @@ -361,7 +359,9 @@ def codegen( if self.subset == "tool": message = f"""\ -Based on the given customized modules, please provide a self-contained answer (e.g., Python script or refusal) that solves the following problem in a markdown code block without using any imports: +You should only utilize the helper modules given in the markdown code block to provide a self-contained Python script that implements `task_func`. Assume all listed helper modules work as expected on the backend but are not visible for their implementation details. Call `refusal_func` if you cannot find proper helper modules in the code block to solve the task. + +Please complete `task_func` in a markdown code block without using any external library APIs or defining any modules: ``` {prompt.strip()} ``` @@ -423,7 +423,9 @@ def codegen( outputs = [] if self.subset == "tool": message = f"""\ -Based on the given customized modules, please provide a self-contained answer (e.g., Python script or refusal) that solves the following problem in a markdown code block without using any imports: +You should only utilize the helper modules given in the markdown code block to provide a self-contained Python script that implements `task_func`. Assume all listed helper modules work as expected on the backend but are not visible for their implementation details. Call `refusal_func` if you cannot find proper helper modules in the code block to solve the task. + +Please complete `task_func` in a markdown code block without using any external library APIs or defining any modules: ``` {prompt.strip()} ``` @@ -513,7 +515,9 @@ def codegen( if self.subset == "tool": message = f"""\ -Based on the given customized modules, please provide a self-contained answer (e.g., Python script or refusal) that solves the following problem in a markdown code block without using any imports: +You should only utilize the helper modules given in the markdown code block to provide a self-contained Python script that implements `task_func`. Assume all listed helper modules work as expected on the backend but are not visible for their implementation details. Call `refusal_func` if you cannot find proper helper modules in the code block to solve the task. 
+ +Please complete `task_func` in a markdown code block without using any external library APIs or defining any modules: ``` {prompt.strip()} ``` diff --git a/bigcodebench/sanitize.py b/bigcodebench/sanitize.py index df9ed4eb..f120b887 100644 --- a/bigcodebench/sanitize.py +++ b/bigcodebench/sanitize.py @@ -15,7 +15,7 @@ write_directory, write_jsonl, ) -from bigcodebench.syncheck import syntax_check +from bigcodebench.syncheck import syntax_check, api_check CLASS_TYPE = "class_definition" FUNCTION_TYPE = "function_definition" @@ -183,6 +183,7 @@ def process_solution( sample_solution: Dict, dataset: Dict, entry_point: Dict, + check_lib: bool = False, debug_task: str = None, calibrate: bool = False, is_folder: bool = False, @@ -209,6 +210,9 @@ def process_solution( new_code = sanitize(code=old_code, entrypoint=function_name) + if check_lib: + new_code = "" if api_check(new_code) else new_code + # if old code and new code are different, print msg if new_code != old_code: msg = "Sanitized: " + dbg_identifier @@ -220,7 +224,7 @@ def process_solution( def script( - samples: str, inplace: bool = False, debug_task: str = None, calibrate: bool = False, parallel: int=32 + samples: str, check_lib: bool = False, inplace: bool = False, debug_task: str = None, calibrate: bool = False, parallel: int=32 ): # task_id -> entry_point entry_point = {} @@ -235,15 +239,19 @@ def script( target_path = pathlib.Path(samples) if not inplace: if is_folder: - if calibrate: - new_name = target_path.name + "-sanitized-calibrated" + if check_lib: + new_name = target_path.name + "-skip-lib" + elif calibrate: + new_name = new_name + "-sanitized-calibrated" else: - new_name = target_path.name + "-sanitized" + new_name = new_name + "-sanitized" else: + if check_lib: + new_name = target_path.name.replace(".jsonl", "-skip-lib.jsonl") if calibrate: - new_name = target_path.name.replace(".jsonl", "-sanitized-calibrated.jsonl") + new_name = new_name.replace(".jsonl", "-sanitized-calibrated.jsonl") else: - new_name = target_path.name.replace(".jsonl", "-sanitized.jsonl") + new_name = new_name.replace(".jsonl", "-sanitized.jsonl") target_path = target_path.parent / new_name target_path = str(target_path) @@ -257,6 +265,7 @@ def script( "sample_solution": sample_solution, "dataset": dataset, "entry_point": entry_point, + "check_lib": check_lib, "debug_task": debug_task, "calibrate": calibrate, "is_folder": is_folder, diff --git a/bigcodebench/syncheck.py b/bigcodebench/syncheck.py index 9ea97f4c..566afdbd 100755 --- a/bigcodebench/syncheck.py +++ b/bigcodebench/syncheck.py @@ -10,6 +10,61 @@ from bigcodebench.data import load_solutions +def api_check(code: str) -> bool: + tree = ast.parse(code) + imported_modules = set() + imported_names = {} + + class ApiExtractor(ast.NodeVisitor): + def __init__(self): + self.in_task_func = False + self.uses_library_api = False + + def visit_Import(self, node): + for alias in node.names: + imported_modules.add(alias.name) + if alias.asname: + imported_modules.add(alias.asname) + + def visit_ImportFrom(self, node): + if node.module: + for alias in node.names: + full_name = f'{node.module}.{alias.name}' + imported_names[alias.asname or alias.name] = full_name + + def visit_FunctionDef(self, node): + if node.name == 'task_func': + self.in_task_func = True + self.generic_visit(node) + self.in_task_func = False + else: + self.generic_visit(node) + + def visit_Attribute(self, node): + if self.in_task_func: + attr_chain = [] + current = node + while isinstance(current, ast.Attribute): + 
attr_chain.append(current.attr) + current = current.value + if isinstance(current, ast.Name): + attr_chain.append(current.id) + attr_chain.reverse() + full_name = '.'.join(attr_chain) + if attr_chain[0] in imported_modules or attr_chain[0] in imported_names: + self.uses_library_api = True + self.generic_visit(node) + + def visit_Name(self, node): + if self.in_task_func: + if node.id in imported_modules or node.id in imported_names: + self.uses_library_api = True + self.generic_visit(node) + + extractor = ApiExtractor() + extractor.visit(tree) + + return extractor.uses_library_api def syntax_check(code, verbose=False): try: From a3c8e15119a256470be4b55672455cb92a1e6057 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Mon, 2 Sep 2024 03:00:35 +0800 Subject: [PATCH 09/27] fix tool solution postprocess --- bigcodebench/evaluate.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/bigcodebench/evaluate.py b/bigcodebench/evaluate.py index ab86aab9..5dab748e 100644 --- a/bigcodebench/evaluate.py +++ b/bigcodebench/evaluate.py @@ -183,11 +183,12 @@ def evaluate(flags): continue if flags.subset == "tool": - solution = (problems[task_id][f"{flags.split}_prompt"] + "\n\n" - + problems[task_id]["solution"] + solution = (problems[task_id][f"complete_prompt"] + "\n\n" + + sample["solution"] if "solution" in sample else problems[task_id]["complete_prompt"] + sample["completion"] - + "\n\n" + problems[task_id][f"{flags.split}_tool_implementation"]) + ) + solution += "\n\n" + problems[task_id][f"{flags.split}_tool_implementation"] else: solution = ( sample["solution"] From 37125dcb39b127374a2cb616251a82e5a22ea3f9 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Mon, 2 Sep 2024 03:07:15 +0800 Subject: [PATCH 10/27] change eval print --- bigcodebench/evaluate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigcodebench/evaluate.py b/bigcodebench/evaluate.py index 5dab748e..ced61335 100644 --- a/bigcodebench/evaluate.py +++ b/bigcodebench/evaluate.py @@ -269,7 +269,7 @@ def stucking_checker(): mode = "-calibrated" if "sanitized-calibrated" in flags.samples else "" extra = flags.subset.capitalize() flags.split = flags.split.capitalize() - cprint(f"BigCodeBench-{flags.split}{mode} ({extra})", "green") + cprint(f"BigCodeBench-{extra}{mode} ({flags.split})", "green") if flags.no_gt: cprint(f"Groundtruth is not checked", "yellow") From fddd02a446d16cb8ffca001a0e4fd6080e827cc6 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Mon, 2 Sep 2024 03:17:28 +0800 Subject: [PATCH 11/27] fix skip module --- bigcodebench/sanitize.py | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/bigcodebench/sanitize.py b/bigcodebench/sanitize.py index f120b887..d44dc70a 100644 --- a/bigcodebench/sanitize.py +++ b/bigcodebench/sanitize.py @@ -108,7 +108,7 @@ def has_return_statement(node: Node) -> bool: return False -def sanitize(code: str, entrypoint: Optional[str] = None) -> str: +def sanitize(code: str, entrypoint: Optional[str] = None, skip_module: bool = False) -> str: code = code_extract(code.strip()) code_bytes = bytes(code, "utf8") parser = get_parser("python") @@ -116,11 +116,14 @@ def sanitize(code: str, entrypoint: Optional[str] = None) -> str: class_names = set() function_names = set() variable_names = set() - + reacheable = set() + root_node = tree.root_node import_nodes = [] definition_nodes = [] + task_func_found = not skip_module + for child in root_node.children: if child.type in IMPORT_TYPE: import_nodes.append(child) @@ 
-129,15 +132,19 @@ def sanitize(code: str, entrypoint: Optional[str] = None) -> str: if not ( name in class_names or name in variable_names or name in function_names ): - definition_nodes.append((name, child)) + if task_func_found: + definition_nodes.append((name, child)) class_names.add(name) elif child.type == FUNCTION_TYPE: name = get_definition_name(child) if not ( name in function_names or name in variable_names or name in class_names ): - definition_nodes.append((name, child)) - function_names.add(get_definition_name(child)) + if name == entrypoint: + task_func_found = True + if task_func_found: + definition_nodes.append((name, child)) + function_names.add(name) elif ( child.type == EXPRESSION_TYPE and child.children[0].type == ASSIGNMENT_TYPE ): @@ -146,7 +153,8 @@ def sanitize(code: str, entrypoint: Optional[str] = None) -> str: if not ( name in variable_names or name in function_names or name in class_names ): - definition_nodes.append((name, subchild)) + if task_func_found: + definition_nodes.append((name, subchild)) variable_names.add(name) if entrypoint: @@ -176,14 +184,17 @@ def sanitize(code: str, entrypoint: Optional[str] = None) -> str: outer_lines.append(i) if outer_lines: sanitized_output = "\n".join(lines[: outer_lines[-1]]) - return sanitized_output + if skip_module: + return "" if api_check(sanitized_output) else sanitized_output + else: + return sanitized_output def process_solution( sample_solution: Dict, dataset: Dict, entry_point: Dict, - check_lib: bool = False, + skip_module: bool = False, debug_task: str = None, calibrate: bool = False, is_folder: bool = False, @@ -209,9 +220,6 @@ def process_solution( old_code = old_code.replace("```python\n ", "```python\n"+dataset[task_id]["complete_prompt"]+" ") new_code = sanitize(code=old_code, entrypoint=function_name) - - if check_lib: - new_code = "" if api_check(new_code) else new_code # if old code and new code are different, print msg if new_code != old_code: @@ -224,7 +232,7 @@ def process_solution( def script( - samples: str, check_lib: bool = False, inplace: bool = False, debug_task: str = None, calibrate: bool = False, parallel: int=32 + samples: str, skip_module: bool = False, inplace: bool = False, debug_task: str = None, calibrate: bool = False, parallel: int=32 ): # task_id -> entry_point entry_point = {} @@ -239,14 +247,14 @@ def script( target_path = pathlib.Path(samples) if not inplace: if is_folder: - if check_lib: + if skip_module: new_name = target_path.name + "-skip-lib" elif calibrate: new_name = new_name + "-sanitized-calibrated" else: new_name = new_name + "-sanitized" else: - if check_lib: + if skip_module: new_name = target_path.name.replace(".jsonl", "-skip-lib.jsonl") if calibrate: new_name = new_name.replace(".jsonl", "-sanitized-calibrated.jsonl") @@ -265,7 +273,7 @@ def script( "sample_solution": sample_solution, "dataset": dataset, "entry_point": entry_point, - "check_lib": check_lib, + "skip_module": skip_module, "debug_task": debug_task, "calibrate": calibrate, "is_folder": is_folder, From 316e2968d60d00cd1b3745fcb6f50147f29e6a76 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Mon, 2 Sep 2024 03:21:09 +0800 Subject: [PATCH 12/27] fix skip module --- bigcodebench/sanitize.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/bigcodebench/sanitize.py b/bigcodebench/sanitize.py index d44dc70a..33bc2ca1 100644 --- a/bigcodebench/sanitize.py +++ b/bigcodebench/sanitize.py @@ -116,7 +116,7 @@ def sanitize(code: str, entrypoint: Optional[str] = None, skip_module: 
bool = Fa class_names = set() function_names = set() variable_names = set() - reacheable = set() + reachable = set() root_node = tree.root_node import_nodes = [] @@ -159,7 +159,7 @@ def sanitize(code: str, entrypoint: Optional[str] = None, skip_module: bool = Fa if entrypoint: name2deps = get_deps(definition_nodes) - reacheable = get_function_dependency(entrypoint, name2deps) + reachable = get_function_dependency(entrypoint, name2deps) sanitized_output = b"" @@ -168,7 +168,7 @@ def sanitize(code: str, entrypoint: Optional[str] = None, skip_module: bool = Fa for pair in definition_nodes: name, node = pair - if entrypoint and not (name in reacheable): + if entrypoint and not (name in reachable): continue sanitized_output += code_bytes[node.start_byte : node.end_byte] + b"\n" @@ -248,18 +248,18 @@ def script( if not inplace: if is_folder: if skip_module: - new_name = target_path.name + "-skip-lib" + target_path.name = target_path.name + "-skip-lib" elif calibrate: - new_name = new_name + "-sanitized-calibrated" + new_name = target_path.name + "-sanitized-calibrated" else: - new_name = new_name + "-sanitized" + new_name = target_path.name + "-sanitized" else: if skip_module: - new_name = target_path.name.replace(".jsonl", "-skip-lib.jsonl") + target_path.name = target_path.name.replace(".jsonl", "-skip-lib.jsonl") if calibrate: - new_name = new_name.replace(".jsonl", "-sanitized-calibrated.jsonl") + new_name = target_path.name.replace(".jsonl", "-sanitized-calibrated.jsonl") else: - new_name = new_name.replace(".jsonl", "-sanitized.jsonl") + new_name = target_path.name.replace(".jsonl", "-sanitized.jsonl") target_path = target_path.parent / new_name target_path = str(target_path) From aef1c3a0308e49d4233cffc424a1c233e4a10cd2 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Mon, 2 Sep 2024 03:25:48 +0800 Subject: [PATCH 13/27] fix sanitize naming --- bigcodebench/sanitize.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/bigcodebench/sanitize.py b/bigcodebench/sanitize.py index 33bc2ca1..61e5cb5e 100644 --- a/bigcodebench/sanitize.py +++ b/bigcodebench/sanitize.py @@ -245,22 +245,23 @@ def script( # make a new folder with "-sanitized" suffix is_folder = os.path.isdir(samples) target_path = pathlib.Path(samples) + target_path_name = target_path.name if not inplace: if is_folder: if skip_module: - target_path.name = target_path.name + "-skip-lib" + target_path_name = target_path_name + "-sanitized" elif calibrate: - new_name = target_path.name + "-sanitized-calibrated" + target_path_name = target_path_name + "-sanitized-calibrated" else: - new_name = target_path.name + "-sanitized" + target_path_name = target_path_name + "-sanitized" else: if skip_module: - target_path.name = target_path.name.replace(".jsonl", "-skip-lib.jsonl") + target_path_name = target_path_name.replace(".jsonl", "-skip-lib.jsonl") if calibrate: - new_name = target_path.name.replace(".jsonl", "-sanitized-calibrated.jsonl") + target_path_name = target_path_name.replace(".jsonl", "-sanitized-calibrated.jsonl") else: - new_name = target_path.name.replace(".jsonl", "-sanitized.jsonl") - target_path = target_path.parent / new_name + target_path_name = target_path_name.replace(".jsonl", "-sanitized.jsonl") + target_path = target_path.parent / target_path_name target_path = str(target_path) nsan = 0 From 821cf773c14d7adeaaa325c526ae94cff7882a92 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Mon, 2 Sep 2024 03:37:15 +0800 Subject: [PATCH 14/27] fix evaluate --- bigcodebench/evaluate.py | 5 +++-- 
bigcodebench/sanitize.py | 9 ++++++--- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/bigcodebench/evaluate.py b/bigcodebench/evaluate.py index ced61335..9e560119 100644 --- a/bigcodebench/evaluate.py +++ b/bigcodebench/evaluate.py @@ -183,12 +183,13 @@ def evaluate(flags): continue if flags.subset == "tool": - solution = (problems[task_id][f"complete_prompt"] + "\n\n" - + sample["solution"] + solution = (sample["solution"] if "solution" in sample else problems[task_id]["complete_prompt"] + sample["completion"] ) solution += "\n\n" + problems[task_id][f"{flags.split}_tool_implementation"] + if "sanitized-calibrated" in flags.samples: + solution = problems[task_id]["code_prompt"] + "\n pass\n" + solution else: solution = ( sample["solution"] diff --git a/bigcodebench/sanitize.py b/bigcodebench/sanitize.py index 61e5cb5e..431d462a 100644 --- a/bigcodebench/sanitize.py +++ b/bigcodebench/sanitize.py @@ -126,7 +126,8 @@ def sanitize(code: str, entrypoint: Optional[str] = None, skip_module: bool = Fa for child in root_node.children: if child.type in IMPORT_TYPE: - import_nodes.append(child) + if not skip_module: + import_nodes.append(child) elif child.type == CLASS_TYPE: name = get_definition_name(child) if not ( @@ -219,8 +220,10 @@ def process_solution( if calibrate: old_code = old_code.replace("```python\n ", "```python\n"+dataset[task_id]["complete_prompt"]+" ") - new_code = sanitize(code=old_code, entrypoint=function_name) - + new_code = sanitize(code=old_code, entrypoint=function_name, skip_module=skip_module) + if not new_code.startswith("def task_func") and new_code: + print(new_code) + exit() # if old code and new code are different, print msg if new_code != old_code: msg = "Sanitized: " + dbg_identifier From 5ce2928ed8f02e8478c4a2fbabea0289a0a0ab7d Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Mon, 2 Sep 2024 03:37:49 +0800 Subject: [PATCH 15/27] rm debug --- bigcodebench/sanitize.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/bigcodebench/sanitize.py b/bigcodebench/sanitize.py index 431d462a..98199f8c 100644 --- a/bigcodebench/sanitize.py +++ b/bigcodebench/sanitize.py @@ -221,9 +221,7 @@ def process_solution( old_code = old_code.replace("```python\n ", "```python\n"+dataset[task_id]["complete_prompt"]+" ") new_code = sanitize(code=old_code, entrypoint=function_name, skip_module=skip_module) - if not new_code.startswith("def task_func") and new_code: - print(new_code) - exit() + # if old code and new code are different, print msg if new_code != old_code: msg = "Sanitized: " + dbg_identifier From cabdd5a6153fd5e34e33c9b7c0ef018d73b0416d Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Mon, 2 Sep 2024 03:41:55 +0800 Subject: [PATCH 16/27] fix eval --- bigcodebench/evaluate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigcodebench/evaluate.py b/bigcodebench/evaluate.py index 9e560119..79aaae98 100644 --- a/bigcodebench/evaluate.py +++ b/bigcodebench/evaluate.py @@ -189,7 +189,7 @@ def evaluate(flags): ) solution += "\n\n" + problems[task_id][f"{flags.split}_tool_implementation"] if "sanitized-calibrated" in flags.samples: - solution = problems[task_id]["code_prompt"] + "\n pass\n" + solution + solution = problems[task_id]["complete_prompt"] + "\n pass\n" + solution else: solution = ( sample["solution"] From 8aea23890d50a6bfd4827d279ad3b5e44c6cb999 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Mon, 2 Sep 2024 18:56:13 +0800 Subject: [PATCH 17/27] fix output processing --- bigcodebench/evaluate.py | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/bigcodebench/evaluate.py b/bigcodebench/evaluate.py index 79aaae98..b0c5a4f0 100644 --- a/bigcodebench/evaluate.py +++ b/bigcodebench/evaluate.py @@ -187,7 +187,7 @@ def evaluate(flags): if "solution" in sample else problems[task_id]["complete_prompt"] + sample["completion"] ) - solution += "\n\n" + problems[task_id][f"{flags.split}_tool_implementation"] + solution = problems[task_id][f"{flags.split}_tool_implementation"] + solution if "sanitized-calibrated" in flags.samples: solution = problems[task_id]["complete_prompt"] + "\n pass\n" + solution else: From 170f4edf811afe8e4f78ed1f936e51dfe87950f4 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Mon, 2 Sep 2024 19:03:41 +0800 Subject: [PATCH 18/27] fix flag --- bigcodebench/sanitize.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/bigcodebench/sanitize.py b/bigcodebench/sanitize.py index 98199f8c..e247fe3f 100644 --- a/bigcodebench/sanitize.py +++ b/bigcodebench/sanitize.py @@ -108,7 +108,7 @@ def has_return_statement(node: Node) -> bool: return False -def sanitize(code: str, entrypoint: Optional[str] = None, skip_module: bool = False) -> str: +def sanitize(code: str, entrypoint: Optional[str] = None, no_module: bool = False) -> str: code = code_extract(code.strip()) code_bytes = bytes(code, "utf8") parser = get_parser("python") @@ -122,11 +122,11 @@ def sanitize(code: str, entrypoint: Optional[str] = None, skip_module: bool = Fa import_nodes = [] definition_nodes = [] - task_func_found = not skip_module + task_func_found = not no_module for child in root_node.children: if child.type in IMPORT_TYPE: - if not skip_module: + if not no_module: import_nodes.append(child) elif child.type == CLASS_TYPE: name = get_definition_name(child) @@ -185,7 +185,7 @@ def sanitize(code: str, entrypoint: Optional[str] = None, skip_module: bool = Fa outer_lines.append(i) if outer_lines: sanitized_output = "\n".join(lines[: outer_lines[-1]]) - if skip_module: + if no_module: return "" if api_check(sanitized_output) else sanitized_output else: return sanitized_output @@ -195,7 +195,7 @@ def process_solution( sample_solution: Dict, dataset: Dict, entry_point: Dict, - skip_module: bool = False, + no_module: bool = False, debug_task: str = None, calibrate: bool = False, is_folder: bool = False, @@ -220,7 +220,7 @@ def process_solution( if calibrate: old_code = old_code.replace("```python\n ", "```python\n"+dataset[task_id]["complete_prompt"]+" ") - new_code = sanitize(code=old_code, entrypoint=function_name, skip_module=skip_module) + new_code = sanitize(code=old_code, entrypoint=function_name, no_module=no_module) # if old code and new code are different, print msg if new_code != old_code: @@ -233,7 +233,7 @@ def process_solution( def script( - samples: str, skip_module: bool = False, inplace: bool = False, debug_task: str = None, calibrate: bool = False, parallel: int=32 + samples: str, no_module: bool = False, inplace: bool = False, debug_task: str = None, calibrate: bool = False, parallel: int=32 ): # task_id -> entry_point entry_point = {} @@ -249,14 +249,14 @@ def script( target_path_name = target_path.name if not inplace: if is_folder: - if skip_module: + if no_module: target_path_name = target_path_name + "-sanitized" elif calibrate: target_path_name = target_path_name + "-sanitized-calibrated" else: target_path_name = target_path_name + "-sanitized" else: - if skip_module: + if no_module: target_path_name = target_path_name.replace(".jsonl", "-skip-lib.jsonl") if 
calibrate: target_path_name = target_path_name.replace(".jsonl", "-sanitized-calibrated.jsonl") @@ -275,7 +275,7 @@ def script( "sample_solution": sample_solution, "dataset": dataset, "entry_point": entry_point, - "skip_module": skip_module, + "no_module": no_module, "debug_task": debug_task, "calibrate": calibrate, "is_folder": is_folder, From 8f436048f4dfae8cda321d08ccb0fd1ae7ccb983 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Mon, 2 Sep 2024 20:02:47 +0800 Subject: [PATCH 19/27] fix output processing --- bigcodebench/evaluate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigcodebench/evaluate.py b/bigcodebench/evaluate.py index b0c5a4f0..8f7edd5c 100644 --- a/bigcodebench/evaluate.py +++ b/bigcodebench/evaluate.py @@ -187,9 +187,9 @@ def evaluate(flags): if "solution" in sample else problems[task_id]["complete_prompt"] + sample["completion"] ) - solution = problems[task_id][f"{flags.split}_tool_implementation"] + solution if "sanitized-calibrated" in flags.samples: solution = problems[task_id]["complete_prompt"] + "\n pass\n" + solution + solution = problems[task_id][f"{flags.split}_tool_implementation"] + solution else: solution = ( sample["solution"] From e2a0e702ed8683897a1814bc063326ceeea62418 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Tue, 3 Sep 2024 20:28:19 +0800 Subject: [PATCH 20/27] fix tools --- bigcodebench/evaluate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bigcodebench/evaluate.py b/bigcodebench/evaluate.py index 8f7edd5c..58d0753e 100644 --- a/bigcodebench/evaluate.py +++ b/bigcodebench/evaluate.py @@ -189,7 +189,7 @@ def evaluate(flags): ) if "sanitized-calibrated" in flags.samples: solution = problems[task_id]["complete_prompt"] + "\n pass\n" + solution - solution = problems[task_id][f"{flags.split}_tool_implementation"] + solution + solution = problems[task_id][f"positive_tool_implementation"] + solution else: solution = ( sample["solution"] From 3a00d9eae192ef5b11b363a2126f781c1e5f7989 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Tue, 3 Sep 2024 20:53:22 +0800 Subject: [PATCH 21/27] fix some missing titles --- tools/fix_v0110.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/tools/fix_v0110.py b/tools/fix_v0110.py index d1d1300c..0b7f75ec 100644 --- a/tools/fix_v0110.py +++ b/tools/fix_v0110.py @@ -19,7 +19,21 @@ def map_ds(sample): "Requirements:\n - sklearn.ensemble\n", "Requirements:\n - pandas\n - sklearn.ensemble\n" ) - + if sample["task_id"] in ["BigCodeBench/241"]: + for k in sample.keys(): + if "prompt" in k: + sample[k] = sample[k].replace( + "The function will plot the original and normalized arrays using matplotlib.", + "The function will plot the original and normalized arrays with a title of 'Original vs. Normalized Data'." + ) + if sample["task_id"] in ["BigCodeBench/267"]: + for k in sample.keys(): + if "prompt" in k: + sample[k] = sample[k].replace( + "Plots and returns the FFT of the signal.", + "Plots and returns the FFT of the signal with a title of 'FFT of the signal'." 
+ ) + return sample if __name__ == "__main__": @@ -28,7 +42,7 @@ def map_ds(sample): hard_ds_dict = load_dataset(BIGCODEBENCH_HARD_HF) ds = ds_dict[BIGCODEBENCH_VERSION] hard_ds = hard_ds_dict[BIGCODEBENCH_VERSION] - function_id = [37] + function_id = [37, 267, 241] new_ds = ds.map(map_ds) new_ds.to_json("BigCodeBench.jsonl") From fe3ac569c22b7c9711da5e93e25a3a5a1bf5db87 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Tue, 3 Sep 2024 20:23:45 +0800 Subject: [PATCH 22/27] fix tools to avoid dependency conflicts --- bigcodebench/evaluate.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bigcodebench/evaluate.py b/bigcodebench/evaluate.py index 58d0753e..35d9b55d 100644 --- a/bigcodebench/evaluate.py +++ b/bigcodebench/evaluate.py @@ -187,6 +187,7 @@ def evaluate(flags): if "solution" in sample else problems[task_id]["complete_prompt"] + sample["completion"] ) + solution = problems[task_id]["positive_tool_implementation"] + solution if "sanitized-calibrated" in flags.samples: solution = problems[task_id]["complete_prompt"] + "\n pass\n" + solution solution = problems[task_id][f"positive_tool_implementation"] + solution From be9b0166d6d99386a8a9dd7e707b5d9ed221d4bc Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Wed, 4 Sep 2024 08:47:05 +0000 Subject: [PATCH 23/27] fix sanitize logic and skip empty in eval --- bigcodebench/evaluate.py | 18 ++++++++++++-- bigcodebench/sanitize.py | 52 +++++++++++++++++++--------------------- 2 files changed, 40 insertions(+), 30 deletions(-) diff --git a/bigcodebench/evaluate.py b/bigcodebench/evaluate.py index 35d9b55d..649a53b4 100644 --- a/bigcodebench/evaluate.py +++ b/bigcodebench/evaluate.py @@ -199,6 +199,20 @@ def evaluate(flags): ) if "sanitized-calibrated" in flags.samples: solution = problems[task_id]["code_prompt"] + "\n pass\n" + solution + + # Skip execution for empty solutions + if not solution.strip(): + eval_results[task_id].append({ + "completion_id": completion_id[task_id], + "task_id": task_id, + "_identifier": sample["_identifier"], + "solution": solution, + "base": (FAIL, "Empty solution") + }) + completion_id[task_id] += 1 + n_samples += 1 + continue + remainings.add(sample["_identifier"]) args = ( completion_id[task_id], @@ -215,7 +229,7 @@ def evaluate(flags): completion_id[task_id] += 1 n_samples += 1 - assert n_samples == len(remainings), "Missing problems in unfinished" + assert n_samples == len(remainings) + sum(len(r) for r in eval_results.values()), "Missing problems in unfinished" assert len(completion_id) == len(problems), "Missing problems in samples" def stucking_checker(): @@ -230,7 +244,7 @@ def stucking_checker(): threading.Thread(target=stucking_checker).start() - for future in tqdm(as_completed(futures), total=n_samples): + for future in tqdm(as_completed(futures), total=len(futures)): result = future.result() remainings.remove(result["_identifier"]) eval_results[result["task_id"]].append(result) diff --git a/bigcodebench/sanitize.py b/bigcodebench/sanitize.py index e247fe3f..a915844a 100644 --- a/bigcodebench/sanitize.py +++ b/bigcodebench/sanitize.py @@ -108,7 +108,7 @@ def has_return_statement(node: Node) -> bool: return False -def sanitize(code: str, entrypoint: Optional[str] = None, no_module: bool = False) -> str: +def sanitize(code: str, solution: Dict, entrypoint: Optional[str] = None) -> str: code = code_extract(code.strip()) code_bytes = bytes(code, "utf8") parser = get_parser("python") @@ -122,29 +122,26 @@ def sanitize(code: str, entrypoint: Optional[str] = None, no_module: bool = Fals import_nodes = [] 
definition_nodes = [] - task_func_found = not no_module - for child in root_node.children: if child.type in IMPORT_TYPE: - if not no_module: - import_nodes.append(child) + # if subset != "tool": + import_nodes.append(child) elif child.type == CLASS_TYPE: name = get_definition_name(child) if not ( name in class_names or name in variable_names or name in function_names ): - if task_func_found: - definition_nodes.append((name, child)) + definition_nodes.append((name, child)) class_names.add(name) elif child.type == FUNCTION_TYPE: name = get_definition_name(child) if not ( name in function_names or name in variable_names or name in class_names ): - if name == entrypoint: - task_func_found = True - if task_func_found: - definition_nodes.append((name, child)) + # if name == entrypoint: + # task_func_found = True + # if task_func_found: + definition_nodes.append((name, child)) function_names.add(name) elif ( child.type == EXPRESSION_TYPE and child.children[0].type == ASSIGNMENT_TYPE @@ -154,8 +151,8 @@ def sanitize(code: str, entrypoint: Optional[str] = None, no_module: bool = Fals if not ( name in variable_names or name in function_names or name in class_names ): - if task_func_found: - definition_nodes.append((name, subchild)) + # if task_func_found: + definition_nodes.append((name, subchild)) variable_names.add(name) if entrypoint: @@ -185,17 +182,17 @@ def sanitize(code: str, entrypoint: Optional[str] = None, no_module: bool = Fals outer_lines.append(i) if outer_lines: sanitized_output = "\n".join(lines[: outer_lines[-1]]) - if no_module: - return "" if api_check(sanitized_output) else sanitized_output - else: - return sanitized_output + # if subset == "tool": + # return "" if api_check(solution[f"{split}_tool_"] + "\n" + sanitized_output) else sanitized_output + # else: + return sanitized_output def process_solution( sample_solution: Dict, dataset: Dict, entry_point: Dict, - no_module: bool = False, + subset: str, debug_task: str = None, calibrate: bool = False, is_folder: bool = False, @@ -220,8 +217,10 @@ def process_solution( if calibrate: old_code = old_code.replace("```python\n ", "```python\n"+dataset[task_id]["complete_prompt"]+" ") - new_code = sanitize(code=old_code, entrypoint=function_name, no_module=no_module) - + new_code = sanitize(code=old_code, solution=sample_solution, entrypoint=function_name) + if subset == "tool": + if api_check(new_code): + new_code = "" # if old code and new code are different, print msg if new_code != old_code: msg = "Sanitized: " + dbg_identifier @@ -233,12 +232,12 @@ def process_solution( def script( - samples: str, no_module: bool = False, inplace: bool = False, debug_task: str = None, calibrate: bool = False, parallel: int=32 + samples: str, subset: str, inplace: bool = False, debug_task: str = None, calibrate: bool = False, parallel: int=32 ): # task_id -> entry_point entry_point = {} # merge two datasets - dataset = {**get_bigcodebench()} + dataset = {**get_bigcodebench(subset=subset)} for task_id, problem in dataset.items(): entry_point[task_id] = problem["entry_point"] @@ -249,15 +248,11 @@ def script( target_path_name = target_path.name if not inplace: if is_folder: - if no_module: - target_path_name = target_path_name + "-sanitized" - elif calibrate: + if calibrate: target_path_name = target_path_name + "-sanitized-calibrated" else: target_path_name = target_path_name + "-sanitized" else: - if no_module: - target_path_name = target_path_name.replace(".jsonl", "-skip-lib.jsonl") if calibrate: target_path_name = 
target_path_name.replace(".jsonl", "-sanitized-calibrated.jsonl")
             else:
                 target_path_name = target_path_name.replace(".jsonl", "-sanitized.jsonl")
@@ -275,7 +270,7 @@ def script(
             "sample_solution": sample_solution,
             "dataset": dataset,
             "entry_point": entry_point,
-            "no_module": no_module,
+            "subset": subset,
             "debug_task": debug_task,
             "calibrate": calibrate,
             "is_folder": is_folder,
@@ -288,6 +283,7 @@ def script(
 
     for result in results:
         if result is not None:
+            print(result)
             new_solutions.append(result)
             nsan += 1
         ntotal += 1

From 2e153a66a0fea1b4f9d65ac19c60522ad9427b62 Mon Sep 17 00:00:00 2001
From: Terry Zhuo
Date: Thu, 5 Sep 2024 00:26:30 +0800
Subject: [PATCH 24/27] calculate pass_tool@k

---
 bigcodebench/eval/__init__.py |  78 ++++++++++++++++
 bigcodebench/evaluate.py      | 163 ++++++++++++++++++----------------
 bigcodebench/sanitize.py      |   1 -
 3 files changed, 162 insertions(+), 80 deletions(-)

diff --git a/bigcodebench/eval/__init__.py b/bigcodebench/eval/__init__.py
index 3596f53d..06a3a570 100644
--- a/bigcodebench/eval/__init__.py
+++ b/bigcodebench/eval/__init__.py
@@ -24,6 +24,7 @@
 import multiprocessing
 import os
 import sys
+import ast
 import time
 import types
 import unittest
@@ -240,3 +241,80 @@ def evaluate_files(
         )
         ret.append((stat, det.tolist()))
     return ret
+
+
+def extract_defined_modules(code: str, entry_point: str):
+    tree = ast.parse(code)
+    defined_functions = set()
+    defined_methods = {}
+    used_functions = set()
+    used_methods = set()
+    variable_classes = {}
+
+    class FunctionDefVisitor(ast.NodeVisitor):
+        def visit_FunctionDef(self, node):
+            defined_functions.add(node.name)
+            self.generic_visit(node)
+
+        def visit_ClassDef(self, node):
+            for item in node.body:
+                if isinstance(item, ast.FunctionDef):
+                    if node.name not in defined_methods:
+                        defined_methods[node.name] = set()
+                    defined_methods[node.name].add(item.name)
+            self.generic_visit(node)
+
+    class TaskFuncVisitor(ast.NodeVisitor):
+        def visit_Assign(self, node):
+            if isinstance(node.value, ast.Call) and isinstance(node.value.func, ast.Name):
+                class_name = node.value.func.id
+                for target in node.targets:
+                    if isinstance(target, ast.Name):
+                        variable_classes[target.id] = class_name
+            self.generic_visit(node)
+
+        def visit_Call(self, node):
+            if isinstance(node.func, ast.Name):
+                used_functions.add(node.func.id)
+            elif isinstance(node.func, ast.Attribute):
+                value = node.func.value
+                if isinstance(value, ast.Name):
+                    var_name = value.id
+                    if var_name in variable_classes:
+                        used_methods.add(f"{variable_classes[var_name]}.{node.func.attr}")
+                    else:
+                        used_methods.add(f"{var_name}.{node.func.attr}")
+                elif isinstance(value, ast.Attribute):
+                    # Handle nested attributes (e.g., obj.attr.method())
+                    attr_chain = [node.func.attr]
+                    while isinstance(value, ast.Attribute):
+                        attr_chain.append(value.attr)
+                        value = value.value
+                    if isinstance(value, ast.Name):
+                        var_name = value.id
+                        if var_name in variable_classes:
+                            attr_chain.append(variable_classes[var_name])
+                        else:
+                            attr_chain.append(var_name)
+                    used_methods.add('.'.join(reversed(attr_chain)))
+            self.generic_visit(node)
+
+    # First pass: collect all defined functions and methods
+    FunctionDefVisitor().visit(tree)
+
+    # Second pass: collect used functions and methods within task_func
+    for node in ast.iter_child_nodes(tree):
+        if isinstance(node, ast.FunctionDef) and node.name == 'task_func':
+            TaskFuncVisitor().visit(node)
+            break # Assuming there's only one task_func
+
+    # Filter used functions to include only those defined before task_func
+    result = [func for func in used_functions if func in defined_functions]
+
+    # Filter used methods to include 
only those defined before task_func + for class_name, methods in defined_methods.items(): + for method in methods: + if any(f"{class_name}.{method}" in used_method for used_method in used_methods): + result.append(f"{class_name}.{method}") + + return result diff --git a/bigcodebench/evaluate.py b/bigcodebench/evaluate.py index 649a53b4..db0fbde9 100644 --- a/bigcodebench/evaluate.py +++ b/bigcodebench/evaluate.py @@ -26,6 +26,7 @@ compatible_eval_result, estimate_pass_at_k, untrusted_check, + extract_defined_modules, ) from bigcodebench.gen.util import trusted_check @@ -93,6 +94,7 @@ def check_correctness( identifier=None, min_time_limit: float = 0.1, gt_time_limit: float = 2.0, + used_modules: List[str] = [], ) -> Dict[str, Result]: # {...}, "base" | "plus" -> (status, details) ret = { "completion_id": completion_id, @@ -110,6 +112,10 @@ def check_correctness( min_time_limit, gt_time_limit, ) + if used_modules: + ret["used_modules"] = (set(used_modules) == set(problems["used_modules"])) + else: + ret["used_modules"] = True return ret @@ -165,89 +171,75 @@ def evaluate(flags): "eval": {}, } - with ProcessPoolExecutor(max_workers=n_workers) as executor: - futures = [] - completion_id = Counter() - n_samples = 0 - eval_results = defaultdict(list) # task_id -> - remainings = set() + completion_id = Counter() + n_samples = 0 + eval_results = defaultdict(list) # task_id -> + remainings = set() - print("Reading samples...") - for sample in tqdm(load_solutions(flags.samples)): - task_id = sample["task_id"] - - if task_id not in problems: - warn( - f"Task {task_id} is found in the samples but not found in the dataset" - ) - continue - - if flags.subset == "tool": - solution = (sample["solution"] - if "solution" in sample - else problems[task_id]["complete_prompt"] + sample["completion"] - ) - solution = problems[task_id]["positive_tool_implementation"] + solution - if "sanitized-calibrated" in flags.samples: - solution = problems[task_id]["complete_prompt"] + "\n pass\n" + solution - solution = problems[task_id][f"positive_tool_implementation"] + solution - else: - solution = ( - sample["solution"] - if "solution" in sample - else problems[task_id]["complete_prompt"] + sample["completion"] - ) - if "sanitized-calibrated" in flags.samples: - solution = problems[task_id]["code_prompt"] + "\n pass\n" + solution - - # Skip execution for empty solutions - if not solution.strip(): - eval_results[task_id].append({ - "completion_id": completion_id[task_id], - "task_id": task_id, - "_identifier": sample["_identifier"], - "solution": solution, - "base": (FAIL, "Empty solution") - }) - completion_id[task_id] += 1 - n_samples += 1 - continue - - remainings.add(sample["_identifier"]) - args = ( - completion_id[task_id], - problems[task_id], - solution, - flags.max_as_limit, - flags.max_data_limit, - flags.max_stack_limit, - sample["_identifier"], - flags.min_time_limit, - expected_time[task_id] if expected_time[task_id] else 20 + print("Reading samples...") + samples = list(load_solutions(flags.samples)) + + def process_sample(sample): + task_id = sample["task_id"] + + if task_id not in problems: + warn(f"Task {task_id} is found in the samples but not found in the dataset") + return None + + used_modules = [] + if flags.subset == "tool": + solution = (sample["solution"] + if "solution" in sample + else problems[task_id]["complete_prompt"] + sample["completion"] + ) + solution = problems[task_id]["positive_tool_implementation"] + solution + if "sanitized-calibrated" in flags.samples: + solution = 
problems[task_id]["complete_prompt"] + "\n pass\n" + solution + solution = problems[task_id][f"positive_tool_implementation"] + solution + used_modules = extract_defined_modules(problems[task_id][f"{flags.split}_tool_implementation"] + "\n" + solution, problems[task_id]["entry_point"]) + else: + solution = ( + sample["solution"] + if "solution" in sample + else problems[task_id]["complete_prompt"] + sample["completion"] ) - futures.append(executor.submit(check_correctness, *args)) - completion_id[task_id] += 1 - n_samples += 1 - - assert n_samples == len(remainings) + sum(len(r) for r in eval_results.values()), "Missing problems in unfinished" - assert len(completion_id) == len(problems), "Missing problems in samples" - - def stucking_checker(): - while remainings: - last_size = len(remainings) - time.sleep(240) - if last_size != len(remainings) or len(remainings) == 0: - continue - # Potential stucking - warn("No samples had finished testing in the last 240s") - warn(f"{len(remainings)} samples to be tested: {remainings}") + if "sanitized-calibrated" in flags.samples: + solution = problems[task_id]["code_prompt"] + "\n pass\n" + solution + + # Skip execution for empty solutions + if not solution.strip(): + return { + "completion_id": completion_id[task_id], + "task_id": task_id, + "_identifier": sample["_identifier"], + "solution": solution, + "base": (FAIL, "Empty solution") + } + + args = ( + completion_id[task_id], + problems[task_id], + solution, + flags.max_as_limit, + flags.max_data_limit, + flags.max_stack_limit, + sample["_identifier"], + flags.min_time_limit, + expected_time[task_id] if expected_time[task_id] else 20, + used_modules, + ) + return check_correctness(*args) - threading.Thread(target=stucking_checker).start() + results_list = pqdm(samples, process_sample, n_jobs=n_workers, desc="Processing samples") - for future in tqdm(as_completed(futures), total=len(futures)): - result = future.result() - remainings.remove(result["_identifier"]) + for result in results_list: + if result is not None: eval_results[result["task_id"]].append(result) + completion_id[result["task_id"]] += 1 + n_samples += 1 + + assert n_samples == sum(len(r) for r in eval_results.values()), "Missing problems in unfinished" + assert len(completion_id) == len(problems), "Missing problems in samples" # sort the results for each problem by completion_id for task_id, task_results in eval_results.items(): @@ -255,12 +247,14 @@ def stucking_checker(): results["eval"][task_id] = [] for res in task_results: stat, details = res["base"] + tool_use = res["used_modules"] results["eval"][task_id].append( { "task_id": task_id, "solution": res["solution"], "status": stat, "details": details, + "tool_use": tool_use, } ) @@ -275,13 +269,19 @@ def stucking_checker(): base_correct.append(bc) base_correct = np.array(base_correct) - + tool_correct = np.array([r["tool_use"] for r in results["eval"].values()]) + pass_at_k = { f"pass@{k}": estimate_pass_at_k(total, base_correct, k).mean() for k in [1, 5, 10, 25, 100] if total.min() >= k } + if flags.subset == "tool": + pass_at_k.update({f"pass_tool@{k}": estimate_pass_at_k(total, tool_correct, k).mean() + for k in [1, 5, 10, 25, 100] + if total.min() >= k}) + mode = "-calibrated" if "sanitized-calibrated" in flags.samples else "" extra = flags.subset.capitalize() flags.split = flags.split.capitalize() @@ -300,6 +300,11 @@ def stucking_checker(): for k, v in pass_at_k.items(): cprint(f"{k}:\t{v:.3f}", "green") + + if flags.subset == "tool": + for k, v in pass_at_k.items(): + if 
k.startswith("pass_tool@"): + cprint(f"{k}:\t{v:.3f}", "green") # save results if os.path.isfile(result_path): diff --git a/bigcodebench/sanitize.py b/bigcodebench/sanitize.py index a915844a..ec7f908e 100644 --- a/bigcodebench/sanitize.py +++ b/bigcodebench/sanitize.py @@ -283,7 +283,6 @@ def script( for result in results: if result is not None: - print(result) new_solutions.append(result) nsan += 1 ntotal += 1 From 077da747151cebad49c30953407cefbae7fe239b Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Thu, 5 Sep 2024 03:59:42 +1000 Subject: [PATCH 25/27] fix tool eval --- bigcodebench/eval/__init__.py | 2 +- bigcodebench/evaluate.py | 234 +++++++++++++++++++++------------- 2 files changed, 143 insertions(+), 93 deletions(-) diff --git a/bigcodebench/eval/__init__.py b/bigcodebench/eval/__init__.py index 06a3a570..66f3aabf 100644 --- a/bigcodebench/eval/__init__.py +++ b/bigcodebench/eval/__init__.py @@ -304,7 +304,7 @@ def visit_Call(self, node): # Second pass: collect used functions and methods within task_func for node in ast.iter_child_nodes(tree): - if isinstance(node, ast.FunctionDef) and node.name == 'task_func': + if isinstance(node, ast.FunctionDef) and node.name == entry_point: TaskFuncVisitor().visit(node) break # Assuming there's only one task_func diff --git a/bigcodebench/evaluate.py b/bigcodebench/evaluate.py index db0fbde9..48681deb 100644 --- a/bigcodebench/evaluate.py +++ b/bigcodebench/evaluate.py @@ -5,6 +5,7 @@ import pickle import threading import time +from pqdm.processes import pqdm from collections import Counter, defaultdict from concurrent.futures import ProcessPoolExecutor, as_completed from datetime import datetime @@ -85,6 +86,8 @@ def get_groundtruth(subset, n_workers, problems, hashcode, check_gt_only, max_as return expected_time def check_correctness( + subset: str, + split: str, completion_id: int, problem: Dict[str, Any], solution: str, @@ -94,7 +97,7 @@ def check_correctness( identifier=None, min_time_limit: float = 0.1, gt_time_limit: float = 2.0, - used_modules: List[str] = [], + used_tools: List[str] = [], ) -> Dict[str, Result]: # {...}, "base" | "plus" -> (status, details) ret = { "completion_id": completion_id, @@ -112,10 +115,13 @@ def check_correctness( min_time_limit, gt_time_limit, ) - if used_modules: - ret["used_modules"] = (set(used_modules) == set(problems["used_modules"])) - else: - ret["used_modules"] = True + if subset == "tool": + if split in ["positive", "mixed"]: + ret["used_tools"] = (set(used_tools) == set(problem["used_tools"])) + else: + assert split == "negative" + ret["used_tools"] = (set(used_tools) == {"refusal_func"}) + return ret @@ -171,75 +177,97 @@ def evaluate(flags): "eval": {}, } - completion_id = Counter() - n_samples = 0 - eval_results = defaultdict(list) # task_id -> - remainings = set() + with ProcessPoolExecutor(max_workers=n_workers) as executor: + futures = [] + completion_id = Counter() + n_samples = 0 + eval_results = defaultdict(list) # task_id -> + remainings = set() - print("Reading samples...") - samples = list(load_solutions(flags.samples)) - - def process_sample(sample): - task_id = sample["task_id"] - - if task_id not in problems: - warn(f"Task {task_id} is found in the samples but not found in the dataset") - return None + print("Reading samples...") + for sample in tqdm(load_solutions(flags.samples)): + task_id = sample["task_id"] + + if task_id not in problems: + warn( + f"Task {task_id} is found in the samples but not found in the dataset" + ) + continue + + if flags.subset == "tool": + solution = 
(sample["solution"] + if "solution" in sample + else problems[task_id]["complete_prompt"] + sample["completion"] + ) + try: + used_tools = extract_defined_modules(problems[task_id][f"{flags.split}_tool_implementation"] + "\n" + solution, problems[task_id]["entry_point"]) + except Exception as e: + used_tools = [] + solution = problems[task_id]["positive_tool_implementation"] + solution + if "sanitized-calibrated" in flags.samples: + solution = problems[task_id]["complete_prompt"] + "\n pass\n" + solution + solution = problems[task_id][f"positive_tool_implementation"] + solution + + else: + solution = ( + sample["solution"] + if "solution" in sample + else problems[task_id]["complete_prompt"] + sample["completion"] + ) + if "sanitized-calibrated" in flags.samples: + solution = problems[task_id]["code_prompt"] + "\n pass\n" + solution - used_modules = [] - if flags.subset == "tool": - solution = (sample["solution"] - if "solution" in sample - else problems[task_id]["complete_prompt"] + sample["completion"] - ) - solution = problems[task_id]["positive_tool_implementation"] + solution - if "sanitized-calibrated" in flags.samples: - solution = problems[task_id]["complete_prompt"] + "\n pass\n" + solution - solution = problems[task_id][f"positive_tool_implementation"] + solution - used_modules = extract_defined_modules(problems[task_id][f"{flags.split}_tool_implementation"] + "\n" + solution, problems[task_id]["entry_point"]) - else: - solution = ( - sample["solution"] - if "solution" in sample - else problems[task_id]["complete_prompt"] + sample["completion"] + # Skip execution for empty solutions + if not solution: + eval_results[task_id].append({ + "completion_id": completion_id[task_id], + "task_id": task_id, + "_identifier": sample["_identifier"], + "solution": solution, + "base": (FAIL, "Empty solution") + }) + completion_id[task_id] += 1 + n_samples += 1 + continue + + remainings.add(sample["_identifier"]) + args = ( + flags.subset, + flags.split, + completion_id[task_id], + problems[task_id], + solution, + flags.max_as_limit, + flags.max_data_limit, + flags.max_stack_limit, + sample["_identifier"], + flags.min_time_limit, + expected_time[task_id] if expected_time[task_id] else 20, + used_tools ) - if "sanitized-calibrated" in flags.samples: - solution = problems[task_id]["code_prompt"] + "\n pass\n" + solution - - # Skip execution for empty solutions - if not solution.strip(): - return { - "completion_id": completion_id[task_id], - "task_id": task_id, - "_identifier": sample["_identifier"], - "solution": solution, - "base": (FAIL, "Empty solution") - } - - args = ( - completion_id[task_id], - problems[task_id], - solution, - flags.max_as_limit, - flags.max_data_limit, - flags.max_stack_limit, - sample["_identifier"], - flags.min_time_limit, - expected_time[task_id] if expected_time[task_id] else 20, - used_modules, - ) - return check_correctness(*args) - - results_list = pqdm(samples, process_sample, n_jobs=n_workers, desc="Processing samples") - - for result in results_list: - if result is not None: - eval_results[result["task_id"]].append(result) - completion_id[result["task_id"]] += 1 + futures.append(executor.submit(check_correctness, *args)) + completion_id[task_id] += 1 n_samples += 1 - assert n_samples == sum(len(r) for r in eval_results.values()), "Missing problems in unfinished" - assert len(completion_id) == len(problems), "Missing problems in samples" + assert n_samples == len(remainings) + sum(len(r) for r in eval_results.values()), "Missing problems in unfinished" + assert 
len(completion_id) == len(problems), "Missing problems in samples" + + def stucking_checker(): + while remainings: + last_size = len(remainings) + time.sleep(240) + if last_size != len(remainings) or len(remainings) == 0: + continue + # Potential stucking + warn("No samples had finished testing in the last 240s") + warn(f"{len(remainings)} samples to be tested: {remainings}") + + threading.Thread(target=stucking_checker).start() + + for future in tqdm(as_completed(futures), total=len(futures)): + result = future.result() + remainings.remove(result["_identifier"]) + eval_results[result["task_id"]].append(result) # sort the results for each problem by completion_id for task_id, task_results in eval_results.items(): @@ -247,29 +275,53 @@ def process_sample(sample): results["eval"][task_id] = [] for res in task_results: stat, details = res["base"] - tool_use = res["used_modules"] - results["eval"][task_id].append( - { - "task_id": task_id, - "solution": res["solution"], - "status": stat, - "details": details, - "tool_use": tool_use, - } - ) + if flags.subset == "tool": + tool_use = res["used_tools"] + results["eval"][task_id].append( + { + "task_id": task_id, + "solution": res["solution"], + "status": stat, + "details": details, + "tool_use": tool_use, + } + ) + else: + results["eval"][task_id].append( + { + "task_id": task_id, + "solution": res["solution"], + "status": stat, + "details": details, + } + ) # Calculate pass@k. total = np.array([len(r) for k, r in results["eval"].items() if k in problems]) base_correct = [] - + tool_correct = [] + syntax_correct = [] for key, res in results["eval"].items(): if key not in problems: continue bc = sum([r["status"] == PASS for r in res]) - base_correct.append(bc) - + base_correct.append(bc) + + for key, res in results["eval"].items(): + if key not in problems: + continue + tc = sum([r["tool_use"] for r in res]) + tool_correct.append(tc) + + for key, res in results["eval"].items(): + if key not in problems: + continue + empty_solutions = sum([r["solution"] for r in res]) + syntax_correct.append(empty_solutions) + base_correct = np.array(base_correct) - tool_correct = np.array([r["tool_use"] for r in results["eval"].values()]) + tool_correct = np.array(tool_correct) + syntax_correct = np.array(syntax_correct) pass_at_k = { f"pass@{k}": estimate_pass_at_k(total, base_correct, k).mean() @@ -278,9 +330,12 @@ def process_sample(sample): } if flags.subset == "tool": - pass_at_k.update({f"pass_tool@{k}": estimate_pass_at_k(total, tool_correct, k).mean() + pass_at_k.update({f"tool@{k}": estimate_pass_at_k(total, tool_correct, k).mean() for k in [1, 5, 10, 25, 100] if total.min() >= k}) + pass_at_k.update({f"syntax@{k}": estimate_pass_at_k(total, syntax_correct, k).mean() + for k in [1, 5, 10, 25, 100] + if total.min() >= k}) mode = "-calibrated" if "sanitized-calibrated" in flags.samples else "" extra = flags.subset.capitalize() @@ -291,20 +346,15 @@ def process_sample(sample): cprint(f"Groundtruth is not checked", "yellow") else: if gt_pass_rate > 0.99: - cprint(f"Groundtruth pass rate: {gt_pass_rate:.3f}", "green") + cprint(f"Groundtruth pass rate: {gt_pass_rate*100:.2f}%", "green") else: - cprint(f"Groundtruth pass rate: {gt_pass_rate:.3f}\nPlease be cautious!", "red") + cprint(f"Groundtruth pass rate: {gt_pass_rate*100:.2f}%\nPlease be cautious!", "red") if len(failed_tasks) > 0: cprint(f"Failed tasks: {failed_tasks}", "red") for k, v in pass_at_k.items(): - cprint(f"{k}:\t{v:.3f}", "green") - - if flags.subset == "tool": - for k, v in pass_at_k.items(): 
- if k.startswith("pass_tool@"): - cprint(f"{k}:\t{v:.3f}", "green") + cprint(f"{k}:\t{v*100:.2f}%", "green") # save results if os.path.isfile(result_path): From 2a5c2bb8b44ef8a7ffa4b44d6142369b5a69f052 Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Thu, 5 Sep 2024 18:08:46 +1000 Subject: [PATCH 26/27] fix eval --- bigcodebench/evaluate.py | 43 ++++++++++++++++++++++------------------ 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/bigcodebench/evaluate.py b/bigcodebench/evaluate.py index 48681deb..6adcc36c 100644 --- a/bigcodebench/evaluate.py +++ b/bigcodebench/evaluate.py @@ -187,6 +187,7 @@ def evaluate(flags): print("Reading samples...") for sample in tqdm(load_solutions(flags.samples)): task_id = sample["task_id"] + used_tools = [] if task_id not in problems: warn( @@ -202,7 +203,7 @@ def evaluate(flags): try: used_tools = extract_defined_modules(problems[task_id][f"{flags.split}_tool_implementation"] + "\n" + solution, problems[task_id]["entry_point"]) except Exception as e: - used_tools = [] + pass solution = problems[task_id]["positive_tool_implementation"] + solution if "sanitized-calibrated" in flags.samples: solution = problems[task_id]["complete_prompt"] + "\n pass\n" + solution @@ -299,29 +300,14 @@ def stucking_checker(): # Calculate pass@k. total = np.array([len(r) for k, r in results["eval"].items() if k in problems]) base_correct = [] - tool_correct = [] - syntax_correct = [] + for key, res in results["eval"].items(): if key not in problems: continue bc = sum([r["status"] == PASS for r in res]) base_correct.append(bc) - for key, res in results["eval"].items(): - if key not in problems: - continue - tc = sum([r["tool_use"] for r in res]) - tool_correct.append(tc) - - for key, res in results["eval"].items(): - if key not in problems: - continue - empty_solutions = sum([r["solution"] for r in res]) - syntax_correct.append(empty_solutions) - base_correct = np.array(base_correct) - tool_correct = np.array(tool_correct) - syntax_correct = np.array(syntax_correct) pass_at_k = { f"pass@{k}": estimate_pass_at_k(total, base_correct, k).mean() @@ -330,9 +316,28 @@ def stucking_checker(): } if flags.subset == "tool": + tool_correct = [] + syntax_correct = [] + + for key, res in results["eval"].items(): + if key not in problems: + continue + tc = sum([r["tool_use"] for r in res]) + tool_correct.append(tc) + + for key, res in results["eval"].items(): + if key not in problems: + continue + empty_solutions = sum([r["solution"] for r in res]) + syntax_correct.append(empty_solutions) + + tool_correct = np.array(tool_correct) + syntax_correct = np.array(syntax_correct) + pass_at_k.update({f"tool@{k}": estimate_pass_at_k(total, tool_correct, k).mean() for k in [1, 5, 10, 25, 100] if total.min() >= k}) + pass_at_k.update({f"syntax@{k}": estimate_pass_at_k(total, syntax_correct, k).mean() for k in [1, 5, 10, 25, 100] if total.min() >= k}) @@ -346,9 +351,9 @@ def stucking_checker(): cprint(f"Groundtruth is not checked", "yellow") else: if gt_pass_rate > 0.99: - cprint(f"Groundtruth pass rate: {gt_pass_rate*100:.2f}%", "green") + cprint(f"Groundtruth pass rate: {gt_pass_rate*100:.1f}%", "green") else: - cprint(f"Groundtruth pass rate: {gt_pass_rate*100:.2f}%\nPlease be cautious!", "red") + cprint(f"Groundtruth pass rate: {gt_pass_rate*100:.1f}%\nPlease be cautious!", "red") if len(failed_tasks) > 0: cprint(f"Failed tasks: {failed_tasks}", "red") From 43207e9cd37c1b4a777e82c30eaf6879fd4ed89e Mon Sep 17 00:00:00 2001 From: Terry Zhuo Date: Wed, 18 Sep 2024 04:24:33 +1000 Subject: 
[PATCH 27/27] update --- bigcodebench/evaluate.py | 87 ++++++++++++++++--------------- bigcodebench/gen/util/__init__.py | 1 + bigcodebench/sanitize.py | 11 +++- 3 files changed, 55 insertions(+), 44 deletions(-) diff --git a/bigcodebench/evaluate.py b/bigcodebench/evaluate.py index 6adcc36c..464af830 100644 --- a/bigcodebench/evaluate.py +++ b/bigcodebench/evaluate.py @@ -23,6 +23,7 @@ ) from bigcodebench.data.utils import CACHE_DIR from bigcodebench.eval import ( + FAIL, PASS, compatible_eval_result, estimate_pass_at_k, @@ -57,7 +58,7 @@ def get_groundtruth(subset, n_workers, problems, hashcode, check_gt_only, max_as for problem in problems.values(): if subset == "tool": - code = problem["canonical_solution"] + code = problem["positive_tool_implementation"] + "\n" + problem["code_before_entry_point"] + "\n" + problem["canonical_solution"] else: code = problem["code_prompt"] + "\n" + problem["canonical_solution"] args = ( @@ -119,8 +120,9 @@ def check_correctness( if split in ["positive", "mixed"]: ret["used_tools"] = (set(used_tools) == set(problem["used_tools"])) else: - assert split == "negative" - ret["used_tools"] = (set(used_tools) == {"refusal_func"}) + ret["used_tools"] = False + else: + ret["used_tools"] = False return ret @@ -195,6 +197,30 @@ def evaluate(flags): ) continue + # Skip execution for empty solutions + if not sample["solution"]: + + if flags.subset == "tool" and flags.split == "negative": + eval_results[task_id].append({ + "completion_id": completion_id[task_id], + "task_id": task_id, + "_identifier": sample["_identifier"], + "solution": sample["solution"], + "used_tools": False, + "base": (FAIL, "No refusal") if "There is no solution can be found based on the provided helper functions." not in sample["solution"] else (PASS, "") + }) + else: + eval_results[task_id].append({ + "completion_id": completion_id[task_id], + "task_id": task_id, + "_identifier": sample["_identifier"], + "solution": sample["solution"], + "used_tools": False, + "base": (FAIL, "Empty solution") + }) + completion_id[task_id] += 1 + n_samples += 1 + if flags.subset == "tool": solution = (sample["solution"] if "solution" in sample @@ -217,19 +243,6 @@ def evaluate(flags): ) if "sanitized-calibrated" in flags.samples: solution = problems[task_id]["code_prompt"] + "\n pass\n" + solution - - # Skip execution for empty solutions - if not solution: - eval_results[task_id].append({ - "completion_id": completion_id[task_id], - "task_id": task_id, - "_identifier": sample["_identifier"], - "solution": solution, - "base": (FAIL, "Empty solution") - }) - completion_id[task_id] += 1 - n_samples += 1 - continue remainings.add(sample["_identifier"]) args = ( @@ -276,31 +289,20 @@ def stucking_checker(): results["eval"][task_id] = [] for res in task_results: stat, details = res["base"] - if flags.subset == "tool": - tool_use = res["used_tools"] - results["eval"][task_id].append( - { - "task_id": task_id, - "solution": res["solution"], - "status": stat, - "details": details, - "tool_use": tool_use, - } - ) - else: - results["eval"][task_id].append( - { - "task_id": task_id, - "solution": res["solution"], - "status": stat, - "details": details, - } - ) + tool_use = res["used_tools"] + results["eval"][task_id].append( + { + "task_id": task_id, + "solution": res["solution"], + "status": stat, + "details": details, + "tool_use": tool_use, + } + ) # Calculate pass@k. 
total = np.array([len(r) for k, r in results["eval"].items() if k in problems]) base_correct = [] - for key, res in results["eval"].items(): if key not in problems: continue @@ -315,7 +317,7 @@ def stucking_checker(): if total.min() >= k } - if flags.subset == "tool": + if flags.subset == "tool" and flags.split != "negative": tool_correct = [] syntax_correct = [] @@ -325,11 +327,10 @@ def stucking_checker(): tc = sum([r["tool_use"] for r in res]) tool_correct.append(tc) - for key, res in results["eval"].items(): - if key not in problems: + for sample in load_solutions(flags.samples): + if sample["task_id"] not in problems: continue - empty_solutions = sum([r["solution"] for r in res]) - syntax_correct.append(empty_solutions) + syntax_correct.append(sample["solution"] != "") tool_correct = np.array(tool_correct) syntax_correct = np.array(syntax_correct) @@ -359,7 +360,7 @@ def stucking_checker(): cprint(f"Failed tasks: {failed_tasks}", "red") for k, v in pass_at_k.items(): - cprint(f"{k}:\t{v*100:.2f}%", "green") + cprint(f"{k}: {v*100:.2f}%", "green") # save results if os.path.isfile(result_path): diff --git a/bigcodebench/gen/util/__init__.py b/bigcodebench/gen/util/__init__.py index d8088ad5..306431a7 100644 --- a/bigcodebench/gen/util/__init__.py +++ b/bigcodebench/gen/util/__init__.py @@ -57,6 +57,7 @@ def trusted_exec(code, test_code, task_id, max_as_limit, max_data_limit, max_sta errors = test_result.failures + test_result.errors if len(errors) > 0: + print(task_id) print(errors) times.value = -1 else: diff --git a/bigcodebench/sanitize.py b/bigcodebench/sanitize.py index ec7f908e..29d092cb 100644 --- a/bigcodebench/sanitize.py +++ b/bigcodebench/sanitize.py @@ -1,6 +1,7 @@ """Post-processing LLM-generated Python code implemented using tree-sitter.""" import os +import re import pathlib from typing import Dict, Generator, List, Optional, Set, Tuple from pqdm.processes import pqdm @@ -168,7 +169,15 @@ def sanitize(code: str, solution: Dict, entrypoint: Optional[str] = None) -> str name, node = pair if entrypoint and not (name in reachable): continue - sanitized_output += code_bytes[node.start_byte : node.end_byte] + b"\n" + node_code = code_bytes[node.start_byte : node.end_byte].decode("utf8") + if node.type == FUNCTION_TYPE and name == entrypoint: + # Remove return type annotation, including unnecessary spaces + node_code = re.sub(r"->\s*[^:]+:", ":", node_code) + # Ensure there is exactly one space before the colon + node_code = re.sub(r'\s*\)(\s*):', ') :', node_code) + node_code = re.sub(r"\s*:", ":", node_code) + node_code = re.sub(r":", " :", node_code) + sanitized_output += node_code.encode("utf8") + b"\n" sanitized_output = sanitized_output[:-1].decode("utf8")
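
The tool@k metric that these patches add hinges on extract_defined_modules: the evaluator parses the tool implementation concatenated with the generated solution using ast, collects which locally defined helpers (and methods on classes instantiated inside the entry-point function) the entry point actually calls, and compares that set for equality against the problem's used_tools field. Below is a minimal sketch of the same idea, simplified to direct calls on top-level helpers only; the helper names and the called_helpers wrapper are illustrative, not part of the repository.

import ast

def called_helpers(code: str, entry_point: str) -> set:
    """Names of top-level functions defined in `code` that `entry_point` calls directly.
    Simplified sketch: unlike the patched extract_defined_modules, it ignores
    methods on classes instantiated inside the entry point."""
    tree = ast.parse(code)
    defined = {node.name for node in tree.body if isinstance(node, ast.FunctionDef)}
    used = set()
    for node in tree.body:
        if isinstance(node, ast.FunctionDef) and node.name == entry_point:
            for sub in ast.walk(node):
                if isinstance(sub, ast.Call) and isinstance(sub.func, ast.Name):
                    used.add(sub.func.id)
    return used & defined

# Hypothetical tool implementation followed by a generated solution.
sample = """
def lookup_user(uid):
    return {"id": uid}

def send_email(addr, body):
    pass

def task_func(uid):
    user = lookup_user(uid)
    return user["id"]
"""
print(called_helpers(sample, "task_func"))  # {'lookup_user'} -> compared against the problem's used_tools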
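
pass@k, tool@k, and syntax@k are all reported through estimate_pass_at_k over per-task sample counts. Assuming it follows the standard unbiased estimator, a task with n samples of which c succeed contributes 1 - C(n-c, k)/C(n, k), and the reported number is the mean over tasks. The small worked example below reimplements that estimator purely for illustration; the repository's own estimate_pass_at_k remains the source of truth.

from math import comb

def unbiased_pass_at_k(n: int, c: int, k: int) -> float:
    # 1 - C(n-c, k) / C(n, k): chance that at least one of k samples drawn
    # from n total (c of them correct) is correct.
    if n - c < k:
        return 1.0
    return 1.0 - comb(n - c, k) / comb(n, k)

# Two hypothetical tasks, 10 samples each; 3 and 0 of those samples both pass
# the tests and call exactly the expected tools.
totals = [10, 10]
tool_correct = [3, 0]
tool_at_1 = sum(unbiased_pass_at_k(n, c, 1) for n, c in zip(totals, tool_correct)) / len(totals)
print(f"tool@1: {tool_at_1 * 100:.2f}%")  # tool@1: 15.00%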