diff --git a/bigcodebench/data/utils.py b/bigcodebench/data/utils.py index fa91abe5..01bc0a2c 100644 --- a/bigcodebench/data/utils.py +++ b/bigcodebench/data/utils.py @@ -149,15 +149,30 @@ def write_directory(directory: PathLike, data: Iterable[Dict]): def completeness_check(name, data): for task_id, task in data.items(): - for key in [ - "complete_prompt", - "instruct_prompt", - "canonical_solution", - "code_prompt", - "test", - "entry_point" - ]: - assert key in task, f"{key} not found in {name} #{task_id}!" + try: + for key in [ + "complete_prompt", + "instruct_prompt", + "canonical_solution", + "code_prompt", + "test", + "entry_point" + ]: + assert key in task, f"{key} not found in {name} #{task_id}!" + except Exception as e: + for key in [ + "complete_prompt", + "positive_tool", + "negative_tool", + "mixed_tool", + "positive_tool_implementation", + "negative_tool_implementation", + "mixed_tool_implementation", + "canonical_solution", + "test", + "entry_point" + ]: + assert key in task, f"{key} not found in {name} #{task_id}!" def to_raw(string): diff --git a/bigcodebench/eval/__init__.py b/bigcodebench/eval/__init__.py index 3596f53d..66f3aabf 100644 --- a/bigcodebench/eval/__init__.py +++ b/bigcodebench/eval/__init__.py @@ -24,6 +24,7 @@ import multiprocessing import os import sys +import ast import time import types import unittest @@ -240,3 +241,80 @@ def evaluate_files( ) ret.append((stat, det.tolist())) return ret + + +def extract_defined_modules(code: str, entry_point: str): + tree = ast.parse(code) + defined_functions = set() + defined_methods = {} + used_functions = set() + used_methods = set() + variable_classes = {} + + class FunctionDefVisitor(ast.NodeVisitor): + def visit_FunctionDef(self, node): + defined_functions.add(node.name) + self.generic_visit(node) + + def visit_ClassDef(self, node): + for item in node.body: + if isinstance(item, ast.FunctionDef): + if node.name not in defined_methods: + defined_methods[node.name] = set() + defined_methods[node.name].add(item.name) + self.generic_visit(node) + + class TaskFuncVisitor(ast.NodeVisitor): + def visit_Assign(self, node): + if isinstance(node.value, ast.Call) and isinstance(node.value.func, ast.Name): + class_name = node.value.func.id + for target in node.targets: + if isinstance(target, ast.Name): + variable_classes[target.id] = class_name + self.generic_visit(node) + + def visit_Call(self, node): + if isinstance(node.func, ast.Name): + used_functions.add(node.func.id) + elif isinstance(node.func, ast.Attribute): + value = node.func.value + if isinstance(value, ast.Name): + var_name = value.id + if var_name in variable_classes: + used_methods.add(f"{variable_classes[var_name]}.{node.func.attr}") + else: + used_methods.add(f"{var_name}.{node.func.attr}") + elif isinstance(value, ast.Attribute): + # Handle nested attributes (e.g., obj.attr.method()) + attr_chain = [node.func.attr] + while isinstance(value, ast.Attribute): + attr_chain.append(value.attr) + value = value.value + if isinstance(value, ast.Name): + var_name = value.id + if var_name in variable_classes: + attr_chain.append(variable_classes[var_name]) + else: + attr_chain.append(var_name) + used_methods.add('.'.join(reversed(attr_chain))) + self.generic_visit(node) + + # First pass: collect all defined functions and methods + FunctionDefVisitor().visit(tree) + + # Second pass: collect used functions and methods within task_func + for node in ast.iter_child_nodes(tree): + if isinstance(node, ast.FunctionDef) and node.name == entry_point: + 
TaskFuncVisitor().visit(node) + break # Assuming there's only one task_func + + # Filter used functions to include only those defined before task_func + result = [func for func in used_functions if func in defined_functions] + + # Filter used methods to include only those defined before task_func + for class_name, methods in defined_methods.items(): + for method in methods: + if any(f"{class_name}.{method}" in used_method for used_method in used_methods): + result.append(f"{class_name}.{method}") + + return result diff --git a/bigcodebench/evaluate.py b/bigcodebench/evaluate.py index 61e2a43f..464af830 100644 --- a/bigcodebench/evaluate.py +++ b/bigcodebench/evaluate.py @@ -5,6 +5,7 @@ import pickle import threading import time +from pqdm.processes import pqdm from collections import Counter, defaultdict from concurrent.futures import ProcessPoolExecutor, as_completed from datetime import datetime @@ -22,10 +23,12 @@ ) from bigcodebench.data.utils import CACHE_DIR from bigcodebench.eval import ( + FAIL, PASS, compatible_eval_result, estimate_pass_at_k, untrusted_check, + extract_defined_modules, ) from bigcodebench.gen.util import trusted_check @@ -34,7 +37,7 @@ Result = Tuple[str, List[bool]] -def get_groundtruth(n_workers, problems, hashcode, check_gt_only, max_as_limit, max_data_limit, max_stack_limit, min_time_limit): +def get_groundtruth(subset, n_workers, problems, hashcode, check_gt_only, max_as_limit, max_data_limit, max_stack_limit, min_time_limit): cache_file = os.path.join(CACHE_DIR, f"{hashcode}.pkl") if os.path.exists(cache_file): if check_gt_only: @@ -54,8 +57,12 @@ def get_groundtruth(n_workers, problems, hashcode, check_gt_only, max_as_limit, expected_time = dict() for problem in problems.values(): + if subset == "tool": + code = problem["positive_tool_implementation"] + "\n" + problem["code_before_entry_point"] + "\n" + problem["canonical_solution"] + else: + code = problem["code_prompt"] + "\n" + problem["canonical_solution"] args = ( - problem["complete_prompt"] + "\n" + problem["canonical_solution"], + code, problem["test"], problem["task_id"], max_as_limit, @@ -80,6 +87,8 @@ def get_groundtruth(n_workers, problems, hashcode, check_gt_only, max_as_limit, return expected_time def check_correctness( + subset: str, + split: str, completion_id: int, problem: Dict[str, Any], solution: str, @@ -89,6 +98,7 @@ def check_correctness( identifier=None, min_time_limit: float = 0.1, gt_time_limit: float = 2.0, + used_tools: List[str] = [], ) -> Dict[str, Result]: # {...}, "base" | "plus" -> (status, details) ret = { "completion_id": completion_id, @@ -106,6 +116,14 @@ def check_correctness( min_time_limit, gt_time_limit, ) + if subset == "tool": + if split in ["positive", "mixed"]: + ret["used_tools"] = (set(used_tools) == set(problem["used_tools"])) + else: + ret["used_tools"] = False + else: + ret["used_tools"] = False + return ret @@ -130,7 +148,7 @@ def evaluate(flags): dataset_hash = get_bigcodebench_hash(subset=flags.subset) if not flags.no_gt: - expected_time = get_groundtruth(n_workers, problems, dataset_hash, flags.check_gt_only, flags.max_as_limit, flags.max_data_limit, flags.max_stack_limit, flags.min_time_limit) + expected_time = get_groundtruth(flags.subset, n_workers, problems, dataset_hash, flags.check_gt_only, flags.max_as_limit, flags.max_data_limit, flags.max_stack_limit, flags.min_time_limit) else: expected_time = {task_id: None for task_id in problems} @@ -171,21 +189,65 @@ def evaluate(flags): print("Reading samples...") for sample in 
tqdm(load_solutions(flags.samples)): task_id = sample["task_id"] + used_tools = [] if task_id not in problems: warn( f"Task {task_id} is found in the samples but not found in the dataset" ) continue - solution = ( - sample["solution"] + + # Skip execution for empty solutions + if not sample["solution"]: + + if flags.subset == "tool" and flags.split == "negative": + eval_results[task_id].append({ + "completion_id": completion_id[task_id], + "task_id": task_id, + "_identifier": sample["_identifier"], + "solution": sample["solution"], + "used_tools": False, + "base": (FAIL, "No refusal") if "There is no solution can be found based on the provided helper functions." not in sample["solution"] else (PASS, "") + }) + else: + eval_results[task_id].append({ + "completion_id": completion_id[task_id], + "task_id": task_id, + "_identifier": sample["_identifier"], + "solution": sample["solution"], + "used_tools": False, + "base": (FAIL, "Empty solution") + }) + completion_id[task_id] += 1 + n_samples += 1 + + if flags.subset == "tool": + solution = (sample["solution"] + if "solution" in sample + else problems[task_id]["complete_prompt"] + sample["completion"] + ) + try: + used_tools = extract_defined_modules(problems[task_id][f"{flags.split}_tool_implementation"] + "\n" + solution, problems[task_id]["entry_point"]) + except Exception as e: + pass + solution = problems[task_id]["positive_tool_implementation"] + solution + if "sanitized-calibrated" in flags.samples: + solution = problems[task_id]["complete_prompt"] + "\n pass\n" + solution + solution = problems[task_id][f"positive_tool_implementation"] + solution + + else: + solution = ( + sample["solution"] if "solution" in sample - else problems[task_id]["complete_prompt"] + sample["completion"] - ) - if "sanitized-calibrated" in flags.samples: - solution = problems[task_id]["code_prompt"] + "\n pass\n" + solution + else problems[task_id]["complete_prompt"] + sample["completion"] + ) + if "sanitized-calibrated" in flags.samples: + solution = problems[task_id]["code_prompt"] + "\n pass\n" + solution + remainings.add(sample["_identifier"]) args = ( + flags.subset, + flags.split, completion_id[task_id], problems[task_id], solution, @@ -194,15 +256,16 @@ def evaluate(flags): flags.max_stack_limit, sample["_identifier"], flags.min_time_limit, - expected_time[task_id] if expected_time[task_id] else 20 + expected_time[task_id] if expected_time[task_id] else 20, + used_tools ) futures.append(executor.submit(check_correctness, *args)) completion_id[task_id] += 1 n_samples += 1 - assert n_samples == len(remainings), "Missing problems in unfinished" + assert n_samples == len(remainings) + sum(len(r) for r in eval_results.values()), "Missing problems in unfinished" assert len(completion_id) == len(problems), "Missing problems in samples" - + def stucking_checker(): while remainings: last_size = len(remainings) @@ -215,7 +278,7 @@ def stucking_checker(): threading.Thread(target=stucking_checker).start() - for future in tqdm(as_completed(futures), total=n_samples): + for future in tqdm(as_completed(futures), total=len(futures)): result = future.result() remainings.remove(result["_identifier"]) eval_results[result["task_id"]].append(result) @@ -226,51 +289,78 @@ def stucking_checker(): results["eval"][task_id] = [] for res in task_results: stat, details = res["base"] + tool_use = res["used_tools"] results["eval"][task_id].append( { "task_id": task_id, "solution": res["solution"], "status": stat, "details": details, + "tool_use": tool_use, } ) # Calculate pass@k. 
total = np.array([len(r) for k, r in results["eval"].items() if k in problems]) base_correct = [] - for key, res in results["eval"].items(): if key not in problems: continue bc = sum([r["status"] == PASS for r in res]) - base_correct.append(bc) - + base_correct.append(bc) + base_correct = np.array(base_correct) - + pass_at_k = { f"pass@{k}": estimate_pass_at_k(total, base_correct, k).mean() for k in [1, 5, 10, 25, 100] if total.min() >= k } + if flags.subset == "tool" and flags.split != "negative": + tool_correct = [] + syntax_correct = [] + + for key, res in results["eval"].items(): + if key not in problems: + continue + tc = sum([r["tool_use"] for r in res]) + tool_correct.append(tc) + + for sample in load_solutions(flags.samples): + if sample["task_id"] not in problems: + continue + syntax_correct.append(sample["solution"] != "") + + tool_correct = np.array(tool_correct) + syntax_correct = np.array(syntax_correct) + + pass_at_k.update({f"tool@{k}": estimate_pass_at_k(total, tool_correct, k).mean() + for k in [1, 5, 10, 25, 100] + if total.min() >= k}) + + pass_at_k.update({f"syntax@{k}": estimate_pass_at_k(total, syntax_correct, k).mean() + for k in [1, 5, 10, 25, 100] + if total.min() >= k}) + mode = "-calibrated" if "sanitized-calibrated" in flags.samples else "" extra = flags.subset.capitalize() flags.split = flags.split.capitalize() - cprint(f"BigCodeBench-{flags.split}{mode} ({extra})", "green") + cprint(f"BigCodeBench-{extra}{mode} ({flags.split})", "green") if flags.no_gt: cprint(f"Groundtruth is not checked", "yellow") else: if gt_pass_rate > 0.99: - cprint(f"Groundtruth pass rate: {gt_pass_rate:.3f}", "green") + cprint(f"Groundtruth pass rate: {gt_pass_rate*100:.1f}%", "green") else: - cprint(f"Groundtruth pass rate: {gt_pass_rate:.3f}\nPlease be cautious!", "red") + cprint(f"Groundtruth pass rate: {gt_pass_rate*100:.1f}%\nPlease be cautious!", "red") if len(failed_tasks) > 0: cprint(f"Failed tasks: {failed_tasks}", "red") for k, v in pass_at_k.items(): - cprint(f"{k}:\t{v:.3f}", "green") + cprint(f"{k}: {v*100:.2f}%", "green") # save results if os.path.isfile(result_path): @@ -323,9 +413,9 @@ def save_pass_at_k(): def main(): parser = argparse.ArgumentParser() parser.add_argument( - "--split", required=True, type=str, choices=["complete", "instruct"] + "--split", required=True, type=str, choices=["complete", "instruct", "positive", "negative", "mixed"] ) - parser.add_argument("--subset", default="hard", type=str, choices=["full", "hard"]) + parser.add_argument("--subset", default="hard", type=str, choices=["full", "hard", "tool"]) parser.add_argument("--samples", required=True, type=str) parser.add_argument("--save_pass_rate", action="store_true") parser.add_argument("--parallel", default=None, type=int) diff --git a/bigcodebench/gen/util/__init__.py b/bigcodebench/gen/util/__init__.py index d8088ad5..306431a7 100644 --- a/bigcodebench/gen/util/__init__.py +++ b/bigcodebench/gen/util/__init__.py @@ -57,6 +57,7 @@ def trusted_exec(code, test_code, task_id, max_as_limit, max_data_limit, max_sta errors = test_result.failures + test_result.errors if len(errors) > 0: + print(task_id) print(errors) times.value = -1 else: diff --git a/bigcodebench/generate.py b/bigcodebench/generate.py index 679300cb..729b93ed 100644 --- a/bigcodebench/generate.py +++ b/bigcodebench/generate.py @@ -37,6 +37,8 @@ def codegen( if model.is_direct_completion() and split == "instruct": raise Exception("Base model does not support direct completion for instruct tasks") + if subset == "tool": + assert split 
in ["positive", "negative", "mixed"], "Tool subset only supports positive, negative, and mixed split" # create save_path if it doesn't exist, e.g., a/b.jsonl dirname = os.path.dirname(save_path) if not os.path.exists(dirname) and dirname != "": @@ -70,9 +72,12 @@ def codegen( sidx = n_samples - nsamples while sidx < n_samples: try: - prompt = task[f"{split}_prompt"] + if subset == "tool": + prompt = task[f"{split}_tool"] + "\n\n" + task["complete_prompt"] + else: + prompt = task[f"{split}_prompt"] except: - raise Exception(f"Invalid split {split}") + raise Exception(f"Invalid split {split} for bigcodebench-{subset}") if strip_newlines: prompt = prompt.strip("\n") outputs = model.codegen( @@ -105,8 +110,8 @@ def codegen( def main(): parser = argparse.ArgumentParser() parser.add_argument("--model", required=True, type=str) - parser.add_argument("--split", required=True, type=str, choices=["complete", "instruct"]) - parser.add_argument("--subset", default="full", type=str, choices=["full", "hard"]) + parser.add_argument("--split", required=True, type=str, choices=["complete", "instruct", "positive", "negative", "mixed"]) + parser.add_argument("--subset", default="full", type=str, choices=["full", "hard", "tool"]) parser.add_argument("--save_path", default=None, type=str) parser.add_argument("--bs", default=1, type=int) parser.add_argument("--n_samples", default=1, type=int) @@ -140,6 +145,8 @@ def main(): model_runner = make_model( model=args.model, backend=args.backend, + subset=args.subset, + split=args.split, batch_size=args.bs, temperature=args.temperature, base_url=args.base_url, diff --git a/bigcodebench/model.py b/bigcodebench/model.py index c5093b0f..9830055e 100644 --- a/bigcodebench/model.py +++ b/bigcodebench/model.py @@ -55,17 +55,28 @@ def extra_eos_for_direct_completion(dataset) -> List[str]: _MAGIC_SPLITTER_ = "-[[]]-this-is-really-our-highest-priority-[[]]-" -def make_chat_prompt(prompt: str, tokenizer: AutoTokenizer) -> str: +def make_chat_prompt(prompt: str, subset: str, split: str, tokenizer: AutoTokenizer) -> str: # directly return prompt if it does not have a tokenizer.chat_template if tokenizer.chat_template is None: return prompt - prompt = f"""\ + if subset == "tool": + prompt = f"""\ +You should only utilize the helper modules given in the markdown code block to provide a self-contained Python script that implements `task_func`. Assume all listed helper modules work as expected on the backend but are not visible for their implementation details. Call `refusal_func` if you cannot find proper helper modules in the code block to solve the task. 
+ +Please complete `task_func` in a markdown code block without using any external library APIs or defining any modules: +``` +{prompt.strip()} +``` +""" + else: + prompt = f"""\ Please provide a self-contained Python script that solves the following problem in a markdown code block: ``` {prompt.strip()} ``` """ + response = f"""\ Below is a Python script with a self-contained function that solves the problem and passes corresponding tests: ```python @@ -86,6 +97,8 @@ class DecoderBase(ABC): def __init__( self, name: str, + subset: str, + split: str, batch_size: int = 1, temperature: float = 0.8, max_new_tokens: int = 1280, @@ -96,6 +109,8 @@ def __init__( ) -> None: print("Initializing a decoder model: {} ...".format(name)) self.name = name + self.subset = subset + self.split = split self.batch_size = batch_size self.temperature = temperature self.eos = EOS @@ -175,7 +190,7 @@ def __init__(self, name: str, **kwargs) -> None: def codegen( self, prompt: str, do_sample: bool = True, num_samples: int = 200 ) -> List[str]: - prompt = make_chat_prompt(prompt, self.tokenizer) + prompt = make_chat_prompt(prompt, self.subset, self.split, self.tokenizer) return VllmDecoder.codegen(self, prompt, do_sample, num_samples) @@ -259,7 +274,7 @@ def __init__(self, name: str, **kwargs): def codegen( self, prompt: str, do_sample: bool = True, num_samples: int = 200 ) -> List[str]: - prompt = make_chat_prompt(prompt, self.tokenizer) + prompt = make_chat_prompt(prompt, self.subset, self.split, self.tokenizer) return HfTorchDecoder.codegen(self, prompt, do_sample, num_samples) @@ -277,10 +292,16 @@ def codegen( # construct prompt fmt = "json_object" if self.name == "gpt-4-1106-preview" else "text" - if fmt == "json_object": - message = r'Please complete the following code snippet by generating JSON like {"code": ""}' + if self.subset == "tool": + if fmt == "json_object": + message = r'Based on the given customized modules, please complete the following code snippet by generating JSON like {"code": ""} without using any external library APIs or defining any modules' + else: + message = r"You should only utilize the helper modules given in the markdown code block to provide a self-contained Python script that implements `task_func`. Assume all listed helper modules work as expected on the backend but are not visible for their implementation details. Call `refusal_func` if you cannot find proper helper modules in the code block to solve the task.\n\nPlease complete `task_func` in a markdown code block without using any external library APIs or defining any modules:" else: - message = r"Please generate self-contained code to complete the following problem:" + if fmt == "json_object": + message = r'Please complete the following code snippet by generating JSON like {"code": ""}' + else: + message = r"Please generate self-contained code to complete the following problem in a markdown code block:" message += f"\n```python\n{prompt.strip()}\n```" @@ -335,14 +356,31 @@ def codegen( batch_size = min(self.batch_size, num_samples) outputs = [] + + if self.subset == "tool": + message = f"""\ +You should only utilize the helper modules given in the markdown code block to provide a self-contained Python script that implements `task_func`. Assume all listed helper modules work as expected on the backend but are not visible for their implementation details. Call `refusal_func` if you cannot find proper helper modules in the code block to solve the task. 
+ +Please complete `task_func` in a markdown code block without using any external library APIs or defining any modules: +``` +{prompt.strip()} +``` +""" + else: + message = f"""\ +Please provide a self-contained Python script that solves the following problem in a markdown code block: +``` +{prompt.strip()} +``` +""" + for _ in range(batch_size): ret = self.client.chat( model=self.name, messages=[ ChatMessage( role="user", - content="Please generate self-contained code to solve the following problem in a Python markdown block:" - + f"\n```python\n{prompt.strip()}\n```", + content=message, ) ], max_tokens=self.max_new_tokens, @@ -383,22 +421,37 @@ def codegen( assert batch_size == 1, "Sampling only supports batch size of 1" outputs = [] + if self.subset == "tool": + message = f"""\ +You should only utilize the helper modules given in the markdown code block to provide a self-contained Python script that implements `task_func`. Assume all listed helper modules work as expected on the backend but are not visible for their implementation details. Call `refusal_func` if you cannot find proper helper modules in the code block to solve the task. + +Please complete `task_func` in a markdown code block without using any external library APIs or defining any modules: +``` +{prompt.strip()} +``` +""" + else: + message = f"""\ +Please provide a self-contained Python script that solves the following problem in a markdown code block: +``` +{prompt.strip()} +``` +""" for _ in range(batch_size): - message = anthropic_request.make_auto_request( - client=self.client, + ret = anthropic_request.make_auto_request( + client=self.client, model=self.name, messages=[ { "role": "user", - "content": "Please generate self-contained code to complete the following problem wrapped in a Python markdown block:" - + f"\n```python\n{prompt.strip()}\n```\n", + "content": message, } ], max_tokens=self.max_new_tokens, stop_sequences=["\n```\n", "\nif "], **kwargs, ) - outputs.append(message.content[0].text) + outputs.append(ret.content[0].text) return outputs @@ -459,12 +512,29 @@ def codegen( model = genai.GenerativeModel(model_name=self.name, generation_config=genai_config, safety_settings=safety_settings) outputs = [] + + if self.subset == "tool": + message = f"""\ +You should only utilize the helper modules given in the markdown code block to provide a self-contained Python script that implements `task_func`. Assume all listed helper modules work as expected on the backend but are not visible for their implementation details. Call `refusal_func` if you cannot find proper helper modules in the code block to solve the task. 
+ +Please complete `task_func` in a markdown code block without using any external library APIs or defining any modules: +``` +{prompt.strip()} +``` +""" + else: + message = f"""\ +Please provide a self-contained Python script that solves the following problem in a markdown code block: +``` +{prompt.strip()} +``` +""" + for _ in range(batch_size): while True: try: response = model.generate_content( - "Please generate self-contained code to complete the following problem wrapped in a Python markdown block:" - + f"\n```python\n{prompt.strip()}\n```", + message, generation_config=genai_config ) output = response.candidates[0].content.parts[0].text @@ -485,6 +555,8 @@ def codegen( def make_model( model: str, backend: str, + subset: str, + split: str, dataset: str = "bigcodebench", batch_size: int = 1, temperature: float = 0.0, @@ -497,6 +569,8 @@ def make_model( if backend == "vllm": return GeneralVllmDecoder( name=model, + subset=subset, + split=split, batch_size=batch_size, temperature=temperature, dataset=dataset, @@ -508,6 +582,8 @@ def make_model( elif backend == "hf": return GenenralHfTorchDecoder( name=model, + subset=subset, + split=split, batch_size=batch_size, temperature=temperature, dataset=dataset, @@ -518,6 +594,8 @@ def make_model( elif backend == "openai": return OpenAIChatDecoder( name=model, + subset=subset, + split=split, batch_size=batch_size, temperature=temperature, base_url=base_url, @@ -525,18 +603,24 @@ def make_model( elif backend == "mistral": return MistralChatDecoder( name=model, + subset=subset, + split=split, batch_size=batch_size, temperature=temperature, ) elif backend == "anthropic": return AnthropicMessageDecoder( name=model, + subset=subset, + split=split, batch_size=batch_size, temperature=temperature, ) elif backend == "google": return GeminiDecoder( name=model, + subset=subset, + split=split, batch_size=batch_size, temperature=temperature, ) \ No newline at end of file diff --git a/bigcodebench/sanitize.py b/bigcodebench/sanitize.py index df9ed4eb..29d092cb 100644 --- a/bigcodebench/sanitize.py +++ b/bigcodebench/sanitize.py @@ -1,6 +1,7 @@ """Post-processing LLM-generated Python code implemented using tree-sitter.""" import os +import re import pathlib from typing import Dict, Generator, List, Optional, Set, Tuple from pqdm.processes import pqdm @@ -15,7 +16,7 @@ write_directory, write_jsonl, ) -from bigcodebench.syncheck import syntax_check +from bigcodebench.syncheck import syntax_check, api_check CLASS_TYPE = "class_definition" FUNCTION_TYPE = "function_definition" @@ -108,7 +109,7 @@ def has_return_statement(node: Node) -> bool: return False -def sanitize(code: str, entrypoint: Optional[str] = None) -> str: +def sanitize(code: str, solution: Dict, entrypoint: Optional[str] = None) -> str: code = code_extract(code.strip()) code_bytes = bytes(code, "utf8") parser = get_parser("python") @@ -116,13 +117,15 @@ def sanitize(code: str, entrypoint: Optional[str] = None) -> str: class_names = set() function_names = set() variable_names = set() - + reachable = set() + root_node = tree.root_node import_nodes = [] definition_nodes = [] for child in root_node.children: if child.type in IMPORT_TYPE: + # if subset != "tool": import_nodes.append(child) elif child.type == CLASS_TYPE: name = get_definition_name(child) @@ -136,8 +139,11 @@ def sanitize(code: str, entrypoint: Optional[str] = None) -> str: if not ( name in function_names or name in variable_names or name in class_names ): + # if name == entrypoint: + # task_func_found = True + # if task_func_found: 
definition_nodes.append((name, child)) - function_names.add(get_definition_name(child)) + function_names.add(name) elif ( child.type == EXPRESSION_TYPE and child.children[0].type == ASSIGNMENT_TYPE ): @@ -146,12 +152,13 @@ def sanitize(code: str, entrypoint: Optional[str] = None) -> str: if not ( name in variable_names or name in function_names or name in class_names ): + # if task_func_found: definition_nodes.append((name, subchild)) variable_names.add(name) if entrypoint: name2deps = get_deps(definition_nodes) - reacheable = get_function_dependency(entrypoint, name2deps) + reachable = get_function_dependency(entrypoint, name2deps) sanitized_output = b"" @@ -160,9 +167,17 @@ def sanitize(code: str, entrypoint: Optional[str] = None) -> str: for pair in definition_nodes: name, node = pair - if entrypoint and not (name in reacheable): + if entrypoint and not (name in reachable): continue - sanitized_output += code_bytes[node.start_byte : node.end_byte] + b"\n" + node_code = code_bytes[node.start_byte : node.end_byte].decode("utf8") + if node.type == FUNCTION_TYPE and name == entrypoint: + # Remove return type annotation, including unnecessary spaces + node_code = re.sub(r"->\s*[^:]+:", ":", node_code) + # Ensure there is exactly one space before the colon + node_code = re.sub(r'\s*\)(\s*):', ') :', node_code) + node_code = re.sub(r"\s*:", ":", node_code) + node_code = re.sub(r":", " :", node_code) + sanitized_output += node_code.encode("utf8") + b"\n" sanitized_output = sanitized_output[:-1].decode("utf8") @@ -176,6 +191,9 @@ def sanitize(code: str, entrypoint: Optional[str] = None) -> str: outer_lines.append(i) if outer_lines: sanitized_output = "\n".join(lines[: outer_lines[-1]]) + # if subset == "tool": + # return "" if api_check(solution[f"{split}_tool_"] + "\n" + sanitized_output) else sanitized_output + # else: return sanitized_output @@ -183,6 +201,7 @@ def process_solution( sample_solution: Dict, dataset: Dict, entry_point: Dict, + subset: str, debug_task: str = None, calibrate: bool = False, is_folder: bool = False, @@ -207,8 +226,10 @@ def process_solution( if calibrate: old_code = old_code.replace("```python\n ", "```python\n"+dataset[task_id]["complete_prompt"]+" ") - new_code = sanitize(code=old_code, entrypoint=function_name) - + new_code = sanitize(code=old_code, solution=sample_solution, entrypoint=function_name) + if subset == "tool": + if api_check(new_code): + new_code = "" # if old code and new code are different, print msg if new_code != old_code: msg = "Sanitized: " + dbg_identifier @@ -220,12 +241,12 @@ def process_solution( def script( - samples: str, inplace: bool = False, debug_task: str = None, calibrate: bool = False, parallel: int=32 + samples: str, subset: str, inplace: bool = False, debug_task: str = None, calibrate: bool = False, parallel: int=32 ): # task_id -> entry_point entry_point = {} # merge two datasets - dataset = {**get_bigcodebench()} + dataset = {**get_bigcodebench(subset=subset)} for task_id, problem in dataset.items(): entry_point[task_id] = problem["entry_point"] @@ -233,18 +254,19 @@ def script( # make a new folder with "-sanitized" suffix is_folder = os.path.isdir(samples) target_path = pathlib.Path(samples) + target_path_name = target_path.name if not inplace: if is_folder: if calibrate: - new_name = target_path.name + "-sanitized-calibrated" + target_path_name = target_path_name + "-sanitized-calibrated" else: - new_name = target_path.name + "-sanitized" + target_path_name = target_path_name + "-sanitized" else: if calibrate: - new_name = 
target_path.name.replace(".jsonl", "-sanitized-calibrated.jsonl") + target_path_name = target_path_name.replace(".jsonl", "-sanitized-calibrated.jsonl") else: - new_name = target_path.name.replace(".jsonl", "-sanitized.jsonl") - target_path = target_path.parent / new_name + target_path_name = target_path_name.replace(".jsonl", "-sanitized.jsonl") + target_path = target_path.parent / target_path_name target_path = str(target_path) nsan = 0 @@ -257,6 +279,7 @@ def script( "sample_solution": sample_solution, "dataset": dataset, "entry_point": entry_point, + "subset": subset, "debug_task": debug_task, "calibrate": calibrate, "is_folder": is_folder, diff --git a/bigcodebench/syncheck.py b/bigcodebench/syncheck.py index 9ea97f4c..566afdbd 100755 --- a/bigcodebench/syncheck.py +++ b/bigcodebench/syncheck.py @@ -10,6 +10,61 @@ from bigcodebench.data import load_solutions +def api_check(code: str) -> bool: + tree = ast.parse(code) + imported_modules = set() + imported_names = {} + + class ApiExtractor(ast.NodeVisitor): + def __init__(self): + self.in_task_func = False + self.uses_library_api = False + + def visit_Import(self, node): + for alias in node.names: + imported_modules.add(alias.name) + if alias.asname: + imported_modules.add(alias.asname) + + def visit_ImportFrom(self, node): + if node.module: + for alias in node.names: + full_name = f'{node.module}.{alias.name}' + imported_names[alias.asname or alias.name] = full_name + + def visit_FunctionDef(self, node): + if node.name == 'task_func': + self.in_task_func = True + self.generic_visit(node) + self.in_task_func = False + else: + self.generic_visit(node) + + def visit_Attribute(self, node): + if self.in_task_func: + attr_chain = [] + current = node + while isinstance(current, ast.Attribute): + attr_chain.append(current.attr) + current = current.value + if isinstance(current, ast.Name): + attr_chain.append(current.id) + attr_chain.reverse() + full_name = '.'.join(attr_chain) + if attr_chain[0] in imported_modules or attr_chain[0] in imported_names: + self.uses_library_api = True + self.generic_visit(node) + + def visit_Name(self, node): + if self.in_task_func: + if node.id in imported_modules or node.id in imported_names: + self.uses_library_api = True + self.generic_visit(node) + + extractor = ApiExtractor() + extractor.visit(tree) + + return extractor.uses_library_api def syntax_check(code, verbose=False): try: diff --git a/tools/fix_v0110.py b/tools/fix_v0110.py index d1d1300c..0b7f75ec 100644 --- a/tools/fix_v0110.py +++ b/tools/fix_v0110.py @@ -19,7 +19,21 @@ def map_ds(sample): "Requirements:\n - sklearn.ensemble\n", "Requirements:\n - pandas\n - sklearn.ensemble\n" ) - + if sample["task_id"] in ["BigCodeBench/241"]: + for k in sample.keys(): + if "prompt" in k: + sample[k] = sample[k].replace( + "The function will plot the original and normalized arrays using matplotlib.", + "The function will plot the original and normalized arrays with a title of 'Original vs. Normalized Data'." + ) + if sample["task_id"] in ["BigCodeBench/267"]: + for k in sample.keys(): + if "prompt" in k: + sample[k] = sample[k].replace( + "Plots and returns the FFT of the signal.", + "Plots and returns the FFT of the signal with a title of 'FFT of the signal'." 
+ ) + return sample if __name__ == "__main__": @@ -28,7 +42,7 @@ def map_ds(sample): hard_ds_dict = load_dataset(BIGCODEBENCH_HARD_HF) ds = ds_dict[BIGCODEBENCH_VERSION] hard_ds = hard_ds_dict[BIGCODEBENCH_VERSION] - function_id = [37] + function_id = [37, 267, 241] new_ds = ds.map(map_ds) new_ds.to_json("BigCodeBench.jsonl")
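
# ---------------------------------------------------------------------------
# Illustrative sketch (assumptions: the patched bigcodebench package is
# installed and exposes extract_defined_modules from bigcodebench.eval, as the
# evaluate.py import in this patch indicates; the sample code string below is
# hypothetical). extract_defined_modules parses the combined tool
# implementation + solution and reports which locally defined functions and
# Class.method pairs are actually invoked inside the entry-point function;
# evaluate.py compares that list against the task's expected "used_tools"
# for the tool subset.
from bigcodebench.eval import extract_defined_modules

sample_code = '''
def helper_sum(a, b):
    return a + b

class Stats:
    def mean(self, xs):
        return sum(xs) / len(xs)

def task_func(xs):
    s = Stats()
    return helper_sum(s.mean(xs), 0)
'''

used = extract_defined_modules(sample_code, "task_func")
print(used)  # expected to contain "helper_sum" and "Stats.mean" (order may vary)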
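
# ---------------------------------------------------------------------------
# Illustrative sketch (assumptions: api_check is importable from
# bigcodebench.syncheck, matching the sanitize.py import in this patch; the
# two code strings are hypothetical). api_check walks the AST and returns
# True when `task_func` references anything imported at module level, which
# sanitize.py uses to blank out tool-subset solutions that fall back to
# external library APIs instead of the provided helper modules.
from bigcodebench.syncheck import api_check

print(api_check("import os\ndef task_func(p):\n    return os.path.exists(p)"))  # expected: True
print(api_check("def task_func(x):\n    return x + 1"))                         # expected: False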