From 6fbf723ab145c030fb5886eaeef43cee5b71a578 Mon Sep 17 00:00:00 2001
From: Terry Yue Zhuo
Date: Fri, 2 Jan 2026 11:48:16 +0000
Subject: [PATCH 1/2] add codex

---
 .gitignore                        |  2 +-
 bigcodebench/provider/__init__.py | 12 ++++++
 bigcodebench/provider/codex.py    | 63 +++++++++++++++++++++++++++++++
 3 files changed, 76 insertions(+), 1 deletion(-)
 create mode 100644 bigcodebench/provider/codex.py

diff --git a/.gitignore b/.gitignore
index abe84432..21f5fa2e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,7 +2,7 @@
 __pycache__/
 *.py[cod]
 *$py.class
-
+bcb_result*/
 # C extensions
 *.so
 
diff --git a/bigcodebench/provider/__init__.py b/bigcodebench/provider/__init__.py
index 4cb34102..239e3603 100644
--- a/bigcodebench/provider/__init__.py
+++ b/bigcodebench/provider/__init__.py
@@ -143,4 +143,16 @@ def make_model(
             max_new_tokens=max_new_tokens,
             instruction_prefix=instruction_prefix,
             response_prefix=response_prefix,
+        )
+    elif backend == "codex":
+        from bigcodebench.provider.codex import CodexDecoder
+
+        return CodexDecoder(
+            name=model,
+            subset=subset,
+            split=split,
+            temperature=temperature,
+            max_new_tokens=max_new_tokens,
+            instruction_prefix=instruction_prefix,
+            response_prefix=response_prefix,
         )
\ No newline at end of file
diff --git a/bigcodebench/provider/codex.py b/bigcodebench/provider/codex.py
new file mode 100644
index 00000000..435b1655
--- /dev/null
+++ b/bigcodebench/provider/codex.py
@@ -0,0 +1,63 @@
+import os
+import tempfile
+import subprocess
+from typing import List
+from concurrent.futures import ProcessPoolExecutor, as_completed
+from tqdm import tqdm
+
+from bigcodebench.provider.base import DecoderBase
+
+
+class CodexDecoder(DecoderBase):
+    def __init__(self, name: str, **kwargs) -> None:
+        super().__init__(name, **kwargs)
+        self.model = name
+
+    def codegen(
+        self, prompts: List[str], do_sample: bool = True, num_samples: int = 200
+    ) -> List[str]:
+        # Run codex for each prompt in parallel.
+        with ProcessPoolExecutor(max_workers=len(prompts)) as executor:
+            futures = [executor.submit(self._run_codex, i, prompt, num_samples) for i, prompt in enumerate(prompts)]
+            # Use tqdm to display progress as workers complete so the caller
+            # (generate.py) sees progress during long-running Codex jobs.
+            results = []
+            for future in tqdm(as_completed(futures), total=len(futures), desc=f"Codex: {self.model}"):
+                results.append(future.result())
+        # Sort results back to the original prompt order.
+        results.sort(key=lambda x: x[0])
+        return [res[1] for res in results]
+
+    def _run_codex(self, idx: int, prompt: str, num_samples: int) -> tuple[int, List[str]]:
+        # The codex CLI writes a single solution per invocation, so it is run
+        # once per requested sample, each time in a fresh temporary working
+        # directory, and the resulting file is collected ("" on any failure).
+        outputs = []
+        for _ in range(num_samples):
+            with tempfile.TemporaryDirectory() as temp_dir:
+                # Extend the prompt to instruct codex to write solution.py
+                modified_prompt = prompt + "\n\nWrite the complete solution code to the file 'solution.py' in the current directory."
+                # Run codex
+                try:
+                    result = subprocess.run(
+                        ["codex", "exec", "--skip-git-repo-check", "--cd", temp_dir, "--full-auto", modified_prompt],
+                        capture_output=True,
+                        text=True,
+                        timeout=300  # 5 min timeout
+                    )
+                    if result.returncode == 0:
+                        solution_file = os.path.join(temp_dir, "solution.py")
+                        if os.path.exists(solution_file):
+                            with open(solution_file, "r") as f:
+                                code = f.read()
+                            outputs.append(code)
+                        else:
+                            outputs.append("")  # No file written
+                    else:
+                        outputs.append("")  # Non-zero exit code
+                except subprocess.TimeoutExpired:
+                    outputs.append("")
+        return (idx, outputs)
+
+    def is_direct_completion(self) -> bool:
+        return False
\ No newline at end of file

From 176679ad853e069c48057fc736a19326feb752aa Mon Sep 17 00:00:00 2001
From: Terry Yue Zhuo
Date: Sat, 3 Jan 2026 07:22:39 +0000
Subject: [PATCH 2/2] support codex

---
 ADVANCED_USAGE.md | 4 ++--
 README.md         | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/ADVANCED_USAGE.md b/ADVANCED_USAGE.md
index c0905bad..4f1a26c4 100755
--- a/ADVANCED_USAGE.md
+++ b/ADVANCED_USAGE.md
@@ -105,7 +105,7 @@ bigcodebench.generate \
   --temperature [temp] \
   --n_samples [n_samples] \
   --resume \
-  --backend [vllm|openai|mistral|anthropic|google|hf] \
+  --backend [vllm|openai|mistral|anthropic|google|hf|codex] \
   --tp [TENSOR_PARALLEL_SIZE] \
   [--trust_remote_code] \
   [--base_url [base_url]] \
@@ -127,7 +127,7 @@ docker run --gpus '"device=$CUDA_VISIBLE_DEVICES"' -v $(pwd):/app -t bigcodebenc
   --temperature [temp] \
   --n_samples [n_samples] \
   --resume \
-  --backend [vllm|openai|mistral|anthropic|google|hf] \
+  --backend [vllm|openai|mistral|anthropic|google|hf|codex] \
   --tp [TENSOR_PARALLEL_SIZE]
 
 # ...Or if you are using CPUs
diff --git a/README.md b/README.md
index b5a38af5..9cbd1eab 100755
--- a/README.md
+++ b/README.md
@@ -127,7 +127,7 @@ bigcodebench.evaluate \
   --execution [e2b|gradio|local] \
   --split [complete|instruct] \
   --subset [full|hard] \
-  --backend [vllm|openai|anthropic|google|mistral|hf|hf-inference]
+  --backend [vllm|openai|anthropic|google|mistral|hf|hf-inference|codex]
 ```
 
- All the resulted files will be stored in a folder named `bcb_results`.
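
For reference, a minimal sketch of how the backend added above could be driven programmatically, assuming the `codex` CLI is installed and on PATH and that `make_model` accepts the keyword arguments visible in the `__init__.py` hunk; the model label, prompt, and generation settings below are illustrative placeholders, not part of the patch:

    from bigcodebench.provider import make_model

    # Hypothetical driver (assumption, not part of the patch): select the new
    # "codex" branch of make_model and request one sample for a single prompt.
    decoder = make_model(
        model="codex",      # stored as CodexDecoder.model; only used for the tqdm label
        backend="codex",    # routes to the elif branch added in __init__.py
        subset="full",
        split="complete",
        temperature=0.0,
        max_new_tokens=1280,
        instruction_prefix="",
        response_prefix="",
    )

    samples = decoder.codegen(
        ["Write a function add(a, b) that returns the sum of two numbers."],
        num_samples=1,
    )

    # codegen returns one list per prompt; each element is the text read back
    # from the solution.py that `codex exec` wrote in its temporary directory.
    print(samples[0][0])

Under this flow every sample is a separate `codex exec --full-auto` run in its own temporary directory, so large `--n_samples` values multiply wall-clock time accordingly.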