From 6fbf723ab145c030fb5886eaeef43cee5b71a578 Mon Sep 17 00:00:00 2001
From: Terry Yue Zhuo
Date: Fri, 2 Jan 2026 11:48:16 +0000
Subject: [PATCH 1/2] add codex

---
 .gitignore                        |  2 +-
 bigcodebench/provider/__init__.py | 12 ++++++
 bigcodebench/provider/codex.py    | 63 +++++++++++++++++++++++++++++++
 3 files changed, 76 insertions(+), 1 deletion(-)
 create mode 100644 bigcodebench/provider/codex.py

diff --git a/.gitignore b/.gitignore
index abe84432..21f5fa2e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,7 +2,7 @@
 __pycache__/
 *.py[cod]
 *$py.class
-
+bcb_result*/
 # C extensions
 *.so
 
diff --git a/bigcodebench/provider/__init__.py b/bigcodebench/provider/__init__.py
index 4cb34102..239e3603 100644
--- a/bigcodebench/provider/__init__.py
+++ b/bigcodebench/provider/__init__.py
@@ -143,4 +143,16 @@ def make_model(
             max_new_tokens=max_new_tokens,
             instruction_prefix=instruction_prefix,
             response_prefix=response_prefix,
+        )
+    elif backend == "codex":
+        from bigcodebench.provider.codex import CodexDecoder
+
+        return CodexDecoder(
+            name=model,
+            subset=subset,
+            split=split,
+            temperature=temperature,
+            max_new_tokens=max_new_tokens,
+            instruction_prefix=instruction_prefix,
+            response_prefix=response_prefix,
         )
\ No newline at end of file
diff --git a/bigcodebench/provider/codex.py b/bigcodebench/provider/codex.py
new file mode 100644
index 00000000..435b1655
--- /dev/null
+++ b/bigcodebench/provider/codex.py
@@ -0,0 +1,63 @@
+import os
+import tempfile
+import subprocess
+from typing import List
+from concurrent.futures import ProcessPoolExecutor, as_completed
+from tqdm import tqdm
+
+from bigcodebench.provider.base import DecoderBase
+
+
+class CodexDecoder(DecoderBase):
+    def __init__(self, name: str, **kwargs) -> None:
+        super().__init__(name, **kwargs)
+        self.model = name
+
+    def codegen(
+        self, prompts: List[str], do_sample: bool = True, num_samples: int = 200
+    ) -> List[str]:
+        # Run codex for each prompt in parallel.
+        with ProcessPoolExecutor(max_workers=len(prompts)) as executor:
+            futures = [executor.submit(self._run_codex, i, prompt, num_samples) for i, prompt in enumerate(prompts)]
+            # Use tqdm to display progress as workers complete so the caller
+            # (generate.py) sees progress during long-running Codex jobs.
+            results = []
+            for future in tqdm(as_completed(futures), total=len(futures), desc=f"Codex: {self.model}"):
+                results.append(future.result())
+        # Sort results back to the original prompt order.
+        results.sort(key=lambda x: x[0])
+        return [res[1] for res in results]
+
+    def _run_codex(self, idx: int, prompt: str, num_samples: int) -> tuple[int, List[str]]:
+        # The codex CLI writes a single solution per invocation, so it is run
+        # once per requested sample, each time in a fresh temporary working
+        # directory, and the resulting file is collected ("" on any failure).
+        outputs = []
+        for _ in range(num_samples):
+            with tempfile.TemporaryDirectory() as temp_dir:
+                # Extend the prompt to instruct codex to write solution.py
+                modified_prompt = prompt + "\n\nWrite the complete solution code to the file 'solution.py' in the current directory."
+                # Run codex
+                try:
+                    result = subprocess.run(
+                        ["codex", "exec", "--skip-git-repo-check", "--cd", temp_dir, "--full-auto", modified_prompt],
+                        capture_output=True,
+                        text=True,
+                        timeout=300  # 5 min timeout
+                    )
+                    if result.returncode == 0:
+                        solution_file = os.path.join(temp_dir, "solution.py")
+                        if os.path.exists(solution_file):
+                            with open(solution_file, "r") as f:
+                                code = f.read()
+                            outputs.append(code)
+                        else:
+                            outputs.append("")  # No file written
+                    else:
+                        outputs.append("")  # Non-zero exit code
+                except subprocess.TimeoutExpired:
+                    outputs.append("")
+        return (idx, outputs)
+
+    def is_direct_completion(self) -> bool:
+        return False
\ No newline at end of file

From 176679ad853e069c48057fc736a19326feb752aa Mon Sep 17 00:00:00 2001
From: Terry Yue Zhuo
Date: Sat, 3 Jan 2026 07:22:39 +0000
Subject: [PATCH 2/2] support codex

---
 ADVANCED_USAGE.md | 4 ++--
 README.md         | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/ADVANCED_USAGE.md b/ADVANCED_USAGE.md
index c0905bad..4f1a26c4 100755
--- a/ADVANCED_USAGE.md
+++ b/ADVANCED_USAGE.md
@@ -105,7 +105,7 @@ bigcodebench.generate \
   --temperature [temp] \
   --n_samples [n_samples] \
   --resume \
-  --backend [vllm|openai|mistral|anthropic|google|hf] \
+  --backend [vllm|openai|mistral|anthropic|google|hf|codex] \
   --tp [TENSOR_PARALLEL_SIZE] \
   [--trust_remote_code] \
   [--base_url [base_url]] \
@@ -127,7 +127,7 @@ docker run --gpus '"device=$CUDA_VISIBLE_DEVICES"' -v $(pwd):/app -t bigcodebenc
   --temperature [temp] \
   --n_samples [n_samples] \
   --resume \
-  --backend [vllm|openai|mistral|anthropic|google|hf] \
+  --backend [vllm|openai|mistral|anthropic|google|hf|codex] \
   --tp [TENSOR_PARALLEL_SIZE]
 
 # ...Or if you are using CPUs
diff --git a/README.md b/README.md
index b5a38af5..9cbd1eab 100755
--- a/README.md
+++ b/README.md
@@ -127,7 +127,7 @@ bigcodebench.evaluate \
   --execution [e2b|gradio|local] \
   --split [complete|instruct] \
   --subset [full|hard] \
-  --backend [vllm|openai|anthropic|google|mistral|hf|hf-inference]
+  --backend [vllm|openai|anthropic|google|mistral|hf|hf-inference|codex]
 ```
 
- All the resulted files will be stored in a folder named `bcb_results`.
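
For reference, a minimal sketch of how the backend added above could be driven programmatically, assuming the `codex` CLI is installed and on PATH and that `make_model` accepts the keyword arguments visible in the `__init__.py` hunk; the model label, prompt, and generation settings below are illustrative placeholders, not part of the patch:

    from bigcodebench.provider import make_model

    # Hypothetical driver (assumption, not part of the patch): select the new
    # "codex" branch of make_model and request one sample for a single prompt.
    decoder = make_model(
        model="codex",      # stored as CodexDecoder.model; only used for the tqdm label
        backend="codex",    # routes to the elif branch added in __init__.py
        subset="full",
        split="complete",
        temperature=0.0,
        max_new_tokens=1280,
        instruction_prefix="",
        response_prefix="",
    )

    samples = decoder.codegen(
        ["Write a function add(a, b) that returns the sum of two numbers."],
        num_samples=1,
    )

    # codegen returns one list per prompt; each element is the text read back
    # from the solution.py that `codex exec` wrote in its temporary directory.
    print(samples[0][0])

Under this flow every sample is a separate `codex exec --full-auto` run in its own temporary directory, so large `--n_samples` values multiply wall-clock time accordingly.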