diff --git a/.gitignore b/.gitignore
index abe8443..21f5fa2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,7 +2,7 @@
 __pycache__/
 *.py[cod]
 *$py.class
-
+bcb_result*/
 # C extensions
 *.so
 
diff --git a/ADVANCED_USAGE.md b/ADVANCED_USAGE.md
index c0905ba..4f1a26c 100755
--- a/ADVANCED_USAGE.md
+++ b/ADVANCED_USAGE.md
@@ -105,7 +105,7 @@ bigcodebench.generate \
   --temperature [temp] \
   --n_samples [n_samples] \
   --resume \
-  --backend [vllm|openai|mistral|anthropic|google|hf] \
+  --backend [vllm|openai|mistral|anthropic|google|hf|codex] \
   --tp [TENSOR_PARALLEL_SIZE] \
   [--trust_remote_code] \
   [--base_url [base_url]] \
@@ -127,7 +127,7 @@ docker run --gpus '"device=$CUDA_VISIBLE_DEVICES"' -v $(pwd):/app -t bigcodebenc
   --temperature [temp] \
   --n_samples [n_samples] \
   --resume \
-  --backend [vllm|openai|mistral|anthropic|google|hf] \
+  --backend [vllm|openai|mistral|anthropic|google|hf|codex] \
   --tp [TENSOR_PARALLEL_SIZE]
 
 # ...Or if you are using CPUs
diff --git a/README.md b/README.md
index b5a38af..9cbd1ea 100755
--- a/README.md
+++ b/README.md
@@ -127,7 +127,7 @@ bigcodebench.evaluate \
   --execution [e2b|gradio|local] \
   --split [complete|instruct] \
   --subset [full|hard] \
-  --backend [vllm|openai|anthropic|google|mistral|hf|hf-inference]
+  --backend [vllm|openai|anthropic|google|mistral|hf|hf-inference|codex]
 ```
 
 - All the resulted files will be stored in a folder named `bcb_results`.
diff --git a/bigcodebench/provider/__init__.py b/bigcodebench/provider/__init__.py
index 4cb3410..239e360 100644
--- a/bigcodebench/provider/__init__.py
+++ b/bigcodebench/provider/__init__.py
@@ -143,4 +143,16 @@ def make_model(
             max_new_tokens=max_new_tokens,
             instruction_prefix=instruction_prefix,
             response_prefix=response_prefix,
-        )
\ No newline at end of file
+        )
+    elif backend == "codex":
+        from bigcodebench.provider.codex import CodexDecoder
+
+        return CodexDecoder(
+            name=model,
+            subset=subset,
+            split=split,
+            temperature=temperature,
+            max_new_tokens=max_new_tokens,
+            instruction_prefix=instruction_prefix,
+            response_prefix=response_prefix,
+        )
\ No newline at end of file
diff --git a/bigcodebench/provider/codex.py b/bigcodebench/provider/codex.py
new file mode 100644
index 0000000..435b165
--- /dev/null
+++ b/bigcodebench/provider/codex.py
@@ -0,0 +1,63 @@
+import os
+import tempfile
+import subprocess
+from typing import List, Tuple
+from concurrent.futures import ProcessPoolExecutor, as_completed
+from tqdm import tqdm
+
+from bigcodebench.provider.base import DecoderBase
+
+
+class CodexDecoder(DecoderBase):
+    def __init__(self, name: str, **kwargs) -> None:
+        super().__init__(name, **kwargs)
+        self.model = name
+
+    def codegen(
+        self, prompts: List[str], do_sample: bool = True, num_samples: int = 200
+    ) -> List[str]:
+        # Run codex for every prompt in parallel, one worker per prompt.
+        with ProcessPoolExecutor(max_workers=len(prompts)) as executor:
+            futures = [executor.submit(self._run_codex, i, prompt, num_samples) for i, prompt in enumerate(prompts)]
+            # Use tqdm to display progress as workers complete so the caller
+            # (generate.py) sees progress during long-running Codex jobs.
+            results = []
+            for future in tqdm(as_completed(futures), total=len(futures), desc=f"Codex: {self.model}"):
+                results.append(future.result())
+        # Sort results back into the original prompt order.
+        results.sort(key=lambda x: x[0])
+        return [res[1] for res in results]
+
+    def _run_codex(self, idx: int, prompt: str, num_samples: int) -> Tuple[int, List[str]]:
+        # Each codex invocation produces a single solution file, so run it
+        # num_samples times to collect num_samples completions per prompt.
+        # Failed runs append an empty string so sample counts stay aligned.
+        outputs = []
+        for _ in range(num_samples):
+            with tempfile.TemporaryDirectory() as temp_dir:
+                # Ask codex to write its solution to a fixed file name so the
+                # generated code can be read back from the sandbox directory.
+                modified_prompt = prompt + "\n\nWrite the complete solution code to the file 'solution.py' in the current directory."
+                try:
+                    result = subprocess.run(
+                        ["codex", "exec", "--skip-git-repo-check", "--cd", temp_dir, "--full-auto", modified_prompt],
+                        capture_output=True,
+                        text=True,
+                        timeout=300,  # 5-minute timeout per sample
+                    )
+                    if result.returncode == 0:
+                        solution_file = os.path.join(temp_dir, "solution.py")
+                        if os.path.exists(solution_file):
+                            with open(solution_file, "r") as f:
+                                code = f.read()
+                            outputs.append(code)
+                        else:
+                            outputs.append("")  # codex exited cleanly but wrote no file
+                    else:
+                        outputs.append("")  # codex returned a non-zero exit code
+                except subprocess.TimeoutExpired:
+                    outputs.append("")  # codex exceeded the timeout
+        return (idx, outputs)
+
+    def is_direct_completion(self) -> bool:
+        return False
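
Note: the new `codex` backend shells out to the Codex CLI (`codex exec`), so the `codex` binary must be installed and authenticated on the machine running generation. No model argument is forwarded to the CLI, so it runs with whatever model it is configured to use; the `--model` value passed to `bigcodebench.generate` mainly labels the run. A minimal smoke-test invocation, following the placeholder conventions from the docs above:

bigcodebench.generate \
  --model [model_name] \
  --split [complete|instruct] \
  --subset [full|hard] \
  --backend codex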