diff --git a/ADVANCED_USAGE.md b/ADVANCED_USAGE.md
index 4f48eca..c0905ba 100755
--- a/ADVANCED_USAGE.md
+++ b/ADVANCED_USAGE.md
@@ -50,6 +50,7 @@ Below are all the arguments for `bigcodebench.evaluate` for the remote evaluatio
- `--n_samples`: The number of samples, default to `1`
- `--temperature`: The temperature, default to `0.0`
- `--max_new_tokens`: The length of max new tokens, default to `1280`
+- `--max_model_len`: The maximum model context length (in tokens) for vLLM, default to `12800`
- `--greedy`: Whether to use greedy decoding, default to `False`
- `--strip_newlines`: Whether to strip newlines, default to `False`, set to `True` to strip newlines for some model series like StarCoder2
- `--direct_completion`: Whether to use direct completion, default to `False`
@@ -69,7 +70,8 @@ Below are all the arguments for `bigcodebench.evaluate` for the remote evaluatio
- `--tokenizer_legacy`: Whether to use the legacy tokenizer, default to `False`
- `--samples`: The path to the generated samples file, default to `None`
- `--no_execute`: Whether to not execute the samples, default to `False`
-- `--remote_execute_api`: The API endpoint for remote execution, default to `https://bigcode-bigcodebench-evaluator.hf.space/`, you can also use your own Gradio API endpoint by cloning the [bigcodebench-evaluator](https://huggingface.co/spaces/bigcode/bigcodebench-evaluator) repo and check `Use via API` at the bottom of the HF space page
+- `--e2b_endpoint`: The E2B sandbox template used for remote execution, default to `bigcodebench_evaluator`
+- `--gradio_endpoint`: The API endpoint for remote execution, default to `https://bigcode-bigcodebench-evaluator.hf.space/`; you can also use your own Gradio API endpoint by cloning the [bigcodebench-evaluator](https://huggingface.co/spaces/bigcode/bigcodebench-evaluator) repo and checking `Use via API` at the bottom of the HF space page (see the example below)
- `--pass_k`: The `k` in `Pass@k`, default to `[1, 5, 10]`, e.g. `--pass_k 1,5,10` will evaluate `Pass@1`, `Pass@5` and `Pass@10`
- `--calibrated`: Whether to use the calibrated samples, default to `True`
- `--save_pass_rate`: Whether to save the pass rate to a file, default to `True`
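+
+For example, a minimal sketch of pointing the evaluation at your own endpoints (the space URL and template name below are illustrative placeholders, and the remaining generation arguments are elided):
+
+```bash
+# Self-hosted Gradio evaluator space
+bigcodebench.evaluate --execution gradio --gradio_endpoint https://<your-username>-bigcodebench-evaluator.hf.space/ ...
+
+# Custom E2B sandbox template
+bigcodebench.evaluate --execution e2b --e2b_endpoint <your-template-name> ...
+```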
diff --git a/Docker/Evaluate.Dockerfile b/Docker/Evaluate.Dockerfile
index 90e7f40..8b2cdcd 100755
--- a/Docker/Evaluate.Dockerfile
+++ b/Docker/Evaluate.Dockerfile
@@ -54,7 +54,7 @@ RUN pip install \
rich \
accelerate \
anthropic \
- google-generativeai \
+ google-genai \
mistralai \
openai \
e2b
diff --git a/README.md b/README.md
index 4bcea25..b5a38af 100755
--- a/README.md
+++ b/README.md
@@ -27,8 +27,8 @@
🎉 Check out our latest work!
- 🌟 SWE Arena 🌟
- 🚀 Open Evaluation Platform on AI for Software Engineering 🚀
+ 🌟 BigCodeArena 🌟
+ 🚀 Open Evaluation Platform on AI for Vibe Coding 🚀
✨ 100% free to use the latest frontier models! ✨
@@ -127,7 +127,7 @@ bigcodebench.evaluate \
--execution [e2b|gradio|local] \
--split [complete|instruct] \
--subset [full|hard] \
- --backend [vllm|openai|anthropic|google|mistral|hf]
+ --backend [vllm|openai|anthropic|google|mistral|hf|hf-inference]
```
- All the resulting files will be stored in a folder named `bcb_results`.
@@ -177,10 +177,17 @@ Access Gemini APIs from [Google AI Studio](https://aistudio.google.com/)
export GOOGLE_API_KEY=
```
+Access the [Hugging Face Serverless Inference API](https://huggingface.co/docs/api-inference/en/index)
+```bash
+export HF_INFERENCE_API_KEY=
+```
+
+Please make sure your HF access token has the `Make calls to inference providers` permission.
+
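+A minimal sketch of running generation through this backend (the model name is an illustrative placeholder):
+
+```bash
+bigcodebench.evaluate \
+  --model <org>/<model> \
+  --execution gradio \
+  --split complete \
+  --subset full \
+  --backend hf-inference
+```
+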
## 💻 LLM-generated Code
We share pre-generated code samples from LLMs we have [evaluated](https://huggingface.co/spaces/bigcode/bigcodebench-leaderboard) on the full set:
-* See the attachment of our [v0.2.1.post7](https://github.com/bigcode-project/bigcodebench/releases/tag/v0.2.1.post7). We include `sanitized_samples_calibrated.zip` for your convenience.
+* See the attachment of our [v0.2.4](https://github.com/bigcode-project/bigcodebench/releases/tag/v0.2.4). We include `sanitized_samples_calibrated.zip` for your convenience.
## 🧑 Advanced Usage
diff --git a/analysis/get_results.py b/analysis/get_results.py
index fc5aa17..641c43b 100755
--- a/analysis/get_results.py
+++ b/analysis/get_results.py
@@ -4,7 +4,7 @@
import numpy as np
from numpy import mean
from glob import glob
-from utils import *
+from utils import model_info
from tqdm import tqdm
import pandas as pd
import itertools
@@ -48,6 +48,8 @@ def get_results(tids):
"moe": info["moe"],
"size": info["size"],
"act_param": info["act_param"],
+ "date": info.get("date", None),
+ "prefill": info.get("prefill", False),
# "direct_complete": info["direct_complete"],
}
@@ -118,12 +120,12 @@ def check_valid(results):
def split_gen():
- shutil.rmtree("sanitized_samples", ignore_errors=True)
shutil.rmtree("sanitized_calibrated_samples", ignore_errors=True)
- os.makedirs("sanitized_samples/complete", exist_ok=True)
- os.makedirs("sanitized_samples/instruct", exist_ok=True)
- os.makedirs("sanitized_calibrated_samples/complete", exist_ok=True)
- os.makedirs("sanitized_calibrated_samples/instruct", exist_ok=True)
+ os.makedirs("sanitized_calibrated_samples/hard/complete", exist_ok=True)
+ os.makedirs("sanitized_calibrated_samples/hard/instruct", exist_ok=True)
+ os.makedirs("sanitized_calibrated_samples/full/complete", exist_ok=True)
+ os.makedirs("sanitized_calibrated_samples/full/instruct", exist_ok=True)
+
for model, info in model_info.items():
model = model.replace("/", "--")
files = glob(f"results/{model}--bigcodebench-*.jsonl")
@@ -131,27 +133,21 @@ def split_gen():
model = info["link"].split("https://huggingface.co/")[-1].replace("/", "--")
for file in files:
+ if "-sanitized" not in file or "calibrated" not in file:
+ continue
+
_, suffix = os.path.basename(file).split("--bigcodebench-")
with open(file, "r") as f:
data = f.readlines()
- if "-sanitized" in file:
- if "calibrated" in file:
- if info["prompted"]:
- if suffix.startswith("complete"):
- with open(f"sanitized_calibrated_samples/complete/{model}--bigcodebench-{suffix}", "w") as f:
- f.writelines(data)
- else:
- with open(f"sanitized_calibrated_samples/instruct/{model}--bigcodebench-{suffix}", "w") as f:
- f.writelines(data)
+ split_type = "hard" if "-hard-" in file else "full"
+ if info["prompted"]:
+ if suffix.startswith("complete") or suffix.startswith("hard-complete"):
+ with open(f"sanitized_calibrated_samples/{split_type}/complete/{model}--bigcodebench-{suffix}", "w") as f:
+ f.writelines(data)
else:
- if suffix.startswith("complete"):
- with open(f"sanitized_samples/complete/{model}--bigcodebench-{suffix}", "w") as f:
- f.writelines(data)
- else:
- with open(f"sanitized_samples/instruct/{model}--bigcodebench-{suffix}", "w") as f:
- f.writelines(data)
-
+ with open(f"sanitized_calibrated_samples/{split_type}/instruct/{model}--bigcodebench-{suffix}", "w") as f:
+ f.writelines(data)
def read_task_perf(tids, task="complete"):
model_results = dict()
@@ -255,7 +251,7 @@ def get_solve_rate(data_dict, task="complete"):
def get_hf_ds(results):
hf_dataset = {"model": [], "link": [], "moe": [], "size": [], "act_param": [], "type": [], #"lazy": [],# "direct_complete": [],
- "complete": [], "instruct": []}
+ "complete": [], "instruct": [], "date": [], "prefill": []}
for model, result in results.items():
hf_dataset["model"].append(model)
@@ -267,6 +263,8 @@ def get_hf_ds(results):
# hf_dataset["lazy"].append(result["lazy"])
hf_dataset["complete"].append(result["pass@1"]["complete"])
hf_dataset["instruct"].append(result["pass@1"]["instruct"])
+ hf_dataset["date"].append(result["date"])
+ hf_dataset["prefill"].append(result["prefill"])
# hf_dataset["direct_complete"].append(result["direct_complete"])
return Dataset.from_dict(hf_dataset)
@@ -302,7 +300,7 @@ def get_perf_df(data_dict):
if __name__ == "__main__":
- # split_gen()
+ split_gen()
bcb_orig = load_dataset("bigcode/bigcodebench", split="v0.1.1")
bcb_hard = load_dataset("bigcode/bigcodebench-hard", split="v0.1.1")
bcb_config = {
diff --git a/analysis/utils.py b/analysis/utils.py
index 430e113..20ecbf5 100755
--- a/analysis/utils.py
+++ b/analysis/utils.py
@@ -7,7 +7,8 @@
"size": 6.7,
"act_param": 6.7,
"open-data": "Partial",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-12-04",
},
"bigcode/starcoder2-15b-instruct-v0.1": {
"name": "StarCoder2-15B-Instruct-v0.1",
@@ -17,7 +18,8 @@
"size": 15,
"act_param": 15,
"open-data": "Full",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-04-30"
},
"bigcode/starcoder2-3b": {
"name": "StarCoder2-3B",
@@ -27,7 +29,8 @@
"size": 3,
"act_param": 3,
"open-data": "Full",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-02-29"
},
"bigcode/starcoder2-7b": {
"name": "StarCoder2-7B",
@@ -37,7 +40,8 @@
"size": 7,
"act_param": 7,
"open-data": "Full",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-02-29"
},
"bigcode/starcoder2-15b": {
"name": "StarCoder2-15B",
@@ -47,7 +51,8 @@
"size": 15,
"act_param": 15,
"open-data": "Full",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-02-29"
},
"Qwen/CodeQwen1.5-7B": {
"name": "CodeQwen1.5-7B",
@@ -57,7 +62,8 @@
"size": 7,
"act_param": 7,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-04-16"
},
"google/codegemma-2b": {
"name": "CodeGemma-2B",
@@ -67,7 +73,8 @@
"size": 2,
"act_param": 2,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-04-10"
},
"google/codegemma-7b": {
"name": "CodeGemma-7B",
@@ -77,7 +84,8 @@
"size": 7,
"act_param": 7,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-04-10"
},
"google/codegemma-7b-it": {
"name": "CodeGemma-7B-Instruct",
@@ -87,7 +95,8 @@
"size": 7,
"act_param": 7,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-04-10"
},
"gpt-3.5-turbo-0125": {
"name": "GPT-3.5-Turbo-0125",
@@ -97,7 +106,8 @@
"size": None,
"act_param": None,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-01-25"
},
"gpt-4o": {
"name": "GPT-4o-2024-05-13",
@@ -107,7 +117,8 @@
"size": None,
"act_param": None,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-05-13"
},
"gpt-4-turbo-2024-04-09": {
"name": "GPT-4-Turbo-2024-04-09",
@@ -117,7 +128,8 @@
"size": None,
"act_param": None,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-04-09"
},
"gpt-4-0613": {
"name": "GPT-4-0613",
@@ -127,7 +139,8 @@
"size": None,
"act_param": None,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-06-13"
},
"codellama/CodeLlama-7b-hf": {
"name": "CodeLlama-7B-Base",
@@ -137,7 +150,8 @@
"size": 7,
"act_param": 7,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2023-08-25"
},
"codellama/CodeLlama-13b-hf": {
"name": "CodeLlama-13B-Base",
@@ -147,7 +161,8 @@
"size": 13,
"act_param": 13,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2023-08-25"
},
"codellama/CodeLlama-7b-Instruct-hf": {
"name": "CodeLlama-7B-Instruct",
@@ -157,7 +172,8 @@
"size": 7,
"act_param": 7,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2023-08-25"
},
"codellama/CodeLlama-13b-Instruct-hf": {
"name": "CodeLlama-13B-Instruct",
@@ -167,7 +183,8 @@
"size": 13,
"act_param": 13,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2023-08-25"
},
"mistral-large-2402": {
"name": "Mistral-Large-2402",
@@ -177,7 +194,8 @@
"size": None,
"act_param": None,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-02-26"
},
"mistral-small-2402": {
"name": "Mistral-Small-2402",
@@ -187,7 +205,8 @@
"size": None,
"act_param": None,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-02-26"
},
"mistralai/Mixtral-8x22B-v0.1": {
"name": "Mixtral-8x22B-Base",
@@ -197,7 +216,8 @@
"size": 176,
"act_param": 44,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-04-17"
},
"mistralai/Mixtral-8x22B-Instruct-v0.1": {
"name": "Mixtral-8x22B-Instruct",
@@ -207,7 +227,8 @@
"size": 176,
"act_param": 44,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-04-17"
},
"codellama/CodeLlama-34b-hf": {
"name": "CodeLlama-34B-Base",
@@ -217,7 +238,8 @@
"size": 34,
"act_param": 34,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2023-08-25"
},
"codellama/CodeLlama-34b-Instruct-hf": {
"name": "CodeLlama-34B-Instruct",
@@ -227,7 +249,8 @@
"size": 34,
"act_param": 34,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2023-08-25"
},
"codellama/CodeLlama-70b-hf": {
"name": "CodeLlama-70B-Base",
@@ -237,7 +260,8 @@
"size": 70,
"act_param": 70,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2023-08-25"
},
"codellama/CodeLlama-70b-Instruct-hf": {
"name": "CodeLlama-70B-Instruct",
@@ -247,7 +271,8 @@
"size": 70,
"act_param": 70,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2023-08-25"
},
"Qwen/CodeQwen1.5-7B-Chat": {
"name": "CodeQwen1.5-7B-Chat",
@@ -257,7 +282,8 @@
"size": 7,
"act_param": 7,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-04-16"
},
"Qwen/Qwen1.5-110B-Chat": {
"name": "Qwen1.5-110B-Chat",
@@ -267,7 +293,8 @@
"size": 110,
"act_param": 110,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-04-26"
},
"Qwen/Qwen1.5-72B-Chat": {
"name": "Qwen1.5-72B-Chat",
@@ -277,7 +304,8 @@
"size": 72,
"act_param": 72,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-04-26"
},
"Qwen/Qwen1.5-32B-Chat": {
"name": "Qwen1.5-32B-Chat",
@@ -287,7 +315,8 @@
"size": 32,
"act_param": 32,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-04-26"
},
"deepseek-ai/DeepSeek-V2-Chat": {
"name": "DeepSeek-V2-Chat",
@@ -297,7 +326,8 @@
"size": 236,
"act_param": 21,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-04-06"
},
"deepseek-ai/deepseek-coder-1.3b-base": {
"name": "DeepSeek-Coder-1.3B-Base",
@@ -307,7 +337,8 @@
"size": 1.3,
"act_param": 1.3,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2023-10-28"
},
"deepseek-ai/deepseek-coder-1.3b-instruct": {
"name": "DeepSeek-Coder-1.3B-Instruct",
@@ -317,7 +348,8 @@
"size": 1.3,
"act_param": 1.3,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2023-10-28"
},
"deepseek-ai/deepseek-coder-33b-base": {
"name": "DeepSeek-Coder-33B-Base",
@@ -327,7 +359,8 @@
"size": 33,
"act_param": 33,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2023-10-28"
},
"deepseek-ai/deepseek-coder-33b-instruct": {
"name": "DeepSeek-Coder-33B-Instruct",
@@ -337,7 +370,8 @@
"size": 33,
"act_param": 33,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2023-10-28"
},
"deepseek-ai/deepseek-coder-6.7b-base": {
"name": "DeepSeek-Coder-6.7B-Base",
@@ -347,7 +381,8 @@
"size": 6.7,
"act_param": 6.7,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2023-10-28"
},
"deepseek-ai/deepseek-coder-6.7b-instruct": {
"name": "DeepSeek-Coder-6.7B-Instruct",
@@ -357,7 +392,8 @@
"size": 6.7,
"act_param": 6.7,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2023-10-28"
},
"meta-llama/Meta-Llama-3-70B": {
"name": "Llama-3-70B-Base",
@@ -367,7 +403,8 @@
"size": 70,
"act_param": 70,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-04-18"
},
"meta-llama/Meta-Llama-3-70B-Instruct": {
"name": "Llama-3-70B-Instruct",
@@ -377,7 +414,8 @@
"size": 70,
"act_param": 70,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-04-18"
},
"meta-llama/Meta-Llama-3-8B": {
"name": "Llama-3-8B-Base",
@@ -387,7 +425,8 @@
"size": 8,
"act_param": 8,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-04-18"
},
"meta-llama/Meta-Llama-3-8B-Instruct": {
"name": "Llama-3-8B-Instruct",
@@ -397,7 +436,8 @@
"size": 8,
"act_param": 8,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-04-18"
},
"ibm-granite/granite-3b-code-instruct": {
"name": "Granite-Code-3B-Instruct",
@@ -407,7 +447,8 @@
"size": 3,
"act_param": 3,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-05-06"
},
"ibm-granite/granite-8b-code-instruct": {
"name": "Granite-Code-8B-Instruct",
@@ -417,7 +458,8 @@
"size": 8,
"act_param": 8,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-05-06"
},
"ibm-granite/granite-20b-code-instruct": {
"name": "Granite-Code-20B-Instruct",
@@ -427,7 +469,8 @@
"size": 20,
"act_param": 20,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-05-06"
},
"ibm-granite/granite-34b-code-instruct": {
"name": "Granite-Code-34B-Instruct",
@@ -437,7 +480,8 @@
"size": 34,
"act_param": 34,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-05-06"
},
"ibm-granite/granite-3b-code-base": {
"name": "Granite-Code-3B-Base",
@@ -447,7 +491,8 @@
"size": 3,
"act_param": 3,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-05-06"
},
"ibm-granite/granite-8b-code-base": {
"name": "Granite-Code-8B-Base",
@@ -457,7 +502,8 @@
"size": 8,
"act_param": 8,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-05-06"
},
"ibm-granite/granite-20b-code-base": {
"name": "Granite-Code-20B-Base",
@@ -467,7 +513,8 @@
"size": 20,
"act_param": 20,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-05-06"
},
"ibm-granite/granite-34b-code-base": {
"name": "Granite-Code-34B-Base",
@@ -477,7 +524,8 @@
"size": 34,
"act_param": 34,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-05-06"
},
"claude-3-haiku-20240307": {
"name": "Claude-3-Haiku-20240307",
@@ -487,7 +535,8 @@
"size": None,
"act_param": None,
"open-data": "None",
- "reasoning": True,
+ "prefill": False,
+ "date": "2024-03-07"
},
"claude-3-sonnet-20240229": {
"name": "Claude-3-Sonnet-20240229",
@@ -497,7 +546,8 @@
"size": None,
"act_param": None,
"open-data": "None",
- "reasoning": True,
+ "prefill": False,
+ "date": "2024-02-29"
},
"claude-3-opus-20240229": {
"name": "Claude-3-Opus-20240229",
@@ -507,7 +557,8 @@
"size": None,
"act_param": None,
"open-data": "None",
- "reasoning": True,
+ "prefill": False,
+ "date": "2024-02-29"
},
"01-ai/Yi-1.5-34B-Chat": {
"name": "Yi-1.5-34B-Chat",
@@ -517,7 +568,8 @@
"size": 34,
"act_param": 34,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-05-20"
},
"01-ai/Yi-1.5-34B": {
"name": "Yi-1.5-34B",
@@ -527,7 +579,8 @@
"size": 34,
"act_param": 34,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-05-20"
},
"01-ai/Yi-1.5-9B-Chat": {
"name": "Yi-1.5-9B-Chat",
@@ -537,7 +590,8 @@
"size": 9,
"act_param": 9,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-05-20"
},
"01-ai/Yi-1.5-9B": {
"name": "Yi-1.5-9B",
@@ -547,7 +601,8 @@
"size": 9,
"act_param": 9,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-05-20"
},
"01-ai/Yi-1.5-6B-Chat": {
"name": "Yi-1.5-6B-Chat",
@@ -557,7 +612,8 @@
"size": 6,
"act_param": 6,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-05-20"
},
"01-ai/Yi-1.5-6B": {
"name": "Yi-1.5-6B",
@@ -567,7 +623,8 @@
"size": 6,
"act_param": 6,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-05-20"
},
"Qwen/Qwen2-57B-A14B": {
"name": "Qwen2-57B-A14B",
@@ -577,7 +634,8 @@
"size": 57,
"act_param": 14,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-06-07"
},
"Qwen/Qwen2-7B-Instruct": {
"name": "Qwen2-7B-Instruct",
@@ -587,7 +645,8 @@
"size": 7,
"act_param": 7,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-06-07"
},
"Qwen/Qwen2-72B-Chat": {
"name": "Qwen2-72B-Chat",
@@ -597,7 +656,8 @@
"size": 72,
"act_param": 72,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-06-07"
},
"gemini-1.5-pro": {
"name": "Gemini-1.5-Pro-API-0514",
@@ -607,7 +667,8 @@
"size": None,
"act_param": None,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-05-14"
},
"gemini-1.5-flash": {
"name": "Gemini-1.5-Flash-API-0514",
@@ -617,7 +678,8 @@
"size": None,
"act_param": None,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-05-14"
},
"m-a-p/OpenCodeInterpreter-DS-33B": {
"name": "OpenCodeInterpreter-DS-33B",
@@ -627,7 +689,8 @@
"size": 33,
"act_param": 33,
"open-data": "Partial",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-02-22"
},
"m-a-p/OpenCodeInterpreter-DS-6.7B": {
"name": "OpenCodeInterpreter-DS-6.7B",
@@ -637,7 +700,8 @@
"size": 6.7,
"act_param": 6.7,
"open-data": "Partial",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-02-22"
},
"m-a-p/OpenCodeInterpreter-DS-1.3B": {
"name": "OpenCodeInterpreter-DS-1.3B",
@@ -647,7 +711,8 @@
"size": 1.3,
"act_param": 1.3,
"open-data": "Partial",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-02-22"
},
"microsoft/Phi-3-medium-128k-instruct": {
"name": "Phi-3-Medium-128K-Instruct",
@@ -657,7 +722,8 @@
"size": 14,
"act_param": 14,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-05-21"
},
"microsoft/Phi-3-small-128k-instruct": {
"name": "Phi-3-Small-128K-Instruct",
@@ -667,7 +733,8 @@
"size": 7,
"act_param": 7,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-05-21"
},
"codestral-2405": {
"name": "Codestral-22B-v0.1",
@@ -677,7 +744,8 @@
"size": 22,
"act_param": 22,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-05-23"
},
"codestral-mamba-2407": {
"name": "Codestral-Mamba",
@@ -687,7 +755,8 @@
"size": 7,
"act_param": 7,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-07-16"
},
"mistralai/Mistral-7B-Instruct-v0.3": {
"name": "Mistral-7B-Instruct-v0.3",
@@ -697,7 +766,8 @@
"size": 7,
"act_param": 7,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-05-22"
},
"mistralai/Mistral-7B-v0.3": {
"name": "Mistral-7B-v0.3",
@@ -707,7 +777,8 @@
"size": 7,
"act_param": 7,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-05-22"
},
"CohereForAI/c4ai-command-r-plus": {
"name": "Command R+",
@@ -717,7 +788,8 @@
"size": 104,
"act_param": 104,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-04-04"
},
"deepseek-coder": {
"name": "DeepSeek-Coder-V2-Instruct",
@@ -727,7 +799,8 @@
"size": 236,
"act_param": 21,
"open-data": "None",
- "reasoning": True,
+ "prefill": False,
+ "date": "2024-06-17"
},
"deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct": {
"name": "DeepSeek-Coder-V2-Lite-Instruct",
@@ -737,7 +810,8 @@
"size": 16,
"act_param": 2.4,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-06-17"
},
"deepseek-ai/DeepSeek-Coder-V2-Lite-Base": {
"name": "DeepSeek-Coder-V2-Lite-Base",
@@ -747,7 +821,8 @@
"size": 16,
"act_param": 2.4,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-06-17"
},
"claude-3-5-sonnet-20240620": {
"name": "Claude-3.5-Sonnet-20240620",
@@ -757,7 +832,8 @@
"size": None,
"act_param": None,
"open-data": "None",
- "reasoning": True,
+ "prefill": False,
+ "date": "2024-06-20"
},
"NousResearch/Hermes-2-Theta-Llama-3-70B": {
"name": "Hermes-2-Theta-Llama-3-70B",
@@ -767,7 +843,8 @@
"size": 70,
"act_param": 70,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-06-24"
},
"microsoft/wavecoder-ultra-6.7b": {
"name": "WaveCoder-Ultra-6.7B",
@@ -777,7 +854,8 @@
"size": 6.7,
"act_param": 6.7,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2023-12-26"
},
"google/gemma-2-9b-it": {
"name": "Gemma-2-9B-Instruct",
@@ -787,7 +865,8 @@
"size": 9,
"act_param": 9,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-06-19"
},
"Bin12345/AutoCoder": {
"name": "AutoCoder",
@@ -797,7 +876,8 @@
"size": 33,
"act_param": 33,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-05-23"
},
"Bin12345/AutoCoder_S_6.7B": {
"name": "AutoCoder-S-6.7B",
@@ -807,7 +887,8 @@
"size": 6.7,
"act_param": 6.7,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-05-23"
},
"Bin12345/AutoCoder_QW_7B": {
"name": "AutoCoder-QW-7B",
@@ -817,7 +898,8 @@
"size": 7,
"act_param": 7,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-05-23"
},
"SenseLLM/ReflectionCoder-DS-33B": {
"name": "ReflectionCoder-DS-33B",
@@ -827,7 +909,8 @@
"size": 33,
"act_param": 33,
"open-data": "Partial",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-05-27"
},
"SenseLLM/ReflectionCoder-DS-6.7B": {
"name": "ReflectionCoder-DS-6.7B",
@@ -837,7 +920,8 @@
"size": 6.7,
"act_param": 6.7,
"open-data": "Partial",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-05-27"
},
"SenseLLM/ReflectionCoder-CL-34B": {
"name": "ReflectionCoder-CL-34B",
@@ -847,7 +931,8 @@
"size": 34,
"act_param": 34,
"open-data": "Partial",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-05-27"
},
"SenseLLM/ReflectionCoder-CL-7B": {
"name": "ReflectionCoder-CL-7B",
@@ -857,7 +942,8 @@
"size": 7,
"act_param": 7,
"open-data": "Partial",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-05-27"
},
"new-microsoft/Phi-3-mini-128k-instruct": {
"name": "Phi-3.1-Mini-128K-Instruct",
@@ -867,7 +953,8 @@
"size": 3.8,
"act_param": 3.8,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-05-21"
},
"old-microsoft/Phi-3-mini-128k-instruct": {
"name": "Phi-3-Mini-128K-Instruct",
@@ -877,7 +964,8 @@
"size": 3.8,
"act_param": 3.8,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-05-21"
},
"internlm/internlm2_5-7b-chat": {
"name": "InternLM2.5-7B-Chat",
@@ -887,7 +975,8 @@
"size": 7,
"act_param": 7,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-07-03"
},
"NousResearch/Hermes-2-Pro-Llama-3-70B": {
"name": "Hermes-2-Pro-Llama-3-70B",
@@ -897,7 +986,8 @@
"size": 70,
"act_param": 70,
"open-data": "Partial",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-06-27"
},
"new-deepseek-chat": {
"name": "DeepSeek-V2-Chat (2024-06-28)",
@@ -907,7 +997,8 @@
"size": 236,
"act_param": 21,
"open-data": "None",
- "reasoning": True,
+ "prefill": False,
+ "date": "2024-06-28"
},
"vllm-google/gemma-2-27b-it": {
"name": "Gemma-2-27B-Instruct",
@@ -917,7 +1008,8 @@
"size": 27,
"act_param": 27,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-06-19"
},
"Artigenz/Artigenz-Coder-DS-6.7B": {
"name": "Artigenz-Coder-DS-6.7B",
@@ -927,7 +1019,8 @@
"size": 6.7,
"act_param": 6.7,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-04-16"
},
"openchat/openchat-3.6-8b-20240522": {
"name": "OpenChat-3.6-8B-20240522",
@@ -937,7 +1030,8 @@
"size": 8,
"act_param": 8,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-05-22"
},
"Phind/Phind-CodeLlama-34B-v2": {
"name": "Phind-CodeLlama-34B-v2",
@@ -947,7 +1041,8 @@
"size": 34,
"act_param": 34,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2023-08-25"
},
"yi-large": {
"name": "Yi-Large",
@@ -957,7 +1052,8 @@
"size": None,
"act_param": None,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-05-13"
},
"THUDM/codegeex4-all-9b": {
"name": "CodeGeex4-All-9B",
@@ -967,7 +1063,8 @@
"size": 9,
"act_param": 9,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-07-05"
},
"gpt-4o-mini-2024-07-18": {
"name": "GPT-4o-mini-2024-07-18",
@@ -977,7 +1074,8 @@
"size": None,
"act_param": None,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-07-18"
},
"Nexusflow/Athene-70B": {
"name": "Athene-70B",
@@ -987,7 +1085,8 @@
"size": 70,
"act_param": 70,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-07-20"
},
"NTQAI/Nxcode-CQ-7B-orpo": {
"name": "Nxcode-CQ-7B-Orpo",
@@ -997,7 +1096,8 @@
"size": 7,
"act_param": 7,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-04-25"
},
"migtissera/Llama-3-70B-Synthia-v3.5": {
"name": "Llama-3-70B-Synthia-v3.5",
@@ -1007,7 +1107,8 @@
"size": 70,
"act_param": 70,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-05-27"
},
"migtissera/Tess-v2.5.2-Qwen2-72B": {
"name": "Tess-v2.5.2-Qwen2-72B",
@@ -1017,7 +1118,8 @@
"size": 72,
"act_param": 72,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-07-18"
},
"WhiteRabbitNeo/WhiteRabbitNeo-33B-v1.5": {
"name": "WhiteRabbitNeo-33B-v1.5",
@@ -1027,7 +1129,8 @@
"size": 33,
"act_param": 33,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-02-10"
},
"mistral-large-2407": {
"name": "Mistral-Large-Instruct-2407",
@@ -1037,7 +1140,8 @@
"size": 123,
"act_param": 123,
"open-data": "None",
- "reasoning": True,
+ "prefill": False,
+ "date": "2024-07-24"
},
"meta-llama/Meta-Llama-3.1-8B-Instruct": {
"name": "Llama-3.1-8B-Instruct",
@@ -1047,7 +1151,8 @@
"size": 8,
"act_param": 8,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-07-23"
},
"meta-llama/Meta-Llama-3.1-70B-Instruct": {
"name": "Llama-3.1-70B-Instruct",
@@ -1057,7 +1162,8 @@
"size": 70,
"act_param": 70,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-07-23"
},
"meta--llama-3.1-405b-instruct": {
"name": "Llama-3.1-405B-Instruct",
@@ -1067,7 +1173,8 @@
"size": 405,
"act_param": 405,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-07-23"
},
"deepseek-coder-20240724": {
"name": "DeepSeek-Coder-V2-Instruct (2024-07-24)",
@@ -1077,7 +1184,8 @@
"size": 236,
"act_param": 21,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-07-24"
},
"microsoft/Phi-3.5-mini-instruct": {
"name": "Phi-3.5-Mini-Instruct",
@@ -1087,7 +1195,8 @@
"size": 3.8,
"act_param": 3.8,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-04-23"
},
"nv-mistralai--mistral-nemo-12b-instruct": {
"name": "Mistral-Nemo-12B-Instruct",
@@ -1097,7 +1206,8 @@
"size": 12,
"act_param": 12,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-07-18"
},
"wyt2000/InverseCoder-CL-13B": {
"name": "InverseCoder-CL-13B",
@@ -1107,7 +1217,8 @@
"size": 13,
"act_param": 13,
"open-data": "Partial",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-07-08"
},
"wyt2000/InverseCoder-CL-7B": {
"name": "InverseCoder-CL-7B",
@@ -1117,7 +1228,8 @@
"size": 7,
"act_param": 7,
"open-data": "Partial",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-07-08"
},
"wyt2000/InverseCoder-DS-6.7B": {
"name": "InverseCoder-DS-6.7B",
@@ -1127,7 +1239,8 @@
"size": 6.7,
"act_param": 6.7,
"open-data": "Partial",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-07-08"
},
"gemini-1.5-pro-exp-0801": {
"name": "Gemini-1.5-Pro-Exp-0801",
@@ -1137,7 +1250,8 @@
"size": None,
"act_param": None,
"open-data": "None",
- "reasoning": True,
+ "prefill": False,
+ "date": "2024-08-01"
},
"gpt-4o-2024-08-06": {
"name": "GPT-4o-2024-08-06",
@@ -1147,7 +1261,8 @@
"size": None,
"act_param": None,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-08-06"
},
"abacusai/Dracarys-Llama-3.1-70B-Instruct": {
"name": "Dracarys-Llama-3.1-70B-Instruct",
@@ -1157,7 +1272,8 @@
"size": 70,
"act_param": 70,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-08-23"
},
"abacusai/Dracarys-72B-Instruct": {
"name": "Dracarys-72B-Instruct",
@@ -1167,7 +1283,8 @@
"size": 72,
"act_param": 72,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-08-23"
},
"gemini-1.5-pro-exp-0827": {
"name": "Gemini-1.5-Pro-Exp-0827",
@@ -1177,7 +1294,8 @@
"size": None,
"act_param": None,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-08-27"
},
"gemini-1.5-flash-exp-0827": {
"name": "Gemini-1.5-Flash-Exp-0827",
@@ -1187,7 +1305,8 @@
"size": None,
"act_param": None,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-08-27"
},
"microsoft/Phi-3.5-mini-instruct": {
"name": "Phi-3.5-Mini-Instruct",
@@ -1197,7 +1316,8 @@
"size": 3.8,
"act_param": 3.8,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-04-23"
},
"abacusai/Dracarys-Llama-3.1-70B-Instruct": {
"name": "Dracarys-Llama-3.1-70B-Instruct",
@@ -1207,7 +1327,8 @@
"size": 70,
"act_param": 70,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-04-23"
},
"abacusai/Dracarys-72B-Instruct": {
"name": "Dracarys-72B-Instruct",
@@ -1217,7 +1338,8 @@
"size": 72,
"act_param": 72,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-04-23"
},
"deepseek-coder-v2.5": {
"name": "DeepSeek-V2.5",
@@ -1227,7 +1349,8 @@
"size": 236,
"act_param": 21,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-09-18"
},
"CohereForAI/c4ai-command-r-08-2024": {
"name": "C4AI-Command-R-08-2024",
@@ -1237,7 +1360,8 @@
"size": 32.3,
"act_param": 32.3,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-08-30"
},
"CohereForAI/c4ai-command-r-plus-08-2024": {
"name": "C4AI-Command-R-Plus-08-2024",
@@ -1247,7 +1371,8 @@
"size": 104,
"act_param": 104,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-08-30"
},
"ayueei--yue-coder-9b-preview": {
"name": "Yi-Coder-9B-Chat",
@@ -1257,7 +1382,8 @@
"size": 9,
"act_param": 9,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-09-04"
},
# "mattshumer/ref_70_e3_prefill": {
# "name": "Reflection-Llama-3.1-70B",
@@ -1285,7 +1411,8 @@
"size": None,
"act_param": None,
"open-data": "None",
- "reasoning": True,
+ "prefill": False,
+ "date": "2024-09-12"
},
"o1-mini-2024-09-12": {
"name": "o1-Mini-2024-09-12 (temperature=1)",
@@ -1295,7 +1422,8 @@
"size": None,
"act_param": None,
"open-data": "None",
- "reasoning": True,
+ "prefill": False,
+ "date": "2024-09-12"
},
"Qwen/Qwen2.5-Coder-1.5B-Instruct": {
"name": "Qwen2.5-Coder-1.5B-Instruct",
@@ -1305,7 +1433,8 @@
"size": 1.5,
"act_param": 1.5,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-11-12"
},
"Qwen/Qwen2.5-Coder-7B-Instruct": {
"name": "Qwen2.5-Coder-7B-Instruct",
@@ -1315,7 +1444,8 @@
"size": 7,
"act_param": 7,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-11-12"
},
"gemini-1.5-pro-002": {
"name": "Gemini-1.5-Pro-002",
@@ -1325,7 +1455,8 @@
"size": None,
"act_param": None,
"open-data": "None",
- "reasoning": True,
+ "prefill": False,
+ "date": "2024-09-25"
},
"mistralai/Mistral-Small-Instruct-2409": {
"name": "Mistral-Small-Instruct-2409",
@@ -1335,7 +1466,8 @@
"size": 22.2,
"act_param": 22.2,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-09-18"
},
"Qwen/Qwen2.5-0.5B-Instruct": {
"name": "Qwen2.5-0.5B-Instruct",
@@ -1345,7 +1477,8 @@
"size": 0.5,
"act_param": 0.5,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-09-19"
},
"Qwen/Qwen2.5-1.5B-Instruct": {
"name": "Qwen2.5-1.5B-Instruct",
@@ -1355,7 +1488,8 @@
"size": 1.5,
"act_param": 1.5,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-09-19"
},
"Qwen/Qwen2.5-7B-Instruct": {
"name": "Qwen2.5-7B-Instruct",
@@ -1365,7 +1499,8 @@
"size": 7,
"act_param": 7,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-09-19"
},
"Qwen/Qwen2.5-14B-Instruct": {
"name": "Qwen2.5-14B-Instruct",
@@ -1375,7 +1510,8 @@
"size": 14,
"act_param": 14,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-09-19"
},
"Qwen/Qwen2.5-32B-Instruct": {
"name": "Qwen2.5-32B-Instruct",
@@ -1385,7 +1521,8 @@
"size": 32,
"act_param": 32,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-09-19"
},
"Qwen/Qwen2.5-72B-Instruct": {
"name": "Qwen2.5-72B-Instruct",
@@ -1395,7 +1532,8 @@
"size": 72,
"act_param": 72,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-09-19"
},
"meta-llama/Llama-3.2-1B-Instruct": {
"name": "Llama-3.2-1B-Instruct",
@@ -1405,7 +1543,8 @@
"size": 1,
"act_param": 1,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-09-25"
},
"meta-llama/Llama-3.2-3B-Instruct": {
"name": "Llama-3.2-3B-Instruct",
@@ -1415,7 +1554,8 @@
"size": 3,
"act_param": 3,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-09-25"
},
"nvidia/Llama-3.1-Nemotron-70B-Instruct-HF": {
"name": "Llama-3.1-Nemotron-70B-Instruct",
@@ -1425,7 +1565,8 @@
"size": 70,
"act_param": 70,
"open-data": "Partial",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-09-25"
},
"claude-3-5-sonnet-20241022": {
"name": "Claude-3.5-Sonnet-20241022",
@@ -1435,7 +1576,8 @@
"size": None,
"act_param": None,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-10-22"
},
"ibm-granite/granite-3.0-8b-instruct": {
"name": "Granite-3.0-8B-Instruct",
@@ -1445,7 +1587,8 @@
"size": 8,
"act_param": 8,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-10-21"
},
"ibm-granite/granite-3.0-2b-instruct": {
"name": "Granite-3.0-2B-Instruct",
@@ -1455,7 +1598,8 @@
"size": 2,
"act_param": 2,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-10-21"
},
"grok-beta--main": {
"name": "Grok-Beta",
@@ -1465,7 +1609,8 @@
"size": None,
"act_param": None,
"open-data": "None",
- "reasoning": True,
+ "prefill": False,
+ "date": "2024-03-17"
},
"claude-3-5-haiku-20241022--main": {
"name": "Claude-3.5-Haiku-20241022",
@@ -1475,7 +1620,8 @@
"size": None,
"act_param": None,
"open-data": "None",
- "reasoning": True,
+ "prefill": False,
+ "date": "2024-10-22"
},
"Qwen/Qwen2.5-Coder-14B-Instruct--main": {
"name": "Qwen2.5-Coder-14B-Instruct",
@@ -1485,7 +1631,8 @@
"size": 14,
"act_param": 14,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-09-19"
},
"Qwen/Qwen2.5-Coder-32B-Instruct--main": {
"name": "Qwen2.5-Coder-32B-Instruct",
@@ -1495,7 +1642,8 @@
"size": 32,
"act_param": 32,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-09-19"
},
"infly/OpenCoder-1.5B-Instruct--main": {
"name": "OpenCoder-1.5B-Instruct",
@@ -1505,7 +1653,8 @@
"size": 1.5,
"act_param": 1.5,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-11-09"
},
"infly/OpenCoder-8B-Instruct--main": {
"name": "OpenCoder-8B-Instruct",
@@ -1515,7 +1664,8 @@
"size": 8,
"act_param": 8,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-11-09"
},
"microsoft/Phi-3.5-mini-instruct--main": {
"name": "Phi-3.5-Mini-Instruct",
@@ -1525,7 +1675,8 @@
"size": 3.8,
"act_param": 3.8,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-08-21"
},
"Nexusflow/Athene-V2-Agent--main": {
"name": "Athene-V2-Agent",
@@ -1535,7 +1686,8 @@
"size": 72,
"act_param": 72,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-11-14"
},
"Nexusflow/Athene-V2-Chat--main": {
"name": "Athene-V2-Chat",
@@ -1545,7 +1697,8 @@
"size": 72,
"act_param": 72,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-11-14"
},
"gemini-exp-1114--main": {
"name": "Gemini-Exp-1114",
@@ -1555,7 +1708,8 @@
"size": None,
"act_param": None,
"open-data": "None",
- "reasoning": True,
+ "prefill": False,
+ "date": "2024-11-14"
},
"gpt-4o-2024-11-20--main": {
"name": "GPT-4o-2024-11-20",
@@ -1565,7 +1719,8 @@
"size": None,
"act_param": None,
"open-data": "None",
- "reasoning": True,
+ "prefill": False,
+ "date": "2024-11-20"
},
"gemini-exp-1121--main": {
"name": "Gemini-Exp-1121",
@@ -1575,7 +1730,8 @@
"size": None,
"act_param": None,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-11-21"
},
"gemini-exp-1206--main": {
"name": "Gemini-Exp-1206",
@@ -1585,7 +1741,8 @@
"size": None,
"act_param": None,
"open-data": "None",
- "reasoning": True,
+ "prefill": False,
+ "date": "2024-12-06"
},
"meta-llama--Llama-3.3-70B-Instruct--main": {
"name": "Llama-3.3-70B-Instruct",
@@ -1595,7 +1752,8 @@
"size": 70,
"act_param": 70,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-12-19"
},
"deepseek-ai--DeepSeek-V2.5-1210--main": {
"name": "DeepSeek-V2.5-1210",
@@ -1605,7 +1763,8 @@
"size": 236,
"act_param": 21,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-12-10"
},
"gemini-2.0-flash-exp--main": {
"name": "Gemini-2.0-Flash-Exp",
@@ -1615,7 +1774,8 @@
"size": None,
"act_param": None,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-12-11"
},
"gemini-2.0-flash-thinking-exp-1219--main": {
"name": "Gemini-2.0-Flash-Thinking-Exp-1219",
@@ -1625,7 +1785,8 @@
"size": None,
"act_param": None,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-12-19"
},
"gemini-2.0-flash-thinking-exp-01-21--main": {
"name": "Gemini-2.0-Flash-Thinking-Exp-01-21",
@@ -1635,7 +1796,8 @@
"size": None,
"act_param": None,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2025-01-21"
},
"o1-2024-12-17--main": {
"name": "o1-2024-12-17 (temperature=1, reasoning=medium)",
@@ -1645,7 +1807,8 @@
"size": None,
"act_param": None,
"open-data": "None",
- "reasoning": True,
+ "prefill": False,
+ "date": "2024-12-17"
},
"o1-2024-12-17--low--main": {
"name": "o1-2024-12-17 (temperature=1, reasoning=low)",
@@ -1655,7 +1818,8 @@
"size": None,
"act_param": None,
"open-data": "None",
- "reasoning": True,
+ "prefill": False,
+ "date": "2024-12-17"
},
"o1-2024-12-17--high--main": {
"name": "o1-2024-12-17 (temperature=1, reasoning=high)",
@@ -1665,17 +1829,19 @@
"size": None,
"act_param": None,
"open-data": "None",
- "reasoning": True,
+ "prefill": False,
+ "date": "2024-12-17"
},
"deepseek-v3-chat--main": {
- "name": "DeepSeek-V3-Chat",
- "link": "https://huggingface.co/deepseek-ai/DeepSeek-V3-Chat",
+ "name": "DeepSeek-V3",
+ "link": "https://huggingface.co/deepseek-ai/DeepSeek-V3",
"prompted": True,
"moe": True,
"size": 671,
"act_param": 37,
"open-data": "None",
- "reasoning": True,
+ "prefill": False,
+ "date": "2024-12-26"
},
"microsoft--phi-4--main": {
"name": "Phi-4",
@@ -1685,7 +1851,8 @@
"size": 14.7,
"act_param": 14.7,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-12-13"
},
"deepseek-reasoner--main": {
"name": "DeepSeek-R1",
@@ -1695,7 +1862,8 @@
"size": 671,
"act_param": 37,
"open-data": "None",
- "reasoning": True,
+ "prefill": False,
+ "date": "2025-01-20"
},
"deepseek-ai/DeepSeek-R1-Distill-Llama-70B--main": {
"name": "DeepSeek-R1-Distill-Llama-70B",
@@ -1705,7 +1873,8 @@
"size": 70,
"act_param": 70,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2025-01-20"
},
"deepseek-ai/DeepSeek-R1-Distill-Qwen-32B--main": {
"name": "DeepSeek-R1-Distill-Qwen-32B",
@@ -1715,7 +1884,8 @@
"size": 32,
"act_param": 32,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2025-01-20"
},
"deepseek-ai/DeepSeek-R1-Distill-Qwen-14B--main": {
"name": "DeepSeek-R1-Distill-Qwen-14B",
@@ -1725,27 +1895,30 @@
"size": 14,
"act_param": 14,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2025-01-20"
},
"deepseek-ai/DeepSeek-R1-Distill-Llama-8B--main": {
"name": "DeepSeek-R1-Distill-Llama-8B",
"link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
"prompted": True,
"moe": False,
- "size": 14,
- "act_param": 14,
+ "size": 8,
+ "act_param": 8,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2025-01-20"
},
"deepseek-ai/DeepSeek-R1-Distill-Qwen-7B--main": {
"name": "DeepSeek-R1-Distill-Qwen-7B",
"link": "https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
"prompted": True,
"moe": False,
- "size": 14,
- "act_param": 14,
+ "size": 7,
+ "act_param": 7,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2025-01-20"
},
"deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B--main": {
"name": "DeepSeek-R1-Distill-Qwen-1.5B",
@@ -1755,7 +1928,8 @@
"size": 1.5,
"act_param": 1.5,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2025-01-20"
},
"mistralai/Mistral-Small-24B-Instruct-2501--main": {
"name": "Mistral-Small-24B-Instruct-2501",
@@ -1765,7 +1939,8 @@
"size": 24,
"act_param": 24,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2025-01-31"
},
"o3-mini-2025-01-31--medium--main": {
"name": "o3-mini-2025-01-31 (temperature=1, reasoning=medium)",
@@ -1775,7 +1950,8 @@
"size": None,
"act_param": None,
"open-data": "None",
- "reasoning": True,
+ "prefill": False,
+ "date": "2025-01-31"
},
"o3-mini-2025-01-31--low--main": {
"name": "o3-mini-2025-01-31 (temperature=1, reasoning=low)",
@@ -1785,7 +1961,8 @@
"size": None,
"act_param": None,
"open-data": "None",
- "reasoning": True,
+ "prefill": False,
+ "date": "2025-01-31"
},
"o3-mini-2025-01-31--high--main": {
"name": "o3-mini-2025-01-31 (temperature=1, reasoning=high)",
@@ -1795,7 +1972,8 @@
"size": None,
"act_param": None,
"open-data": "None",
- "reasoning": True,
+ "prefill": False,
+ "date": "2025-01-31"
},
"gemini-2.0-flash-001--main": {
"name": "Gemini-2.0-Flash-001",
@@ -1805,7 +1983,8 @@
"size": None,
"act_param": None,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2025-02-05"
},
"gemini-2.0-flash-exp--main": {
"name": "Gemini-2.0-Flash-Exp",
@@ -1815,7 +1994,8 @@
"size": None,
"act_param": None,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2025-02-05"
},
"gemini-2.0-flash-lite-preview-02-05--main": {
"name": "Gemini-2.0-Flash-Lite-Preview-02-05",
@@ -1825,7 +2005,8 @@
"size": None,
"act_param": None,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2025-02-05"
},
"gemini-2.0-pro-exp-02-05--main": {
"name": "Gemini-2.0-Pro-Exp-02-05",
@@ -1835,7 +2016,8 @@
"size": None,
"act_param": None,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2025-02-05"
},
"NovaSky-AI--Sky-T1-32B-Flash--main": {
"name": "Sky-T1-32B-Flash",
@@ -1845,7 +2027,8 @@
"size": 32,
"act_param": 32,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2025-01-12"
},
"NovaSky-AI--Sky-T1-32B-Preview--main": {
"name": "Sky-T1-32B-Preview",
@@ -1855,7 +2038,8 @@
"size": 32,
"act_param": 32,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2025-01-12"
},
"Qwen--QwQ-32B-Preview--main": {
"name": "QwQ-32B-Preview",
@@ -1865,6 +2049,205 @@
"size": 32,
"act_param": 32,
"open-data": "None",
- "reasoning": False,
+ "prefill": True,
+ "date": "2024-11-28"
+ },
+ "claude-3-7-sonnet-20250219--main": {
+ "name": "Claude-3-Haiku-20240307",
+ "link": "https://www.anthropic.com/news/claude-3-family",
+ "prompted": True,
+ "moe": False,
+ "size": None,
+ "act_param": None,
+ "open-data": "None",
+ "prefill": False,
+ "date": "2025-02-19"
+ },
+ "chatgpt-4o-latest--main": {
+ "name": "ChatGPT-4o-latest-20250129",
+ "link": "https://chat.openai.com/",
+ "open-data": "None",
+ "prompted": True,
+ "moe": False,
+ "size": None,
+ "act_param": None,
+ "prefill": False,
+ "date": "2025-01-29"
+ },
+ "Kwaipilot--KwaiCoder-23B-A4B-v1--main": {
+ "name": "KwaiCoder-23B-A4B-v1",
+ "link": "https://huggingface.co/Kwaipilot/KwaiCoder-23B-A4B-v1",
+ "open-data": "None",
+ "prompted": False,
+ "moe": True,
+ "size": 23,
+ "act_param": 4,
+ "prefill": True,
+ "date": "2025-01-25"
+ },
+ "qwen-max-latest--main": {
+ "name": "Qwen2.5-Max",
+ "link": "https://qwenlm.github.io/blog/qwen2.5-max/",
+ "open-data": "None",
+ "prompted": True,
+ "moe": True,
+ "size": None,
+ "act_param": None,
+ "prefill": False,
+ "date": "2025-01-28"
+ },
+ "claude-3-7-sonnet-20250219--3200-output-128k-2025-02-19--main": {
+ "name": "Claude-3.7-Sonnet-20250219 (temperature=1, length=12800, reasoning=3200)",
+ "link": "https://www.anthropic.com/news/claude-3-7-sonnet",
+ "prompted": True,
+ "moe": False,
+ "size": None,
+ "act_param": None,
+ "open-data": "None",
+ "prefill": False,
+ "date": "2025-02-19"
+ },
+ "claude-3-7-sonnet-20250219--main": {
+ "name": "Claude-3.7-Sonnet-20250219",
+ "link": "https://www.anthropic.com/news/claude-3-7-sonnet",
+ "prompted": True,
+ "moe": False,
+ "size": None,
+ "act_param": None,
+ "open-data": "None",
+ "prefill": False,
+ "date": "2025-02-19"
+ },
+ "WarriorCoder-6.7B--main": {
+ "name": "WarriorCoder-6.7B (Reproduced)",
+ "link": "https://arxiv.org/abs/2412.17395",
+ "open-data": "None",
+ "prompted": True,
+ "moe": False,
+ "size": 6.7,
+ "act_param": 6.7,
+ "open-data": "None",
+ "prefill": True,
+ "date": "2025-02-18"
+ },
+ "google--gemma-3-27b-it--main": {
+ "name": "Gemma-3-27B-Instruct",
+ "link": "https://huggingface.co/google/gemma-3-27b-it",
+ "open-data": "None",
+ "prompted": True,
+ "moe": False,
+ "size": 27,
+ "act_param": 27,
+ "open-data": "None",
+ "prefill": True,
+ "date": "2025-03-12"
+ },
+ "Qwen--QwQ-32B--skip_prefill--main": {
+ "name": "QwQ-32B (w/ Reasoning)",
+ "link": "https://huggingface.co/Qwen/QwQ-32B",
+ "open-data": "None",
+ "prompted": True,
+ "moe": False,
+ "size": 32,
+ "act_param": 32,
+ "open-data": "None",
+ "prefill": False,
+ "date": "2025-03-06"
+ },
+ "deepseek-chat-0324--main": {
+ "name": "DeepSeek-V3-0324",
+ "link": "https://huggingface.co/deepseek-ai/DeepSeek-V3-0324",
+ "open-data": "None",
+ "prompted": True,
+ "moe": True,
+ "size": 671,
+ "act_param": 37,
+ "open-data": "None",
+ "prefill": False,
+ "date": "2025-03-24"
+ },
+ "gemini-2.5-pro-exp-03-25--main": {
+ "name": "Gemini-2.5-Pro-Exp-03-25",
+ "link": "https://blog.google/technology/google-deepmind/gemini-model-thinking-updates-march-2025/",
+ "open-data": "None",
+ "prompted": True,
+ "moe": False,
+ "size": None,
+ "act_param": 37,
+ "open-data": "None",
+ "prefill": False,
+ "date": "2025-03-25"
+ },
+ "meta/llama-4-scout-17b-16e-instruct--main": {
+ "name": "Llama-4-Scout",
+ "link": "https://huggingface.co/meta-llama/Llama-4-Scout-17B-16E-Instruct",
+ "open-data": "None",
+ "prompted": True,
+ "moe": True,
+ "size": 109,
+ "act_param": 17,
+ "open-data": "None",
+ "prefill": False,
+ "date": "2025-04-05"
+ },
+ "meta/llama-4-maverick-17b-128e-instruct--main": {
+ "name": "Llama-4-Maverick",
+ "link": "https://huggingface.co/meta-llama/Llama-4-Maverick-17B-128E-Instruct",
+ "open-data": "None",
+ "prompted": True,
+ "moe": True,
+ "size": 109,
+ "act_param": 17,
+ "open-data": "None",
+ "prefill": False,
+ "date": "2025-04-05"
},
-}
+ "agentica-org/DeepCoder-14B-Preview--main": {
+ "name": "DeepCoder-14B-Preview",
+ "link": "https://huggingface.co/agentica-org/DeepCoder-14B-Preview",
+ "open-data": "None",
+ "prompted": True,
+ "moe": True,
+ "size": 14,
+ "act_param": 14,
+ "open-data": "None",
+ "prefill": True,
+ "date": "2025-04-09"
+ },
+ "openrouter/quasar-alpha--main": {
+ "name": "Quasar-Alpha",
+ "link": "https://openrouter.ai/openrouter/quasar-alpha",
+ "open-data": "None",
+ "prompted": True,
+ "moe": True,
+ "size": None,
+ "act_param": None,
+ "open-data": "None",
+ "prefill": False,
+ "date": "2025-04-02"
+ },
+ "agentica-org/DeepCoder-14B-Preview--skip_prefill--main": {
+ "name": "DeepCoder-14B-Preview (w/ Reasoning, 64k tokens, temperature=0.6)",
+ "link": "https://huggingface.co/agentica-org/DeepCoder-14B-Preview",
+ "open-data": "None",
+ "prompted": True,
+ "moe": False,
+ "size": 14,
+ "act_param": 14,
+ "open-data": "None",
+ "prefill": False,
+ "date": "2025-04-09"
+ },
+ "openrouter/optimus-alpha--main": {
+ "name": "Optimus-Alpha",
+ "link": "https://openrouter.ai/openrouter/optimus-alpha",
+ "open-data": "None",
+ "prompted": True,
+ "moe": True,
+ "size": None,
+ "act_param": None,
+ "open-data": "None",
+ "prefill": False,
+ "date": "2025-04-10"
+ }
+}
\ No newline at end of file
diff --git a/bigcodebench/data/bigcodebench.py b/bigcodebench/data/bigcodebench.py
index 26090f1..87cfdf6 100644
--- a/bigcodebench/data/bigcodebench.py
+++ b/bigcodebench/data/bigcodebench.py
@@ -14,7 +14,7 @@
BIGCODEBENCH_OVERRIDE_PATH = os.environ.get("BIGCODEBENCH_OVERRIDE_PATH", None)
BIGCODEBENCH_HF = "bigcode/bigcodebench"
-BIGCODEBENCH_VERSION = "v0.1.3"
+BIGCODEBENCH_VERSION = "v0.1.4"
def _ready_bigcodebench_path(subset="full", version="default") -> str:
if BIGCODEBENCH_OVERRIDE_PATH:
diff --git a/bigcodebench/evaluate.py b/bigcodebench/evaluate.py
index 9e1fd45..5122a89 100644
--- a/bigcodebench/evaluate.py
+++ b/bigcodebench/evaluate.py
@@ -189,12 +189,19 @@ def evaluate(
# run the evaluation
print(f"Command run in sandbox {e2b_endpoint}")
- sandbox.commands.run("bigcodebench.evaluate --execution 'local' "
- f"--split {split} --subset {subset} --samples {samples} "
- f"--pass_k {pass_k} --save_pass_rate {save_pass_rate} --calibrated {calibrated} "
- f"--parallel {parallel} --selective_evaluate {selective_evaluate} --min_time_limit {min_time_limit} "
+ command = "bigcodebench.evaluate --execution 'local' "\
+ f"--split {split} --subset {subset} --samples {samples} "\
+ f"--pass_k {pass_k} --save_pass_rate {save_pass_rate} --calibrated {calibrated} "\
+ f"--parallel {parallel} --selective_evaluate {selective_evaluate} --min_time_limit {min_time_limit} "\
f"--max_as_limit {max_as_limit} --max_data_limit {max_data_limit} --max_stack_limit {max_stack_limit} "
- f"--check_gt_only {check_gt_only} --no_gt {no_gt}", on_stderr=lambda x: print(x), on_stdout=lambda x: print(x), timeout=60*50)
+
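+    # Boolean flags are appended only when set, so the sandboxed command never carries literal False values.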
+    if check_gt_only:
+        command += "--check_gt_only "
+    if no_gt:
+        command += "--no_gt "
+    if no_execute:
+        command += "--no_execute "
+ sandbox.commands.run(command, on_stdout=lambda x: print(x), on_stderr=lambda x: print(x), timeout=60*60)
if not check_gt_only:
# download the results
@@ -205,9 +212,17 @@ def evaluate(
else:
pass_at_k = dict()
- passk = list(pass_k)
+
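+        # Normalize pass_k into a list of ints: accepts "1,5,10", a single int, or a list/tuple.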
+ if isinstance(pass_k, str):
+ passk = [int(k) for k in pass_k.split(",") if k.strip()]
+ elif isinstance(pass_k, int):
+ passk = [pass_k]
+ elif isinstance(pass_k, (list, tuple)):
+ passk = list(pass_k)
+ else:
+ raise ValueError(f"Invalid type for pass_k: {type(pass_k)}")
- if isinstance(selective_evaluate, str):
+ if selective_evaluate and isinstance(selective_evaluate, str):
selected_ids = set(selective_evaluate.split(","))
else:
try:
@@ -311,14 +326,13 @@ def evaluate(
assert len(completion_id) == len(problems), f"Missing problems in samples. Expected {len(problems)} problems, got {len(completion_id)}"
def stucking_checker():
- while remainings:
- last_size = len(remainings)
- time.sleep(240)
- if last_size != len(remainings) or len(remainings) == 0:
- continue
- # Potential stucking
- warn("No samples had finished testing in the last 240s")
- warn(f"{len(remainings)} samples to be tested: {remainings}")
+ not_done = futures
+ while len(not_done) > 0:
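+            # concurrent.futures.wait returns (done, not_done); with FIRST_COMPLETED it
+            # unblocks as soon as any future finishes, or after the 240s timeout with done empty.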
+ done, not_done = wait(not_done, timeout=240, return_when=FIRST_COMPLETED)
+
+ if len(done) == 0:
+ warn("No samples have finished testing in the last 240s")
+ warn(f"{len(remainings)} samples to be tested: {remainings}")
threading.Thread(target=stucking_checker).start()
diff --git a/bigcodebench/gen/util/anthropic_request.py b/bigcodebench/gen/util/anthropic_request.py
index e53feab..f6d18fd 100644
--- a/bigcodebench/gen/util/anthropic_request.py
+++ b/bigcodebench/gen/util/anthropic_request.py
@@ -16,7 +16,19 @@ def make_auto_request(client: anthropic.Client, *args, **kwargs) -> Message:
try:
signal.signal(signal.SIGALRM, handler)
signal.alarm(100)
- ret = client.messages.create(*args, **kwargs)
+ if "reasoning_budget" in kwargs and "reasoning_beta" in kwargs:
+ kwargs["thinking"] = {
+ "type": "enabled",
+ "budget_tokens": kwargs["reasoning_budget"],
+ }
+ kwargs["betas"] = [kwargs["reasoning_beta"]]
+ kwargs.pop("reasoning_budget")
+ kwargs.pop("reasoning_beta")
+ kwargs.pop("temperature")
+ if "thinking" in kwargs:
+ ret = client.beta.messages.create(*args, **kwargs, stream=True)
+ else:
+ ret = client.messages.create(*args, **kwargs)
signal.alarm(0)
except anthropic.RateLimitError:
print("Rate limit exceeded. Waiting...")
diff --git a/bigcodebench/gen/util/google_request.py b/bigcodebench/gen/util/google_request.py
index 9e13607..5a76362 100644
--- a/bigcodebench/gen/util/google_request.py
+++ b/bigcodebench/gen/util/google_request.py
@@ -1,11 +1,12 @@
import time
-import google.generativeai as genai
+from google import genai
from google.api_core.exceptions import GoogleAPICallError, ResourceExhausted
def make_request(
- client: genai.GenerativeModel,
+ model: str,
+ client: genai.Client,
message: str,
temperature: float,
n: int,
@@ -13,21 +14,34 @@ def make_request(
) -> genai.types.GenerateContentResponse:
kwargs = {"temperature": temperature, "max_output_tokens": max_new_tokens}
- if "-thinking-" in client.model_name:
+ if "-thinking-" in model:
kwargs.pop("max_output_tokens")
-
- response = client.generate_content(
- [{"role": "user", "parts": [message]}],
- generation_config=genai.types.GenerationConfig(
+
+ response = client.models.generate_content(
+ model=model,
+ contents=message,
+ config=genai.types.GenerateContentConfig(
candidate_count=n,
+ safety_settings=[
+ genai.types.SafetySetting(
+ category='HARM_CATEGORY_DANGEROUS_CONTENT',
+ threshold='BLOCK_NONE'
+ ),
+ genai.types.SafetySetting(
+ category='HARM_CATEGORY_SEXUALLY_EXPLICIT',
+ threshold='BLOCK_NONE'
+ ),
+ genai.types.SafetySetting(
+ category='HARM_CATEGORY_HATE_SPEECH',
+ threshold='BLOCK_NONE'
+ ),
+ genai.types.SafetySetting(
+ category='HARM_CATEGORY_HARASSMENT',
+ threshold='BLOCK_NONE'
+ ),
+ ],
**kwargs
- ),
- safety_settings=[
- {"category": "HARM_CATEGORY_DANGEROUS_CONTENT", "threshold": "BLOCK_NONE"},
- {"category": "HARM_CATEGORY_SEXUALLY_EXPLICIT", "threshold": "BLOCK_NONE"},
- {"category": "HARM_CATEGORY_HATE_SPEECH", "threshold": "BLOCK_NONE"},
- {"category": "HARM_CATEGORY_HARASSMENT", "threshold": "BLOCK_NONE"},
- ],
+ ),
)
return response
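A standalone sketch of the migrated `google-genai` call, mirroring the hunk above (model name and prompt are placeholders):

```python
from google import genai

client = genai.Client()  # reads GOOGLE_API_KEY from the environment

response = client.models.generate_content(
    model="gemini-2.0-flash",  # placeholder model name
    contents="Write a binary search in Python.",
    config=genai.types.GenerateContentConfig(
        candidate_count=1,
        temperature=0.0,
        max_output_tokens=1280,
    ),
)
print(response.text)
```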
diff --git a/bigcodebench/gen/util/hf_inference_request.py b/bigcodebench/gen/util/hf_inference_request.py
new file mode 100644
index 0000000..fe4aaf3
--- /dev/null
+++ b/bigcodebench/gen/util/hf_inference_request.py
@@ -0,0 +1,34 @@
+import time
+
+from huggingface_hub import InferenceClient
+from huggingface_hub.inference._generated.types import TextGenerationOutput
+
+
+def make_request(
+ client: InferenceClient,
+ message: str,
+ model: str,
+ temperature: float,
+ n: int,
+ max_new_tokens: int = 2048,
+) -> TextGenerationOutput:
+    # Greedy decoding through the serverless endpoint; `temperature` and `n`
+    # are accepted for signature parity with the other providers but unused here.
+    response = client.text_generation(
+        model=model,
+        prompt=message,
+        do_sample=False,
+        max_new_tokens=max_new_tokens,
+    )
+
+ return response
+
+
+def make_auto_request(*args, **kwargs) -> TextGenerationOutput:
+ ret = None
+ while ret is None:
+ try:
+ ret = make_request(*args, **kwargs)
+ except Exception as e:
+ print("Unknown error. Waiting...")
+ print(e)
+ time.sleep(1)
+ return ret
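A usage sketch for the new helper (the model id and prompt are illustrative):

```python
from huggingface_hub import InferenceClient
from bigcodebench.gen.util.hf_inference_request import make_auto_request

client = InferenceClient(provider="hf-inference", api_key="hf_...")
out = make_auto_request(
    client,
    message="def fib(n):",
    model="bigcode/starcoder2-15b",  # illustrative model id
    temperature=0.0,
    n=1,
)
print(out)
```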
diff --git a/bigcodebench/gen/util/openai_request.py b/bigcodebench/gen/util/openai_request.py
index f8db3f5..3c8b741 100644
--- a/bigcodebench/gen/util/openai_request.py
+++ b/bigcodebench/gen/util/openai_request.py
@@ -17,7 +17,7 @@ def make_request(
kwargs["top_p"] = 0.95
kwargs["max_completion_tokens"] = max_tokens
kwargs["temperature"] = temperature
- if model.startswith("o1-") or model.startswith("o3-") or model.endswith("-reasoner"): # pop top-p and max_completion_tokens
+    if any(model.startswith(m) or model.endswith(m) for m in ["o1-", "o3-", "reasoner", "grok-3-mini-beta"]): # reasoning models reject top_p, max_completion_tokens, and temperature
kwargs.pop("top_p")
kwargs.pop("max_completion_tokens")
kwargs.pop("temperature")
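The broadened check strips all sampling controls for any matching model id; a condensed, self-contained sketch of the effect:

```python
def build_kwargs(model: str, temperature: float, max_tokens: int) -> dict:
    kwargs = {"top_p": 0.95, "max_completion_tokens": max_tokens, "temperature": temperature}
    # Reasoning-style models reject explicit sampling controls, so drop them.
    reasoning_markers = ["o1-", "o3-", "reasoner", "grok-3-mini-beta"]
    if any(model.startswith(m) or model.endswith(m) for m in reasoning_markers):
        for key in ("top_p", "max_completion_tokens", "temperature"):
            kwargs.pop(key, None)
    return kwargs

assert build_kwargs("o3-mini", 0.0, 1280) == {}
assert "top_p" in build_kwargs("gpt-4o", 0.0, 1280)
```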
diff --git a/bigcodebench/generate.py b/bigcodebench/generate.py
index bcf1463..adbf892 100644
--- a/bigcodebench/generate.py
+++ b/bigcodebench/generate.py
@@ -127,12 +127,19 @@ def run_codegen(
split: str,
subset: str,
root: str = "bcb_results",
+ lora_path: str = None,
bs: Optional[int] = None,
n_samples: int = 1,
temperature: float = 0.0,
max_new_tokens: int = 1280,
+ # vllm
+ max_model_len: int = 12800,
greedy: bool = False,
+ # openai
reasoning_effort: str = "medium",
+ # anthropic
+ reasoning_budget: int = 0,
+ reasoning_beta: str = "output-128k-2025-02-19",
strip_newlines: bool = False,
direct_completion: bool = False,
resume: bool = True,
@@ -170,9 +177,13 @@ def run_codegen(
backend=backend,
subset=subset,
split=split,
+ lora_path=lora_path,
temperature=temperature,
max_new_tokens=max_new_tokens,
+ max_model_len=max_model_len,
reasoning_effort=reasoning_effort,
+ reasoning_budget=reasoning_budget,
+ reasoning_beta=reasoning_beta,
instruction_prefix=instruction_prefix,
response_prefix=response_prefix,
prefill=not skip_prefill,
@@ -186,9 +197,15 @@ def run_codegen(
)
extra = "-" + subset if subset != "full" else ""
- if reasoning_effort and model.startswith("o1-") or model.startswith("o3-") or model.endswith("-reasoner"):
+ if backend == "openai" and reasoning_effort and any(model.startswith(m) or model.endswith(m) for m in ["o1-", "o3-", "reasoner", "grok-3-mini-beta"]):
model = model + f"--{reasoning_effort}"
-
+
+ if lora_path:
+ model = model + f"--lora-{lora_path}"
+
+ if backend == "anthropic" and reasoning_budget and reasoning_beta:
+ model = model + f"--{reasoning_budget}-{reasoning_beta}"
+
if skip_prefill:
identifier = model.replace("/", "--") + "--skip_prefill" + f"--{revision}--bigcodebench{extra}-{split}--{backend}-{temperature}-{n_samples}-sanitized_calibrated.jsonl"
else:
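The suffixing rules above compose into the output identifier; a sketch with illustrative values:

```python
# Illustrative values only; mirrors the suffixing rules in the hunk above.
model = "claude-3-7-sonnet-20250219"
backend = "anthropic"
lora_path = None
reasoning_budget, reasoning_beta = 8192, "output-128k-2025-02-19"

if lora_path:
    model += f"--lora-{lora_path}"
if backend == "anthropic" and reasoning_budget and reasoning_beta:
    model += f"--{reasoning_budget}-{reasoning_beta}"

print(model)  # claude-3-7-sonnet-20250219--8192-output-128k-2025-02-19
```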
diff --git a/bigcodebench/provider/__init__.py b/bigcodebench/provider/__init__.py
index dbadfd4..4cb3410 100644
--- a/bigcodebench/provider/__init__.py
+++ b/bigcodebench/provider/__init__.py
@@ -6,11 +6,16 @@ def make_model(
backend: str,
subset: str,
split: str,
+ lora_path: str = None,
dataset: str = "bigcodebench",
temperature: float = 0.0,
max_new_tokens: int = 1280,
- # o1 and o3 only
+ max_model_len: int = 12800,
+ # openai only
reasoning_effort: str = "medium",
+ # anthropic only
+ reasoning_budget: int = 0,
+ reasoning_beta: str = "output-128k-2025-02-19",
# instruction model only
instruction_prefix: str = None,
response_prefix: str = None,
@@ -35,8 +40,10 @@ def make_model(
name=model,
subset=subset,
split=split,
+ lora_path=lora_path,
temperature=temperature,
max_new_tokens=max_new_tokens,
+ max_model_len=max_model_len,
revision=revision,
dataset=dataset,
direct_completion=direct_completion,
@@ -55,6 +62,7 @@ def make_model(
name=model,
subset=subset,
split=split,
+ lora_path=lora_path,
temperature=temperature,
max_new_tokens=max_new_tokens,
revision=revision,
@@ -68,6 +76,19 @@ def make_model(
tokenizer_name=tokenizer_name,
tokenizer_legacy=tokenizer_legacy,
)
+ elif backend == "hf-inference":
+ from bigcodebench.provider.hf_inference import HuggingFaceInferenceDecoder
+
+ return HuggingFaceInferenceDecoder(
+ name=model,
+ subset=subset,
+ split=split,
+ temperature=temperature,
+ max_new_tokens=max_new_tokens,
+ direct_completion=direct_completion,
+ instruction_prefix=instruction_prefix,
+ response_prefix=response_prefix,
+ )
elif backend == "openai":
from bigcodebench.provider.openai import OpenAIChatDecoder
@@ -105,6 +126,8 @@ def make_model(
split=split,
temperature=temperature,
max_new_tokens=max_new_tokens,
+ reasoning_budget=reasoning_budget,
+ reasoning_beta=reasoning_beta,
instruction_prefix=instruction_prefix,
response_prefix=response_prefix,
)
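With the new branch in place, selecting the serverless backend looks like this; a sketch in which the model id and prefixes are illustrative, the first positional argument is assumed to be the model name (it sits just above this hunk), and the remaining arguments take their defaults:

```python
from bigcodebench.provider import make_model

decoder = make_model(
    "bigcode/starcoder2-15b",  # model, first positional argument
    backend="hf-inference",
    subset="full",
    split="complete",
    temperature=0.0,
    max_new_tokens=1280,
    instruction_prefix="Please provide a self-contained Python solution.",
    response_prefix="Here is the Python solution:",
)
outputs = decoder.codegen(["def fib(n):"], do_sample=False, num_samples=1)
```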
diff --git a/bigcodebench/provider/anthropic.py b/bigcodebench/provider/anthropic.py
index 1969e0c..b4a7e43 100644
--- a/bigcodebench/provider/anthropic.py
+++ b/bigcodebench/provider/anthropic.py
@@ -9,9 +9,11 @@
from bigcodebench.provider.utility import make_raw_chat_prompt
class AnthropicDecoder(DecoderBase):
- def __init__(self, name: str, **kwargs) -> None:
+ def __init__(self, name: str, reasoning_budget: int = 0, reasoning_beta: str = "output-128k-2025-02-19", **kwargs) -> None:
super().__init__(name, **kwargs)
self.client = anthropic.Anthropic(api_key=os.getenv("ANTHROPIC_KEY"))
+ self.reasoning_budget = reasoning_budget
+ self.reasoning_beta = reasoning_beta
def codegen(
self, prompts: List[str], do_sample: bool = True, num_samples: int = 200
@@ -43,8 +45,20 @@ def codegen(
max_tokens=self.max_new_tokens,
temperature=self.temperature,
stop_sequences=self.eos,
+ reasoning_budget=self.reasoning_budget,
+ reasoning_beta=self.reasoning_beta,
)
- outputs.append(ret.content[0].text)
+ if isinstance(ret, anthropic.Stream):
+ output = ""
+ for chunk in ret:
+ if chunk.type == "content_block_delta":
+ # if chunk.delta.type == "thinking_delta":
+ # output += chunk.delta.thinking
+ if chunk.delta.type == "text_delta":
+ output += chunk.delta.text
+ outputs.append(output)
+ else:
+ outputs.append(ret.content[0].text)
all_outputs.append(outputs)
return all_outputs
diff --git a/bigcodebench/provider/google.py b/bigcodebench/provider/google.py
index 2194c47..e3b18ff 100644
--- a/bigcodebench/provider/google.py
+++ b/bigcodebench/provider/google.py
@@ -2,7 +2,7 @@
from typing import List
from tqdm import tqdm
-import google.generativeai as genai
+from google import genai
from bigcodebench.provider.base import DecoderBase
from bigcodebench.gen.util.google_request import make_auto_request
@@ -12,8 +12,8 @@
class GoogleDecoder(DecoderBase):
def __init__(self, name: str, **kwargs):
super().__init__(name, **kwargs)
- genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
- self.client = genai.GenerativeModel(name)
+ self.model = name
+ self.client = genai.Client(api_key=os.getenv("GOOGLE_API_KEY"))
def codegen(
self, prompts: List[str], do_sample: bool = True, num_samples: int = 200
@@ -34,7 +34,8 @@ def codegen(
tokenizer=None,
)
ret = make_auto_request(
- self.client,
+ model=self.model,
+ client=self.client,
message=message,
n=num_samples,
temperature=self.temperature,
diff --git a/bigcodebench/provider/hf.py b/bigcodebench/provider/hf.py
index a85957d..fbe50e5 100644
--- a/bigcodebench/provider/hf.py
+++ b/bigcodebench/provider/hf.py
@@ -41,7 +41,8 @@ def __init__(
if self.is_direct_completion(): # no chat template
self.eos += extra_eos_for_direct_completion(dataset)
else: # with chat template
- self.eos += ["\n```\n"]
+ if self.prefill and "```" in self.response_prefix:
+ self.eos += ["\n```\n"]
print(f"{self.eos = }")
self.model = AutoModelForCausalLM.from_pretrained(name, **kwargs)
diff --git a/bigcodebench/provider/hf_inference.py b/bigcodebench/provider/hf_inference.py
new file mode 100644
index 0000000..1737448
--- /dev/null
+++ b/bigcodebench/provider/hf_inference.py
@@ -0,0 +1,54 @@
+import os
+from typing import List
+from tqdm import tqdm
+
+from huggingface_hub import InferenceClient
+
+from bigcodebench.provider.base import DecoderBase
+from bigcodebench.gen.util.hf_inference_request import make_auto_request
+from bigcodebench.provider.utility import make_raw_chat_prompt
+
+
+class HuggingFaceInferenceDecoder(DecoderBase):
+ def __init__(self, name: str, **kwargs):
+ super().__init__(name, **kwargs)
+ self.client = InferenceClient(
+ provider="hf-inference", api_key=os.getenv("HF_INFERENCE_API_KEY")
+ )
+
+ def codegen(
+ self, prompts: List[str], do_sample: bool = True, num_samples: int = 200
+ ) -> List[str]:
+ if do_sample:
+ assert self.temperature > 0, "Temperature must be positive for sampling"
+
+ all_outputs = []
+
+ for prompt in tqdm(prompts):
+ outputs = []
+ message = (
+ prompt
+ if self.is_direct_completion()
+ else make_raw_chat_prompt(
+ task_prompt=prompt,
+ subset=self.subset,
+ split=self.split,
+ instruction_prefix=self.instruction_prefix,
+ response_prefix=self.response_prefix,
+ tokenizer=None,
+ )
+ )
+ ret = make_auto_request(
+ self.client,
+ message=message,
+ model=self.name,
+ n=num_samples,
+ temperature=self.temperature,
+ max_new_tokens=self.max_new_tokens,
+ )
+ outputs.append(ret)
+ all_outputs.append(outputs)
+ return all_outputs
+
+ def is_direct_completion(self) -> bool:
+ return self.direct_completion
diff --git a/bigcodebench/provider/openai.py b/bigcodebench/provider/openai.py
index 12790f6..ff1459f 100644
--- a/bigcodebench/provider/openai.py
+++ b/bigcodebench/provider/openai.py
@@ -28,7 +28,7 @@ def codegen(
tokenizer=None,
) for prompt in prompts]
-        # use concurrency based batching for o1 and deepseek models
+        # use concurrency-based batching for reasoning models
-        if self.name.startswith("o1-") or self.name.startswith("o3-") or self.name.startswith("deepseek"):
+        if any(self.name.startswith(m) or self.name.endswith(m) for m in ["o1-", "o3-", "reasoner", "grok-3-mini-beta"]):
return self._codegen_batch_via_concurrency(messages, num_samples)
return self._codegen_api_batch(messages, num_samples)
diff --git a/bigcodebench/provider/vllm.py b/bigcodebench/provider/vllm.py
index 5ce67ab..41cd251 100644
--- a/bigcodebench/provider/vllm.py
+++ b/bigcodebench/provider/vllm.py
@@ -3,6 +3,8 @@
from transformers import AutoTokenizer
from vllm import LLM, SamplingParams
+from vllm.lora.request import LoRARequest
+from huggingface_hub import snapshot_download
from bigcodebench.provider.base import DecoderBase
from bigcodebench.provider.utility import (
@@ -11,7 +13,7 @@
)
class VllmDecoder(DecoderBase):
- def __init__(self, name: str, dataset: str, tp: int, **kwargs) -> None:
+ def __init__(self, name: str, lora_path: str, dataset: str, tp: int, max_model_len: int, **kwargs) -> None:
super().__init__(name, **kwargs)
kwargs = {
@@ -27,8 +29,19 @@ def __init__(self, name: str, dataset: str, tp: int, **kwargs) -> None:
if self.is_direct_completion():
self.eos += extra_eos_for_direct_completion(dataset)
else:
- self.eos += ["\n```\n"]
- self.llm = LLM(model=name, max_model_len=self.max_new_tokens, **kwargs)
+ if self.prefill and "```" in self.response_prefix:
+ self.eos += ["\n```\n"]
+
+ self.lora_request = None
+ if lora_path:
+ local_lora_path = snapshot_download(lora_path)
+ self.lora_request = LoRARequest(
+ "lora",
+ 1,
+ local_lora_path,
+ )
+
+        self.llm = LLM(model=name, max_model_len=max_model_len, enable_lora=self.lora_request is not None, **kwargs)
self.llm.set_tokenizer(tokenizer=self.tokenizer)
def is_direct_completion(self) -> bool:
@@ -63,6 +76,7 @@ def codegen(
stop=self.eos,
skip_special_tokens=self.skip_special_tokens,
),
+ lora_request=self.lora_request,
use_tqdm=True,
)
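A condensed sketch of the LoRA wiring this hunk adds (the model and adapter repo ids are placeholders):

```python
from huggingface_hub import snapshot_download
from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest

# Download the adapter locally, then hand vLLM a LoRARequest at generate time.
local_lora_path = snapshot_download("user/some-lora-adapter")  # placeholder repo id
lora_request = LoRARequest("lora", 1, local_lora_path)

llm = LLM(model="bigcode/starcoder2-15b", max_model_len=12800, enable_lora=True)
outputs = llm.generate(
    ["def fib(n):"],
    SamplingParams(temperature=0.0, max_tokens=256),
    lora_request=lora_request,
)
```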
diff --git a/run.sh b/run.sh
index 6242abd..8bfcdd7 100755
--- a/run.sh
+++ b/run.sh
@@ -10,5 +10,4 @@ bigcodebench.evaluate \
--model $MODEL \
--split $SPLIT \
--subset $SUBSET \
- --backend $BACKEND \
- --check_gt_only
\ No newline at end of file
+ --backend $BACKEND
\ No newline at end of file
diff --git a/sandbox-templates/e2b.Dockerfile b/sandbox-templates/e2b.Dockerfile
index c6ba2ca..a667880 100644
--- a/sandbox-templates/e2b.Dockerfile
+++ b/sandbox-templates/e2b.Dockerfile
@@ -27,6 +27,7 @@ RUN adduser --disabled-password --gecos "" bigcodebenchuser
RUN rm -rf /bigcodebench
+# No-op layer used to manually bust the Docker build cache
+RUN echo 1
# Acquire benchmark code to local
ADD "https://api.github.com/repos/bigcode-project/bigcodebench/commits?per_page=1" latest_commit
RUN git clone https://github.com/bigcode-project/bigcodebench.git /bigcodebench
diff --git a/setup.cfg b/setup.cfg
index cc20139..ea71dc0 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -35,10 +35,10 @@ install_requires =
rich
accelerate>=0.30.1
anthropic>=0.26.1
- google-generativeai>=0.5.4
+ google-genai
mistralai>=0.2.0,<1.0.0
openai>=1.11.1
- e2b
+ e2b<=1.11.1
[options.entry_points]
console_scripts =
diff --git a/tools/fix_v023.py b/tools/fix_v023.py
new file mode 100644
index 0000000..22b1559
--- /dev/null
+++ b/tools/fix_v023.py
@@ -0,0 +1,91 @@
+from datasets import load_dataset, Dataset, DatasetDict
+from huggingface_hub import HfApi
+
+import json
+import copy
+
+BIGCODEBENCH_HF = "bigcode/bigcodebench"
+BIGCODEBENCH_HARD_HF = "bigcode/bigcodebench-hard"
+BIGCODEBENCH_VERSION = "v0.1.3"
+BIGCODEBENCH_UPDATE = "bigcode/bcb_update"
+BIGCODEBENCH_NEW_VERSION = "v0.1.4"
+
+def map_ds(sample):
+ if sample["task_id"] in ["BigCodeBench/211"]:
+ sample['test'] = sample['test'].replace(
+"""
+ mock_response = MagicMock()
+ mock_response.content = MOCK_CONTENT
+""",
+"""
+ mock_response = MagicMock()
+ mock_response.content = MOCK_CONTENT
+ mock_response.status_code = 200
+"""
+ )
+ if sample["task_id"] in ["BigCodeBench/215"]:
+ sample['test'] = sample['test'].replace(
+"""
+ mock_response = Mock()
+""",
+"""
+ mock_response = Mock()
+ mock_response.status_code = 200
+"""
+ )
+ sample['test'] = sample['test'].replace(
+"""
+ mock_response.text =""",
+"""
+ MOCK_TEXT ="""
+ )
+ sample['test'] = sample['test'].replace(
+"""
+ mock_get.return_value = mock_response
+""",
+"""
+ mock_response.text = MOCK_TEXT
+ mock_response.json = lambda: json.loads(MOCK_TEXT)
+ mock_get.return_value = mock_response
+"""
+ )
+ sample['complete_prompt'] = sample['complete_prompt'].replace("Thif function will raise", "This function will raise")
+ sample['instruct_prompt'] = sample['instruct_prompt'].replace("Thif function will raise", "This function will raise")
+ sample['doc_struct'] = sample['doc_struct'].replace("Thif function will raise", "This function will raise")
+ return sample
+
+if __name__ == "__main__":
+ api = HfApi()
+ ds_dict = load_dataset(BIGCODEBENCH_HF)
+ hard_ds_dict = load_dataset(BIGCODEBENCH_HARD_HF)
+ ds = ds_dict[BIGCODEBENCH_VERSION]
+ hard_ds = hard_ds_dict[BIGCODEBENCH_VERSION]
+ function_id = [211, 215]
+
+ new_ds = ds.map(map_ds)
+ new_ds.to_json("BigCodeBench.jsonl")
+ ds_dict[BIGCODEBENCH_NEW_VERSION] = new_ds
+ ds_dict.push_to_hub(BIGCODEBENCH_HF)
+
+ new_hard_ds = hard_ds.map(map_ds)
+ new_hard_ds.to_json("BigCodeBench-Hard.jsonl")
+ hard_ds_dict[BIGCODEBENCH_NEW_VERSION] = new_hard_ds
+ hard_ds_dict.push_to_hub(BIGCODEBENCH_HARD_HF)
+
+ for i in function_id:
+ old_sample = ds.select([i])
+ new_sample = new_ds.select([i])
+ old_sample.to_json("old.jsonl")
+ new_sample.to_json("new.jsonl")
+ api.upload_file(
+ path_or_fileobj="old.jsonl",
+ path_in_repo=f"{i}/old.jsonl",
+ repo_id=BIGCODEBENCH_UPDATE,
+ # repo_type="dataset"
+ )
+ api.upload_file(
+ path_or_fileobj="new.jsonl",
+ path_in_repo=f"{i}/new.jsonl",
+ repo_id=BIGCODEBENCH_UPDATE,
+ # repo_type="dataset"
+ )
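A quick sanity check for the patched rows, using the script's own assumption that row index matches the task number (`ds` and `map_ds` as defined above):

```python
# Assumes `ds` and `map_ds` from the script above are in scope.
for i in [211, 215]:
    patched = map_ds(dict(ds[i]))
    assert patched["test"] != ds[i]["test"], f"row {i} was not modified"
```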
diff --git a/tools/fix_v025.py b/tools/fix_v025.py
new file mode 100644
index 0000000..edbeb71
--- /dev/null
+++ b/tools/fix_v025.py
@@ -0,0 +1,135 @@
+from datasets import load_dataset
+from huggingface_hub import HfApi
+
+BIGCODEBENCH_HF = "bigcode/bigcodebench"
+BIGCODEBENCH_HARD_HF = "bigcode/bigcodebench-hard"
+BIGCODEBENCH_VERSION = "v0.1.4"
+BIGCODEBENCH_UPDATE = "bigcode/bcb_update"
+BIGCODEBENCH_NEW_VERSION = "v0.1.5"
+
+def map_ds(sample):
+ if sample["task_id"] in ["BigCodeBench/332"]:
+ sample['code_prompt'] = "import nltk\nnltk.download('stopwords')\n" + sample['code_prompt']
+ sample['complete_prompt'] = "import nltk\nnltk.download('stopwords')\n" + sample['complete_prompt']
+ sample['instruct_prompt'] = sample['instruct_prompt'].replace(
+ "\nYou should write self-contained code starting with:\n```\n",
+ "\nYou should write self-contained code starting with:\n```\nimport nltk\nnltk.download('stopwords')\n"
+ )
+
+ if sample["task_id"] in ["BigCodeBench/334"]:
+ sample['code_prompt'] = "import nltk\nnltk.download('punkt')\n" + sample['code_prompt']
+ sample['complete_prompt'] = "import nltk\nnltk.download('punkt')\n" + sample['complete_prompt']
+ sample['instruct_prompt'] = sample['instruct_prompt'].replace(
+ "\nYou should write self-contained code starting with:\n```\n",
+ "\nYou should write self-contained code starting with:\n```\nimport nltk\nnltk.download('punkt')\n"
+ )
+
+ if sample["task_id"] in ["BigCodeBench/376"]:
+ sample['code_prompt'] = sample['code_prompt'].replace(
+ "import nltk\n",
+ "import nltk\nnltk.download('stopwords')\n",
+ 1
+ )
+ sample['complete_prompt'] = sample['complete_prompt'].replace(
+ "import nltk\n",
+ "import nltk\nnltk.download('stopwords')\n",
+ 1
+ )
+ sample['instruct_prompt'] = sample['instruct_prompt'].replace(
+ "\nYou should write self-contained code starting with:\n```\nimport nltk\n",
+ "\nYou should write self-contained code starting with:\n```\nimport nltk\nnltk.download('stopwords')\n"
+ )
+
+ if sample["task_id"] in ["BigCodeBench/383"]:
+ sample['code_prompt'] = "import nltk\nnltk.download('punkt')\n" + sample['code_prompt']
+ sample['complete_prompt'] = "import nltk\nnltk.download('punkt')\n" + sample['complete_prompt']
+ sample['instruct_prompt'] = sample['instruct_prompt'].replace(
+ "\nYou should write self-contained code starting with:\n```\n",
+ "\nYou should write self-contained code starting with:\n```\nimport nltk\nnltk.download('punkt')\n"
+ )
+
+ if sample["task_id"] in ["BigCodeBench/633"]:
+ sample['code_prompt'] = "import nltk\nnltk.download('stopwords')\n" + sample['code_prompt']
+ sample['complete_prompt'] = "import nltk\nnltk.download('stopwords')\n" + sample['complete_prompt']
+ sample['instruct_prompt'] = sample['instruct_prompt'].replace(
+ "\nYou should write self-contained code starting with:\n```\n",
+ "\nYou should write self-contained code starting with:\n```\nimport nltk\nnltk.download('stopwords')\n"
+ )
+
+ if sample["task_id"] in ["BigCodeBench/635"]:
+ sample['code_prompt'] = sample['code_prompt'].replace(
+ "# Importing the required libraries",
+ "# Importing the required libraries\nimport nltk\nnltk.download('stopwords')\n"
+ )
+
+ sample['complete_prompt'] = sample['complete_prompt'].replace(
+ "# Importing the required libraries",
+ "# Importing the required libraries\nimport nltk\nnltk.download('stopwords')\n"
+ )
+
+ sample['instruct_prompt'] = sample['instruct_prompt'].replace(
+ "\nYou should write self-contained code starting with:\n```\n",
+ "\nYou should write self-contained code starting with:\n```\nimport nltk\nnltk.download('stopwords')\n"
+ )
+
+ if sample["task_id"] in ["BigCodeBench/849"]:
+ sample['code_prompt'] = "import nltk\nnltk.download('stopwords')\n" + sample['code_prompt']
+ sample['complete_prompt'] = "import nltk\nnltk.download('stopwords')\n" + sample['complete_prompt']
+ sample['instruct_prompt'] = sample['instruct_prompt'].replace(
+ "\nYou should write self-contained code starting with:\n```\n",
+ "\nYou should write self-contained code starting with:\n```\nimport nltk\nnltk.download('stopwords')\n"
+ )
+
+ if sample["task_id"] in ["BigCodeBench/940"]:
+ sample['code_prompt'] = "import nltk\nnltk.download('punkt')\n" + sample['code_prompt']
+ sample['complete_prompt'] = "import nltk\nnltk.download('punkt')\n" + sample['complete_prompt']
+ sample['instruct_prompt'] = sample['instruct_prompt'].replace(
+ "\nYou should write self-contained code starting with:\n```\n",
+ "\nYou should write self-contained code starting with:\n```\nimport nltk\nnltk.download('punkt')\n"
+ )
+
+ if sample["task_id"] in ["BigCodeBench/1109"]:
+ sample['code_prompt'] = "import nltk\nnltk.download('punkt')\n" + sample['code_prompt']
+ sample['complete_prompt'] = "import nltk\nnltk.download('punkt')\n" + sample['complete_prompt']
+ sample['instruct_prompt'] = sample['instruct_prompt'].replace(
+ "\nYou should write self-contained code starting with:\n```\n",
+ "\nYou should write self-contained code starting with:\n```\nimport nltk\nnltk.download('punkt')\n"
+ )
+
+ return sample
+
+if __name__ == "__main__":
+ api = HfApi()
+ ds_dict = load_dataset(BIGCODEBENCH_HF)
+ hard_ds_dict = load_dataset(BIGCODEBENCH_HARD_HF)
+ ds = ds_dict[BIGCODEBENCH_VERSION]
+ hard_ds = hard_ds_dict[BIGCODEBENCH_VERSION]
+ function_id = [332, 334, 376, 383, 633, 635, 849, 940, 1109]
+
+ new_ds = ds.map(map_ds)
+ new_ds.to_json("BigCodeBench.jsonl")
+ ds_dict[BIGCODEBENCH_NEW_VERSION] = new_ds
+ ds_dict.push_to_hub(BIGCODEBENCH_HF)
+
+ new_hard_ds = hard_ds.map(map_ds)
+ new_hard_ds.to_json("BigCodeBench-Hard.jsonl")
+ hard_ds_dict[BIGCODEBENCH_NEW_VERSION] = new_hard_ds
+ hard_ds_dict.push_to_hub(BIGCODEBENCH_HARD_HF)
+
+ for i in function_id:
+ old_sample = ds.select([i])
+ new_sample = new_ds.select([i])
+ old_sample.to_json("old.jsonl")
+ new_sample.to_json("new.jsonl")
+ api.upload_file(
+ path_or_fileobj="old.jsonl",
+ path_in_repo=f"{i}/old.jsonl",
+ repo_id=BIGCODEBENCH_UPDATE,
+ # repo_type="dataset"
+ )
+ api.upload_file(
+ path_or_fileobj="new.jsonl",
+ path_in_repo=f"{i}/new.jsonl",
+ repo_id=BIGCODEBENCH_UPDATE,
+ # repo_type="dataset"
+ )
\ No newline at end of file