diff --git a/lcb_runner/evaluation/compute_code_generation_metrics.py b/lcb_runner/evaluation/compute_code_generation_metrics.py
index b8de33e..cb2ebfc 100644
--- a/lcb_runner/evaluation/compute_code_generation_metrics.py
+++ b/lcb_runner/evaluation/compute_code_generation_metrics.py
@@ -42,6 +42,13 @@ def check_correctness(sample, generation, timeout, debug=True):
     p.join(
         timeout=(timeout + 1) * len(json.loads(sample["input_output"])["inputs"]) + 5
     )
+    # Best-effort hard kill of the worker by PID (signal 9, SIGKILL on POSIX)
+    # in case it outlived the join() timeout above. os.kill raises an OSError
+    # subclass when the process is already gone; that case is safe to ignore.
+    try:
+        os.kill(p.pid, 9)
+    except OSError:
+        pass
     if p.is_alive():
         p.kill()
     if not result:
diff --git a/lcb_runner/evaluation/testing_util.py b/lcb_runner/evaluation/testing_util.py
index ecc8947..2719f2d 100644
--- a/lcb_runner/evaluation/testing_util.py
+++ b/lcb_runner/evaluation/testing_util.py
@@ -434,7 +434,7 @@ def run_test(sample, test=None, debug=False, timeout=6):
 
     # Disable functionalities that can make destructive changes to the test.
     # max memory is set to 4GB
-    reliability_guard()
+    reliability_guard(4 * 1024 * 1024 * 1024)
 
     if debug:
         print(f"start = {datetime.now().time()}")