diff --git a/lcb_runner/evaluation/compute_code_generation_metrics.py b/lcb_runner/evaluation/compute_code_generation_metrics.py
index b8de33e..cb2ebfc 100644
--- a/lcb_runner/evaluation/compute_code_generation_metrics.py
+++ b/lcb_runner/evaluation/compute_code_generation_metrics.py
@@ -42,6 +42,10 @@ def check_correctness(sample, generation, timeout, debug=True):
     p.join(
         timeout=(timeout + 1) * len(json.loads(sample["input_output"])["inputs"]) + 5
     )
+    try:
+        os.kill(p.pid, 9)  # Best-effort force kill of the child by PID (signal 9) after the join timeout
+    except (OSError, TypeError):
+        pass  # OSError: the kill call failed (e.g. process already gone); TypeError: p.pid is None
     if p.is_alive():
         p.kill()
     if not result:
diff --git a/lcb_runner/evaluation/testing_util.py b/lcb_runner/evaluation/testing_util.py
index ecc8947..2719f2d 100644
--- a/lcb_runner/evaluation/testing_util.py
+++ b/lcb_runner/evaluation/testing_util.py
@@ -434,7 +434,7 @@ def run_test(sample, test=None, debug=False, timeout=6):
 
     # Disable functionalities that can make destructive changes to the test.
     # max memory is set to 4GB
-    reliability_guard()
+    reliability_guard(4 * 1024 * 1024 * 1024)
 
     if debug:
         print(f"start = {datetime.now().time()}")