Spaces:

fair-forward
/

languagebench

Running

App Files Files Community

davidpomerenke committed on 13 days ago

Commit

c2afc16

verified ·

1 Parent(s): bec2f46

Upload from GitHub Actions: fast-fail on account-level API errors; refuse to ship runs with >80% errors

Browse files

Today's workflow run hit a 403 'Key limit exceeded' from OpenRouter on
the very first request and would have kept burning through the entire
combinatorial matrix tagging every row with status=error. We caught it
manually and cancelled before save() fired, but the fragility is real:
any future account-level outage (auth, credit cap, key revoke) would
flood results-detailed with bogus errors and then auto_blocklist would
mass-exclude the auto-discovered models from the *next* run, even after
the underlying issue is fixed. This commit adds two layers of protection:

Layer 1 - propagate account-level failures (evals/models.py + evals/tasks.py):
- Add FatalAPIError. Raised by complete() when the OpenRouter error
  message matches markers like "key limit exceeded", "insufficient
  credits", "unauthorized", "payment required", "invalid api key".
- query() in evals/tasks.py re-raises FatalAPIError instead of swallowing
  it into status=error. asyncio.gather then surfaces it, the eval
  loop aborts, and save() is never reached.

Layer 2 - save-time error-rate gate (evals/main.py):
- Catches infra problems that aren't account-level (rate-limit
  storms, regional outages, dependency failure). If >80% of NEW
  rows in this run have status != "ok", raise before save() instead
  of pushing the polluted snapshot to HF. Forces a human to look
  at the logs and re-run after fixing.

Files changed (3) hide show

evals/main.py +17 -1
evals/models.py +28 -0
evals/tasks.py +4 -1

evals/main.py CHANGED Viewed

@@ -77,7 +77,23 @@ async def evaluate():
     ]
     results = [r for batch in batch_results for result in batch for r in result]
     results = pd.DataFrame(results) if results else pd.DataFrame(columns=["task", "model", "bcp_47", "metric", "sentence_nr", "score", "origin"])
     # Merge with cached results (immutable log, prefer latest results on conflict)
     all_results = pd.concat([old_results, results]).drop_duplicates(
         subset=["task", "model", "bcp_47", "metric", "sentence_nr"],

     ]
     results = [r for batch in batch_results for result in batch for r in result]
     results = pd.DataFrame(results) if results else pd.DataFrame(columns=["task", "model", "bcp_47", "metric", "sentence_nr", "score", "origin"])
+    # Defense-in-depth: if a huge fraction of NEW rows are errors, an
+    # infrastructure problem (rate limits, regional ban, dependency outage)
+    # likely degraded the run rather than the models actually being broken.
+    # Refuse to push the polluted snapshot so auto_blocklist doesn't kick in
+    # against innocent models on the next cycle. FatalAPIError already covers
+    # auth/credit failures upstream — this is the catch-all for everything else.
+    if not results.empty and "status" in results.columns:
+        new_error_rate = (results["status"] != "ok").mean()
+        if new_error_rate > 0.8:
+            raise RuntimeError(
+                f"Run produced {new_error_rate:.1%} errors in {len(results)} "
+                f"new rows (threshold 80%). Likely an infra failure; refusing "
+                f"to push to HF. Investigate the logs above and re-run when "
+                f"resolved."
+            )
     # Merge with cached results (immutable log, prefer latest results on conflict)
     all_results = pd.concat([old_results, results]).drop_duplicates(
         subset=["task", "model", "bcp_47", "metric", "sentence_nr"],

evals/models.py CHANGED Viewed

@@ -292,6 +292,26 @@ huggingface_rate_limit = AsyncLimiter(max_rate=5, time_period=1)
 google_rate_limit = AsyncLimiter(max_rate=10, time_period=1)
 @cache
 async def complete(**kwargs) -> str | None:
     async with openrouter_rate_limit:
@@ -301,6 +321,14 @@ async def complete(**kwargs) -> str | None:
             if "filtered" in e.message:
                 return None
             raise e
     if not response.choices:
         raise Exception(response)
     return response.choices[0].message.content.strip()

 google_rate_limit = AsyncLimiter(max_rate=10, time_period=1)
+class FatalAPIError(RuntimeError):
+    """Account-level failure (auth, key-limit, payment). Abort the whole run.
+    These errors apply to every subsequent call regardless of model/prompt, so
+    continuing the eval just floods results-detailed with bogus errors and
+    poisons the auto-blocklist. Raised by complete(); re-raised (not swallowed)
+    by query() in tasks.py so it propagates out of the eval loop.
+    """
+_FATAL_ERROR_MARKERS = (
+    "key limit exceeded",
+    "insufficient credits",
+    "insufficient_quota",
+    "invalid api key",
+    "unauthorized",
+    "payment required",
+)
 @cache
 async def complete(**kwargs) -> str | None:
     async with openrouter_rate_limit:
             if "filtered" in e.message:
                 return None
             raise e
+        except Exception as e:
+            msg = str(e).lower()
+            if any(marker in msg for marker in _FATAL_ERROR_MARKERS):
+                raise FatalAPIError(
+                    f"OpenRouter account-level failure: {e}. "
+                    "Aborting run before results-detailed is polluted."
+                ) from e
+            raise
     if not response.choices:
         raise Exception(response)
     return response.choices[0].message.content.strip()

evals/tasks.py CHANGED Viewed

@@ -13,7 +13,7 @@ from datasets_.truthfulqa import load_truthfulqa
 from google.cloud import translate_v2 as translate
 from langcodes import closest_supported_match
 from languages import languages, script_name
-from models import complete, translate_google
 bleu = evaluate.load("bleu")
 chrf = evaluate.load("chrf")
@@ -48,6 +48,9 @@ async def query(model, prompt):
                 ),
             ),
         )
     except Exception as e:
         print(f"exception for model {model}: {e}")
         return None

 from google.cloud import translate_v2 as translate
 from langcodes import closest_supported_match
 from languages import languages, script_name
+from models import FatalAPIError, complete, translate_google
 bleu = evaluate.load("bleu")
 chrf = evaluate.load("chrf")
                 ),
             ),
         )
+    except FatalAPIError:
+        # Account-level: re-raise so the eval loop aborts cleanly before save.
+        raise
     except Exception as e:
         print(f"exception for model {model}: {e}")
         return None