Spaces:
Running
Upload from GitHub Actions: fast-fail on account-level API errors; refuse to ship runs with >80% errors
Browse files
Today's workflow run hit a 403 'Key limit exceeded' from OpenRouter on
the very first request and would have kept burning through the entire
combinatorial matrix tagging every row with status=error. We caught it
manually and cancelled before save() fired, but the fragility is real:
any future account-level outage (auth, credit cap, key revoke) would
flood results-detailed with bogus errors and then auto_blocklist would
mass-exclude the auto-discovered models from the *next* run, even after
the underlying issue is fixed. This commit adds two layers of protection.
Layer 1 - propagate account-level failures (evals/models.py + tasks.py):
- Add FatalAPIError. Raised by complete() when the lower-cased OpenRouter
error message contains a marker such as "key limit exceeded",
"insufficient credits", "insufficient_quota", "unauthorized",
"payment required", or "invalid api key".
- query() in tasks.py re-raises FatalAPIError instead of swallowing
it into status=error. asyncio.gather then surfaces it, the eval
loop aborts, and save() is never reached.
Layer 2 - save-time error-rate gate (evals/main.py):
- Catches infra problems that aren't account-level (rate-limit
storms, regional outages, dependency failures). If >80% of the NEW
rows in this run have status != "ok", raise a RuntimeError before
save() instead of pushing the polluted snapshot to HF. The check is
skipped when the run produced no new rows or the rows have no
"status" column. This forces a human to look at the logs and re-run
after fixing.
- evals/main.py +17 -1
- evals/models.py +28 -0
- evals/tasks.py +4 -1
|
@@ -77,7 +77,23 @@ async def evaluate():
|
|
| 77 |
]
|
| 78 |
results = [r for batch in batch_results for result in batch for r in result]
|
| 79 |
results = pd.DataFrame(results) if results else pd.DataFrame(columns=["task", "model", "bcp_47", "metric", "sentence_nr", "score", "origin"])
|
| 80 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
# Merge with cached results (immutable log, prefer latest results on conflict)
|
| 82 |
all_results = pd.concat([old_results, results]).drop_duplicates(
|
| 83 |
subset=["task", "model", "bcp_47", "metric", "sentence_nr"],
|
|
|
|
| 77 |
]
|
| 78 |
results = [r for batch in batch_results for result in batch for r in result]
|
| 79 |
results = pd.DataFrame(results) if results else pd.DataFrame(columns=["task", "model", "bcp_47", "metric", "sentence_nr", "score", "origin"])
|
| 80 |
+
|
| 81 |
+
# Defense-in-depth: if a huge fraction of NEW rows are errors, an
|
| 82 |
+
# infrastructure problem (rate limits, regional ban, dependency outage)
|
| 83 |
+
# likely degraded the run rather than the models actually being broken.
|
| 84 |
+
# Refuse to push the polluted snapshot so auto_blocklist doesn't kick in
|
| 85 |
+
# against innocent models on the next cycle. FatalAPIError already covers
|
| 86 |
+
# auth/credit failures upstream — this is the catch-all for everything else.
|
| 87 |
+
if not results.empty and "status" in results.columns:
|
| 88 |
+
new_error_rate = (results["status"] != "ok").mean()
|
| 89 |
+
if new_error_rate > 0.8:
|
| 90 |
+
raise RuntimeError(
|
| 91 |
+
f"Run produced {new_error_rate:.1%} errors in {len(results)} "
|
| 92 |
+
f"new rows (threshold 80%). Likely an infra failure; refusing "
|
| 93 |
+
f"to push to HF. Investigate the logs above and re-run when "
|
| 94 |
+
f"resolved."
|
| 95 |
+
)
|
| 96 |
+
|
| 97 |
# Merge with cached results (immutable log, prefer latest results on conflict)
|
| 98 |
all_results = pd.concat([old_results, results]).drop_duplicates(
|
| 99 |
subset=["task", "model", "bcp_47", "metric", "sentence_nr"],
|
|
@@ -292,6 +292,26 @@ huggingface_rate_limit = AsyncLimiter(max_rate=5, time_period=1)
|
|
| 292 |
google_rate_limit = AsyncLimiter(max_rate=10, time_period=1)
|
| 293 |
|
| 294 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 295 |
@cache
|
| 296 |
async def complete(**kwargs) -> str | None:
|
| 297 |
async with openrouter_rate_limit:
|
|
@@ -301,6 +321,14 @@ async def complete(**kwargs) -> str | None:
|
|
| 301 |
if "filtered" in e.message:
|
| 302 |
return None
|
| 303 |
raise e
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 304 |
if not response.choices:
|
| 305 |
raise Exception(response)
|
| 306 |
return response.choices[0].message.content.strip()
|
|
|
|
| 292 |
google_rate_limit = AsyncLimiter(max_rate=10, time_period=1)
|
| 293 |
|
| 294 |
|
| 295 |
+
class FatalAPIError(RuntimeError):
|
| 296 |
+
"""Account-level failure (auth, key-limit, payment). Abort the whole run.
|
| 297 |
+
|
| 298 |
+
These errors apply to every subsequent call regardless of model/prompt, so
|
| 299 |
+
continuing the eval just floods results-detailed with bogus errors and
|
| 300 |
+
poisons the auto-blocklist. Raised by complete(); re-raised (not swallowed)
|
| 301 |
+
by query() in tasks.py so it propagates out of the eval loop.
|
| 302 |
+
"""
|
| 303 |
+
|
| 304 |
+
|
| 305 |
+
_FATAL_ERROR_MARKERS = (
|
| 306 |
+
"key limit exceeded",
|
| 307 |
+
"insufficient credits",
|
| 308 |
+
"insufficient_quota",
|
| 309 |
+
"invalid api key",
|
| 310 |
+
"unauthorized",
|
| 311 |
+
"payment required",
|
| 312 |
+
)
|
| 313 |
+
|
| 314 |
+
|
| 315 |
@cache
|
| 316 |
async def complete(**kwargs) -> str | None:
|
| 317 |
async with openrouter_rate_limit:
|
|
|
|
| 321 |
if "filtered" in e.message:
|
| 322 |
return None
|
| 323 |
raise e
|
| 324 |
+
except Exception as e:
|
| 325 |
+
msg = str(e).lower()
|
| 326 |
+
if any(marker in msg for marker in _FATAL_ERROR_MARKERS):
|
| 327 |
+
raise FatalAPIError(
|
| 328 |
+
f"OpenRouter account-level failure: {e}. "
|
| 329 |
+
"Aborting run before results-detailed is polluted."
|
| 330 |
+
) from e
|
| 331 |
+
raise
|
| 332 |
if not response.choices:
|
| 333 |
raise Exception(response)
|
| 334 |
return response.choices[0].message.content.strip()
|
|
@@ -13,7 +13,7 @@ from datasets_.truthfulqa import load_truthfulqa
|
|
| 13 |
from google.cloud import translate_v2 as translate
|
| 14 |
from langcodes import closest_supported_match
|
| 15 |
from languages import languages, script_name
|
| 16 |
-
from models import complete, translate_google
|
| 17 |
|
| 18 |
bleu = evaluate.load("bleu")
|
| 19 |
chrf = evaluate.load("chrf")
|
|
@@ -48,6 +48,9 @@ async def query(model, prompt):
|
|
| 48 |
),
|
| 49 |
),
|
| 50 |
)
|
|
|
|
|
|
|
|
|
|
| 51 |
except Exception as e:
|
| 52 |
print(f"exception for model {model}: {e}")
|
| 53 |
return None
|
|
|
|
| 13 |
from google.cloud import translate_v2 as translate
|
| 14 |
from langcodes import closest_supported_match
|
| 15 |
from languages import languages, script_name
|
| 16 |
+
from models import FatalAPIError, complete, translate_google
|
| 17 |
|
| 18 |
bleu = evaluate.load("bleu")
|
| 19 |
chrf = evaluate.load("chrf")
|
|
|
|
| 48 |
),
|
| 49 |
),
|
| 50 |
)
|
| 51 |
+
except FatalAPIError:
|
| 52 |
+
# Account-level: re-raise so the eval loop aborts cleanly before save.
|
| 53 |
+
raise
|
| 54 |
except Exception as e:
|
| 55 |
print(f"exception for model {model}: {e}")
|
| 56 |
return None
|