davidpomerenke committed on
Commit
c2afc16
·
verified ·
1 Parent(s): bec2f46

Upload from GitHub Actions: fast-fail on account-level API errors; refuse to ship runs with >80% errors

Browse files

Today's workflow run hit a 403 'Key limit exceeded' from OpenRouter on
the very first request and would have kept burning through the entire
combinatorial matrix, tagging every row with status=error. We caught it
manually and cancelled before save() fired, but the fragility is real:
any future account-level outage (auth, credit cap, key revocation) would
flood results-detailed with bogus errors, and auto_blocklist would then
mass-exclude the auto-discovered models from the *next* run, even after
the underlying issue is fixed. This commit adds two layers of protection.

Layer 1 - propagate account-level failures (evals/models.py + tasks.py):
- Add FatalAPIError. Raised by complete() when the OpenRouter error
message matches markers like "key limit exceeded", "insufficient
credits", "unauthorized", "payment required", "invalid api key".
- query() in tasks.py re-raises FatalAPIError instead of swallowing
it into status=error. asyncio.gather then surfaces it, the eval
loop aborts, and save() is never reached.

Layer 2 - save-time error-rate gate (evals/main.py):
- Catches infra problems that aren't account-level (rate-limit
storms, regional outages, dependency failure). If >80% of NEW
rows in this run have status != "ok", raise before save() instead
of pushing the polluted snapshot to HF. Forces a human to look
at the logs and re-run after fixing.

Files changed (3) hide show
  1. evals/main.py +17 -1
  2. evals/models.py +28 -0
  3. evals/tasks.py +4 -1
evals/main.py CHANGED
@@ -77,7 +77,23 @@ async def evaluate():
77
  ]
78
  results = [r for batch in batch_results for result in batch for r in result]
79
  results = pd.DataFrame(results) if results else pd.DataFrame(columns=["task", "model", "bcp_47", "metric", "sentence_nr", "score", "origin"])
80
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  # Merge with cached results (immutable log, prefer latest results on conflict)
82
  all_results = pd.concat([old_results, results]).drop_duplicates(
83
  subset=["task", "model", "bcp_47", "metric", "sentence_nr"],
 
77
  ]
78
  results = [r for batch in batch_results for result in batch for r in result]
79
  results = pd.DataFrame(results) if results else pd.DataFrame(columns=["task", "model", "bcp_47", "metric", "sentence_nr", "score", "origin"])
80
+
81
+ # Defense-in-depth: if a huge fraction of NEW rows are errors, an
82
+ # infrastructure problem (rate limits, regional ban, dependency outage)
83
+ # likely degraded the run rather than the models actually being broken.
84
+ # Refuse to push the polluted snapshot so auto_blocklist doesn't kick in
85
+ # against innocent models on the next cycle. FatalAPIError already covers
86
+ # auth/credit failures upstream — this is the catch-all for everything else.
87
+ if not results.empty and "status" in results.columns:
88
+ new_error_rate = (results["status"] != "ok").mean()
89
+ if new_error_rate > 0.8:
90
+ raise RuntimeError(
91
+ f"Run produced {new_error_rate:.1%} errors in {len(results)} "
92
+ f"new rows (threshold 80%). Likely an infra failure; refusing "
93
+ f"to push to HF. Investigate the logs above and re-run when "
94
+ f"resolved."
95
+ )
96
+
97
  # Merge with cached results (immutable log, prefer latest results on conflict)
98
  all_results = pd.concat([old_results, results]).drop_duplicates(
99
  subset=["task", "model", "bcp_47", "metric", "sentence_nr"],
evals/models.py CHANGED
@@ -292,6 +292,26 @@ huggingface_rate_limit = AsyncLimiter(max_rate=5, time_period=1)
292
  google_rate_limit = AsyncLimiter(max_rate=10, time_period=1)
293
 
294
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
295
  @cache
296
  async def complete(**kwargs) -> str | None:
297
  async with openrouter_rate_limit:
@@ -301,6 +321,14 @@ async def complete(**kwargs) -> str | None:
301
  if "filtered" in e.message:
302
  return None
303
  raise e
 
 
 
 
 
 
 
 
304
  if not response.choices:
305
  raise Exception(response)
306
  return response.choices[0].message.content.strip()
 
292
  google_rate_limit = AsyncLimiter(max_rate=10, time_period=1)
293
 
294
 
295
+ class FatalAPIError(RuntimeError):
296
+ """Account-level failure (auth, key-limit, payment). Abort the whole run.
297
+
298
+ These errors apply to every subsequent call regardless of model/prompt, so
299
+ continuing the eval just floods results-detailed with bogus errors and
300
+ poisons the auto-blocklist. Raised by complete(); re-raised (not swallowed)
301
+ by query() in tasks.py so it propagates out of the eval loop.
302
+ """
303
+
304
+
305
+ _FATAL_ERROR_MARKERS = (
306
+ "key limit exceeded",
307
+ "insufficient credits",
308
+ "insufficient_quota",
309
+ "invalid api key",
310
+ "unauthorized",
311
+ "payment required",
312
+ )
313
+
314
+
315
  @cache
316
  async def complete(**kwargs) -> str | None:
317
  async with openrouter_rate_limit:
 
321
  if "filtered" in e.message:
322
  return None
323
  raise e
324
+ except Exception as e:
325
+ msg = str(e).lower()
326
+ if any(marker in msg for marker in _FATAL_ERROR_MARKERS):
327
+ raise FatalAPIError(
328
+ f"OpenRouter account-level failure: {e}. "
329
+ "Aborting run before results-detailed is polluted."
330
+ ) from e
331
+ raise
332
  if not response.choices:
333
  raise Exception(response)
334
  return response.choices[0].message.content.strip()
evals/tasks.py CHANGED
@@ -13,7 +13,7 @@ from datasets_.truthfulqa import load_truthfulqa
13
  from google.cloud import translate_v2 as translate
14
  from langcodes import closest_supported_match
15
  from languages import languages, script_name
16
- from models import complete, translate_google
17
 
18
  bleu = evaluate.load("bleu")
19
  chrf = evaluate.load("chrf")
@@ -48,6 +48,9 @@ async def query(model, prompt):
48
  ),
49
  ),
50
  )
 
 
 
51
  except Exception as e:
52
  print(f"exception for model {model}: {e}")
53
  return None
 
13
  from google.cloud import translate_v2 as translate
14
  from langcodes import closest_supported_match
15
  from languages import languages, script_name
16
+ from models import FatalAPIError, complete, translate_google
17
 
18
  bleu = evaluate.load("bleu")
19
  chrf = evaluate.load("chrf")
 
48
  ),
49
  ),
50
  )
51
+ except FatalAPIError:
52
+ # Account-level: re-raise so the eval loop aborts cleanly before save.
53
+ raise
54
  except Exception as e:
55
  print(f"exception for model {model}: {e}")
56
  return None