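"""EvalArena Gradio application.

Entry point that wires the UI to the judge-evaluation logic: it loads the
available judge models, lets a user submit an example for a given test type
(grounding, prompt injections, safety, or policy), collects anonymous
evaluations from two randomly selected judges, records the user's verdict to
a history CSV, and updates an ELO-based leaderboard. Benchmark metrics CSVs
under ``benchmarks/`` can also be loaded for display.
"""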
import datetime
import os
from typing import Any, Dict, Optional, Tuple

import gradio as gr
import pandas as pd
from loguru import logger

from src.config import HISTORY_PATH
from src.data_manager import get_random_example, load_models
from src.judge import JudgeManager
from src.ui import UI

# Global state for evaluations
eval1: Optional[Dict[str, Any]] = None
eval2: Optional[Dict[str, Any]] = None
selected_judges: list = []
current_test_type: str = "grounding"

logger.info("EvalArena starting up")

# Check that the benchmarks directory exists and log its contents
if os.path.exists("benchmarks") and os.path.isdir("benchmarks"):
    benchmark_dirs = [d for d in os.listdir("benchmarks") if os.path.isdir(os.path.join("benchmarks", d))]
    logger.info(f"Found benchmark directories: {benchmark_dirs}")
    # Log the metrics CSV files in each directory
    for d in benchmark_dirs:
        dir_path = os.path.join("benchmarks", d)
        files = [f for f in os.listdir(dir_path) if f.endswith("-judges-metrics.csv")]
        logger.info(f"Benchmark directory '{d}' contains files: {files}")
else:
    logger.warning("Benchmarks directory not found or not accessible")


def format_leaderboard_for_display(df: pd.DataFrame) -> pd.DataFrame:
    """Format the leaderboard dataframe for display in the UI.

    This ensures consistent display across environments such as Hugging Face Spaces.
    """
    # Build a new dataframe containing only the columns we want to display
    display_df = pd.DataFrame()
    display_df["Judge Name"] = df["judge_name"]
    display_df["ELO Score"] = df["elo_score"]
    display_df["Wins"] = df["wins"]
    display_df["Losses"] = df["losses"]
    display_df["Total Evaluations"] = df["total_evaluations"]
    return display_df


def load_benchmark_data(benchmark_type: str, dataset_name: str) -> Tuple[pd.DataFrame, str]:
    """Load benchmark data for the selected benchmark type and dataset.

    Args:
        benchmark_type: The type of benchmark (e.g., 'prompt-injections')
        dataset_name: The name of the dataset (e.g., 'allenai-wildjailbreak')

    Returns:
        Tuple containing:
            - DataFrame formatted for display
            - Markdown string with benchmark information
    """
    # Create empty dataframe with the expected columns
    empty_df = pd.DataFrame(
        columns=["Judge Name", "F1 Score", "Balanced Accuracy", "Avg Latency (s)", "Correct", "Total"]
    )

    # Handle case when None or empty values are passed
    if not benchmark_type or not dataset_name:
        logger.warning(f"Invalid benchmark parameters: type={benchmark_type}, dataset={dataset_name}")
        return empty_df, "Please select both a benchmark type and dataset"

    try:
        # Construct the path to the benchmark metrics file
        metrics_file = os.path.join("benchmarks", benchmark_type, f"{dataset_name}-judges-metrics.csv")
        logger.info(f"Loading benchmark from {metrics_file}")
        if not os.path.exists(metrics_file):
            error_message = f"Error: Could not find metrics file at {metrics_file}"
            logger.error(error_message)
            return empty_df, error_message

        # Load the CSV file
        df = pd.read_csv(metrics_file)
        logger.info(f"Loaded benchmark with {len(df)} rows")

        # Check if the file has the required columns
        required_columns = ["judge_name", "f1", "bacc", "avg_latency", "correct", "count"]
        missing_columns = [col for col in required_columns if col not in df.columns]
        if missing_columns:
            error_message = f"Error: CSV file missing required columns: {', '.join(missing_columns)}"
            logger.error(error_message)
            return empty_df, error_message

        # Format the dataframe for display
        display_df = pd.DataFrame()
        display_df["Judge Name"] = df["judge_name"]
        display_df["F1 Score"] = df["f1"].round(3)
        display_df["Balanced Accuracy"] = df["bacc"].round(3)
        display_df["Avg Latency (s)"] = df["avg_latency"].round(2)
        display_df["Correct"] = df["correct"]
        display_df["Total"] = df["count"]

        # Sort by balanced accuracy descending
        display_df = display_df.sort_values("Balanced Accuracy", ascending=False)

        # Generate information about the benchmark
        total_samples = df["count"].iloc[0] if not df.empty else 0
        info_md = f"""
# Benchmark: {dataset_name}

**Type**: {benchmark_type}

**Total Samples**: {total_samples}

This table shows how different AI judge models performed on this benchmark.
Higher F1 score and balanced accuracy indicate better performance.
"""
        return display_df, info_md
    except pd.errors.EmptyDataError:
        error_message = "Error: The CSV file is empty"
        logger.error(error_message)
        return empty_df, error_message
    except pd.errors.ParserError:
        error_message = "Error: Could not parse the CSV file - it may be corrupted or not in CSV format"
        logger.error(error_message)
        return empty_df, error_message
    except Exception as e:
        error_message = f"Error loading benchmark data: {str(e)}"
        logger.error(error_message)
        return empty_df, error_message


def initialize():
    """Initialize the application."""
    # Load models from file
    judges = load_models()
    logger.info(f"Loaded {len(judges)} judges")

    # Initialize judge manager
    judge_manager = JudgeManager(judges)

    # Set default test type
    default_test_type = "grounding"
    global current_test_type
    current_test_type = default_test_type

    # Create UI
    ui = UI(
        refresh_fn=lambda test_type: refresh_example(test_type, judge_manager),
        submit_fn=lambda text_input, claim_input, single_text_input, policy_input, policy_output, policy_assertion, test_type: submit_example(
            text_input,
            claim_input,
            single_text_input,
            policy_input,
            policy_output,
            policy_assertion,
            test_type,
            judge_manager,
        ),
        evaluate1_fn=lambda text_input, claim_input, single_text_input, policy_input, policy_output, policy_assertion, test_type: get_evaluation1(
            text_input,
            claim_input,
            single_text_input,
            policy_input,
            policy_output,
            policy_assertion,
            test_type,
            judge_manager,
        ),
        evaluate2_fn=lambda text_input, claim_input, single_text_input, policy_input, policy_output, policy_assertion, test_type: get_evaluation2(
            text_input,
            claim_input,
            single_text_input,
            policy_input,
            policy_output,
            policy_assertion,
            test_type,
            judge_manager,
        ),
        winner1_fn=lambda: select_winner("Evaluation 1", judge_manager),
        winner2_fn=lambda: select_winner("Evaluation 2", judge_manager),
        both_correct_fn=lambda: handle_both_correct(judge_manager),
        both_incorrect_fn=lambda: handle_both_incorrect(judge_manager),
        refresh_leaderboard_fn=lambda: format_leaderboard_for_display(
            judge_manager.leaderboard_df,
        ),
        leaderboard_df=format_leaderboard_for_display(
            judge_manager.leaderboard_df,
        ),
        load_benchmark_fn=load_benchmark_data,
    )
    return ui.create_interface()


def refresh_example(test_type: str, judge_manager: JudgeManager) -> Tuple:
    """Get a random example for the given test type."""
    try:
        # Get example from the dataset
        logger.info(f"Getting example for test type: {test_type}")
        example = get_random_example(test_type)

        # Default values for all return fields
        input_text = ""
        output_text = ""
        text_input = ""
        claim_input = ""
        single_text_input = ""
        policy_input = ""
        policy_output = ""
        policy_assertion = ""

        # Populate fields based on test type
        if test_type == "grounding":
            text_input = example["text"]
            claim_input = example["claim"]
        elif test_type in ["prompt_injections", "safety"]:
            single_text_input = example["text"]
        elif test_type == "policy":
            policy_input = example["input"]
            policy_output = example["output"]
            policy_assertion = example["assertion"]
        else:
            # Legacy format
            input_text = example.get("text", f"Sample input for {test_type}")
            output_text = example.get("claim", f"Sample output for {test_type}")

        return (
            input_text,
            output_text,
            text_input,
            claim_input,
            single_text_input,
            policy_input,
            policy_output,
            policy_assertion,
        )
    except Exception as e:
        logger.error(f"Error getting example: {e}")
        # Return empty strings for all fields
        return (
            "",
            "",
            "",
            "",
            "",
            "",
            "",
            "",
        )


def submit_example(
    text_input: str,
    claim_input: str,
    single_text_input: str,
    policy_input: str,
    policy_output: str,
    policy_assertion: str,
    test_type: str,
    judge_manager: JudgeManager,
) -> Tuple:
    """Prepare for evaluation and select random judges."""
    global selected_judges, current_test_type, eval1, eval2
    try:
        logger.info(f"Preparing evaluation for test type: {test_type}")
        current_test_type = test_type

        # Reset evaluations
        eval1 = None
        eval2 = None

        # Select random judges
        selected_judges = judge_manager.pick_random_judges()
        if len(selected_judges) < 2:
            return (
                "Error: Not enough judges available",
                "Error: Not enough judges available",
                None,
                None,
                None,
                None,
                None,
                None,
                None,
                gr.update(visible=False),
            )

        # Format inputs for Qualifire evaluation
        input_text, output_text = format_inputs_for_evaluation(
            text_input,
            claim_input,
            single_text_input,
            policy_input,
            policy_output,
            policy_assertion,
            test_type,
        )

        # Get a single Qualifire evaluation to be shared by both judges
        try:
            qualifire_result, time_elapsed = judge_manager.evaluate_with_qualifire(
                input_text,
                output_text,
                test_type,
                as_raw=True,  # Get raw result to share between judges
            )
            logger.info("Completed Qualifire evaluation")
            # Store the Qualifire result for both judges to use
            judge_manager.shared_qualifire_result = qualifire_result
            judge_manager.shared_qualifire_time = time_elapsed
        except Exception as e:
            logger.error(f"Error during Qualifire evaluation: {str(e)}")
            # Continue even if Qualifire fails - judges can still work without it

        # Show loading messages while evaluations are in progress
        status_text = "Evaluations starting... Both judges will evaluate in parallel."
        return (
            "Loading evaluation 1...",
            "Loading evaluation 2...",
            gr.update(value=text_input),
            gr.update(value=claim_input),
            gr.update(value=single_text_input),
            gr.update(value=policy_input),
            gr.update(value=policy_output),
            gr.update(value=policy_assertion),
            gr.update(value=test_type),
            gr.update(visible=True, value=status_text),
        )
    except Exception as e:
        logger.error(f"Error preparing evaluation: {e}")
        return (
            f"Error: {str(e)}",
            f"Error: {str(e)}",
            gr.update(value=text_input),
            gr.update(value=claim_input),
            gr.update(value=single_text_input),
            gr.update(value=policy_input),
            gr.update(value=policy_output),
            gr.update(value=policy_assertion),
            gr.update(value=test_type),
            gr.update(visible=False),
        )


def get_evaluation1(
    text_input: str,
    claim_input: str,
    single_text_input: str,
    policy_input: str,
    policy_output: str,
    policy_assertion: str,
    test_type: str,
    judge_manager: JudgeManager,
) -> Tuple[str, Any]:
    """Get evaluation from the first judge."""
    global eval1, selected_judges
    try:
        if not selected_judges or len(selected_judges) < 1:
            return "No judges selected", gr.update(visible=False)
        logger.info(f"Starting evaluation 1 with judge {selected_judges[0]['name']}")

        # Format inputs based on test type
        input_text, output_text = format_inputs_for_evaluation(
            text_input,
            claim_input,
            single_text_input,
            policy_input,
            policy_output,
            policy_assertion,
            test_type,
        )

        # Get evaluation from the first judge
        eval1 = judge_manager.get_evaluation(
            selected_judges[0],
            input_text,
            output_text,
            test_type,
            use_shared_result=True,
        )
        logger.info("Completed evaluation 1")

        # Display the evaluation (time is already included in the evaluation)
        display_eval = eval1["display_evaluation"]

        # Make the selection button visible once the evaluation is ready
        return display_eval, gr.update(visible=True)
    except Exception as e:
        logger.error(f"Error getting evaluation 1: {e}")
        return f"Error: {str(e)}", gr.update(visible=False)


def get_evaluation2(
    text_input: str,
    claim_input: str,
    single_text_input: str,
    policy_input: str,
    policy_output: str,
    policy_assertion: str,
    test_type: str,
    judge_manager: JudgeManager,
) -> Tuple[str, Any, Any]:
    """Get evaluation from the second judge."""
    global eval2, selected_judges
    try:
        if not selected_judges or len(selected_judges) < 2:
            return (
                "No judges selected",
                gr.update(visible=False),
                gr.update(visible=False),
            )
        logger.info(
            f"Starting evaluation 2 with judge {selected_judges[1]['name']}",
        )

        # Format inputs based on test type
        input_text, output_text = format_inputs_for_evaluation(
            text_input,
            claim_input,
            single_text_input,
            policy_input,
            policy_output,
            policy_assertion,
            test_type,
        )

        # Get evaluation from the second judge
        eval2 = judge_manager.get_evaluation(
            selected_judges[1],
            input_text,
            output_text,
            test_type,
            use_shared_result=True,
        )
        logger.info("Completed evaluation 2")

        # Display the evaluation (time is already included in the evaluation)
        display_eval = eval2["display_evaluation"]
        return (
            display_eval,
            gr.update(visible=True),
            gr.update(visible=True),
        )
    except Exception as e:
        logger.error(f"Error getting evaluation 2: {e}")
        return (
            f"Error: {str(e)}",
            gr.update(visible=False),
            gr.update(visible=False),
        )


def format_inputs_for_evaluation(
    text_input: str,
    claim_input: str,
    single_text_input: str,
    policy_input: str,
    policy_output: str,
    policy_assertion: str,
    test_type: str,
) -> Tuple[str, str]:
    """Format inputs based on test type to be compatible with the evaluation function."""
    if test_type == "grounding":
        input_text = text_input
        output_text = claim_input
    elif test_type in ["prompt_injections", "safety"]:
        input_text = single_text_input
        output_text = ""
    elif test_type == "policy":
        input_text = f"Input: {policy_input}\nAssertion: {policy_assertion}"
        output_text = policy_output
    else:
        # Default fallback - this should not happen with the UI constraints
        input_text = text_input or single_text_input or policy_input
        output_text = claim_input or policy_output
    return input_text, output_text


def save_to_history(
    input_text: str,
    output_text: str,
    judge1_id: str,
    judge1_name: str,
    judge1_evaluation: str,
    judge1_time: float,
    judge2_id: str,
    judge2_name: str,
    judge2_evaluation: str,
    judge2_time: float,
    winner_id: str,
) -> None:
    """Save the evaluation results to the history CSV file."""
    try:
        # Create a new row for the history
        history_row = {
            "timestamp": datetime.datetime.now().isoformat(),
            "input": input_text,
            "output": output_text,
            "judge1_id": judge1_id,
            "judge1_name": judge1_name,
            "judge1_evaluation": judge1_evaluation,
            "judge1_time": judge1_time,
            "judge2_id": judge2_id,
            "judge2_name": judge2_name,
            "judge2_evaluation": judge2_evaluation,
            "judge2_time": judge2_time,
            "winner_id": winner_id,
        }

        # Try to load existing history
        try:
            history_df = pd.read_csv(HISTORY_PATH)
        except (FileNotFoundError, pd.errors.EmptyDataError):
            # Create a new history dataframe if the file doesn't exist or is empty
            history_df = pd.DataFrame(columns=list(history_row.keys()))

        # Append the new row
        history_df = pd.concat(
            [history_df, pd.DataFrame([history_row])],
            ignore_index=True,
        )

        # Save to CSV
        history_df.to_csv(HISTORY_PATH, index=False)
        logger.info("Saved evaluation to history")
    except Exception as e:
        logger.error(f"Error saving to history: {e}")


def select_winner(choice: str, judge_manager: JudgeManager) -> str:
    """Select a winner from the evaluations."""
    global eval1, eval2, current_test_type
    try:
        if not eval1 or not eval2:
            return "Error: No evaluations available"

        # Get the input and output text that was evaluated
        input_text, output_text = "", ""
        if "input_text" in eval1 and "output_text" in eval1:
            input_text = eval1.get("input_text", "")
            output_text = eval1.get("output_text", "")

        if choice == "Evaluation 1":
            winner_eval = eval1
            loser_eval = eval2
            winner_id = eval1["judge"]["id"]
        else:
            winner_eval = eval2
            loser_eval = eval1
            winner_id = eval2["judge"]["id"]

        # Update leaderboard
        updated_board = judge_manager.update_leaderboard(
            winner_eval["judge"]["id"],
            loser_eval["judge"]["id"],
            result_type="win",
        )

        # Save to history
        save_to_history(
            input_text=input_text,
            output_text=output_text,
            judge1_id=eval1["judge"]["id"],
            judge1_name=eval1["judge"]["name"],
            judge1_evaluation=eval1["anonymous_evaluation"],
            judge1_time=eval1["elapsed_time"],
            judge2_id=eval2["judge"]["id"],
            judge2_name=eval2["judge"]["name"],
            judge2_evaluation=eval2["anonymous_evaluation"],
            judge2_time=eval2["elapsed_time"],
            winner_id=winner_id,
        )

        # Construct result message with revealed judges' names
        result_message = f"You selected: {choice}\n\n"
        result_message += f"Evaluation 1 was by: {eval1['judge']['name']} (took {eval1['elapsed_time']:.2f} seconds)\n"
        result_message += (
            f"Evaluation 2 was by: {eval2['judge']['name']} (took {eval2['elapsed_time']:.2f} seconds)\n\n"
        )

        # Get the winner's new ELO score
        winner_mask = updated_board["judge_id"] == winner_id
        winner_elo = updated_board[winner_mask]["elo_score"].values[0]
        result_message += f"Winner: {winner_eval['judge']['name']} "
        result_message += f"(New ELO: {winner_elo:.2f})\n"
        result_message += f"Test Type: {current_test_type}\n"
        return result_message
    except Exception as e:
        logger.error(f"Error selecting winner: {e}")
        return f"Error: {str(e)}"


def handle_both_correct(judge_manager: JudgeManager) -> str:
    """Handle case where both evaluations are correct."""
    global eval1, eval2, current_test_type
    try:
        if not eval1 or not eval2:
            return "Error: No evaluations available"

        # Get the input and output text that was evaluated
        input_text, output_text = "", ""
        if "input_text" in eval1 and "output_text" in eval1:
            input_text = eval1.get("input_text", "")
            output_text = eval1.get("output_text", "")

        # Update leaderboard for both judges
        updated_board = judge_manager.update_leaderboard(
            eval1["judge"]["id"],
            eval2["judge"]["id"],
            result_type="both_correct",
        )

        # Save to history with both as winners
        save_to_history(
            input_text=input_text,
            output_text=output_text,
            judge1_id=eval1["judge"]["id"],
            judge1_name=eval1["judge"]["name"],
            judge1_evaluation=eval1["anonymous_evaluation"],
            judge1_time=eval1["elapsed_time"],
            judge2_id=eval2["judge"]["id"],
            judge2_name=eval2["judge"]["name"],
            judge2_evaluation=eval2["anonymous_evaluation"],
            judge2_time=eval2["elapsed_time"],
            winner_id="both",
        )

        # Construct result message with revealed judges' names
        result_message = "You selected: Both Correct\n\n"
        result_message += f"Evaluation 1 was by: {eval1['judge']['name']} (took {eval1['elapsed_time']:.2f} seconds)\n"
        result_message += (
            f"Evaluation 2 was by: {eval2['judge']['name']} (took {eval2['elapsed_time']:.2f} seconds)\n\n"
        )

        # Get the new ELO scores
        judge1_mask = updated_board["judge_id"] == eval1["judge"]["id"]
        judge2_mask = updated_board["judge_id"] == eval2["judge"]["id"]
        judge1_elo = updated_board[judge1_mask]["elo_score"].values[0]
        judge2_elo = updated_board[judge2_mask]["elo_score"].values[0]
        result_message += "\nBoth judges performed well!\n"
        result_message += f"{eval1['judge']['name']} new ELO: {judge1_elo:.2f}\n"
        result_message += f"{eval2['judge']['name']} new ELO: {judge2_elo:.2f}\n"
        result_message += f"Test Type: {current_test_type}\n"
        return result_message
    except Exception as e:
        logger.error(f"Error handling both correct: {e}")
        return f"Error: {str(e)}"


def handle_both_incorrect(judge_manager: JudgeManager) -> str:
    """Handle case where both evaluations are incorrect."""
    global eval1, eval2, current_test_type
    try:
        if not eval1 or not eval2:
            return "Error: No evaluations available"

        # Get the input and output text that was evaluated
        input_text, output_text = "", ""
        if "input_text" in eval1 and "output_text" in eval1:
            input_text = eval1.get("input_text", "")
            output_text = eval1.get("output_text", "")

        # Update leaderboard for both judges
        updated_board = judge_manager.update_leaderboard(
            eval1["judge"]["id"],
            eval2["judge"]["id"],
            result_type="both_incorrect",
        )

        # Save to history with neither as winner
        save_to_history(
            input_text=input_text,
            output_text=output_text,
            judge1_id=eval1["judge"]["id"],
            judge1_name=eval1["judge"]["name"],
            judge1_evaluation=eval1["anonymous_evaluation"],
            judge1_time=eval1["elapsed_time"],
            judge2_id=eval2["judge"]["id"],
            judge2_name=eval2["judge"]["name"],
            judge2_evaluation=eval2["anonymous_evaluation"],
            judge2_time=eval2["elapsed_time"],
            winner_id="none",
        )

        # Construct result message with revealed judges' names
        result_message = "You selected: Both Incorrect\n\n"
        result_message += f"Evaluation 1 was by: {eval1['judge']['name']} (took {eval1['elapsed_time']:.2f} seconds)\n"
        result_message += (
            f"Evaluation 2 was by: {eval2['judge']['name']} (took {eval2['elapsed_time']:.2f} seconds)\n\n"
        )

        # Get the new ELO scores
        judge1_mask = updated_board["judge_id"] == eval1["judge"]["id"]
        judge2_mask = updated_board["judge_id"] == eval2["judge"]["id"]
        judge1_elo = updated_board[judge1_mask]["elo_score"].values[0]
        judge2_elo = updated_board[judge2_mask]["elo_score"].values[0]
        result_message += "\nBoth judges need improvement.\n"
        result_message += f"{eval1['judge']['name']} new ELO: {judge1_elo:.2f}\n"
        result_message += f"{eval2['judge']['name']} new ELO: {judge2_elo:.2f}\n"
        result_message += f"Test Type: {current_test_type}\n"
        return result_message
    except Exception as e:
        logger.error(f"Error handling both incorrect: {e}")
        return f"Error: {str(e)}"


def main():
    """Initialize and launch the application."""
    demo = initialize()
    # Bind to all interfaces so the app is reachable when run in a container
    demo.launch(server_name="0.0.0.0")


if __name__ == "__main__":
    main()