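"""EvalArena Gradio app.

Two randomly selected AI judges evaluate the same example side by side; a human
picks the better evaluation (or marks both correct or both incorrect), the ELO
leaderboard is updated, and the result is appended to the history CSV.
Precomputed benchmark metrics are loaded from the benchmarks/ directory.
"""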
import datetime
import os
from typing import Any, Dict, Optional, Tuple

import gradio as gr
import pandas as pd
from loguru import logger

from src.config import HISTORY_PATH
from src.data_manager import get_random_example, load_models
from src.judge import JudgeManager
from src.ui import UI

# Global state for evaluations
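# These module-level variables hold the in-flight comparison between a submission
# and the user's vote (the two evaluations, the judges picked, and the test type).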
eval1: Optional[Dict[str, Any]] = None
eval2: Optional[Dict[str, Any]] = None
selected_judges: list = []
current_test_type: str = "grounding"

# Add more detailed logging
logger.info("EvalArena starting up")
# Check if benchmarks directory exists
if os.path.exists("benchmarks") and os.path.isdir("benchmarks"):
    benchmark_dirs = [d for d in os.listdir("benchmarks") if os.path.isdir(os.path.join("benchmarks", d))]
    logger.info(f"Found benchmark directories: {benchmark_dirs}")

    # Log CSV files in each directory
    for d in benchmark_dirs:
        dir_path = os.path.join("benchmarks", d)
        files = [f for f in os.listdir(dir_path) if f.endswith("-judges-metrics.csv")]
        logger.info(f"Benchmark directory '{d}' contains files: {files}")
else:
    logger.warning("Benchmarks directory not found or not accessible")


def format_leaderboard_for_display(df: pd.DataFrame) -> pd.DataFrame:
    """Format the leaderboard dataframe for display in the UI.
    This ensures consistent display across environments
    like Huggingface Spaces."""
    # Create a copy of the dataframe with only the columns we want to display
    display_df = pd.DataFrame()
    display_df["Judge Name"] = df["judge_name"]
    display_df["ELO Score"] = df["elo_score"]
    display_df["Wins"] = df["wins"]
    display_df["Losses"] = df["losses"]
    display_df["Total Evaluations"] = df["total_evaluations"]
    return display_df


def load_benchmark_data(benchmark_type: str, dataset_name: str) -> Tuple[pd.DataFrame, str]:
    """Load benchmark data for the selected benchmark type and dataset.

    Args:
        benchmark_type: The type of benchmark (e.g., 'prompt-injections')
        dataset_name: The name of the dataset (e.g., 'allenai-wildjailbreak')

    Returns:
        Tuple containing:
            - DataFrame formatted for display
            - Markdown string with benchmark information
    """
    # Create empty dataframe with the expected columns
    empty_df = pd.DataFrame(
        columns=["Judge Name", "F1 Score", "Balanced Accuracy", "Avg Latency (s)", "Correct", "Total"]
    )

    # Handle case when None or empty values are passed
    if not benchmark_type or not dataset_name:
        logger.warning(f"Invalid benchmark parameters: type={benchmark_type}, dataset={dataset_name}")
        return empty_df, "Please select both a benchmark type and dataset"

    try:
        # Construct the path to the benchmark metrics file
        metrics_file = os.path.join("benchmarks", benchmark_type, f"{dataset_name}-judges-metrics.csv")

        logger.info(f"Loading benchmark from {metrics_file}")

        if not os.path.exists(metrics_file):
            error_message = f"Error: Could not find metrics file at {metrics_file}"
            logger.error(error_message)
            return empty_df, error_message

        # Load the CSV file
        df = pd.read_csv(metrics_file)
        logger.info(f"Loaded benchmark with {len(df)} rows")

        # Check if the file has the required columns
        required_columns = ["judge_name", "f1", "bacc", "avg_latency", "correct", "count"]
        missing_columns = [col for col in required_columns if col not in df.columns]

        if missing_columns:
            error_message = f"Error: CSV file missing required columns: {', '.join(missing_columns)}"
            logger.error(error_message)
            return empty_df, error_message

        # Format the dataframe for display
        display_df = pd.DataFrame()
        display_df["Judge Name"] = df["judge_name"]
        display_df["F1 Score"] = df["f1"].round(3)
        display_df["Balanced Accuracy"] = df["bacc"].round(3)
        display_df["Avg Latency (s)"] = df["avg_latency"].round(2)
        display_df["Correct"] = df["correct"]
        display_df["Total"] = df["count"]

        # Sort by balanced accuracy descending
        display_df = display_df.sort_values("Balanced Accuracy", ascending=False)

        # Generate information about the benchmark
        total_samples = df["count"].iloc[0] if not df.empty else 0
        info_md = f"""
        # Benchmark: {dataset_name}

        **Type**: {benchmark_type}
        **Total Samples**: {total_samples}

        This table shows how different AI judge models performed on this benchmark.
        Higher F1 score and balanced accuracy indicate better performance.
        """

        return display_df, info_md

    except pd.errors.EmptyDataError:
        error_message = "Error: The CSV file is empty"
        logger.error(error_message)
        return empty_df, error_message
    except pd.errors.ParserError:
        error_message = "Error: Could not parse the CSV file - it may be corrupted or not in CSV format"
        logger.error(error_message)
        return empty_df, error_message
    except Exception as e:
        error_message = f"Error loading benchmark data: {str(e)}"
        logger.error(error_message)
        return empty_df, error_message


def initialize():
    """Initialize the application."""
    # Load models from file
    judges = load_models()
    logger.info(f"Loaded {len(judges)} judges")

    # Initialize judge manager
    judge_manager = JudgeManager(judges)

    # Set default test type
    default_test_type = "grounding"
    global current_test_type
    current_test_type = default_test_type

    # Create UI
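    # All callbacks are passed as lambdas that close over judge_manager, so the
    # UI layer never has to hold judge or leaderboard state itself.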
    ui = UI(
        refresh_fn=lambda test_type: refresh_example(test_type, judge_manager),
        submit_fn=lambda text_input, claim_input, single_text_input, policy_input, policy_output, policy_assertion, test_type: submit_example(
            text_input,
            claim_input,
            single_text_input,
            policy_input,
            policy_output,
            policy_assertion,
            test_type,
            judge_manager,
        ),
        evaluate1_fn=lambda text_input, claim_input, single_text_input, policy_input, policy_output, policy_assertion, test_type: get_evaluation1(
            text_input,
            claim_input,
            single_text_input,
            policy_input,
            policy_output,
            policy_assertion,
            test_type,
            judge_manager,
        ),
        evaluate2_fn=lambda text_input, claim_input, single_text_input, policy_input, policy_output, policy_assertion, test_type: get_evaluation2(
            text_input,
            claim_input,
            single_text_input,
            policy_input,
            policy_output,
            policy_assertion,
            test_type,
            judge_manager,
        ),
        winner1_fn=lambda: select_winner("Evaluation 1", judge_manager),
        winner2_fn=lambda: select_winner("Evaluation 2", judge_manager),
        both_correct_fn=lambda: handle_both_correct(judge_manager),
        both_incorrect_fn=lambda: handle_both_incorrect(judge_manager),
        refresh_leaderboard_fn=lambda: format_leaderboard_for_display(
            judge_manager.leaderboard_df,
        ),
        leaderboard_df=format_leaderboard_for_display(
            judge_manager.leaderboard_df,
        ),
        load_benchmark_fn=load_benchmark_data,
    )

    return ui.create_interface()


def refresh_example(test_type: str, judge_manager: JudgeManager) -> Tuple:
    """Get a random example for the given test type."""
    try:
        # Get example from the dataset
        logger.info(f"Getting example for test type: {test_type}")
        example = get_random_example(test_type)

        # Default values for all return fields
        input_text = ""
        output_text = ""
        text_input = ""
        claim_input = ""
        single_text_input = ""
        policy_input = ""
        policy_output = ""
        policy_assertion = ""

        # Populate fields based on test type
        if test_type == "grounding":
            text_input = example["text"]
            claim_input = example["claim"]
        elif test_type in ["prompt_injections", "safety"]:
            single_text_input = example["text"]
        elif test_type == "policy":
            policy_input = example["input"]
            policy_output = example["output"]
            policy_assertion = example["assertion"]
        else:
            # Legacy format
            input_text = example.get("text", f"Sample input for {test_type}")
            output_text = example.get("claim", f"Sample output for {test_type}")

        return (
            input_text,
            output_text,
            text_input,
            claim_input,
            single_text_input,
            policy_input,
            policy_output,
            policy_assertion,
        )
    except Exception as e:
        logger.error(f"Error getting example: {e}")
        # Return empty strings for all fields
        return (
            "",
            "",
            "",
            "",
            "",
            "",
            "",
            "",
        )


def submit_example(
    text_input: str,
    claim_input: str,
    single_text_input: str,
    policy_input: str,
    policy_output: str,
    policy_assertion: str,
    test_type: str,
    judge_manager: JudgeManager,
) -> Tuple:
    """Prepare for evaluation and select random judges."""
    global selected_judges, current_test_type, eval1, eval2

    try:
        logger.info(f"Preparing evaluation for test type: {test_type}")
        current_test_type = test_type

        # Reset evaluations
        eval1 = None
        eval2 = None

        # Select random judges
        selected_judges = judge_manager.pick_random_judges()

        if len(selected_judges) < 2:
            return (
                "Error: Not enough judges available",
                "Error: Not enough judges available",
                None,
                None,
                None,
                None,
                None,
                None,
                None,
                gr.update(visible=False),
            )

        # Format inputs for Qualifire evaluation
        input_text, output_text = format_inputs_for_evaluation(
            text_input,
            claim_input,
            single_text_input,
            policy_input,
            policy_output,
            policy_assertion,
            test_type,
        )

        # Get a single Qualifire evaluation to be shared by both judges
        try:
            qualifire_result, time_elapsed = judge_manager.evaluate_with_qualifire(
                input_text,
                output_text,
                test_type,
                as_raw=True,  # Get raw result to share between judges
            )
            logger.info("Completed Qualifire evaluation")

            # Store the Qualifire result for both judges to use
            judge_manager.shared_qualifire_result = qualifire_result
            judge_manager.shared_qualifire_time = time_elapsed
        except Exception as e:
            logger.error(f"Error during Qualifire evaluation: {str(e)}")
            # Continue even if Qualifire fails - judges can still work without it

        # Show loading messages while evaluations are in progress
        status_text = "Evaluations starting... Both judges will evaluate in parallel."
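        # Output order: evaluation 1 text, evaluation 2 text, the six echoed input
        # fields, the test type, and the status banner.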
        return (
            "Loading evaluation 1...",
            "Loading evaluation 2...",
            gr.update(value=text_input),
            gr.update(value=claim_input),
            gr.update(value=single_text_input),
            gr.update(value=policy_input),
            gr.update(value=policy_output),
            gr.update(value=policy_assertion),
            gr.update(value=test_type),
            gr.update(visible=True, value=status_text),
        )
    except Exception as e:
        logger.error(f"Error preparing evaluation: {e}")
        return (
            f"Error: {str(e)}",
            f"Error: {str(e)}",
            gr.update(value=text_input),
            gr.update(value=claim_input),
            gr.update(value=single_text_input),
            gr.update(value=policy_input),
            gr.update(value=policy_output),
            gr.update(value=policy_assertion),
            gr.update(value=test_type),
            gr.update(visible=False),
        )


def get_evaluation1(
    text_input: str,
    claim_input: str,
    single_text_input: str,
    policy_input: str,
    policy_output: str,
    policy_assertion: str,
    test_type: str,
    judge_manager: JudgeManager,
) -> Tuple[str, Any]:
    """Get evaluation from the first judge."""
    global eval1, selected_judges

    try:
        if not selected_judges or len(selected_judges) < 1:
            return "No judges selected", gr.update(visible=False)

        logger.info(f"Starting evaluation 1 with judge {selected_judges[0]['name']}")

        # Format inputs based on test type
        input_text, output_text = format_inputs_for_evaluation(
            text_input,
            claim_input,
            single_text_input,
            policy_input,
            policy_output,
            policy_assertion,
            test_type,
        )

        # Get evaluation from the first judge
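        # use_shared_result=True lets the judge reuse the Qualifire result stored by submit_example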
        eval1 = judge_manager.get_evaluation(
            selected_judges[0],
            input_text,
            output_text,
            test_type,
            use_shared_result=True,
        )
        logger.info("Completed evaluation 1")

        # Display the evaluation (time is already included in the evaluation)
        display_eval = eval1["display_evaluation"]

        # Make the selection button visible once the evaluation is ready
        return display_eval, gr.update(visible=True)
    except Exception as e:
        logger.error(f"Error getting evaluation 1: {e}")
        return f"Error: {str(e)}", gr.update(visible=False)


def get_evaluation2(
    text_input: str,
    claim_input: str,
    single_text_input: str,
    policy_input: str,
    policy_output: str,
    policy_assertion: str,
    test_type: str,
    judge_manager: JudgeManager,
) -> Tuple[str, Any, Any]:
    """Get evaluation from the second judge."""
    global eval2, selected_judges

    try:
        if not selected_judges or len(selected_judges) < 2:
            return (
                "No judges selected",
                gr.update(
                    visible=False,
                ),
                gr.update(
                    visible=False,
                ),
            )

        logger.info(
            f"Starting evaluation 2 with judge {selected_judges[1]['name']}",
        )

        # Format inputs based on test type
        input_text, output_text = format_inputs_for_evaluation(
            text_input,
            claim_input,
            single_text_input,
            policy_input,
            policy_output,
            policy_assertion,
            test_type,
        )

        # Get evaluation from the second judge
        eval2 = judge_manager.get_evaluation(
            selected_judges[1],
            input_text,
            output_text,
            test_type,
            use_shared_result=True,
        )
        logger.info("Completed evaluation 2")

        # Display the evaluation (time is already included in the evaluation)
        display_eval = eval2["display_evaluation"]

        return (
            display_eval,
            gr.update(visible=True),
            gr.update(visible=True),
        )
    except Exception as e:
        logger.error(f"Error getting evaluation 2: {e}")
        return (
            f"Error: {str(e)}",
            gr.update(
                visible=False,
            ),
            gr.update(
                visible=False,
            ),
        )


def format_inputs_for_evaluation(
    text_input: str,
    claim_input: str,
    single_text_input: str,
    policy_input: str,
    policy_output: str,
    policy_assertion: str,
    test_type: str,
) -> Tuple[str, str]:
    """Format inputs based on test type to be compatible with the evaluation function."""
    if test_type == "grounding":
        input_text = text_input
        output_text = claim_input
    elif test_type in ["prompt_injections", "safety"]:
        input_text = single_text_input
        output_text = ""
    elif test_type == "policy":
        input_text = f"Input: {policy_input}\nAssertion: {policy_assertion}"
        output_text = policy_output
    else:
        # Default fallback - this should not happen with the UI constraints
        input_text = text_input or single_text_input or policy_input
        output_text = claim_input or policy_output

    return input_text, output_text


def save_to_history(
    input_text: str,
    output_text: str,
    judge1_id: str,
    judge1_name: str,
    judge1_evaluation: str,
    judge1_time: float,
    judge2_id: str,
    judge2_name: str,
    judge2_evaluation: str,
    judge2_time: float,
    winner_id: str,
) -> None:
    """Save the evaluation results to history CSV file."""
    try:
        # Create a new row for the history
        history_row = {
            "timestamp": datetime.datetime.now().isoformat(),
            "input": input_text,
            "output": output_text,
            "judge1_id": judge1_id,
            "judge1_name": judge1_name,
            "judge1_evaluation": judge1_evaluation,
            "judge1_time": judge1_time,
            "judge2_id": judge2_id,
            "judge2_name": judge2_name,
            "judge2_evaluation": judge2_evaluation,
            "judge2_time": judge2_time,
            "winner_id": winner_id,
        }

        # Try to load existing history
        try:
            history_df = pd.read_csv(HISTORY_PATH)
        except (FileNotFoundError, pd.errors.EmptyDataError):
            # Create a new history dataframe if file doesn't exist or is empty
            history_df = pd.DataFrame(columns=list(history_row.keys()))

        # Append the new row
        history_df = pd.concat(
            [history_df, pd.DataFrame([history_row])],
            ignore_index=True,
        )

        # Save to CSV
        history_df.to_csv(HISTORY_PATH, index=False)
        logger.info("Saved evaluation to history")
    except Exception as e:
        logger.error(f"Error saving to history: {e}")


def select_winner(choice: str, judge_manager: JudgeManager) -> str:
    """Select a winner from the evaluations."""
    global eval1, eval2, current_test_type

    try:
        if not eval1 or not eval2:
            return "Error: No evaluations available"

        # Get the input and output text that was evaluated
        input_text, output_text = "", ""
        if "input_text" in eval1 and "output_text" in eval1:
            input_text = eval1.get("input_text", "")
            output_text = eval1.get("output_text", "")

        if choice == "Evaluation 1":
            winner_eval = eval1
            loser_eval = eval2
            winner_id = eval1["judge"]["id"]
        else:
            winner_eval = eval2
            loser_eval = eval1
            winner_id = eval2["judge"]["id"]

        # Update leaderboard
        updated_board = judge_manager.update_leaderboard(
            winner_eval["judge"]["id"],
            loser_eval["judge"]["id"],
            result_type="win",
        )

        # Save to history
        save_to_history(
            input_text=input_text,
            output_text=output_text,
            judge1_id=eval1["judge"]["id"],
            judge1_name=eval1["judge"]["name"],
            judge1_evaluation=eval1["anonymous_evaluation"],
            judge1_time=eval1["elapsed_time"],
            judge2_id=eval2["judge"]["id"],
            judge2_name=eval2["judge"]["name"],
            judge2_evaluation=eval2["anonymous_evaluation"],
            judge2_time=eval2["elapsed_time"],
            winner_id=winner_id,
        )

        # Construct result message with revealed judges' names
        result_message = f"You selected: {choice}\n\n"
        result_message += f"Evaluation 1 was by: {eval1['judge']['name']} (took {eval1['elapsed_time']:.2f} seconds)\n"
        result_message += (
            f"Evaluation 2 was by: {eval2['judge']['name']} (took {eval2['elapsed_time']:.2f} seconds)\n\n"
        )

        # Get the winner's new ELO score
        winner_mask = updated_board["judge_id"] == winner_id
        winner_elo = updated_board[winner_mask]["elo_score"].values[0]

        result_message += f"Winner: {winner_eval['judge']['name']} "
        result_message += f"(New ELO: {winner_elo:.2f})\n"
        result_message += f"Test Type: {current_test_type}\n"

        return result_message
    except Exception as e:
        logger.error(f"Error selecting winner: {e}")
        return f"Error: {str(e)}"


def handle_both_correct(judge_manager: JudgeManager) -> str:
    """Handle case where both evaluations are correct."""
    global eval1, eval2, current_test_type

    try:
        if not eval1 or not eval2:
            return "Error: No evaluations available"

        # Get the input and output text that was evaluated
        input_text, output_text = "", ""
        if "input_text" in eval1 and "output_text" in eval1:
            input_text = eval1.get("input_text", "")
            output_text = eval1.get("output_text", "")

        # Update leaderboard for both judges
        updated_board = judge_manager.update_leaderboard(
            eval1["judge"]["id"],
            eval2["judge"]["id"],
            result_type="both_correct",
        )

        # Save to history with both as winners
        save_to_history(
            input_text=input_text,
            output_text=output_text,
            judge1_id=eval1["judge"]["id"],
            judge1_name=eval1["judge"]["name"],
            judge1_evaluation=eval1["anonymous_evaluation"],
            judge1_time=eval1["elapsed_time"],
            judge2_id=eval2["judge"]["id"],
            judge2_name=eval2["judge"]["name"],
            judge2_evaluation=eval2["anonymous_evaluation"],
            judge2_time=eval2["elapsed_time"],
            winner_id="both",
        )

        # Construct result message with revealed judges' names
        result_message = "You selected: Both Correct\n\n"
        result_message += f"Evaluation 1 was by: {eval1['judge']['name']} (took {eval1['elapsed_time']:.2f} seconds)\n"
        result_message += (
            f"Evaluation 2 was by: {eval2['judge']['name']} (took {eval2['elapsed_time']:.2f} seconds)\n\n"
        )

        # Get the new ELO scores
        judge1_mask = updated_board["judge_id"] == eval1["judge"]["id"]
        judge2_mask = updated_board["judge_id"] == eval2["judge"]["id"]

        judge1_elo = updated_board[judge1_mask]["elo_score"].values[0]
        judge2_elo = updated_board[judge2_mask]["elo_score"].values[0]

        result_message += "\nBoth judges performed well!\n"
        result_message += f"{eval1['judge']['name']} new ELO: {judge1_elo:.2f}\n"
        result_message += f"{eval2['judge']['name']} new ELO: {judge2_elo:.2f}\n"
        result_message += f"Test Type: {current_test_type}\n"

        return result_message
    except Exception as e:
        logger.error(f"Error handling both correct: {e}")
        return f"Error: {str(e)}"


def handle_both_incorrect(judge_manager: JudgeManager) -> str:
    """Handle case where both evaluations are incorrect."""
    global eval1, eval2, current_test_type

    try:
        if not eval1 or not eval2:
            return "Error: No evaluations available"

        # Get the input and output text that was evaluated
        input_text, output_text = "", ""
        if "input_text" in eval1 and "output_text" in eval1:
            input_text = eval1.get("input_text", "")
            output_text = eval1.get("output_text", "")

        # Update leaderboard for both judges
        updated_board = judge_manager.update_leaderboard(
            eval1["judge"]["id"],
            eval2["judge"]["id"],
            result_type="both_incorrect",
        )

        # Save to history with neither as winner
        save_to_history(
            input_text=input_text,
            output_text=output_text,
            judge1_id=eval1["judge"]["id"],
            judge1_name=eval1["judge"]["name"],
            judge1_evaluation=eval1["anonymous_evaluation"],
            judge1_time=eval1["elapsed_time"],
            judge2_id=eval2["judge"]["id"],
            judge2_name=eval2["judge"]["name"],
            judge2_evaluation=eval2["anonymous_evaluation"],
            judge2_time=eval2["elapsed_time"],
            winner_id="none",
        )

        # Construct result message with revealed judges' names
        result_message = "You selected: Both Incorrect\n\n"
        result_message += f"Evaluation 1 was by: {eval1['judge']['name']} (took {eval1['elapsed_time']:.2f} seconds)\n"
        result_message += (
            f"Evaluation 2 was by: {eval2['judge']['name']} (took {eval2['elapsed_time']:.2f} seconds)\n\n"
        )

        # Get the new ELO scores
        judge1_mask = updated_board["judge_id"] == eval1["judge"]["id"]
        judge2_mask = updated_board["judge_id"] == eval2["judge"]["id"]

        judge1_elo = updated_board[judge1_mask]["elo_score"].values[0]
        judge2_elo = updated_board[judge2_mask]["elo_score"].values[0]

        result_message += "\nBoth judges need improvement.\n"
        result_message += f"{eval1['judge']['name']} new ELO: {judge1_elo:.2f}\n"
        result_message += f"{eval2['judge']['name']} new ELO: {judge2_elo:.2f}\n"
        result_message += f"Test Type: {current_test_type}\n"

        return result_message
    except Exception as e:
        logger.error(f"Error handling both incorrect: {e}")
        return f"Error: {str(e)}"


def main():
    """Initialize the application."""
    demo = initialize()
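    # Bind to all interfaces so the app is reachable from outside the local machine
    # (e.g. when running inside a container).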
    demo.launch(server_name="0.0.0.0")


if __name__ == "__main__":
    main()