import librosa
import numpy as np
from transformers import pipeline
import gradio as gr

# Emotion recognition model (SUPERB emotion-recognition head on HuBERT-large)
emotion_model = pipeline("audio-classification", model="superb/hubert-large-superb-er")


def analyze_audio(file):
    # 1) Load audio, resampled to 16 kHz to match the emotion model's input rate
    y, sr = librosa.load(file, sr=16000)

    # 2) RMS energy: overall loudness plus a count of high-energy peaks
    rms = librosa.feature.rms(y=y)[0]
    energy_mean = float(np.mean(rms))
    energy_peaks = int(np.sum(rms > energy_mean * 2.0))

    # 3) Pitch / tone variation: std of frame-level f0 estimates (YIN).
    #    Pass sr explicitly; yin otherwise assumes its 22050 Hz default.
    pitch = librosa.yin(y, fmin=50, fmax=500, sr=sr)
    pitch_variation = float(np.std(pitch))

    # 4) Speaking pace: beat-tracker tempo (BPM) as a rough rhythm proxy only.
    #    This is not words per second; a true WPS estimate would require ASR.
    duration = librosa.get_duration(y=y, sr=sr)
    tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
    pace = float(np.atleast_1d(tempo)[0])

    # 5) Pause detection: split() returns the *non-silent* intervals, so the
    #    pauses are the gaps between consecutive intervals
    intervals = librosa.effects.split(y, top_db=30)
    num_pauses = max(len(intervals) - 1, 0)

    # 6) Emotion classification (pipeline results are sorted by score, highest first)
    emotions = emotion_model(file)
    top_emotion = emotions[0]["label"]
    top_emotion_score = float(emotions[0]["score"])

    # 7) Final composite emotional score. Note: the features live on very
    #    different scales (RMS roughly 0.01-0.1, pitch std in Hz, tempo in BPM),
    #    so the raw weighted sum is dominated by pitch variation and pace.
    final_score = (
        energy_mean * 0.25
        + pitch_variation * 0.30
        + pace * 0.10
        + num_pauses * 0.05
        + top_emotion_score * 0.30
    )

    return {
        "duration_sec": duration,
        "energy_mean": energy_mean,
        "energy_peaks": energy_peaks,
        "pitch_variation": pitch_variation,
        "pace": pace,
        "num_pauses": num_pauses,
        "emotion": top_emotion,
        "emotion_confidence": top_emotion_score,
        "audio_emotion_score": final_score,
    }


def process_file(audio_file):
    return analyze_audio(audio_file)


interface = gr.Interface(
    fn=process_file,
    inputs=gr.Audio(type="filepath"),
    outputs="json",
    title="Audio Emotion Engine",
    description="Extracts emotional, rhythmic, and tonal features from audio for viral segment scoring.",
)

if __name__ == "__main__":
    interface.launch()
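
# ---------------------------------------------------------------------------
# Headless usage sketch (assumption: "sample.wav" is a hypothetical local
# recording; substitute any real audio file). Calling analyze_audio() directly
# bypasses the Gradio UI, which is useful for batch scoring or quick tests:
#
#   from pprint import pprint
#   pprint(analyze_audio("sample.wav"))
# ---------------------------------------------------------------------------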