import librosa
import numpy as np
from transformers import pipeline
import gradio as gr

# Emotion recognition model (SUPERB emotion-recognition head on HuBERT-large)
emotion_model = pipeline("audio-classification", model="superb/hubert-large-superb-er")


def analyze_audio(file):
    # 1) Load audio, resampled to 16 kHz to match the emotion model's input rate
    y, sr = librosa.load(file, sr=16000)

    # 2) RMS energy: overall loudness plus a count of high-energy peaks
    rms = librosa.feature.rms(y=y)[0]
    energy_mean = float(np.mean(rms))
    energy_peaks = int(np.sum(rms > energy_mean * 2.0))

    # 3) Pitch / tone variation: std of frame-level f0 estimates (YIN).
    #    Pass sr explicitly; yin otherwise assumes its 22050 Hz default.
    pitch = librosa.yin(y, fmin=50, fmax=500, sr=sr)
    pitch_variation = float(np.std(pitch))

    # 4) Speaking pace: beat-tracker tempo (BPM) as a rough rhythm proxy only.
    #    This is not words per second; a true WPS estimate would require ASR.
    duration = librosa.get_duration(y=y, sr=sr)
    tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
    pace = float(np.atleast_1d(tempo)[0])

    # 5) Pause detection: split() returns the *non-silent* intervals, so the
    #    pauses are the gaps between consecutive intervals
    intervals = librosa.effects.split(y, top_db=30)
    num_pauses = max(len(intervals) - 1, 0)

    # 6) Emotion classification (pipeline results are sorted by score, highest first)
    emotions = emotion_model(file)
    top_emotion = emotions[0]["label"]
    top_emotion_score = float(emotions[0]["score"])

    # 7) Final composite emotional score. Note: the features live on very
    #    different scales (RMS roughly 0.01-0.1, pitch std in Hz, tempo in BPM),
    #    so the raw weighted sum is dominated by pitch variation and pace.
    final_score = (
        energy_mean * 0.25
        + pitch_variation * 0.30
        + pace * 0.10
        + num_pauses * 0.05
        + top_emotion_score * 0.30
    )

    return {
        "duration_sec": duration,
        "energy_mean": energy_mean,
        "energy_peaks": energy_peaks,
        "pitch_variation": pitch_variation,
        "pace": pace,
        "num_pauses": num_pauses,
        "emotion": top_emotion,
        "emotion_confidence": top_emotion_score,
        "audio_emotion_score": final_score,
    }


def process_file(audio_file):
    return analyze_audio(audio_file)


interface = gr.Interface(
    fn=process_file,
    inputs=gr.Audio(type="filepath"),
    outputs="json",
    title="Audio Emotion Engine",
    description="Extracts emotional, rhythmic, and tonal features from audio for viral segment scoring.",
)

if __name__ == "__main__":
    interface.launch()
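
# ---------------------------------------------------------------------------
# Headless usage sketch (assumption: "sample.wav" is a hypothetical local
# recording; substitute any real audio file). Calling analyze_audio() directly
# bypasses the Gradio UI, which is useful for batch scoring or quick tests:
#
#   from pprint import pprint
#   pprint(analyze_audio("sample.wav"))
# ---------------------------------------------------------------------------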