# AudioEngine / app.py
# Hugging Face Space by erydmn (commit c51cd4a, verified)
import librosa
import numpy as np
import soundfile as sf
from transformers import pipeline
import gradio as gr
import os
# Speech emotion-recognition model, loaded once at startup so the pipeline
# weights are not re-downloaded/re-initialized on every request.
# NOTE(review): assumes the checkpoint accepts 16 kHz audio file paths — the
# loader below resamples to 16 kHz, which matches; confirm against the model card.
emotion_model = pipeline("audio-classification", model="superb/hubert-large-superb-er")
def analyze_audio(file):
    """Extract rhythmic, tonal and emotional features from an audio file.

    Parameters
    ----------
    file : str
        Path to an audio file readable by librosa (Gradio passes a temp
        file path).

    Returns
    -------
    dict
        Energy, pitch, pace, pause and emotion features plus a weighted
        composite ``audio_emotion_score``.
    """
    # 1) Load audio, resampled to 16 kHz mono (the rate the emotion model expects).
    y, sr = librosa.load(file, sr=16000)

    # 2) RMS energy: mean loudness plus the number of frames spiking above
    #    twice the mean (a crude "excitement peak" count).
    rms = librosa.feature.rms(y=y)[0]
    energy_mean = float(np.mean(rms))
    energy_peaks = int(np.sum(rms > energy_mean * 2.0))

    # 3) Pitch / tone variation: spread of the YIN f0 track over 50-500 Hz.
    pitch = librosa.yin(y, fmin=50, fmax=500)
    pitch_variation = float(np.std(pitch))

    # 4) Speaking pace — rough tempo proxy in BPM, not literal words/second.
    #    Newer librosa returns tempo as a size-1 ndarray; float() on an array
    #    is deprecated/raises, so unwrap it explicitly.
    tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
    pace = float(np.atleast_1d(tempo)[0])

    # 5) Pause detection. librosa.effects.split returns the NON-silent
    #    intervals; the pauses are the gaps between consecutive intervals,
    #    i.e. len(intervals) - 1 (and 0 for fully silent / single-segment audio).
    #    The original code counted the voiced segments themselves.
    intervals = librosa.effects.split(y, top_db=30)
    num_pauses = max(len(intervals) - 1, 0)

    # 6) Emotion classification via the module-level pipeline; results are
    #    sorted by score, so [0] is the top label.
    emotions = emotion_model(file)
    top_emotion = emotions[0]["label"]
    top_emotion_score = float(emotions[0]["score"])

    # 7) Final composite emotional score.
    # NOTE(review): the terms are on very different scales (pace is BPM
    # ~60-180, the others roughly 0-1), so pace dominates the weighted sum —
    # weights kept as originally authored; confirm intended normalization.
    final_score = (
        energy_mean * 0.25 +
        pitch_variation * 0.30 +
        pace * 0.10 +
        num_pauses * 0.05 +
        top_emotion_score * 0.30
    )

    return {
        "energy_mean": energy_mean,
        "energy_peaks": energy_peaks,
        "pitch_variation": pitch_variation,
        "pace": pace,
        "num_pauses": num_pauses,
        "emotion": top_emotion,
        "emotion_confidence": top_emotion_score,
        "audio_emotion_score": final_score
    }
def process_file(audio_file):
    """Gradio callback: run the full feature analysis on the uploaded clip.

    Parameters
    ----------
    audio_file : str
        Temp-file path supplied by the ``gr.Audio(type="filepath")`` input.

    Returns
    -------
    dict
        The feature dictionary produced by ``analyze_audio``.
    """
    features = analyze_audio(audio_file)
    return features
# Gradio UI wiring: a single audio input (delivered to the callback as a
# temp-file path) mapped to a JSON view of the returned feature dict.
interface = gr.Interface(
fn=process_file,
inputs=gr.Audio(type="filepath"),
outputs="json",
title="Audio Emotion Engine",
description="Extracts emotional, rhythmic and tonal features from audio for viral segment scoring."
)
# Start the web server (blocks until the process is stopped).
interface.launch()