# roar-companion / app.py
# munyew's picture
# Upload app.py with huggingface_hub
# 78d4ef4 verified
"""
app.py
ROAR Companion — HuggingFace Space
MERaLiON-2-3B + Multi-Agent (Scam/Calendar/Food) + Warm Singlish Companion
Runs on HuggingFace Persistent GPU (T4 Small)
"""
import asyncio
import base64
import datetime
import html
import json
import os
import re
import tempfile
import traceback
import urllib.parse
import urllib.request
import uuid
from pathlib import Path

import gradio as gr
import numpy as np
import soundfile as sf
import torch
# ─────────────────────────────────────────────────────────────────────────────
# CONFIG — set via HuggingFace Space Secrets
# ─────────────────────────────────────────────────────────────────────────────
# Secrets are read once, at import time. Each one defaults to an empty
# string when the environment variable is not set.
TELEGRAM_TOKEN = os.getenv("TELEGRAM_TOKEN", "")
TELEGRAM_CHAT_ID = os.getenv("TELEGRAM_CHAT_ID", "")
GOOGLE_CREDS_JSON = os.getenv("GOOGLE_CREDS_JSON", "")  # JSON string

# Voice names handed to the text-to-speech step (English / Chinese replies).
TTS_VOICE_EN = "en-SG-WayneNeural"
TTS_VOICE_ZH = "zh-SG-WanLungNeural"

# Model repository id passed to `from_pretrained`.
MODEL_ID = "MERaLiON/MERaLiON-2-3B"

# Instruction text placed at the start of every companion prompt.
COMPANION_PERSONA = "\n".join((
    "You are a warm, helpful AI companion for Singaporeans.",
    "You understand Singlish, code-switching between English, Mandarin, Malay, and Tamil.",
    "You speak naturally, like a friendly Singaporean friend — not stiff or formal.",
    "Keep responses concise and conversational — 1 to 3 sentences.",
    "If the user seems stressed, acknowledge their feelings first.",
))
# ─────────────────────────────────────────────────────────────────────────────
# LOAD MERALION
# ─────────────────────────────────────────────────────────────────────────────
# Load the processor and model once at import time. On any failure both
# names are set to None, which the inference code checks before running.
print("=" * 60, "Loading MERaLiON-2-3B from HuggingFace...", "=" * 60, sep="\n")
try:
    # Imported inside the try so an import failure takes the same fallback
    # path as a failed download.
    from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq

    processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.float16,
        device_map="cuda" if torch.cuda.is_available() else "cpu",
        trust_remote_code=True,
        use_safetensors=True,
    )
    model.eval()
    device = "CUDA" if torch.cuda.is_available() else "CPU"
    print(f"MERaLiON-2-3B loaded on {device}.")
except Exception as e:
    print(f"Failed to load MERaLiON: {e}")
    model = processor = None
# ─────────────────────────────────────────────────────────────────────────────
# SESSION STORE
# ─────────────────────────────────────────────────────────────────────────────
# In-memory store: session id -> {"history": [...], "pending_action": ...}.
sessions = {}


def get_session(session_id):
    """Return the session dict for *session_id*, creating it on first use.

    A new session starts with an empty ``history`` list and no
    ``pending_action``. Repeated calls with the same id return the same dict.
    """
    return sessions.setdefault(
        session_id, {"history": [], "pending_action": None}
    )
def add_to_history(session, role, text):
    """Append one turn to the session history, keeping only the last six.

    Args:
        session: session dict holding a ``"history"`` list.
        role: speaker label stored with the turn.
        text: what was said.
    """
    turns = session["history"]
    turns.append({"role": role, "text": text})
    if len(turns) > 6:
        # Rebind to a trimmed copy so only the six newest turns remain.
        session["history"] = turns[-6:]
def format_history(session):
    """Render the last four turns as ``"<Speaker>: <text>"`` lines.

    Turns whose role is ``"user"`` are labelled ``User``; every other role is
    labelled ``Companion``. Returns ``""`` when there is no history,
    otherwise the lines joined by newlines with a trailing newline.
    """
    recent = session["history"][-4:]
    if not recent:
        return ""
    rendered = [
        f"{'User' if turn['role'] == 'user' else 'Companion'}: {turn['text']}"
        for turn in recent
    ]
    return "\n".join(rendered) + "\n"
# ─────────────────────────────────────────────────────────────────────────────
# INTENT DETECTION
# ─────────────────────────────────────────────────────────────────────────────
YES_WORDS = ["yes","ya","yep","yeah","yah","ok","okay","sure","can","can lah",
             "ok lah","yes lah","go ahead","do it","please","confirm","alright",
             "fine","why not","of course","ya can","can can","shiok","好","好的","可以"]
NO_WORDS = ["no","nope","nah","no need","don't","dont","cancel","never mind",
            "nevermind","no lah","nah lah","forget it","skip","not now","later",
            "no thanks","不用","不要","算了"]
SCAM_KEYWORDS = ["scam","kena scam","suspicious","fraud","phishing","transfer money",
                 "bank account","otp","lucky draw","prize","win","stranger called",
                 "unknown number","weird sms","suspicious link","lost money","cheated"]
FOOD_HINTS = ["hungry","makan","eat","food","hungry lah","very hungry","starving",
              "want to eat","craving","order food","grab food","supper","lunch",
              "dinner","breakfast","没吃","饿"]
FOOD_ITEMS = ["chicken rice","nasi lemak","wonton mee","char kway teow","bak chor mee",
              "roti prata","laksa","mee goreng","pizza","burger","chicken noodles",
              "fried rice","noodles","rice","chicken","fish","beef","prawn","prata","mee",
              "porridge","ban mian","hokkien mee","satay","bak kut teh"]
CALENDAR_HINTS = ["tomorrow","next week","monday","tuesday","wednesday","thursday","friday",
                  "saturday","sunday","tonight","this afternoon","this morning","this evening",
                  "must go","need to go","have to go","going to","got to"]
CALENDAR_ACTIONS = ["schedule","meeting","appointment","book","remind","set reminder",
                    "add to calendar","block out","calendar","add event","create event"]


def _compile_keywords(keywords, whole_word):
    """Build one regex that matches any of *keywords* in lower-cased text.

    ASCII keywords must start at a word boundary, so "can" no longer matches
    inside "cancel" and "eat" no longer matches inside "great". With
    ``whole_word=True`` they must also end at a word boundary. With
    ``whole_word=False`` inflected forms such as "scammed" or "eating" still
    match. Non-ASCII (Chinese) keywords keep plain substring matching,
    because that text has no spaces between words.
    """
    alternatives = []
    for keyword in keywords:
        pattern = re.escape(keyword)
        if keyword.isascii():
            pattern = r"(?<!\w)" + pattern
            if whole_word:
                pattern += r"(?!\w)"
        alternatives.append(pattern)
    return re.compile("|".join(alternatives))


_YES_RE = _compile_keywords(YES_WORDS, whole_word=True)
_NO_RE = _compile_keywords(NO_WORDS, whole_word=True)
_SCAM_RE = _compile_keywords(SCAM_KEYWORDS, whole_word=False)
_FOOD_HINT_RE = _compile_keywords(FOOD_HINTS, whole_word=False)


def is_confirmation(t):
    """Return True when *t* contains a yes-word and no rejection word.

    A reply that mixes both (e.g. "no thanks, it's fine") is not treated as a
    confirmation, so an ambiguous answer never triggers a pending action.
    """
    lowered = t.lower()
    return _YES_RE.search(lowered) is not None and _NO_RE.search(lowered) is None


def is_rejection(t):
    """Return True when *t* contains one of NO_WORDS as a whole word."""
    return _NO_RE.search(t.lower()) is not None


def detect_scam(t):
    """Return True when a word in *t* starts with one of SCAM_KEYWORDS."""
    return _SCAM_RE.search(t.lower()) is not None


def detect_food_hint(t):
    """Return True when a word in *t* starts with one of FOOD_HINTS."""
    return _FOOD_HINT_RE.search(t.lower()) is not None
def detect_food_item(t):
    """Return the dish mentioned in *t* in title case, or None.

    Known dishes from FOOD_ITEMS are tried longest first, so "hokkien mee"
    wins over the bare "mee". Each must appear as a whole word, optionally
    pluralised, so "meeting" or "price" no longer count as "mee" or "rice".
    If no known dish is present, the text after a food verb
    ("want", "order", "eat", ...) is used when it is 3-29 characters long.
    """
    tl = t.lower()
    for item in sorted(FOOD_ITEMS, key=len, reverse=True):
        if re.search(r"(?<!\w)" + re.escape(item) + r"(?:e?s)?(?!\w)", tl):
            return item.title()
    # \b stops the verb list matching inside other words ("great" -> "eat").
    m = re.search(
        r'\b(?:want|order|eat|have|get|bring me)\s+(?:some\s+)?(.+?)(?:\s*$|[.,])',
        tl,
    )
    if m:
        c = m.group(1).strip()
        if 2 < len(c) < 30:
            return c.title()
    return None
def detect_calendar_hint(t):
    """Classify how strongly *t* suggests a calendar entry.

    Returns ``"strong"`` when the text contains both a CALENDAR_HINTS phrase
    and a CALENDAR_ACTIONS phrase, ``"weak"`` when it contains only a
    CALENDAR_HINTS phrase, and ``None`` otherwise.
    """
    lowered = t.lower()
    if not any(hint in lowered for hint in CALENDAR_HINTS):
        return None
    mentions_action = any(action in lowered for action in CALENDAR_ACTIONS)
    return "strong" if mentions_action else "weak"
# ─────────────────────────────────────────────────────────────────────────────
# MERALION INFERENCE
# ─────────────────────────────────────────────────────────────────────────────
def clean_text(text):
    """Strip model markers and prompt echoes from generated text.

    Removes the special-token strings listed below, drops blank lines and
    lines starting with ``User:`` or ``* ``, removes ``Companion:`` from
    lines that start with it, and joins the rest with single spaces.
    Returns ``""`` for empty or None input.
    """
    if not text:
        return ""
    markers = (
        "<|audio|>", "<|chat|>", "<|text|>", "<|", "|>", "<SpeechHere>",
        "<Speaker1>", "<Speaker2>", "<bos>", "<eos>",
    )
    for marker in markers:
        text = text.replace(marker, "")
    kept = []
    for raw in text.split("\n"):
        piece = raw.strip()
        if not piece or piece.startswith(("User:", "* ")):
            continue
        if piece.startswith("Companion:"):
            piece = piece.replace("Companion:", "").strip()
        kept.append(piece)
    return " ".join(kept).strip()
def run_meralion(audio_array, query, max_tokens=150):
    """Run the model on an audio clip with a text instruction.

    Args:
        audio_array: audio samples, passed to the processor with
            ``sampling_rate=16000``.
        query: instruction inserted into the prompt template.
        max_tokens: cap on newly generated tokens.

    Returns:
        The decoded, cleaned reply, or ``""`` when the model or processor
        failed to load.
    """
    if model is None or processor is None:
        return ""

    template = "Instruction: {query} \nFollow the text instruction based on the following audio: <SpeechHere>"
    messages = [{"role": "user", "content": template.format(query=query)}]
    prompt = processor.tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    if isinstance(prompt, list):
        prompt = prompt[0]

    encoded = processor(text=prompt, audios=audio_array, sampling_rate=16000)
    target = next(model.parameters()).device
    batch = {}
    for key, value in encoded.items():
        # Only values with a .to() method are moved to the model's device.
        batch[key] = value.to(target) if hasattr(value, "to") else value
    if "input_features" in batch:
        batch["input_features"] = batch["input_features"].to(torch.float16)

    with torch.no_grad():
        generated = model.generate(
            **batch,
            max_new_tokens=max_tokens,
            do_sample=True,
            temperature=0.75,
            top_p=0.9,
            repetition_penalty=1.1,
        )
    # Drop the prompt tokens so only the newly generated part is decoded.
    prompt_len = batch["input_ids"].shape[1]
    decoded = processor.batch_decode(generated[:, prompt_len:], skip_special_tokens=True)
    return clean_text(decoded[0].strip())
def transcribe(audio_array):
    """Ask the model for a transcript of *audio_array* (up to 200 new tokens)."""
    instruction = "Please transcribe what the user said accurately, including Singlish, Malay, Mandarin or Tamil words."
    return run_meralion(audio_array, instruction, max_tokens=200)
def companion_reply(audio_array, transcript, session, extra=""):
    """Generate a short companion reply to the user's latest utterance.

    The prompt is built from the persona, the recent history (if any), the
    optional *extra* context line and the transcript. Replies of five
    characters or fewer are replaced by a stock phrase.
    """
    sections = [f"{COMPANION_PERSONA}\n\n"]
    past = format_history(session)
    if past:
        sections.append(f"Conversation so far:\n{past}\n")
    if extra:
        sections.append(f"{extra}\n")
    sections.append(f"The user just said: '{transcript}'. Reply warmly in 1-2 sentences.")

    reply = run_meralion(audio_array, "".join(sections))
    if len(reply) > 5:
        return reply
    return "Wah, tell me more leh!"
# ─────────────────────────────────────────────────────────────────────────────
# TELEGRAM
# ─────────────────────────────────────────────────────────────────────────────
def send_telegram(message):
    """Send *message* to the configured Telegram chat using HTML parse mode.

    The parameters are sent as a form-encoded POST body, so the message text
    stays out of the URL and the chat id is encoded properly. The response is
    closed as soon as the request completes.

    Args:
        message: text to send. It is parsed as Telegram HTML, so any literal
            ``<``, ``>`` or ``&`` must already be escaped by the caller.

    Returns:
        True when the request completed without raising; False when the
        token or chat id is not configured or the request failed.
    """
    if not TELEGRAM_TOKEN or not TELEGRAM_CHAT_ID:
        return False
    try:
        url = f"https://api.telegram.org/bot{TELEGRAM_TOKEN}/sendMessage"
        payload = urllib.parse.urlencode({
            "chat_id": TELEGRAM_CHAT_ID,
            "text": message,
            "parse_mode": "HTML",
        }).encode("utf-8")
        # Passing data= makes urlopen issue a POST; the with-block closes
        # the connection instead of leaving it to the garbage collector.
        with urllib.request.urlopen(url, data=payload, timeout=10):
            pass
        return True
    except Exception as e:
        # Best effort: a failed notification must not break the conversation.
        print(f"Telegram error: {e}")
        return False
# ─────────────────────────────────────────────────────────────────────────────
# EVENT EXTRACTION
# ─────────────────────────────────────────────────────────────────────────────
def extract_event(transcript):
    """Guess an event title, date and start time from a spoken request.

    Date rules, first match wins: "tomorrow"; a weekday name (its next
    occurrence, never today); "next week" (today + 7); "today", "tonight" or
    "this morning/afternoon/evening" (today); otherwise tomorrow. "Today" is
    taken in Singapore time (UTC+8), the zone the calendar events are
    created in.

    Time rules: an explicit "3pm" / "10:30 am" style time with hour at most
    12 and minutes at most 59; otherwise a part-of-day default (morning
    09:00, afternoon 14:00, evening/tonight 19:00, noon 12:00); otherwise
    10:00.

    Returns:
        ``{"title": str, "date": "YYYY-MM-DD", "time": "HH:MM"}``.
    """
    tl = transcript.lower()
    sgt = datetime.timezone(datetime.timedelta(hours=8))
    today = datetime.datetime.now(sgt).date()
    one_day = datetime.timedelta(days=1)

    # --- date -------------------------------------------------------------
    weekdays = ("monday", "tuesday", "wednesday", "thursday",
                "friday", "saturday", "sunday")
    today_words = ("today", "tonight", "this morning",
                   "this afternoon", "this evening")
    if "tomorrow" in tl:
        event_date = today + one_day
    else:
        for index, name in enumerate(weekdays):
            if name in tl:
                # "or 7": naming today's weekday means next week's.
                ahead = (index - today.weekday()) % 7 or 7
                event_date = today + datetime.timedelta(days=ahead)
                break
        else:
            if "next week" in tl:
                event_date = today + datetime.timedelta(days=7)
            elif any(word in tl for word in today_words):
                event_date = today
            else:
                event_date = today + one_day

    # --- time -------------------------------------------------------------
    event_time = None
    m = re.search(r'(\d{1,2})(?::(\d{2}))?\s*(am|pm)\b', tl)
    if m:
        h = int(m.group(1))
        mn = int(m.group(2) or 0)
        # Ignore impossible clock readings such as "25pm" or "3:75pm".
        if h <= 12 and mn <= 59:
            if m.group(3) == "pm" and h != 12:
                h += 12
            elif m.group(3) == "am" and h == 12:
                h = 0
            event_time = f"{h:02d}:{mn:02d}"
    if event_time is None:
        if "morning" in tl:
            event_time = "09:00"
        elif "afternoon" in tl:
            event_time = "14:00"
        elif "evening" in tl or "tonight" in tl:
            event_time = "19:00"
        elif "noon" in tl:
            event_time = "12:00"
        else:
            event_time = "10:00"

    # --- title ------------------------------------------------------------
    title = "Reminder"
    # Run on the original casing: a capitalised word after "with"/"meet" is
    # taken to be a person's name.
    wm = re.search(r'\b(?:with|meet)\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)?)', transcript)
    if wm:
        title = f"Meeting with {wm.group(1)}"
    elif "meeting" in tl:
        title = "Meeting"
    elif "lunch" in tl:
        title = "Lunch"
    elif "dinner" in tl:
        title = "Dinner"
    elif "office" in tl:
        title = "Back to Office"
    elif "call" in tl:
        title = "Call"
    return {"title": title, "date": str(event_date), "time": event_time}
# ─────────────────────────────────────────────────────────────────────────────
# AGENT EXECUTORS
# ─────────────────────────────────────────────────────────────────────────────
def execute_scam(transcript):
    """Send a scam alert over Telegram and return advice for the user.

    Looks for a Singapore-style phone number in *transcript* (eight digits
    starting with 6, 8 or 9, optionally prefixed by 65 or +65) and includes
    it in the alert when found. The transcript excerpt is HTML-escaped
    because the alert is sent in Telegram's HTML parse mode, where a stray
    ``<`` or ``&`` would make the whole message invalid.
    """
    pm = re.search(r'(?<!\w)(?:\+?65[\s-]?)?[689]\d{7}(?!\w)', transcript)
    phone = pm.group(0).strip() if pm else "unknown number"
    snippet = html.escape(transcript[:100], quote=False)
    send_telegram(f"🚨 <b>ROAR Scam Alert</b>\n\n📞 {phone}\n💬 \"{snippet}\"\n\n⚠️ Call ScamShield 1799")
    if phone != "unknown number":
        return f"🛡️ I've flagged {phone} and sent an alert. Don't transfer money or share your OTP. Call ScamShield at 1799 now!"
    return "🛡️ Alamak, sounds very suspicious! I've logged a scam alert. Don't transfer any money — call ScamShield at 1799 immediately!"
def execute_food(food_item):
    """Announce a food order over Telegram and return a confirmation line.

    This function only sends a Telegram notification; it does not call any
    ordering service. The order time and a 25-minute ETA are both derived
    from one clock reading in Singapore time (UTC+8), so they do not depend
    on the host's timezone. The item name is HTML-escaped for Telegram's
    HTML parse mode.
    """
    sgt = datetime.timezone(datetime.timedelta(hours=8))
    placed_at = datetime.datetime.now(sgt)
    ot = placed_at.strftime("%H:%M")
    eta = (placed_at + datetime.timedelta(minutes=25)).strftime("%H:%M")
    safe_item = html.escape(str(food_item), quote=False)
    send_telegram(f"🍜 <b>ROAR Food Order!</b>\n\n🛒 {safe_item}\n⏰ Ordered: {ot}\n🚴 ETA: {eta}\n\n✅ Confirmed via ROAR!")
    return f"🍜 Shiok! Order placed for {food_item}. ETA {eta}. Check Telegram for confirmation!"
def execute_calendar(transcript):
    """Create a one-hour Google Calendar event parsed from *transcript*.

    Credentials come from a pickled token cache at ``/tmp/token.pickle``.
    They are refreshed when expired, or obtained through an OAuth flow built
    from GOOGLE_CREDS_JSON. On success the event is inserted into the primary
    calendar and a Telegram confirmation is sent. On any failure, including
    missing Google libraries or credentials, a Telegram note asking for
    manual entry is sent instead.

    Returns:
        A user-facing status message for either outcome.
    """
    event = extract_event(transcript)
    try:
        from google_auth_oauthlib.flow import InstalledAppFlow
        from google.auth.transport.requests import Request
        from googleapiclient.discovery import build
        import pickle

        SCOPES = ['https://www.googleapis.com/auth/calendar']
        token_path = '/tmp/token.pickle'
        creds = None
        if os.path.exists(token_path):
            # NOTE(security): unpickling runs arbitrary code if this file was
            # written by someone else; /tmp is commonly shared. Consider a
            # JSON token file in a private directory.
            with open(token_path, 'rb') as f:
                creds = pickle.load(f)
        if not creds or not creds.valid:
            if creds and creds.expired and creds.refresh_token:
                creds.refresh(Request())
            elif GOOGLE_CREDS_JSON:
                with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f:
                    f.write(GOOGLE_CREDS_JSON)
                    creds_file = f.name
                try:
                    flow = InstalledAppFlow.from_client_secrets_file(creds_file, SCOPES)
                    # NOTE(review): run_local_server expects an interactive
                    # browser login; presumably this cannot complete on a
                    # headless host - confirm how the token is provisioned.
                    creds = flow.run_local_server(port=0)
                finally:
                    # Never leave the client secret on disk, even when the
                    # OAuth flow raises.
                    os.unlink(creds_file)
            if creds is None:
                # Nothing to cache or use; take the manual-entry fallback.
                raise RuntimeError("No Google credentials available")
            with open(token_path, 'wb') as f:
                pickle.dump(creds, f)
        service = build('calendar', 'v3', credentials=creds)
        start_dt = datetime.datetime.strptime(f"{event['date']} {event['time']}", "%Y-%m-%d %H:%M")
        end_dt = start_dt + datetime.timedelta(hours=1)
        cal_event = {
            'summary': event['title'],
            'start': {'dateTime': start_dt.isoformat(), 'timeZone': 'Asia/Singapore'},
            'end': {'dateTime': end_dt.isoformat(), 'timeZone': 'Asia/Singapore'},
        }
        service.events().insert(calendarId='primary', body=cal_event).execute()
        send_telegram(f"📅 <b>ROAR Calendar</b>\n\n✅ {event['title']}\n🗓 {event['date']} at {event['time']}")
        return f"📅 Done lah! '{event['title']}' added to your Google Calendar on {event['date']} at {event['time']}!"
    except Exception as e:
        # Deliberate best effort: any calendar problem degrades to a
        # Telegram reminder rather than an error shown to the user.
        print(f"Calendar error: {e}")
        send_telegram(f"📅 <b>ROAR Calendar</b>\n📌 {event['title']}\n🗓 {event['date']} at {event['time']}\n⚠️ Add manually")
        return f"📅 Noted! '{event['title']}' on {event['date']} at {event['time']}. Check Telegram!"
# ─────────────────────────────────────────────────────────────────────────────
# CONVERSATION HANDLER
# ─────────────────────────────────────────────────────────────────────────────
def _settle_pending_action(audio_array, transcript, session):
    """Resolve a pending action against the user's latest reply.

    Returns a ``(reply, agent_name)`` tuple when the reply confirmed or
    declined the pending action, or ``None`` when normal intent detection
    should run (no pending action, an unrelated reply, or a confirmed action
    of an unrecognised type). The pending action is cleared in every case
    except when a dish still has to be named.
    """
    awaiting = session.get("pending_action")
    if not awaiting:
        return None

    if is_confirmation(transcript):
        session["pending_action"] = None
        kind = awaiting["type"]
        if kind == "scam":
            return execute_scam(awaiting["transcript"]), "Scam Agent"
        if kind == "food":
            dish = awaiting.get("food_item") or detect_food_item(transcript) or "Your order"
            return execute_food(dish), "Food Agent"
        if kind == "food_item_needed":
            dish = detect_food_item(transcript)
            if dish:
                return execute_food(dish), "Food Agent"
            # Confirmed but still no dish named: keep asking.
            session["pending_action"] = {"type": "food_item_needed", "transcript": transcript}
            return "What would you like ah? Chicken rice? Nasi lemak?", "MERaLiON"
        if kind == "calendar":
            return execute_calendar(awaiting["transcript"]), "Calendar Agent"
        return None

    if is_rejection(transcript):
        session["pending_action"] = None
        declined = companion_reply(audio_array, transcript, session, "User declined your suggestion. ")
        return declined or "Ok lah, no problem! Anything else?", "MERaLiON"

    # Neither yes nor no: drop the pending action and treat this as a new turn.
    session["pending_action"] = None
    return None


def handle_conversation(audio_array, transcript, session):
    """Route one user turn to an agent or to the companion model.

    A pending action is resolved first. Otherwise intents are tried in this
    order: scam (executed immediately), strong calendar hint, named food
    item, general hunger hint, weak calendar hint, then free chat. The
    calendar and food branches store a pending action and ask the user to
    confirm.

    Returns:
        ``(reply_text, agent_name)``.
    """
    outcome = _settle_pending_action(audio_array, transcript, session)
    if outcome is not None:
        return outcome

    if detect_scam(transcript):
        return execute_scam(transcript), "Scam Agent"

    strength = detect_calendar_hint(transcript)
    if strength == "strong":
        details = extract_event(transcript)
        session["pending_action"] = {"type": "calendar", "transcript": transcript}
        return f"📅 Want me to add '{details['title']}' to your calendar on {details['date']} at {details['time']}?", "MERaLiON"

    dish = detect_food_item(transcript)
    if dish:
        session["pending_action"] = {"type": "food", "transcript": transcript, "food_item": dish}
        return f"🍜 {dish} sounds good leh! Want me to order it for you?", "MERaLiON"

    if detect_food_hint(transcript):
        session["pending_action"] = {"type": "food_item_needed", "transcript": transcript}
        return "Aiyo hungry ah! What you feel like eating?", "MERaLiON"

    if strength == "weak":
        details = extract_event(transcript)
        session["pending_action"] = {"type": "calendar", "transcript": transcript}
        return f"Wah sounds like you got something on {details['date']} {details['time']}. Want me to add to your calendar?", "MERaLiON"

    return companion_reply(audio_array, transcript, session) or "Tell me more leh!", "MERaLiON"
# ─────────────────────────────────────────────────────────────────────────────
# TTS
# ─────────────────────────────────────────────────────────────────────────────
def detect_voice(text):
    """Pick the TTS voice for *text*.

    Returns the Chinese voice when more than half of the non-whitespace
    characters fall in the U+4E00-U+9FFF range, and the English voice
    otherwise (including for empty or whitespace-only text).
    """
    visible = [ch for ch in text if ch.strip()]
    if not visible:
        return TTS_VOICE_EN
    han = sum('\u4e00' <= ch <= '\u9fff' for ch in text)
    if han / len(visible) > 0.5:
        return TTS_VOICE_ZH
    return TTS_VOICE_EN
async def tts_async(text, voice):
    """Synthesise *text* with edge-tts and return the audio as bytes.

    Audio chunks are collected in a list and joined once at the end, which
    avoids re-copying the growing buffer on every chunk.
    """
    import edge_tts

    communicate = edge_tts.Communicate(text, voice)
    audio_chunks = [
        chunk["data"]
        async for chunk in communicate.stream()
        if chunk["type"] == "audio"
    ]
    return b"".join(audio_chunks)
def tts(text):
    """Convert *text* to speech and return the audio bytes.

    Characters outside the allowed set (word characters, whitespace and
    basic punctuation) are removed first; text shorter than three characters
    becomes "Done.". If synthesis fails, it is retried once with the
    U+4E00-U+9FFF and U+3000-U+303F characters removed, using the English
    voice. A failure of that retry propagates to the caller.
    """
    spoken = re.sub(r'[^\w\s\.,!?\-\'\":。,!?]', '', text).strip()
    if len(spoken) < 3:
        spoken = "Done."
    chosen_voice = detect_voice(spoken)
    try:
        return asyncio.run(tts_async(spoken, chosen_voice))
    except Exception:
        fallback = re.sub(r'[\u4e00-\u9fff\u3000-\u303f]', '', spoken).strip()
        if len(fallback) < 3:
            fallback = "I hear you. Tell me more."
        return asyncio.run(tts_async(fallback, TTS_VOICE_EN))
# ─────────────────────────────────────────────────────────────────────────────
# GRADIO INTERFACE
# ─────────────────────────────────────────────────────────────────────────────
def _to_mono_float32(samples):
    """Downmix to one channel and scale to float32 in roughly [-1, 1].

    Integer PCM is scaled by its dtype's full range. Float input is left
    alone unless its peak magnitude exceeds 1.0, in which case it is assumed
    to hold int16-range values and divided by 32768.
    """
    samples = np.asarray(samples)
    is_pcm_int = np.issubdtype(samples.dtype, np.integer)
    full_scale = float(np.iinfo(samples.dtype).max) + 1.0 if is_pcm_int else None
    if samples.ndim > 1:
        # The channel axis is taken to be axis 1, i.e. (samples, channels).
        samples = samples.mean(axis=1)
    samples = samples.astype(np.float32)
    if is_pcm_int:
        samples /= full_scale
    elif samples.size and np.abs(samples).max() > 1.0:
        samples /= 32768.0
    return samples


def _resample_to_16k(samples, sr, target_sr=16000):
    """Resample mono float audio from *sr* to *target_sr* (default 16 kHz).

    Uses librosa when it is installed. Otherwise falls back to linear
    interpolation: lower quality, but the result has the sample rate the
    model is told it has.
    """
    if sr == target_sr or samples.size == 0:
        return samples
    try:
        import librosa
    except ImportError:
        n_out = max(1, int(round(samples.size * target_sr / sr)))
        src_t = np.arange(samples.size, dtype=np.float64) / sr
        dst_t = np.arange(n_out, dtype=np.float64) / target_sr
        return np.interp(dst_t, src_t, samples).astype(np.float32)
    resampled = librosa.resample(samples, orig_sr=sr, target_sr=target_sr)
    return resampled.astype(np.float32)


def process_voice(audio, session_state):
    """Main Gradio handler — takes audio, returns text response + audio.

    Args:
        audio: ``(sample_rate, numpy_array)`` from the microphone component,
            or None when nothing was recorded.
        session_state: per-browser state dict; its ``"id"`` selects the
            server-side session.

    Returns:
        ``(session_state, markdown_reply, audio_file_path_or_None,
        transcript)``. If speech synthesis fails, the text reply is still
        returned with no audio.
    """
    if audio is None:
        return session_state, "Please hold and speak...", None, ""
    session_id = session_state.get("id", "default")
    session = get_session(session_id)
    try:
        sr, audio_array = audio
        # Downmix before resampling so resampling runs along the time axis.
        audio_array = _to_mono_float32(audio_array)
        if audio_array.size == 0:
            return session_state, "Please hold and speak...", None, ""
        audio_array = _resample_to_16k(audio_array, sr)

        # Transcribe + respond
        transcript = transcribe(audio_array)
        response_text, agent_name = handle_conversation(audio_array, transcript, session)
        add_to_history(session, "user", transcript)
        add_to_history(session, "assistant", response_text)
        display = f"**[{agent_name}]** {response_text}\n\n*You said: \"{transcript}\"*"

        # TTS — a failure here must not hide the reply, since an agent
        # action may already have been carried out.
        audio_path = None
        try:
            tts_text = response_text if len(response_text) >= 5 else "Done."
            audio_bytes = tts(tts_text)
            # delete=False because Gradio reads the file after this returns.
            with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
                f.write(audio_bytes)
                audio_path = f.name
        except Exception:
            traceback.print_exc()
        return session_state, display, audio_path, transcript
    except Exception as e:
        traceback.print_exc()
        return session_state, f"Sorry, something went wrong: {str(e)}", None, ""
# Build Gradio UI
with gr.Blocks(
    title="ROAR — MERaLiON Companion",
    theme=gr.themes.Base(),
    css="""
.gradio-container { max-width: 600px; margin: auto; }
.title { text-align: center; color: #6B2D8B; }
.subtitle { text-align: center; color: #888; margin-bottom: 20px; }
"""
) as demo:
    gr.Markdown("# 🦁 ROAR — MERaLiON Companion", elem_classes="title")
    gr.Markdown(
        "Speak in Singlish, Mandarin, Malay or Tamil. "
        "ROAR understands you and routes to the right agent.",
        elem_classes="subtitle"
    )
    # A callable initial value gives every browser session its own id, so
    # conversation history and pending actions are no longer shared between
    # visitors. Server-side session entries are never evicted.
    session_state = gr.State(lambda: {"id": uuid.uuid4().hex})
    with gr.Row():
        audio_input = gr.Audio(
            sources=["microphone"],
            type="numpy",
            label="🎤 Hold to speak",
        )
    response_box = gr.Markdown("*Waiting for your voice...*")
    audio_output = gr.Audio(label="🔊 MERaLiON responds", autoplay=True)
    transcript_box = gr.Textbox(label="📝 What MERaLiON heard", interactive=False)
    gr.Markdown("""
**Try saying:**
- *"Aiyo so sian today lah"* → Companion chat
- *"I very hungry, want chicken rice"* → 🍜 Food Agent
- *"Schedule meeting with Lawrence tomorrow 3pm"* → 📅 Calendar Agent
- *"I think I kena scam"* → 🛡️ Scam Agent
""")
    # Fires whenever the recording changes, including when it is cleared
    # (the handler then receives None).
    audio_input.change(
        fn=process_voice,
        inputs=[audio_input, session_state],
        outputs=[session_state, response_box, audio_output, transcript_box]
    )

if __name__ == "__main__":
    demo.launch()