"""
app.py
ROAR Companion — HuggingFace Space
MERaLiON-2-3B + Multi-Agent (Scam/Calendar/Food) + Warm Singlish Companion
Runs on HuggingFace Persistent GPU (T4 Small)
"""
|
|
import asyncio
import base64
import datetime
import html
import json
import os
import re
import tempfile
import traceback
import urllib.parse
import urllib.request
import uuid
from pathlib import Path

import gradio as gr
import numpy as np
import soundfile as sf
import torch
|
|
| |
| |
| |
# Integration secrets are read from the environment; an empty string means
# the integration is not configured.
TELEGRAM_TOKEN = os.getenv("TELEGRAM_TOKEN", "")
TELEGRAM_CHAT_ID = os.getenv("TELEGRAM_CHAT_ID", "")
GOOGLE_CREDS_JSON = os.getenv("GOOGLE_CREDS_JSON", "")

# Text-to-speech voices (English / Mandarin) and the speech-language model.
TTS_VOICE_EN = "en-SG-WayneNeural"
TTS_VOICE_ZH = "zh-SG-WanLungNeural"
MODEL_ID = "MERaLiON/MERaLiON-2-3B"

# Persona text that is prepended to every companion prompt.
COMPANION_PERSONA = "\n".join((
    "You are a warm, helpful AI companion for Singaporeans.",
    "You understand Singlish, code-switching between English, Mandarin, Malay, and Tamil.",
    "You speak naturally, like a friendly Singaporean friend — not stiff or formal.",
    "Keep responses concise and conversational — 1 to 3 sentences.",
    "If the user seems stressed, acknowledge their feelings first.",
))
|
|
| |
| |
| |
# Load the speech-language model once at import time. Any failure (missing
# package, download error, out of memory, ...) is reported and leaves both
# `model` and `processor` as None so the app can still start.
for _line in ("=" * 60, "Loading MERaLiON-2-3B from HuggingFace...", "=" * 60):
    print(_line)

try:
    from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor

    processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
    _load_kwargs = {
        "torch_dtype": torch.float16,
        "device_map": "cuda" if torch.cuda.is_available() else "cpu",
        "trust_remote_code": True,
        "use_safetensors": True,
    }
    model = AutoModelForSpeechSeq2Seq.from_pretrained(MODEL_ID, **_load_kwargs)
    model.eval()
    device = "CUDA" if torch.cuda.is_available() else "CPU"
    print(f"MERaLiON-2-3B loaded on {device}.")
except Exception as e:
    print(f"Failed to load MERaLiON: {e}")
    model = None
    processor = None
|
|
| |
| |
| |
# In-memory conversation state, keyed by session id.
sessions = {}


def get_session(session_id):
    """Return the state dict for *session_id*, creating it on first use.

    A new session starts with an empty ``history`` list and no
    ``pending_action``. The same dict object is returned on later calls, so
    callers mutate it in place.
    """
    try:
        return sessions[session_id]
    except KeyError:
        fresh = {"history": [], "pending_action": None}
        sessions[session_id] = fresh
        return fresh
|
|
def add_to_history(session, role, text):
    """Append one turn to ``session["history"]``, keeping only the last six."""
    turns = session["history"]
    turns.append({"role": role, "text": text})
    if len(turns) > 6:
        # Rebind to a trimmed copy so the stored history never exceeds six turns.
        session["history"] = turns[-6:]
|
|
def format_history(session):
    """Render the last four turns as ``"User: ..."`` / ``"Companion: ..."`` lines.

    Returns an empty string when there is no history; otherwise every turn is
    on its own line and the result ends with a newline.
    """
    recent = session["history"][-4:]
    if not recent:
        return ""
    rendered = []
    for turn in recent:
        speaker = "User" if turn["role"] == "user" else "Companion"
        rendered.append(f"{speaker}: {turn['text']}\n")
    return "".join(rendered)
|
|
| |
| |
| |
# Short replies that accept / decline a pending suggestion.
YES_WORDS = ["yes","ya","yep","yeah","yah","ok","okay","sure","can","can lah",
             "ok lah","yes lah","go ahead","do it","please","confirm","alright",
             "fine","why not","of course","ya can","can can","shiok","好","好的","可以"]
NO_WORDS = ["no","nope","nah","no need","don't","dont","cancel","never mind",
            "nevermind","no lah","nah lah","forget it","skip","not now","later",
            "no thanks","不用","不要","算了"]


# Keyword tables used to route an utterance to an agent.
SCAM_KEYWORDS = ["scam","kena scam","suspicious","fraud","phishing","transfer money",
                 "bank account","otp","lucky draw","prize","win","stranger called",
                 "unknown number","weird sms","suspicious link","lost money","cheated"]
FOOD_HINTS = ["hungry","makan","eat","food","hungry lah","very hungry","starving",
              "want to eat","craving","order food","grab food","supper","lunch",
              "dinner","breakfast","没吃","饿"]
FOOD_ITEMS = ["chicken rice","nasi lemak","wonton mee","char kway teow","bak chor mee",
              "roti prata","laksa","mee goreng","pizza","burger","chicken noodles",
              "fried rice","noodles","rice","chicken","fish","beef","prawn","prata","mee",
              "porridge","ban mian","hokkien mee","satay","bak kut teh"]
CALENDAR_HINTS = ["tomorrow","next week","monday","tuesday","wednesday","thursday","friday",
                  "saturday","sunday","tonight","this afternoon","this morning","this evening",
                  "must go","need to go","have to go","going to","got to"]
CALENDAR_ACTIONS = ["schedule","meeting","appointment","book","remind","set reminder",
                    "add to calendar","block out","calendar","add event","create event"]


def _compile_phrases(phrases, inflect=False):
    """Build one regex that matches any of *phrases* in lower-cased text.

    ASCII phrases must start and end on a word boundary, so "can" no longer
    fires inside "cancel" and "otp" no longer fires inside "hotpot". With
    ``inflect=True`` a simple English ending (s / ed / ing / er / ers,
    optionally after a doubled final letter) is tolerated so "scammed" and
    "eating" still count. Non-ASCII (CJK) phrases keep plain substring
    matching because such text has no spaces to anchor a word boundary on.
    """
    alternatives = []
    for phrase in phrases:
        literal = re.escape(phrase)
        if not phrase.isascii():
            alternatives.append(literal)
            continue
        tail = ""
        if inflect:
            tail = f"(?:{re.escape(phrase[-1])}?(?:s|ed|ing|er|ers))?"
        alternatives.append(rf"(?<!\w){literal}{tail}(?!\w)")
    return re.compile("|".join(alternatives))


_YES_RE = _compile_phrases(YES_WORDS)
_NO_RE = _compile_phrases(NO_WORDS)
_SCAM_RE = _compile_phrases(SCAM_KEYWORDS, inflect=True)
_FOOD_HINT_RE = _compile_phrases(FOOD_HINTS, inflect=True)


def is_confirmation(t):
    """Return True when *t* contains an accepting word/phrase from YES_WORDS."""
    return _YES_RE.search(t.lower()) is not None


def is_rejection(t):
    """Return True when *t* contains a declining word/phrase from NO_WORDS."""
    return _NO_RE.search(t.lower()) is not None


def detect_scam(t):
    """Return True when *t* mentions any scam-related keyword."""
    return _SCAM_RE.search(t.lower()) is not None


def detect_food_hint(t):
    """Return True when *t* suggests the user is hungry or wants food."""
    return _FOOD_HINT_RE.search(t.lower()) is not None
|
|
def detect_food_item(t):
    """Return the dish named in *t* (title-cased), or None if none is found.

    Known dishes from FOOD_ITEMS are tried first, longest name first, so
    "hokkien mee" wins over the shorter "mee". A dish must appear as a whole
    word (an optional plural "s" is allowed), which stops "mee" matching
    inside "meeting" or "rice" inside "price".

    If no known dish is present, a fallback takes whatever follows a verb
    such as "want" or "order", provided it is between 3 and 29 characters.
    The verbs are anchored on a word boundary so "forget about it" is not
    read as "get about it".
    """
    tl = t.lower()
    for item in sorted(FOOD_ITEMS, key=len, reverse=True):
        if re.search(rf"(?<!\w){re.escape(item)}s?(?!\w)", tl):
            return item.title()
    # NOTE(review): the fallback still accepts non-food objects such as
    # "want to go home"; it only guesses at unknown dishes.
    m = re.search(r'\b(?:want|order|eat|have|get|bring me)\s+(?:some\s+)?(.+?)(?:\s*$|[.,])', tl)
    if m:
        candidate = m.group(1).strip()
        if 2 < len(candidate) < 30:
            return candidate.title()
    return None
|
|
def detect_calendar_hint(t):
    """Classify how strongly *t* looks like a calendar request.

    Returns "strong" when the text contains both a time hint and a
    scheduling action word, "weak" when only a time hint is present, and
    None when there is no time hint.
    """
    lowered = t.lower()
    if not any(hint in lowered for hint in CALENDAR_HINTS):
        return None
    mentions_action = any(word in lowered for word in CALENDAR_ACTIONS)
    return "strong" if mentions_action else "weak"
|
|
| |
| |
| |
|
|
def clean_text(text):
    """Strip model control tokens and speaker prefixes from generated text.

    Removes the known special-token markers, drops blank lines and lines that
    start with "User:" or "* ", removes "Companion:" from lines that start
    with it, and joins the remaining lines with single spaces.
    """
    if not text:
        return ""

    markers = (
        "<|audio|>", "<|chat|>", "<|text|>", "<|", "|>", "<SpeechHere>",
        "<Speaker1>", "<Speaker2>", "<bos>", "<eos>",
    )
    for marker in markers:
        text = text.replace(marker, "")

    kept = []
    for raw in text.split("\n"):
        piece = raw.strip()
        if not piece or piece.startswith(("User:", "* ")):
            continue
        if piece.startswith("Companion:"):
            piece = piece.replace("Companion:", "").strip()
        kept.append(piece)
    return " ".join(kept).strip()
|
|
def run_meralion(audio_array, query, max_tokens=150):
    """Run the model on *audio_array* with the text instruction *query*.

    Args:
        audio_array: audio samples passed to the processor with
            ``sampling_rate=16000``.
        query: instruction text inserted into the prompt template.
        max_tokens: upper bound on newly generated tokens.

    Returns:
        The decoded, cleaned reply, or "" when the model or processor failed
        to load.
    """
    if model is None or processor is None:
        return ""

    prompt_template = (
        "Instruction: {query} \n"
        "Follow the text instruction based on the following audio: <SpeechHere>"
    )
    conversation = [{"role": "user", "content": prompt_template.format(query=query)}]
    chat_prompt = processor.tokenizer.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True)
    if isinstance(chat_prompt, list):
        chat_prompt = chat_prompt[0]

    inputs = processor(text=chat_prompt, audios=audio_array, sampling_rate=16000)

    # Move every tensor to wherever the weights live, and give the audio
    # features the weights' dtype instead of assuming float16, so the call
    # keeps working if the model is ever loaded in another precision.
    reference = next(model.parameters())
    inputs = {k: v.to(reference.device) if hasattr(v, "to") else v
              for k, v in inputs.items()}
    if "input_features" in inputs:
        inputs["input_features"] = inputs["input_features"].to(reference.dtype)

    with torch.no_grad():
        out = model.generate(**inputs, max_new_tokens=max_tokens,
                             do_sample=True, temperature=0.75,
                             top_p=0.9, repetition_penalty=1.1)

    # Drop the prompt tokens so only the newly generated part is decoded.
    gen = out[:, inputs["input_ids"].shape[1]:]
    return clean_text(processor.batch_decode(gen, skip_special_tokens=True)[0].strip())
|
|
def transcribe(audio_array):
    """Ask the model for a transcript of *audio_array* (up to 200 new tokens)."""
    instruction = (
        "Please transcribe what the user said accurately, "
        "including Singlish, Malay, Mandarin or Tamil words."
    )
    return run_meralion(audio_array, instruction, max_tokens=200)
|
|
def companion_reply(audio_array, transcript, session, extra=""):
    """Generate a short companion reply to *transcript*.

    The prompt is the persona, then the recent conversation (if any), then
    the optional *extra* context line, then the user's latest utterance.
    Replies of five characters or fewer are replaced by a stock line.
    """
    sections = [f"{COMPANION_PERSONA}\n\n"]
    history = format_history(session)
    if history:
        sections.append(f"Conversation so far:\n{history}\n")
    if extra:
        sections.append(f"{extra}\n")
    sections.append(f"The user just said: '{transcript}'. Reply warmly in 1-2 sentences.")

    reply = run_meralion(audio_array, "".join(sections))
    if len(reply) > 5:
        return reply
    return "Wah, tell me more leh!"
|
|
| |
| |
| |
|
|
def send_telegram(message):
    """Send *message* (Telegram HTML markup) to the configured chat.

    Returns True on success and False when Telegram is not configured or the
    request fails; failures are printed and never raised, so alerts stay
    best-effort.
    """
    if not TELEGRAM_TOKEN or not TELEGRAM_CHAT_ID:
        return False

    endpoint = f"https://api.telegram.org/bot{TELEGRAM_TOKEN}/sendMessage"
    # Form-encode every field (chat_id included) and send it as a POST body
    # so long messages are not squeezed into the query string.
    payload = urllib.parse.urlencode({
        "chat_id": TELEGRAM_CHAT_ID,
        "text": message,
        "parse_mode": "HTML",
    }).encode("utf-8")
    try:
        # The context manager closes the HTTP response instead of leaking it.
        with urllib.request.urlopen(endpoint, data=payload, timeout=10):
            pass
        return True
    except Exception as e:
        print(f"Telegram error: {e}")
        return False
|
|
| |
| |
| |
|
|
def extract_event(transcript):
    """Guess an event's title, date and time from a spoken request.

    Returns:
        dict with "title" (str), "date" (ISO ``YYYY-MM-DD``) and "time"
        (``HH:MM``, 24-hour).

    Date rules, first match wins: "tomorrow"; a weekday name (its next
    occurrence, never today); "next week" (+7 days); a same-day phrase such
    as "today" or "tonight"; otherwise tomorrow.

    Time rules: an explicit "3pm" / "10:30 am" clock time if it is a real
    time of day; otherwise a part-of-day keyword; otherwise 10:00.
    """
    tl = transcript.lower()
    # NOTE(review): uses the server's local date; confirm the host timezone
    # matches the users' timezone.
    today = datetime.date.today()

    weekdays = ("monday", "tuesday", "wednesday", "thursday", "friday",
                "saturday", "sunday")
    same_day = ("today", "tonight", "this morning", "this afternoon", "this evening")
    named_day = next((i for i, name in enumerate(weekdays) if name in tl), None)

    if "tomorrow" in tl:
        event_date = today + datetime.timedelta(days=1)
    elif named_day is not None:
        # `or 7`: naming today's weekday means next week's occurrence.
        days_ahead = (named_day - today.weekday()) % 7 or 7
        event_date = today + datetime.timedelta(days=days_ahead)
    elif "next week" in tl:
        event_date = today + datetime.timedelta(days=7)
    elif any(phrase in tl for phrase in same_day):
        event_date = today
    else:
        event_date = today + datetime.timedelta(days=1)

    event_time = None
    m = re.search(r'(\d{1,2})(?::(\d{2}))?\s*(am|pm)\b', tl)
    if m:
        h = int(m.group(1))
        mn = int(m.group(2) or 0)
        if h <= 12:
            if m.group(3) == "pm" and h != 12:
                h += 12
            elif m.group(3) == "am" and h == 12:
                h = 0
        # Hours above 12 are taken as already 24-hour ("15pm" -> 15:00);
        # anything that is still not a real clock time is ignored.
        if h <= 23 and mn <= 59:
            event_time = f"{h:02d}:{mn:02d}"
    if event_time is None:
        if "morning" in tl:
            event_time = "09:00"
        elif "afternoon" in tl:
            event_time = "14:00"
        elif "evening" in tl or "tonight" in tl:
            event_time = "19:00"
        elif "noon" in tl:
            event_time = "12:00"
        else:
            event_time = "10:00"

    title = "Reminder"
    # Uses the original casing: a capitalised name after "with"/"meet".
    wm = re.search(r'(?:with|meet)\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)?)', transcript)
    if wm:
        title = f"Meeting with {wm.group(1)}"
    elif "meeting" in tl:
        title = "Meeting"
    elif "lunch" in tl:
        title = "Lunch"
    elif "dinner" in tl:
        title = "Dinner"
    elif "office" in tl:
        title = "Back to Office"
    elif "call" in tl:
        title = "Call"
    return {"title": title, "date": str(event_date), "time": event_time}
|
|
| |
| |
| |
|
|
def execute_scam(transcript):
    """Send a scam alert to Telegram and return the warning to speak.

    A Singapore-style phone number (8 digits starting with 6, 8 or 9,
    optionally prefixed by 65 / +65) is pulled from *transcript* when
    present and included in both the alert and the reply.
    """
    pm = re.search(r'(?<!\d)(?:\+?65[\s-]?)?[689]\d{3}[\s-]?\d{4}(?!\d)', transcript)
    phone = pm.group(0).strip() if pm else "unknown number"
    # The alert is parsed as HTML by Telegram, so user-derived text must be
    # escaped or a stray "<" or "&" makes the API reject the message.
    safe_phone = html.escape(phone, quote=False)
    safe_quote = html.escape(transcript[:100], quote=False)
    send_telegram(f"🚨 <b>ROAR Scam Alert</b>\n\n📞 {safe_phone}\n💬 \"{safe_quote}\"\n\n⚠️ Call ScamShield 1799")
    if phone != "unknown number":
        return f"🛡️ I've flagged {phone} and sent an alert. Don't transfer money or share your OTP. Call ScamShield at 1799 now!"
    return "🛡️ Alamak, sounds very suspicious! I've logged a scam alert. Don't transfer any money — call ScamShield at 1799 immediately!"
|
|
def execute_food(food_item):
    """Announce a food order on Telegram and return the spoken confirmation.

    The ETA is the current local time plus 25 minutes.
    """
    # Read the clock once so the order time and ETA are always 25 min apart.
    now = datetime.datetime.now()
    ot = now.strftime("%H:%M")
    eta = (now + datetime.timedelta(minutes=25)).strftime("%H:%M")
    # The item can be free text from the user; escape it for Telegram's HTML mode.
    safe_item = html.escape(food_item, quote=False)
    send_telegram(f"🍜 <b>ROAR Food Order!</b>\n\n🛒 {safe_item}\n⏰ Ordered: {ot}\n🚴 ETA: {eta}\n\n✅ Confirmed via ROAR!")
    return f"🍜 Shiok! Order placed for {food_item}. ETA {eta}. Check Telegram for confirmation!"
|
|
def _load_calendar_credentials():
    """Return Google credentials for the Calendar API, refreshing or creating them.

    Raises:
        RuntimeError: when there is neither a cached token nor client
            secrets in GOOGLE_CREDS_JSON.
    """
    import pickle
    from google.auth.transport.requests import Request
    from google_auth_oauthlib.flow import InstalledAppFlow

    scopes = ['https://www.googleapis.com/auth/calendar']
    token_path = '/tmp/token.pickle'
    creds = None

    if os.path.exists(token_path):
        # NOTE(security): pickle.load runs arbitrary code from the file; this
        # is only acceptable while nothing untrusted can write to /tmp.
        with open(token_path, 'rb') as f:
            creds = pickle.load(f)
    if creds and creds.valid:
        return creds

    if creds and creds.expired and creds.refresh_token:
        creds.refresh(Request())
    elif GOOGLE_CREDS_JSON:
        with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f:
            f.write(GOOGLE_CREDS_JSON)
            creds_file = f.name
        try:
            flow = InstalledAppFlow.from_client_secrets_file(creds_file, scopes)
            # NOTE(review): run_local_server needs a browser on this machine;
            # confirm this can complete on the deployment host.
            creds = flow.run_local_server(port=0)
        finally:
            # Always remove the on-disk copy of the client secrets, even
            # when the OAuth flow fails.
            os.unlink(creds_file)

    if creds is None:
        raise RuntimeError("Google Calendar credentials are not configured")
    with open(token_path, 'wb') as f:
        pickle.dump(creds, f)
    return creds


def execute_calendar(transcript):
    """Create a one-hour Google Calendar event from *transcript*.

    On success the event is inserted into the primary calendar and a
    Telegram note is sent. On any failure (missing libraries, missing
    credentials, API error) the error is printed, a "add manually" note is
    sent to Telegram, and a softer confirmation is returned instead.
    """
    event = extract_event(transcript)
    try:
        from googleapiclient.discovery import build

        creds = _load_calendar_credentials()
        service = build('calendar', 'v3', credentials=creds)
        start_dt = datetime.datetime.strptime(f"{event['date']} {event['time']}", "%Y-%m-%d %H:%M")
        end_dt = start_dt + datetime.timedelta(hours=1)
        cal_event = {
            'summary': event['title'],
            'start': {'dateTime': start_dt.isoformat(), 'timeZone': 'Asia/Singapore'},
            'end': {'dateTime': end_dt.isoformat(), 'timeZone': 'Asia/Singapore'},
        }
        service.events().insert(calendarId='primary', body=cal_event).execute()
        send_telegram(f"📅 <b>ROAR Calendar</b>\n\n✅ {event['title']}\n🗓 {event['date']} at {event['time']}")
        return f"📅 Done lah! '{event['title']}' added to your Google Calendar on {event['date']} at {event['time']}!"
    except Exception as e:
        # Deliberate catch-all: the user still gets a reminder via Telegram.
        print(f"Calendar error: {e}")
        send_telegram(f"📅 <b>ROAR Calendar</b>\n📌 {event['title']}\n🗓 {event['date']} at {event['time']}\n⚠️ Add manually")
        return f"📅 Noted! '{event['title']}' on {event['date']} at {event['time']}. Check Telegram!"
|
|
| |
| |
| |
|
|
def handle_conversation(audio_array, transcript, session):
    """Route one utterance and return ``(reply_text, agent_name)``.

    If a suggestion is pending, a confirmation runs the matching agent and a
    rejection produces a companion reply; any other answer just discards the
    suggestion. Otherwise the utterance is checked in this order: scam
    keywords, strong calendar request, named food item, general hunger hint,
    weak calendar hint, and finally plain companion chat.
    """
    pending = session.get("pending_action")

    if pending:
        confirmed = is_confirmation(transcript)
        declined = not confirmed and is_rejection(transcript)
        # The pending suggestion is consumed whatever the answer was.
        session["pending_action"] = None

        if declined:
            r = companion_reply(audio_array, transcript, session, "User declined your suggestion. ")
            return r or "Ok lah, no problem! Anything else?", "MERaLiON"

        if confirmed:
            kind = pending["type"]
            if kind == "scam":
                return execute_scam(pending["transcript"]), "Scam Agent"
            if kind == "food":
                item = pending.get("food_item") or detect_food_item(transcript) or "Your order"
                return execute_food(item), "Food Agent"
            if kind == "food_item_needed":
                item = detect_food_item(transcript)
                if item:
                    return execute_food(item), "Food Agent"
                # Still no dish named: keep waiting for one.
                session["pending_action"] = {"type":"food_item_needed","transcript":transcript}
                return "What would you like ah? Chicken rice? Nasi lemak?", "MERaLiON"
            if kind == "calendar":
                return execute_calendar(pending["transcript"]), "Calendar Agent"
        # Unknown pending type or an unrelated answer: treat as a new request.

    if detect_scam(transcript):
        return execute_scam(transcript), "Scam Agent"

    cal = detect_calendar_hint(transcript)
    if cal == "strong":
        ev = extract_event(transcript)
        session["pending_action"] = {"type":"calendar","transcript":transcript}
        return f"📅 Want me to add '{ev['title']}' to your calendar on {ev['date']} at {ev['time']}?", "MERaLiON"

    item = detect_food_item(transcript)
    if item:
        session["pending_action"] = {"type":"food","transcript":transcript,"food_item":item}
        return f"🍜 {item} sounds good leh! Want me to order it for you?", "MERaLiON"

    if detect_food_hint(transcript):
        session["pending_action"] = {"type":"food_item_needed","transcript":transcript}
        return "Aiyo hungry ah! What you feel like eating?", "MERaLiON"

    if cal == "weak":
        ev = extract_event(transcript)
        session["pending_action"] = {"type":"calendar","transcript":transcript}
        return f"Wah sounds like you got something on {ev['date']} {ev['time']}. Want me to add to your calendar?", "MERaLiON"

    return companion_reply(audio_array, transcript, session) or "Tell me more leh!", "MERaLiON"
|
|
| |
| |
| |
|
|
def detect_voice(text):
    """Pick the Mandarin voice when over half the non-space characters are CJK."""
    visible = [ch for ch in text if ch.strip()]
    if not visible:
        return TTS_VOICE_EN
    han = sum(1 for ch in visible if '\u4e00' <= ch <= '\u9fff')
    if han / len(visible) > 0.5:
        return TTS_VOICE_ZH
    return TTS_VOICE_EN
|
|
async def tts_async(text, voice):
    """Synthesize *text* with edge-tts using *voice* and return the audio bytes."""
    import edge_tts

    communicate = edge_tts.Communicate(text, voice)
    # Collect the audio chunks and join once instead of growing a bytes
    # object with += on every chunk.
    chunks = [
        chunk["data"]
        async for chunk in communicate.stream()
        if chunk["type"] == "audio"
    ]
    return b"".join(chunks)
|
|
def tts(text):
    """Convert *text* to speech and return the audio bytes.

    Characters outside word characters, whitespace and basic punctuation
    (emoji, markup symbols) are stripped first; text shorter than three
    characters after stripping becomes "Done.". If synthesis fails, the
    error is printed and one retry is made with the CJK characters removed
    and the English voice. A failure of that retry is raised to the caller.
    """
    clean = re.sub(r'[^\w\s\.,!?\-\'\":。,!?]','',text).strip()
    if not clean or len(clean) < 3:
        clean = "Done."
    voice = detect_voice(clean)
    try:
        return asyncio.run(tts_async(clean, voice))
    except Exception as e:
        # Report the first failure instead of swallowing it silently.
        print(f"TTS error with voice {voice}: {e}")
        en_only = re.sub(r'[\u4e00-\u9fff\u3000-\u303f]','',clean).strip()
        if not en_only or len(en_only) < 3:
            en_only = "I hear you. Tell me more."
        return asyncio.run(tts_async(en_only, TTS_VOICE_EN))
|
|
| |
| |
| |
|
|
def _prepare_audio(sr, audio_array):
    """Return *audio_array* as mono float32 samples at 16 kHz.

    Multi-channel input of shape (samples, channels) is averaged to mono
    before resampling. Signed-integer PCM is scaled by its type's full range;
    float input whose peak exceeds 1.0 is assumed to hold 16-bit values and
    is divided by 32768, as before.
    """
    samples = np.asarray(audio_array)
    source_dtype = samples.dtype  # needed after the float conversion below
    samples = samples.astype(np.float32)

    # Downmix first: resampling a (samples, channels) array would otherwise
    # run along the channel axis.
    if samples.ndim > 1:
        samples = samples.mean(axis=1)

    if np.issubdtype(source_dtype, np.signedinteger):
        samples = samples / (float(np.iinfo(source_dtype).max) + 1.0)
    elif samples.size and np.abs(samples).max() > 1.0:
        samples = samples / 32768.0

    if sr != 16000 and samples.size:
        try:
            import librosa
            samples = librosa.resample(samples, orig_sr=sr, target_sr=16000)
        except ImportError:
            # Without librosa, fall back to linear interpolation instead of
            # passing audio at the wrong rate to the model.
            n_out = max(1, int(round(samples.size * 16000 / sr)))
            positions = np.arange(n_out) * (sr / 16000)
            samples = np.interp(positions, np.arange(samples.size), samples)

    return samples.astype(np.float32)


def process_voice(audio, session_state):
    """Main Gradio handler — takes audio, returns text response + audio.

    Args:
        audio: ``(sample_rate, samples)`` from the microphone component, or
            None when nothing was recorded.
        session_state: per-user state dict; its "id" selects the session.

    Returns:
        ``(session_state, markdown_reply, audio_path_or_None, transcript)``.
        Any failure is printed with its traceback and reported in the reply
        text rather than raised.
    """
    if audio is None:
        return session_state, "Please hold and speak...", None, ""

    session_id = session_state.get("id", "default")
    session = get_session(session_id)

    try:
        sr, raw_audio = audio
        audio_array = _prepare_audio(sr, raw_audio)
        if audio_array.size == 0:
            # An empty recording is treated like no recording at all.
            return session_state, "Please hold and speak...", None, ""

        transcript = transcribe(audio_array)
        response_text, agent_name = handle_conversation(audio_array, transcript, session)

        add_to_history(session, "user", transcript)
        add_to_history(session, "assistant", response_text)

        tts_text = response_text if len(response_text) >= 5 else "Done."
        audio_bytes = tts(tts_text)

        # NOTE(review): the file is kept (delete=False) because its path is
        # handed to Gradio; nothing here removes it afterwards.
        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
            f.write(audio_bytes)
            audio_path = f.name

        display = f"**[{agent_name}]** {response_text}\n\n*You said: \"{transcript}\"*"
        return session_state, display, audio_path, transcript

    except Exception as e:
        traceback.print_exc()
        return session_state, f"Sorry, something went wrong: {str(e)}", None, ""
|
|
|
|
| |
# Gradio UI: one microphone input; each recording is sent to process_voice
# and the reply is shown as markdown, spoken audio and the raw transcript.
with gr.Blocks(
    title="ROAR — MERaLiON Companion",
    theme=gr.themes.Base(),
    css="""
.gradio-container { max-width: 600px; margin: auto; }
.title { text-align: center; color: #6B2D8B; }
.subtitle { text-align: center; color: #888; margin-bottom: 20px; }
"""
) as demo:

    gr.Markdown("# 🦁 ROAR — MERaLiON Companion", elem_classes="title")
    gr.Markdown(
        "Speak in Singlish, Mandarin, Malay or Tamil. "
        "ROAR understands you and routes to the right agent.",
        elem_classes="subtitle"
    )

    # A callable initial value is evaluated on each page load, so every
    # visitor gets their own session id. A constant id made all visitors
    # share one history and one pending action.
    # NOTE(review): entries in `sessions` are never evicted.
    session_state = gr.State(lambda: {"id": uuid.uuid4().hex})

    with gr.Row():
        audio_input = gr.Audio(
            sources=["microphone"],
            type="numpy",
            label="🎤 Hold to speak",
        )

    response_box = gr.Markdown("*Waiting for your voice...*")
    audio_output = gr.Audio(label="🔊 MERaLiON responds", autoplay=True)
    transcript_box = gr.Textbox(label="📝 What MERaLiON heard", interactive=False)

    gr.Markdown(
        "\n"
        "**Try saying:**\n"
        "- *\"Aiyo so sian today lah\"* → Companion chat\n"
        "- *\"I very hungry, want chicken rice\"* → 🍜 Food Agent\n"
        "- *\"Schedule meeting with Lawrence tomorrow 3pm\"* → 📅 Calendar Agent\n"
        "- *\"I think I kena scam\"* → 🛡️ Scam Agent\n"
    )

    audio_input.change(
        fn=process_voice,
        inputs=[audio_input, session_state],
        outputs=[session_state, response_box, audio_output, transcript_box]
    )


if __name__ == "__main__":
    demo.launch()
|
|