""" app.py ROAR Companion — HuggingFace Space MERaLiON-2-3B + Multi-Agent (Scam/Calendar/Food) + Warm Singlish Companion Runs on HuggingFace Persistent GPU (T4 Small) """ import os import re import asyncio import base64 import tempfile import traceback import datetime import numpy as np import soundfile as sf import torch import urllib.request import urllib.parse import json import gradio as gr from pathlib import Path # ───────────────────────────────────────────────────────────────────────────── # CONFIG — set via HuggingFace Space Secrets # ───────────────────────────────────────────────────────────────────────────── TELEGRAM_TOKEN = os.environ.get("TELEGRAM_TOKEN", "") TELEGRAM_CHAT_ID = os.environ.get("TELEGRAM_CHAT_ID", "") GOOGLE_CREDS_JSON = os.environ.get("GOOGLE_CREDS_JSON", "") # JSON string TTS_VOICE_EN = "en-SG-WayneNeural" TTS_VOICE_ZH = "zh-SG-WanLungNeural" MODEL_ID = "MERaLiON/MERaLiON-2-3B" COMPANION_PERSONA = """You are a warm, helpful AI companion for Singaporeans. You understand Singlish, code-switching between English, Mandarin, Malay, and Tamil. You speak naturally, like a friendly Singaporean friend — not stiff or formal. Keep responses concise and conversational — 1 to 3 sentences. 
If the user seems stressed, acknowledge their feelings first.""" # ───────────────────────────────────────────────────────────────────────────── # LOAD MERALION # ───────────────────────────────────────────────────────────────────────────── print("=" * 60) print("Loading MERaLiON-2-3B from HuggingFace...") print("=" * 60) try: from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True) model = AutoModelForSpeechSeq2Seq.from_pretrained( MODEL_ID, torch_dtype=torch.float16, device_map="cuda" if torch.cuda.is_available() else "cpu", trust_remote_code=True, use_safetensors=True ) model.eval() device = "CUDA" if torch.cuda.is_available() else "CPU" print(f"MERaLiON-2-3B loaded on {device}.") except Exception as e: print(f"Failed to load MERaLiON: {e}") model = None processor = None # ───────────────────────────────────────────────────────────────────────────── # SESSION STORE # ───────────────────────────────────────────────────────────────────────────── sessions = {} def get_session(session_id): if session_id not in sessions: sessions[session_id] = {"history": [], "pending_action": None} return sessions[session_id] def add_to_history(session, role, text): session["history"].append({"role": role, "text": text}) if len(session["history"]) > 6: session["history"] = session["history"][-6:] def format_history(session): if not session["history"]: return "" lines = [] for turn in session["history"][-4:]: prefix = "User" if turn["role"] == "user" else "Companion" lines.append(f"{prefix}: {turn['text']}") return "\n".join(lines) + "\n" # ───────────────────────────────────────────────────────────────────────────── # INTENT DETECTION # ───────────────────────────────────────────────────────────────────────────── YES_WORDS = ["yes","ya","yep","yeah","yah","ok","okay","sure","can","can lah", "ok lah","yes lah","go ahead","do it","please","confirm","alright", "fine","why not","of course","ya can","can 
can","shiok","好","好的","可以"] NO_WORDS = ["no","nope","nah","no need","don't","dont","cancel","never mind", "nevermind","no lah","nah lah","forget it","skip","not now","later", "no thanks","不用","不要","算了"] SCAM_KEYWORDS = ["scam","kena scam","suspicious","fraud","phishing","transfer money", "bank account","otp","lucky draw","prize","win","stranger called", "unknown number","weird sms","suspicious link","lost money","cheated"] FOOD_HINTS = ["hungry","makan","eat","food","hungry lah","very hungry","starving", "want to eat","craving","order food","grab food","supper","lunch", "dinner","breakfast","没吃","饿"] FOOD_ITEMS = ["chicken rice","nasi lemak","wonton mee","char kway teow","bak chor mee", "roti prata","laksa","mee goreng","pizza","burger","chicken noodles", "fried rice","noodles","rice","chicken","fish","beef","prawn","prata","mee", "porridge","ban mian","hokkien mee","satay","bak kut teh"] CALENDAR_HINTS = ["tomorrow","next week","monday","tuesday","wednesday","thursday","friday", "saturday","sunday","tonight","this afternoon","this morning","this evening", "must go","need to go","have to go","going to","got to"] CALENDAR_ACTIONS = ["schedule","meeting","appointment","book","remind","set reminder", "add to calendar","block out","calendar","add event","create event"] def is_confirmation(t): return any(w in t.lower() for w in YES_WORDS) def is_rejection(t): return any(w in t.lower() for w in NO_WORDS) def detect_scam(t): return sum(1 for k in SCAM_KEYWORDS if k in t.lower()) > 0 def detect_food_hint(t):return any(k in t.lower() for k in FOOD_HINTS) def detect_food_item(t): tl = t.lower() for item in FOOD_ITEMS: if item in tl: return item.title() m = re.search(r'(?:want|order|eat|have|get|bring me)\s+(?:some\s+)?(.+?)(?:\s*$|[.,])', tl) if m: c = m.group(1).strip() if 2 < len(c) < 30: return c.title() return None def detect_calendar_hint(t): tl = t.lower() has_time = any(k in tl for k in CALENDAR_HINTS) has_action = any(k in tl for k in CALENDAR_ACTIONS) if has_action 
and has_time: return "strong" if has_time: return "weak" return None # ───────────────────────────────────────────────────────────────────────────── # MERALION INFERENCE # ───────────────────────────────────────────────────────────────────────────── def clean_text(text): if not text: return "" for t in ["<|audio|>","<|chat|>","<|text|>","<|","|>","", "","","",""]: text = text.replace(t, "") lines = [] for line in text.split("\n"): line = line.strip() if not line or line.startswith("User:") or line.startswith("* "): continue if line.startswith("Companion:"): line = line.replace("Companion:","").strip() lines.append(line) return " ".join(lines).strip() def run_meralion(audio_array, query, max_tokens=150): if model is None or processor is None: return "" pt = "Instruction: {query} \nFollow the text instruction based on the following audio: " conv = [{"role": "user", "content": pt.format(query=query)}] chat_prompt = processor.tokenizer.apply_chat_template( conv, tokenize=False, add_generation_prompt=True) if isinstance(chat_prompt, list): chat_prompt = chat_prompt[0] inputs = processor(text=chat_prompt, audios=audio_array, sampling_rate=16000) device = next(model.parameters()).device inputs = {k: v.to(device) if hasattr(v,"to") else v for k,v in inputs.items()} if "input_features" in inputs: inputs["input_features"] = inputs["input_features"].to(torch.float16) with torch.no_grad(): out = model.generate(**inputs, max_new_tokens=max_tokens, do_sample=True, temperature=0.75, top_p=0.9, repetition_penalty=1.1) gen = out[:, inputs["input_ids"].shape[1]:] return clean_text(processor.batch_decode(gen, skip_special_tokens=True)[0].strip()) def transcribe(audio_array): return run_meralion(audio_array, "Please transcribe what the user said accurately, including Singlish, Malay, Mandarin or Tamil words.", max_tokens=200) def companion_reply(audio_array, transcript, session, extra=""): history = format_history(session) ctx = f"{COMPANION_PERSONA}\n\n" if history: ctx += 
f"Conversation so far:\n{history}\n" if extra: ctx += f"{extra}\n" ctx += f"The user just said: '{transcript}'. Reply warmly in 1-2 sentences." result = run_meralion(audio_array, ctx) return result if len(result) > 5 else "Wah, tell me more leh!" # ───────────────────────────────────────────────────────────────────────────── # TELEGRAM # ───────────────────────────────────────────────────────────────────────────── def send_telegram(message): if not TELEGRAM_TOKEN or not TELEGRAM_CHAT_ID: return False try: encoded = urllib.parse.quote(message) url = f"https://api.telegram.org/bot{TELEGRAM_TOKEN}/sendMessage?chat_id={TELEGRAM_CHAT_ID}&text={encoded}&parse_mode=HTML" urllib.request.urlopen(url, timeout=10) return True except Exception as e: print(f"Telegram error: {e}") return False # ───────────────────────────────────────────────────────────────────────────── # EVENT EXTRACTION # ───────────────────────────────────────────────────────────────────────────── def extract_event(transcript): tl = transcript.lower() today = datetime.date.today() if "tomorrow" in tl: event_date = today + datetime.timedelta(days=1) elif "monday" in tl: event_date = today + datetime.timedelta(days=(0-today.weekday())%7 or 7) elif "tuesday" in tl: event_date = today + datetime.timedelta(days=(1-today.weekday())%7 or 7) elif "wednesday" in tl: event_date = today + datetime.timedelta(days=(2-today.weekday())%7 or 7) elif "thursday" in tl: event_date = today + datetime.timedelta(days=(3-today.weekday())%7 or 7) elif "friday" in tl: event_date = today + datetime.timedelta(days=(4-today.weekday())%7 or 7) elif "next week" in tl: event_date = today + datetime.timedelta(days=7) else: event_date = today + datetime.timedelta(days=1) m = re.search(r'(\d{1,2})(?::(\d{2}))?\s*(am|pm)', tl) if m: h = int(m.group(1)); mn = int(m.group(2) or 0) if m.group(3)=="pm" and h!=12: h+=12 elif m.group(3)=="am" and h==12: h=0 event_time = f"{h:02d}:{mn:02d}" elif "morning" in tl: event_time = "09:00" elif 
"afternoon" in tl: event_time = "14:00" elif "evening" in tl or "tonight" in tl: event_time = "19:00" elif "noon" in tl: event_time = "12:00" else: event_time = "10:00" title = "Reminder" wm = re.search(r'(?:with|meet)\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)?)', transcript) if wm: title = f"Meeting with {wm.group(1)}" elif "meeting" in tl: title = "Meeting" elif "lunch" in tl: title = "Lunch" elif "dinner" in tl: title = "Dinner" elif "office" in tl: title = "Back to Office" elif "call" in tl: title = "Call" return {"title": title, "date": str(event_date), "time": event_time} # ───────────────────────────────────────────────────────────────────────────── # AGENT EXECUTORS # ───────────────────────────────────────────────────────────────────────────── def execute_scam(transcript): pm = re.search(r'\b(\+?65)?[\s-]?([689]\d{7})\b', transcript) phone = pm.group(0).strip() if pm else "unknown number" send_telegram(f"🚨 ROAR Scam Alert\n\n📞 {phone}\n💬 \"{transcript[:100]}\"\n\n⚠️ Call ScamShield 1799") if phone != "unknown number": return f"🛡️ I've flagged {phone} and sent an alert. Don't transfer money or share your OTP. Call ScamShield at 1799 now!" return "🛡️ Alamak, sounds very suspicious! I've logged a scam alert. Don't transfer any money — call ScamShield at 1799 immediately!" def execute_food(food_item): ot = datetime.datetime.now().strftime("%H:%M") eta = (datetime.datetime.now()+datetime.timedelta(minutes=25)).strftime("%H:%M") send_telegram(f"🍜 ROAR Food Order!\n\n🛒 {food_item}\n⏰ Ordered: {ot}\n🚴 ETA: {eta}\n\n✅ Confirmed via ROAR!") return f"🍜 Shiok! Order placed for {food_item}. ETA {eta}. Check Telegram for confirmation!" 
def execute_calendar(transcript):
    """Create a Google Calendar event from *transcript* and notify Telegram.

    The event (title, date, time) comes from extract_event(). Credentials are
    read from a cached token file, refreshed if expired, or obtained from the
    GOOGLE_CREDS_JSON client secret. The event lasts one hour and is inserted
    into the primary calendar in the Asia/Singapore time zone.

    Returns:
        A confirmation sentence on success. On any failure the error is
        printed, a "add manually" note is sent to Telegram, and a "Noted!"
        sentence is returned instead — this function does not raise.
    """
    event = extract_event(transcript)
    try:
        from google_auth_oauthlib.flow import InstalledAppFlow
        from google.auth.transport.requests import Request
        from googleapiclient.discovery import build
        import pickle

        SCOPES = ['https://www.googleapis.com/auth/calendar']
        token_path = '/tmp/token.pickle'

        creds = None
        if os.path.exists(token_path):
            # NOTE(security): pickle.load runs arbitrary code from the file.
            # This is only acceptable while /tmp/token.pickle is written
            # exclusively by this process; a JSON token would be safer.
            with open(token_path, 'rb') as f:
                creds = pickle.load(f)

        if not creds or not creds.valid:
            if creds and creds.expired and creds.refresh_token:
                creds.refresh(Request())
            elif GOOGLE_CREDS_JSON:
                import tempfile
                with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f:
                    f.write(GOOGLE_CREDS_JSON)
                    creds_file = f.name
                try:
                    flow = InstalledAppFlow.from_client_secrets_file(creds_file, SCOPES)
                    # NOTE(review): run_local_server starts an interactive
                    # browser flow — confirm this can complete on the
                    # deployment host.
                    creds = flow.run_local_server(port=0)
                finally:
                    # Never leave the client secret on disk, even on failure.
                    os.unlink(creds_file)
            if not creds or not creds.valid:
                # Without this guard, missing/invalid credentials would be
                # cached and handed to the API client.
                raise RuntimeError("no usable Google credentials")
            with open(token_path, 'wb') as f:
                pickle.dump(creds, f)

        service = build('calendar', 'v3', credentials=creds)
        start_dt = datetime.datetime.strptime(f"{event['date']} {event['time']}", "%Y-%m-%d %H:%M")
        end_dt = start_dt + datetime.timedelta(hours=1)
        cal_event = {
            'summary': event['title'],
            'start': {'dateTime': start_dt.isoformat(), 'timeZone': 'Asia/Singapore'},
            'end': {'dateTime': end_dt.isoformat(), 'timeZone': 'Asia/Singapore'},
        }
        service.events().insert(calendarId='primary', body=cal_event).execute()
        send_telegram(f"📅 ROAR Calendar\n\n✅ {event['title']}\n🗓 {event['date']} at {event['time']}")
        return f"📅 Done lah! '{event['title']}' added to your Google Calendar on {event['date']} at {event['time']}!"
    except Exception as e:
        # Deliberate best-effort boundary: any failure (missing libraries,
        # auth, network, API) degrades to a Telegram note for manual entry.
        print(f"Calendar error: {e}")
        send_telegram(f"📅 ROAR Calendar\n📌 {event['title']}\n🗓 {event['date']} at {event['time']}\n⚠️ Add manually")
        return f"📅 Noted! '{event['title']}' on {event['date']} at {event['time']}. Check Telegram!"
# ─────────────────────────────────────────────────────────────────────────────
# CONVERSATION HANDLER
# ─────────────────────────────────────────────────────────────────────────────
def handle_conversation(audio_array, transcript, session):
    """Route one user turn to an agent or to the companion model.

    If the session has a pending action, the turn is first read as an answer
    to it: a confirmation executes the action, a rejection cancels it, and
    anything else clears it and is handled as a fresh request. Fresh requests
    are checked in this order: scam, strong calendar request, named food
    item, food hint, weak calendar hint, then plain companion chat.

    Returns:
        ``(response_text, agent_name)``.
    """
    pending = session.get("pending_action")
    if pending:
        confirmed = is_confirmation(transcript)
        rejected = is_rejection(transcript)
        # "No, don't do it" contains both a yes-phrase and a no-phrase.
        # Running the action on such a reply is the costly mistake, so ask
        # again and keep the action pending. The open "what do you want to
        # eat?" question is exempt because its answer is not a yes/no.
        if confirmed and rejected and pending["type"] != "food_item_needed":
            return "Sorry ah, I not sure — is that a yes or a no?", "MERaLiON"

        session["pending_action"] = None
        if confirmed:
            kind = pending["type"]
            if kind == "scam":
                return execute_scam(pending["transcript"]), "Scam Agent"
            if kind == "food":
                item = pending.get("food_item") or detect_food_item(transcript) or "Your order"
                return execute_food(item), "Food Agent"
            if kind == "food_item_needed":
                item = detect_food_item(transcript)
                if item:
                    return execute_food(item), "Food Agent"
                session["pending_action"] = {"type": "food_item_needed", "transcript": transcript}
                return "What would you like ah? Chicken rice? Nasi lemak?", "MERaLiON"
            if kind == "calendar":
                return execute_calendar(pending["transcript"]), "Calendar Agent"
            # Unknown pending type: fall through and treat as a fresh request.
        elif rejected:
            reply = companion_reply(audio_array, transcript, session, "User declined your suggestion. ")
            return reply or "Ok lah, no problem! Anything else?", "MERaLiON"

    if detect_scam(transcript):
        return execute_scam(transcript), "Scam Agent"

    calendar_strength = detect_calendar_hint(transcript)
    if calendar_strength == "strong":
        ev = extract_event(transcript)
        session["pending_action"] = {"type": "calendar", "transcript": transcript}
        return f"📅 Want me to add '{ev['title']}' to your calendar on {ev['date']} at {ev['time']}?", "MERaLiON"

    item = detect_food_item(transcript)
    if item:
        session["pending_action"] = {"type": "food", "transcript": transcript, "food_item": item}
        return f"🍜 {item} sounds good leh! Want me to order it for you?", "MERaLiON"

    if detect_food_hint(transcript):
        session["pending_action"] = {"type": "food_item_needed", "transcript": transcript}
        return "Aiyo hungry ah! What you feel like eating?", "MERaLiON"

    if calendar_strength == "weak":
        ev = extract_event(transcript)
        session["pending_action"] = {"type": "calendar", "transcript": transcript}
        return f"Wah sounds like you got something on {ev['date']} {ev['time']}. Want me to add to your calendar?", "MERaLiON"

    return companion_reply(audio_array, transcript, session) or "Tell me more leh!", "MERaLiON"


# ─────────────────────────────────────────────────────────────────────────────
# TTS
# ─────────────────────────────────────────────────────────────────────────────
def detect_voice(text):
    """Pick the Chinese voice when over half the non-space characters are CJK."""
    cjk_count = sum(1 for ch in text if '\u4e00' <= ch <= '\u9fff')
    visible_count = sum(1 for ch in text if ch.strip())
    if visible_count > 0 and cjk_count / visible_count > 0.5:
        return TTS_VOICE_ZH
    return TTS_VOICE_EN


async def tts_async(text, voice):
    """Synthesise *text* with edge-tts and return the audio bytes."""
    import edge_tts
    communicate = edge_tts.Communicate(text, voice)
    # Collect the chunks and join once instead of repeated bytes += (quadratic).
    chunks = [chunk["data"] async for chunk in communicate.stream() if chunk["type"] == "audio"]
    return b"".join(chunks)


def tts(text):
    """Convert *text* to speech, retrying in English if the first try fails.

    Characters outside word characters, whitespace and basic punctuation
    (this removes emoji) are stripped first. If synthesis raises, CJK
    characters are removed and the English voice is used; a failure of that
    second attempt propagates to the caller.
    """
    clean = re.sub(r'[^\w\s\.,!?\-\'\":。,!?]', '', text).strip()
    if len(clean) < 3:
        clean = "Done."
    voice = detect_voice(clean)
    try:
        return asyncio.run(tts_async(clean, voice))
    except Exception as e:
        print(f"TTS error with {voice}, retrying in English: {e}")
        en_only = re.sub(r'[\u4e00-\u9fff\u3000-\u303f]', '', clean).strip()
        if len(en_only) < 3:
            en_only = "I hear you. Tell me more."
        return asyncio.run(tts_async(en_only, TTS_VOICE_EN))


# ─────────────────────────────────────────────────────────────────────────────
# GRADIO INTERFACE
# ─────────────────────────────────────────────────────────────────────────────
def _resample_to_16k(samples, sr):
    """Resample mono float32 *samples* from *sr* Hz to 16 kHz.

    Uses librosa when installed. Otherwise falls back to linear interpolation,
    which is crude (no anti-alias filter) but keeps the clip at the rate the
    model is told it has, instead of passing it through unchanged.
    """
    try:
        import librosa
        return librosa.resample(samples, orig_sr=sr, target_sr=16000)
    except ImportError:
        out_len = max(1, int(round(len(samples) * 16000 / sr)))
        src_times = np.arange(len(samples)) / sr
        dst_times = np.arange(out_len) / 16000
        return np.interp(dst_times, src_times, samples).astype(np.float32)


def _prepare_audio(sr, samples):
    """Convert a recording to mono float32 at 16 kHz in [-1, 1].

    Returns None when the recording contains no samples.
    """
    samples = np.asarray(samples)
    if samples.size == 0:
        return None
    # Remember the integer full-scale value before converting to float, so
    # scaling follows the sample format, not a guess from the peak.
    int_full_scale = None
    if np.issubdtype(samples.dtype, np.integer):
        int_full_scale = float(np.iinfo(samples.dtype).max) + 1.0
    samples = samples.astype(np.float32)
    # Mix down BEFORE resampling: multi-channel data is averaged over axis 1
    # (one column per channel), and resampling a 2-D array would otherwise
    # run along that channel axis instead of along time.
    if samples.ndim > 1:
        samples = samples.mean(axis=1)
    if int_full_scale is not None:
        samples = samples / int_full_scale
    elif np.abs(samples).max() > 1.0:
        # Float data outside [-1, 1]: assume 16-bit scale, as the previous
        # implementation did.
        samples = samples / 32768.0
    if sr != 16000:
        samples = _resample_to_16k(samples, sr)
    return samples.astype(np.float32)


def process_voice(audio, session_state):
    """Main Gradio handler — takes audio, returns text response + audio.

    Args:
        audio: ``(sample_rate, numpy_array)`` from the microphone component,
            or None when there is no recording.
        session_state: per-browser-session dict; an ``"id"`` key is added on
            first use so each session gets its own history and pending action.

    Returns:
        ``(session_state, markdown_response, audio_file_path_or_None,
        transcript)``. Errors are reported in the markdown response; this
        function does not raise.
    """
    import uuid

    if session_state is None:
        session_state = {}
    if audio is None:
        return session_state, "Please hold and speak...", None, ""
    if not session_state.get("id"):
        # A constant id would make every visitor share one conversation,
        # including actions waiting for a "yes".
        session_state["id"] = uuid.uuid4().hex
    session = get_session(session_state["id"])

    try:
        sr, raw_samples = audio
        audio_array = _prepare_audio(sr, raw_samples)
        if audio_array is None:
            return session_state, "Please hold and speak...", None, ""

        # Transcribe + respond
        transcript = transcribe(audio_array)
        response_text, agent_name = handle_conversation(audio_array, transcript, session)
        add_to_history(session, "user", transcript)
        add_to_history(session, "assistant", response_text)

        # TTS
        tts_text = response_text if len(response_text) >= 5 else "Done."
        audio_bytes = tts(tts_text)

        # Save to temp file for Gradio; skip when synthesis produced no audio
        # so the player is not handed an empty file.
        # NOTE(review): these files are never deleted here — confirm they are
        # cleaned up elsewhere or by the host.
        audio_path = None
        if audio_bytes:
            with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
                f.write(audio_bytes)
                audio_path = f.name

        display = f"**[{agent_name}]** {response_text}\n\n*You said: \"{transcript}\"*"
        return session_state, display, audio_path, transcript
    except Exception as e:
        traceback.print_exc()
        return session_state, f"Sorry, something went wrong: {str(e)}", None, ""


# Build Gradio UI
with gr.Blocks(
    title="ROAR — MERaLiON Companion",
    theme=gr.themes.Base(),
    css="""
    .gradio-container { max-width: 600px; margin: auto; }
    .title { text-align: center; color: #6B2D8B; }
    .subtitle { text-align: center; color: #888; margin-bottom: 20px; }
    """
) as demo:
    gr.Markdown("# 🦁 ROAR — MERaLiON Companion", elem_classes="title")
    gr.Markdown(
        "Speak in Singlish, Mandarin, Malay or Tamil. "
        "ROAR understands you and routes to the right agent.",
        elem_classes="subtitle"
    )

    # Starts empty; process_voice assigns a unique id per browser session.
    session_state = gr.State({})

    with gr.Row():
        audio_input = gr.Audio(
            sources=["microphone"],
            type="numpy",
            label="🎤 Hold to speak",
        )

    response_box = gr.Markdown("*Waiting for your voice...*")
    audio_output = gr.Audio(label="🔊 MERaLiON responds", autoplay=True)
    transcript_box = gr.Textbox(label="📝 What MERaLiON heard", interactive=False)

    # Kept at column 0 so the list renders as Markdown bullets.
    gr.Markdown("""
**Try saying:**
- *"Aiyo so sian today lah"* → Companion chat
- *"I very hungry, want chicken rice"* → 🍜 Food Agent
- *"Schedule meeting with Lawrence tomorrow 3pm"* → 📅 Calendar Agent
- *"I think I kena scam"* → 🛡️ Scam Agent
""")

    audio_input.change(
        fn=process_voice,
        inputs=[audio_input, session_state],
        outputs=[session_state, response_box, audio_output, transcript_box]
    )

if __name__ == "__main__":
    demo.launch()