Upload app.py with huggingface_hub

78d4ef4 verified about 1 month ago

25.1 kB

	"""
	app.py
	ROAR Companion — HuggingFace Space
	MERaLiON-2-3B + Multi-Agent (Scam/Calendar/Food) + Warm Singlish Companion
	Runs on HuggingFace Persistent GPU (T4 Small)
	"""

	import os
	import re
	import asyncio
	import base64
	import tempfile
	import traceback
	import datetime
	import numpy as np
	import soundfile as sf
	import torch
	import urllib.request
	import urllib.parse
	import json
	import gradio as gr
	from pathlib import Path

	# ─────────────────────────────────────────────────────────────────────────────
	# CONFIG — set via HuggingFace Space Secrets
	# ─────────────────────────────────────────────────────────────────────────────
	# Telegram bot credentials read from the environment; each is "" when unset,
	# in which case send_telegram() returns False without sending.
	TELEGRAM_TOKEN = os.environ.get("TELEGRAM_TOKEN", "")
	TELEGRAM_CHAT_ID = os.environ.get("TELEGRAM_CHAT_ID", "")
	# Google OAuth client-secrets document supplied as a raw JSON string (not a path);
	# execute_calendar() writes it to a temporary file before use.
	GOOGLE_CREDS_JSON = os.environ.get("GOOGLE_CREDS_JSON", "") # JSON string

	# Voice names passed to the TTS call; detect_voice() picks between them.
	TTS_VOICE_EN = "en-SG-WayneNeural"
	TTS_VOICE_ZH = "zh-SG-WanLungNeural"
	# Model repository id given to both from_pretrained() calls below.
	MODEL_ID = "MERaLiON/MERaLiON-2-3B"

	# Persona text that companion_reply() places at the start of every prompt.
	COMPANION_PERSONA = """You are a warm, helpful AI companion for Singaporeans.
	You understand Singlish, code-switching between English, Mandarin, Malay, and Tamil.
	You speak naturally, like a friendly Singaporean friend — not stiff or formal.
	Keep responses concise and conversational — 1 to 3 sentences.
	If the user seems stressed, acknowledge their feelings first."""

	# ─────────────────────────────────────────────────────────────────────────────
	# LOAD MERALION
	# ─────────────────────────────────────────────────────────────────────────────
	# Start-up banner (three lines on stdout).
	print("=" * 60, "Loading MERaLiON-2-3B from HuggingFace...", "=" * 60, sep="\n")

	# Load the processor and the model once at import time. Any failure is
	# reported and leaves both globals as None, which run_meralion() treats as
	# "model unavailable".
	try:
	    from transformers import AutoModelForSpeechSeq2Seq
	    from transformers import AutoProcessor

	    processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
	    _load_options = {
	        "torch_dtype": torch.float16,
	        "device_map": "cuda" if torch.cuda.is_available() else "cpu",
	        "trust_remote_code": True,
	        "use_safetensors": True,
	    }
	    model = AutoModelForSpeechSeq2Seq.from_pretrained(MODEL_ID, **_load_options)
	    model.eval()
	    device = "CUDA" if torch.cuda.is_available() else "CPU"
	    print(f"MERaLiON-2-3B loaded on {device}.")
	except Exception as e:
	    print(f"Failed to load MERaLiON: {e}")
	    model = None
	    processor = None

	# ─────────────────────────────────────────────────────────────────────────────
	# SESSION STORE
	# ─────────────────────────────────────────────────────────────────────────────
	# Process-wide registry of conversation sessions, keyed by session id.
	sessions = {}


	def get_session(session_id):
	    """Return the session dict for ``session_id``, creating it on first use.

	    A new session starts with an empty ``history`` list and no
	    ``pending_action``.
	    """
	    try:
	        return sessions[session_id]
	    except KeyError:
	        fresh = {"history": [], "pending_action": None}
	        sessions[session_id] = fresh
	        return fresh

	def add_to_history(session, role, text):
	    """Record one turn in ``session['history']``, keeping the six newest turns.

	    Each turn is stored as ``{"role": role, "text": text}``.
	    """
	    turns = session["history"]
	    turns.append({"role": role, "text": text})
	    if len(turns) > 6:
	        # Rebind to a trimmed copy so only the most recent six remain.
	        session["history"] = turns[-6:]

	def format_history(session):
	    """Render the last four turns as ``"User: …"`` / ``"Companion: …"`` lines.

	    Returns an empty string when there is no history; otherwise the lines
	    joined by newlines with one trailing newline.
	    """
	    turns = session["history"]
	    if not turns:
	        return ""
	    rendered = ""
	    for turn in turns[-4:]:
	        # Any role other than "user" is shown as the companion.
	        speaker = "Companion" if turn["role"] != "user" else "User"
	        rendered += f"{speaker}: {turn['text']}\n"
	    return rendered

	# ─────────────────────────────────────────────────────────────────────────────
	# INTENT DETECTION
	# ─────────────────────────────────────────────────────────────────────────────
	YES_WORDS = ["yes","ya","yep","yeah","yah","ok","okay","sure","can","can lah",
	             "ok lah","yes lah","go ahead","do it","please","confirm","alright",
	             "fine","why not","of course","ya can","can can","shiok","好","好的","可以"]
	NO_WORDS = ["no","nope","nah","no need","don't","dont","cancel","never mind",
	            "nevermind","no lah","nah lah","forget it","skip","not now","later",
	            "no thanks","不用","不要","算了"]

	SCAM_KEYWORDS = ["scam","kena scam","suspicious","fraud","phishing","transfer money",
	                 "bank account","otp","lucky draw","prize","win","stranger called",
	                 "unknown number","weird sms","suspicious link","lost money","cheated"]
	FOOD_HINTS = ["hungry","makan","eat","food","hungry lah","very hungry","starving",
	              "want to eat","craving","order food","grab food","supper","lunch",
	              "dinner","breakfast","没吃","饿"]
	FOOD_ITEMS = ["chicken rice","nasi lemak","wonton mee","char kway teow","bak chor mee",
	              "roti prata","laksa","mee goreng","pizza","burger","chicken noodles",
	              "fried rice","noodles","rice","chicken","fish","beef","prawn","prata","mee",
	              "porridge","ban mian","hokkien mee","satay","bak kut teh"]
	CALENDAR_HINTS = ["tomorrow","next week","monday","tuesday","wednesday","thursday","friday",
	                  "saturday","sunday","tonight","this afternoon","this morning","this evening",
	                  "must go","need to go","have to go","going to","got to"]
	CALENDAR_ACTIONS = ["schedule","meeting","appointment","book","remind","set reminder",
	                    "add to calendar","block out","calendar","add event","create event"]


	def _phrase_regex(phrases, tail=r"(?!\w)"):
	    """Compile one pattern matching any of ``phrases`` in lower-cased text.

	    ASCII phrases must begin at a word boundary and be followed by ``tail``
	    (default: end of word), so "can" no longer fires inside "cancel" and
	    "mee" no longer fires inside "meeting". Non-ASCII phrases (the Chinese
	    entries) keep plain substring matching because running CJK text has no
	    word boundaries to anchor on.
	    """
	    alternatives = []
	    for phrase in phrases:
	        escaped = re.escape(phrase)
	        if phrase.isascii():
	            alternatives.append(rf"(?<!\w){escaped}{tail}")
	        else:
	            alternatives.append(escaped)
	    return re.compile("|".join(alternatives))


	# Yes/no answers trigger real actions, so they must match as whole words.
	_YES_RE = _phrase_regex(YES_WORDS)
	_NO_RE = _phrase_regex(NO_WORDS)
	# Topic keywords only need to START a word, so inflections such as
	# "scammed" or "eating" still count while mid-word hits ("hotpot") do not.
	_SCAM_RE = _phrase_regex(SCAM_KEYWORDS, tail="")
	_FOOD_HINT_RE = _phrase_regex(FOOD_HINTS, tail="")
	_CALENDAR_HINT_RE = _phrase_regex(CALENDAR_HINTS, tail="")
	_CALENDAR_ACTION_RE = _phrase_regex(CALENDAR_ACTIONS, tail="")
	# Dishes: whole word with an optional plural; longest names are tried first
	# so "hokkien mee" wins over the bare "mee".
	_FOOD_ITEM_RES = [
	    (item, _phrase_regex([item], tail=r"(?:e?s)?(?!\w)"))
	    for item in sorted(FOOD_ITEMS, key=len, reverse=True)
	]
	# Free-form request: "<verb> [some] <dish>". A following "to" is skipped so
	# "have to go" or "want to sleep" is not read as a dish, while
	# "want to eat mala" still resolves through the later verb "eat".
	_FOOD_REQUEST_RE = re.compile(
	    r'\b(?:want|order|eat|have|get|bring me)\s+(?!to\b)(?:some\s+)?(.+?)(?:\s*$|[.,])'
	)


	def is_confirmation(t):
	    """Return True when ``t`` contains one of YES_WORDS as a word/phrase."""
	    return _YES_RE.search(t.lower()) is not None


	def is_rejection(t):
	    """Return True when ``t`` contains one of NO_WORDS as a word/phrase."""
	    return _NO_RE.search(t.lower()) is not None


	def detect_scam(t):
	    """Return True when ``t`` mentions any scam-related keyword."""
	    return _SCAM_RE.search(t.lower()) is not None


	def detect_food_hint(t):
	    """Return True when ``t`` suggests hunger or wanting food."""
	    return _FOOD_HINT_RE.search(t.lower()) is not None


	def detect_food_item(t):
	    """Return the title-cased dish named in ``t``, or None.

	    Known FOOD_ITEMS are preferred (most specific first). Otherwise the text
	    after a request verb is used when it is between 3 and 29 characters.
	    """
	    tl = t.lower()
	    for item, pattern in _FOOD_ITEM_RES:
	        if pattern.search(tl):
	            return item.title()
	    m = _FOOD_REQUEST_RE.search(tl)
	    if m:
	        candidate = m.group(1).strip()
	        if 2 < len(candidate) < 30:
	            return candidate.title()
	    return None


	def detect_calendar_hint(t):
	    """Classify calendar intent in ``t``.

	    Returns "strong" when both a scheduling action and a time cue appear,
	    "weak" when only a time cue appears, and None otherwise.
	    """
	    tl = t.lower()
	    if not _CALENDAR_HINT_RE.search(tl):
	        return None
	    return "strong" if _CALENDAR_ACTION_RE.search(tl) else "weak"

	# ─────────────────────────────────────────────────────────────────────────────
	# MERALION INFERENCE
	# ─────────────────────────────────────────────────────────────────────────────

	def clean_text(text):
	    """Strip model control tokens and transcript scaffolding from ``text``.

	    Removes the special tokens listed below, drops blank lines, lines that
	    start with "User:" and bullet lines starting with "* ", strips a leading
	    "Companion:" label, and joins what remains with single spaces.
	    Returns "" for empty or None input.
	    """
	    if not text:
	        return ""
	    # Full tags come before the bare "<|" / "|>" delimiters so a tag is
	    # removed whole instead of leaving its name behind.
	    special_tokens = (
	        "<|audio|>", "<|chat|>", "<|text|>", "<|", "|>", "<SpeechHere>",
	        "<Speaker1>", "<Speaker2>", "<bos>", "<eos>",
	    )
	    for token in special_tokens:
	        text = text.replace(token, "")
	    kept = []
	    for line in text.split("\n"):
	        line = line.strip()
	        if not line or line.startswith(("User:", "* ")):
	            continue
	        if line.startswith("Companion:"):
	            # Drop only the leading speaker label, not later mentions.
	            line = line[len("Companion:"):].strip()
	            if not line:
	                continue
	        kept.append(line)
	    return " ".join(kept).strip()

	def run_meralion(audio_array, query, max_tokens=150):
	    """Run one audio-conditioned generation and return the cleaned text.

	    ``query`` is the text instruction paired with ``audio_array`` (passed to
	    the processor with a 16000 Hz sampling rate). Returns "" when the model
	    or processor failed to load.
	    """
	    if model is None or processor is None:
	        return ""

	    instruction = (
	        f"Instruction: {query} \n"
	        "Follow the text instruction based on the following audio: <SpeechHere>"
	    )
	    messages = [{"role": "user", "content": instruction}]
	    chat_prompt = processor.tokenizer.apply_chat_template(
	        messages, tokenize=False, add_generation_prompt=True
	    )
	    if isinstance(chat_prompt, list):
	        chat_prompt = chat_prompt[0]

	    encoded = processor(text=chat_prompt, audios=audio_array, sampling_rate=16000)
	    target = next(model.parameters()).device
	    inputs = {}
	    for key, value in encoded.items():
	        # Only tensor-like values can be moved to the model's device.
	        inputs[key] = value.to(target) if hasattr(value, "to") else value
	    if "input_features" in inputs:
	        inputs["input_features"] = inputs["input_features"].to(torch.float16)

	    with torch.no_grad():
	        output_ids = model.generate(
	            **inputs,
	            max_new_tokens=max_tokens,
	            do_sample=True,
	            temperature=0.75,
	            top_p=0.9,
	            repetition_penalty=1.1,
	        )

	    # Keep only the tokens produced after the prompt.
	    prompt_length = inputs["input_ids"].shape[1]
	    new_tokens = output_ids[:, prompt_length:]
	    decoded = processor.batch_decode(new_tokens, skip_special_tokens=True)
	    return clean_text(decoded[0].strip())

	def transcribe(audio_array):
	    """Ask the model for a verbatim transcript of ``audio_array`` (up to 200 new tokens)."""
	    instruction = (
	        "Please transcribe what the user said accurately, "
	        "including Singlish, Malay, Mandarin or Tamil words."
	    )
	    return run_meralion(audio_array, instruction, max_tokens=200)

	def companion_reply(audio_array, transcript, session, extra=""):
	    """Generate a short companion reply to ``transcript``.

	    The prompt is the persona, then the recent history (if any), then the
	    optional ``extra`` hint, then the user's latest utterance. Replies of
	    five characters or fewer are replaced by a stock line.
	    """
	    history = format_history(session)
	    sections = [f"{COMPANION_PERSONA}\n\n"]
	    if history:
	        sections.append(f"Conversation so far:\n{history}\n")
	    if extra:
	        sections.append(f"{extra}\n")
	    sections.append(f"The user just said: '{transcript}'. Reply warmly in 1-2 sentences.")
	    reply = run_meralion(audio_array, "".join(sections))
	    if len(reply) > 5:
	        return reply
	    return "Wah, tell me more leh!"

	# ─────────────────────────────────────────────────────────────────────────────
	# TELEGRAM
	# ─────────────────────────────────────────────────────────────────────────────

	def send_telegram(message):
	    """Send ``message`` (Telegram HTML markup) to the configured chat.

	    Best-effort: returns True when the HTTP request completes, and False when
	    the bot is not configured or the request fails. Never raises.
	    """
	    if not TELEGRAM_TOKEN or not TELEGRAM_CHAT_ID:
	        return False
	    url = f"https://api.telegram.org/bot{TELEGRAM_TOKEN}/sendMessage"
	    # Form-encoded POST body: every field is escaped (chat_id included) and
	    # long messages are not squeezed into the URL.
	    payload = urllib.parse.urlencode({
	        "chat_id": TELEGRAM_CHAT_ID,
	        "text": message,
	        "parse_mode": "HTML",
	    }).encode("utf-8")
	    try:
	        # The context manager closes the connection even if reading fails.
	        with urllib.request.urlopen(url, data=payload, timeout=10) as response:
	            response.read()
	        return True
	    except Exception as e:
	        # Deliberate catch-all: notifications must never break the conversation.
	        print(f"Telegram error: {e}")
	        return False

	# ─────────────────────────────────────────────────────────────────────────────
	# EVENT EXTRACTION
	# ─────────────────────────────────────────────────────────────────────────────

	def extract_event(transcript):
	    """Derive a calendar event from free-form speech.

	    Returns ``{"title": str, "date": "YYYY-MM-DD", "time": "HH:MM"}``.

	    Date: "tomorrow"; otherwise same-day cues ("today", "tonight",
	    "this morning/afternoon/evening"); otherwise the next occurrence of a
	    named weekday; otherwise "next week" (+7 days); otherwise tomorrow.
	    Time: an explicit "3pm" / "3:30 pm" / "3.30pm"; otherwise a part-of-day
	    default; otherwise 10:00.
	    """
	    tl = transcript.lower()
	    # Events are created in the Asia/Singapore zone, so "today" must be the
	    # Singapore date (UTC+8, no DST) rather than the host's local date.
	    singapore = datetime.timezone(datetime.timedelta(hours=8))
	    today = datetime.datetime.now(singapore).date()

	    same_day_cues = ("today", "tonight", "this morning", "this afternoon", "this evening")
	    weekdays = ("monday", "tuesday", "wednesday", "thursday",
	                "friday", "saturday", "sunday")
	    if "tomorrow" in tl:
	        offset = 1
	    elif any(cue in tl for cue in same_day_cues):
	        offset = 0
	    else:
	        offset = None
	        for index, name in enumerate(weekdays):
	            if name in tl:
	                # Next occurrence; naming today's weekday means a week from now.
	                offset = (index - today.weekday()) % 7 or 7
	                break
	        if offset is None:
	            offset = 7 if "next week" in tl else 1
	    event_date = today + datetime.timedelta(days=offset)

	    event_time = None
	    m = re.search(r'\b(\d{1,2})(?:[:.](\d{2}))?\s*(am|pm)\b', tl)
	    if m:
	        hour = int(m.group(1))
	        minute = int(m.group(2) or 0)
	        # Ignore impossible clock readings instead of emitting e.g. "37:00".
	        if 1 <= hour <= 12 and minute < 60:
	            hour = hour % 12 + (12 if m.group(3) == "pm" else 0)
	            event_time = f"{hour:02d}:{minute:02d}"
	    if event_time is None:
	        if "morning" in tl:
	            event_time = "09:00"
	        elif "afternoon" in tl:  # checked before "noon", which it contains
	            event_time = "14:00"
	        elif "evening" in tl or "tonight" in tl:
	            event_time = "19:00"
	        elif "noon" in tl:
	            event_time = "12:00"
	        else:
	            event_time = "10:00"

	    title = "Reminder"
	    # Runs on the original casing: a capitalised word after "with"/"meet"
	    # is taken to be a person's name.
	    wm = re.search(r'\b(?:with|meet)\s+([A-Z][a-z]+(?:\s+[A-Z][a-z]+)?)', transcript)
	    if wm:
	        title = f"Meeting with {wm.group(1)}"
	    elif "meeting" in tl:
	        title = "Meeting"
	    elif "lunch" in tl:
	        title = "Lunch"
	    elif "dinner" in tl:
	        title = "Dinner"
	    elif "office" in tl:
	        title = "Back to Office"
	    elif "call" in tl:
	        title = "Call"
	    return {"title": title, "date": str(event_date), "time": event_time}

	# ─────────────────────────────────────────────────────────────────────────────
	# AGENT EXECUTORS
	# ─────────────────────────────────────────────────────────────────────────────

	def execute_scam(transcript):
	    """Send a scam alert to Telegram and return the spoken warning.

	    A Singapore-style phone number (8 digits starting with 6, 8 or 9, with
	    an optional 65 prefix) is extracted from ``transcript`` when present and
	    quoted in both the alert and the reply.
	    """
	    import html

	    pm = re.search(r'\b(\+?65)?[\s-]?([689]\d{7})\b', transcript)
	    phone = pm.group(0).strip() if pm else "unknown number"
	    # The alert is sent with parse_mode=HTML, so user speech must be escaped
	    # or a stray "<" / "&" makes Telegram reject the whole message.
	    excerpt = html.escape(transcript[:100], quote=False)
	    send_telegram(f"🚨 <b>ROAR Scam Alert</b>\n\n📞 {phone}\n💬 \"{excerpt}\"\n\n⚠️ Call ScamShield 1799")
	    if phone != "unknown number":
	        return f"🛡️ I've flagged {phone} and sent an alert. Don't transfer money or share your OTP. Call ScamShield at 1799 now!"
	    return "🛡️ Alamak, sounds very suspicious! I've logged a scam alert. Don't transfer any money — call ScamShield at 1799 immediately!"

	def execute_food(food_item):
	    """Announce a food order on Telegram and return the spoken confirmation.

	    The ETA is the order time plus 25 minutes; both are shown as HH:MM in
	    Singapore time.
	    """
	    import html

	    # Sample the clock once so "ordered" and "ETA" are exactly 25 minutes
	    # apart, and use Singapore time (UTC+8, no DST) rather than the host's
	    # zone so the times match the user's clock.
	    now = datetime.datetime.now(datetime.timezone(datetime.timedelta(hours=8)))
	    ot = now.strftime("%H:%M")
	    eta = (now + datetime.timedelta(minutes=25)).strftime("%H:%M")
	    # The item can come from free speech; escape it for the HTML-mode message.
	    safe_item = html.escape(str(food_item), quote=False)
	    send_telegram(f"🍜 <b>ROAR Food Order!</b>\n\n🛒 {safe_item}\n⏰ Ordered: {ot}\n🚴 ETA: {eta}\n\n✅ Confirmed via ROAR!")
	    return f"🍜 Shiok! Order placed for {food_item}. ETA {eta}. Check Telegram for confirmation!"

	def execute_calendar(transcript):
	    """Create a one-hour Google Calendar event described by ``transcript``.

	    On success the event is inserted into the primary calendar and announced
	    on Telegram. On any failure (missing libraries, no usable credentials,
	    API error) the event details are sent to Telegram for manual entry
	    instead. Always returns the text to speak back to the user.
	    """
	    event = extract_event(transcript)
	    try:
	        from google_auth_oauthlib.flow import InstalledAppFlow
	        from google.auth.transport.requests import Request
	        from googleapiclient.discovery import build
	        import pickle

	        SCOPES = ['https://www.googleapis.com/auth/calendar']
	        token_path = '/tmp/token.pickle'
	        creds = None

	        if os.path.exists(token_path):
	            # NOTE(security): unpickling runs arbitrary code if this file was
	            # written by someone else; /tmp is commonly world-writable.
	            with open(token_path, 'rb') as f:
	                creds = pickle.load(f)
	        if not creds or not creds.valid:
	            if creds and creds.expired and creds.refresh_token:
	                creds.refresh(Request())
	            elif GOOGLE_CREDS_JSON:
	                with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f:
	                    f.write(GOOGLE_CREDS_JSON)
	                    creds_file = f.name
	                try:
	                    flow = InstalledAppFlow.from_client_secrets_file(creds_file, SCOPES)
	                    # NOTE(review): run_local_server expects an interactive
	                    # browser consent; presumably it cannot complete on a
	                    # headless host. Confirm how the first token is obtained.
	                    creds = flow.run_local_server(port=0)
	                finally:
	                    # Never leave the client secret on disk, even if the flow fails.
	                    os.unlink(creds_file)
	            if not creds or not creds.valid:
	                # Fail fast into the manual fallback instead of caching and
	                # using credentials that cannot work.
	                raise RuntimeError("No valid Google credentials available")
	            with open(token_path, 'wb') as f:
	                pickle.dump(creds, f)

	        service = build('calendar', 'v3', credentials=creds)
	        start_dt = datetime.datetime.strptime(f"{event['date']} {event['time']}", "%Y-%m-%d %H:%M")
	        end_dt = start_dt + datetime.timedelta(hours=1)
	        cal_event = {
	            'summary': event['title'],
	            'start': {'dateTime': start_dt.isoformat(), 'timeZone': 'Asia/Singapore'},
	            'end': {'dateTime': end_dt.isoformat(), 'timeZone': 'Asia/Singapore'},
	        }
	        service.events().insert(calendarId='primary', body=cal_event).execute()
	        send_telegram(f"📅 <b>ROAR Calendar</b>\n\n✅ {event['title']}\n🗓 {event['date']} at {event['time']}")
	        return f"📅 Done lah! '{event['title']}' added to your Google Calendar on {event['date']} at {event['time']}!"
	    except Exception as e:
	        # Deliberate catch-all: the user still gets a reminder via Telegram.
	        print(f"Calendar error: {e}")
	        send_telegram(f"📅 <b>ROAR Calendar</b>\n📌 {event['title']}\n🗓 {event['date']} at {event['time']}\n⚠️ Add manually")
	        return f"📅 Noted! '{event['title']}' on {event['date']} at {event['time']}. Check Telegram!"

	# ─────────────────────────────────────────────────────────────────────────────
	# CONVERSATION HANDLER
	# ─────────────────────────────────────────────────────────────────────────────

	def handle_conversation(audio_array, transcript, session):
	    """Route one utterance and return ``(reply_text, agent_label)``.

	    If the previous turn left a pending question, a yes/no answer resolves
	    it first. Otherwise (or when the answer is neither) the utterance is
	    checked in this order: scam, strong calendar intent, named dish, hunger
	    hint, weak calendar intent, and finally free companion chat.
	    """
	    awaiting = session.get("pending_action")
	    if awaiting:
	        confirmed = is_confirmation(transcript)
	        declined = (not confirmed) and is_rejection(transcript)
	        # Whatever the answer, the earlier question is now consumed.
	        session["pending_action"] = None

	        if declined:
	            reply = companion_reply(audio_array, transcript, session, "User declined your suggestion. ")
	            return reply or "Ok lah, no problem! Anything else?", "MERaLiON"

	        if confirmed:
	            kind = awaiting["type"]
	            if kind == "scam":
	                return execute_scam(awaiting["transcript"]), "Scam Agent"
	            if kind == "food":
	                dish = awaiting.get("food_item") or detect_food_item(transcript) or "Your order"
	                return execute_food(dish), "Food Agent"
	            if kind == "food_item_needed":
	                dish = detect_food_item(transcript)
	                if dish:
	                    return execute_food(dish), "Food Agent"
	                # Still no dish named: ask again and keep waiting.
	                session["pending_action"] = {"type":"food_item_needed","transcript":transcript}
	                return "What would you like ah? Chicken rice? Nasi lemak?", "MERaLiON"
	            if kind == "calendar":
	                return execute_calendar(awaiting["transcript"]), "Calendar Agent"
	        # Unrecognised answer or pending type: treat as a fresh utterance.

	    # Scam reports act immediately, without asking for confirmation.
	    if detect_scam(transcript):
	        return execute_scam(transcript), "Scam Agent"

	    strength = detect_calendar_hint(transcript)
	    if strength == "strong":
	        ev = extract_event(transcript)
	        session["pending_action"] = {"type":"calendar","transcript":transcript}
	        return f"📅 Want me to add '{ev['title']}' to your calendar on {ev['date']} at {ev['time']}?", "MERaLiON"

	    dish = detect_food_item(transcript)
	    if dish:
	        session["pending_action"] = {"type":"food","transcript":transcript,"food_item":dish}
	        return f"🍜 {dish} sounds good leh! Want me to order it for you?", "MERaLiON"

	    if detect_food_hint(transcript):
	        session["pending_action"] = {"type":"food_item_needed","transcript":transcript}
	        return "Aiyo hungry ah! What you feel like eating?", "MERaLiON"

	    if strength == "weak":
	        ev = extract_event(transcript)
	        session["pending_action"] = {"type":"calendar","transcript":transcript}
	        return f"Wah sounds like you got something on {ev['date']} {ev['time']}. Want me to add to your calendar?", "MERaLiON"

	    chat = companion_reply(audio_array, transcript, session)
	    return chat or "Tell me more leh!", "MERaLiON"

	# ─────────────────────────────────────────────────────────────────────────────
	# TTS
	# ─────────────────────────────────────────────────────────────────────────────

	def detect_voice(text):
	    """Pick the Chinese voice when over half of the non-space characters are CJK ideographs."""
	    visible = [ch for ch in text if ch.strip()]
	    if not visible:
	        return TTS_VOICE_EN
	    han_count = sum(1 for ch in text if '\u4e00' <= ch <= '\u9fff')
	    if han_count / len(visible) > 0.5:
	        return TTS_VOICE_ZH
	    return TTS_VOICE_EN

	async def tts_async(text, voice):
	    """Stream speech for ``text`` in ``voice`` from edge-tts and return the audio bytes."""
	    import edge_tts

	    pieces = []
	    async for chunk in edge_tts.Communicate(text, voice).stream():
	        # The stream also yields non-audio chunk types; keep the audio only.
	        if chunk["type"] == "audio":
	            pieces.append(chunk["data"])
	    return b"".join(pieces)

	def tts(text):
	    """Synthesise ``text`` and return the audio bytes.

	    Characters outside word characters, whitespace and basic punctuation are
	    removed first (this drops emoji). If synthesis fails, it is retried once
	    with Chinese characters removed and the English voice.
	    """
	    spoken = re.sub(r'[^\w\s\.,!?\-\'\"：。，！？]','',text).strip()
	    if len(spoken) < 3:
	        spoken = "Done."
	    voice = detect_voice(spoken)
	    try:
	        return asyncio.run(tts_async(spoken, voice))
	    except Exception:
	        fallback = re.sub(r'[\u4e00-\u9fff\u3000-\u303f]','',spoken).strip()
	        if len(fallback) < 3:
	            fallback = "I hear you. Tell me more."
	        return asyncio.run(tts_async(fallback, TTS_VOICE_EN))

	# ─────────────────────────────────────────────────────────────────────────────
	# GRADIO INTERFACE
	# ─────────────────────────────────────────────────────────────────────────────

	def _prepare_audio(sr, audio_array):
	    """Convert a recorded clip to mono float32 in [-1, 1] at 16 kHz."""
	    samples = np.asarray(audio_array)
	    # Decide the scale from the ORIGINAL dtype: signed integer PCM is divided
	    # by its full-scale value (32768 for int16).
	    if np.issubdtype(samples.dtype, np.signedinteger):
	        full_scale = float(np.iinfo(samples.dtype).max) + 1.0
	    else:
	        full_scale = None
	    samples = samples.astype(np.float32)
	    # Downmix BEFORE resampling. Assumes multi-channel input is laid out as
	    # (samples, channels); resampling that layout directly would run along
	    # the channel axis.
	    if samples.ndim > 1:
	        samples = samples.mean(axis=1)
	    if full_scale is not None:
	        samples = samples / full_scale
	    elif np.abs(samples).max() > 1.0:
	        # Float data still in the 16-bit PCM range; use the magnitude so
	        # clips whose peak is negative are scaled too.
	        samples = samples / 32768.0
	    if sr != 16000:
	        try:
	            import librosa
	            samples = librosa.resample(samples, orig_sr=sr, target_sr=16000)
	        except ImportError:
	            # Best effort: continue, but say so because recognition will suffer.
	            print(f"librosa not installed; using {sr} Hz audio without resampling")
	    return samples.astype(np.float32)


	def process_voice(audio, session_state):
	    """Main Gradio handler: take a recording, return the reply text and speech.

	    ``audio`` is Gradio's ``(sample_rate, numpy_array)`` tuple, or None.
	    Returns ``(session_state, display_text, audio_path_or_None, transcript)``.
	    Errors are reported in ``display_text`` rather than raised.
	    """
	    if audio is None:
	        return session_state, "Please hold and speak...", None, ""

	    try:
	        sr, audio_array = audio
	        # An empty recording has nothing to transcribe.
	        if audio_array is None or np.size(audio_array) == 0:
	            return session_state, "Please hold and speak...", None, ""

	        session_id = session_state.get("id", "default")
	        session = get_session(session_id)

	        audio_array = _prepare_audio(sr, audio_array)

	        # Transcribe + respond
	        transcript = transcribe(audio_array)
	        response_text, agent_name = handle_conversation(audio_array, transcript, session)

	        add_to_history(session, "user", transcript)
	        add_to_history(session, "assistant", response_text)

	        # TTS
	        tts_text = response_text if len(response_text) >= 5 else "Done."
	        audio_bytes = tts(tts_text)

	        # Save to a temp file for Gradio to serve.
	        # NOTE(review): delete=False files are never removed by this code.
	        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
	            f.write(audio_bytes)
	            audio_path = f.name

	        display = f"[{agent_name}] {response_text}\n\nYou said: \"{transcript}\""
	        return session_state, display, audio_path, transcript

	    except Exception as e:
	        traceback.print_exc()
	        return session_state, f"Sorry, something went wrong: {str(e)}", None, ""


	# Build Gradio UI
	import uuid

	# Build the Gradio UI: one microphone input, the reply as Markdown, the
	# spoken reply as autoplaying audio, and the raw transcript.
	with gr.Blocks(
	    title="ROAR — MERaLiON Companion",
	    theme=gr.themes.Base(),
	    css="""
	.gradio-container { max-width: 600px; margin: auto; }
	.title { text-align: center; color: #6B2D8B; }
	.subtitle { text-align: center; color: #888; margin-bottom: 20px; }
	"""
	) as demo:

	    gr.Markdown("# 🦁 ROAR — MERaLiON Companion", elem_classes="title")
	    gr.Markdown(
	        "Speak in Singlish, Mandarin, Malay or Tamil. "
	        "ROAR understands you and routes to the right agent.",
	        elem_classes="subtitle"
	    )

	    # A callable initial value is evaluated per page load, so each visitor
	    # gets a private session id. A constant id would make all visitors share
	    # one history and one pending action.
	    session_state = gr.State(lambda: {"id": uuid.uuid4().hex})

	    with gr.Row():
	        audio_input = gr.Audio(
	            sources=["microphone"],
	            type="numpy",
	            label="🎤 Hold to speak",
	        )

	    response_box = gr.Markdown("Waiting for your voice...")
	    audio_output = gr.Audio(label="🔊 MERaLiON responds", autoplay=True)
	    transcript_box = gr.Textbox(label="📝 What MERaLiON heard", interactive=False)

	    gr.Markdown("""
	Try saying:
	- "Aiyo so sian today lah" → Companion chat
	- "I very hungry, want chicken rice" → 🍜 Food Agent
	- "Schedule meeting with Lawrence tomorrow 3pm" → 📅 Calendar Agent
	- "I think I kena scam" → 🛡️ Scam Agent
	""")

	    # Fires whenever the recording changes, including when it is cleared
	    # (process_voice handles the resulting None).
	    audio_input.change(
	        fn=process_voice,
	        inputs=[audio_input, session_state],
	        outputs=[session_state, response_box, audio_output, transcript_box]
	    )

	if __name__ == "__main__":
	    demo.launch()