# Hugging Face Space: Language Tutor (page-scrape residue removed; status was "Running")
| import gradio as gr | |
| from huggingface_hub import InferenceClient | |
| from dotenv import load_dotenv | |
| from voice_handler import ( | |
| create_stt_provider, | |
| create_tts_provider, | |
| get_available_stt_providers, | |
| get_available_tts_providers, | |
| get_voices_for_provider, | |
| get_available_languages, | |
| get_language_code, | |
| get_default_voice_for_language, | |
| VoiceConfig | |
| ) | |
# Load variables from a local .env file; override=True lets the .env values
# take precedence over anything already set in the process environment.
load_dotenv(override=True)

# Initialize the Hugging Face Inference Client
# model_name is the full Hub repo id used for API calls; short_model_name is
# the human-readable label shown in the UI.
model_name = "swiss-ai/Apertus-8B-Instruct-2509"
short_model_name = "Apertus-8B-Instruct"
client = InferenceClient(model=model_name)
def format_messages(message, chat_history, system_prompt):
    """Build the messages list for a chat-completion call.

    Args:
        message: Current user message (string).
        chat_history: Prior turns, already in OpenAI-style messages format
            (list of {"role": ..., "content": ...} dicts).
        system_prompt: Optional system prompt. Skipped when None, empty,
            or whitespace-only.

    Returns:
        New list of message dicts: optional system turn, then the history
        entries (reused as-is), then the current user turn.
    """
    messages = []
    # Guard against None as well as blank strings: the original
    # `system_prompt.strip()` raised AttributeError for a missing prompt.
    if system_prompt and system_prompt.strip():
        messages.append({"role": "system", "content": system_prompt})
    # History entries are already shaped correctly; extend with them as-is.
    messages.extend(chat_history)
    # Add current message
    messages.append({"role": "user", "content": message})
    return messages
def create_language_tutor_prompt(native_language, target_language, enable_translations=True):
    """Compose the tutor system prompt for a native/target language pair.

    Args:
        native_language: The user's native language.
        target_language: The language the user wants to practice.
        enable_translations: When True, the tutor is told to offer native
            language hints; when False, it is told to stay immersive.

    Returns:
        The assembled system prompt string.
    """
    # Pick the translation-policy bullet points for the selected mode.
    if enable_translations:
        translation_guidance = f"""- Provide {native_language} translations when the user seems confused or asks for help
- Include {native_language} explanations in parentheses when helpful"""
    else:
        translation_guidance = f"""- Keep responses entirely in {target_language} for full immersion
- Only use {native_language} if the user explicitly asks for translation or clarification"""

    # Interpolate the guidance into the full prompt template and return it.
    return f"""You are an expert language tutor helping a {native_language} speaker learn {target_language}.
Your role:
- Respond primarily in {target_language} to provide immersive practice
{translation_guidance}
- Correct mistakes gently and explain grammar rules when appropriate
- Adjust your vocabulary and sentence complexity based on the user's level
- Ask engaging questions to encourage conversation practice
- Provide cultural context when relevant
- Be patient, encouraging, and supportive
Guidelines:
- Keep responses conversational and natural
- Use {target_language} for the main response
- Praise progress and provide constructive feedback
- Adapt difficulty based on the user's responses
Start by greeting the user and asking what they'd like to practice today."""
def transcribe_audio(audio_path, stt_provider_name):
    """Transcribe an audio file to text via the selected STT provider.

    Args:
        audio_path: Filesystem path to the recording, or None.
        stt_provider_name: Provider key understood by create_stt_provider.

    Returns:
        The transcription, an empty string when no audio was supplied, or a
        bracketed error marker when transcription fails.
    """
    if audio_path is None:
        return ""
    try:
        provider = create_stt_provider(stt_provider_name)
        return provider.transcribe(audio_path)
    except Exception as exc:
        # Return the failure as text so it surfaces in the message box
        # instead of crashing the Gradio event handler.
        return f"[Transcription Error: {str(exc)}]"
def synthesize_speech(text, tts_provider_name, tts_voice, target_language="English"):
    """Convert text to speech with the selected TTS provider.

    Args:
        text: Text to speak; None/empty/whitespace-only produces no audio.
        tts_provider_name: Provider key understood by create_tts_provider.
        tts_voice: Voice identifier for the provider.
        target_language: Language name, mapped to a code for the provider.

    Returns:
        Path to the generated audio file, or None when there is nothing to
        say or synthesis fails.
    """
    # Nothing to synthesize for missing or blank text.
    if not (text and text.strip()):
        return None
    try:
        code = get_language_code(target_language)
        provider = create_tts_provider(tts_provider_name, voice=tts_voice, language=code)
        return provider.synthesize(text)
    except Exception as exc:
        # Voice output is best-effort: log and fall back to text-only.
        print(f"TTS Error: {str(exc)}")
        return None
def update_voice_dropdown(tts_provider_name, target_language="English"):
    """Rebuild the TTS voice dropdown for a provider/language pair.

    Args:
        tts_provider_name: Selected TTS provider.
        target_language: Language whose voices should be offered.

    Returns:
        A gr.Dropdown update with the matching voices, preselecting the
        first voice (or None when the provider has no voices).
    """
    voices = get_voices_for_provider(tts_provider_name, get_language_code(target_language))
    default_voice = voices[0] if voices else None
    return gr.Dropdown(choices=voices, value=default_voice)
def chat(message, chat_history, system_prompt, max_tokens, temperature, top_p,
         enable_tts, tts_provider_name, tts_voice, target_language):
    """Send the user's message to the hosted model and update chat state.

    Returns:
        3-tuple for the Gradio outputs: cleared textbox value, updated
        chat history (mutated in place), and an optional TTS audio path.
    """
    # Ignore blank submissions without touching the history.
    if not message.strip():
        return "", chat_history, None

    messages = format_messages(message, chat_history, system_prompt)
    try:
        # Call the Hugging Face Inference API (non-streaming).
        response = client.chat_completion(
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            stream=False,
        )
        reply = response.choices[0].message.content

        # Record both sides of the exchange in messages format.
        chat_history.append({"role": "user", "content": message})
        chat_history.append({"role": "assistant", "content": reply})

        # Optionally voice the assistant's reply.
        audio_path = (
            synthesize_speech(reply, tts_provider_name, tts_voice, target_language)
            if enable_tts
            else None
        )
        return "", chat_history, audio_path
    except Exception as exc:
        # Surface the failure in the conversation instead of crashing the UI.
        chat_history.append({"role": "user", "content": message})
        chat_history.append({"role": "assistant", "content": f"Error: {str(exc)}"})
        return "", chat_history, None
def process_voice_input(audio, stt_provider_name):
    """Transcribe a recorded clip for the message box.

    Args:
        audio: Audio file path from the microphone component, or None.
        stt_provider_name: Selected STT provider.

    Returns:
        The transcribed text, or "" when no audio is present.
    """
    return "" if audio is None else transcribe_audio(audio, stt_provider_name)
def clear_voice_input_if_enabled(auto_clear):
    """Clear the voice-input widget after transcription when enabled.

    Args:
        auto_clear: Whether the auto-clear checkbox is ticked.

    Returns:
        None to clear the audio component, or a no-op gr.update() that
        leaves the recording in place for playback.
    """
    return None if auto_clear else gr.update()
# Create Gradio interface.
# FIX: the window title previously said "Apertus-70B" while the backing model
# is Apertus-8B; derive it from short_model_name so they cannot drift apart.
with gr.Blocks(title=f"Language Tutor with {short_model_name}",
               theme=gr.themes.Glass(primary_hue="indigo")) as demo:
    gr.Markdown("# 🌍 Language Tutor")
    gr.Markdown(f"Practice any language with an AI tutor powered by **Swiss AI {short_model_name}** - trained on 1000+ languages!")
    # gr.Markdown("⚠️ **Note**: You may need a Hugging Face token for API access. Set it with `huggingface-cli login` or pass it to InferenceClient.")

    with gr.Row():
        with gr.Column(scale=3):
            chatbot = gr.Chatbot(label="Conversation", height=400, type='messages')

            # Text input section
            with gr.Row():
                msg = gr.Textbox(
                    label="Your Message",
                    placeholder="Type your message here...",
                    scale=4,
                    lines=2
                )
                submit = gr.Button("Send", scale=1, variant="primary")

            # Voice input section
            with gr.Row():
                voice_input = gr.Audio(
                    sources=["microphone"],
                    type="filepath",
                    label="Voice Input (Recording auto-transcribes when you stop)",
                    waveform_options=gr.WaveformOptions(
                        show_controls=False
                    )
                )

            # Voice output section
            voice_output = gr.Audio(
                label="Assistant Voice Response",
                autoplay=True,
                visible=True
            )

            clear = gr.Button("Clear Conversation")

        with gr.Column(scale=1):
            gr.Markdown("### 🌐 Language Settings")
            native_language = gr.Dropdown(
                choices=get_available_languages(),
                value="English",
                label="Your Native Language",
                info="Language for explanations and help"
            )
            target_language = gr.Dropdown(
                choices=get_available_languages(),
                value="German",
                label="Language to Practice",
                info="Language you want to learn"
            )
            enable_translations = gr.Checkbox(
                label="Enable Native Language Hints",
                value=True,
                info="Show translations and explanations in your native language (in parentheses)"
            )
            system_prompt = gr.Textbox(
                label="System Prompt (Auto-generated)",
                placeholder="System prompt is automatically generated based on language selection...",
                lines=5,
                value=create_language_tutor_prompt("English", "German", True),
                interactive=True,
                info="You can customize this if needed",
                visible=False  # Hidden from UI, but still functional in backend
            )

            gr.Markdown("### Voice Settings")
            # NOTE(review): enable_voice_input is not referenced by any event
            # handler below — STT currently runs regardless; confirm intent.
            enable_voice_input = gr.Checkbox(
                label="Enable Voice Input (STT)",
                value=True,
                info="Transcribe voice to text"
            )
            stt_provider = gr.Dropdown(
                choices=get_available_stt_providers(),
                value=VoiceConfig.DEFAULT_STT,
                label="Speech-to-Text Provider",
                info="Choose quality/cost tier"
            )
            auto_clear_recording = gr.Checkbox(
                label="Auto-clear recording after transcription",
                value=True,
                info="Remove check to enable playback. Enables you to play the recording and listen to your pronunciation"
            )
            enable_voice_output = gr.Checkbox(
                label="Enable Voice Output (TTS)",
                value=True,
                info="Convert responses to speech"
            )
            tts_provider = gr.Dropdown(
                choices=get_available_tts_providers(),
                value=VoiceConfig.DEFAULT_TTS,
                label="Text-to-Speech Provider",
                info="Choose quality/cost tier"
            )
            tts_voice = gr.Dropdown(
                choices=get_voices_for_provider(VoiceConfig.DEFAULT_TTS, get_language_code("German")),
                value=get_default_voice_for_language("German", VoiceConfig.DEFAULT_TTS),
                label="TTS Voice",
                info="Voice automatically matched to target language"
            )

            gr.Markdown("### Generation Parameters")
            max_tokens = gr.Slider(
                minimum=50,
                maximum=2048,
                value=512,
                step=50,
                label="Max Tokens",
                info="Maximum length of the response"
            )
            temperature = gr.Slider(
                minimum=0.0,
                maximum=2.0,
                value=0.7,
                step=0.1,
                label="Temperature",
                info="Higher = more creative, Lower = more focused"
            )
            top_p = gr.Slider(
                minimum=0.0,
                maximum=1.0,
                value=0.9,
                step=0.05,
                label="Top P",
                info="Nucleus sampling threshold"
            )

    # Event handlers
    # Regenerate the system prompt whenever languages or the hint toggle change.
    def update_system_prompt(native_lang, target_lang, enable_trans):
        return create_language_tutor_prompt(native_lang, target_lang, enable_trans)

    native_language.change(
        update_system_prompt,
        inputs=[native_language, target_language, enable_translations],
        outputs=[system_prompt]
    )
    target_language.change(
        update_system_prompt,
        inputs=[native_language, target_language, enable_translations],
        outputs=[system_prompt]
    )
    enable_translations.change(
        update_system_prompt,
        inputs=[native_language, target_language, enable_translations],
        outputs=[system_prompt]
    )

    # Update TTS voice dropdown when target language or provider changes
    target_language.change(
        update_voice_dropdown,
        inputs=[tts_provider, target_language],
        outputs=[tts_voice]
    )
    tts_provider.change(
        update_voice_dropdown,
        inputs=[tts_provider, target_language],
        outputs=[tts_voice]
    )

    # Text message submit (button click and Enter key share one handler).
    submit.click(
        chat,
        inputs=[msg, chatbot, system_prompt, max_tokens, temperature, top_p,
                enable_voice_output, tts_provider, tts_voice, target_language],
        outputs=[msg, chatbot, voice_output]
    )
    msg.submit(
        chat,
        inputs=[msg, chatbot, system_prompt, max_tokens, temperature, top_p,
                enable_voice_output, tts_provider, tts_voice, target_language],
        outputs=[msg, chatbot, voice_output]
    )

    # Automatic voice input transcription when recording stops, then auto-send
    # to the LLM, then optionally clear the recording widget.
    voice_input.stop_recording(
        process_voice_input,
        inputs=[voice_input, stt_provider],
        outputs=[msg]
    ).then(
        chat,
        inputs=[msg, chatbot, system_prompt, max_tokens, temperature, top_p,
                enable_voice_output, tts_provider, tts_voice, target_language],
        outputs=[msg, chatbot, voice_output]
    ).then(
        clear_voice_input_if_enabled,
        inputs=[auto_clear_recording],
        outputs=[voice_input]
    )

    # Also trigger transcription on audio change (for uploaded files)
    # Note: No auto-send here to avoid duplicate calls when stop_recording fires
    voice_input.change(
        process_voice_input,
        inputs=[voice_input, stt_provider],
        outputs=[msg]
    )

    # Clear conversation history and any pending voice output.
    clear.click(
        lambda: ([], None),
        outputs=[chatbot, voice_output]
    )
# Launch the app locally (no public share link) and open the browser.
if __name__ == "__main__":
    demo.launch(share=False, inbrowser=True)