# Language_Tutor / language_tutor.py
import gradio as gr
from huggingface_hub import InferenceClient
from dotenv import load_dotenv
from voice_handler import (
create_stt_provider,
create_tts_provider,
get_available_stt_providers,
get_available_tts_providers,
get_voices_for_provider,
get_available_languages,
get_language_code,
get_default_voice_for_language,
VoiceConfig
)
load_dotenv(override=True)
# Initialize the Hugging Face Inference Client
model_name = "swiss-ai/Apertus-8B-Instruct-2509"
short_model_name = "Apertus-8B-Instruct"
client = InferenceClient(model=model_name)
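# If the endpoint requires authentication, huggingface_hub resolves a token from
# the HF_TOKEN environment variable (loadable via the .env file above) or from a
# prior `huggingface-cli login`; no explicit token argument is needed here.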
def format_messages(message, chat_history, system_prompt):
"""Format the conversation into messages list."""
messages = []
# Add system prompt if provided
if system_prompt.strip():
messages.append({"role": "system", "content": system_prompt})
# Add chat history (already in messages format)
messages.extend(chat_history)
# Add current message
messages.append({"role": "user", "content": message})
return messages
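# Illustrative call (not executed), showing the shape of the returned list:
#   history = [{"role": "user", "content": "Hallo"},
#              {"role": "assistant", "content": "Hallo! Wie kann ich helfen?"}]
#   format_messages("Wie geht's?", history, "You are a tutor.")
#   -> [{"role": "system", "content": "You are a tutor."},
#       *history,
#       {"role": "user", "content": "Wie geht's?"}]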
def create_language_tutor_prompt(native_language, target_language, enable_translations=True):
"""
Create a system prompt for the language tutor based on native and target languages.
Args:
native_language: User's native language
target_language: Language the user wants to learn
enable_translations: Whether to include native language translations
Returns:
System prompt string
"""
translation_guidance = ""
if enable_translations:
translation_guidance = f"""- Provide {native_language} translations when the user seems confused or asks for help
- Include {native_language} explanations in parentheses when helpful"""
else:
translation_guidance = f"""- Keep responses entirely in {target_language} for full immersion
- Only use {native_language} if the user explicitly asks for translation or clarification"""
prompt = f"""You are an expert language tutor helping a {native_language} speaker learn {target_language}.
Your role:
- Respond primarily in {target_language} to provide immersive practice
{translation_guidance}
- Correct mistakes gently and explain grammar rules when appropriate
- Adjust your vocabulary and sentence complexity based on the user's level
- Ask engaging questions to encourage conversation practice
- Provide cultural context when relevant
- Be patient, encouraging, and supportive
Guidelines:
- Keep responses conversational and natural
- Use {target_language} for the main response
- Praise progress and provide constructive feedback
- Adapt difficulty based on the user's responses
Start by greeting the user and asking what they'd like to practice today."""
return prompt
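# Illustrative call (not executed):
#   create_language_tutor_prompt("English", "German", enable_translations=False)
# returns an immersion-style prompt that keeps replies in German and reserves
# English for explicit translation requests.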
def transcribe_audio(audio_path, stt_provider_name):
"""
Transcribe audio to text using selected STT provider.
Args:
audio_path: Path to audio file
stt_provider_name: Name of STT provider
Returns:
Transcribed text or error message
"""
if audio_path is None:
return ""
try:
stt_provider = create_stt_provider(stt_provider_name)
text = stt_provider.transcribe(audio_path)
return text
except Exception as e:
return f"[Transcription Error: {str(e)}]"
def synthesize_speech(text, tts_provider_name, tts_voice, target_language="English"):
"""
Synthesize text to speech using selected TTS provider.
Args:
text: Text to synthesize
tts_provider_name: Name of TTS provider
tts_voice: Voice to use
target_language: Target language name for TTS
Returns:
Path to generated audio file or None if failed
"""
if not text or not text.strip():
return None
try:
language_code = get_language_code(target_language)
tts_provider = create_tts_provider(tts_provider_name, voice=tts_voice, language=language_code)
audio_path = tts_provider.synthesize(text)
return audio_path
except Exception as e:
print(f"TTS Error: {str(e)}")
return None
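# Illustrative usage (voice names come from get_voices_for_provider()):
#   synthesize_speech("Guten Tag!", VoiceConfig.DEFAULT_TTS, voice, "German")
# returns a playable audio file path on success and None on failure, so the
# caller can skip voice output without raising.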
def update_voice_dropdown(tts_provider_name, target_language="English"):
"""
Update the voice dropdown based on selected TTS provider and target language.
Args:
tts_provider_name: Name of TTS provider
target_language: Target language for voice selection
Returns:
Updated dropdown configuration
"""
language_code = get_language_code(target_language)
voices = get_voices_for_provider(tts_provider_name, language_code)
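    # In Gradio 4+, returning a fresh component instance from an event handler
    # is applied as an update (new choices and value) to the existing dropdown.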
return gr.Dropdown(choices=voices, value=voices[0] if voices else None)
def chat(message, chat_history, system_prompt, max_tokens, temperature, top_p,
enable_tts, tts_provider_name, tts_voice, target_language):
"""Generate a response from the Hugging Face hosted model."""
    if not message or not message.strip():
return "", chat_history, None
# Format the messages
messages = format_messages(message, chat_history, system_prompt)
try:
# Call the Hugging Face Inference API
response = client.chat_completion(
messages=messages,
max_tokens=max_tokens,
temperature=temperature,
top_p=top_p,
stream=False
)
# Extract the assistant's reply
assistant_message = response.choices[0].message.content
# Update chat history with messages format
chat_history.append({"role": "user", "content": message})
chat_history.append({"role": "assistant", "content": assistant_message})
# Generate TTS audio if enabled
audio_output = None
if enable_tts:
audio_output = synthesize_speech(assistant_message, tts_provider_name, tts_voice, target_language)
return "", chat_history, audio_output
except Exception as e:
error_message = f"Error: {str(e)}"
chat_history.append({"role": "user", "content": message})
chat_history.append({"role": "assistant", "content": error_message})
return "", chat_history, None
def process_voice_input(audio, stt_provider_name):
"""
Process voice input and return transcribed text.
Args:
audio: Audio file from microphone
stt_provider_name: Name of STT provider
Returns:
Transcribed text
"""
if audio is None:
return ""
transcribed_text = transcribe_audio(audio, stt_provider_name)
return transcribed_text
def clear_voice_input_if_enabled(auto_clear):
"""
Clear the voice input component if auto-clear is enabled.
Args:
auto_clear: Boolean indicating if auto-clear is enabled
Returns:
None if auto-clear is enabled (clears the audio), otherwise gr.update() to keep it
"""
if auto_clear:
return None
else:
return gr.update()
# Create Gradio interface
with gr.Blocks(title=f"Language Tutor with {short_model_name}", theme=gr.themes.Glass(primary_hue="indigo")) as demo:
gr.Markdown("# 🌍 Language Tutor")
gr.Markdown(f"Practice any language with an AI tutor powered by **Swiss AI {short_model_name}** - trained on 1000+ languages!")
# gr.Markdown("⚠️ **Note**: You may need a Hugging Face token for API access. Set it with `huggingface-cli login` or pass it to InferenceClient.")
with gr.Row():
with gr.Column(scale=3):
chatbot = gr.Chatbot(label="Conversation", height=400, type='messages')
# Text input section
with gr.Row():
msg = gr.Textbox(
label="Your Message",
placeholder="Type your message here...",
scale=4,
lines=2
)
submit = gr.Button("Send", scale=1, variant="primary")
# Voice input section
with gr.Row():
voice_input = gr.Audio(
sources=["microphone"],
type="filepath",
label="Voice Input (Recording auto-transcribes when you stop)",
waveform_options=gr.WaveformOptions(
show_controls=False
)
)
# Voice output section
voice_output = gr.Audio(
label="Assistant Voice Response",
autoplay=True,
visible=True
)
clear = gr.Button("Clear Conversation")
with gr.Column(scale=1):
gr.Markdown("### 🌐 Language Settings")
native_language = gr.Dropdown(
choices=get_available_languages(),
value="English",
label="Your Native Language",
info="Language for explanations and help"
)
target_language = gr.Dropdown(
choices=get_available_languages(),
value="German",
label="Language to Practice",
info="Language you want to learn"
)
enable_translations = gr.Checkbox(
label="Enable Native Language Hints",
value=True,
info="Show translations and explanations in your native language (in parentheses)"
)
system_prompt = gr.Textbox(
label="System Prompt (Auto-generated)",
placeholder="System prompt is automatically generated based on language selection...",
lines=5,
value=create_language_tutor_prompt("English", "German", True),
interactive=True,
info="You can customize this if needed",
visible=False # Hidden from UI, but still functional in backend
)
gr.Markdown("### Voice Settings")
enable_voice_input = gr.Checkbox(
label="Enable Voice Input (STT)",
value=True,
info="Transcribe voice to text"
)
stt_provider = gr.Dropdown(
choices=get_available_stt_providers(),
value=VoiceConfig.DEFAULT_STT,
label="Speech-to-Text Provider",
info="Choose quality/cost tier"
)
auto_clear_recording = gr.Checkbox(
label="Auto-clear recording after transcription",
value=True,
info="Remove check to enable playback. Enables you to play the recording and listen to your pronunciation"
)
enable_voice_output = gr.Checkbox(
label="Enable Voice Output (TTS)",
value=True,
info="Convert responses to speech"
)
tts_provider = gr.Dropdown(
choices=get_available_tts_providers(),
value=VoiceConfig.DEFAULT_TTS,
label="Text-to-Speech Provider",
info="Choose quality/cost tier"
)
tts_voice = gr.Dropdown(
choices=get_voices_for_provider(VoiceConfig.DEFAULT_TTS, get_language_code("German")),
value=get_default_voice_for_language("German", VoiceConfig.DEFAULT_TTS),
label="TTS Voice",
info="Voice automatically matched to target language"
)
gr.Markdown("### Generation Parameters")
max_tokens = gr.Slider(
minimum=50,
maximum=2048,
value=512,
step=50,
label="Max Tokens",
info="Maximum length of the response"
)
temperature = gr.Slider(
minimum=0.0,
maximum=2.0,
value=0.7,
step=0.1,
label="Temperature",
info="Higher = more creative, Lower = more focused"
)
top_p = gr.Slider(
minimum=0.0,
maximum=1.0,
value=0.9,
step=0.05,
label="Top P",
info="Nucleus sampling threshold"
)
# Event handlers
# Update system prompt when languages or translation setting changes
def update_system_prompt(native_lang, target_lang, enable_trans):
return create_language_tutor_prompt(native_lang, target_lang, enable_trans)
native_language.change(
update_system_prompt,
inputs=[native_language, target_language, enable_translations],
outputs=[system_prompt]
)
target_language.change(
update_system_prompt,
inputs=[native_language, target_language, enable_translations],
outputs=[system_prompt]
)
enable_translations.change(
update_system_prompt,
inputs=[native_language, target_language, enable_translations],
outputs=[system_prompt]
)
# Update TTS voice dropdown when target language or provider changes
target_language.change(
update_voice_dropdown,
inputs=[tts_provider, target_language],
outputs=[tts_voice]
)
tts_provider.change(
update_voice_dropdown,
inputs=[tts_provider, target_language],
outputs=[tts_voice]
)
# Text message submit
submit.click(
chat,
inputs=[msg, chatbot, system_prompt, max_tokens, temperature, top_p,
enable_voice_output, tts_provider, tts_voice, target_language],
outputs=[msg, chatbot, voice_output]
)
msg.submit(
chat,
inputs=[msg, chatbot, system_prompt, max_tokens, temperature, top_p,
enable_voice_output, tts_provider, tts_voice, target_language],
outputs=[msg, chatbot, voice_output]
)
# Automatic voice input transcription when recording stops, then auto-send to LLM
voice_input.stop_recording(
process_voice_input,
inputs=[voice_input, stt_provider],
outputs=[msg]
).then(
chat,
inputs=[msg, chatbot, system_prompt, max_tokens, temperature, top_p,
enable_voice_output, tts_provider, tts_voice, target_language],
outputs=[msg, chatbot, voice_output]
).then(
clear_voice_input_if_enabled,
inputs=[auto_clear_recording],
outputs=[voice_input]
)
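    # Note: each .then() step runs after the previous one finishes, regardless of
    # its outcome; chain with .success() instead to run only when the prior step succeeds.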
# Also trigger transcription on audio change (for uploaded files)
# Note: No auto-send here to avoid duplicate calls when stop_recording fires
voice_input.change(
process_voice_input,
inputs=[voice_input, stt_provider],
outputs=[msg]
)
# Clear conversation
clear.click(
lambda: ([], None),
outputs=[chatbot, voice_output]
)
# Launch the app
if __name__ == "__main__":
demo.launch(share=False, inbrowser=True)