import gradio as gr
from huggingface_hub import InferenceClient
from dotenv import load_dotenv
from voice_handler import (
    create_stt_provider,
    create_tts_provider,
    get_available_stt_providers,
    get_available_tts_providers,
    get_voices_for_provider,
    get_available_languages,
    get_language_code,
    get_default_voice_for_language,
    VoiceConfig
)

load_dotenv(override=True)

# Initialize the Hugging Face Inference Client
model_name = "swiss-ai/Apertus-8B-Instruct-2509"
short_model_name = "Apertus-8B-Instruct"
client = InferenceClient(model=model_name)


def format_messages(message, chat_history, system_prompt):
    """Format the conversation into a messages list."""
    messages = []

    # Add system prompt if provided
    if system_prompt.strip():
        messages.append({"role": "system", "content": system_prompt})

    # Add chat history (already in messages format)
    messages.extend(chat_history)

    # Add current message
    messages.append({"role": "user", "content": message})

    return messages


def create_language_tutor_prompt(native_language, target_language, enable_translations=True):
    """
    Create a system prompt for the language tutor based on native and target languages.

    Args:
        native_language: User's native language
        target_language: Language the user wants to learn
        enable_translations: Whether to include native language translations

    Returns:
        System prompt string
    """
    if enable_translations:
        translation_guidance = f"""- Provide {native_language} translations when the user seems confused or asks for help
- Include {native_language} explanations in parentheses when helpful"""
    else:
        translation_guidance = f"""- Keep responses entirely in {target_language} for full immersion
- Only use {native_language} if the user explicitly asks for translation or clarification"""

    prompt = f"""You are an expert language tutor helping a {native_language} speaker learn {target_language}.

Your role:
- Respond primarily in {target_language} to provide immersive practice
{translation_guidance}
- Correct mistakes gently and explain grammar rules when appropriate
- Adjust your vocabulary and sentence complexity based on the user's level
- Ask engaging questions to encourage conversation practice
- Provide cultural context when relevant
- Be patient, encouraging, and supportive

Guidelines:
- Keep responses conversational and natural
- Use {target_language} for the main response
- Praise progress and provide constructive feedback
- Adapt difficulty based on the user's responses

Start by greeting the user and asking what they'd like to practice today."""

    return prompt


def transcribe_audio(audio_path, stt_provider_name):
    """
    Transcribe audio to text using the selected STT provider.

    Args:
        audio_path: Path to the audio file
        stt_provider_name: Name of the STT provider

    Returns:
        Transcribed text, or an inline error message
    """
    if audio_path is None:
        return ""

    try:
        stt_provider = create_stt_provider(stt_provider_name)
        text = stt_provider.transcribe(audio_path)
        return text
    except Exception as e:
        return f"[Transcription Error: {str(e)}]"
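
# Note on error handling: transcribe_audio() returns failures as inline text
# (so they surface in the message box), while synthesize_speech() below logs
# the error and returns None, so a TTS failure never blocks the chat reply.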


def synthesize_speech(text, tts_provider_name, tts_voice, target_language="English"):
    """
    Synthesize text to speech using the selected TTS provider.

    Args:
        text: Text to synthesize
        tts_provider_name: Name of the TTS provider
        tts_voice: Voice to use
        target_language: Target language name for TTS

    Returns:
        Path to the generated audio file, or None on failure
    """
    if not text or not text.strip():
        return None

    try:
        language_code = get_language_code(target_language)
        tts_provider = create_tts_provider(tts_provider_name, voice=tts_voice, language=language_code)
        audio_path = tts_provider.synthesize(text)
        return audio_path
    except Exception as e:
        print(f"TTS Error: {str(e)}")
        return None


def update_voice_dropdown(tts_provider_name, target_language="English"):
    """
    Update the voice dropdown based on the selected TTS provider and target language.

    Args:
        tts_provider_name: Name of the TTS provider
        target_language: Target language for voice selection

    Returns:
        Updated dropdown configuration
    """
    language_code = get_language_code(target_language)
    voices = get_voices_for_provider(tts_provider_name, language_code)
    return gr.Dropdown(choices=voices, value=voices[0] if voices else None)


def chat(message, chat_history, system_prompt, max_tokens, temperature, top_p,
         enable_tts, tts_provider_name, tts_voice, target_language):
    """Generate a response from the Hugging Face hosted model."""
    if not message.strip():
        return "", chat_history, None

    # Format the messages
    messages = format_messages(message, chat_history, system_prompt)

    try:
        # Call the Hugging Face Inference API
        response = client.chat_completion(
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            stream=False
        )

        # Extract the assistant's reply
        assistant_message = response.choices[0].message.content

        # Update chat history with messages format
        chat_history.append({"role": "user", "content": message})
        chat_history.append({"role": "assistant", "content": assistant_message})

        # Generate TTS audio if enabled
        audio_output = None
        if enable_tts:
            audio_output = synthesize_speech(assistant_message, tts_provider_name, tts_voice, target_language)

        return "", chat_history, audio_output

    except Exception as e:
        error_message = f"Error: {str(e)}"
        chat_history.append({"role": "user", "content": message})
        chat_history.append({"role": "assistant", "content": error_message})
        return "", chat_history, None


def process_voice_input(audio, stt_provider_name):
    """
    Process voice input and return transcribed text.

    Args:
        audio: Audio file from the microphone
        stt_provider_name: Name of the STT provider

    Returns:
        Transcribed text
    """
    if audio is None:
        return ""

    transcribed_text = transcribe_audio(audio, stt_provider_name)
    return transcribed_text


def clear_voice_input_if_enabled(auto_clear):
    """
    Clear the voice input component if auto-clear is enabled.

    Args:
        auto_clear: Boolean indicating whether auto-clear is enabled

    Returns:
        None if auto-clear is enabled (clears the audio), otherwise gr.update() to keep it
    """
    if auto_clear:
        return None
    else:
        return gr.update()
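

# Voice round trip: microphone recording -> STT provider -> msg textbox ->
# chat() -> TTS provider -> voice_output player. The wiring for this chain
# lives in the event handlers near the bottom of the Blocks context below.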

# Create Gradio interface
with gr.Blocks(title=f"Language Tutor with {short_model_name}", theme=gr.themes.Glass(primary_hue="indigo")) as demo:
    gr.Markdown("# 🌍 Language Tutor")
    gr.Markdown(f"Practice any language with an AI tutor powered by **Swiss AI {short_model_name}** - trained on 1000+ languages!")
    # gr.Markdown("⚠️ **Note**: You may need a Hugging Face token for API access. Set it with `huggingface-cli login` or pass it to InferenceClient.")

    with gr.Row():
        with gr.Column(scale=3):
            chatbot = gr.Chatbot(label="Conversation", height=400, type='messages')

            # Text input section
            with gr.Row():
                msg = gr.Textbox(
                    label="Your Message",
                    placeholder="Type your message here...",
                    scale=4,
                    lines=2
                )
                submit = gr.Button("Send", scale=1, variant="primary")

            # Voice input section
            with gr.Row():
                voice_input = gr.Audio(
                    sources=["microphone"],
                    type="filepath",
                    label="Voice Input (Recording auto-transcribes when you stop)",
                    waveform_options=gr.WaveformOptions(
                        show_controls=False
                    )
                )

            # Voice output section
            voice_output = gr.Audio(
                label="Assistant Voice Response",
                autoplay=True,
                visible=True
            )

            clear = gr.Button("Clear Conversation")

        with gr.Column(scale=1):
            gr.Markdown("### 🌐 Language Settings")

            native_language = gr.Dropdown(
                choices=get_available_languages(),
                value="English",
                label="Your Native Language",
                info="Language for explanations and help"
            )

            target_language = gr.Dropdown(
                choices=get_available_languages(),
                value="German",
                label="Language to Practice",
                info="Language you want to learn"
            )

            enable_translations = gr.Checkbox(
                label="Enable Native Language Hints",
                value=True,
                info="Show translations and explanations in your native language (in parentheses)"
            )

            system_prompt = gr.Textbox(
                label="System Prompt (Auto-generated)",
                placeholder="System prompt is automatically generated based on language selection...",
                lines=5,
                value=create_language_tutor_prompt("English", "German", True),
                interactive=True,
                info="You can customize this if needed",
                visible=False  # Hidden from UI, but still functional in backend
            )

            gr.Markdown("### Voice Settings")

            # Note: this checkbox is not wired into the handlers below;
            # transcription currently runs whenever audio is recorded
            enable_voice_input = gr.Checkbox(
                label="Enable Voice Input (STT)",
                value=True,
                info="Transcribe voice to text"
            )

            stt_provider = gr.Dropdown(
                choices=get_available_stt_providers(),
                value=VoiceConfig.DEFAULT_STT,
                label="Speech-to-Text Provider",
                info="Choose quality/cost tier"
            )

            auto_clear_recording = gr.Checkbox(
                label="Auto-clear recording after transcription",
                value=True,
                info="Uncheck to keep the recording so you can play it back and review your pronunciation"
            )

            enable_voice_output = gr.Checkbox(
                label="Enable Voice Output (TTS)",
                value=True,
                info="Convert responses to speech"
            )

            tts_provider = gr.Dropdown(
                choices=get_available_tts_providers(),
                value=VoiceConfig.DEFAULT_TTS,
                label="Text-to-Speech Provider",
                info="Choose quality/cost tier"
            )

            tts_voice = gr.Dropdown(
                choices=get_voices_for_provider(VoiceConfig.DEFAULT_TTS, get_language_code("German")),
                value=get_default_voice_for_language("German", VoiceConfig.DEFAULT_TTS),
                label="TTS Voice",
                info="Voice automatically matched to target language"
            )

            gr.Markdown("### Generation Parameters")

            max_tokens = gr.Slider(
                minimum=50,
                maximum=2048,
                value=512,
                step=50,
                label="Max Tokens",
                info="Maximum length of the response"
            )

            temperature = gr.Slider(
                minimum=0.0,
                maximum=2.0,
                value=0.7,
                step=0.1,
                label="Temperature",
                info="Higher = more creative, Lower = more focused"
            )

            top_p = gr.Slider(
                minimum=0.0,
                maximum=1.0,
                value=0.9,
                step=0.05,
                label="Top P",
                info="Nucleus sampling threshold"
            )

    # Event handlers

    # Update system prompt when languages or translation setting changes
    def update_system_prompt(native_lang, target_lang, enable_trans):
        return create_language_tutor_prompt(native_lang, target_lang, enable_trans)

    native_language.change(
        update_system_prompt,
        inputs=[native_language, target_language, enable_translations],
        outputs=[system_prompt]
    )

    target_language.change(
        update_system_prompt,
        inputs=[native_language, target_language, enable_translations],
        outputs=[system_prompt]
    )

    enable_translations.change(
        update_system_prompt,
        inputs=[native_language, target_language, enable_translations],
        outputs=[system_prompt]
    )

    # Update TTS voice dropdown when target language or provider changes
    target_language.change(
        update_voice_dropdown,
        inputs=[tts_provider, target_language],
        outputs=[tts_voice]
    )

    tts_provider.change(
        update_voice_dropdown,
        inputs=[tts_provider, target_language],
        outputs=[tts_voice]
    )

    # Text message submit
    submit.click(
        chat,
        inputs=[msg, chatbot, system_prompt, max_tokens, temperature, top_p,
                enable_voice_output, tts_provider, tts_voice, target_language],
        outputs=[msg, chatbot, voice_output]
    )

    msg.submit(
        chat,
        inputs=[msg, chatbot, system_prompt, max_tokens, temperature, top_p,
                enable_voice_output, tts_provider, tts_voice, target_language],
        outputs=[msg, chatbot, voice_output]
    )

    # Automatic voice input transcription when recording stops, then auto-send to LLM
    voice_input.stop_recording(
        process_voice_input,
        inputs=[voice_input, stt_provider],
        outputs=[msg]
    ).then(
        chat,
        inputs=[msg, chatbot, system_prompt, max_tokens, temperature, top_p,
                enable_voice_output, tts_provider, tts_voice, target_language],
        outputs=[msg, chatbot, voice_output]
    ).then(
        clear_voice_input_if_enabled,
        inputs=[auto_clear_recording],
        outputs=[voice_input]
    )

    # Also trigger transcription on audio change (for uploaded files)
    # Note: no auto-send here to avoid duplicate calls when stop_recording fires
    voice_input.change(
        process_voice_input,
        inputs=[voice_input, stt_provider],
        outputs=[msg]
    )

    # Clear conversation
    clear.click(
        lambda: ([], None),
        outputs=[chatbot, voice_output]
    )

# Launch the app
if __name__ == "__main__":
    demo.launch(share=False, inbrowser=True)
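
# Local usage (assumptions: this file is saved as app.py, and the local
# voice_handler module plus its own dependencies are available):
#   pip install gradio huggingface_hub python-dotenv
#   python app.py
# InferenceClient authenticates via a token cached by `huggingface-cli login`
# or an HF_TOKEN variable, e.g. from the .env file read by load_dotenv().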