# Hugging Face Space: Language Tutor (page-scrape residue removed; status was "Running")
| import gradio as gr | |
| from huggingface_hub import InferenceClient | |
| from dotenv import load_dotenv | |
| from voice_handler import ( | |
| create_stt_provider, | |
| create_tts_provider, | |
| get_available_stt_providers, | |
| get_available_tts_providers, | |
| get_voices_for_provider, | |
| get_available_languages, | |
| get_language_code, | |
| get_default_voice_for_language, | |
| VoiceConfig | |
| ) | |
# Load variables from a local .env file; override=True lets the .env values
# take precedence over anything already set in the process environment.
load_dotenv(override=True)

# Initialize the Hugging Face Inference Client
# model_name is the full Hub repo id used for API calls; short_model_name is
# the human-readable label shown in the UI.
model_name = "swiss-ai/Apertus-8B-Instruct-2509"
short_model_name = "Apertus-8B-Instruct"
client = InferenceClient(model=model_name)
def format_messages(message, chat_history, system_prompt):
    """Build the messages list for a chat-completion call.

    Args:
        message: Current user message (string).
        chat_history: Prior turns, already in OpenAI-style messages format
            (list of {"role": ..., "content": ...} dicts).
        system_prompt: Optional system prompt. Skipped when None, empty,
            or whitespace-only.

    Returns:
        New list of message dicts: optional system turn, then the history
        entries (reused as-is), then the current user turn.
    """
    messages = []
    # Guard against None as well as blank strings: the original
    # `system_prompt.strip()` raised AttributeError for a missing prompt.
    if system_prompt and system_prompt.strip():
        messages.append({"role": "system", "content": system_prompt})
    # History entries are already shaped correctly; extend with them as-is.
    messages.extend(chat_history)
    # Add current message
    messages.append({"role": "user", "content": message})
    return messages
def create_language_tutor_prompt(native_language, target_language, enable_translations=True):
    """Compose the tutor system prompt for a native/target language pair.

    Args:
        native_language: The user's native language.
        target_language: The language the user wants to practice.
        enable_translations: When True, the tutor is told to offer native
            language hints; when False, it is told to stay immersive.

    Returns:
        The assembled system prompt string.
    """
    # Pick the translation-policy bullet points for the selected mode.
    if enable_translations:
        translation_guidance = f"""- Provide {native_language} translations when the user seems confused or asks for help
- Include {native_language} explanations in parentheses when helpful"""
    else:
        translation_guidance = f"""- Keep responses entirely in {target_language} for full immersion
- Only use {native_language} if the user explicitly asks for translation or clarification"""

    # Interpolate the guidance into the full prompt template and return it.
    return f"""You are an expert language tutor helping a {native_language} speaker learn {target_language}.
Your role:
- Respond primarily in {target_language} to provide immersive practice
{translation_guidance}
- Correct mistakes gently and explain grammar rules when appropriate
- Adjust your vocabulary and sentence complexity based on the user's level
- Ask engaging questions to encourage conversation practice
- Provide cultural context when relevant
- Be patient, encouraging, and supportive
Guidelines:
- Keep responses conversational and natural
- Use {target_language} for the main response
- Praise progress and provide constructive feedback
- Adapt difficulty based on the user's responses
Start by greeting the user and asking what they'd like to practice today."""
def transcribe_audio(audio_path, stt_provider_name):
    """Transcribe an audio file to text via the selected STT provider.

    Args:
        audio_path: Filesystem path to the recording, or None.
        stt_provider_name: Provider key understood by create_stt_provider.

    Returns:
        The transcription, an empty string when no audio was supplied, or a
        bracketed error marker when transcription fails.
    """
    if audio_path is None:
        return ""
    try:
        provider = create_stt_provider(stt_provider_name)
        return provider.transcribe(audio_path)
    except Exception as exc:
        # Return the failure as text so it surfaces in the message box
        # instead of crashing the Gradio event handler.
        return f"[Transcription Error: {str(exc)}]"
def synthesize_speech(text, tts_provider_name, tts_voice, target_language="English"):
    """Convert text to speech with the selected TTS provider.

    Args:
        text: Text to speak; None/empty/whitespace-only produces no audio.
        tts_provider_name: Provider key understood by create_tts_provider.
        tts_voice: Voice identifier for the provider.
        target_language: Language name, mapped to a code for the provider.

    Returns:
        Path to the generated audio file, or None when there is nothing to
        say or synthesis fails.
    """
    # Nothing to synthesize for missing or blank text.
    if not (text and text.strip()):
        return None
    try:
        code = get_language_code(target_language)
        provider = create_tts_provider(tts_provider_name, voice=tts_voice, language=code)
        return provider.synthesize(text)
    except Exception as exc:
        # Voice output is best-effort: log and fall back to text-only.
        print(f"TTS Error: {str(exc)}")
        return None
def update_voice_dropdown(tts_provider_name, target_language="English"):
    """Rebuild the TTS voice dropdown for a provider/language pair.

    Args:
        tts_provider_name: Selected TTS provider.
        target_language: Language whose voices should be offered.

    Returns:
        A gr.Dropdown update with the matching voices, preselecting the
        first voice (or None when the provider has no voices).
    """
    voices = get_voices_for_provider(tts_provider_name, get_language_code(target_language))
    default_voice = voices[0] if voices else None
    return gr.Dropdown(choices=voices, value=default_voice)
def chat(message, chat_history, system_prompt, max_tokens, temperature, top_p,
         enable_tts, tts_provider_name, tts_voice, target_language):
    """Send the user's message to the hosted model and update chat state.

    Returns:
        3-tuple for the Gradio outputs: cleared textbox value, updated
        chat history (mutated in place), and an optional TTS audio path.
    """
    # Ignore blank submissions without touching the history.
    if not message.strip():
        return "", chat_history, None

    messages = format_messages(message, chat_history, system_prompt)
    try:
        # Call the Hugging Face Inference API (non-streaming).
        response = client.chat_completion(
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            stream=False,
        )
        reply = response.choices[0].message.content

        # Record both sides of the exchange in messages format.
        chat_history.append({"role": "user", "content": message})
        chat_history.append({"role": "assistant", "content": reply})

        # Optionally voice the assistant's reply.
        audio_path = (
            synthesize_speech(reply, tts_provider_name, tts_voice, target_language)
            if enable_tts
            else None
        )
        return "", chat_history, audio_path
    except Exception as exc:
        # Surface the failure in the conversation instead of crashing the UI.
        chat_history.append({"role": "user", "content": message})
        chat_history.append({"role": "assistant", "content": f"Error: {str(exc)}"})
        return "", chat_history, None
def process_voice_input(audio, stt_provider_name):
    """Transcribe a recorded clip for the message box.

    Args:
        audio: Audio file path from the microphone component, or None.
        stt_provider_name: Selected STT provider.

    Returns:
        The transcribed text, or "" when no audio is present.
    """
    return "" if audio is None else transcribe_audio(audio, stt_provider_name)
def clear_voice_input_if_enabled(auto_clear):
    """Clear the voice-input widget after transcription when enabled.

    Args:
        auto_clear: Whether the auto-clear checkbox is ticked.

    Returns:
        None to clear the audio component, or a no-op gr.update() that
        leaves the recording in place for playback.
    """
    return None if auto_clear else gr.update()
# Create Gradio interface.
# FIX: the window title previously said "Apertus-70B" while the backing model
# is Apertus-8B; derive it from short_model_name so they cannot drift apart.
with gr.Blocks(title=f"Language Tutor with {short_model_name}",
               theme=gr.themes.Glass(primary_hue="indigo")) as demo:
    gr.Markdown("# 🌍 Language Tutor")
    gr.Markdown(f"Practice any language with an AI tutor powered by **Swiss AI {short_model_name}** - trained on 1000+ languages!")
    # gr.Markdown("⚠️ **Note**: You may need a Hugging Face token for API access. Set it with `huggingface-cli login` or pass it to InferenceClient.")

    with gr.Row():
        with gr.Column(scale=3):
            chatbot = gr.Chatbot(label="Conversation", height=400, type='messages')

            # Text input section
            with gr.Row():
                msg = gr.Textbox(
                    label="Your Message",
                    placeholder="Type your message here...",
                    scale=4,
                    lines=2
                )
                submit = gr.Button("Send", scale=1, variant="primary")

            # Voice input section
            with gr.Row():
                voice_input = gr.Audio(
                    sources=["microphone"],
                    type="filepath",
                    label="Voice Input (Recording auto-transcribes when you stop)",
                    waveform_options=gr.WaveformOptions(
                        show_controls=False
                    )
                )

            # Voice output section
            voice_output = gr.Audio(
                label="Assistant Voice Response",
                autoplay=True,
                visible=True
            )

            clear = gr.Button("Clear Conversation")

        with gr.Column(scale=1):
            gr.Markdown("### 🌐 Language Settings")
            native_language = gr.Dropdown(
                choices=get_available_languages(),
                value="English",
                label="Your Native Language",
                info="Language for explanations and help"
            )
            target_language = gr.Dropdown(
                choices=get_available_languages(),
                value="German",
                label="Language to Practice",
                info="Language you want to learn"
            )
            enable_translations = gr.Checkbox(
                label="Enable Native Language Hints",
                value=True,
                info="Show translations and explanations in your native language (in parentheses)"
            )
            system_prompt = gr.Textbox(
                label="System Prompt (Auto-generated)",
                placeholder="System prompt is automatically generated based on language selection...",
                lines=5,
                value=create_language_tutor_prompt("English", "German", True),
                interactive=True,
                info="You can customize this if needed",
                visible=False  # Hidden from UI, but still functional in backend
            )

            gr.Markdown("### Voice Settings")
            # NOTE(review): enable_voice_input is not referenced by any event
            # handler below — STT currently runs regardless; confirm intent.
            enable_voice_input = gr.Checkbox(
                label="Enable Voice Input (STT)",
                value=True,
                info="Transcribe voice to text"
            )
            stt_provider = gr.Dropdown(
                choices=get_available_stt_providers(),
                value=VoiceConfig.DEFAULT_STT,
                label="Speech-to-Text Provider",
                info="Choose quality/cost tier"
            )
            auto_clear_recording = gr.Checkbox(
                label="Auto-clear recording after transcription",
                value=True,
                info="Remove check to enable playback. Enables you to play the recording and listen to your pronunciation"
            )
            enable_voice_output = gr.Checkbox(
                label="Enable Voice Output (TTS)",
                value=True,
                info="Convert responses to speech"
            )
            tts_provider = gr.Dropdown(
                choices=get_available_tts_providers(),
                value=VoiceConfig.DEFAULT_TTS,
                label="Text-to-Speech Provider",
                info="Choose quality/cost tier"
            )
            tts_voice = gr.Dropdown(
                choices=get_voices_for_provider(VoiceConfig.DEFAULT_TTS, get_language_code("German")),
                value=get_default_voice_for_language("German", VoiceConfig.DEFAULT_TTS),
                label="TTS Voice",
                info="Voice automatically matched to target language"
            )

            gr.Markdown("### Generation Parameters")
            max_tokens = gr.Slider(
                minimum=50,
                maximum=2048,
                value=512,
                step=50,
                label="Max Tokens",
                info="Maximum length of the response"
            )
            temperature = gr.Slider(
                minimum=0.0,
                maximum=2.0,
                value=0.7,
                step=0.1,
                label="Temperature",
                info="Higher = more creative, Lower = more focused"
            )
            top_p = gr.Slider(
                minimum=0.0,
                maximum=1.0,
                value=0.9,
                step=0.05,
                label="Top P",
                info="Nucleus sampling threshold"
            )

    # Event handlers
    # Regenerate the system prompt whenever languages or the hint toggle change.
    def update_system_prompt(native_lang, target_lang, enable_trans):
        return create_language_tutor_prompt(native_lang, target_lang, enable_trans)

    native_language.change(
        update_system_prompt,
        inputs=[native_language, target_language, enable_translations],
        outputs=[system_prompt]
    )
    target_language.change(
        update_system_prompt,
        inputs=[native_language, target_language, enable_translations],
        outputs=[system_prompt]
    )
    enable_translations.change(
        update_system_prompt,
        inputs=[native_language, target_language, enable_translations],
        outputs=[system_prompt]
    )

    # Update TTS voice dropdown when target language or provider changes
    target_language.change(
        update_voice_dropdown,
        inputs=[tts_provider, target_language],
        outputs=[tts_voice]
    )
    tts_provider.change(
        update_voice_dropdown,
        inputs=[tts_provider, target_language],
        outputs=[tts_voice]
    )

    # Text message submit (button click and Enter key share one handler).
    submit.click(
        chat,
        inputs=[msg, chatbot, system_prompt, max_tokens, temperature, top_p,
                enable_voice_output, tts_provider, tts_voice, target_language],
        outputs=[msg, chatbot, voice_output]
    )
    msg.submit(
        chat,
        inputs=[msg, chatbot, system_prompt, max_tokens, temperature, top_p,
                enable_voice_output, tts_provider, tts_voice, target_language],
        outputs=[msg, chatbot, voice_output]
    )

    # Automatic voice input transcription when recording stops, then auto-send
    # to the LLM, then optionally clear the recording widget.
    voice_input.stop_recording(
        process_voice_input,
        inputs=[voice_input, stt_provider],
        outputs=[msg]
    ).then(
        chat,
        inputs=[msg, chatbot, system_prompt, max_tokens, temperature, top_p,
                enable_voice_output, tts_provider, tts_voice, target_language],
        outputs=[msg, chatbot, voice_output]
    ).then(
        clear_voice_input_if_enabled,
        inputs=[auto_clear_recording],
        outputs=[voice_input]
    )

    # Also trigger transcription on audio change (for uploaded files)
    # Note: No auto-send here to avoid duplicate calls when stop_recording fires
    voice_input.change(
        process_voice_input,
        inputs=[voice_input, stt_provider],
        outputs=[msg]
    )

    # Clear conversation history and any pending voice output.
    clear.click(
        lambda: ([], None),
        outputs=[chatbot, voice_output]
    )
# Launch the app locally (no public share link) and open the browser.
if __name__ == "__main__":
    demo.launch(share=False, inbrowser=True)