import gradio as gr
from huggingface_hub import InferenceClient
from dotenv import load_dotenv
from voice_handler import (
    create_stt_provider,
    create_tts_provider,
    get_available_stt_providers,
    get_available_tts_providers,
    get_voices_for_provider,
    get_available_languages,
    get_language_code,
    get_default_voice_for_language,
    VoiceConfig
)

load_dotenv(override=True)

# Initialize the Hugging Face Inference Client
model_name = "swiss-ai/Apertus-8B-Instruct-2509"
short_model_name = "Apertus-8B-Instruct"
client = InferenceClient(model=model_name)
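# Note: hosted inference may require a Hugging Face token. InferenceClient picks up
# credentials saved via `huggingface-cli login`; as a sketch (assuming the token is
# stored in an HF_TOKEN environment variable), it could also be passed explicitly:
#   client = InferenceClient(model=model_name, token=os.getenv("HF_TOKEN"))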

def format_messages(message, chat_history, system_prompt):
    """Format the conversation into messages list."""
    messages = []

    # Add system prompt if provided
    if system_prompt.strip():
        messages.append({"role": "system", "content": system_prompt})

    # Add chat history (already in messages format)
    messages.extend(chat_history)

    # Add current message
    messages.append({"role": "user", "content": message})

    return messages

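# Illustrative shape of the result (hypothetical values):
#   format_messages("Hallo!", [], "You are a tutor.")
#   -> [{"role": "system", "content": "You are a tutor."},
#       {"role": "user", "content": "Hallo!"}]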

def create_language_tutor_prompt(native_language, target_language, enable_translations=True):
    """
    Create a system prompt for the language tutor based on native and target languages.

    Args:
        native_language: User's native language
        target_language: Language the user wants to learn
        enable_translations: Whether to include native language translations

    Returns:
        System prompt string
    """
    if enable_translations:
        translation_guidance = f"""- Provide {native_language} translations when the user seems confused or asks for help
- Include {native_language} explanations in parentheses when helpful"""
    else:
        translation_guidance = f"""- Keep responses entirely in {target_language} for full immersion
- Only use {native_language} if the user explicitly asks for translation or clarification"""

    prompt = f"""You are an expert language tutor helping a {native_language} speaker learn {target_language}.

Your role:
- Respond primarily in {target_language} to provide immersive practice
{translation_guidance}
- Correct mistakes gently and explain grammar rules when appropriate
- Adjust your vocabulary and sentence complexity based on the user's level
- Ask engaging questions to encourage conversation practice
- Provide cultural context when relevant
- Be patient, encouraging, and supportive

Guidelines:
- Keep responses conversational and natural
- Use {target_language} for the main response
- Praise progress and provide constructive feedback
- Adapt difficulty based on the user's responses

Start by greeting the user and asking what they'd like to practice today."""

    return prompt

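# Illustrative call: create_language_tutor_prompt("English", "German", enable_translations=False)
# yields an immersion-oriented prompt that keeps responses in German and only falls back to
# English when the user explicitly asks for translation or clarification.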

def transcribe_audio(audio_path, stt_provider_name):
    """
    Transcribe audio to text using selected STT provider.

    Args:
        audio_path: Path to audio file
        stt_provider_name: Name of STT provider

    Returns:
        Transcribed text or error message
    """
    if audio_path is None:
        return ""

    try:
        stt_provider = create_stt_provider(stt_provider_name)
        text = stt_provider.transcribe(audio_path)
        return text
    except Exception as e:
        return f"[Transcription Error: {str(e)}]"


def synthesize_speech(text, tts_provider_name, tts_voice, target_language="English"):
    """
    Synthesize text to speech using selected TTS provider.

    Args:
        text: Text to synthesize
        tts_provider_name: Name of TTS provider
        tts_voice: Voice to use
        target_language: Target language name for TTS

    Returns:
        Path to generated audio file or None if failed
    """
    if not text or not text.strip():
        return None

    try:
        language_code = get_language_code(target_language)
        tts_provider = create_tts_provider(tts_provider_name, voice=tts_voice, language=language_code)
        audio_path = tts_provider.synthesize(text)
        return audio_path
    except Exception as e:
        print(f"TTS Error: {str(e)}")
        return None


def update_voice_dropdown(tts_provider_name, target_language="English"):
    """
    Update the voice dropdown based on selected TTS provider and target language.

    Args:
        tts_provider_name: Name of TTS provider
        target_language: Target language for voice selection

    Returns:
        Updated dropdown configuration
    """
    language_code = get_language_code(target_language)
    voices = get_voices_for_provider(tts_provider_name, language_code)
    return gr.Dropdown(choices=voices, value=voices[0] if voices else None)

def chat(message, chat_history, system_prompt, max_tokens, temperature, top_p,
         enable_tts, tts_provider_name, tts_voice, target_language):
    """Generate a response from the Hugging Face hosted model."""
    if not message.strip():
        return "", chat_history, None

    # Format the messages
    messages = format_messages(message, chat_history, system_prompt)

    try:
        # Call the Hugging Face Inference API
        response = client.chat_completion(
            messages=messages,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
            stream=False
        )

        # Extract the assistant's reply
        assistant_message = response.choices[0].message.content

        # Update chat history with messages format
        chat_history.append({"role": "user", "content": message})
        chat_history.append({"role": "assistant", "content": assistant_message})

        # Generate TTS audio if enabled
        audio_output = None
        if enable_tts:
            audio_output = synthesize_speech(assistant_message, tts_provider_name, tts_voice, target_language)

        return "", chat_history, audio_output

    except Exception as e:
        error_message = f"Error: {str(e)}"
        chat_history.append({"role": "user", "content": message})
        chat_history.append({"role": "assistant", "content": error_message})
        return "", chat_history, None


def process_voice_input(audio, stt_provider_name):
    """
    Process voice input and return transcribed text.

    Args:
        audio: Audio file from microphone
        stt_provider_name: Name of STT provider

    Returns:
        Transcribed text
    """
    if audio is None:
        return ""

    transcribed_text = transcribe_audio(audio, stt_provider_name)
    return transcribed_text


def clear_voice_input_if_enabled(auto_clear):
    """
    Clear the voice input component if auto-clear is enabled.

    Args:
        auto_clear: Boolean indicating if auto-clear is enabled

    Returns:
        None if auto-clear is enabled (clears the audio), otherwise gr.update() to keep it
    """
    if auto_clear:
        return None
    else:
        return gr.update()

# Create Gradio interface
with gr.Blocks(title=f"Language Tutor with {short_model_name}", theme=gr.themes.Glass(primary_hue="indigo")) as demo:
    gr.Markdown("# ๐ŸŒ Language Tutor")
    gr.Markdown(f"Practice any language with an AI tutor powered by **Swiss AI {short_model_name}** - trained on 1000+ languages!")
    # gr.Markdown("โš ๏ธ **Note**: You may need a Hugging Face token for API access. Set it with `huggingface-cli login` or pass it to InferenceClient.")
    
    with gr.Row():
        with gr.Column(scale=3):
            chatbot = gr.Chatbot(label="Conversation", height=400, type='messages')

            # Text input section
            with gr.Row():
                msg = gr.Textbox(
                    label="Your Message",
                    placeholder="Type your message here...",
                    scale=4,
                    lines=2
                )
                submit = gr.Button("Send", scale=1, variant="primary")

            # Voice input section
            with gr.Row():
                voice_input = gr.Audio(
                    sources=["microphone"],
                    type="filepath",
                    label="Voice Input (Recording auto-transcribes when you stop)",
                    waveform_options=gr.WaveformOptions(
                        show_controls=False
                    )
                )

            # Voice output section
            voice_output = gr.Audio(
                label="Assistant Voice Response",
                autoplay=True,
                visible=True
            )

            clear = gr.Button("Clear Conversation")

        with gr.Column(scale=1):
            gr.Markdown("### ๐ŸŒ Language Settings")

            native_language = gr.Dropdown(
                choices=get_available_languages(),
                value="English",
                label="Your Native Language",
                info="Language for explanations and help"
            )

            target_language = gr.Dropdown(
                choices=get_available_languages(),
                value="German",
                label="Language to Practice",
                info="Language you want to learn"
            )

            enable_translations = gr.Checkbox(
                label="Enable Native Language Hints",
                value=True,
                info="Show translations and explanations in your native language (in parentheses)"
            )

            system_prompt = gr.Textbox(
                label="System Prompt (Auto-generated)",
                placeholder="System prompt is automatically generated based on language selection...",
                lines=5,
                value=create_language_tutor_prompt("English", "German", True),
                interactive=True,
                info="You can customize this if needed",
                visible=False  # Hidden from UI, but still functional in backend
            )

            gr.Markdown("### Voice Settings")

            enable_voice_input = gr.Checkbox(
                label="Enable Voice Input (STT)",
                value=True,
                info="Transcribe voice to text"
            )

            stt_provider = gr.Dropdown(
                choices=get_available_stt_providers(),
                value=VoiceConfig.DEFAULT_STT,
                label="Speech-to-Text Provider",
                info="Choose quality/cost tier"
            )

            auto_clear_recording = gr.Checkbox(
                label="Auto-clear recording after transcription",
                value=True,
                info="Remove check to enable playback. Enables you to play the recording and listen to your pronunciation"
            )

            enable_voice_output = gr.Checkbox(
                label="Enable Voice Output (TTS)",
                value=True,
                info="Convert responses to speech"
            )

            tts_provider = gr.Dropdown(
                choices=get_available_tts_providers(),
                value=VoiceConfig.DEFAULT_TTS,
                label="Text-to-Speech Provider",
                info="Choose quality/cost tier"
            )

            tts_voice = gr.Dropdown(
                choices=get_voices_for_provider(VoiceConfig.DEFAULT_TTS, get_language_code("German")),
                value=get_default_voice_for_language("German", VoiceConfig.DEFAULT_TTS),
                label="TTS Voice",
                info="Voice automatically matched to target language"
            )

            gr.Markdown("### Generation Parameters")

            max_tokens = gr.Slider(
                minimum=50,
                maximum=2048,
                value=512,
                step=50,
                label="Max Tokens",
                info="Maximum length of the response"
            )

            temperature = gr.Slider(
                minimum=0.0,
                maximum=2.0,
                value=0.7,
                step=0.1,
                label="Temperature",
                info="Higher = more creative, Lower = more focused"
            )

            top_p = gr.Slider(
                minimum=0.0,
                maximum=1.0,
                value=0.9,
                step=0.05,
                label="Top P",
                info="Nucleus sampling threshold"
            )
    
    # Event handlers

    # Update system prompt when languages or translation setting changes
    def update_system_prompt(native_lang, target_lang, enable_trans):
        return create_language_tutor_prompt(native_lang, target_lang, enable_trans)

    native_language.change(
        update_system_prompt,
        inputs=[native_language, target_language, enable_translations],
        outputs=[system_prompt]
    )

    target_language.change(
        update_system_prompt,
        inputs=[native_language, target_language, enable_translations],
        outputs=[system_prompt]
    )

    enable_translations.change(
        update_system_prompt,
        inputs=[native_language, target_language, enable_translations],
        outputs=[system_prompt]
    )

    # Update TTS voice dropdown when target language or provider changes
    target_language.change(
        update_voice_dropdown,
        inputs=[tts_provider, target_language],
        outputs=[tts_voice]
    )

    tts_provider.change(
        update_voice_dropdown,
        inputs=[tts_provider, target_language],
        outputs=[tts_voice]
    )

    # Text message submit
    submit.click(
        chat,
        inputs=[msg, chatbot, system_prompt, max_tokens, temperature, top_p,
                enable_voice_output, tts_provider, tts_voice, target_language],
        outputs=[msg, chatbot, voice_output]
    )

    msg.submit(
        chat,
        inputs=[msg, chatbot, system_prompt, max_tokens, temperature, top_p,
                enable_voice_output, tts_provider, tts_voice, target_language],
        outputs=[msg, chatbot, voice_output]
    )

    # Automatic voice input transcription when recording stops, then auto-send to LLM
    voice_input.stop_recording(
        process_voice_input,
        inputs=[voice_input, stt_provider],
        outputs=[msg]
    ).then(
        chat,
        inputs=[msg, chatbot, system_prompt, max_tokens, temperature, top_p,
                enable_voice_output, tts_provider, tts_voice, target_language],
        outputs=[msg, chatbot, voice_output]
    ).then(
        clear_voice_input_if_enabled,
        inputs=[auto_clear_recording],
        outputs=[voice_input]
    )

    # Also trigger transcription on audio change (for uploaded files)
    # Note: No auto-send here to avoid duplicate calls when stop_recording fires
    voice_input.change(
        process_voice_input,
        inputs=[voice_input, stt_provider],
        outputs=[msg]
    )

    # Clear conversation
    clear.click(
        lambda: ([], None),
        outputs=[chatbot, voice_output]
    )

# Launch the app
if __name__ == "__main__":
    demo.launch(share=False, inbrowser=True)