# Spaces: umint / qwen3-0.6b (Paused)
# File size: 5,353 Bytes
# f41e246

#
# SPDX-FileCopyrightText: Hadad <[email protected]>
# SPDX-License-Identifier: Apache-2.0
#

import os
from ollama import AsyncClient
import gradio as gr

async def playground(
    message,
    history,
    num_ctx,
    temperature,
    repeat_penalty,
    min_p,
    top_k,
    top_p,
    presence_penalty
):
    """Stream a reply from the Ollama server for one chat turn.

    Args:
        message: Text typed by the user. Anything that is not a
            non-blank string ends the turn immediately.
        history: Earlier conversation entries. Only dict entries that
            carry both a "role" and a "content" key are forwarded to
            the model; everything else is skipped. ``None`` is treated
            as an empty history.
        num_ctx: Context window size, converted with ``int``.
        temperature: Sampling temperature, converted with ``float``.
        repeat_penalty: Repeat penalty, converted with ``float``.
        min_p: Min-p threshold, converted with ``float``.
        top_k: Top-k cutoff, converted with ``int``.
        top_p: Top-p threshold, converted with ``float``.
        presence_penalty: Presence penalty, converted with ``float``.

    Yields:
        An empty list when ``message`` is blank or not a string;
        otherwise the response text accumulated so far, once per
        streamed part.
    """
    if not isinstance(message, str) or not message.strip():
        yield []
        return

    # Only send an Authorization header when a key is configured;
    # otherwise the server would receive the literal "Bearer None".
    api_key = os.getenv("OLLAMA_API_KEY")
    headers = {"Authorization": f"Bearer {api_key}"} if api_key else {}

    client = AsyncClient(
        host=os.getenv("OLLAMA_API_BASE_URL"),
        headers=headers
    )

    messages = [
        {"role": item["role"], "content": item["content"]}
        for item in (history or [])
        if isinstance(item, dict) and "role" in item and "content" in item
    ]
    messages.append({"role": "user", "content": message})

    stream = await client.chat(
        model="qwen3:0.6b",
        messages=messages,
        options={
            "num_ctx": int(num_ctx),
            "temperature": float(temperature),
            "repeat_penalty": float(repeat_penalty),
            "min_p": float(min_p),
            "top_k": int(top_k),
            "top_p": float(top_p),
            "presence_penalty": float(presence_penalty)
        },
        stream=True
    )

    response = ""
    async for part in stream:
        # A chunk may carry no text (content missing or None); treat
        # that as an empty piece instead of failing on `str + None`.
        piece = part.get("message", {}).get("content", "")
        if piece:
            response += piece
        yield response

# Sidebar sampling controls, one entry per slider. The order matches the
# extra parameters of `playground` (after message and history), because
# ChatInterface passes additional inputs positionally.
_SLIDER_SPECS = (
    {
        "minimum": 512,
        "maximum": 1024,
        "value": 512,
        "step": 128,
        "label": "Context Length (num_ctx)",
        "info": "Maximum context window size. Limited to CPU usage."
    },
    {
        "minimum": 0.1,
        "maximum": 2.0,
        "value": 0.6,
        "step": 0.1,
        "label": "Temperature",
        "info": "Controls randomness in generation"
    },
    {
        "minimum": 0.1,
        "maximum": 2.0,
        "value": 1.0,
        "step": 0.1,
        "label": "Repeat Penalty",
        "info": "Penalty for repeating tokens"
    },
    {
        "minimum": 0.0,
        "maximum": 1.0,
        "value": 0.00,
        "step": 0.01,
        "label": "Min P",
        "info": "Minimum probability threshold"
    },
    {
        "minimum": 0,
        "maximum": 100,
        "value": 20,
        "step": 1,
        "label": "Top K",
        "info": "Number of top tokens to consider"
    },
    {
        "minimum": 0.0,
        "maximum": 1.0,
        "value": 0.95,
        "step": 0.05,
        "label": "Top P",
        "info": "Cumulative probability threshold"
    },
    {
        "minimum": 0.0,
        "maximum": 2.0,
        "value": 1.5,
        "step": 0.1,
        "label": "Presence Penalty",
        # Presence penalty discourages tokens that were already used;
        # it does not penalize new tokens.
        "info": "Penalty for tokens that already appeared"
    }
)

with gr.Blocks(
    fill_height=True,
    fill_width=True
) as app:
    with gr.Sidebar():
        gr.Markdown("## Ollama Playground by UltimaX Intelligence")
        gr.HTML(
            """
            This space runs the <b><a href=
            "https://huggingface.co/Qwen/Qwen3-0.6B" 
            target="_blank">Qwen 3 (0.6B)</a></b> model from 
            <b>Alibaba Cloud</b>, hosted on a server using <b>Ollama</b> 
            and accessed via the <b>Ollama Python SDK</b>.<br><br>

            Official <b>documentation</b> for using Ollama with the 
            Python SDK can be found 
            <b><a href="https://github.com/ollama/ollama-python" 
            target="_blank">here</a></b>.<br><br>

            Qwen 3 (0.6B) runs entirely on a <b>dual-core CPU</b>. 
            Thanks to its small size, the model can 
            operate efficiently on minimal hardware.<br><br>

            The Qwen 3 (0.6B) model can also be viewed or downloaded 
            from the official Ollama website 
            <b><a href="https://ollama.com/library/qwen3:0.6b" 
            target="_blank">here</a></b>.<br><br>

            <b>Like this project? You can support me by buying a 
            <a href="https://ko-fi.com/hadad" target="_blank">
            coffee</a></b>.
            """
        )
        gr.Markdown("---")
        gr.Markdown("## Model Parameters")

        _sliders = []
        for _index, _spec in enumerate(_SLIDER_SPECS):
            if _index:
                # An empty Markdown element is used as a spacer between
                # consecutive sliders (none before the first one).
                gr.Markdown("")
            _sliders.append(gr.Slider(**_spec))

        (
            num_ctx,
            temperature,
            repeat_penalty,
            min_p,
            top_k,
            top_p,
            presence_penalty
        ) = _sliders

    gr.ChatInterface(
        fn=playground,
        # Must stay in the same order as playground's parameters.
        additional_inputs=[
            num_ctx,
            temperature,
            repeat_penalty,
            min_p,
            top_k,
            top_p,
            presence_penalty
        ],
        chatbot=gr.Chatbot(
            label="Ollama | Qwen 3 (0.6B)",
            type="messages",
            show_copy_button=True,
            # Let the <think> tag through to the chat display.
            allow_tags=["think"],
            scale=1
        ),
        type="messages",
        examples=[
            ["Please introduce yourself."],
            ["What caused World War II?"],
            ["Give me a short introduction to large language model."],
            ["Explain about quantum computers."]
        ],
        cache_examples=False,
        show_api=False
    )

# Listen on all network interfaces and enable the progressive web app mode.
app.launch(
    server_name="0.0.0.0",
    pwa=True
)