#
# SPDX-FileCopyrightText: Hadad <[email protected]>
# SPDX-License-Identifier: Apache-2.0
#
import os
from ollama import AsyncClient
import gradio as gr
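# Streaming chat handler for gr.ChatInterface. Gradio passes the latest user
# message, the running history, and the slider values defined in the sidebar
# below; the handler yields the growing reply so the UI streams it live.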
async def playground(
    message,
    history,
    num_ctx,
    temperature,
    repeat_penalty,
    min_p,
    top_k,
    top_p,
    presence_penalty
):
    # Ignore empty or non-string submissions instead of calling the model.
    if not isinstance(message, str) or not message.strip():
        yield []
        return
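    # The Ollama server URL and API key are read from the environment.
    # Assumed example values, not taken from this repo:
    #   OLLAMA_API_BASE_URL="http://localhost:11434"  (Ollama's default port)
    #   OLLAMA_API_KEY="<token sent as a Bearer header below>"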
    client = AsyncClient(
        host=os.getenv("OLLAMA_API_BASE_URL"),
        headers={
            "Authorization": f"Bearer {os.getenv('OLLAMA_API_KEY')}"
        }
    )
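    # Rebuild the conversation for Ollama, keeping only well-formed
    # {"role": ..., "content": ...} entries from the Gradio history.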
    messages = []
    for item in history:
        if isinstance(item, dict) and "role" in item and "content" in item:
            messages.append({
                "role": item["role"],
                "content": item["content"]
            })
    messages.append({"role": "user", "content": message})
response = ""
async for part in await client.chat(
model="qwen3:0.6b",
messages=messages,
options={
"num_ctx": int(num_ctx),
"temperature": float(temperature),
"repeat_penalty": float(repeat_penalty),
"min_p": float(min_p),
"top_k": int(top_k),
"top_p": float(top_p),
"presence_penalty": float(presence_penalty)
},
stream=True
):
response += part.get("message", {}).get("content", "")
yield response
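# A minimal direct-SDK sketch (assumed host value) equivalent to one turn of
# the handler above, useful for testing outside Gradio:
#
#   import asyncio
#   from ollama import AsyncClient
#
#   async def main():
#       client = AsyncClient(host="http://localhost:11434")
#       async for part in await client.chat(
#           model="qwen3:0.6b",
#           messages=[{"role": "user", "content": "Hello"}],
#           stream=True
#       ):
#           print(part.get("message", {}).get("content", ""), end="")
#
#   asyncio.run(main())
#
# UI layout: a sidebar with project notes and sampling-parameter sliders,
# plus a ChatInterface bound to the streaming handler above.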
with gr.Blocks(
    fill_height=True,
    fill_width=True
) as app:
    with gr.Sidebar():
        gr.Markdown("## Ollama Playground by UltimaX Intelligence")
        gr.HTML(
            """
            This Space runs the <b><a href=
            "https://huggingface.co/Qwen/Qwen3-0.6B"
            target="_blank">Qwen 3 (0.6B)</a></b> model from
            <b>Alibaba Cloud</b>, hosted on a server using <b>Ollama</b>
            and accessed via the <b>Ollama Python SDK</b>.<br><br>
            Official <b>documentation</b> for using Ollama with the
            Python SDK can be found
            <b><a href="https://github.com/ollama/ollama-python"
            target="_blank">here</a></b>.<br><br>
            Qwen 3 (0.6B) runs entirely on a <b>dual-core CPU</b>.
            Thanks to its small size, the model can
            operate efficiently on minimal hardware.<br><br>
            The Qwen 3 (0.6B) model can also be viewed or downloaded
            from the official Ollama website
            <b><a href="https://ollama.com/library/qwen3:0.6b"
            target="_blank">here</a></b>.<br><br>
            <b>Like this project? You can support me by buying a
            <a href="https://ko-fi.com/hadad" target="_blank">
            coffee</a></b>.
            """
        )
gr.Markdown("---")
gr.Markdown("## Model Parameters")
        num_ctx = gr.Slider(
            minimum=512,
            maximum=1024,
            value=512,
            step=128,
            label="Context Length (num_ctx)",
            info="Maximum context window size, kept small for CPU-only hosting"
        )
gr.Markdown("")
temperature = gr.Slider(
minimum=0.1,
maximum=2.0,
value=0.6,
step=0.1,
label="Temperature",
info="Controls randomness in generation"
)
gr.Markdown("")
repeat_penalty = gr.Slider(
minimum=0.1,
maximum=2.0,
value=1.0,
step=0.1,
label="Repeat Penalty",
info="Penalty for repeating tokens"
)
gr.Markdown("")
min_p = gr.Slider(
minimum=0.0,
maximum=1.0,
value=0.00,
step=0.01,
label="Min P",
info="Minimum probability threshold"
)
gr.Markdown("")
top_k = gr.Slider(
minimum=0,
maximum=100,
value=20,
step=1,
label="Top K",
info="Number of top tokens to consider"
)
gr.Markdown("")
top_p = gr.Slider(
minimum=0.0,
maximum=1.0,
value=0.95,
step=0.05,
label="Top P",
info="Cumulative probability threshold"
)
gr.Markdown("")
        presence_penalty = gr.Slider(
            minimum=0.0,
            maximum=2.0,
            value=1.5,
            step=0.1,
            label="Presence Penalty",
            info="Penalizes tokens already present, encouraging new topics"
        )
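    # ChatInterface calls playground() with additional_inputs appended after
    # (message, history), so the order here must match the parameter order
    # in the function signature.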
    gr.ChatInterface(
        fn=playground,
        additional_inputs=[
            num_ctx,
            temperature,
            repeat_penalty,
            min_p,
            top_k,
            top_p,
            presence_penalty
        ],
        chatbot=gr.Chatbot(
            label="Ollama | Qwen 3 (0.6B)",
            type="messages",
            show_copy_button=True,
            allow_tags=["think"],
            scale=1
        ),
        type="messages",
        examples=[
            ["Please introduce yourself."],
            ["What caused World War II?"],
            ["Give me a short introduction to large language models."],
            ["Explain quantum computers."]
        ],
        cache_examples=False,
        show_api=False
    )
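# Bind to all interfaces so the app is reachable inside its container;
# pwa=True lets browsers install the page as a progressive web app.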
app.launch(
    server_name="0.0.0.0",
    pwa=True
)