Spaces:

aexyb
/

Gomes

Running

App Files Files Community

Gomes / app.py

aexyb

Update app.py

a7e488b verified about 19 hours ago

Raw

History Blame Contribute Delete

5.39 kB

	import os

	# Disable Gradio's server-side rendering so the Node.js frontend is not
	# started on the 2-core CPU. Gradio reads the GRADIO_SSR_MODE variable
	# ("True"/"False"); the previously used name "GRADIO_SSR" is not one of its
	# documented environment variables and therefore had no effect.
	# Must be set before the app is launched.
	os.environ["GRADIO_SSR_MODE"] = "False"

	import gradio as gr
	from llama_cpp import Llama

	# ─── Extreme 2-Thread / 16GB RAM Optimization ────────────────────────────────
	# RAM budget (16 GB total), author's estimates:
	#   Model (Q4_K_M)            ≈ 7.4 GB
	#   KV cache (q8_0, 4096)     ≈ 1.3 GB
	#   OS / Gradio / Python      ≈ 3.0 GB
	#   Safety headroom           ≈ 4.3 GB
	# CPU budget (2 threads):
	#   Thread count is pinned to the core count to avoid oversubscription.
	# ──────────────────────────────────────────────────────────────────────────────

	N_THREADS = 2
	N_CTX = 4096

	# ggml tensor-type id for Q8_0 (value 8 in the ggml_type enum). llama-cpp-python
	# selects the KV-cache precision through the integer `type_k` / `type_v`
	# arguments; a `cache_type="q8_0"` keyword is not a constructor parameter and is
	# silently swallowed by **kwargs, leaving the cache at its f16 default.
	KV_CACHE_TYPE = 8

	print("Optimizing for 16GB RAM / 2 CPU Cores...")

	llm = Llama.from_pretrained(
	    repo_id="yuxinlu1/gemma-4-12B-agentic-fable5-composer2.5-v2-3.5x-tau2-GGUF",
	    filename="gemma4-v2-Q4_K_M.gguf",

	    # ── Context & Memory ───────────────────────────────────────────────
	    n_ctx=N_CTX,                # Context window in tokens
	    type_k=KV_CACHE_TYPE,       # q8_0 keys: roughly half the f16 footprint
	    type_v=KV_CACHE_TYPE,       # q8_0 values (needs flash attention, below)
	    flash_attn=True,            # llama.cpp refuses a quantized V cache without it
	    use_mlock=True,             # Ask the OS to keep the weights resident in RAM
	    use_mmap=True,              # Memory-map the model file

	    # ── Extreme CPU Tuning ─────────────────────────────────────────────
	    n_gpu_layers=0,             # CPU only
	    n_threads=N_THREADS,        # Generation threads = core count
	    n_threads_batch=N_THREADS,  # Prompt-processing threads = core count
	    n_batch=512,                # Logical batch size for prompt processing
	    n_ubatch=32,                # Small physical micro-batch

	    verbose=False,
	    # NOTE(review): llama-cpp-python's built-in "gemma" formatter appears to
	    # drop system-role messages — confirm the System Prompt box has an effect.
	    # Omitting chat_format would fall back to the template embedded in the GGUF.
	    chat_format="gemma",
	)

	# Plain "|" separators: the previous "\|" is an invalid escape sequence that
	# printed literal backslashes and triggers a SyntaxWarning on recent Python.
	print(f"Model loaded & locked | {N_THREADS} Threads | {N_CTX} ctx | q8_0 KV Cache")


	# ─── Chat Function (Gradio 6 Dict Format) ────────────────────────────────────
	def _content_to_text(content):
	    """Flatten one chat message's ``content`` into a plain string.

	    Accepts a string, ``None``, a ``{"type": "text", "text": ...}``-style dict,
	    or a list/tuple of such parts. Parts without a string ``"text"`` field
	    (e.g. file attachments) contribute nothing.
	    """
	    if content is None:
	        return ""
	    if isinstance(content, str):
	        return content
	    if isinstance(content, dict):
	        text = content.get("text")
	        return text if isinstance(text, str) else ""
	    if isinstance(content, (list, tuple)):
	        return "".join(_content_to_text(part) for part in content)
	    return str(content)


	def _build_messages(message, history, system_prompt):
	    """Assemble the role/content message list sent to the model.

	    Only ``role`` and text ``content`` are forwarded; any extra keys carried by
	    the UI history are dropped, and turns with no text are skipped.
	    """
	    messages = []
	    system_text = (system_prompt or "").strip()
	    if system_text:
	        messages.append({"role": "system", "content": system_text})

	    for turn in history:
	        if not isinstance(turn, dict):
	            continue
	        role = turn.get("role")
	        text = _content_to_text(turn.get("content"))
	        if role in ("system", "user", "assistant") and text:
	            messages.append({"role": role, "content": text})

	    messages.append({"role": "user", "content": message})
	    return messages


	def respond(message, history, system_prompt, temperature, max_tokens, top_p, top_k, repeat_penalty):
	    """Stream the assistant's reply to ``message`` as successive chat histories.

	    Args:
	        message: Text typed by the user. Blank or ``None`` input is ignored.
	        history: Prior turns as a list of ``{"role", "content"}`` dicts, or
	            ``None`` for an empty chat. ``content`` may be a plain string or a
	            list of typed parts depending on the Gradio version; both are
	            accepted.
	        system_prompt: Optional system instruction; skipped when blank.
	        temperature, top_p, repeat_penalty: Sampling parameters passed through
	            to ``llm.create_chat_completion``.
	        max_tokens, top_k: Sampling limits; coerced to ``int`` because UI
	            sliders may deliver floats.

	    Yields:
	        The full chat history (a list of message dicts) to display: first with
	        the new user turn appended, then again after every received token with
	        the growing assistant turn appended as well. For blank input the
	        unchanged history is yielded once and the model is not called.
	    """
	    history = list(history or [])
	    message = message or ""

	    # Nothing to send: leave the conversation untouched and skip inference.
	    if not message.strip():
	        yield history
	        return

	    user_turn = {"role": "user", "content": message}

	    # Show the user's message right away; prompt processing on CPU can take a
	    # while before the first token arrives.
	    yield history + [user_turn]

	    stream = llm.create_chat_completion(
	        messages=_build_messages(message, history, system_prompt),
	        temperature=temperature,
	        max_tokens=int(max_tokens),
	        top_p=top_p,
	        top_k=int(top_k),
	        repeat_penalty=repeat_penalty,
	        stream=True,
	    )

	    response = ""
	    for chunk in stream:
	        # Some chunks carry no text (missing or None "content"); skip them.
	        token = chunk["choices"][0].get("delta", {}).get("content")
	        if not token:
	            continue
	        response += token
	        yield history + [user_turn, {"role": "assistant", "content": response}]


	# ─── UI ───────────────────────────────────────────────────────────────────────
	# Custom CSS: cap the page width and fix the chat panel height.
	CSS = """
	.gradio-container { max-width: 1100px !important; }
	#chatbot { height: 650px !important; }
	"""

	with gr.Blocks(title="Gemma-4 12B Q4 CPU") as demo:
	    gr.Markdown("# 💎 Gemma-4 12B · Q4_K_M · Extreme CPU Tuning\n2 Cores · 16GB RAM · q8_0 KV · 4096 ctx")

	    with gr.Row(equal_height=False):
	        # Left column: conversation view and message entry.
	        with gr.Column(scale=5):
	            chatbot = gr.Chatbot(elem_id="chatbot")
	            with gr.Row():
	                msg_input = gr.Textbox(placeholder="Type your message…", show_label=False, scale=5, autofocus=True)
	                send_btn = gr.Button("Send", variant="primary", scale=1)

	        # Right column: system prompt and sampling controls.
	        with gr.Column(scale=2):
	            system_prompt = gr.Textbox(label="System Prompt", value="You are a helpful, harmless, and honest assistant.", lines=4)
	            temperature = gr.Slider(label="Temperature", minimum=0.0, maximum=2.0, value=0.7, step=0.05)
	            max_tokens = gr.Slider(label="Max Tokens", minimum=64, maximum=4096, value=1024, step=64)
	            top_p = gr.Slider(label="Top P", minimum=0.0, maximum=1.0, value=0.95, step=0.05)
	            top_k = gr.Slider(label="Top K", minimum=1, maximum=200, value=64, step=1)
	            repeat_penalty = gr.Slider(label="Repeat Penalty", minimum=1.0, maximum=2.0, value=1.1, step=0.05)
	            clear_btn = gr.Button("🗑 Clear Chat", variant="secondary")

	    # Order must match respond()'s positional parameters.
	    respond_inputs = [
	        msg_input,
	        chatbot,
	        system_prompt,
	        temperature,
	        max_tokens,
	        top_p,
	        top_k,
	        repeat_penalty,
	    ]

	    # Pressing Enter and clicking Send both start a generation. The two
	    # listeners are placed in one concurrency group limited to a single
	    # running job: left separate, each listener gets its own queue limit, so
	    # two generations could run at once against the one shared model instance.
	    for evt in (msg_input.submit, send_btn.click):
	        evt(
	            respond,
	            respond_inputs,
	            chatbot,
	            concurrency_limit=1,
	            concurrency_id="llm",
	        ).then(lambda: "", None, msg_input)  # clear the textbox once the reply finishes

	    # Reset the conversation immediately, bypassing the queue.
	    clear_btn.click(lambda: None, None, chatbot, queue=False)

	if __name__ == "__main__":
	    demo.launch(
	        server_name="0.0.0.0",
	        server_port=7860,
	        theme=gr.themes.Soft(primary_hue="indigo"),
	        css=CSS,
	    )