Gomes / app.py
aexyb's picture
Update app.py
a7e488b verified
Raw
History Blame Contribute Delete
5.39 kB
import os
# Disable Gradio's server-side rendering (SSR) so no Node.js frontend server is
# started on this 2-core machine. Gradio reads the GRADIO_SSR_MODE variable
# ("True"/"False"); the previously used name "GRADIO_SSR" is not one it consults.
os.environ["GRADIO_SSR_MODE"] = "False"
import gradio as gr
from llama_cpp import Llama
# ─── Extreme 2-Thread / 16GB RAM Optimization ────────────────────────────────
# RAM Budget (16 GB Total):
# Model (Q4_K_M) ≈ 7.4 GB
# KV Cache (q8_0, 4096) ≈ 1.3 GB
# OS / Gradio / Python ≈ 3.0 GB
# Safety Headroom ≈ 4.3 GB
# CPU Budget (2 Threads):
# Locked to 2 threads to eliminate context-switching overhead.
# ──────────────────────────────────────────────────────────────────────────────
# Thread count shared by token generation and prompt (batch) processing.
N_THREADS = 2

# ggml tensor-type id for Q8_0 (same value as llama_cpp.GGML_TYPE_Q8_0).
# The Llama constructor selects the KV-cache precision through the integer
# `type_k` / `type_v` arguments; it has no `cache_type` parameter, and unknown
# keyword arguments are silently discarded, so the old setting had no effect.
GGML_TYPE_Q8_0 = 8

print("Optimizing for 16GB RAM / 2 CPU Cores...")

# Download the GGUF file from the Hugging Face Hub (if needed) and load it.
llm = Llama.from_pretrained(
    repo_id="yuxinlu1/gemma-4-12B-agentic-fable5-composer2.5-v2-3.5x-tau2-GGUF",
    filename="gemma4-v2-Q4_K_M.gguf",
    # ── Context & Memory ───────────────────────────────────────────────
    n_ctx=4096,                 # Context window, in tokens
    type_k=GGML_TYPE_Q8_0,      # 8-bit quantized K cache
    type_v=GGML_TYPE_Q8_0,      # 8-bit quantized V cache
    flash_attn=True,            # llama.cpp requires flash attention for a quantized V cache
    use_mlock=True,             # Ask the OS to keep the weights resident in RAM
    use_mmap=True,              # Memory-map the model file
    # ── CPU Tuning ─────────────────────────────────────────────────────
    n_gpu_layers=0,             # CPU only
    n_threads=N_THREADS,        # Generation threads
    n_threads_batch=N_THREADS,  # Prompt-processing threads
    n_batch=512,                # Logical batch size for prompt processing
    # NOTE(review): confirm the installed llama-cpp-python exposes `n_ubatch`;
    # if it does not, this argument is ignored without any warning.
    n_ubatch=32,                # Physical micro-batch size
    verbose=False,
    chat_format="gemma",
)
print("Model loaded & locked | 2 Threads | 4096 ctx | q8_0 KV Cache")
# ─── Chat Function (Gradio 6 Dict Format) ────────────────────────────────────
def _content_to_text(content):
    """Flatten a chat message's ``content`` into plain text.

    Accepts ``None``, a plain string, a dict carrying a ``"text"`` entry, or a
    list/tuple of such parts. Parts without text (for example file
    attachments) contribute nothing.
    """
    if content is None:
        return ""
    if isinstance(content, str):
        return content
    if isinstance(content, dict):
        text = content.get("text")
        return text if isinstance(text, str) else ""
    if isinstance(content, (list, tuple)):
        parts = (_content_to_text(part) for part in content)
        return "\n".join(part for part in parts if part)
    return str(content)


def respond(message, history, system_prompt, temperature, max_tokens, top_p, top_k, repeat_penalty):
    """Stream an assistant reply for ``message`` as successive chat histories.

    Args:
        message: Text the user just submitted.
        history: Previous turns as a list of ``{"role", "content"}`` dicts
            (may be ``None`` or empty).
        system_prompt: Optional system instruction; skipped when blank.
        temperature, max_tokens, top_p, top_k, repeat_penalty: Sampling
            settings forwarded to ``llm.create_chat_completion``.

    Yields:
        The full conversation (previous history + the new user turn + the
        assistant turn generated so far), once before generation starts and
        again after every non-empty streamed token. A blank ``message``
        yields the unchanged history once and performs no generation.
    """
    # The chatbot value can be reset to None by the clear button; treat a
    # missing history as an empty one instead of failing on extend / "+".
    history = list(history or [])
    message = message or ""

    # Nothing to answer: leave the conversation as it is and skip inference.
    if not message.strip():
        yield history
        return

    # 1. Build messages for llama.cpp. Only role + plain text are forwarded:
    #    UI-side history entries may carry extra keys or list-style content
    #    that string-based chat templates cannot consume.
    messages = []
    system_text = (system_prompt or "").strip()
    if system_text:
        messages.append({"role": "system", "content": system_text})
    for turn in history:
        messages.append(
            {"role": turn["role"], "content": _content_to_text(turn.get("content"))}
        )
    messages.append({"role": "user", "content": message})

    user_turn = {"role": "user", "content": message}

    def conversation_with(reply):
        # History in the role/content dict format the chatbot displays.
        return history + [user_turn, {"role": "assistant", "content": reply}]

    # Show the user's turn right away, before the first token arrives (and
    # even if the model ends up producing no text at all).
    yield conversation_with("")

    # 2. Stream tokens
    response = ""
    stream = llm.create_chat_completion(
        messages=messages,
        temperature=temperature,
        max_tokens=int(max_tokens),  # slider values may arrive as floats
        top_p=top_p,
        top_k=int(top_k),
        repeat_penalty=repeat_penalty,
        stream=True,
    )
    for chunk in stream:
        delta = chunk["choices"][0].get("delta", {})
        token = delta.get("content")
        if token:
            response += token
            # 3. Yield the updated conversation after each new token
            yield conversation_with(response)
# ─── UI ───────────────────────────────────────────────────────────────────────
# Custom stylesheet: caps the page width and fixes the chat panel height.
CSS = """
.gradio-container { max-width: 1100px !important; }
#chatbot { height: 650px !important; }
"""

# Two-column layout: chat transcript + input row on the left, generation
# settings on the right.
with gr.Blocks(title="Gemma-4 12B Q4 CPU") as demo:
    gr.Markdown(
        "# 💎 Gemma-4 12B · Q4_K_M · Extreme CPU Tuning\n"
        "**2 Cores** · **16GB RAM** · **q8_0 KV** · **4096 ctx**"
    )

    with gr.Row(equal_height=False):
        # Left column: conversation and message entry.
        with gr.Column(scale=5):
            chatbot = gr.Chatbot(elem_id="chatbot")
            with gr.Row():
                msg_input = gr.Textbox(
                    placeholder="Type your message…",
                    show_label=False,
                    scale=5,
                    autofocus=True,
                )
                send_btn = gr.Button("Send", variant="primary", scale=1)

        # Right column: system prompt and sampling controls.
        with gr.Column(scale=2):
            system_prompt = gr.Textbox(
                label="System Prompt",
                value="You are a helpful, harmless, and honest assistant.",
                lines=4,
            )
            temperature = gr.Slider(
                label="Temperature", minimum=0.0, maximum=2.0, value=0.7, step=0.05
            )
            max_tokens = gr.Slider(
                label="Max Tokens", minimum=64, maximum=4096, value=1024, step=64
            )
            top_p = gr.Slider(
                label="Top P", minimum=0.0, maximum=1.0, value=0.95, step=0.05
            )
            top_k = gr.Slider(
                label="Top K", minimum=1, maximum=200, value=64, step=1
            )
            repeat_penalty = gr.Slider(
                label="Repeat Penalty", minimum=1.0, maximum=2.0, value=1.1, step=0.05
            )
            clear_btn = gr.Button("🗑 Clear Chat", variant="secondary")

    # Pressing Enter in the textbox and clicking "Send" run the same pipeline:
    # stream the reply into the chatbot, then empty the textbox.
    respond_inputs = [
        msg_input,
        chatbot,
        system_prompt,
        temperature,
        max_tokens,
        top_p,
        top_k,
        repeat_penalty,
    ]
    for trigger in (msg_input.submit, send_btn.click):
        generation = trigger(fn=respond, inputs=respond_inputs, outputs=chatbot)
        generation.then(fn=lambda: "", inputs=None, outputs=msg_input)

    # Reset the conversation without going through the queue.
    clear_btn.click(fn=lambda: None, inputs=None, outputs=chatbot, queue=False)
if __name__ == "__main__":
    # Serve on all network interfaces at port 7860 with the Soft/indigo theme
    # and the custom stylesheet defined above.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "theme": gr.themes.Soft(primary_hue="indigo"),
        "css": CSS,
    }
    demo.launch(**launch_options)