File size: 5,393 Bytes
b84112b a7e488b b84112b cf76dc8 a7e488b 698c39e a7e488b cf76dc8 a756324 a7e488b a756324 b84112b a756324 a7e488b a756324 cf76dc8 a7e488b cf76dc8 a7e488b 0d0b6aa a7e488b cf76dc8 a756324 a7e488b a756324 cf76dc8 a7e488b cf76dc8 a7e488b a756324 cf76dc8 a7e488b cf76dc8 a7e488b cf76dc8 a7e488b 0d0b6aa a7e488b 0d0b6aa d669dab a7e488b b84112b d669dab | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 | import os
# Keep Gradio from starting its Node.js server-side-rendering frontend, which
# is slow to come up on a 2-core CPU. Gradio documents this switch as the
# GRADIO_SSR_MODE environment variable; the "GRADIO_SSR" name used previously
# is not the documented one, so the opt-out was not taking effect.
os.environ["GRADIO_SSR_MODE"] = "False"
import gradio as gr
from llama_cpp import Llama
# ─── Extreme 2-Thread / 16GB RAM Optimization ────────────────────────────────
# RAM Budget (16 GB Total):
# Model (Q4_K_M) ≈ 7.4 GB
# KV Cache (q8_0, 4096) ≈ 1.3 GB
# OS / Gradio / Python ≈ 3.0 GB
# Safety Headroom ≈ 4.3 GB
# CPU Budget (2 Threads):
# Locked to 2 threads to eliminate context-switching overhead.
# ──────────────────────────────────────────────────────────────────────────────
# Thread count used for both token generation and prompt (batch) evaluation.
N_THREADS = 2

# Numeric value of ggml's GGML_TYPE_Q8_0 enum member, which is what
# llama-cpp-python's `type_k` / `type_v` arguments expect.
_KV_CACHE_TYPE_Q8_0 = 8

print("Optimizing for 16GB RAM / 2 CPU Cores...")

# Download (or reuse the cached copy of) the GGUF file and load it once at
# import time; `respond` streams completions from this single shared instance.
llm = Llama.from_pretrained(
    repo_id="yuxinlu1/gemma-4-12B-agentic-fable5-composer2.5-v2-3.5x-tau2-GGUF",
    filename="gemma4-v2-Q4_K_M.gguf",
    # ── Context & Memory ──────────────────────────────────────────────
    n_ctx=4096,                      # Context window, in tokens
    # Store the KV cache as q8_0 instead of the default type. The previous
    # `cache_type="q8_0"` keyword is not one Llama reads: it was absorbed by
    # **kwargs and the cache was never quantised.
    type_k=_KV_CACHE_TYPE_Q8_0,
    type_v=_KV_CACHE_TYPE_Q8_0,
    flash_attn=True,                 # llama.cpp requires flash attention for a quantised V cache
    use_mlock=True,                  # Ask the OS to keep the weights resident in RAM
    use_mmap=True,                   # Memory-map the model file
    # ── CPU Tuning ────────────────────────────────────────────────────
    n_gpu_layers=0,                  # CPU only
    n_threads=N_THREADS,             # Generation threads
    n_threads_batch=N_THREADS,       # Prompt-processing threads
    n_batch=512,                     # Logical batch size for prompt processing
    n_ubatch=32,                     # Physical micro-batch size
    verbose=False,
    chat_format="gemma",
)
print("Model loaded & locked | 2 Threads | 4096 ctx | q8_0 KV Cache")
# ─── Chat Function (Gradio 6 Dict Format) ────────────────────────────────────
def _content_to_text(content):
    """Reduce one chat-message ``content`` value to plain text.

    Strings pass through unchanged. A dict contributes its ``"text"`` entry
    when that entry is a string. A list is flattened by concatenating the text
    of its items. Anything else (``None``, file payloads, ...) becomes ``""``.
    """
    if isinstance(content, str):
        return content
    if isinstance(content, dict):
        text = content.get("text")
        return text if isinstance(text, str) else ""
    if isinstance(content, list):
        return "".join(_content_to_text(part) for part in content)
    return ""


def respond(message, history, system_prompt, temperature, max_tokens, top_p, top_k, repeat_penalty):
    """Stream an assistant reply for ``message`` as successive chat histories.

    Args:
        message: Text the user just submitted.
        history: Prior turns as a list of ``{"role": ..., "content": ...}``
            dicts; ``None`` is treated as an empty conversation.
        system_prompt: Optional system instruction; skipped when blank.
        temperature, top_p, repeat_penalty: Sampling settings forwarded to
            ``llm.create_chat_completion`` unchanged.
        max_tokens, top_k: Sampling settings forwarded after ``int()``
            coercion, since UI sliders may deliver floats.

    Yields:
        The full conversation to display: first ``history`` plus the new user
        turn, then the same list with a growing assistant turn appended after
        each streamed token. A blank ``message`` yields ``history`` once and
        never reaches the model.
    """
    history = list(history or [])
    message = message or ""

    # Nothing to answer: leave the conversation exactly as it was.
    if not message.strip():
        yield history
        return

    # 1. Build messages for llama.cpp
    messages = []
    system_text = (system_prompt or "").strip()
    if system_text:
        messages.append({"role": "system", "content": system_text})
    # Forward only role + plain text. History entries may carry extra keys or
    # non-string content that the chat template cannot format.
    # NOTE(review): newer Gradio chat payloads appear to deliver content as a
    # list of typed parts; confirm against the installed Gradio version.
    messages.extend(
        {"role": turn["role"], "content": _content_to_text(turn.get("content"))}
        for turn in history
        if isinstance(turn, dict) and turn.get("role")
    )
    messages.append({"role": "user", "content": message})

    # Show the user's turn right away; the first token can take a while, and
    # this also guarantees the turn is displayed if the model emits nothing.
    yield history + [{"role": "user", "content": message}]

    # 2. Stream tokens
    response = ""
    stream = llm.create_chat_completion(
        messages=messages,
        temperature=temperature,
        max_tokens=int(max_tokens),
        top_p=top_p,
        top_k=int(top_k),
        repeat_penalty=repeat_penalty,
        stream=True,
    )
    for chunk in stream:
        delta = chunk["choices"][0].get("delta", {})
        token = delta.get("content") or ""
        if not token:
            # Role-only and final chunks carry no text.
            continue
        response += token
        # 3. Yield the whole conversation so the chatbot re-renders it
        yield history + [
            {"role": "user", "content": message},
            {"role": "assistant", "content": response},
        ]
# ─── UI ───────────────────────────────────────────────────────────────────────
# Stylesheet passed to Gradio at launch: caps the page width and fixes the
# height of the element whose id is "chatbot".
CSS = "\n".join(
    (
        "",
        ".gradio-container { max-width: 1100px !important; }",
        "#chatbot { height: 650px !important; }",
        "",
    )
)
# Two-column layout: the conversation and input box on the left, generation
# settings on the right. Every widget value is passed to `respond` per request.
with gr.Blocks(title="Gemma-4 12B Q4 CPU") as demo:
    gr.Markdown(
        "# 💎 Gemma-4 12B · Q4_K_M · Extreme CPU Tuning\n"
        "**2 Cores** · **16GB RAM** · **q8_0 KV** · **4096 ctx**"
    )
    with gr.Row(equal_height=False):
        with gr.Column(scale=5):
            chatbot = gr.Chatbot(elem_id="chatbot")
            with gr.Row():
                msg_input = gr.Textbox(
                    placeholder="Type your message…",
                    show_label=False,
                    scale=5,
                    autofocus=True,
                )
                send_btn = gr.Button("Send", variant="primary", scale=1)
        with gr.Column(scale=2):
            system_prompt = gr.Textbox(
                label="System Prompt",
                value="You are a helpful, harmless, and honest assistant.",
                lines=4,
            )
            temperature = gr.Slider(label="Temperature", minimum=0.0, maximum=2.0, value=0.7, step=0.05)
            max_tokens = gr.Slider(label="Max Tokens", minimum=64, maximum=4096, value=1024, step=64)
            top_p = gr.Slider(label="Top P", minimum=0.0, maximum=1.0, value=0.95, step=0.05)
            top_k = gr.Slider(label="Top K", minimum=1, maximum=200, value=64, step=1)
            repeat_penalty = gr.Slider(label="Repeat Penalty", minimum=1.0, maximum=2.0, value=1.1, step=0.05)
            clear_btn = gr.Button("🗑️ Clear Chat", variant="secondary")

    chat_inputs = [
        msg_input,
        chatbot,
        system_prompt,
        temperature,
        max_tokens,
        top_p,
        top_k,
        repeat_penalty,
    ]
    # Enter and the Send button both start a generation. They share a single
    # concurrency slot so that only one request at a time drives the one
    # module-level `llm` instance; as separate listeners each would otherwise
    # get its own slot and two generations could overlap.
    for trigger in (msg_input.submit, send_btn.click):
        trigger(
            respond,
            chat_inputs,
            chatbot,
            concurrency_id="llm",
            concurrency_limit=1,
        ).then(lambda: "", None, msg_input)  # empty the input box once the reply finishes

    # Clearing is a plain UI reset, so it bypasses the queue.
    clear_btn.click(lambda: None, None, chatbot, queue=False)
if __name__ == "__main__":
    # Serve on all interfaces at port 7860 with the Soft/indigo theme and the
    # module-level stylesheet.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "theme": gr.themes.Soft(primary_hue="indigo"),
        "css": CSS,
    }
    demo.launch(**launch_options)