File size: 5,393 Bytes
b84112b
a7e488b
b84112b
 
cf76dc8
 
 
a7e488b
 
 
 
 
 
 
 
 
698c39e
a7e488b
 
 
cf76dc8
 
 
 
a756324
 
a7e488b
 
a756324
b84112b
a756324
a7e488b
 
 
 
 
 
a756324
cf76dc8
 
 
 
a7e488b
cf76dc8
 
a7e488b
0d0b6aa
a7e488b
cf76dc8
 
 
a756324
a7e488b
a756324
cf76dc8
 
a7e488b
cf76dc8
 
 
 
 
 
 
 
 
 
 
 
 
 
a7e488b
a756324
 
 
 
cf76dc8
 
a7e488b
cf76dc8
a7e488b
 
cf76dc8
 
a7e488b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0d0b6aa
a7e488b
 
0d0b6aa
 
d669dab
 
 
a7e488b
b84112b
d669dab
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import os
# Turn off Gradio's server-side rendering (SSR), which needs a Node.js server,
# on this 2-core host. Gradio reads the switch from GRADIO_SSR_MODE; any value
# other than "true" leaves SSR disabled.
os.environ["GRADIO_SSR_MODE"] = "False"

import gradio as gr
import llama_cpp
from llama_cpp import Llama

# ─── Extreme 2-Thread / 16GB RAM Optimization ────────────────────────────────
# RAM Budget (16 GB Total):
#   Model (Q4_K_M)          ≈  7.4 GB
#   KV Cache (q8_0, 4096)   ≈  1.3 GB
#   OS / Gradio / Python    ≈  3.0 GB
#   Safety Headroom         ≈  4.3 GB
# CPU Budget (2 Threads):
#   Locked to 2 threads to eliminate context-switching overhead.
# ──────────────────────────────────────────────────────────────────────────────

# Number of CPU threads handed to llama.cpp for both generation and batch
# (prompt) processing; set to the host's core count.
N_THREADS = 2

print("Optimizing for 16GB RAM / 2 CPU Cores...")

# Download (or reuse the cached copy of) the GGUF file and build the single,
# process-wide model instance that the chat handler streams from.
llm = Llama.from_pretrained(
    repo_id="yuxinlu1/gemma-4-12B-agentic-fable5-composer2.5-v2-3.5x-tau2-GGUF",
    filename="gemma4-v2-Q4_K_M.gguf",

    # ── Context & Memory ───────────────────────────────────────────────
    n_ctx=4096,                # Context window, in tokens
    # KV-cache quantization is selected with `type_k` / `type_v` (ggml type
    # ids). An unknown keyword such as `cache_type` is absorbed by the
    # constructor's **kwargs and ignored, leaving the cache at its default
    # precision.
    type_k=llama_cpp.GGML_TYPE_Q8_0,
    type_v=llama_cpp.GGML_TYPE_Q8_0,
    flash_attn=True,           # llama.cpp needs flash attention for a quantized V cache
    use_mlock=True,            # Ask the OS to keep the weights in physical RAM
    use_mmap=True,             # Memory-map the model file instead of copying it

    # ── CPU Tuning ─────────────────────────────────────────────────────
    n_gpu_layers=0,            # CPU only
    n_threads=N_THREADS,       # Generation threads
    n_threads_batch=N_THREADS, # Prompt-processing threads
    n_batch=512,               # Logical batch size for prompt processing
    n_ubatch=32,               # Physical micro-batch size

    verbose=False,
    chat_format="gemma",
)

print("Model loaded & locked | 2 Threads | 4096 ctx | q8_0 KV Cache")


# ─── Chat Function (Gradio 6 Dict Format) ────────────────────────────────────
def _content_to_text(content):
    """Flatten a chat message's ``content`` value into a plain string.

    Accepts a string, ``None``, a single content part (a dict carrying a
    ``"text"`` entry), or a list/tuple of strings and parts. Parts without
    text (for example file attachments) contribute nothing.
    """
    if content is None:
        return ""
    if isinstance(content, str):
        return content
    if isinstance(content, dict):
        text = content.get("text")
        return text if isinstance(text, str) else ""
    if isinstance(content, (list, tuple)):
        pieces = (_content_to_text(part) for part in content)
        return "\n".join(piece for piece in pieces if piece)
    return str(content)


def _history_to_messages(history):
    """Reduce UI chat history to bare ``{"role", "content"}`` text messages.

    Entries that are not dicts, have an unexpected role, or carry no text are
    skipped, and any extra keys the UI attached to a turn are dropped, so the
    model's prompt formatter only ever receives string content.
    """
    messages = []
    for turn in history or []:
        if not isinstance(turn, dict):
            continue
        role = turn.get("role")
        if role not in ("system", "user", "assistant"):
            continue
        text = _content_to_text(turn.get("content"))
        if text:
            messages.append({"role": role, "content": text})
    return messages


def respond(message, history, system_prompt, temperature, max_tokens, top_p, top_k, repeat_penalty):
    """Stream the assistant's reply to ``message`` as growing chat histories.

    Args:
        message: Text the user just submitted.
        history: Previous turns as a list of ``{"role": ..., "content": ...}``
            dicts; ``None`` is treated as an empty conversation. Content may
            be a plain string or a list of content parts (NOTE(review):
            confirm the exact shape against the installed Gradio version).
        system_prompt: Optional system instruction; ignored when blank.
        temperature: Sampling temperature.
        max_tokens: Upper bound on generated tokens.
        top_p: Nucleus-sampling threshold.
        top_k: Top-k sampling cutoff.
        repeat_penalty: Repetition penalty.

    Yields:
        The full chat history to display: first with the new user turn
        appended, then again after every streamed token with the partial
        assistant reply appended as well. A blank ``message`` yields the
        unchanged history once and never calls the model.
    """
    history = list(history or [])

    # Nothing to answer: leave the conversation as it is.
    if not message or not message.strip():
        yield history
        return

    # 1. Build the text-only message list for llama.cpp
    messages = []
    system_text = (system_prompt or "").strip()
    if system_text:
        messages.append({"role": "system", "content": system_text})
    messages.extend(_history_to_messages(history))
    messages.append({"role": "user", "content": message})

    # 2. Show the user's turn immediately; prompt processing can take a while
    #    before the first token arrives, and a reply with no tokens would
    #    otherwise never display the turn at all.
    user_turn = {"role": "user", "content": message}
    yield history + [user_turn]

    # 3. Stream tokens, re-emitting the whole history after each one
    stream = llm.create_chat_completion(
        messages=messages,
        temperature=temperature,
        max_tokens=int(max_tokens),  # sliders may hand back floats
        top_p=top_p,
        top_k=int(top_k),
        repeat_penalty=repeat_penalty,
        stream=True,
    )
    response = ""
    for chunk in stream:
        token = chunk["choices"][0].get("delta", {}).get("content")
        if token:
            response += token
            yield history + [
                user_turn,
                {"role": "assistant", "content": response},
            ]


# ─── UI ───────────────────────────────────────────────────────────────────────
# Page-level CSS: caps the app width and fixes the chat panel's height.
CSS = """
.gradio-container { max-width: 1100px !important; }
#chatbot { height: 650px !important; }
"""

# Layout: chat panel with input row on the left, generation settings on the right.
with gr.Blocks(title="Gemma-4 12B Q4 CPU") as demo:
    gr.Markdown("# 💎 Gemma-4 12B · Q4_K_M · Extreme CPU Tuning\n**2 Cores** · **16GB RAM** · **q8_0 KV** · **4096 ctx**")

    with gr.Row(equal_height=False):
        with gr.Column(scale=5):
            chatbot = gr.Chatbot(elem_id="chatbot")
            with gr.Row():
                msg_input = gr.Textbox(placeholder="Type your message…", show_label=False, scale=5, autofocus=True)
                send_btn = gr.Button("Send", variant="primary", scale=1)

        with gr.Column(scale=2):
            system_prompt = gr.Textbox(label="System Prompt", value="You are a helpful, harmless, and honest assistant.", lines=4)
            temperature = gr.Slider(label="Temperature", minimum=0.0, maximum=2.0, value=0.7, step=0.05)
            max_tokens = gr.Slider(label="Max Tokens", minimum=64, maximum=4096, value=1024, step=64)
            top_p = gr.Slider(label="Top P", minimum=0.0, maximum=1.0, value=0.95, step=0.05)
            top_k = gr.Slider(label="Top K", minimum=1, maximum=200, value=64, step=1)
            repeat_penalty = gr.Slider(label="Repeat Penalty", minimum=1.0, maximum=2.0, value=1.1, step=0.05)
            clear_btn = gr.Button("🗑 Clear Chat", variant="secondary")

    respond_inputs = [msg_input, chatbot, system_prompt, temperature, max_tokens, top_p, top_k, repeat_penalty]

    # Enter and the Send button trigger the same handler. Both listeners share
    # one queue (same concurrency_id, limit 1) so that only a single generation
    # runs against the shared model instance at any time.
    for evt in (msg_input.submit, send_btn.click):
        evt(
            respond,
            respond_inputs,
            chatbot,
            concurrency_limit=1,
            concurrency_id="llm",
        ).then(lambda: "", None, msg_input)  # empty the input box once the reply has finished

    # Reset the conversation without going through the queue.
    clear_btn.click(lambda: None, None, chatbot, queue=False)

if __name__ == "__main__":
    # Serve on every network interface at port 7860, applying the Soft theme
    # (indigo primary hue) and the page CSS defined above.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "theme": gr.themes.Soft(primary_hue="indigo"),
        "css": CSS,
    }
    demo.launch(**launch_options)