import os

# Server-side rendering must be switched off *before* gradio is imported.
# Gradio documents GRADIO_SSR_MODE as the variable that controls SSR; the
# original GRADIO_SSR assignment is kept so that anything else that may be
# reading it keeps seeing the same value.
os.environ["GRADIO_SSR"] = "0"
os.environ["GRADIO_SSR_MODE"] = "false"

import gradio as gr
from llama_cpp import Llama
|
|
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
# CPU threads used for both token generation and prompt (batch) processing.
N_THREADS = 2

# Context window, in tokens.
N_CTX = 4096

# Numeric value of ggml's GGML_TYPE_Q8_0 enum member, which is what the
# `type_k` / `type_v` arguments of llama-cpp-python's Llama expect.
GGML_TYPE_Q8_0 = 8

print("Optimizing for 16GB RAM / 2 CPU Cores...")

# Download (or reuse the cached copy of) the GGUF file and load it CPU-only.
llm = Llama.from_pretrained(
    repo_id="yuxinlu1/gemma-4-12B-agentic-fable5-composer2.5-v2-3.5x-tau2-GGUF",
    filename="gemma4-v2-Q4_K_M.gguf",
    # --- memory ---
    n_ctx=N_CTX,
    # The KV-cache precision is selected with type_k / type_v.  The previous
    # `cache_type="q8_0"` keyword is not a Llama parameter: it was swallowed
    # by **kwargs and silently ignored, leaving the cache at its default type.
    type_k=GGML_TYPE_Q8_0,
    type_v=GGML_TYPE_Q8_0,
    # llama.cpp only allows a quantized V cache when flash attention is on.
    flash_attn=True,
    use_mlock=True,
    use_mmap=True,
    # --- compute ---
    n_gpu_layers=0,
    n_threads=N_THREADS,
    n_threads_batch=N_THREADS,
    n_batch=512,
    n_ubatch=32,
    verbose=False,
    chat_format="gemma",
)

# Built from the constants above so the banner cannot drift from the settings.
print(f"Model loaded & locked | {N_THREADS} Threads | {N_CTX} ctx | q8_0 KV Cache")
|
|
|
|
| |
def _content_to_text(content):
    """Flatten a chat message's ``content`` field into a plain string.

    Accepts ``None``, a plain string, a single content-block dict (its
    ``"text"`` value is used) or a list/tuple of any of these.  Blocks that
    carry no text contribute nothing to the result.
    """
    if content is None:
        return ""
    if isinstance(content, str):
        return content
    if isinstance(content, dict):
        text = content.get("text")
        return text if isinstance(text, str) else ""
    if isinstance(content, (list, tuple)):
        parts = (_content_to_text(item) for item in content)
        return "\n".join(part for part in parts if part)
    return str(content)


def _build_messages(message, history, system_prompt):
    """Assemble the role/content dicts that are sent to the model."""
    # Keep only the two keys the model needs and make every content a string;
    # UI-side extras on the history entries are not forwarded.
    messages = [
        {"role": turn.get("role"), "content": _content_to_text(turn.get("content"))}
        for turn in history
    ]
    messages.append({"role": "user", "content": message})

    instructions = (system_prompt or "").strip()
    if instructions:
        # The model is loaded with chat_format="gemma".  That template has no
        # system role and llama-cpp-python's formatter drops system messages,
        # so the instructions are folded into the first user turn instead.
        first_user = next(m for m in messages if m["role"] == "user")
        first_user["content"] = f"{instructions}\n\n{first_user['content']}"
    return messages


def respond(message, history, system_prompt, temperature, max_tokens, top_p, top_k, repeat_penalty):
    """Stream the assistant's reply to ``message`` as successive chat histories.

    Args:
        message: Text the user just submitted.
        history: Previous turns as a list of ``{"role", "content"}`` dicts
            (``None`` is treated as empty).  ``content`` may be a string or a
            list of content blocks.
        system_prompt: Optional instructions; ignored when blank.
        temperature, top_p, top_k, repeat_penalty: Sampling settings passed to
            the model.
        max_tokens: Upper bound on the number of generated tokens.

    Yields:
        The chat history extended with the user's message and the assistant
        reply generated so far.  A blank ``message`` yields the history
        unchanged and does not call the model.
    """
    history = list(history or [])

    # Nothing to answer: do not spend a generation on an empty prompt.
    if not (message or "").strip():
        yield history
        return

    shown = history + [{"role": "user", "content": message}]
    # Show the user's message right away; the first token can take a while.
    yield shown + [{"role": "assistant", "content": ""}]

    stream = llm.create_chat_completion(
        messages=_build_messages(message, history, system_prompt),
        temperature=temperature,
        # Sliders may hand back floats; these two are integer settings.
        max_tokens=int(max_tokens),
        top_p=top_p,
        top_k=int(top_k),
        repeat_penalty=repeat_penalty,
        stream=True,
    )

    reply = ""
    for chunk in stream:
        token = chunk["choices"][0].get("delta", {}).get("content")
        if token:
            reply += token
            yield shown + [{"role": "assistant", "content": reply}]
|
|
|
|
| |
# Page-level stylesheet: caps the container width and fixes the height of the
# element with id "chatbot".
CSS = (
    "\n"
    ".gradio-container { max-width: 1100px !important; }\n"
    "#chatbot { height: 650px !important; }\n"
)
|
|
# UI definition: chat panel on the left, generation settings on the right.
with gr.Blocks(title="Gemma-4 12B Q4 CPU") as demo:
    gr.Markdown(
        "# 💎 Gemma-4 12B · Q4_K_M · Extreme CPU Tuning\n"
        "**2 Cores** · **16GB RAM** · **q8_0 KV** · **4096 ctx**"
    )

    with gr.Row(equal_height=False):
        # Conversation view plus the message entry row.
        with gr.Column(scale=5):
            chatbot = gr.Chatbot(elem_id="chatbot")
            with gr.Row():
                msg_input = gr.Textbox(
                    placeholder="Type your message…",
                    show_label=False,
                    scale=5,
                    autofocus=True,
                )
                send_btn = gr.Button("Send", variant="primary", scale=1)

        # System prompt and sampling controls.
        with gr.Column(scale=2):
            system_prompt = gr.Textbox(
                label="System Prompt",
                value="You are a helpful, harmless, and honest assistant.",
                lines=4,
            )
            temperature = gr.Slider(
                label="Temperature", minimum=0.0, maximum=2.0, value=0.7, step=0.05
            )
            max_tokens = gr.Slider(
                label="Max Tokens", minimum=64, maximum=4096, value=1024, step=64
            )
            top_p = gr.Slider(
                label="Top P", minimum=0.0, maximum=1.0, value=0.95, step=0.05
            )
            top_k = gr.Slider(
                label="Top K", minimum=1, maximum=200, value=64, step=1
            )
            repeat_penalty = gr.Slider(
                label="Repeat Penalty", minimum=1.0, maximum=2.0, value=1.1, step=0.05
            )
            clear_btn = gr.Button("🗑 Clear Chat", variant="secondary")

    # Components handed to respond(), in the order of its parameters.
    respond_inputs = [
        msg_input,
        chatbot,
        system_prompt,
        temperature,
        max_tokens,
        top_p,
        top_k,
        repeat_penalty,
    ]

    # Pressing Enter and clicking "Send" behave the same: stream the reply
    # into the chatbot, then empty the input box once the stream has ended.
    for trigger in (msg_input.submit, send_btn.click):
        generation = trigger(respond, respond_inputs, chatbot)
        generation.then(lambda: "", None, msg_input)

    # Reset the conversation without going through the queue.
    clear_btn.click(lambda: None, None, chatbot, queue=False)
|
|
if __name__ == "__main__":
    # Serve on every network interface at port 7860, with the Soft theme in
    # indigo and the stylesheet defined in CSS.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "theme": gr.themes.Soft(primary_hue="indigo"),
        "css": CSS,
    }
    demo.launch(**launch_options)