From c068d86bf0f9613d6cc7caba95e7cf2d98cf5a87 Mon Sep 17 00:00:00 2001 From: Dmitry Ng <19asdek91@gmail.com> Date: Sat, 2 May 2026 18:57:42 +0300 Subject: [PATCH] feat(docker): update Dockerfile and add new vLLM configurations - Added new provider configurations for vLLM Qwen 3.6 in both thinking and non-thinking modes. - Updated the Dockerfile to include the new configuration files for vLLM Qwen 3.6 and ensure proper setup for deployment. --- Dockerfile | 4 +- ...vllm-qwen3.6-27b-fp8-no-think.provider.yml | 193 ++++++++++++++++++ .../configs/vllm-qwen3.6-27b-fp8.provider.yml | 174 ++++++++++++++++ 3 files changed, 370 insertions(+), 1 deletion(-) create mode 100644 examples/configs/vllm-qwen3.6-27b-fp8-no-think.provider.yml create mode 100644 examples/configs/vllm-qwen3.6-27b-fp8.provider.yml diff --git a/Dockerfile b/Dockerfile index c1b1b81..850b044 100644 --- a/Dockerfile +++ b/Dockerfile @@ -171,8 +171,10 @@ COPY examples/configs/ollama-qwen332b-fp16-tc.provider.yml /opt/pentagi/conf/ COPY examples/configs/ollama-qwq32b-fp16-tc.provider.yml /opt/pentagi/conf/ COPY examples/configs/openrouter.provider.yml /opt/pentagi/conf/ COPY examples/configs/novita.provider.yml /opt/pentagi/conf/ -COPY examples/configs/vllm-qwen3.5-27b-fp8.provider.yml /opt/pentagi/conf/ COPY examples/configs/vllm-qwen3.5-27b-fp8-no-think.provider.yml /opt/pentagi/conf/ +COPY examples/configs/vllm-qwen3.5-27b-fp8.provider.yml /opt/pentagi/conf/ +COPY examples/configs/vllm-qwen3.6-27b-fp8-no-think.provider.yml /opt/pentagi/conf/ +COPY examples/configs/vllm-qwen3.6-27b-fp8.provider.yml /opt/pentagi/conf/ COPY examples/configs/vllm-qwen332b-fp16.provider.yml /opt/pentagi/conf/ COPY LICENSE /opt/pentagi/LICENSE diff --git a/examples/configs/vllm-qwen3.6-27b-fp8-no-think.provider.yml b/examples/configs/vllm-qwen3.6-27b-fp8-no-think.provider.yml new file mode 100644 index 0000000..6007e2b --- /dev/null +++ b/examples/configs/vllm-qwen3.6-27b-fp8-no-think.provider.yml @@ -0,0 +1,193 @@ +# 
Qwen3.6-27B FP8 Provider Configuration - NON-THINKING MODE +# Based on official Qwen recommendations for vLLM inference +# Architecture: Hybrid 75% DeltaNet + 25% Full Attention (48+16 layers) +# Context: 262K native, expandable to 1M with YaRN +# Vision: VLM with Vision Encoder (uses VRAM even for text-only tasks) +# +# Thinking is disabled (non-thinking mode) for every agent via extra_body: chat_template_kwargs.enable_thinking=false +# Recommended sampling parameters (pp = presence_penalty, rp = repetition_penalty): +# - General tasks: temp=0.7, top_p=0.8, top_k=20, min_p=0.0, pp=1.5, rp=1.0 +# - Reasoning tasks: temp=1.0, top_p=0.95, top_k=20, min_p=0.0, pp=1.5, rp=1.0 + +simple: + model: "Qwen/Qwen3.6-27B-FP8" + temperature: 0.7 + top_k: 20 + top_p: 0.8 + min_p: 0.0 + presence_penalty: 1.5 + repetition_penalty: 1.0 + n: 1 + max_tokens: 32768 + extra_body: + chat_template_kwargs: + enable_thinking: false + +simple_json: + model: "Qwen/Qwen3.6-27B-FP8" + temperature: 0.7 + top_k: 20 + top_p: 0.8 + min_p: 0.0 + presence_penalty: 1.5 + repetition_penalty: 1.0 + n: 1 + max_tokens: 32768 + json: true + extra_body: + chat_template_kwargs: + enable_thinking: false + +primary_agent: + model: "Qwen/Qwen3.6-27B-FP8" + temperature: 1.0 + top_k: 20 + top_p: 0.95 + min_p: 0.0 + presence_penalty: 1.5 + repetition_penalty: 1.0 + n: 1 + max_tokens: 32768 + extra_body: + chat_template_kwargs: + enable_thinking: false + +assistant: + model: "Qwen/Qwen3.6-27B-FP8" + temperature: 1.0 + top_k: 20 + top_p: 0.95 + min_p: 0.0 + presence_penalty: 1.5 + repetition_penalty: 1.0 + n: 1 + max_tokens: 32768 + extra_body: + chat_template_kwargs: + enable_thinking: false + +generator: + model: "Qwen/Qwen3.6-27B-FP8" + temperature: 1.0 + top_k: 20 + top_p: 0.95 + min_p: 0.0 + presence_penalty: 1.5 + repetition_penalty: 1.0 + n: 1 + max_tokens: 32768 + extra_body: + chat_template_kwargs: + enable_thinking: false + +refiner: + model: "Qwen/Qwen3.6-27B-FP8" + temperature: 1.0 + top_k: 20 + top_p: 0.95 + min_p: 0.0 + presence_penalty: 1.5 + repetition_penalty: 1.0 + n: 1 + max_tokens: 32768 + extra_body: + 
chat_template_kwargs: + enable_thinking: false + +adviser: + model: "Qwen/Qwen3.6-27B-FP8" + temperature: 1.0 + top_k: 20 + top_p: 0.95 + min_p: 0.0 + presence_penalty: 1.5 + repetition_penalty: 1.0 + n: 1 + max_tokens: 32768 + extra_body: + chat_template_kwargs: + enable_thinking: false + +reflector: + model: "Qwen/Qwen3.6-27B-FP8" + temperature: 1.0 + top_k: 20 + top_p: 0.95 + min_p: 0.0 + presence_penalty: 1.5 + repetition_penalty: 1.0 + n: 1 + max_tokens: 32768 + extra_body: + chat_template_kwargs: + enable_thinking: false + +searcher: + model: "Qwen/Qwen3.6-27B-FP8" + temperature: 0.7 + top_k: 20 + top_p: 0.8 + min_p: 0.0 + presence_penalty: 1.5 + repetition_penalty: 1.0 + n: 1 + max_tokens: 32768 + extra_body: + chat_template_kwargs: + enable_thinking: false + +enricher: + model: "Qwen/Qwen3.6-27B-FP8" + temperature: 0.7 + top_k: 20 + top_p: 0.8 + min_p: 0.0 + presence_penalty: 1.5 + repetition_penalty: 1.0 + n: 1 + max_tokens: 32768 + extra_body: + chat_template_kwargs: + enable_thinking: false + +coder: + model: "Qwen/Qwen3.6-27B-FP8" + temperature: 1.0 + top_k: 20 + top_p: 0.95 + min_p: 0.0 + presence_penalty: 1.5 + repetition_penalty: 1.0 + n: 1 + max_tokens: 32768 + extra_body: + chat_template_kwargs: + enable_thinking: false + +installer: + model: "Qwen/Qwen3.6-27B-FP8" + temperature: 1.0 + top_k: 20 + top_p: 0.95 + min_p: 0.0 + presence_penalty: 1.5 + repetition_penalty: 1.0 + n: 1 + max_tokens: 32768 + extra_body: + chat_template_kwargs: + enable_thinking: false + +pentester: + model: "Qwen/Qwen3.6-27B-FP8" + temperature: 1.0 + top_k: 20 + top_p: 0.95 + min_p: 0.0 + presence_penalty: 1.5 + repetition_penalty: 1.0 + n: 1 + max_tokens: 32768 + extra_body: + chat_template_kwargs: + enable_thinking: false diff --git a/examples/configs/vllm-qwen3.6-27b-fp8.provider.yml b/examples/configs/vllm-qwen3.6-27b-fp8.provider.yml new file mode 100644 index 0000000..82433ea --- /dev/null +++ b/examples/configs/vllm-qwen3.6-27b-fp8.provider.yml @@ -0,0 +1,174 @@ +# 
Qwen3.6-27B FP8 Provider Configuration - THINKING MODE (default) +# Based on official Qwen recommendations for vLLM inference +# Architecture: Hybrid 75% DeltaNet + 25% Full Attention (48+16 layers) +# Context: 262K native, expandable to 1M with YaRN +# Vision: VLM with Vision Encoder (uses VRAM even for text-only tasks) +# +# Thinking mode is enabled by default (no extra_body needed) +# Recommended sampling parameters (pp = presence_penalty, rp = repetition_penalty): +# - General tasks: temp=1.0, top_p=0.95, top_k=20, min_p=0.0, pp=1.5, rp=1.0 +# - Precise coding: temp=0.6, top_p=0.95, top_k=20, min_p=0.0, pp=0.0, rp=1.0 +# +# Non-thinking mode (simple, simple_json, reflector, searcher, enricher) disables thinking via extra_body: enable_thinking=false +# Recommended sampling parameters: +# - General tasks: temp=0.7, top_p=0.8, top_k=20, min_p=0.0, pp=1.5, rp=1.0 +# - Reasoning tasks: temp=1.0, top_p=0.95, top_k=20, min_p=0.0, pp=1.5, rp=1.0 + +simple: + model: "Qwen/Qwen3.6-27B-FP8" + temperature: 0.7 + top_k: 20 + top_p: 0.8 + min_p: 0.0 + presence_penalty: 1.5 + repetition_penalty: 1.0 + n: 1 + max_tokens: 32768 + extra_body: + chat_template_kwargs: + enable_thinking: false + +simple_json: + model: "Qwen/Qwen3.6-27B-FP8" + temperature: 0.7 + top_k: 20 + top_p: 0.8 + min_p: 0.0 + presence_penalty: 1.5 + repetition_penalty: 1.0 + n: 1 + max_tokens: 32768 + json: true + extra_body: + chat_template_kwargs: + enable_thinking: false + +primary_agent: + model: "Qwen/Qwen3.6-27B-FP8" + temperature: 1.0 + top_k: 20 + top_p: 0.95 + min_p: 0.0 + presence_penalty: 1.5 + repetition_penalty: 1.0 + n: 1 + max_tokens: 32768 + +assistant: + model: "Qwen/Qwen3.6-27B-FP8" + temperature: 1.0 + top_k: 20 + top_p: 0.95 + min_p: 0.0 + presence_penalty: 1.5 + repetition_penalty: 1.0 + n: 1 + max_tokens: 32768 + +generator: + model: "Qwen/Qwen3.6-27B-FP8" + temperature: 1.0 + top_k: 20 + top_p: 0.95 + min_p: 0.0 + presence_penalty: 1.5 + repetition_penalty: 1.0 + n: 1 + max_tokens: 32768 + +refiner: + model: "Qwen/Qwen3.6-27B-FP8" + temperature: 1.0 + top_k: 20 + top_p: 0.95 + min_p: 0.0 + 
presence_penalty: 1.5 + repetition_penalty: 1.0 + n: 1 + max_tokens: 32768 + +adviser: + model: "Qwen/Qwen3.6-27B-FP8" + temperature: 1.0 + top_k: 20 + top_p: 0.95 + min_p: 0.0 + presence_penalty: 1.5 + repetition_penalty: 1.0 + n: 1 + max_tokens: 32768 + +reflector: + model: "Qwen/Qwen3.6-27B-FP8" + temperature: 1.0 + top_k: 20 + top_p: 0.95 + min_p: 0.0 + presence_penalty: 1.5 + repetition_penalty: 1.0 + n: 1 + max_tokens: 32768 + extra_body: + chat_template_kwargs: + enable_thinking: false + +searcher: + model: "Qwen/Qwen3.6-27B-FP8" + temperature: 0.7 + top_k: 20 + top_p: 0.8 + min_p: 0.0 + presence_penalty: 1.5 + repetition_penalty: 1.0 + n: 1 + max_tokens: 32768 + extra_body: + chat_template_kwargs: + enable_thinking: false + +enricher: + model: "Qwen/Qwen3.6-27B-FP8" + temperature: 0.7 + top_k: 20 + top_p: 0.8 + min_p: 0.0 + presence_penalty: 1.5 + repetition_penalty: 1.0 + n: 1 + max_tokens: 32768 + extra_body: + chat_template_kwargs: + enable_thinking: false + +coder: + model: "Qwen/Qwen3.6-27B-FP8" + temperature: 0.6 + top_k: 20 + top_p: 0.95 + min_p: 0.0 + presence_penalty: 0.0 + repetition_penalty: 1.0 + n: 1 + max_tokens: 32768 + +installer: + model: "Qwen/Qwen3.6-27B-FP8" + temperature: 0.6 + top_k: 20 + top_p: 0.95 + min_p: 0.0 + presence_penalty: 0.0 + repetition_penalty: 1.0 + n: 1 + max_tokens: 32768 + +pentester: + model: "Qwen/Qwen3.6-27B-FP8" + temperature: 0.6 + top_k: 20 + top_p: 0.95 + min_p: 0.0 + presence_penalty: 0.0 + repetition_penalty: 1.0 + n: 1 + max_tokens: 32768