feat(docker): update Dockerfile and add new vLLM configurations

- Added new provider configurations for vLLM Qwen3.6-27B (FP8) in both thinking and non-thinking modes.
- Updated the Dockerfile to copy the new Qwen3.6 configuration files into /opt/pentagi/conf/ so they are available in the deployed image.
Author: Dmitry Ng
Date: 2026-05-02 18:57:42 +03:00
parent 268732f610
commit c068d86bf0
3 changed files with 370 additions and 1 deletion
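All of the new configs reference the model name Qwen/Qwen3.6-27B-FP8 served over an OpenAI-compatible API. A minimal sketch of serving it with vLLM via Docker Compose is shown below; the image tag, port, context length, and GPU reservation are illustrative assumptions to check against the vLLM documentation and the model card.

services:
  vllm:
    image: vllm/vllm-openai:latest   # pin a concrete release in practice
    command:
      - --model
      - Qwen/Qwen3.6-27B-FP8
      - --max-model-len
      - "262144"                     # native context per the config headers below
    ports:
      - "8000:8000"
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]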
Dockerfile  +3 -1
@@ -171,8 +171,10 @@ COPY examples/configs/ollama-qwen332b-fp16-tc.provider.yml /opt/pentagi/conf/
COPY examples/configs/ollama-qwq32b-fp16-tc.provider.yml /opt/pentagi/conf/
COPY examples/configs/openrouter.provider.yml /opt/pentagi/conf/
COPY examples/configs/novita.provider.yml /opt/pentagi/conf/
COPY examples/configs/vllm-qwen3.5-27b-fp8.provider.yml /opt/pentagi/conf/
COPY examples/configs/vllm-qwen3.5-27b-fp8-no-think.provider.yml /opt/pentagi/conf/
COPY examples/configs/vllm-qwen3.5-27b-fp8.provider.yml /opt/pentagi/conf/
COPY examples/configs/vllm-qwen3.6-27b-fp8-no-think.provider.yml /opt/pentagi/conf/
COPY examples/configs/vllm-qwen3.6-27b-fp8.provider.yml /opt/pentagi/conf/
COPY examples/configs/vllm-qwen332b-fp16.provider.yml /opt/pentagi/conf/
COPY LICENSE /opt/pentagi/LICENSE
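Inside the image the configs end up under /opt/pentagi/conf/, so a deployment only needs to point the application at one of them and at the vLLM endpoint. A rough sketch, assuming PentAGI's custom LLM provider is wired up through LLM_SERVER_* environment variables (those variable names and the exact mechanism are assumptions to verify against the project's .env.example):

services:
  pentagi:
    environment:
      LLM_SERVER_URL: "http://vllm:8000/v1"     # endpoint from the serving sketch above
      LLM_SERVER_KEY: "dummy"                   # vLLM accepts any key unless --api-key is set
      LLM_SERVER_MODEL: "Qwen/Qwen3.6-27B-FP8"
      LLM_SERVER_CONFIG_PATH: "/opt/pentagi/conf/vllm-qwen3.6-27b-fp8.provider.yml"
      # use vllm-qwen3.6-27b-fp8-no-think.provider.yml to disable thinking for all agents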
examples/configs/vllm-qwen3.6-27b-fp8-no-think.provider.yml  +193 -0
@@ -0,0 +1,193 @@
# Qwen3.6-27B FP8 Provider Configuration - NON-THINKING MODE
# Based on official Qwen recommendations for vLLM inference
# Architecture: Hybrid 75% DeltaNet + 25% Full Attention (48+16 layers)
# Context: 262K native, expandable to 1M with YaRN
# Vision: VLM with Vision Encoder (uses VRAM even for text-only tasks)
#
# Thinking is disabled for every agent via the extra_body parameter (chat_template_kwargs.enable_thinking: false)
# Recommended sampling parameters (pp = presence_penalty, rp = repetition_penalty):
# - General tasks: temp=0.7, top_p=0.8, top_k=20, min_p=0.0, pp=1.5, rp=1.0
# - Reasoning tasks: temp=1.0, top_p=0.95, top_k=20, min_p=0.0, pp=1.5, rp=1.0
simple:
  model: "Qwen/Qwen3.6-27B-FP8"
  temperature: 0.7
  top_k: 20
  top_p: 0.8
  min_p: 0.0
  presence_penalty: 1.5
  repetition_penalty: 1.0
  n: 1
  max_tokens: 32768
  extra_body:
    chat_template_kwargs:
      enable_thinking: false
simple_json:
  model: "Qwen/Qwen3.6-27B-FP8"
  temperature: 0.7
  top_k: 20
  top_p: 0.8
  min_p: 0.0
  presence_penalty: 1.5
  repetition_penalty: 1.0
  n: 1
  max_tokens: 32768
  json: true
  extra_body:
    chat_template_kwargs:
      enable_thinking: false
primary_agent:
  model: "Qwen/Qwen3.6-27B-FP8"
  temperature: 1.0
  top_k: 20
  top_p: 0.95
  min_p: 0.0
  presence_penalty: 1.5
  repetition_penalty: 1.0
  n: 1
  max_tokens: 32768
  extra_body:
    chat_template_kwargs:
      enable_thinking: false
assistant:
  model: "Qwen/Qwen3.6-27B-FP8"
  temperature: 1.0
  top_k: 20
  top_p: 0.95
  min_p: 0.0
  presence_penalty: 1.5
  repetition_penalty: 1.0
  n: 1
  max_tokens: 32768
  extra_body:
    chat_template_kwargs:
      enable_thinking: false
generator:
  model: "Qwen/Qwen3.6-27B-FP8"
  temperature: 1.0
  top_k: 20
  top_p: 0.95
  min_p: 0.0
  presence_penalty: 1.5
  repetition_penalty: 1.0
  n: 1
  max_tokens: 32768
  extra_body:
    chat_template_kwargs:
      enable_thinking: false
refiner:
  model: "Qwen/Qwen3.6-27B-FP8"
  temperature: 1.0
  top_k: 20
  top_p: 0.95
  min_p: 0.0
  presence_penalty: 1.5
  repetition_penalty: 1.0
  n: 1
  max_tokens: 32768
  extra_body:
    chat_template_kwargs:
      enable_thinking: false
adviser:
  model: "Qwen/Qwen3.6-27B-FP8"
  temperature: 1.0
  top_k: 20
  top_p: 0.95
  min_p: 0.0
  presence_penalty: 1.5
  repetition_penalty: 1.0
  n: 1
  max_tokens: 32768
  extra_body:
    chat_template_kwargs:
      enable_thinking: false
reflector:
  model: "Qwen/Qwen3.6-27B-FP8"
  temperature: 1.0
  top_k: 20
  top_p: 0.95
  min_p: 0.0
  presence_penalty: 1.5
  repetition_penalty: 1.0
  n: 1
  max_tokens: 32768
  extra_body:
    chat_template_kwargs:
      enable_thinking: false
searcher:
  model: "Qwen/Qwen3.6-27B-FP8"
  temperature: 0.7
  top_k: 20
  top_p: 0.8
  min_p: 0.0
  presence_penalty: 1.5
  repetition_penalty: 1.0
  n: 1
  max_tokens: 32768
  extra_body:
    chat_template_kwargs:
      enable_thinking: false
enricher:
  model: "Qwen/Qwen3.6-27B-FP8"
  temperature: 0.7
  top_k: 20
  top_p: 0.8
  min_p: 0.0
  presence_penalty: 1.5
  repetition_penalty: 1.0
  n: 1
  max_tokens: 32768
  extra_body:
    chat_template_kwargs:
      enable_thinking: false
coder:
  model: "Qwen/Qwen3.6-27B-FP8"
  temperature: 1.0
  top_k: 20
  top_p: 0.95
  min_p: 0.0
  presence_penalty: 1.5
  repetition_penalty: 1.0
  n: 1
  max_tokens: 32768
  extra_body:
    chat_template_kwargs:
      enable_thinking: false
installer:
  model: "Qwen/Qwen3.6-27B-FP8"
  temperature: 1.0
  top_k: 20
  top_p: 0.95
  min_p: 0.0
  presence_penalty: 1.5
  repetition_penalty: 1.0
  n: 1
  max_tokens: 32768
  extra_body:
    chat_template_kwargs:
      enable_thinking: false
pentester:
  model: "Qwen/Qwen3.6-27B-FP8"
  temperature: 1.0
  top_k: 20
  top_p: 0.95
  min_p: 0.0
  presence_penalty: 1.5
  repetition_penalty: 1.0
  n: 1
  max_tokens: 32768
  extra_body:
    chat_template_kwargs:
      enable_thinking: false
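For reference, each agent section above corresponds to an OpenAI-compatible chat-completions request against vLLM, with the extra_body contents merged into the request payload (the exact merge behavior inside PentAGI is an assumption; vLLM's OpenAI-compatible server does accept top_k, min_p, repetition_penalty and chat_template_kwargs as extra fields). A sketch of the payload the simple agent would send, written as YAML for readability:

# POST http://vllm:8000/v1/chat/completions
model: "Qwen/Qwen3.6-27B-FP8"
temperature: 0.7
top_p: 0.8
top_k: 20
min_p: 0.0
presence_penalty: 1.5
repetition_penalty: 1.0
n: 1
max_tokens: 32768
chat_template_kwargs:        # from extra_body; instructs the chat template to skip the thinking block
  enable_thinking: false
messages:
  - role: user
    content: "..."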
examples/configs/vllm-qwen3.6-27b-fp8.provider.yml  +174 -0
@@ -0,0 +1,174 @@
# Qwen3.6-27B FP8 Provider Configuration - THINKING MODE (default)
# Based on official Qwen recommendations for vLLM inference
# Architecture: Hybrid 75% DeltaNet + 25% Full Attention (48+16 layers)
# Context: 262K native, expandable to 1M with YaRN
# Vision: VLM with Vision Encoder (uses VRAM even for text-only tasks)
#
# Thinking mode is enabled by default (no extra_body needed)
# Recommended sampling parameters (pp = presence_penalty, rp = repetition_penalty):
# - General tasks: temp=1.0, top_p=0.95, top_k=20, min_p=0.0, pp=1.5, rp=1.0
# - Precise coding: temp=0.6, top_p=0.95, top_k=20, min_p=0.0, pp=0.0, rp=1.0
#
# For selected agents below, thinking is disabled via the extra_body parameter (chat_template_kwargs.enable_thinking: false)
# Recommended sampling parameters:
# - General tasks: temp=0.7, top_p=0.8, top_k=20, min_p=0.0, pp=1.5, rp=1.0
# - Reasoning tasks: temp=1.0, top_p=0.95, top_k=20, min_p=0.0, pp=1.5, rp=1.0
simple:
  model: "Qwen/Qwen3.6-27B-FP8"
  temperature: 0.7
  top_k: 20
  top_p: 0.8
  min_p: 0.0
  presence_penalty: 1.5
  repetition_penalty: 1.0
  n: 1
  max_tokens: 32768
  extra_body:
    chat_template_kwargs:
      enable_thinking: false
simple_json:
  model: "Qwen/Qwen3.6-27B-FP8"
  temperature: 0.7
  top_k: 20
  top_p: 0.8
  min_p: 0.0
  presence_penalty: 1.5
  repetition_penalty: 1.0
  n: 1
  max_tokens: 32768
  json: true
  extra_body:
    chat_template_kwargs:
      enable_thinking: false
primary_agent:
  model: "Qwen/Qwen3.6-27B-FP8"
  temperature: 1.0
  top_k: 20
  top_p: 0.95
  min_p: 0.0
  presence_penalty: 1.5
  repetition_penalty: 1.0
  n: 1
  max_tokens: 32768
assistant:
  model: "Qwen/Qwen3.6-27B-FP8"
  temperature: 1.0
  top_k: 20
  top_p: 0.95
  min_p: 0.0
  presence_penalty: 1.5
  repetition_penalty: 1.0
  n: 1
  max_tokens: 32768
generator:
  model: "Qwen/Qwen3.6-27B-FP8"
  temperature: 1.0
  top_k: 20
  top_p: 0.95
  min_p: 0.0
  presence_penalty: 1.5
  repetition_penalty: 1.0
  n: 1
  max_tokens: 32768
refiner:
  model: "Qwen/Qwen3.6-27B-FP8"
  temperature: 1.0
  top_k: 20
  top_p: 0.95
  min_p: 0.0
  presence_penalty: 1.5
  repetition_penalty: 1.0
  n: 1
  max_tokens: 32768
adviser:
  model: "Qwen/Qwen3.6-27B-FP8"
  temperature: 1.0
  top_k: 20
  top_p: 0.95
  min_p: 0.0
  presence_penalty: 1.5
  repetition_penalty: 1.0
  n: 1
  max_tokens: 32768
reflector:
  model: "Qwen/Qwen3.6-27B-FP8"
  temperature: 1.0
  top_k: 20
  top_p: 0.95
  min_p: 0.0
  presence_penalty: 1.5
  repetition_penalty: 1.0
  n: 1
  max_tokens: 32768
  extra_body:
    chat_template_kwargs:
      enable_thinking: false
searcher:
  model: "Qwen/Qwen3.6-27B-FP8"
  temperature: 0.7
  top_k: 20
  top_p: 0.8
  min_p: 0.0
  presence_penalty: 1.5
  repetition_penalty: 1.0
  n: 1
  max_tokens: 32768
  extra_body:
    chat_template_kwargs:
      enable_thinking: false
enricher:
  model: "Qwen/Qwen3.6-27B-FP8"
  temperature: 0.7
  top_k: 20
  top_p: 0.8
  min_p: 0.0
  presence_penalty: 1.5
  repetition_penalty: 1.0
  n: 1
  max_tokens: 32768
  extra_body:
    chat_template_kwargs:
      enable_thinking: false
coder:
  model: "Qwen/Qwen3.6-27B-FP8"
  temperature: 0.6
  top_k: 20
  top_p: 0.95
  min_p: 0.0
  presence_penalty: 0.0
  repetition_penalty: 1.0
  n: 1
  max_tokens: 32768
installer:
  model: "Qwen/Qwen3.6-27B-FP8"
  temperature: 0.6
  top_k: 20
  top_p: 0.95
  min_p: 0.0
  presence_penalty: 0.0
  repetition_penalty: 1.0
  n: 1
  max_tokens: 32768
pentester:
  model: "Qwen/Qwen3.6-27B-FP8"
  temperature: 0.6
  top_k: 20
  top_p: 0.95
  min_p: 0.0
  presence_penalty: 0.0
  repetition_penalty: 1.0
  n: 1
  max_tokens: 32768
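The headers note that the 262K native context can be extended toward 1M with YaRN. If that were needed, the serving arguments from the earlier sketch could be extended roughly as follows; the scaling factor, original_max_position_embeddings, and target window are assumptions that should follow the model card's YaRN guidance rather than this sketch:

services:
  vllm:
    command:
      - --model
      - Qwen/Qwen3.6-27B-FP8
      - --max-model-len
      - "1000000"
      - --rope-scaling
      - '{"rope_type":"yarn","factor":4.0,"original_max_position_embeddings":262144}'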