diff --git a/Dockerfile b/Dockerfile index c1b1b81..850b044 100644 --- a/Dockerfile +++ b/Dockerfile @@ -171,8 +171,10 @@ COPY examples/configs/ollama-qwen332b-fp16-tc.provider.yml /opt/pentagi/conf/ COPY examples/configs/ollama-qwq32b-fp16-tc.provider.yml /opt/pentagi/conf/ COPY examples/configs/openrouter.provider.yml /opt/pentagi/conf/ COPY examples/configs/novita.provider.yml /opt/pentagi/conf/ -COPY examples/configs/vllm-qwen3.5-27b-fp8.provider.yml /opt/pentagi/conf/ COPY examples/configs/vllm-qwen3.5-27b-fp8-no-think.provider.yml /opt/pentagi/conf/ +COPY examples/configs/vllm-qwen3.5-27b-fp8.provider.yml /opt/pentagi/conf/ +COPY examples/configs/vllm-qwen3.6-27b-fp8-no-think.provider.yml /opt/pentagi/conf/ +COPY examples/configs/vllm-qwen3.6-27b-fp8.provider.yml /opt/pentagi/conf/ COPY examples/configs/vllm-qwen332b-fp16.provider.yml /opt/pentagi/conf/ COPY LICENSE /opt/pentagi/LICENSE diff --git a/examples/configs/vllm-qwen3.6-27b-fp8-no-think.provider.yml b/examples/configs/vllm-qwen3.6-27b-fp8-no-think.provider.yml new file mode 100644 index 0000000..6007e2b --- /dev/null +++ b/examples/configs/vllm-qwen3.6-27b-fp8-no-think.provider.yml @@ -0,0 +1,193 @@ +# Qwen3.6-27B FP8 Provider Configuration - NON-THINKING MODE +# Based on official Qwen recommendations for vLLM inference +# Architecture: Hybrid 75% DeltaNet + 25% Full Attention (48+16 layers) +# Context: 262K native, expandable to 1M with YaRN +# Vision: VLM with Vision Encoder (uses VRAM even for text-only tasks) +# +# Thinking mode is disabled via the extra_body parameter (enable_thinking: false) +# Recommended sampling parameters: +# - General tasks: temp=0.7, top_p=0.8, top_k=20, min_p=0.0, pp=1.5, rp=1.0 +# - Reasoning tasks: temp=1.0, top_p=0.95, top_k=20, min_p=0.0, pp=1.5, rp=1.0 + +simple: + model: "Qwen/Qwen3.6-27B-FP8" + temperature: 0.7 + top_k: 20 + top_p: 0.8 + min_p: 0.0 + presence_penalty: 1.5 + repetition_penalty: 1.0 + n: 1 + max_tokens: 32768 + extra_body: + chat_template_kwargs: + enable_thinking: false + 
+simple_json: + model: "Qwen/Qwen3.6-27B-FP8" + temperature: 0.7 + top_k: 20 + top_p: 0.8 + min_p: 0.0 + presence_penalty: 1.5 + repetition_penalty: 1.0 + n: 1 + max_tokens: 32768 + json: true + extra_body: + chat_template_kwargs: + enable_thinking: false + +primary_agent: + model: "Qwen/Qwen3.6-27B-FP8" + temperature: 1.0 + top_k: 20 + top_p: 0.95 + min_p: 0.0 + presence_penalty: 1.5 + repetition_penalty: 1.0 + n: 1 + max_tokens: 32768 + extra_body: + chat_template_kwargs: + enable_thinking: false + +assistant: + model: "Qwen/Qwen3.6-27B-FP8" + temperature: 1.0 + top_k: 20 + top_p: 0.95 + min_p: 0.0 + presence_penalty: 1.5 + repetition_penalty: 1.0 + n: 1 + max_tokens: 32768 + extra_body: + chat_template_kwargs: + enable_thinking: false + +generator: + model: "Qwen/Qwen3.6-27B-FP8" + temperature: 1.0 + top_k: 20 + top_p: 0.95 + min_p: 0.0 + presence_penalty: 1.5 + repetition_penalty: 1.0 + n: 1 + max_tokens: 32768 + extra_body: + chat_template_kwargs: + enable_thinking: false + +refiner: + model: "Qwen/Qwen3.6-27B-FP8" + temperature: 1.0 + top_k: 20 + top_p: 0.95 + min_p: 0.0 + presence_penalty: 1.5 + repetition_penalty: 1.0 + n: 1 + max_tokens: 32768 + extra_body: + chat_template_kwargs: + enable_thinking: false + +adviser: + model: "Qwen/Qwen3.6-27B-FP8" + temperature: 1.0 + top_k: 20 + top_p: 0.95 + min_p: 0.0 + presence_penalty: 1.5 + repetition_penalty: 1.0 + n: 1 + max_tokens: 32768 + extra_body: + chat_template_kwargs: + enable_thinking: false + +reflector: + model: "Qwen/Qwen3.6-27B-FP8" + temperature: 1.0 + top_k: 20 + top_p: 0.95 + min_p: 0.0 + presence_penalty: 1.5 + repetition_penalty: 1.0 + n: 1 + max_tokens: 32768 + extra_body: + chat_template_kwargs: + enable_thinking: false + +searcher: + model: "Qwen/Qwen3.6-27B-FP8" + temperature: 0.7 + top_k: 20 + top_p: 0.8 + min_p: 0.0 + presence_penalty: 1.5 + repetition_penalty: 1.0 + n: 1 + max_tokens: 32768 + extra_body: + chat_template_kwargs: + enable_thinking: false + +enricher: + model: 
"Qwen/Qwen3.6-27B-FP8" + temperature: 0.7 + top_k: 20 + top_p: 0.8 + min_p: 0.0 + presence_penalty: 1.5 + repetition_penalty: 1.0 + n: 1 + max_tokens: 32768 + extra_body: + chat_template_kwargs: + enable_thinking: false + +coder: + model: "Qwen/Qwen3.6-27B-FP8" + temperature: 1.0 + top_k: 20 + top_p: 0.95 + min_p: 0.0 + presence_penalty: 1.5 + repetition_penalty: 1.0 + n: 1 + max_tokens: 32768 + extra_body: + chat_template_kwargs: + enable_thinking: false + +installer: + model: "Qwen/Qwen3.6-27B-FP8" + temperature: 1.0 + top_k: 20 + top_p: 0.95 + min_p: 0.0 + presence_penalty: 1.5 + repetition_penalty: 1.0 + n: 1 + max_tokens: 32768 + extra_body: + chat_template_kwargs: + enable_thinking: false + +pentester: + model: "Qwen/Qwen3.6-27B-FP8" + temperature: 1.0 + top_k: 20 + top_p: 0.95 + min_p: 0.0 + presence_penalty: 1.5 + repetition_penalty: 1.0 + n: 1 + max_tokens: 32768 + extra_body: + chat_template_kwargs: + enable_thinking: false diff --git a/examples/configs/vllm-qwen3.6-27b-fp8.provider.yml b/examples/configs/vllm-qwen3.6-27b-fp8.provider.yml new file mode 100644 index 0000000..82433ea --- /dev/null +++ b/examples/configs/vllm-qwen3.6-27b-fp8.provider.yml @@ -0,0 +1,174 @@ +# Qwen3.6-27B FP8 Provider Configuration - THINKING MODE (default) +# Based on official Qwen recommendations for vLLM inference +# Architecture: Hybrid 75% DeltaNet + 25% Full Attention (48+16 layers) +# Context: 262K native, expandable to 1M with YaRN +# Vision: VLM with Vision Encoder (uses VRAM even for text-only tasks) +# +# Thinking mode is enabled by default (no extra_body needed) +# Recommended sampling parameters: +# - General tasks: temp=1.0, top_p=0.95, top_k=20, min_p=0.0, pp=1.5, rp=1.0 +# - Precise coding: temp=0.6, top_p=0.95, top_k=20, min_p=0.0, pp=0.0, rp=1.0 +# +# Non-thinking mode is enabled via the extra_body parameter (enable_thinking: false) +# Recommended sampling parameters: +# - General tasks: temp=0.7, top_p=0.8, top_k=20, min_p=0.0, pp=1.5, rp=1.0 +# - Reasoning tasks: temp=1.0, top_p=0.95, 
top_k=20, min_p=0.0, pp=1.5, rp=1.0 + +simple: + model: "Qwen/Qwen3.6-27B-FP8" + temperature: 0.7 + top_k: 20 + top_p: 0.8 + min_p: 0.0 + presence_penalty: 1.5 + repetition_penalty: 1.0 + n: 1 + max_tokens: 32768 + extra_body: + chat_template_kwargs: + enable_thinking: false + +simple_json: + model: "Qwen/Qwen3.6-27B-FP8" + temperature: 0.7 + top_k: 20 + top_p: 0.8 + min_p: 0.0 + presence_penalty: 1.5 + repetition_penalty: 1.0 + n: 1 + max_tokens: 32768 + json: true + extra_body: + chat_template_kwargs: + enable_thinking: false + +primary_agent: + model: "Qwen/Qwen3.6-27B-FP8" + temperature: 1.0 + top_k: 20 + top_p: 0.95 + min_p: 0.0 + presence_penalty: 1.5 + repetition_penalty: 1.0 + n: 1 + max_tokens: 32768 + +assistant: + model: "Qwen/Qwen3.6-27B-FP8" + temperature: 1.0 + top_k: 20 + top_p: 0.95 + min_p: 0.0 + presence_penalty: 1.5 + repetition_penalty: 1.0 + n: 1 + max_tokens: 32768 + +generator: + model: "Qwen/Qwen3.6-27B-FP8" + temperature: 1.0 + top_k: 20 + top_p: 0.95 + min_p: 0.0 + presence_penalty: 1.5 + repetition_penalty: 1.0 + n: 1 + max_tokens: 32768 + +refiner: + model: "Qwen/Qwen3.6-27B-FP8" + temperature: 1.0 + top_k: 20 + top_p: 0.95 + min_p: 0.0 + presence_penalty: 1.5 + repetition_penalty: 1.0 + n: 1 + max_tokens: 32768 + +adviser: + model: "Qwen/Qwen3.6-27B-FP8" + temperature: 1.0 + top_k: 20 + top_p: 0.95 + min_p: 0.0 + presence_penalty: 1.5 + repetition_penalty: 1.0 + n: 1 + max_tokens: 32768 + +reflector: + model: "Qwen/Qwen3.6-27B-FP8" + temperature: 1.0 + top_k: 20 + top_p: 0.95 + min_p: 0.0 + presence_penalty: 1.5 + repetition_penalty: 1.0 + n: 1 + max_tokens: 32768 + extra_body: + chat_template_kwargs: + enable_thinking: false + +searcher: + model: "Qwen/Qwen3.6-27B-FP8" + temperature: 0.7 + top_k: 20 + top_p: 0.8 + min_p: 0.0 + presence_penalty: 1.5 + repetition_penalty: 1.0 + n: 1 + max_tokens: 32768 + extra_body: + chat_template_kwargs: + enable_thinking: false + +enricher: + model: "Qwen/Qwen3.6-27B-FP8" + temperature: 0.7 + top_k: 
20 + top_p: 0.8 + min_p: 0.0 + presence_penalty: 1.5 + repetition_penalty: 1.0 + n: 1 + max_tokens: 32768 + extra_body: + chat_template_kwargs: + enable_thinking: false + +coder: + model: "Qwen/Qwen3.6-27B-FP8" + temperature: 0.6 + top_k: 20 + top_p: 0.95 + min_p: 0.0 + presence_penalty: 0.0 + repetition_penalty: 1.0 + n: 1 + max_tokens: 32768 + +installer: + model: "Qwen/Qwen3.6-27B-FP8" + temperature: 0.6 + top_k: 20 + top_p: 0.95 + min_p: 0.0 + presence_penalty: 0.0 + repetition_penalty: 1.0 + n: 1 + max_tokens: 32768 + +pentester: + model: "Qwen/Qwen3.6-27B-FP8" + temperature: 0.6 + top_k: 20 + top_p: 0.95 + min_p: 0.0 + presence_penalty: 0.0 + repetition_penalty: 1.0 + n: 1 + max_tokens: 32768