From 33d5c0737eb74be5c94ca10ca1eebbcea6841f28 Mon Sep 17 00:00:00 2001
From: Nariman Jelveh <nj@puter.com>
Date: Sun, 3 May 2026 09:33:50 -0700
Subject: [PATCH] Add Gemini TTS provider and integrate client/docs (#2889)

Introduce a new Gemini TTS provider and wire it through the driver, client, docs, and examples. Adds src/backend/drivers/ai-tts/providers/gemini/GeminiTTSProvider.ts (Google GenAI client usage, PCM->WAV wrapping, metering, model/voice validation) and a costs table in providers/gemini/costs.ts. Registers the provider in TTSDriver, exposes the alias gemini-tts, and adds it to the default-provider preference order (after OpenAI and ElevenLabs, ahead of the first-registered fallback). Updates the puter-js client to recognize the "gemini" provider/engine and route driver calls to gemini-tts. Documentation is updated with Gemini options, a usage example, and a playground example HTML file.
---
 src/backend/drivers/ai-tts/TTSDriver.ts       |  37 +-
 .../providers/gemini/GeminiTTSProvider.ts     | 361 ++++++++++++++++++
 .../drivers/ai-tts/providers/gemini/costs.ts  |  44 +++
 src/docs/src/AI.md                            |   1 +
 src/docs/src/AI/txt2speech.md                 |  39 +-
 .../examples/ai-txt2speech-gemini.html        |  20 +
 src/puter-js/src/modules/AI.js                |  54 ++-
 7 files changed, 543 insertions(+), 13 deletions(-)
 create mode 100644 src/backend/drivers/ai-tts/providers/gemini/GeminiTTSProvider.ts
 create mode 100644 src/backend/drivers/ai-tts/providers/gemini/costs.ts
 create mode 100644 src/docs/src/playground/examples/ai-txt2speech-gemini.html

diff --git a/src/backend/drivers/ai-tts/TTSDriver.ts b/src/backend/drivers/ai-tts/TTSDriver.ts
index 023a9af0b..2b910952c 100644
--- a/src/backend/drivers/ai-tts/TTSDriver.ts
+++ b/src/backend/drivers/ai-tts/TTSDriver.ts
@@ -23,6 +23,7 @@ import type { DriverStreamResult } from '../meta.js';
 import { PuterDriver } from '../types.js';
 import { AWSPollyTTSProvider } from './providers/awsPolly/AWSPollyTTSProvider.js';
 import { ElevenLabsTTSProvider } from './providers/elevenlabs/ElevenLabsTTSProvider.js';
+import { GeminiTTSProvider } from './providers/gemini/GeminiTTSProvider.js';
 import { OpenAITTSProvider } from './providers/openai/OpenAITTSProvider.js';
 import type {
     ISynthesizeArgs,
@@ -43,12 +44,18 @@ import type {
 // than passing `{ provider }` in args, so alias the unified driver under
 // the names the client expects. `#providerFromAlias` normalizes those
 // aliases to the internal provider keys used by `#providers`.
-const TTS_ALIASES = ['aws-polly', 'openai-tts', 'elevenlabs-tts'] as const;
+const TTS_ALIASES = [
+    'aws-polly',
+    'openai-tts',
+    'elevenlabs-tts',
+    'gemini-tts',
+] as const;
 type TTSAlias = (typeof TTS_ALIASES)[number];
 const ALIAS_TO_PROVIDER: Record<TTSAlias, string> = {
     'aws-polly': 'aws-polly',
     'openai-tts': 'openai',
     'elevenlabs-tts': 'elevenlabs',
+    'gemini-tts': 'gemini',
 };
 
 export class TTSDriver extends PuterDriver {
@@ -247,14 +254,40 @@ export class TTSDriver extends PuterDriver {
                 );
             }
         }
+
+        this.#registerGeminiProvider(providers);
+    }
+
+    /** Registers the Gemini provider when its config supplies an API key. */
+    #registerGeminiProvider(providers: Record<string, unknown>) {
+        const metering = this.services.metering;
+        const cfg = (providers['gemini'] ?? providers['gemini-tts']) as
+            | Record<string, unknown>
+            | undefined;
+        // The key may be configured under any of these three property names.
+        const apiKey = (cfg?.apiKey ?? cfg?.api_key ?? cfg?.key) as
+            | string
+            | undefined;
+        if (!apiKey) return;
+        try {
+            const provider = new GeminiTTSProvider(metering, { apiKey });
+            this.#providers['gemini'] = provider;
+        } catch (e) {
+            // Log and carry on: the failure is not rethrown.
+            console.warn(
+                '[TTSDriver] Failed to init Gemini TTS provider:',
+                (e as Error).message,
+            );
+        }
+    }
 
     #getDefaultProviderName(): string | null {
         const names = Object.keys(this.#providers);
         if (names.length === 0) return null;
-        // Prefer openai, then elevenlabs, then aws-polly
+        // Prefer openai, then elevenlabs, then gemini, else the first key
         if (this.#providers['openai']) return 'openai';
         if (this.#providers['elevenlabs']) return 'elevenlabs';
+        if (this.#providers['gemini']) return 'gemini';
         return names[0];
     }
 }
diff --git a/src/backend/drivers/ai-tts/providers/gemini/GeminiTTSProvider.ts b/src/backend/drivers/ai-tts/providers/gemini/GeminiTTSProvider.ts
new file mode 100644
index 000000000..9b38cb768
--- /dev/null
+++ b/src/backend/drivers/ai-tts/providers/gemini/GeminiTTSProvider.ts
@@ -0,0 +1,361 @@
+/**
+ * Copyright (C) 2024-present Puter Technologies Inc.
+ *
+ * This file is part of Puter.
+ *
+ * Puter is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ */
+
+import { GoogleGenAI } from '@google/genai';
+import { Readable } from 'node:stream';
+import { HttpError } from '../../../../core/http/HttpError.js';
+import { Context } from '../../../../core/context.js';
+import type { MeteringService } from '../../../../services/metering/MeteringService.js';
+import type { DriverStreamResult } from '../../../meta.js';
+import type { ITTSVoice, ITTSEngine, ISynthesizeArgs } from '../../types.js';
+import { TTSProvider } from '../TTSProvider.js';
+import { GEMINI_TTS_COSTS } from './costs.js';
+
+// Fallbacks used by `synthesize` when the caller gives no model or no
+// voice.
+const DEFAULT_MODEL = 'gemini-2.5-flash-preview-tts';
+const DEFAULT_VOICE = 'Kore';
+// Sample clip whose URL is returned in test mode, before any validation
+// or Gemini call happens.
+const SAMPLE_AUDIO_URL = 'https://puter-sample-data.puter.site/tts_example.mp3';
+
+// Model ids accepted by `synthesize` and reported by `listEngines`, each
+// with a display name.
+const GEMINI_TTS_MODELS = [
+    { id: 'gemini-2.5-flash-preview-tts', name: 'Gemini 2.5 Flash TTS' },
+    { id: 'gemini-2.5-pro-preview-tts', name: 'Gemini 2.5 Pro TTS' },
+    { id: 'gemini-3.1-flash-tts-preview', name: 'Gemini 3.1 Flash TTS' },
+];
+
+// Voices accepted by `synthesize` (ids are compared case-insensitively)
+// and reported by `listVoices`. `name` repeats the id; `description` is
+// a short style label.
+const GEMINI_TTS_VOICES = [
+    { id: 'Zephyr', name: 'Zephyr', description: 'Bright' },
+    { id: 'Puck', name: 'Puck', description: 'Upbeat' },
+    { id: 'Charon', name: 'Charon', description: 'Informative' },
+    { id: 'Kore', name: 'Kore', description: 'Firm' },
+    { id: 'Fenrir', name: 'Fenrir', description: 'Excitable' },
+    { id: 'Leda', name: 'Leda', description: 'Youthful' },
+    { id: 'Orus', name: 'Orus', description: 'Firm' },
+    { id: 'Aoede', name: 'Aoede', description: 'Breezy' },
+    { id: 'Callirrhoe', name: 'Callirrhoe', description: 'Easy-going' },
+    { id: 'Autonoe', name: 'Autonoe', description: 'Bright' },
+    { id: 'Enceladus', name: 'Enceladus', description: 'Breathy' },
+    { id: 'Iapetus', name: 'Iapetus', description: 'Clear' },
+    { id: 'Umbriel', name: 'Umbriel', description: 'Easy-going' },
+    { id: 'Algieba', name: 'Algieba', description: 'Smooth' },
+    { id: 'Despina', name: 'Despina', description: 'Smooth' },
+    { id: 'Erinome', name: 'Erinome', description: 'Clear' },
+    { id: 'Algenib', name: 'Algenib', description: 'Gravelly' },
+    { id: 'Rasalgethi', name: 'Rasalgethi', description: 'Informative' },
+    { id: 'Laomedeia', name: 'Laomedeia', description: 'Upbeat' },
+    { id: 'Achernar', name: 'Achernar', description: 'Soft' },
+    { id: 'Alnilam', name: 'Alnilam', description: 'Firm' },
+    { id: 'Schedar', name: 'Schedar', description: 'Even' },
+    { id: 'Gacrux', name: 'Gacrux', description: 'Mature' },
+    { id: 'Pulcherrima', name: 'Pulcherrima', description: 'Forward' },
+    { id: 'Achird', name: 'Achird', description: 'Friendly' },
+    { id: 'Zubenelgenubi', name: 'Zubenelgenubi', description: 'Casual' },
+    { id: 'Vindemiatrix', name: 'Vindemiatrix', description: 'Gentle' },
+    { id: 'Sadachbia', name: 'Sadachbia', description: 'Lively' },
+    { id: 'Sadaltager', name: 'Sadaltager', description: 'Knowledgeable' },
+    { id: 'Sulafat', name: 'Sulafat', description: 'Warm' },
+];
+
+/**
+ * Gemini TTS provider. Calls the Gemini generateContent API with
+ * `responseModalities: ["AUDIO"]` and `speechConfig` to synthesize speech.
+ * Returns raw PCM audio wrapped in a WAV container.
+ */
+export class GeminiTTSProvider extends TTSProvider {
+    readonly providerName = 'gemini';
+
+    #client: GoogleGenAI;
+
+    /** Builds the Google GenAI client; throws if `config.apiKey` is empty. */
+    constructor(meteringService: MeteringService, config: { apiKey: string }) {
+        super(meteringService, config);
+        const { apiKey } = config;
+        if (!apiKey) throw new Error('Gemini TTS requires an API key');
+        this.#client = new GoogleGenAI({ apiKey });
+    }
+
+    /** Lists the built-in voices, each tagged with every TTS model id. */
+    async listVoices(): Promise<ITTSVoice[]> {
+        const modelIds = () => GEMINI_TTS_MODELS.map((model) => model.id);
+        return GEMINI_TTS_VOICES.map((voice) => ({
+            ...voice,
+            provider: 'gemini',
+            supported_models: modelIds(),
+        }));
+    }
+
+    /** Lists the supported Gemini TTS models as engine descriptors. */
+    async listEngines(): Promise<ITTSEngine[]> {
+        return GEMINI_TTS_MODELS.map(({ id, name }) => {
+            const engine: ITTSEngine = { id, name, provider: 'gemini' };
+            return engine;
+        });
+    }
+
+    /** Reports per-token input and audio-output prices for every model. */
+    override getReportedCosts(): Record<string, unknown>[] {
+        const perToken = (c: number) => this.#toMicroCents(c / 1_000_000);
+        return Object.entries(GEMINI_TTS_COSTS).map(([model, price]) => ({
+            usageType: `gemini:${model}:tts`,
+            ucentsInputPerToken: perToken(price.input),
+            ucentsOutputAudioPerToken: perToken(price.output_audio),
+            unit: 'token',
+            source: 'driver:aiTts/gemini',
+        }));
+    }
+
+    /**
+     * Synthesize `args.text` with a Gemini TTS model. In test mode a sample
+     * audio URL is returned without calling Gemini; otherwise the request is
+     * validated, credits are checked against an estimate, usage is metered,
+     * and the audio is streamed back (raw PCM is wrapped in a WAV container).
+     * @throws HttpError 400 (bad text/model/voice), 402 (insufficient
+     *   credits), 500 (no cost data), 502 (Gemini failure or no audio).
+     */
+    async synthesize(
+        args: ISynthesizeArgs,
+    ): Promise<DriverStreamResult | { url: string; content_type: string }> {
+        const {
+            text,
+            voice: voiceArg,
+            model: modelArg,
+            instructions,
+            test_mode,
+        } = args;
+
+        if (test_mode) {
+            return { url: SAMPLE_AUDIO_URL, content_type: 'audio' };
+        }
+
+        if (typeof text !== 'string' || !text.trim()) {
+            throw new HttpError(400, 'Missing required field: text', {
+                legacyCode: 'field_required',
+                fields: { key: 'text' },
+            });
+        }
+
+        const model = modelArg || DEFAULT_MODEL;
+        if (!GEMINI_TTS_MODELS.some(({ id }) => id === model)) {
+            const expected = GEMINI_TTS_MODELS.map(({ id }) => id).join(', ');
+            throw new HttpError(
+                400,
+                `Invalid model: ${model}. Expected: ${expected}`,
+                {
+                    legacyCode: 'field_invalid',
+                    fields: { key: 'model', expected, got: model },
+                },
+            );
+        }
+
+        const requestedVoice = voiceArg || DEFAULT_VOICE;
+        // Matching is case-insensitive, so forward the canonical voice id.
+        const voice = GEMINI_TTS_VOICES.find(
+            ({ id }) => id.toLowerCase() === requestedVoice.toLowerCase(),
+        )?.id;
+        if (!voice) {
+            const expected = GEMINI_TTS_VOICES.map(({ id }) => id).join(', ');
+            throw new HttpError(
+                400,
+                `Invalid voice: ${requestedVoice}. Expected: ${expected}`,
+                {
+                    legacyCode: 'field_invalid',
+                    fields: { key: 'voice', expected, got: requestedVoice },
+                },
+            );
+        }
+
+        const actor = Context.get('actor')!;
+        const costs = GEMINI_TTS_COSTS[model];
+        if (!costs) {
+            throw new HttpError(500, `No cost data for model: ${model}`);
+        }
+
+        // Estimate input tokens (~4 chars per token) and a rough output
+        // audio duration (~150 words/min, 25 tokens/sec).
+        const estimatedInputTokens = Math.max(1, Math.ceil(text.length / 4));
+        const wordCount = text.trim().split(/\s+/).length;
+        const estimatedDurationSec = Math.max(1, (wordCount / 150) * 60);
+        const estimatedOutputTokens = Math.ceil(estimatedDurationSec * 25);
+
+        const estimatedInputCostCents =
+            (estimatedInputTokens / 1_000_000) * costs.input;
+        const estimatedOutputCostCents =
+            (estimatedOutputTokens / 1_000_000) * costs.output_audio;
+        const estimatedTotalMicroCents = this.#toMicroCents(
+            estimatedInputCostCents + estimatedOutputCostCents,
+        );
+
+        const usageAllowed = await this.meteringService.hasEnoughCredits(
+            actor,
+            estimatedTotalMicroCents,
+        );
+        if (!usageAllowed) {
+            throw new HttpError(402, 'Insufficient funds', {
+                legacyCode: 'insufficient_funds',
+            });
+        }
+
+        // The TTS models require the text to be framed as a transcript
+        // to read aloud. The "Say the following text aloud:" lead-in keeps
+        // the model from generating conversational text instead of audio.
+        const inputText = instructions
+            ? `${instructions}\n\nSay the following text aloud:\n${text}`
+            : `Say the following text aloud:\n${text}`;
+
+        // eslint-disable-next-line @typescript-eslint/no-explicit-any
+        let response: any;
+        try {
+            response = await this.#client.models.generateContent({
+                model,
+                contents: [{ parts: [{ text: inputText }] }],
+                config: {
+                    responseModalities: ['AUDIO'],
+                    speechConfig: {
+                        voiceConfig: {
+                            prebuiltVoiceConfig: { voiceName: voice },
+                        },
+                    },
+                },
+            });
+        } catch (e: unknown) {
+            const msg = e instanceof Error ? e.message : String(e);
+            console.error('[GeminiTTSProvider] API error:', msg);
+            throw new HttpError(502, `Gemini TTS API error: ${msg}`, {
+                fields: { provider: 'gemini' },
+            });
+        }
+
+        // Extract audio data from response
+        const part = response?.candidates?.[0]?.content?.parts?.[0];
+        if (!part?.inlineData?.data) {
+            throw new HttpError(502, 'Gemini TTS did not return audio data', {
+                fields: { provider: 'gemini' },
+            });
+        }
+
+        const audioBase64: string = part.inlineData.data;
+        const mimeType: string =
+            part.inlineData.mimeType || 'audio/L16;rate=24000';
+
+        // Convert base64 PCM to a WAV buffer for broad client compatibility
+        const pcmBuffer = Buffer.from(audioBase64, 'base64');
+        let outputBuffer: Buffer;
+        let contentType: string;
+
+        if (mimeType.startsWith('audio/L16') || mimeType === 'audio/pcm') {
+            // Wrap raw PCM (16-bit LE, mono) in a WAV container, honoring
+            // the sample rate advertised in the MIME type (default 24kHz).
+            const rate = Number(/rate=(\d+)/i.exec(mimeType)?.[1]) || 24000;
+            outputBuffer = this.#wrapPcmInWav(pcmBuffer, rate, 1, 16);
+            contentType = 'audio/wav';
+        } else {
+            // If the API returns encoded audio (unlikely today), pass through
+            outputBuffer = pcmBuffer;
+            contentType = mimeType;
+        }
+
+        // Meter actual usage from response metadata
+        const usage = response.usageMetadata;
+        const actualInputTokens =
+            typeof usage?.promptTokenCount === 'number'
+                ? usage.promptTokenCount
+                : estimatedInputTokens;
+        const actualOutputTokens =
+            typeof usage?.candidatesTokenCount === 'number'
+                ? usage.candidatesTokenCount
+                : estimatedOutputTokens;
+
+        const inputCostCents = (actualInputTokens / 1_000_000) * costs.input;
+        const outputCostCents =
+            (actualOutputTokens / 1_000_000) * costs.output_audio;
+
+        const usagePrefix = `gemini:${model}`;
+        this.meteringService.batchIncrementUsages(actor, [
+            {
+                usageType: `${usagePrefix}:input`,
+                usageAmount: Math.max(actualInputTokens, 1),
+                costOverride: this.#toMicroCents(inputCostCents),
+            },
+            {
+                usageType: `${usagePrefix}:output:audio`,
+                usageAmount: Math.max(actualOutputTokens, 1),
+                costOverride: this.#toMicroCents(outputCostCents),
+            },
+        ]);
+
+        const stream = Readable.from(outputBuffer);
+
+        return {
+            dataType: 'stream',
+            content_type: contentType,
+            chunked: true,
+            stream,
+        };
+    }
+
+    /**
+     * Build a 44-byte RIFF/WAVE header for the given raw PCM samples and
+     * return header + samples as one buffer, so browsers can play it.
+     */
+    #wrapPcmInWav(
+        pcm: Buffer,
+        sampleRate: number,
+        channels: number,
+        bitsPerSample: number,
+    ): Buffer {
+        const HEADER_BYTES = 44;
+        const dataSize = pcm.length;
+        const frameBytes = (channels * bitsPerSample) / 8;
+        const bytesPerSecond = (sampleRate * channels * bitsPerSample) / 8;
+        const header = Buffer.alloc(HEADER_BYTES);
+
+        // RIFF chunk: size field is the total length minus these first 8 bytes.
+        header.write('RIFF', 0);
+        header.writeUInt32LE(HEADER_BYTES - 8 + dataSize, 4);
+        header.write('WAVE', 8);
+
+        // "fmt " sub-chunk: describes the sample layout.
+        header.write('fmt ', 12);
+        header.writeUInt32LE(16, 16); // size of the fmt payload
+        header.writeUInt16LE(1, 20); // format tag 1 = PCM
+        header.writeUInt16LE(channels, 22);
+        header.writeUInt32LE(sampleRate, 24);
+        header.writeUInt32LE(bytesPerSecond, 28);
+        header.writeUInt16LE(frameBytes, 32);
+        header.writeUInt16LE(bitsPerSample, 34);
+
+        // "data" sub-chunk: payload size, then the samples themselves.
+        header.write('data', 36);
+        header.writeUInt32LE(dataSize, 40);
+
+        return Buffer.concat([header, pcm]);
+    }
+
+    /** Cents to whole microcents (rounded up); invalid or <= 0 becomes 1. */
+    #toMicroCents(cents: number): number {
+        return Number.isFinite(cents) && cents > 0 ? Math.ceil(cents * 1e6) : 1;
+    }
+}
diff --git a/src/backend/drivers/ai-tts/providers/gemini/costs.ts b/src/backend/drivers/ai-tts/providers/gemini/costs.ts
new file mode 100644
index 000000000..297779727
--- /dev/null
+++ b/src/backend/drivers/ai-tts/providers/gemini/costs.ts
@@ -0,0 +1,44 @@
+/**
+ * Copyright (C) 2024-present Puter Technologies Inc.
+ *
+ * This file is part of Puter.
+ *
+ * Puter is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU Affero General Public License as published
+ * by the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU Affero General Public License for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program.  If not, see <https://www.gnu.org/licenses/>.
+ */
+
+// Gemini TTS pricing in USD per 1M tokens:
+//   gemini-2.5-flash-preview-tts:  input $0.50, output (audio) $10.00
+//   gemini-2.5-pro-preview-tts:    input $1.00, output (audio) $20.00
+//   gemini-3.1-flash-tts-preview:  input $1.00, output (audio) $20.00
+//
+// Audio output tokens = ~25 tokens/second of audio.
+//
+// Costs here are in USD-cents per 1M tokens for input and output.
+/** Price of one model, in USD cents per 1M tokens. */
+type GeminiTTSCost = {
+    /** Input tokens. */
+    input: number;
+    /** Audio output tokens. */
+    output_audio: number;
+};
+
+/** Cost table keyed by Gemini TTS model id. */
+export const GEMINI_TTS_COSTS: Record<string, GeminiTTSCost> = {
+    // $0.50 input, $10.00 audio output (per 1M tokens)
+    'gemini-2.5-flash-preview-tts': { input: 50, output_audio: 1000 },
+    // $1.00 input, $20.00 audio output (per 1M tokens)
+    'gemini-2.5-pro-preview-tts': { input: 100, output_audio: 2000 },
+    // $1.00 input, $20.00 audio output (per 1M tokens)
+    'gemini-3.1-flash-tts-preview': { input: 100, output_audio: 2000 },
+};
diff --git a/src/docs/src/AI.md b/src/docs/src/AI.md
index a543db3a1..e89b53050 100644
--- a/src/docs/src/AI.md
+++ b/src/docs/src/AI.md
@@ -226,6 +226,7 @@ You can see various Puter.js AI features in action from the following examples:
   - [Text to Speech with options](/playground/ai-txt2speech-options/)
   - [Text to Speech with engines](/playground/ai-txt2speech-engines/)
   - [Text to Speech with OpenAI voices](/playground/ai-txt2speech-openai/)
+  - [Text to Speech with Gemini voices](/playground/ai-txt2speech-gemini/)
   - [Transcribe audio with `speech2txt`](/AI/speech2txt/)
 - Text to Video
   - [Generate a sample Sora clip](/AI/txt2vid/)
diff --git a/src/docs/src/AI/txt2speech.md b/src/docs/src/AI/txt2speech.md
index 52158807e..dde915721 100755
--- a/src/docs/src/AI/txt2speech.md
+++ b/src/docs/src/AI/txt2speech.md
@@ -32,7 +32,7 @@ Additional settings for the generation request. Available options depend on the
 
 | Option | Type | Description |
 |--------|------|-------------|
-| `provider` | `String` | TTS provider to use. `'aws-polly'` (default), `'openai'`, `'elevenlabs'` |
+| `provider` | `String` | TTS provider to use. `'aws-polly'` (default), `'openai'`, `'elevenlabs'`, `'gemini'` |
 | `model` | `String` | Model identifier (provider-specific) |
 | `voice` | `String` | Voice ID used for synthesis (provider-specific) |
 | `test_mode` | `Boolean` | When `true`, returns a sample audio without using credits |
@@ -74,6 +74,18 @@ Available when `provider: 'elevenlabs'`:
 
 For more details about each option, see the [ElevenLabs API reference](https://elevenlabs.io/docs/api-reference/text-to-speech).
 
+#### Gemini Options
+
+Available when `provider: 'gemini'`:
+
+| Option | Type | Description |
+|--------|------|-------------|
+| `model` | `String` | TTS model. Available: `'gemini-2.5-flash-preview-tts'` (default), `'gemini-2.5-pro-preview-tts'`, `'gemini-3.1-flash-tts-preview'` |
+| `voice` | `String` | Voice name. Defaults to `'Kore'`. Available: `'Zephyr'`, `'Puck'`, `'Charon'`, `'Kore'`, `'Fenrir'`, `'Leda'`, `'Orus'`, `'Aoede'`, `'Callirrhoe'`, `'Autonoe'`, `'Enceladus'`, `'Iapetus'`, `'Umbriel'`, `'Algieba'`, `'Despina'`, `'Erinome'`, `'Algenib'`, `'Rasalgethi'`, `'Laomedeia'`, `'Achernar'`, `'Alnilam'`, `'Schedar'`, `'Gacrux'`, `'Pulcherrima'`, `'Achird'`, `'Zubenelgenubi'`, `'Vindemiatrix'`, `'Sadachbia'`, `'Sadaltager'`, `'Sulafat'` |
+| `instructions` | `String` | Natural language instructions to control speaking style (tone, speed, mood, etc.) |
+
+For more details about Gemini TTS, see the [Google Gemini TTS documentation](https://ai.google.dev/gemini-api/docs/text-to-speech).
+
 ## Return value
 
 A `Promise` that resolves to an `HTMLAudioElement`. The element’s `src` points at a blob or remote URL containing the synthesized audio.
@@ -171,6 +183,31 @@ A `Promise` that resolves to an `HTMLAudioElement`. The element’s `src` points
 </html>
 ```
 
+<strong class="example-title">Use Gemini voices</strong>
+
+```html;ai-txt2speech-gemini
+<html>
+<body>
+    <script src="https://js.puter.com/v2/"></script>
+    <button id="play">Use Gemini voice</button>
+    <script>
+        document.getElementById('play').addEventListener('click', async ()=>{
+            const audio = await puter.ai.txt2speech(
+                "Hello! This sample uses the Gemini Puck voice.",
+                {
+                    provider: "gemini",
+                    model: "gemini-2.5-flash-preview-tts",
+                    voice: "Puck",
+                    instructions: "Speak in a friendly, upbeat tone."
+                }
+            );
+            audio.play();
+        });
+    </script>
+</body>
+</html>
+```
+
 <strong class="example-title">Compare different engines</strong>
 
 ```html;ai-txt2speech-engines
diff --git a/src/docs/src/playground/examples/ai-txt2speech-gemini.html b/src/docs/src/playground/examples/ai-txt2speech-gemini.html
new file mode 100644
index 000000000..482b4c398
--- /dev/null
+++ b/src/docs/src/playground/examples/ai-txt2speech-gemini.html
@@ -0,0 +1,20 @@
+<html>
+<body>
+    <script src="https://js.puter.com/v2/"></script>
+    <button id="play">Use Gemini voice</button>
+    <script>
+        document.getElementById('play').addEventListener('click', async ()=>{
+            const audio = await puter.ai.txt2speech(
+                "Hello! This sample uses the Gemini Puck voice.",
+                {
+                    provider: "gemini",
+                    model: "gemini-2.5-flash-preview-tts",
+                    voice: "Puck",
+                    instructions: "Speak in a friendly, upbeat tone."
+                }
+            );
+            audio.play();
+        });
+    </script>
+</body>
+</html>
\ No newline at end of file
diff --git a/src/puter-js/src/modules/AI.js b/src/puter-js/src/modules/AI.js
index 421f49e9e..68961fbd6 100644
--- a/src/puter-js/src/modules/AI.js
+++ b/src/puter-js/src/modules/AI.js
@@ -2,11 +2,12 @@ import * as utils from '../lib/utils.js';
 
 const normalizeTTSProvider = (value) => {
     if ( typeof value !== 'string' ) {
-        return 'aws-polly';
+        return null;
     }
     const lower = value.toLowerCase();
     if ( lower === 'openai' ) return 'openai';
     if ( ['elevenlabs', 'eleven', '11labs', '11-labs', 'eleven-labs', 'elevenlabs-tts'].includes(lower) ) return 'elevenlabs';
+    if ( ['gemini', 'google', 'gemini-tts', 'google-tts'].includes(lower) ) return 'gemini';
     if ( lower === 'aws' || lower === 'polly' || lower === 'aws-polly' ) return 'aws-polly';
     return value;
 };
@@ -270,6 +271,10 @@ class AI {
             provider = 'elevenlabs';
         }
 
+        if ( options.engine && normalizeTTSProvider(options.engine) === 'gemini' && !options.provider ) {
+            provider = 'gemini';
+        }
+
         if ( provider === 'openai' ) {
             if ( !options.model && typeof options.engine === 'string' ) {
                 options.model = options.engine;
@@ -301,6 +306,17 @@ class AI {
                 options.output_format = options.response_format;
             }
             delete options.engine;
+        } else if ( provider === 'gemini' ) {
+            // `engine` may be a provider alias (e.g. 'gemini'); only treat it
+            // as a model id when it is not one.
+            const engineIsModel = typeof options.engine === 'string'
+                && normalizeTTSProvider(options.engine) !== 'gemini';
+            if ( ! options.model && engineIsModel ) options.model = options.engine;
+            if ( ! options.voice ) options.voice = 'Kore';
+            if ( ! options.model ) {
+                options.model = 'gemini-2.5-flash-preview-tts';
+            }
+            delete options.engine;
         } else {
             provider = 'aws-polly';
 
@@ -332,9 +348,12 @@ class AI {
             }
         }
 
-        const driverName = provider === 'openai'
-            ? 'openai-tts'
-            : (provider === 'elevenlabs' ? 'elevenlabs-tts' : 'aws-polly');
+        const driverNameMap = {
+            'openai': 'openai-tts',
+            'elevenlabs': 'elevenlabs-tts',
+            'gemini': 'gemini-tts',
+        };
+        const driverName = driverNameMap[provider] || 'aws-polly';
 
         return await utils.make_driver_method(['source'], 'puter-tts', driverName, 'synthesize', {
             responseType: 'blob',
@@ -574,9 +593,16 @@ class AI {
                 params.provider = 'elevenlabs';
             }
 
-            const driverName = provider === 'openai'
-                ? 'openai-tts'
-                : (provider === 'elevenlabs' ? 'elevenlabs-tts' : 'aws-polly');
+            if ( provider === 'gemini' ) {
+                params.provider = 'gemini';
+            }
+
+            const driverNameMap = {
+                'openai': 'openai-tts',
+                'elevenlabs': 'elevenlabs-tts',
+                'gemini': 'gemini-tts',
+            };
+            const driverName = driverNameMap[provider] || 'aws-polly';
 
             return await utils.make_driver_method(['source'], 'puter-tts', driverName, 'list_engines', {
                 responseType: 'text',
@@ -609,9 +635,17 @@ class AI {
                 params.provider = 'elevenlabs';
             }
 
-            const driverName = provider === 'openai'
-                ? 'openai-tts'
-                : (provider === 'elevenlabs' ? 'elevenlabs-tts' : 'aws-polly');
+            if ( provider === 'gemini' ) {
+                params.provider = 'gemini';
+                delete params.engine;
+            }
+
+            const driverNameMap2 = {
+                'openai': 'openai-tts',
+                'elevenlabs': 'elevenlabs-tts',
+                'gemini': 'gemini-tts',
+            };
+            const driverName = driverNameMap2[provider] || 'aws-polly';
 
             return utils.make_driver_method(['source'], 'puter-tts', driverName, 'list_voices', {
                 responseType: 'text',