From 33d5c0737eb74be5c94ca10ca1eebbcea6841f28 Mon Sep 17 00:00:00 2001 From: Nariman Jelveh Date: Sun, 3 May 2026 09:33:50 -0700 Subject: [PATCH] Add Gemini TTS provider and integrate client/docs (#2889) Introduce a new Gemini TTS provider and wire it through the driver, client, docs, and examples. Adds src/backend/drivers/ai-tts/providers/gemini/GeminiTTSProvider.ts (Google GenAI client usage, PCM->WAV wrapping, metering, model/voice validation) and a costs table in providers/gemini/costs.ts. Registers the provider in TTSDriver, exposes the alias gemini-tts, and prefers it in provider selection. Updates puter-js client to recognize "gemini" provider/engine and route driver calls to gemini-tts. Documentation updated with Gemini options, a usage example, and a playground example HTML file. --- src/backend/drivers/ai-tts/TTSDriver.ts | 37 +- .../providers/gemini/GeminiTTSProvider.ts | 361 ++++++++++++++++++ .../drivers/ai-tts/providers/gemini/costs.ts | 44 +++ src/docs/src/AI.md | 1 + src/docs/src/AI/txt2speech.md | 39 +- .../examples/ai-txt2speech-gemini.html | 20 + src/puter-js/src/modules/AI.js | 54 ++- 7 files changed, 543 insertions(+), 13 deletions(-) create mode 100644 src/backend/drivers/ai-tts/providers/gemini/GeminiTTSProvider.ts create mode 100644 src/backend/drivers/ai-tts/providers/gemini/costs.ts create mode 100644 src/docs/src/playground/examples/ai-txt2speech-gemini.html diff --git a/src/backend/drivers/ai-tts/TTSDriver.ts b/src/backend/drivers/ai-tts/TTSDriver.ts index 023a9af0b..2b910952c 100644 --- a/src/backend/drivers/ai-tts/TTSDriver.ts +++ b/src/backend/drivers/ai-tts/TTSDriver.ts @@ -23,6 +23,7 @@ import type { DriverStreamResult } from '../meta.js'; import { PuterDriver } from '../types.js'; import { AWSPollyTTSProvider } from './providers/awsPolly/AWSPollyTTSProvider.js'; import { ElevenLabsTTSProvider } from './providers/elevenlabs/ElevenLabsTTSProvider.js'; +import { GeminiTTSProvider } from './providers/gemini/GeminiTTSProvider.js'; 
import { OpenAITTSProvider } from './providers/openai/OpenAITTSProvider.js'; import type { ISynthesizeArgs, @@ -43,12 +44,18 @@ import type { // than passing `{ provider }` in args, so alias the unified driver under // the names the client expects. `#providerFromAlias` normalizes those // aliases to the internal provider keys used by `#providers`. -const TTS_ALIASES = ['aws-polly', 'openai-tts', 'elevenlabs-tts'] as const; +const TTS_ALIASES = [ + 'aws-polly', + 'openai-tts', + 'elevenlabs-tts', + 'gemini-tts', +] as const; type TTSAlias = (typeof TTS_ALIASES)[number]; const ALIAS_TO_PROVIDER: Record = { 'aws-polly': 'aws-polly', 'openai-tts': 'openai', 'elevenlabs-tts': 'elevenlabs', + 'gemini-tts': 'gemini', }; export class TTSDriver extends PuterDriver { @@ -247,14 +254,40 @@ export class TTSDriver extends PuterDriver { ); } } + + this.#registerGeminiProvider(providers); + } + + #registerGeminiProvider(providers: Record) { + const m = this.services.metering; + const gemini = (providers['gemini'] ?? providers['gemini-tts']) as + | Record + | undefined; + const geminiKey = + (gemini?.apiKey as string | undefined) ?? + (gemini?.api_key as string | undefined) ?? 
+ (gemini?.key as string | undefined); + if (geminiKey) { + try { + this.#providers['gemini'] = new GeminiTTSProvider(m, { + apiKey: geminiKey, + }); + } catch (e) { + console.warn( + '[TTSDriver] Failed to init Gemini TTS provider:', + (e as Error).message, + ); + } + } } #getDefaultProviderName(): string | null { const names = Object.keys(this.#providers); if (names.length === 0) return null; - // Prefer openai, then elevenlabs, then aws-polly + // Prefer openai, then elevenlabs, then gemini, then aws-polly if (this.#providers['openai']) return 'openai'; if (this.#providers['elevenlabs']) return 'elevenlabs'; + if (this.#providers['gemini']) return 'gemini'; return names[0]; } } diff --git a/src/backend/drivers/ai-tts/providers/gemini/GeminiTTSProvider.ts b/src/backend/drivers/ai-tts/providers/gemini/GeminiTTSProvider.ts new file mode 100644 index 000000000..9b38cb768 --- /dev/null +++ b/src/backend/drivers/ai-tts/providers/gemini/GeminiTTSProvider.ts @@ -0,0 +1,361 @@ +/** + * Copyright (C) 2024-present Puter Technologies Inc. + * + * This file is part of Puter. + * + * Puter is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . 
+ */ + +import { GoogleGenAI } from '@google/genai'; +import { Readable } from 'node:stream'; +import { HttpError } from '../../../../core/http/HttpError.js'; +import { Context } from '../../../../core/context.js'; +import type { MeteringService } from '../../../../services/metering/MeteringService.js'; +import type { DriverStreamResult } from '../../../meta.js'; +import type { ITTSVoice, ITTSEngine, ISynthesizeArgs } from '../../types.js'; +import { TTSProvider } from '../TTSProvider.js'; +import { GEMINI_TTS_COSTS } from './costs.js'; + /* Fallbacks used by synthesize() when the caller omits `model` / `voice`. */ +const DEFAULT_MODEL = 'gemini-2.5-flash-preview-tts'; +const DEFAULT_VOICE = 'Kore'; /* Canned clip returned instead of calling Gemini when `test_mode` is set. */ +const SAMPLE_AUDIO_URL = 'https://puter-sample-data.puter.site/tts_example.mp3'; + /* Model ids accepted by synthesize(); any other id is rejected with HTTP 400. */ +const GEMINI_TTS_MODELS = [ + { + id: 'gemini-2.5-flash-preview-tts', + name: 'Gemini 2.5 Flash TTS', + }, + { + id: 'gemini-2.5-pro-preview-tts', + name: 'Gemini 2.5 Pro TTS', + }, + { + id: 'gemini-3.1-flash-tts-preview', + name: 'Gemini 3.1 Flash TTS', + }, +]; + /* Prebuilt voice ids accepted by synthesize(); the lookup ignores letter case. */ +const GEMINI_TTS_VOICES = [ + { id: 'Zephyr', name: 'Zephyr', description: 'Bright' }, + { id: 'Puck', name: 'Puck', description: 'Upbeat' }, + { id: 'Charon', name: 'Charon', description: 'Informative' }, + { id: 'Kore', name: 'Kore', description: 'Firm' }, + { id: 'Fenrir', name: 'Fenrir', description: 'Excitable' }, + { id: 'Leda', name: 'Leda', description: 'Youthful' }, + { id: 'Orus', name: 'Orus', description: 'Firm' }, + { id: 'Aoede', name: 'Aoede', description: 'Breezy' }, + { id: 'Callirrhoe', name: 'Callirrhoe', description: 'Easy-going' }, + { id: 'Autonoe', name: 'Autonoe', description: 'Bright' }, + { id: 'Enceladus', name: 'Enceladus', description: 'Breathy' }, + { id: 'Iapetus', name: 'Iapetus', description: 'Clear' }, + { id: 'Umbriel', name: 'Umbriel', description: 'Easy-going' }, + { id: 'Algieba', name: 'Algieba', description: 'Smooth' }, + { id: 'Despina', name: 'Despina', description: 'Smooth' }, + { id: 'Erinome', name: 'Erinome', description: 'Clear' }, + { id: 'Algenib', 
name: 'Algenib', description: 'Gravelly' }, + { id: 'Rasalgethi', name: 'Rasalgethi', description: 'Informative' }, + { id: 'Laomedeia', name: 'Laomedeia', description: 'Upbeat' }, + { id: 'Achernar', name: 'Achernar', description: 'Soft' }, + { id: 'Alnilam', name: 'Alnilam', description: 'Firm' }, + { id: 'Schedar', name: 'Schedar', description: 'Even' }, + { id: 'Gacrux', name: 'Gacrux', description: 'Mature' }, + { id: 'Pulcherrima', name: 'Pulcherrima', description: 'Forward' }, + { id: 'Achird', name: 'Achird', description: 'Friendly' }, + { id: 'Zubenelgenubi', name: 'Zubenelgenubi', description: 'Casual' }, + { id: 'Vindemiatrix', name: 'Vindemiatrix', description: 'Gentle' }, + { id: 'Sadachbia', name: 'Sadachbia', description: 'Lively' }, + { id: 'Sadaltager', name: 'Sadaltager', description: 'Knowledgeable' }, + { id: 'Sulafat', name: 'Sulafat', description: 'Warm' }, +]; + +/** + * Gemini TTS provider. Calls the Gemini generateContent API with + * `responseModalities: ["AUDIO"]` and `speechConfig` to synthesize speech. + * Returns raw PCM audio wrapped in a WAV container. 
/**
 * Gemini text-to-speech provider.
 *
 * Sends the text to Gemini's `generateContent` endpoint with
 * `responseModalities: ['AUDIO']` and a prebuilt-voice `speechConfig`.
 * Raw PCM replies are wrapped in a WAV container before being streamed back;
 * any other audio MIME type is passed through untouched.
 *
 * NOTE(review): the generic type arguments on the method signatures below
 * were not visible in the reviewed text and were restored from the imported
 * types — confirm them against the `TTSProvider` base class.
 */
export class GeminiTTSProvider extends TTSProvider {
    readonly providerName = 'gemini';

    #client: GoogleGenAI;

    /**
     * @param meteringService Used for the credit pre-check and usage recording.
     * @param config Provider configuration; `apiKey` is mandatory.
     * @throws Error when `config.apiKey` is empty.
     */
    constructor(meteringService: MeteringService, config: { apiKey: string }) {
        super(meteringService, config);
        if (!config.apiKey) {
            throw new Error('Gemini TTS requires an API key');
        }
        this.#client = new GoogleGenAI({ apiKey: config.apiKey });
    }

    /** Lists the prebuilt voices; every voice reports all models as supported. */
    async listVoices(): Promise<ITTSVoice[]> {
        const modelIds = GEMINI_TTS_MODELS.map(({ id }) => id);
        return GEMINI_TTS_VOICES.map(({ id, name, description }) => ({
            id,
            name,
            description,
            provider: 'gemini',
            // Fresh copy per voice so callers cannot mutate a shared array.
            supported_models: [...modelIds],
        }));
    }

    /** Lists the Gemini TTS models this provider accepts. */
    async listEngines(): Promise<ITTSEngine[]> {
        return GEMINI_TTS_MODELS.map(({ id, name }) => ({
            id,
            name,
            provider: 'gemini',
        }));
    }

    /**
     * Reports per-token prices for every model in the cost table.
     *
     * The table stores cents per 1M tokens, which is numerically the same as
     * micro-cents per token, so the values are used directly.
     */
    override getReportedCosts(): Record<string, unknown>[] {
        return Object.entries(GEMINI_TTS_COSTS).map(([model, costs]) => ({
            usageType: `gemini:${model}:tts`,
            ucentsInputPerToken: this.#billableMicroCents(costs.input),
            ucentsOutputAudioPerToken: this.#billableMicroCents(
                costs.output_audio,
            ),
            unit: 'token',
            source: 'driver:aiTts/gemini',
        }));
    }

    /**
     * Synthesizes `args.text` with the requested (or default) model and voice.
     *
     * @param args Synthesis request. `test_mode` short-circuits to a sample
     *   clip URL; `instructions` is prepended to the prompt as style guidance.
     * @returns A sample-URL object in test mode, otherwise a stream result
     *   carrying the audio bytes.
     * @throws HttpError 400 for missing text or an unknown model/voice,
     *   402 when the actor lacks credits, 500 when the model has no cost
     *   entry, 502 when Gemini fails or returns no audio.
     */
    async synthesize(
        args: ISynthesizeArgs,
    ): Promise<DriverStreamResult | { url: string; content_type: string }> {
        const {
            text,
            voice: voiceArg,
            model: modelArg,
            instructions,
            test_mode,
        } = args;

        if (test_mode) {
            return { url: SAMPLE_AUDIO_URL, content_type: 'audio' };
        }

        if (typeof text !== 'string' || !text.trim()) {
            throw new HttpError(400, 'Missing required field: text', {
                legacyCode: 'field_required',
                fields: { key: 'text' },
            });
        }

        const model = modelArg || DEFAULT_MODEL;
        const modelIds = GEMINI_TTS_MODELS.map(({ id }) => id);
        if (!modelIds.includes(model)) {
            const expected = modelIds.join(', ');
            throw new HttpError(
                400,
                `Invalid model: ${model}. Expected: ${expected}`,
                {
                    legacyCode: 'field_invalid',
                    fields: { key: 'model', expected, got: model },
                },
            );
        }

        const requestedVoice = voiceArg || DEFAULT_VOICE;
        const knownVoice = GEMINI_TTS_VOICES.find(
            ({ id }) => id.toLowerCase() === requestedVoice.toLowerCase(),
        );
        if (!knownVoice) {
            const expected = GEMINI_TTS_VOICES.map(({ id }) => id).join(', ');
            throw new HttpError(
                400,
                `Invalid voice: ${requestedVoice}. Expected: ${expected}`,
                {
                    legacyCode: 'field_invalid',
                    fields: { key: 'voice', expected, got: requestedVoice },
                },
            );
        }
        // The lookup ignores case, so forward the canonical id rather than
        // the caller's spelling; otherwise "kore" passes validation here but
        // reaches the API in a form it was never checked against.
        const voice = knownVoice.id;

        const actor = Context.get('actor')!;
        const costs = GEMINI_TTS_COSTS[model];
        if (!costs) {
            throw new HttpError(500, `No cost data for model: ${model}`);
        }

        // Estimate input tokens (~4 chars per token) and a rough output
        // audio duration (~150 words/min, 25 tokens/sec). Trim first so
        // leading/trailing whitespace does not add phantom words.
        const estimatedInputTokens = Math.max(1, Math.ceil(text.length / 4));
        const wordCount = text.trim().split(/\s+/).length;
        const estimatedDurationSec = Math.max(1, (wordCount / 150) * 60);
        const estimatedOutputTokens = Math.ceil(estimatedDurationSec * 25);

        // tokens × (cents per 1M tokens) is already a micro-cent amount;
        // multiplying directly avoids a /1e6 … *1e6 float round-trip that
        // can nudge Math.ceil up by one.
        const estimatedTotalMicroCents = this.#billableMicroCents(
            estimatedInputTokens * costs.input +
                estimatedOutputTokens * costs.output_audio,
        );

        const usageAllowed = await this.meteringService.hasEnoughCredits(
            actor,
            estimatedTotalMicroCents,
        );
        if (!usageAllowed) {
            throw new HttpError(402, 'Insufficient funds', {
                legacyCode: 'insufficient_funds',
            });
        }

        // The TTS models need the text framed as a transcript to read aloud.
        // The "Say the following text aloud:" lead-in keeps the model from
        // answering conversationally instead of producing audio; optional
        // style instructions go in front of it.
        const readAloudPrompt = `Say the following text aloud:\n${text}`;
        const inputText = instructions
            ? `${instructions}\n\n${readAloudPrompt}`
            : readAloudPrompt;

        // eslint-disable-next-line @typescript-eslint/no-explicit-any
        let response: any;
        try {
            response = await this.#client.models.generateContent({
                model,
                contents: [{ parts: [{ text: inputText }] }],
                config: {
                    responseModalities: ['AUDIO'],
                    speechConfig: {
                        voiceConfig: {
                            prebuiltVoiceConfig: { voiceName: voice },
                        },
                    },
                },
            });
        } catch (e: unknown) {
            const msg = e instanceof Error ? e.message : String(e);
            console.error('[GeminiTTSProvider] API error:', msg);
            throw new HttpError(502, `Gemini TTS API error: ${msg}`, {
                fields: { provider: 'gemini' },
            });
        }

        // Take the first part that actually carries inline audio instead of
        // assuming it is always parts[0].
        const parts: { inlineData?: { data?: string; mimeType?: string } }[] =
            response?.candidates?.[0]?.content?.parts ?? [];
        const audioPart = parts.find((p) => p?.inlineData?.data);
        if (!audioPart?.inlineData?.data) {
            throw new HttpError(502, 'Gemini TTS did not return audio data', {
                fields: { provider: 'gemini' },
            });
        }

        const audioBytes = Buffer.from(audioPart.inlineData.data, 'base64');
        // `||` on purpose: an empty MIME string should also fall back.
        const mimeType =
            audioPart.inlineData.mimeType || 'audio/L16;rate=24000';
        const normalizedMime = mimeType.toLowerCase();

        let outputBuffer: Buffer;
        let contentType: string;
        if (
            normalizedMime.startsWith('audio/l16') ||
            normalizedMime.startsWith('audio/pcm')
        ) {
            // Raw 16-bit mono PCM: add a WAV header for broad client
            // compatibility, honouring the sample rate the API declared.
            outputBuffer = this.#wrapPcmInWav(
                audioBytes,
                this.#pcmSampleRate(mimeType),
                1,
                16,
            );
            contentType = 'audio/wav';
        } else {
            // Already-encoded audio (unlikely today): pass through as-is.
            outputBuffer = audioBytes;
            contentType = mimeType;
        }

        // Meter actual usage from response metadata, falling back to the
        // estimates when the API omits a count.
        const usage = response?.usageMetadata;
        const actualInputTokens =
            typeof usage?.promptTokenCount === 'number'
                ? usage.promptTokenCount
                : estimatedInputTokens;
        const actualOutputTokens =
            typeof usage?.candidatesTokenCount === 'number'
                ? usage.candidatesTokenCount
                : estimatedOutputTokens;

        const usagePrefix = `gemini:${model}`;
        this.meteringService.batchIncrementUsages(actor, [
            {
                usageType: `${usagePrefix}:input`,
                usageAmount: Math.max(actualInputTokens, 1),
                costOverride: this.#billableMicroCents(
                    actualInputTokens * costs.input,
                ),
            },
            {
                usageType: `${usagePrefix}:output:audio`,
                usageAmount: Math.max(actualOutputTokens, 1),
                costOverride: this.#billableMicroCents(
                    actualOutputTokens * costs.output_audio,
                ),
            },
        ]);

        return {
            dataType: 'stream',
            content_type: contentType,
            chunked: true,
            stream: Readable.from(outputBuffer),
        };
    }

    /**
     * Reads the `rate=` parameter from a PCM MIME type such as
     * `audio/L16;codec=pcm;rate=24000`; falls back to 24 kHz when it is
     * absent or malformed.
     */
    #pcmSampleRate(mimeType: string): number {
        const fallbackRate = 24000;
        const match = /(?:^|;)\s*rate=(\d+)/i.exec(mimeType);
        const rate = Number(match?.[1]);
        return Number.isInteger(rate) && rate > 0 ? rate : fallbackRate;
    }

    /**
     * Prepends a 44-byte canonical RIFF/WAVE header (PCM format tag 1) to raw
     * PCM samples so browsers can play the result.
     */
    #wrapPcmInWav(
        pcm: Buffer,
        sampleRate: number,
        channels: number,
        bitsPerSample: number,
    ): Buffer {
        const headerSize = 44;
        const dataSize = pcm.length;
        const blockAlign = (channels * bitsPerSample) / 8;
        const byteRate = sampleRate * blockAlign;
        const wav = Buffer.alloc(headerSize + dataSize);

        // RIFF header; the size field excludes the 8-byte "RIFF"+size prefix.
        wav.write('RIFF', 0);
        wav.writeUInt32LE(36 + dataSize, 4);
        wav.write('WAVE', 8);

        // fmt sub-chunk
        wav.write('fmt ', 12);
        wav.writeUInt32LE(16, 16); // sub-chunk size
        wav.writeUInt16LE(1, 20); // PCM format
        wav.writeUInt16LE(channels, 22);
        wav.writeUInt32LE(sampleRate, 24);
        wav.writeUInt32LE(byteRate, 28);
        wav.writeUInt16LE(blockAlign, 32);
        wav.writeUInt16LE(bitsPerSample, 34);

        // data sub-chunk
        wav.write('data', 36);
        wav.writeUInt32LE(dataSize, 40);
        pcm.copy(wav, headerSize);

        return wav;
    }

    /**
     * Rounds a micro-cent amount up to a whole number, charging at least 1
     * for zero, negative or non-finite input.
     */
    #billableMicroCents(microCents: number): number {
        if (!Number.isFinite(microCents) || microCents <= 0) return 1;
        return Math.ceil(microCents);
    }
}
b/src/backend/drivers/ai-tts/providers/gemini/costs.ts new file mode 100644 index 000000000..297779727 --- /dev/null +++ b/src/backend/drivers/ai-tts/providers/gemini/costs.ts @@ -0,0 +1,44 @@ +/** + * Copyright (C) 2024-present Puter Technologies Inc. + * + * This file is part of Puter. + * + * Puter is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see <https://www.gnu.org/licenses/>. + */ + +// Gemini TTS pricing in USD per 1M tokens: +// gemini-2.5-flash-preview-tts: input $0.50, output (audio) $10.00 +// gemini-2.5-pro-preview-tts: input $1.00, output (audio) $20.00 +// gemini-3.1-flash-tts-preview: input $1.00, output (audio) $20.00 +// +// Audio output tokens = ~25 tokens/second of audio. +// +// Costs here are in USD-cents per 1M tokens for input and output. 
+export const GEMINI_TTS_COSTS: Record< + string, + { input: number; output_audio: number } +> = { + 'gemini-2.5-flash-preview-tts': { + input: 50, // $0.50 per 1M tokens = 50 cents + output_audio: 1000, // $10.00 per 1M tokens = 1000 cents + }, + 'gemini-2.5-pro-preview-tts': { + input: 100, // $1.00 per 1M tokens + output_audio: 2000, // $20.00 per 1M tokens + }, + 'gemini-3.1-flash-tts-preview': { + input: 100, // $1.00 per 1M tokens + output_audio: 2000, // $20.00 per 1M tokens + }, +}; diff --git a/src/docs/src/AI.md b/src/docs/src/AI.md index a543db3a1..e89b53050 100644 --- a/src/docs/src/AI.md +++ b/src/docs/src/AI.md @@ -226,6 +226,7 @@ You can see various Puter.js AI features in action from the following examples: - [Text to Speech with options](/playground/ai-txt2speech-options/) - [Text to Speech with engines](/playground/ai-txt2speech-engines/) - [Text to Speech with OpenAI voices](/playground/ai-txt2speech-openai/) + - [Text to Speech with Gemini voices](/playground/ai-txt2speech-gemini/) - [Transcribe audio with `speech2txt`](/AI/speech2txt/) - Text to Video - [Generate a sample Sora clip](/AI/txt2vid/) diff --git a/src/docs/src/AI/txt2speech.md b/src/docs/src/AI/txt2speech.md index 52158807e..dde915721 100755 --- a/src/docs/src/AI/txt2speech.md +++ b/src/docs/src/AI/txt2speech.md @@ -32,7 +32,7 @@ Additional settings for the generation request. Available options depend on the | Option | Type | Description | |--------|------|-------------| -| `provider` | `String` | TTS provider to use. `'aws-polly'` (default), `'openai'`, `'elevenlabs'` | +| `provider` | `String` | TTS provider to use. 
`'aws-polly'` (default), `'openai'`, `'elevenlabs'`, `'gemini'` | | `model` | `String` | Model identifier (provider-specific) | | `voice` | `String` | Voice ID used for synthesis (provider-specific) | | `test_mode` | `Boolean` | When `true`, returns a sample audio without using credits | @@ -74,6 +74,18 @@ Available when `provider: 'elevenlabs'`: For more details about each option, see the [ElevenLabs API reference](https://elevenlabs.io/docs/api-reference/text-to-speech). +#### Gemini Options + +Available when `provider: 'gemini'`: + +| Option | Type | Description | +|--------|------|-------------| +| `model` | `String` | TTS model. Available: `'gemini-2.5-flash-preview-tts'` (default), `'gemini-2.5-pro-preview-tts'`, `'gemini-3.1-flash-tts-preview'` | +| `voice` | `String` | Voice name. Defaults to `'Kore'`. Available: `'Zephyr'`, `'Puck'`, `'Charon'`, `'Kore'`, `'Fenrir'`, `'Leda'`, `'Orus'`, `'Aoede'`, `'Callirrhoe'`, `'Autonoe'`, `'Enceladus'`, `'Iapetus'`, `'Umbriel'`, `'Algieba'`, `'Despina'`, `'Erinome'`, `'Algenib'`, `'Rasalgethi'`, `'Laomedeia'`, `'Achernar'`, `'Alnilam'`, `'Schedar'`, `'Gacrux'`, `'Pulcherrima'`, `'Achird'`, `'Zubenelgenubi'`, `'Vindemiatrix'`, `'Sadachbia'`, `'Sadaltager'`, `'Sulafat'` | +| `instructions` | `String` | Natural language instructions to control speaking style (tone, speed, mood, etc.) | + +For more details about Gemini TTS, see the [Google Gemini TTS documentation](https://ai.google.dev/gemini-api/docs/text-to-speech). + ## Return value A `Promise` that resolves to an `HTMLAudioElement`. The element’s `src` points at a blob or remote URL containing the synthesized audio. @@ -171,6 +183,31 @@ A `Promise` that resolves to an `HTMLAudioElement`. 
The element’s `src` points ``` +Use Gemini voices + +```html;ai-txt2speech-gemini + + + + + + + +``` + Compare different engines ```html;ai-txt2speech-engines diff --git a/src/docs/src/playground/examples/ai-txt2speech-gemini.html b/src/docs/src/playground/examples/ai-txt2speech-gemini.html new file mode 100644 index 000000000..482b4c398 --- /dev/null +++ b/src/docs/src/playground/examples/ai-txt2speech-gemini.html @@ -0,0 +1,20 @@ + + + + + + + \ No newline at end of file diff --git a/src/puter-js/src/modules/AI.js b/src/puter-js/src/modules/AI.js index 421f49e9e..68961fbd6 100644 --- a/src/puter-js/src/modules/AI.js +++ b/src/puter-js/src/modules/AI.js @@ -2,11 +2,12 @@ import * as utils from '../lib/utils.js'; const normalizeTTSProvider = (value) => { if ( typeof value !== 'string' ) { - return 'aws-polly'; + return null; } const lower = value.toLowerCase(); if ( lower === 'openai' ) return 'openai'; if ( ['elevenlabs', 'eleven', '11labs', '11-labs', 'eleven-labs', 'elevenlabs-tts'].includes(lower) ) return 'elevenlabs'; + if ( ['gemini', 'google', 'gemini-tts', 'google-tts'].includes(lower) ) return 'gemini'; if ( lower === 'aws' || lower === 'polly' || lower === 'aws-polly' ) return 'aws-polly'; return value; }; @@ -270,6 +271,10 @@ class AI { provider = 'elevenlabs'; } + if ( options.engine && normalizeTTSProvider(options.engine) === 'gemini' && !options.provider ) { + provider = 'gemini'; + } + if ( provider === 'openai' ) { if ( !options.model && typeof options.engine === 'string' ) { options.model = options.engine; @@ -301,6 +306,17 @@ class AI { options.output_format = options.response_format; } delete options.engine; + } else if ( provider === 'gemini' ) { + if ( !options.model && typeof options.engine === 'string' ) { + options.model = options.engine; + } + if ( ! options.voice ) { + options.voice = 'Kore'; + } + if ( ! 
options.model ) { + options.model = 'gemini-2.5-flash-preview-tts'; + } + delete options.engine; } else { provider = 'aws-polly'; @@ -332,9 +348,12 @@ class AI { } } - const driverName = provider === 'openai' - ? 'openai-tts' - : (provider === 'elevenlabs' ? 'elevenlabs-tts' : 'aws-polly'); + const driverNameMap = { + 'openai': 'openai-tts', + 'elevenlabs': 'elevenlabs-tts', + 'gemini': 'gemini-tts', + }; + const driverName = driverNameMap[provider] || 'aws-polly'; return await utils.make_driver_method(['source'], 'puter-tts', driverName, 'synthesize', { responseType: 'blob', @@ -574,9 +593,16 @@ class AI { params.provider = 'elevenlabs'; } - const driverName = provider === 'openai' - ? 'openai-tts' - : (provider === 'elevenlabs' ? 'elevenlabs-tts' : 'aws-polly'); + if ( provider === 'gemini' ) { + params.provider = 'gemini'; + } + + const driverNameMap = { + 'openai': 'openai-tts', + 'elevenlabs': 'elevenlabs-tts', + 'gemini': 'gemini-tts', + }; + const driverName = driverNameMap[provider] || 'aws-polly'; return await utils.make_driver_method(['source'], 'puter-tts', driverName, 'list_engines', { responseType: 'text', @@ -609,9 +635,17 @@ class AI { params.provider = 'elevenlabs'; } - const driverName = provider === 'openai' - ? 'openai-tts' - : (provider === 'elevenlabs' ? 'elevenlabs-tts' : 'aws-polly'); + if ( provider === 'gemini' ) { + params.provider = 'gemini'; + delete params.engine; + } + + const driverNameMap2 = { + 'openai': 'openai-tts', + 'elevenlabs': 'elevenlabs-tts', + 'gemini': 'gemini-tts', + }; + const driverName = driverNameMap2[provider] || 'aws-polly'; return utils.make_driver_method(['source'], 'puter-tts', driverName, 'list_voices', { responseType: 'text',