mirror of
https://github.com/HeyPuter/puter.git
synced 2026-05-04 00:20:45 +00:00
Add Gemini TTS provider and integrate client/docs (#2889)
Introduce a new Gemini TTS provider and wire it through the driver, client, docs, and examples. Adds src/backend/drivers/ai-tts/providers/gemini/GeminiTTSProvider.ts (Google GenAI client usage, PCM->WAV wrapping, metering, model/voice validation) and a costs table in providers/gemini/costs.ts. Registers the provider in TTSDriver, exposes the alias gemini-tts, and prefers it in provider selection. Updates puter-js client to recognize "gemini" provider/engine and route driver calls to gemini-tts. Documentation updated with Gemini options, a usage example, and a playground example HTML file.
This commit is contained in:
@@ -23,6 +23,7 @@ import type { DriverStreamResult } from '../meta.js';
|
||||
import { PuterDriver } from '../types.js';
|
||||
import { AWSPollyTTSProvider } from './providers/awsPolly/AWSPollyTTSProvider.js';
|
||||
import { ElevenLabsTTSProvider } from './providers/elevenlabs/ElevenLabsTTSProvider.js';
|
||||
import { GeminiTTSProvider } from './providers/gemini/GeminiTTSProvider.js';
|
||||
import { OpenAITTSProvider } from './providers/openai/OpenAITTSProvider.js';
|
||||
import type {
|
||||
ISynthesizeArgs,
|
||||
@@ -43,12 +44,18 @@ import type {
|
||||
// than passing `{ provider }` in args, so alias the unified driver under
|
||||
// the names the client expects. `#providerFromAlias` normalizes those
|
||||
// aliases to the internal provider keys used by `#providers`.
|
||||
// Driver aliases under which the unified TTS driver is exposed. Only one
// declaration may exist: the superseded single-line form has been removed so
// the constant is not declared twice.
const TTS_ALIASES = [
    'aws-polly',
    'openai-tts',
    'elevenlabs-tts',
    'gemini-tts',
] as const;

/** Union of every alias listed in `TTS_ALIASES`. */
type TTSAlias = (typeof TTS_ALIASES)[number];

// Maps each public alias to the internal provider key. Typing the record by
// `TTSAlias` makes the compiler reject a new alias that has no mapping.
const ALIAS_TO_PROVIDER: Record<TTSAlias, string> = {
    'aws-polly': 'aws-polly',
    'openai-tts': 'openai',
    'elevenlabs-tts': 'elevenlabs',
    'gemini-tts': 'gemini',
};
|
||||
|
||||
export class TTSDriver extends PuterDriver {
|
||||
@@ -247,14 +254,40 @@ export class TTSDriver extends PuterDriver {
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
this.#registerGeminiProvider(providers);
|
||||
}
|
||||
|
||||
/**
 * Registers the Gemini TTS provider under the `gemini` key when an API key
 * is configured.
 *
 * The config section is read from `providers['gemini']`, falling back to
 * `providers['gemini-tts']`. The key is the first non-empty string among
 * `apiKey`, `api_key` and `key`. Without a usable key nothing is
 * registered. A constructor failure is logged as a warning and does not
 * propagate.
 *
 * @param providers Provider configuration sections keyed by provider name.
 */
#registerGeminiProvider(providers: Record<string, unknown>) {
    const section = providers['gemini'] ?? providers['gemini-tts'];
    if (typeof section !== 'object' || section === null) return;

    const settings = section as Record<string, unknown>;
    // Narrow at runtime instead of asserting `as string`: a non-string or
    // empty value under one spelling must not mask a valid key under another.
    const geminiKey = [settings.apiKey, settings.api_key, settings.key].find(
        (candidate): candidate is string =>
            typeof candidate === 'string' && candidate.length > 0,
    );
    if (!geminiKey) return;

    try {
        this.#providers['gemini'] = new GeminiTTSProvider(
            this.services.metering,
            { apiKey: geminiKey },
        );
    } catch (e: unknown) {
        console.warn(
            '[TTSDriver] Failed to init Gemini TTS provider:',
            e instanceof Error ? e.message : String(e),
        );
    }
}
|
||||
|
||||
/**
 * Picks the provider to use when the caller did not name one.
 *
 * @returns `null` when no provider is registered; otherwise the first
 * registered of `openai`, `elevenlabs`, `gemini`, or, failing those, the
 * first key of the provider registry.
 */
#getDefaultProviderName(): string | null {
    const registered = Object.keys(this.#providers);
    if (registered.length === 0) return null;

    // Preference order; anything else falls back to registration order.
    for (const preferred of ['openai', 'elevenlabs', 'gemini']) {
        if (this.#providers[preferred]) return preferred;
    }
    return registered[0];
}
|
||||
}
|
||||
|
||||
@@ -0,0 +1,361 @@
|
||||
/**
|
||||
* Copyright (C) 2024-present Puter Technologies Inc.
|
||||
*
|
||||
* This file is part of Puter.
|
||||
*
|
||||
* Puter is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Affero General Public License as published
|
||||
* by the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Affero General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Affero General Public License
|
||||
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
import { GoogleGenAI } from '@google/genai';
|
||||
import { Readable } from 'node:stream';
|
||||
import { HttpError } from '../../../../core/http/HttpError.js';
|
||||
import { Context } from '../../../../core/context.js';
|
||||
import type { MeteringService } from '../../../../services/metering/MeteringService.js';
|
||||
import type { DriverStreamResult } from '../../../meta.js';
|
||||
import type { ITTSVoice, ITTSEngine, ISynthesizeArgs } from '../../types.js';
|
||||
import { TTSProvider } from '../TTSProvider.js';
|
||||
import { GEMINI_TTS_COSTS } from './costs.js';
|
||||
|
||||
// Model and voice used when the caller does not specify one.
const DEFAULT_MODEL = 'gemini-2.5-flash-preview-tts';
const DEFAULT_VOICE = 'Kore';

// Static clip returned in test mode instead of calling the API.
const SAMPLE_AUDIO_URL = 'https://puter-sample-data.puter.site/tts_example.mp3';

// Models accepted by the provider; `id` is the value sent to the Gemini API.
const GEMINI_TTS_MODELS = [
    { id: 'gemini-2.5-flash-preview-tts', name: 'Gemini 2.5 Flash TTS' },
    { id: 'gemini-2.5-pro-preview-tts', name: 'Gemini 2.5 Pro TTS' },
    { id: 'gemini-3.1-flash-tts-preview', name: 'Gemini 3.1 Flash TTS' },
];
|
||||
|
||||
// Prebuilt voices as [id, description] pairs, expanded below into
// `{ id, name, description }` records. Every voice's display name is its id.
const GEMINI_TTS_VOICES = (
    [
        ['Zephyr', 'Bright'],
        ['Puck', 'Upbeat'],
        ['Charon', 'Informative'],
        ['Kore', 'Firm'],
        ['Fenrir', 'Excitable'],
        ['Leda', 'Youthful'],
        ['Orus', 'Firm'],
        ['Aoede', 'Breezy'],
        ['Callirrhoe', 'Easy-going'],
        ['Autonoe', 'Bright'],
        ['Enceladus', 'Breathy'],
        ['Iapetus', 'Clear'],
        ['Umbriel', 'Easy-going'],
        ['Algieba', 'Smooth'],
        ['Despina', 'Smooth'],
        ['Erinome', 'Clear'],
        ['Algenib', 'Gravelly'],
        ['Rasalgethi', 'Informative'],
        ['Laomedeia', 'Upbeat'],
        ['Achernar', 'Soft'],
        ['Alnilam', 'Firm'],
        ['Schedar', 'Even'],
        ['Gacrux', 'Mature'],
        ['Pulcherrima', 'Forward'],
        ['Achird', 'Friendly'],
        ['Zubenelgenubi', 'Casual'],
        ['Vindemiatrix', 'Gentle'],
        ['Sadachbia', 'Lively'],
        ['Sadaltager', 'Knowledgeable'],
        ['Sulafat', 'Warm'],
    ] as Array<[string, string]>
).map(([id, description]) => ({ id, name: id, description }));
|
||||
|
||||
/**
 * Text-to-speech provider backed by Google's Gemini models.
 *
 * Synthesis goes through the Gemini `generateContent` API with
 * `responseModalities: ['AUDIO']` and a prebuilt-voice `speechConfig`. Raw
 * PCM responses are wrapped in a WAV container before being streamed back;
 * any other audio MIME type is passed through unchanged.
 */
export class GeminiTTSProvider extends TTSProvider {
    readonly providerName = 'gemini';

    #client: GoogleGenAI;

    /**
     * @param meteringService Service used for credit checks and usage recording.
     * @param config Provider configuration; `apiKey` must be non-empty.
     * @throws Error when `config.apiKey` is missing or empty.
     */
    constructor(meteringService: MeteringService, config: { apiKey: string }) {
        super(meteringService, config);
        if (!config.apiKey) {
            throw new Error('Gemini TTS requires an API key');
        }
        this.#client = new GoogleGenAI({ apiKey: config.apiKey });
    }

    /**
     * Lists the prebuilt voices. Every voice is reported as supported by
     * every model in `GEMINI_TTS_MODELS`.
     */
    async listVoices(): Promise<ITTSVoice[]> {
        return GEMINI_TTS_VOICES.map((voice) => ({
            id: voice.id,
            name: voice.name,
            description: voice.description,
            provider: 'gemini',
            supported_models: GEMINI_TTS_MODELS.map((m) => m.id),
        }));
    }

    /** Lists the TTS models this provider accepts. */
    async listEngines(): Promise<ITTSEngine[]> {
        return GEMINI_TTS_MODELS.map((model) => ({
            id: model.id,
            name: model.name,
            provider: 'gemini',
        }));
    }

    /**
     * Reports per-token pricing for each model in the cost table. The table
     * stores cents per 1M tokens, so each rate is divided by 1M and then
     * converted to microcents.
     */
    override getReportedCosts(): Record<string, unknown>[] {
        return Object.entries(GEMINI_TTS_COSTS).map(([model, costs]) => ({
            usageType: `gemini:${model}:tts`,
            ucentsInputPerToken: this.#toMicroCents(costs.input / 1_000_000),
            ucentsOutputAudioPerToken: this.#toMicroCents(
                costs.output_audio / 1_000_000,
            ),
            unit: 'token',
            source: 'driver:aiTts/gemini',
        }));
    }

    /**
     * Synthesizes speech for `args.text`.
     *
     * In test mode a static sample URL is returned and nothing else runs.
     * Otherwise the model and voice are validated, the actor's credits are
     * checked against an estimated cost, the Gemini API is called, and usage
     * is recorded from the response's token counts (falling back to the
     * estimates when the response carries none).
     *
     * @param args Synthesis arguments (`text`, optional `voice`, `model`,
     * `instructions`, `test_mode`).
     * @returns A stream result carrying the audio, or `{ url, content_type }`
     * in test mode.
     * @throws HttpError 400 when `text` is empty or `model` / `voice` is not
     * recognized; 402 when the credit check fails; 500 when the model has no
     * cost entry; 502 when the API call fails or returns no audio.
     */
    async synthesize(
        args: ISynthesizeArgs,
    ): Promise<DriverStreamResult | { url: string; content_type: string }> {
        const {
            text,
            voice: voiceArg,
            model: modelArg,
            instructions,
            test_mode,
        } = args;

        // Test mode short-circuits before validation, metering or any API call.
        if (test_mode) {
            return { url: SAMPLE_AUDIO_URL, content_type: 'audio' };
        }

        if (typeof text !== 'string' || !text.trim()) {
            throw new HttpError(400, 'Missing required field: text', {
                legacyCode: 'field_required',
                fields: { key: 'text' },
            });
        }

        const model = modelArg || DEFAULT_MODEL;
        if (!GEMINI_TTS_MODELS.some(({ id }) => id === model)) {
            const expectedModels = GEMINI_TTS_MODELS.map(({ id }) => id).join(
                ', ',
            );
            throw new HttpError(
                400,
                `Invalid model: ${model}. Expected: ${expectedModels}`,
                {
                    legacyCode: 'field_invalid',
                    fields: {
                        key: 'model',
                        expected: expectedModels,
                        got: model,
                    },
                },
            );
        }

        // Voices are matched case-insensitively, but the canonical id from the
        // table is what gets sent to the API. A non-string voice is rejected
        // with the same 400 instead of surfacing as a TypeError.
        const requestedVoice = voiceArg || DEFAULT_VOICE;
        const matchedVoice =
            typeof requestedVoice === 'string'
                ? GEMINI_TTS_VOICES.find(
                      ({ id }) =>
                          id.toLowerCase() === requestedVoice.toLowerCase(),
                  )
                : undefined;
        if (!matchedVoice) {
            const expectedVoices = GEMINI_TTS_VOICES.map(({ id }) => id).join(
                ', ',
            );
            throw new HttpError(
                400,
                `Invalid voice: ${requestedVoice}. Expected: ${expectedVoices}`,
                {
                    legacyCode: 'field_invalid',
                    fields: {
                        key: 'voice',
                        expected: expectedVoices,
                        got: requestedVoice,
                    },
                },
            );
        }
        const voice = matchedVoice.id;

        const actor = Context.get('actor')!;
        const costs = GEMINI_TTS_COSTS[model];
        if (!costs) {
            throw new HttpError(500, `No cost data for model: ${model}`);
        }

        // Pre-flight estimate: ~4 characters per input token, and output audio
        // at ~150 words/min speaking rate and 25 tokens per second of audio.
        const estimatedInputTokens = Math.max(1, Math.ceil(text.length / 4));
        // Trim first so leading/trailing whitespace does not add empty words.
        const wordCount = text.trim().split(/\s+/).length;
        const estimatedDurationSec = Math.max(1, (wordCount / 150) * 60);
        const estimatedOutputTokens = Math.ceil(estimatedDurationSec * 25);

        const estimatedInputCostCents =
            (estimatedInputTokens / 1_000_000) * costs.input;
        const estimatedOutputCostCents =
            (estimatedOutputTokens / 1_000_000) * costs.output_audio;
        const estimatedTotalMicroCents = this.#toMicroCents(
            estimatedInputCostCents + estimatedOutputCostCents,
        );

        const usageAllowed = await this.meteringService.hasEnoughCredits(
            actor,
            estimatedTotalMicroCents,
        );
        if (!usageAllowed) {
            throw new HttpError(402, 'Insufficient funds', {
                legacyCode: 'insufficient_funds',
            });
        }

        // Frame the text as a transcript to read aloud so the model produces
        // audio of it rather than a conversational reply. Optional style
        // instructions go before the transcript.
        const inputText = instructions
            ? `${instructions}\n\nSay the following text aloud:\n${text}`
            : `Say the following text aloud:\n${text}`;

        // eslint-disable-next-line @typescript-eslint/no-explicit-any
        let response: any;
        try {
            response = await this.#client.models.generateContent({
                model,
                contents: [{ parts: [{ text: inputText }] }],
                config: {
                    responseModalities: ['AUDIO'],
                    speechConfig: {
                        voiceConfig: {
                            prebuiltVoiceConfig: { voiceName: voice },
                        },
                    },
                },
            });
        } catch (e: unknown) {
            // Optional chaining keeps a thrown null/undefined from raising a
            // TypeError here and hiding the 502.
            const msg =
                (e as { message?: string } | null | undefined)?.message ??
                String(e);
            console.error('[GeminiTTSProvider] API error:', msg);
            throw new HttpError(502, `Gemini TTS API error: ${msg}`, {
                fields: { provider: 'gemini' },
            });
        }

        // Use the first part of the first candidate that carries inline data;
        // the audio is not assumed to be at index 0.
        const candidateParts: unknown =
            response?.candidates?.[0]?.content?.parts;
        const audioPart = (
            Array.isArray(candidateParts) ? candidateParts : []
        ).find(
            (p): p is { inlineData: { data: string; mimeType?: string } } =>
                typeof p?.inlineData?.data === 'string' &&
                p.inlineData.data.length > 0,
        );
        if (!audioPart) {
            throw new HttpError(502, 'Gemini TTS did not return audio data', {
                fields: { provider: 'gemini' },
            });
        }

        const mimeType =
            audioPart.inlineData.mimeType || 'audio/L16;rate=24000';
        const audioBytes = Buffer.from(audioPart.inlineData.data, 'base64');

        let outputBuffer: Buffer;
        let contentType: string;

        const pcmFormat = this.#parsePcmFormat(mimeType);
        if (pcmFormat) {
            // Raw PCM is not playable on its own; add a WAV header using the
            // rate/channels the response declared.
            // NOTE(review): samples are assumed to be 16-bit little-endian,
            // as the original implementation did. Confirm against the API.
            outputBuffer = this.#wrapPcmInWav(
                audioBytes,
                pcmFormat.sampleRate,
                pcmFormat.channels,
                16,
            );
            contentType = 'audio/wav';
        } else {
            // Already-encoded audio is passed through with its own MIME type.
            outputBuffer = audioBytes;
            contentType = mimeType;
        }

        // Bill from the token counts the API reported, falling back to the
        // pre-flight estimates when a count is absent.
        const usage = response.usageMetadata;
        const actualInputTokens =
            typeof usage?.promptTokenCount === 'number'
                ? usage.promptTokenCount
                : estimatedInputTokens;
        const actualOutputTokens =
            typeof usage?.candidatesTokenCount === 'number'
                ? usage.candidatesTokenCount
                : estimatedOutputTokens;

        const inputCostCents = (actualInputTokens / 1_000_000) * costs.input;
        const outputCostCents =
            (actualOutputTokens / 1_000_000) * costs.output_audio;

        const usagePrefix = `gemini:${model}`;
        this.meteringService.batchIncrementUsages(actor, [
            {
                usageType: `${usagePrefix}:input`,
                usageAmount: Math.max(actualInputTokens, 1),
                costOverride: this.#toMicroCents(inputCostCents),
            },
            {
                usageType: `${usagePrefix}:output:audio`,
                usageAmount: Math.max(actualOutputTokens, 1),
                costOverride: this.#toMicroCents(outputCostCents),
            },
        ]);

        const stream = Readable.from(outputBuffer);

        return {
            dataType: 'stream',
            content_type: contentType,
            chunked: true,
            stream,
        };
    }

    /**
     * Decides whether a MIME type denotes raw PCM and, if so, extracts its
     * format parameters.
     *
     * The base type is compared case-insensitively against `audio/L16` and
     * `audio/pcm`. Positive integer `rate` and `channels` parameters are
     * honored; other parameters are ignored. Missing values default to
     * 24000 Hz and one channel.
     *
     * @returns The PCM format, or `null` when the type is not raw PCM.
     */
    #parsePcmFormat(
        mimeType: string,
    ): { sampleRate: number; channels: number } | null {
        const [baseType = '', ...params] = mimeType
            .toLowerCase()
            .split(';')
            .map((segment) => segment.trim());
        if (baseType !== 'audio/l16' && baseType !== 'audio/pcm') {
            return null;
        }

        let sampleRate = 24000;
        let channels = 1;
        for (const param of params) {
            const [key = '', rawValue = ''] = param.split('=');
            const value = Number.parseInt(rawValue, 10);
            if (!Number.isInteger(value) || value <= 0) continue;
            if (key.trim() === 'rate') sampleRate = value;
            else if (key.trim() === 'channels') channels = value;
        }
        return { sampleRate, channels };
    }

    /**
     * Prepends a 44-byte RIFF/WAVE header (format tag 1, PCM) to raw samples
     * so browsers can play them.
     *
     * @param pcm Raw sample bytes.
     * @param sampleRate Samples per second.
     * @param channels Number of interleaved channels.
     * @param bitsPerSample Bit depth of one sample.
     * @returns A new buffer holding the header followed by `pcm`.
     */
    #wrapPcmInWav(
        pcm: Buffer,
        sampleRate: number,
        channels: number,
        bitsPerSample: number,
    ): Buffer {
        const byteRate = (sampleRate * channels * bitsPerSample) / 8;
        const blockAlign = (channels * bitsPerSample) / 8;
        const dataSize = pcm.length;
        const headerSize = 44;
        const buffer = Buffer.alloc(headerSize + dataSize);

        // RIFF header; the size field excludes the 8-byte "RIFF"+size prefix.
        buffer.write('RIFF', 0);
        buffer.writeUInt32LE(36 + dataSize, 4);
        buffer.write('WAVE', 8);

        // fmt sub-chunk
        buffer.write('fmt ', 12);
        buffer.writeUInt32LE(16, 16); // sub-chunk size
        buffer.writeUInt16LE(1, 20); // PCM format
        buffer.writeUInt16LE(channels, 22);
        buffer.writeUInt32LE(sampleRate, 24);
        buffer.writeUInt32LE(byteRate, 28);
        buffer.writeUInt16LE(blockAlign, 32);
        buffer.writeUInt16LE(bitsPerSample, 34);

        // data sub-chunk
        buffer.write('data', 36);
        buffer.writeUInt32LE(dataSize, 40);
        pcm.copy(buffer, headerSize);

        return buffer;
    }

    /**
     * Converts cents to whole microcents, rounding up. Non-finite, zero and
     * negative inputs yield 1, so the result is never below one microcent.
     */
    #toMicroCents(cents: number): number {
        if (!Number.isFinite(cents) || cents <= 0) return 1;
        return Math.ceil(cents * 1_000_000);
    }
}
|
||||
@@ -0,0 +1,44 @@
|
||||
/**
|
||||
* Copyright (C) 2024-present Puter Technologies Inc.
|
||||
*
|
||||
* This file is part of Puter.
|
||||
*
|
||||
* Puter is free software: you can redistribute it and/or modify
|
||||
* it under the terms of the GNU Affero General Public License as published
|
||||
* by the Free Software Foundation, either version 3 of the License, or
|
||||
* (at your option) any later version.
|
||||
*
|
||||
* This program is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
* GNU Affero General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Affero General Public License
|
||||
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
// Gemini TTS pricing, listed in USD per 1M tokens:
//   gemini-2.5-flash-preview-tts: input $0.50, audio output $10.00
//   gemini-2.5-pro-preview-tts:   input $1.00, audio output $20.00
//   gemini-3.1-flash-tts-preview: input $1.00, audio output $20.00
//
// Audio output is roughly 25 tokens per second of audio.
//
// The table below stores the same rates in USD cents per 1M tokens.

/** Price of one model, in USD cents per 1M tokens. */
type GeminiTTSCost = { input: number; output_audio: number };

export const GEMINI_TTS_COSTS: Record<string, GeminiTTSCost> = {
    // $0.50 in / $10.00 audio out
    'gemini-2.5-flash-preview-tts': { input: 50, output_audio: 1000 },
    // $1.00 in / $20.00 audio out
    'gemini-2.5-pro-preview-tts': { input: 100, output_audio: 2000 },
    // $1.00 in / $20.00 audio out
    'gemini-3.1-flash-tts-preview': { input: 100, output_audio: 2000 },
};
|
||||
@@ -226,6 +226,7 @@ You can see various Puter.js AI features in action from the following examples:
|
||||
- [Text to Speech with options](/playground/ai-txt2speech-options/)
|
||||
- [Text to Speech with engines](/playground/ai-txt2speech-engines/)
|
||||
- [Text to Speech with OpenAI voices](/playground/ai-txt2speech-openai/)
|
||||
- [Text to Speech with Gemini voices](/playground/ai-txt2speech-gemini/)
|
||||
- [Transcribe audio with `speech2txt`](/AI/speech2txt/)
|
||||
- Text to Video
|
||||
- [Generate a sample Sora clip](/AI/txt2vid/)
|
||||
|
||||
@@ -32,7 +32,7 @@ Additional settings for the generation request. Available options depend on the
|
||||
|
||||
| Option | Type | Description |
|
||||
|--------|------|-------------|
|
||||
| `provider` | `String` | TTS provider to use. `'aws-polly'` (default), `'openai'`, `'elevenlabs'` |
|
||||
| `provider` | `String` | TTS provider to use. `'aws-polly'` (default), `'openai'`, `'elevenlabs'`, `'gemini'` |
|
||||
| `model` | `String` | Model identifier (provider-specific) |
|
||||
| `voice` | `String` | Voice ID used for synthesis (provider-specific) |
|
||||
| `test_mode` | `Boolean` | When `true`, returns a sample audio without using credits |
|
||||
@@ -74,6 +74,18 @@ Available when `provider: 'elevenlabs'`:
|
||||
|
||||
For more details about each option, see the [ElevenLabs API reference](https://elevenlabs.io/docs/api-reference/text-to-speech).
|
||||
|
||||
#### Gemini Options
|
||||
|
||||
Available when `provider: 'gemini'`:
|
||||
|
||||
| Option | Type | Description |
|
||||
|--------|------|-------------|
|
||||
| `model` | `String` | TTS model. Available: `'gemini-2.5-flash-preview-tts'` (default), `'gemini-2.5-pro-preview-tts'`, `'gemini-3.1-flash-tts-preview'` |
|
||||
| `voice` | `String` | Voice name. Defaults to `'Kore'`. Available: `'Zephyr'`, `'Puck'`, `'Charon'`, `'Kore'`, `'Fenrir'`, `'Leda'`, `'Orus'`, `'Aoede'`, `'Callirrhoe'`, `'Autonoe'`, `'Enceladus'`, `'Iapetus'`, `'Umbriel'`, `'Algieba'`, `'Despina'`, `'Erinome'`, `'Algenib'`, `'Rasalgethi'`, `'Laomedeia'`, `'Achernar'`, `'Alnilam'`, `'Schedar'`, `'Gacrux'`, `'Pulcherrima'`, `'Achird'`, `'Zubenelgenubi'`, `'Vindemiatrix'`, `'Sadachbia'`, `'Sadaltager'`, `'Sulafat'` |
|
||||
| `instructions` | `String` | Natural language instructions to control speaking style (tone, speed, mood, etc.) |
|
||||
|
||||
For more details about Gemini TTS, see the [Google Gemini TTS documentation](https://ai.google.dev/gemini-api/docs/text-to-speech).
|
||||
|
||||
## Return value
|
||||
|
||||
A `Promise` that resolves to an `HTMLAudioElement`. The element’s `src` points at a blob or remote URL containing the synthesized audio.
|
||||
@@ -171,6 +183,31 @@ A `Promise` that resolves to an `HTMLAudioElement`. The element’s `src` points
|
||||
</html>
|
||||
```
|
||||
|
||||
<strong class="example-title">Use Gemini voices</strong>
|
||||
|
||||
```html;ai-txt2speech-gemini
|
||||
<html>
|
||||
<body>
|
||||
<script src="https://js.puter.com/v2/"></script>
|
||||
<button id="play">Use Gemini voice</button>
|
||||
<script>
|
||||
document.getElementById('play').addEventListener('click', async ()=>{
|
||||
const audio = await puter.ai.txt2speech(
|
||||
"Hello! This sample uses the Gemini Puck voice.",
|
||||
{
|
||||
provider: "gemini",
|
||||
model: "gemini-2.5-flash-preview-tts",
|
||||
voice: "Puck",
|
||||
instructions: "Speak in a friendly, upbeat tone."
|
||||
}
|
||||
);
|
||||
audio.play();
|
||||
});
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
```
|
||||
|
||||
<strong class="example-title">Compare different engines</strong>
|
||||
|
||||
```html;ai-txt2speech-engines
|
||||
|
||||
@@ -0,0 +1,20 @@
|
||||
<html>
|
||||
<body>
|
||||
<script src="https://js.puter.com/v2/"></script>
|
||||
<button id="play">Use Gemini voice</button>
|
||||
<script>
|
||||
document.getElementById('play').addEventListener('click', async ()=>{
|
||||
const audio = await puter.ai.txt2speech(
|
||||
"Hello! This sample uses the Gemini Puck voice.",
|
||||
{
|
||||
provider: "gemini",
|
||||
model: "gemini-2.5-flash-preview-tts",
|
||||
voice: "Puck",
|
||||
instructions: "Speak in a friendly, upbeat tone."
|
||||
}
|
||||
);
|
||||
audio.play();
|
||||
});
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
@@ -2,11 +2,12 @@ import * as utils from '../lib/utils.js';
|
||||
|
||||
/**
 * Maps a user-supplied provider or engine name to a canonical provider key.
 *
 * Matching is case-insensitive. Known spellings resolve to `'openai'`,
 * `'elevenlabs'`, `'gemini'` or `'aws-polly'`. An unrecognized string is
 * returned unchanged (original casing). A non-string yields `null`.
 *
 * The non-string guard previously held two consecutive `return` statements,
 * which left `return null` unreachable; only the current `null` result is
 * kept.
 */
const normalizeTTSProvider = (value) => {
    if ( typeof value !== 'string' ) {
        return null;
    }
    const lower = value.toLowerCase();
    if ( lower === 'openai' ) return 'openai';
    if ( ['elevenlabs', 'eleven', '11labs', '11-labs', 'eleven-labs', 'elevenlabs-tts'].includes(lower) ) return 'elevenlabs';
    if ( ['gemini', 'google', 'gemini-tts', 'google-tts'].includes(lower) ) return 'gemini';
    if ( ['aws', 'polly', 'aws-polly'].includes(lower) ) return 'aws-polly';
    return value;
};
|
||||
@@ -270,6 +271,10 @@ class AI {
|
||||
provider = 'elevenlabs';
|
||||
}
|
||||
|
||||
if ( options.engine && normalizeTTSProvider(options.engine) === 'gemini' && !options.provider ) {
|
||||
provider = 'gemini';
|
||||
}
|
||||
|
||||
if ( provider === 'openai' ) {
|
||||
if ( !options.model && typeof options.engine === 'string' ) {
|
||||
options.model = options.engine;
|
||||
@@ -301,6 +306,17 @@ class AI {
|
||||
options.output_format = options.response_format;
|
||||
}
|
||||
delete options.engine;
|
||||
} else if ( provider === 'gemini' ) {
|
||||
if ( !options.model && typeof options.engine === 'string' ) {
|
||||
options.model = options.engine;
|
||||
}
|
||||
if ( ! options.voice ) {
|
||||
options.voice = 'Kore';
|
||||
}
|
||||
if ( ! options.model ) {
|
||||
options.model = 'gemini-2.5-flash-preview-tts';
|
||||
}
|
||||
delete options.engine;
|
||||
} else {
|
||||
provider = 'aws-polly';
|
||||
|
||||
@@ -332,9 +348,12 @@ class AI {
|
||||
}
|
||||
}
|
||||
|
||||
const driverName = provider === 'openai'
|
||||
? 'openai-tts'
|
||||
: (provider === 'elevenlabs' ? 'elevenlabs-tts' : 'aws-polly');
|
||||
const driverNameMap = {
|
||||
'openai': 'openai-tts',
|
||||
'elevenlabs': 'elevenlabs-tts',
|
||||
'gemini': 'gemini-tts',
|
||||
};
|
||||
const driverName = driverNameMap[provider] || 'aws-polly';
|
||||
|
||||
return await utils.make_driver_method(['source'], 'puter-tts', driverName, 'synthesize', {
|
||||
responseType: 'blob',
|
||||
@@ -574,9 +593,16 @@ class AI {
|
||||
params.provider = 'elevenlabs';
|
||||
}
|
||||
|
||||
const driverName = provider === 'openai'
|
||||
? 'openai-tts'
|
||||
: (provider === 'elevenlabs' ? 'elevenlabs-tts' : 'aws-polly');
|
||||
if ( provider === 'gemini' ) {
|
||||
params.provider = 'gemini';
|
||||
}
|
||||
|
||||
const driverNameMap = {
|
||||
'openai': 'openai-tts',
|
||||
'elevenlabs': 'elevenlabs-tts',
|
||||
'gemini': 'gemini-tts',
|
||||
};
|
||||
const driverName = driverNameMap[provider] || 'aws-polly';
|
||||
|
||||
return await utils.make_driver_method(['source'], 'puter-tts', driverName, 'list_engines', {
|
||||
responseType: 'text',
|
||||
@@ -609,9 +635,17 @@ class AI {
|
||||
params.provider = 'elevenlabs';
|
||||
}
|
||||
|
||||
const driverName = provider === 'openai'
|
||||
? 'openai-tts'
|
||||
: (provider === 'elevenlabs' ? 'elevenlabs-tts' : 'aws-polly');
|
||||
if ( provider === 'gemini' ) {
|
||||
params.provider = 'gemini';
|
||||
delete params.engine;
|
||||
}
|
||||
|
||||
const driverNameMap2 = {
|
||||
'openai': 'openai-tts',
|
||||
'elevenlabs': 'elevenlabs-tts',
|
||||
'gemini': 'gemini-tts',
|
||||
};
|
||||
const driverName = driverNameMap2[provider] || 'aws-polly';
|
||||
|
||||
return utils.make_driver_method(['source'], 'puter-tts', driverName, 'list_voices', {
|
||||
responseType: 'text',
|
||||
|
||||
Reference in New Issue
Block a user