diff --git a/src/backend/drivers/ai-speech2speech/VoiceChangerDriver.test.ts b/src/backend/drivers/ai-speech2speech/VoiceChangerDriver.test.ts new file mode 100644 index 000000000..b3d507d81 --- /dev/null +++ b/src/backend/drivers/ai-speech2speech/VoiceChangerDriver.test.ts @@ -0,0 +1,344 @@ +/* + * Copyright (C) 2024-present Puter Technologies Inc. + * + * This file is part of Puter. + * + * Puter is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as published + * by the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +/** + * Offline unit tests for VoiceChangerDriver. + * + * Boots a real PuterServer (in-memory sqlite + dynamo + s3 + mock + * redis) configured with an ElevenLabs API key, then drives + * `server.drivers.aiSpeech2Speech` directly. ElevenLabs is reached + * over plain `fetch` rather than an SDK, so we stub the global fetch + * — that's the real network egress point. Inputs use `data:` URLs + * through the live `loadFileInput`. Aligns with AGENTS.md: "Prefer + * test server over mocking deps." + */ + +import { + afterAll, + afterEach, + beforeAll, + beforeEach, + describe, + expect, + it, + vi, + type MockInstance, +} from 'vitest'; +import { v4 as uuidv4 } from 'uuid'; + +import type { Actor } from '../../core/actor.js'; +import { runWithContext } from '../../core/context.js'; +import { PuterServer } from '../../server.js'; +import type { MeteringService } from '../../services/metering/MeteringService.js'; +import { setupTestServer } from '../../testUtil.js'; +import { generateDefaultFsentries } from '../../util/userProvisioning.js'; +import type { VoiceChangerDriver } from './VoiceChangerDriver.js'; +import { VOICE_CHANGER_COSTS } from './costs.js'; + +// ── Test harness ──────────────────────────────────────────────────── + +let server: PuterServer; +let driver: VoiceChangerDriver; +let fetchSpy: MockInstance; +let hasCreditsSpy: MockInstance; +let incrementUsageSpy: MockInstance; + +beforeAll(async () => { + server = await setupTestServer({ + providers: { + elevenlabs: { apiKey: 'eleven-test-key' }, + }, + } as never); + driver = server.drivers.aiSpeech2Speech as unknown as VoiceChangerDriver; +}); + +afterAll(async () => { + await server?.shutdown(); +}); + +beforeEach(() => { + fetchSpy = vi.spyOn(globalThis, 'fetch') as MockInstance; + hasCreditsSpy = vi.spyOn(server.services.metering, 'hasEnoughCredits'); + incrementUsageSpy = vi.spyOn(server.services.metering, 'incrementUsage'); +}); + +afterEach(() => { + vi.restoreAllMocks(); +}); + +const makeUser = async (): Promise<{ actor: Actor; userId: number }> => { + const username = `vc-${Math.random().toString(36).slice(2, 10)}`; + const created = await server.stores.user.create({ + username, + uuid: uuidv4(), + password: null, + email: `${username}@test.local`, + free_storage: 100 * 1024 * 1024, + requires_email_confirmation: false, + }); + await generateDefaultFsentries( + server.clients.db, + server.stores.user, + created, + ); + const refreshed = (await server.stores.user.getById(created.id))!; + return { + userId: refreshed.id, + actor: { + user: { + id: refreshed.id, + uuid: refreshed.uuid, + username: refreshed.username, + email: refreshed.email ?? null, + email_confirmed: true, + } as Actor['user'], + }, + }; +}; + +const withActor = (actor: Actor, fn: () => T | Promise): Promise => + Promise.resolve(runWithContext({ actor }, fn)); + +const dataUrl = (buffer: Buffer, mime: string) => + `data:${mime};base64,${buffer.toString('base64')}`; + +const okResponse = (body: ArrayBuffer, contentType = 'audio/mpeg') => + new Response(body, { + status: 200, + headers: { 'content-type': contentType }, + }); + +// ── getReportedCosts ──────────────────────────────────────────────── + +describe('VoiceChangerDriver.getReportedCosts', () => { + it('mirrors every entry in costs.ts as a per-second line item', () => { + const reported = driver.getReportedCosts(); + expect(reported).toHaveLength(Object.keys(VOICE_CHANGER_COSTS).length); + for (const [usageType, ucentsPerUnit] of Object.entries( + VOICE_CHANGER_COSTS, + )) { + expect(reported).toContainEqual({ + usageType, + ucentsPerUnit, + unit: 'second', + source: 'driver:aiSpeech2Speech', + }); + } + }); +}); + +// ── Argument validation ───────────────────────────────────────────── + +describe('VoiceChangerDriver.convert argument validation', () => { + it('returns the canned sample when test_mode is set, bypassing all I/O', async () => { + const result = await driver.convert({ + audio: undefined, + test_mode: true, + }); + expect(result).toMatchObject({ + url: expect.stringContaining('puter-sample-data'), + content_type: 'audio/mpeg', + }); + expect(fetchSpy).not.toHaveBeenCalled(); + expect(incrementUsageSpy).not.toHaveBeenCalled(); + }); + + it('throws 401 when no actor is on the request context', async () => { + await expect( + driver.convert({ + audio: dataUrl(Buffer.from('audio'), 'audio/mpeg'), + }), + ).rejects.toMatchObject({ statusCode: 401 }); + }); + + it('throws 400 when audio is missing', async () => { + const { actor } = await makeUser(); + await expect( + withActor(actor, () => driver.convert({ audio: undefined })), + ).rejects.toMatchObject({ statusCode: 400 }); + }); +}); + +// ── Credit gate ───────────────────────────────────────────────────── + +describe('VoiceChangerDriver.convert credit gate', () => { + it('throws 402 BEFORE hitting ElevenLabs when the actor lacks credits', async () => { + const { actor } = await makeUser(); + hasCreditsSpy.mockResolvedValueOnce(false); + + await expect( + withActor(actor, () => + driver.convert({ + audio: dataUrl(Buffer.from('a'.repeat(64000)), 'audio/mpeg'), + }), + ), + ).rejects.toMatchObject({ statusCode: 402 }); + + expect(fetchSpy).not.toHaveBeenCalled(); + expect(incrementUsageSpy).not.toHaveBeenCalled(); + }); +}); + +// ── Successful conversion ─────────────────────────────────────────── + +describe('VoiceChangerDriver.convert success path', () => { + it('POSTs to ElevenLabs with the configured api key, default voice + model, and forwards the audio stream', async () => { + const { actor } = await makeUser(); + const replyBytes = new TextEncoder().encode('audio-bytes'); + fetchSpy.mockResolvedValueOnce(okResponse(replyBytes.buffer)); + + const buf = Buffer.from('input-audio'); + const result = (await withActor(actor, () => + driver.convert({ + audio: dataUrl(buf, 'audio/mpeg'), + }), + )) as { dataType: string; content_type: string; stream: NodeJS.ReadableStream }; + + // Driver hit the ElevenLabs endpoint with the configured key. + expect(fetchSpy).toHaveBeenCalledTimes(1); + const [calledUrl, init] = fetchSpy.mock.calls[0]!; + expect(String(calledUrl)).toMatch(/api\.elevenlabs\.io/); + expect(String(calledUrl)).toMatch( + /\/v1\/speech-to-speech\/21m00Tcm4TlvDq8ikWAM/, + ); + // Default mp3_44100_128 output format threaded through search params. + expect(String(calledUrl)).toMatch(/output_format=mp3_44100_128/); + expect(init?.method).toBe('POST'); + expect((init?.headers as Record)['xi-api-key']).toBe( + 'eleven-test-key', + ); + + // Form data carries the model_id + audio blob. + const form = init?.body as FormData; + expect(form.get('model_id')).toBe('eleven_multilingual_sts_v2'); + expect(form.get('audio')).toBeInstanceOf(Blob); + + // Returned shape is a Node stream the controller can pipe. + expect(result.dataType).toBe('stream'); + expect(result.content_type).toBe('audio/mpeg'); + expect(typeof result.stream.pipe).toBe('function'); + }); + + it('honours an explicit voice + model override', async () => { + const { actor } = await makeUser(); + fetchSpy.mockResolvedValueOnce( + okResponse(new ArrayBuffer(0), 'audio/mpeg'), + ); + + await withActor(actor, () => + driver.convert({ + audio: dataUrl(Buffer.from('x'), 'audio/mpeg'), + voice_id: 'voice-XYZ', + model_id: 'eleven_english_sts_v2', + }), + ); + + const [calledUrl, init] = fetchSpy.mock.calls[0]!; + expect(String(calledUrl)).toMatch( + /\/v1\/speech-to-speech\/voice-XYZ/, + ); + const form = init?.body as FormData; + expect(form.get('model_id')).toBe('eleven_english_sts_v2'); + }); + + it('forwards optional knobs (voice_settings, seed, remove_background_noise, file_format, optimize_streaming_latency)', async () => { + const { actor } = await makeUser(); + fetchSpy.mockResolvedValueOnce( + okResponse(new ArrayBuffer(0), 'audio/mpeg'), + ); + + await withActor(actor, () => + driver.convert({ + audio: dataUrl(Buffer.from('x'), 'audio/mpeg'), + voice_settings: { stability: 0.5 }, + seed: 42, + remove_background_noise: true, + file_format: 'pcm_s16le', + optimize_streaming_latency: 3, + enable_logging: false, + }), + ); + + const [calledUrl, init] = fetchSpy.mock.calls[0]!; + const form = init?.body as FormData; + expect(form.get('voice_settings')).toBe( + JSON.stringify({ stability: 0.5 }), + ); + expect(form.get('seed')).toBe('42'); + expect(form.get('remove_background_noise')).toBe('true'); + expect(form.get('file_format')).toBe('pcm_s16le'); + expect(String(calledUrl)).toMatch(/optimize_streaming_latency=3/); + expect(String(calledUrl)).toMatch(/enable_logging=false/); + }); + + it('meters one usage line at the per-second rate from costs.ts', async () => { + const { actor } = await makeUser(); + fetchSpy.mockResolvedValueOnce( + okResponse(new ArrayBuffer(0), 'audio/mpeg'), + ); + + // 32 KB audio at 16 kbit/s = 2 seconds rounded up. + const buf = Buffer.alloc(32_000); + await withActor(actor, () => + driver.convert({ audio: dataUrl(buf, 'audio/mpeg') }), + ); + + const usageType = 'elevenlabs:eleven_multilingual_sts_v2:second'; + const perSecond = VOICE_CHANGER_COSTS[usageType]; + const expectedSeconds = Math.max(1, Math.ceil(32_000 / 16000)); + + const calls = incrementUsageSpy.mock.calls.filter( + ([, type]) => type === usageType, + ); + expect(calls).toHaveLength(1); + const [actorArg, , count, cost] = calls[0]!; + expect((actorArg as Actor).user.id).toBe(actor.user.id); + expect(count).toBe(expectedSeconds); + expect(cost).toBe(perSecond * expectedSeconds); + }); +}); + +// ── Error mapping ─────────────────────────────────────────────────── + +describe('VoiceChangerDriver.convert error mapping', () => { + it('rethrows the upstream status when ElevenLabs returns an error body', async () => { + const { actor } = await makeUser(); + fetchSpy.mockResolvedValueOnce( + new Response( + JSON.stringify({ detail: 'voice not found' }), + { + status: 404, + headers: { 'content-type': 'application/json' }, + }, + ), + ); + + await expect( + withActor(actor, () => + driver.convert({ + audio: dataUrl(Buffer.from('x'), 'audio/mpeg'), + voice_id: 'missing-voice', + }), + ), + ).rejects.toMatchObject({ statusCode: 404 }); + + // No metering should be recorded on a failed call. + expect(incrementUsageSpy).not.toHaveBeenCalled(); + }); +});