Merge pull request mastra-ai#1338 from mastra-ai/audio

TTS primitives in Mastra
Joshuafolorunsho · Jan 10, 2025 · 35d1602 · 35d1602
2 parents cc8ff90 + 42d7438
commit 35d1602
Show file tree

Hide file tree

Showing 28 changed files with 988 additions and 66 deletions.
diff --git a/.changeset/selfish-pandas-kick.md b/.changeset/selfish-pandas-kick.md
@@ -0,0 +1,7 @@
+---
+'@mastra/core': patch
+'@mastra/tts': patch
+'docs': patch
+---
+
+TTS module
diff --git a/docs/src/pages/docs/reference/_meta.ts b/docs/src/pages/docs/reference/_meta.ts
@@ -2,6 +2,7 @@ const meta = {
   core: "Core",
   cli: "CLI",
   llm: "LLM",
+  tts: "TTS",
   agents: "Agents",
   workflows: "Workflows",
   rag: "Knowledge",

diff --git a/docs/src/pages/docs/reference/llm/_meta.ts b/docs/src/pages/docs/reference/llm/_meta.ts
@@ -1,6 +1,7 @@
 const meta = {
   "providers-and-models": "Providers and Models",
   generate: "generate",
+  stream: "stream",
 };
 
 export default meta;
diff --git a/docs/src/pages/docs/reference/tts/_meta.ts b/docs/src/pages/docs/reference/tts/_meta.ts
@@ -0,0 +1,7 @@
+const meta = { // Keys match the sibling pages (providers-and-models.mdx, generate.mdx, stream.mdx); values are display titles — presumably used for docs navigation, TODO confirm.
+  "providers-and-models": "Providers and Models",
+  generate: "generate",
+  stream: "stream",
+};
+
+export default meta;
diff --git a/docs/src/pages/docs/reference/tts/generate.mdx b/docs/src/pages/docs/reference/tts/generate.mdx
@@ -0,0 +1,81 @@
+# `generate()`
+
+The `generate()` method is used to interact with the TTS model to produce an audio response. This method accepts `text` and `voice` as parameters.
+
+## Parameters
+
+
+<PropertiesTable
+  content={[
+    {
+      name: 'text',
+      type: 'string',
+      description: 'The text to be converted to speech.',
+    },
+    {
+      name: 'voice',
+      type: 'string',
+      description: 'Voice ID to be used with generation.',
+    },
+  ]}
+/>
+
+## Returns
+
+<PropertiesTable
+  content={[
+    {
+      name: 'audioResult',
+      type: 'Readable',
+      isOptional: true,
+      description: 'The generated audio stream',
+    },
+  ]}
+/>
+
+## Examples
+
+### Basic Audio Generation (ElevenLabs)
+
+```typescript
+import { ElevenLabsTTS } from '@mastra/tts'
+
+ const tts = new ElevenLabsTTS({
+    model: {
+      name: 'eleven_multilingual_v2',
+      apiKey: process.env.ELEVENLABS_API_KEY!,
+    },
+  });
+
+const voices = await tts.voices();
+const voiceId = voices?.[0]?.voice_id!;
+
+const { audioResult } = await tts.generate({ text: "What is AI?", voice: voiceId });
+
+await writeFile(path.join(process.cwd(), '/test-outputs/generate-output.mp3'), audioResult);
+```
+
+### Basic Audio Generation (OpenAI)
+
+```typescript
+import { OpenAITTS } from '@mastra/tts'
+
+ const tts = new OpenAITTS({
+    model: {
+      name: 'tts-1',
+      apiKey: process.env.OPENAI_API_KEY!,
+    },
+  });
+
+const voices = await tts.voices();
+const voiceId = voices?.[0]?.voice_id!;
+
+const { audioResult } = await tts.generate({ text: "What is AI?", voice: voiceId });
+
+const outputPath = path.join(process.cwd(), 'test-outputs/open-aigenerate-test.mp3');
+writeFileSync(outputPath, audioResult);
+```
+
+## Related Methods
+
+For streaming audio responses, see the [`stream()`](./stream.mdx) method documentation.
diff --git a/docs/src/pages/docs/reference/tts/providers-and-models.mdx b/docs/src/pages/docs/reference/tts/providers-and-models.mdx
@@ -0,0 +1,8 @@
+# Providers and Models
+
+## Most popular providers
+
+| Provider      | Supported Models                                                                                                                                                         |
+| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| ElevenLabs    | `eleven_multilingual_v2`, `eleven_flash_v2_5`, `eleven_flash_v2`, `eleven_multilingual_sts_v2`, `eleven_english_sts_v2`                                                  |
+| OpenAI        | `tts-1`, `tts-1-hd`                                                                                                                                                      |
diff --git a/docs/src/pages/docs/reference/tts/stream.mdx b/docs/src/pages/docs/reference/tts/stream.mdx
@@ -0,0 +1,165 @@
+# `stream()`
+
+The `stream()` method is used to interact with the TTS model to produce an audio response stream. This method accepts `text` and `voice` as parameters.
+
+## Parameters
+
+
+<PropertiesTable
+  content={[
+    {
+      name: 'text',
+      type: 'string',
+      description: 'The text to be converted to speech.',
+    },
+    {
+      name: 'voice',
+      type: 'string',
+      description: 'Voice ID to be used with generation.',
+    },
+  ]}
+/>
+
+## Returns
+
+<PropertiesTable
+  content={[
+    {
+      name: 'audioResult',
+      type: 'Readable',
+      isOptional: true,
+      description: 'The generated audio stream',
+    },
+  ]}
+/>
+
+## Examples
+
+### Basic Audio Generation (ElevenLabs)
+
+```typescript
+import { ElevenLabsTTS } from '@mastra/tts'
+
+ const tts = new ElevenLabsTTS({
+    model: {
+      name: 'eleven_multilingual_v2',
+      apiKey: process.env.ELEVENLABS_API_KEY!,
+    },
+  });
+
+const voices = await tts.voices();
+const voiceId = voices?.[0]?.voice_id!;
+
+const { audioResult } = await tts.stream({ text: "What is AI?", voice: voiceId });
+
+// Create a write stream to simulate real-time playback
+const outputPath = path.join(process.cwd(), '/test-outputs/streaming-output.mp3');
+const writeStream = createWriteStream(outputPath);
+
+let firstChunkTime: number | null = null;
+let lastChunkTime: number | null = null;
+let totalChunks = 0;
+
+// Process chunks as they arrive
+for await (const chunk of audioResult) {
+    if (!firstChunkTime) {
+    firstChunkTime = Date.now();
+    }
+    lastChunkTime = Date.now();
+    totalChunks++;
+
+    // Write chunk immediately as it arrives
+    writeStream.write(chunk);
+
+    // Log timing of chunk arrival
+    console.log(`Received chunk ${totalChunks} at ${lastChunkTime - firstChunkTime!}ms`);
+}
+
+writeStream.end()
+```
+
+### Basic Audio Stream (ElevenLabs)
+
+```typescript
+import { ElevenLabsTTS } from '@mastra/tts'
+
+ const tts = new ElevenLabsTTS({
+    model: {
+      name: 'eleven_multilingual_v2',
+      apiKey: process.env.ELEVENLABS_API_KEY!,
+    },
+  });
+
+const voices = await tts.voices();
+const voiceId = voices?.[0]?.voice_id!;
+
+const { audioResult } = await tts.stream({ text: "What is AI?", voice: voiceId });
+
+// Create a write stream to simulate real-time playback
+const outputPath = path.join(process.cwd(), '/test-outputs/streaming-output.mp3');
+const writeStream = createWriteStream(outputPath);
+
+let firstChunkTime: number | null = null;
+let lastChunkTime: number | null = null;
+let totalChunks = 0;
+
+// Process chunks as they arrive
+for await (const chunk of audioResult) {
+    if (!firstChunkTime) {
+    firstChunkTime = Date.now();
+    }
+    lastChunkTime = Date.now();
+    totalChunks++;
+
+    // Write chunk immediately as it arrives
+    writeStream.write(chunk);
+
+    // Log timing of chunk arrival
+    console.log(`Received chunk ${totalChunks} at ${lastChunkTime - firstChunkTime!}ms`);
+}
+
+writeStream.end()
+```
+
+### Basic Audio Stream (OpenAI)
+
+```typescript
+import { OpenAITTS } from '@mastra/tts'
+
+ const tts = new OpenAITTS({
+    model: {
+      name: 'tts-1',
+      apiKey: process.env.OPENAI_API_KEY!,
+    },
+  });
+
+const voices = await tts.voices();
+const voiceId = voices?.[0]?.voice_id!;
+
+const { audioResult } = await tts.stream({ text: "What is AI?", voice: voiceId });
+
+// Create a write stream to simulate real-time playback
+const outputPath = path.join(process.cwd(), '/test-outputs/streaming-output.mp3');
+const writeStream = createWriteStream(outputPath);
+
+let firstChunkTime: number | null = null;
+let lastChunkTime: number | null = null;
+let totalChunks = 0;
+
+// Process chunks as they arrive
+for await (const chunk of audioResult) {
+    if (!firstChunkTime) {
+    firstChunkTime = Date.now();
+    }
+    lastChunkTime = Date.now();
+    totalChunks++;
+
+    // Write chunk immediately as it arrives
+    writeStream.write(chunk);
+
+    // Log timing of chunk arrival
+    console.log(`Received chunk ${totalChunks} at ${lastChunkTime - firstChunkTime!}ms`);
+}
+
+writeStream.end()
+```
diff --git a/package.json b/package.json
@@ -33,13 +33,13 @@
     "build:memory": "pnpm --filter ./packages/memory build",
     "build:toolsets": "pnpm --filter ./packages/toolsets build",
     "generate:integration": "pnpx tsx ./integration-generator/index.ts",
-    "test": "NODE_OPTIONS=\"--experimental-vm-modules --max-old-space-size=8192\"jest",
-    "test:watch": "NODE_OPTIONS=\"--experimental-vm-modules --max-old-space-size=8192\"jest --watch",
-    "test:cli": "NODE_OPTIONS=\"--experimental-vm-modules --max-old-space-size=8192\"jest --projects ./packages/cli --passWithNoTests",
-    "test:core": "NODE_OPTIONS=\"--experimental-vm-modules --max-old-space-size=8192\"jest --projects ./packages/core",
-    "test:engine": "NODE_OPTIONS=\"--experimental-vm-modules --max-old-space-size=8192\"jest --projects ./packages/engine",
-    "test:rag": "NODE_OPTIONS=\"--experimental-vm-modules --max-old-space-size=8192\"jest --projects ./packages/rag",
-    "test:memory": "NODE_OPTIONS=\"--experimental-vm-modules --max-old-space-size=8192\"jest --projects ./packages/memory",
+    "test": "NODE_OPTIONS=\"--experimental-vm-modules --max-old-space-size=8192\" jest",
+    "test:watch": "NODE_OPTIONS=\"--experimental-vm-modules --max-old-space-size=8192\" jest --watch",
+    "test:cli": "NODE_OPTIONS=\"--experimental-vm-modules --max-old-space-size=8192\" jest --projects ./packages/cli --passWithNoTests",
+    "test:core": "NODE_OPTIONS=\"--experimental-vm-modules --max-old-space-size=8192\" jest --projects ./packages/core",
+    "test:engine": "NODE_OPTIONS=\"--experimental-vm-modules --max-old-space-size=8192\" jest --projects ./packages/engine",
+    "test:rag": "NODE_OPTIONS=\"--experimental-vm-modules --max-old-space-size=8192\" jest --projects ./packages/rag",
+    "test:memory": "NODE_OPTIONS=\"--experimental-vm-modules --max-old-space-size=8192\" jest --projects ./packages/memory",
     "lint-staged": "lint-staged",
     "preinstall": "npx only-allow pnpm",
     "prepare": "husky",

diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts
@@ -13,3 +13,4 @@ export * from './memory';
 export * from './telemetry';
 export * from './utils';
 export * from './embeddings';
+export * from './tts';
diff --git a/packages/core/src/logger/index.ts b/packages/core/src/logger/index.ts
@@ -9,6 +9,7 @@ export const RegisteredLogger = {
   AGENT: 'AGENT',
   WORKFLOW: 'WORKFLOW',
   LLM: 'LLM',
+  TTS: 'TTS',
 } as const;
 
 export type RegisteredLogger = (typeof RegisteredLogger)[keyof typeof RegisteredLogger];

diff --git a/packages/core/src/tts/index.ts b/packages/core/src/tts/index.ts
@@ -0,0 +1,36 @@
+import { MastraBase } from '../base';
+import { InstrumentClass } from '../telemetry';
+
+interface BuiltInModelConfig { // Model settings passed to the MastraTTS constructor.
+  provider: string;
+  name: string; // Read by MastraTTS.traced() for the span name and the 'tts.type' attribute.
+  apiKey?: string;
+}
+
+@InstrumentClass({
+  prefix: 'tts',
+  excludeMethods: ['__setTools', '__setLogger', '__setTelemetry', '#log'],
+})
+export abstract class MastraTTS extends MastraBase { // Abstract base for TTS implementations; subclasses provide generate() and stream().
+  model: BuiltInModelConfig;
+  constructor({ model }: { model: BuiltInModelConfig }) {
+    super({
+      component: 'TTS',
+    });
+    this.model = model;
+  }
+  // Wraps `method` with telemetry.traceMethod (span `<model.name>-tts.<methodName>`); falls back to `method` itself when that yields null/undefined.
+  traced<T extends Function>(method: T, methodName: string): T {
+    return (
+      this.telemetry?.traceMethod(method, {
+        spanName: `${this.model.name}-tts.${methodName}`,
+        attributes: {
+          'tts.type': `${this.model.name}`,
+        },
+      }) ?? method
+    );
+  }
+  // Subclass contract: each takes the input text and returns a Promise; the resolved shape is untyped (`any`) at this level.
+  abstract generate({ text }: { text: string }): Promise<any>;
+  abstract stream({ text }: { text: string }): Promise<any>;
+}
diff --git a/packages/tts/.gitignore b/packages/tts/.gitignore
@@ -0,0 +1 @@
+.env
diff --git a/packages/tts/jest.config.ts b/packages/tts/jest.config.ts
@@ -0,0 +1,19 @@
+import { config } from 'dotenv';
+
+config(); // dotenv: runs before the Jest config object below is exported
+// Jest config: ts-jest preset, with .ts files treated as ESM.
+export default {
+  preset: 'ts-jest',
+  extensionsToTreatAsEsm: ['.ts'],
+  moduleNameMapper: {
+    '^(\\.{1,2}/.*)\\.js$': '$1', // maps relative specifiers ending in .js to the same path without the extension
+  },
+  transform: {
+    '^.+\\.tsx?$': [ // .ts/.tsx files go through ts-jest
+      'ts-jest',
+      {
+        useESM: true,
+      },
+    ],
+  },
+};