Initial realtime voice assistant spike (#27)

livekit · Sep 12, 2024 · 1063d2a · 1063d2a
1 parent d2a31f3
commit 1063d2a
Show file tree

Hide file tree

Showing 16 changed files with 751 additions and 8 deletions.
diff --git a/.changeset/real-grapes-change.md b/.changeset/real-grapes-change.md
@@ -0,0 +1,6 @@
+---
+"@livekit/agents": patch
+"@livekit/agents-plugin-openai": patch
+---
+
+Add transcript support to realtime voice assistant
diff --git a/.envrc b/.envrc
@@ -0,0 +1 @@
+use flake
diff --git a/agents/package.json b/agents/package.json
@@ -18,7 +18,7 @@
     "typescript": "^5.0.0"
   },
   "dependencies": {
-    "@livekit/rtc-node": "^0.7.0",
+    "@livekit/rtc-node": "^0.8.0",
     "@livekit/protocol": "^1.21.0",
     "commander": "^12.0.0",
     "livekit-server-sdk": "^2.6.1",

diff --git a/agents/src/audio.ts b/agents/src/audio.ts
@@ -0,0 +1,61 @@
+// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+import { AudioFrame } from '@livekit/rtc-node';
+import { log } from './log.js';
+
+/**
+ * Re-chunks an arbitrary stream of 16-bit PCM bytes into fixed-size `AudioFrame`s.
+ *
+ * Bytes handed to `write` are accumulated until at least one full frame is available; any
+ * remainder is kept for the next `write` or `flush` call.
+ */
+export class AudioByteStream {
+  private sampleRate: number;
+  private numChannels: number;
+  private bytesPerFrame: number;
+  private buf: Int8Array;
+
+  /**
+   * @param sampleRate - sample rate passed through to every emitted frame
+   * @param numChannels - number of interleaved channels in the byte stream
+   * @param samplesPerChannel - samples per channel in each emitted frame; when `null`,
+   *   defaults to `floor(sampleRate / 50)` (20ms of audio)
+   */
+  constructor(sampleRate: number, numChannels: number, samplesPerChannel: number | null = null) {
+    this.sampleRate = sampleRate;
+    this.numChannels = numChannels;
+
+    if (samplesPerChannel === null) {
+      samplesPerChannel = Math.floor(sampleRate / 50); // 20ms by default
+    }
+
+    this.bytesPerFrame = numChannels * samplesPerChannel * 2; // 2 bytes per sample (Int16)
+    this.buf = new Int8Array();
+  }
+
+  /**
+   * Appends `data` to the internal buffer and returns every complete frame now available.
+   *
+   * @param data - raw bytes of interleaved Int16 samples
+   * @returns zero or more frames of exactly `bytesPerFrame` bytes each
+   */
+  write(data: ArrayBuffer): AudioFrame[] {
+    // Concatenate with `set` instead of spreading both arrays element by element.
+    const incoming = new Int8Array(data);
+    const merged = new Int8Array(this.buf.length + incoming.length);
+    merged.set(this.buf, 0);
+    merged.set(incoming, this.buf.length);
+    this.buf = merged;
+
+    const frames: AudioFrame[] = [];
+    while (this.buf.length >= this.bytesPerFrame) {
+      // `slice` copies, so each frame owns a buffer holding exactly its own bytes.
+      const frameData = this.buf.slice(0, this.bytesPerFrame);
+      this.buf = this.buf.slice(this.bytesPerFrame);
+      frames.push(this.toFrame(frameData));
+    }
+
+    return frames;
+  }
+
+  /**
+   * Emits whatever is left in the buffer as one final (possibly short) frame and clears the
+   * buffer, so the same bytes are never emitted twice.
+   *
+   * @returns a single frame, or an empty array when nothing is buffered or when the leftover
+   *   bytes do not form whole samples for every channel (those bytes are dropped)
+   */
+  flush(): AudioFrame[] {
+    const pending = this.buf;
+    this.buf = new Int8Array();
+
+    if (pending.length === 0) {
+      return [];
+    }
+
+    if (pending.length % (2 * this.numChannels) !== 0) {
+      log().warn('AudioByteStream: incomplete frame during flush, dropping');
+      return [];
+    }
+
+    return [this.toFrame(pending)];
+  }
+
+  /** Wraps bytes that exclusively own their underlying buffer into an `AudioFrame`. */
+  private toFrame(bytes: Int8Array): AudioFrame {
+    return new AudioFrame(
+      new Int16Array(bytes.buffer),
+      this.sampleRate,
+      this.numChannels,
+      // 2 bytes per sample, interleaved across channels
+      bytes.length / 2 / this.numChannels,
+    );
+  }
+}
diff --git a/agents/src/index.ts b/agents/src/index.ts
@@ -14,5 +14,6 @@ export * from './utils.js';
 export * from './log.js';
 export * from './generator.js';
 export * from './tokenize.js';
+export * from './audio.js';
 
 export { cli, stt, tts };
diff --git a/agents/src/utils.ts b/agents/src/utils.ts
@@ -1,7 +1,13 @@
 // SPDX-FileCopyrightText: 2024 LiveKit, Inc.
 //
 // SPDX-License-Identifier: Apache-2.0
-import { AudioFrame } from '@livekit/rtc-node';
+import type {
+  LocalParticipant,
+  RemoteParticipant,
+  Room,
+  TrackPublication,
+} from '@livekit/rtc-node';
+import { AudioFrame, TrackSource } from '@livekit/rtc-node';
 import { EventEmitter, once } from 'events';
 
 export type AudioBuffer = AudioFrame[] | AudioFrame;
@@ -42,6 +48,33 @@ export const mergeFrames = (buffer: AudioBuffer): AudioFrame => {
   return buffer;
 };
 
+/**
+ * Looks up the SID of the first microphone track published by a participant.
+ *
+ * @param room - room whose local and remote participants are searched
+ * @param identity - identity of the participant to look up
+ * @returns the SID of the first publication whose source is `TrackSource.SOURCE_MICROPHONE`
+ * @throws Error when no participant has that identity, or when the participant has no
+ *   microphone track
+ */
+export const findMicroTrackId = (room: Room, identity: string): string => {
+  // The local participant takes precedence over a remote one with the same identity.
+  const p: RemoteParticipant | LocalParticipant | undefined =
+    identity === room.localParticipant?.identity
+      ? room.localParticipant
+      : room.remoteParticipants.get(identity);
+
+  if (!p) {
+    throw new Error(`participant ${identity} not found`);
+  }
+
+  // find first micro track
+  let trackId: string | undefined;
+  p.trackPublications.forEach((track: TrackPublication) => {
+    // `return` cannot break out of `forEach`, so ignore later matches once one is found.
+    if (!trackId && track.source === TrackSource.SOURCE_MICROPHONE) {
+      trackId = track.sid;
+    }
+  });
+
+  if (!trackId) {
+    throw new Error(`participant ${identity} does not have a microphone track`);
+  }
+
+  return trackId;
+};
+
 /** @internal */
 export class Mutex {
   #locking: Promise<void>;

diff --git a/examples/package.json b/examples/package.json
@@ -12,6 +12,7 @@
   "dependencies": {
     "@livekit/agents": "workspace:*",
     "@livekit/agents-plugin-elevenlabs": "workspace:*",
-    "@livekit/rtc-node": "^0.7.0"
+    "@livekit/agents-plugin-openai": "workspace:*",
+    "@livekit/rtc-node": "^0.8.0"
   }
 }
diff --git a/examples/src/minimal_assistant.ts b/examples/src/minimal_assistant.ts
@@ -0,0 +1,28 @@
+// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+import { type JobContext, WorkerOptions, cli, defineAgent } from '@livekit/agents';
+import { VoiceAssistant, defaultInferenceConfig } from '@livekit/agents-plugin-openai';
+
+// Example agent: connects to the job's room, then starts a realtime voice assistant in it.
+export default defineAgent({
+  entry: async (ctx: JobContext) => {
+    await ctx.connect();
+
+    console.log('starting assistant example agent');
+
+    // Builds the assistant from the default inference config and attaches it to the room.
+    const launchAssistant = () => {
+      const assistant = new VoiceAssistant({
+        ...defaultInferenceConfig,
+        system_message: 'You talk unprompted.',
+      });
+      assistant.start(ctx.room);
+    };
+
+    // FIXME: for some reason the remoteParticipants are not being populated at connection time nor calling onParticipantConnected
+    setTimeout(launchAssistant, 500);
+  },
+});
+
+// Check that we are running this file directly rather than importing functions from it.
+// Without this `if` guard, our code would start a new Agents process on every job process.
+if (process.argv[1] === import.meta.filename) {
+  cli.runApp(new WorkerOptions({ agent: import.meta.filename }));
+}
diff --git a/plugins/openai/api-extractor.json b/plugins/openai/api-extractor.json
@@ -0,0 +1,20 @@
+/**
+ * Config file for API Extractor.  For more info, please visit: https://api-extractor.com
+ */
+{
+  "$schema": "https://developer.microsoft.com/json-schemas/api-extractor/v7/api-extractor.schema.json",
+
+  /**
+   * Optionally specifies another JSON config file that this file extends from.  This provides a way for
+   * standard settings to be shared across multiple projects.
+   *
+   * If the path starts with "./" or "../", the path is resolved relative to the folder of the file that contains
+   * the "extends" field.  Otherwise, the first path segment is interpreted as an NPM package name, and will be
+   * resolved using NodeJS require().
+   *
+   * SUPPORTED TOKENS: none
+   * DEFAULT VALUE: ""
+   */
+  "extends": "../../api-extractor-shared.json",
+  "mainEntryPointFilePath": "./dist/index.d.ts"
+}
diff --git a/plugins/openai/package.json b/plugins/openai/package.json
@@ -0,0 +1,25 @@
+{
+  "name": "@livekit/agents-plugin-openai",
+  "version": "0.1.0",
+  "description": "OpenAI plugin for LiveKit Node Agents",
+  "main": "dist/index.js",
+  "types": "dist/index.d.ts",
+  "author": "LiveKit",
+  "type": "module",
+  "scripts": {
+    "build": "tsc",
+    "lint": "eslint -f unix \"src/**/*.{ts,js}\"",
+    "api:check": "api-extractor run --typescript-compiler-folder ../../node_modules/typescript",
+    "api:update": "api-extractor run --local --typescript-compiler-folder ../../node_modules/typescript --verbose"
+  },
+  "devDependencies": {
+    "@microsoft/api-extractor": "^7.35.0",
+    "@types/ws": "^8.5.10",
+    "typescript": "^5.0.0"
+  },
+  "dependencies": {
+    "@livekit/agents": "workspace:*",
+    "@livekit/rtc-node": "^0.8.0",
+    "ws": "^8.16.0"
+  }
+}
diff --git a/plugins/openai/src/index.ts b/plugins/openai/src/index.ts
@@ -0,0 +1,5 @@
+// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
+//
+// SPDX-License-Identifier: Apache-2.0
+
+export * from './voice_assistant/index.js';