Skip to content

Commit

Permalink
Initial realtime voice assistant spike (#27)
Browse files Browse the repository at this point in the history
  • Loading branch information
bcherry committed Sep 12, 2024
1 parent d2a31f3 commit 1063d2a
Show file tree
Hide file tree
Showing 16 changed files with 751 additions and 8 deletions.
6 changes: 6 additions & 0 deletions .changeset/real-grapes-change.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
"@livekit/agents": patch
"@livekit/agents-plugin-openai": patch
---

Add transcript support to realtime voice assistant
1 change: 1 addition & 0 deletions .envrc
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
use flake
2 changes: 1 addition & 1 deletion agents/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
"typescript": "^5.0.0"
},
"dependencies": {
"@livekit/rtc-node": "^0.7.0",
"@livekit/rtc-node": "^0.8.0",
"@livekit/protocol": "^1.21.0",
"commander": "^12.0.0",
"livekit-server-sdk": "^2.6.1",
Expand Down
61 changes: 61 additions & 0 deletions agents/src/audio.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
//
// SPDX-License-Identifier: Apache-2.0
import { AudioFrame } from '@livekit/rtc-node';
import { log } from './log.js';

/**
 * Re-chunks a stream of raw PCM bytes into fixed-size {@link AudioFrame}s.
 *
 * Incoming bytes are buffered until at least one full frame is available.
 * Samples are 16-bit (2 bytes each), so one frame occupies
 * `numChannels * samplesPerChannel * 2` bytes.
 */
export class AudioByteStream {
  private sampleRate: number;
  private numChannels: number;
  private bytesPerFrame: number;
  private buf: Int8Array;

  /**
   * @param sampleRate - Sample rate, in Hz, stamped on every emitted frame.
   * @param numChannels - Channel count stamped on every emitted frame.
   * @param samplesPerChannel - Samples per channel in each emitted frame.
   *   Defaults to 20ms worth of audio (`sampleRate / 50`, rounded down).
   * @throws RangeError if the resulting frame size is not a positive whole
   *   number of bytes.
   */
  constructor(sampleRate: number, numChannels: number, samplesPerChannel: number | null = null) {
    this.sampleRate = sampleRate;
    this.numChannels = numChannels;

    if (samplesPerChannel === null) {
      samplesPerChannel = Math.floor(sampleRate / 50); // 20ms by default
    }

    this.bytesPerFrame = numChannels * samplesPerChannel * 2; // 2 bytes per sample (Int16)
    // A zero-sized frame would make the loop in write() spin forever, and a
    // fractional one cannot be sliced out of a byte buffer.
    if (!Number.isInteger(this.bytesPerFrame) || this.bytesPerFrame <= 0) {
      throw new RangeError(
        `AudioByteStream: invalid frame size of ${this.bytesPerFrame} bytes ` +
          `(sampleRate=${sampleRate}, numChannels=${numChannels}, samplesPerChannel=${samplesPerChannel})`,
      );
    }

    this.buf = new Int8Array();
  }

  /**
   * Appends `data` to the internal buffer and returns every complete frame
   * that can now be produced. Leftover bytes are kept for the next call.
   *
   * @param data - Raw bytes to append.
   * @returns Zero or more frames, in stream order.
   */
  write(data: ArrayBuffer): AudioFrame[] {
    // Concatenate with set() rather than array spread: spreading boxes every
    // byte into a temporary JS array.
    const incoming = new Int8Array(data);
    const pending = new Int8Array(this.buf.length + incoming.length);
    pending.set(this.buf, 0);
    pending.set(incoming, this.buf.length);

    const frames: AudioFrame[] = [];
    let offset = 0;
    while (pending.length - offset >= this.bytesPerFrame) {
      // slice() copies, so each frame owns a buffer of exactly one frame.
      const frameData = pending.slice(offset, offset + this.bytesPerFrame);
      offset += this.bytesPerFrame;

      frames.push(
        new AudioFrame(
          new Int16Array(frameData.buffer),
          this.sampleRate,
          this.numChannels,
          // bytes -> samples (/2) -> samples per channel (/numChannels)
          frameData.length / (2 * this.numChannels),
        ),
      );
    }

    // Keep only the unconsumed tail; the remainder is copied once per call.
    this.buf = pending.slice(offset);

    return frames;
  }

  /**
   * Emits whatever is left in the buffer as one final, possibly shorter,
   * frame and empties the buffer.
   *
   * @returns A single-frame array, or an empty array when the buffer is empty
   *   or holds a partial sample (in which case the bytes are dropped and a
   *   warning is logged).
   */
  flush(): AudioFrame[] {
    const remaining = this.buf;
    // Always reset, so flushed or dropped bytes are never emitted twice.
    this.buf = new Int8Array();

    if (remaining.length === 0) {
      return [];
    }

    if (remaining.length % (2 * this.numChannels) !== 0) {
      log().warn('AudioByteStream: incomplete frame during flush, dropping');
      return [];
    }

    return [
      new AudioFrame(
        new Int16Array(remaining.buffer),
        this.sampleRate,
        this.numChannels,
        remaining.length / (2 * this.numChannels),
      ),
    ];
  }
}
1 change: 1 addition & 0 deletions agents/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,6 @@ export * from './utils.js';
export * from './log.js';
export * from './generator.js';
export * from './tokenize.js';
export * from './audio.js';

export { cli, stt, tts };
35 changes: 34 additions & 1 deletion agents/src/utils.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,13 @@
// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
//
// SPDX-License-Identifier: Apache-2.0
import { AudioFrame } from '@livekit/rtc-node';
import type {
LocalParticipant,
RemoteParticipant,
Room,
TrackPublication,
} from '@livekit/rtc-node';
import { AudioFrame, TrackSource } from '@livekit/rtc-node';
import { EventEmitter, once } from 'events';

export type AudioBuffer = AudioFrame[] | AudioFrame;
Expand Down Expand Up @@ -42,6 +48,33 @@ export const mergeFrames = (buffer: AudioBuffer): AudioFrame => {
return buffer;
};

/**
 * Looks up the track SID of a participant's first microphone publication.
 *
 * The local participant is used when `identity` matches it; otherwise the
 * participant is looked up among the room's remote participants.
 *
 * @param room - Room whose participants are searched.
 * @param identity - Identity of the participant to look up.
 * @returns The SID of the first microphone publication that has one.
 * @throws Error if the participant is not in the room, or has no microphone
 *   track.
 */
export const findMicroTrackId = (room: Room, identity: string): string => {
  const participant =
    identity === room.localParticipant?.identity
      ? room.localParticipant
      : room.remoteParticipants.get(identity);

  if (!participant) {
    throw new Error(`participant ${identity} not found`);
  }

  // Stop at the first microphone track. A `return` inside `forEach` only
  // exits the callback, which would leave the last match as the result.
  for (const publication of participant.trackPublications.values()) {
    if (publication.source === TrackSource.SOURCE_MICROPHONE && publication.sid) {
      return publication.sid;
    }
  }

  throw new Error(`participant ${identity} does not have a microphone track`);
};

/** @internal */
export class Mutex {
#locking: Promise<void>;
Expand Down
3 changes: 2 additions & 1 deletion examples/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
"dependencies": {
"@livekit/agents": "workspace:*",
"@livekit/agents-plugin-elevenlabs": "workspace:*",
"@livekit/rtc-node": "^0.7.0"
"@livekit/agents-plugin-openai": "workspace:*",
"@livekit/rtc-node": "^0.8.0"
}
}
28 changes: 28 additions & 0 deletions examples/src/minimal_assistant.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
//
// SPDX-License-Identifier: Apache-2.0
import { type JobContext, WorkerOptions, cli, defineAgent } from '@livekit/agents';
import { VoiceAssistant, defaultInferenceConfig } from '@livekit/agents-plugin-openai';

/**
 * Example agent: connects to the job's room, then starts a `VoiceAssistant`
 * in it after a short delay.
 */
export default defineAgent({
  entry: async (ctx: JobContext) => {
    await ctx.connect();

    console.log('starting assistant example agent');

    // Builds the assistant from the default inference config and attaches it to the room.
    const launchAssistant = (): void => {
      const assistant = new VoiceAssistant({
        ...defaultInferenceConfig,
        system_message: 'You talk unprompted.',
      });
      assistant.start(ctx.room);
    };

    // FIXME: for some reason the remoteParticipants are not being populated at connection time,
    // nor is onParticipantConnected being called, so the start is delayed by 500ms as a workaround.
    setTimeout(launchAssistant, 500);
  },
});

// Only launch the worker when this file is executed directly rather than imported.
// Without this guard, our code would start a new Agents process on every job process.
const entryScript = process.argv[1];
if (entryScript === import.meta.filename) {
  cli.runApp(new WorkerOptions({ agent: import.meta.filename }));
}
20 changes: 20 additions & 0 deletions plugins/openai/api-extractor.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
/**
* Config file for API Extractor. For more info, please visit: https://api-extractor.com
*/
{
"$schema": "https://developer.microsoft.com/json-schemas/api-extractor/v7/api-extractor.schema.json",

/**
* Optionally specifies another JSON config file that this file extends from. This provides a way for
* standard settings to be shared across multiple projects.
*
* If the path starts with "./" or "../", the path is resolved relative to the folder of the file that contains
* the "extends" field. Otherwise, the first path segment is interpreted as an NPM package name, and will be
* resolved using NodeJS require().
*
* SUPPORTED TOKENS: none
* DEFAULT VALUE: ""
*/
"extends": "../../api-extractor-shared.json",
"mainEntryPointFilePath": "./dist/index.d.ts"
}
25 changes: 25 additions & 0 deletions plugins/openai/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
{
"name": "@livekit/agents-plugin-openai",
"version": "0.1.0",
"description": "OpenAI plugin for LiveKit Node Agents",
"main": "dist/index.js",
"types": "dist/index.d.ts",
"author": "LiveKit",
"type": "module",
"scripts": {
"build": "tsc",
"lint": "eslint -f unix \"src/**/*.{ts,js}\"",
"api:check": "api-extractor run --typescript-compiler-folder ../../node_modules/typescript",
"api:update": "api-extractor run --local --typescript-compiler-folder ../../node_modules/typescript --verbose"
},
"devDependencies": {
"@microsoft/api-extractor": "^7.35.0",
"@types/ws": "^8.5.10",
"typescript": "^5.0.0"
},
"dependencies": {
"@livekit/agents": "workspace:*",
"@livekit/rtc-node": "^0.8.0",
"ws": "^8.16.0"
}
}
5 changes: 5 additions & 0 deletions plugins/openai/src/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
// SPDX-FileCopyrightText: 2024 LiveKit, Inc.
//
// SPDX-License-Identifier: Apache-2.0

export * from './voice_assistant/index.js';
Loading

0 comments on commit 1063d2a

Please sign in to comment.