feat (ai/core): multi-part tool results (incl. images) #3362

Merged · 34 commits · Oct 25, 2024
9 changes: 9 additions & 0 deletions .changeset/unlucky-wasps-shout.md
@@ -0,0 +1,9 @@
---
'@ai-sdk/provider-utils': patch
'@ai-sdk/anthropic': patch
'@ai-sdk/provider': patch
'@ai-sdk/ui-utils': patch
'ai': patch
---

feat (ai/core): multi-part tool results (incl. images)
49 changes: 49 additions & 0 deletions content/docs/02-foundations/03-prompts.mdx
@@ -360,6 +360,55 @@ const result = await generateText({
});
```

#### Multi-modal Tool Results

<Note type="warning">
Multi-part tool results are experimental and only supported by Anthropic.
</Note>

Tool results can be multi-part and multi-modal, e.g. text combined with an image.
You can use the `experimental_content` property on tool result parts to specify multi-part tool results.

```ts highlight="20-32"
const result = await generateText({
model: yourModel,
messages: [
// ...
{
role: 'tool',
content: [
{
type: 'tool-result',
toolCallId: '12345', // needs to match the tool call id
toolName: 'get-nutrition-data',
// for models that do not support multi-part tool results,
// you can include a regular result part:
result: {
name: 'Cheese, roquefort',
calories: 369,
fat: 31,
protein: 22,
},
// for models that support multi-part tool results,
// you can include a multi-part content part:
content: [
{
type: 'text',
text: 'Here is an image of the nutrition data for the cheese:',
},
{
type: 'image',
data: fs.readFileSync('./data/roquefort-nutrition-data.png'),
mimeType: 'image/png',
},
],
},
],
},
],
});
```

### System Messages

System messages are sent to the model before the user messages to guide the assistant's behavior.
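
As a minimal illustrative sketch (reusing the `yourModel` placeholder from the examples above), a system message can be combined with a prompt like this:

```ts
const result = await generateText({
  model: yourModel,
  // the system message steers the assistant's behavior for the whole request:
  system: 'You are a concise nutrition assistant. Answer with verified facts only.',
  prompt: 'How many calories are in 100g of Roquefort?',
});
```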
52 changes: 52 additions & 0 deletions content/docs/03-ai-sdk-core/15-tools-and-tool-calling.mdx
@@ -276,6 +276,10 @@ async function generateSomething(prompt: string): Promise<{

## Active Tools

<Note type="warning">
The `activeTools` property is experimental and may change in the future.
</Note>

Language models can only handle a limited number of tools at a time, depending on the model.
To retain static typing over a large set of tools while limiting which tools are available to the model,
the AI SDK provides the `experimental_activeTools` property.
@@ -294,6 +298,54 @@ const { text } = await generateText({
});
```
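
As an illustrative sketch of the idea (the model, tool names, and tool definitions here are placeholders, not part of this change):

```ts
const { text } = await generateText({
  model: yourModel,
  tools: {
    getWeather: weatherTool,
    getStockPrice: stockPriceTool,
  },
  // the full tool set keeps its static types, but only getWeather
  // is exposed to the model for this call:
  experimental_activeTools: ['getWeather'],
  prompt: 'What is the weather in San Francisco?',
});
```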

## Multi-modal Tool Results

<Note type="warning">
Multi-modal tool results are experimental and only supported by Anthropic.
</Note>

To send multi-modal tool results, e.g. screenshots, back to the model,
they need to be converted into a specific format.

AI SDK Core tools have an optional `experimental_toToolResultContent` function
that converts the tool result into a content part.

Here is an example of converting a screenshot into a content part:

```ts highlight="22-27"
const result = await generateText({
model: anthropic('claude-3-5-sonnet-20241022'),
tools: {
computer: anthropic.tools.computer_20241022({
// ...
async execute({ action, coordinate, text }) {
switch (action) {
case 'screenshot': {
return {
type: 'image',
data: fs
.readFileSync('./data/screenshot-editor.png')
.toString('base64'),
};
}
default: {
return `executed ${action}`;
}
}
},

// map to tool result content for LLM consumption:
experimental_toToolResultContent(result) {
return typeof result === 'string'
? [{ type: 'text', text: result }]
: [{ type: 'image', data: result.data, mimeType: 'image/png' }];
},
}),
},
// ...
});
```

## Examples

You can see tools in action using various frameworks in the following examples:
63 changes: 54 additions & 9 deletions content/docs/07-reference/01-ai-sdk-core/20-tool.mdx
@@ -52,26 +52,71 @@ export const weatherTool = tool({
isOptional: true,
type: 'string',
description:
'Information about the purpose of the tool including details on how and when it can be used by the model.',
},
{
name: 'parameters',
type: 'Zod Schema | JSON Schema',
description:
'The schema of the input that the tool expects. The language model will use this to generate the input. It is also used to validate the output of the language model. Use descriptions to make the input understandable for the language model. You can either pass in a Zod schema or a JSON schema (using the `jsonSchema` function).',
},
{
name: 'execute',
isOptional: true,
type: 'async (parameters: T, options: { abortSignal: AbortSignal }) => RESULT',
description:
'An async function that is called with the arguments from the tool call and produces a result. If not provided, the tool will not be executed automatically.',
},
{
  name: 'experimental_toToolResultContent',
  isOptional: true,
  type: '(result: RESULT) => TextToolResultContent | ImageToolResultContent',
  description:
    'An optional function that converts the result of the tool call to a content object that can be used in LLM messages.',
  properties: [
    {
      type: 'TextToolResultContent',
      parameters: [
        {
          name: 'type',
          type: "'text'",
          description: 'The type of the tool result content.',
        },
        {
          name: 'text',
          type: 'string',
          description: 'The content of the message.',
        },
      ],
    },
    {
      type: 'ImageToolResultContent',
      parameters: [
        {
          name: 'type',
          type: "'image'",
          description: 'The type of the tool result content.',
        },
        {
          name: 'data',
          type: 'string',
          description: 'The base64 encoded png image.',
        },
        {
          name: 'mimeType',
          isOptional: true,
          type: 'string',
          description: 'The mime type of the image.',
        },
      ],
    },
  ],
},
],
},
],
},
]}
/>

### Returns
21 changes: 21 additions & 0 deletions content/docs/07-reference/01-ai-sdk-core/30-core-message.mdx
@@ -173,9 +173,30 @@ export interface ToolResultPart {
*/
result: unknown;

/**
* Multi-part content of the tool result. Only for tools that support multipart results.
*/
experimental_content?: ToolResultContent;

/**
* Optional flag if the result is an error or an error message.
*/
isError?: boolean;
}
```

### `ToolResultContent`

```ts
export type ToolResultContent = Array<
| {
type: 'text';
text: string;
}
| {
type: 'image';
data: string; // base64 encoded png image, e.g. screenshot
mimeType?: string; // e.g. 'image/png';
}
>;
```
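
For illustration, a value of this type could be constructed like this (the `screenshotBase64` string is a hypothetical placeholder for base64-encoded PNG data):

```ts
const content: ToolResultContent = [
  { type: 'text', text: 'Here is the requested screenshot:' },
  // e.g. produced via fs.readFileSync('./screenshot.png').toString('base64')
  { type: 'image', data: screenshotBase64, mimeType: 'image/png' },
];
```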
27 changes: 27 additions & 0 deletions content/providers/01-ai-sdk-providers/05-anthropic.mdx
@@ -238,9 +238,36 @@ const computerTool = anthropic.tools.computer_20241022({
displayWidthPx: 1920,
displayHeightPx: 1080,
displayNumber: 0, // Optional, for X11 environments

execute: async ({ action, coordinate, text }) => {
// Implement your computer control logic here
// Return the result of the action

// Example code:
switch (action) {
case 'screenshot': {
// multipart result:
return {
type: 'image',
data: fs
.readFileSync('./data/screenshot-editor.png')
.toString('base64'),
};
}
default: {
console.log('Action:', action);
console.log('Coordinate:', coordinate);
console.log('Text:', text);
return `executed ${action}`;
}
}
},

// map to tool result content for LLM consumption:
experimental_toToolResultContent(result) {
return typeof result === 'string'
? [{ type: 'text', text: result }]
: [{ type: 'image', data: result.data, mimeType: 'image/png' }];
},
});
```
Binary file added examples/ai-core/data/screenshot-editor.png
31 changes: 31 additions & 0 deletions examples/ai-core/src/generate-text/anthropic-computer-use-bash.ts
@@ -0,0 +1,31 @@
import { anthropic } from '@ai-sdk/anthropic';
import { generateText } from 'ai';
import 'dotenv/config';

async function main() {
const result = await generateText({
model: anthropic('claude-3-5-sonnet-20241022'),
tools: {
bash: anthropic.tools.bash_20241022({
async execute({ command }) {
console.log('COMMAND', command);
return [
{
type: 'text',
text: `
❯ ls
README.md build data node_modules package.json src tsconfig.json
`,
},
];
},
}),
},
prompt: 'List the files in my home directory.',
maxSteps: 2,
});

console.log(result.text);
}

main().catch(console.error);
@@ -0,0 +1,53 @@
import { anthropic } from '@ai-sdk/anthropic';
import { generateText } from 'ai';
import 'dotenv/config';
import fs from 'node:fs';

async function main() {
const result = await generateText({
model: anthropic('claude-3-5-sonnet-20241022'),
tools: {
computer: anthropic.tools.computer_20241022({
displayWidthPx: 1024,
displayHeightPx: 768,

async execute({ action, coordinate, text }) {
console.log('args', { action, coordinate, text });
switch (action) {
case 'screenshot': {
// multipart result:
return {
type: 'image',
data: fs
.readFileSync('./data/screenshot-editor.png')
.toString('base64'),
};
}
default: {
console.log('Action:', action);
console.log('Coordinate:', coordinate);
console.log('Text:', text);
return `executed ${action}`;
}
}
},

// map to tool result content for LLM consumption:
experimental_toToolResultContent(result) {
return typeof result === 'string'
? [{ type: 'text', text: result }]
: [{ type: 'image', data: result.data, mimeType: 'image/png' }];
},
}),
},
prompt:
'How can I switch to dark mode? Take a look at the screen and tell me.',
maxSteps: 5,
});

console.log(result.text);
console.log(result.finishReason);
console.log(JSON.stringify(result.toolCalls, null, 2));
}

main().catch(console.error);