Merge pull request #47 from WqyJh/main

feat: support o200k_base (gpt-4o)
pkoukk · May 21, 2024 · 7defbfc · 7defbfc
2 parents 5fef437 + 14f6877
commit 7defbfc
Show file tree

Hide file tree

Showing 11 changed files with 130 additions and 32 deletions.
diff --git a/README.md b/README.md
@@ -176,6 +176,7 @@ func NumTokensFromMessages(messages []openai.ChatCompletionMessage, model string
 # Available Encodings
  | Encoding name           | OpenAI models                                        |
  | ----------------------- | ---------------------------------------------------- |
+ | `o200k_base`            | `gpt-4o`                                             |
  | `cl100k_base`           | `gpt-4`, `gpt-3.5-turbo`, `text-embedding-ada-002`, `text-embedding-3-small`, `text-embedding-3-large`   |
  | `p50k_base`             | Codex models, `text-davinci-002`, `text-davinci-003` |
  | `r50k_base` (or `gpt2`) | GPT-3 models like `davinci`                          |
@@ -185,8 +186,10 @@ func NumTokensFromMessages(messages []openai.ChatCompletionMessage, model string
 # Available Models
 | Model name                   | Encoding      |
 | ---------------------------- | ------------- |
+| gpt-4o-*                     | o200k_base    |
 | gpt-4-*                      | cl100k_base   |
 | gpt-3.5-turbo-*              | cl100k_base   |
+| gpt-4o                       | o200k_base    |
 | gpt-4                        | cl100k_base   |
 | gpt-3.5-turbo                | cl100k_base   |
 | text-davinci-003             | p50k_base     |
@@ -254,5 +257,14 @@ Or maybe my benchmark method is not appropriate.
 
 If you have better benchmark method or if you want add your benchmark result, please feel free to submit a PR.
 
+The new `o200k_base` encoding appears to be slower than `cl100k_base`. In the following benchmark, tiktoken-go is slightly slower than tiktoken for both encodings.
+
+| name        | encoding | time/op | os         | cpu      | text                             | times  |
+| ----------- | ------- | ------- | ---------- | -------- | -------------------------------- | ------ |
+| tiktoken-go | o200k_base | 108522 ns  | Ubuntu 22.04 | AMD Ryzen 9 5900HS | [UDHR](http://research.ics.aalto.fi/cog/data/udhr/) | 100000 |
+| tiktoken    | o200k_base | 70198 ns  | Ubuntu 22.04 | AMD Ryzen 9 5900HS | [UDHR](http://research.ics.aalto.fi/cog/data/udhr/) | 100000 |
+| tiktoken-go | cl100k_base | 94502 ns  | Ubuntu 22.04 | AMD Ryzen 9 5900HS | [UDHR](http://research.ics.aalto.fi/cog/data/udhr/) | 100000 |
+| tiktoken    | cl100k_base | 54642 ns  | Ubuntu 22.04 | AMD Ryzen 9 5900HS | [UDHR](http://research.ics.aalto.fi/cog/data/udhr/) | 100000 |
+
 # License
 [MIT](./LICENSE)
diff --git a/README_zh-hans.md b/README_zh-hans.md
@@ -169,6 +169,7 @@ func NumTokensFromMessages(messages []openai.ChatCompletionMessage, model string
 # available encodings
  | Encoding name           | OpenAI models                                        |
  | ----------------------- | ---------------------------------------------------- |
+ | `o200k_base`            | `gpt-4o`                                             |
  | `cl100k_base`           | `gpt-4`, `gpt-3.5-turbo`, `text-embedding-ada-002`, `text-embedding-3-small`, `text-embedding-3-large`   |
  | `p50k_base`             | Codex models, `text-davinci-002`, `text-davinci-003` |
  | `r50k_base` (or `gpt2`) | GPT-3 models like `davinci`                          |
@@ -177,6 +178,7 @@ func NumTokensFromMessages(messages []openai.ChatCompletionMessage, model string
 # available models
 | Model name                   | Encoding      |
 | ---------------------------- | ------------- |
+| gpt-4o-*                     | o200k_base    |
 | gpt-4                        | cl100k_base   |
 | gpt-4-*                      | cl100k_base   |
 | gpt-3.5-turbo                | cl100k_base   |
@@ -237,5 +239,14 @@ func NumTokensFromMessages(messages []openai.ChatCompletionMessage, model string
 
 如果你有更好的测试方法，或者说你想添加在你机器上的测试结果，欢迎提PR。
 
+新的 `o200k_base` 编码看起来比 `cl100k_base` 慢。在以下基准测试中，tiktoken-go 比 tiktoken 略慢。
+
+| name        | encoding | time/op | os         | cpu      | text                             | times  |
+| ----------- | ------- | ------- | ---------- | -------- | -------------------------------- | ------ |
+| tiktoken-go | o200k_base | 108522 ns  | Ubuntu 22.04 | AMD Ryzen 9 5900HS | [UDHR](http://research.ics.aalto.fi/cog/data/udhr/) | 100000 |
+| tiktoken    | o200k_base | 70198 ns  | Ubuntu 22.04 | AMD Ryzen 9 5900HS | [UDHR](http://research.ics.aalto.fi/cog/data/udhr/) | 100000 |
+| tiktoken-go | cl100k_base | 94502 ns  | Ubuntu 22.04 | AMD Ryzen 9 5900HS | [UDHR](http://research.ics.aalto.fi/cog/data/udhr/) | 100000 |
+| tiktoken    | cl100k_base | 54642 ns  | Ubuntu 22.04 | AMD Ryzen 9 5900HS | [UDHR](http://research.ics.aalto.fi/cog/data/udhr/) | 100000 |
+
 # License
 [MIT](./LICENSE)
diff --git a/benchmark_test.go b/benchmark_test.go
@@ -20,7 +20,7 @@ func BenchmarkEncoding(b *testing.B) {
 		panic(err)
 	}
 
-	tkm, err := EncodingForModel("gpt-4")
+	tkm, err := EncodingForModel("gpt-4o")
 	if err != nil {
 		panic(err)
 	}