From 175c72f02b185aa08aedb30cfd048b3b9d3d1d31 Mon Sep 17 00:00:00 2001 From: WqyJh <781345688@qq.com> Date: Wed, 15 May 2024 19:14:05 +0800 Subject: [PATCH 1/2] feat: support o200k_base --- README.md | 11 +++++++++++ README_zh-hans.md | 9 +++++++++ benchmark_test.go | 2 +- doc/test_result.md | 28 ++++++++++++++++++++++++++++ encoding.go | 32 ++++++++++++++++++++++++++++++++ test/benchmark.py | 18 ++++++++---------- test/benchmark_test.go | 20 ++++++-------------- test/get_udhr.py | 27 +++++++++++++++++++++++++++ test/test.txt | 4 ++-- test/token_num.go | 4 ++-- test/token_num.py | 4 +--- 11 files changed, 127 insertions(+), 32 deletions(-) create mode 100644 test/get_udhr.py diff --git a/README.md b/README.md index 45bf4dd..51130d7 100644 --- a/README.md +++ b/README.md @@ -185,8 +185,10 @@ func NumTokensFromMessages(messages []openai.ChatCompletionMessage, model string # Available Models | Model name | OpenAI models | | ---------------------------- | ------------- | +| gpt-4o-* | o200k_base | | gpt-4-* | cl100k_base | | gpt-3.5-turbo-* | cl100k_base | +| gpt-4o | o200k_base | | gpt-4 | cl100k_base | | gpt-3.5-turbo | cl100k_base | | text-davinci-003 | p50k_base | @@ -252,5 +254,14 @@ Or maybe my benchmark method is not appropriate. If you have better benchmark method or if you want add your benchmark result, please feel free to submit a PR. +For new `o200k_base` encoding, it seems slower than `cl100k_base`. tiktoken-go is slightly slower than tiktoken on the following benchmark. + +| name | encoding | time/op | os | cpu | text | times | +| ----------- | ------- | ------- | ---------- | -------- | -------------------------------- | ------ | +| tiktoken-go | o200k_base | 108522 ns | Ubuntu 22.04 | AMD Ryzen 9 5900HS | [UDHR](http://research.ics.aalto.fi/cog/data/udhr/) | 100000 | +| tiktoken | o200k_base | 70198 ns | Ubuntu 22.04 | AMD Ryzen 9 5900HS | [UDHR](http://research.ics.aalto.fi/cog/data/udhr/) | 100000 | +| tiktoken-go | cl100k_base | 94502 ns | Ubuntu 22.04 | AMD Ryzen 9 5900HS | [UDHR](http://research.ics.aalto.fi/cog/data/udhr/) | 100000 | +| tiktoken | cl100k_base | 54642 ns | Ubuntu 22.04 | AMD Ryzen 9 5900HS | [UDHR](http://research.ics.aalto.fi/cog/data/udhr/) | 100000 | + # License [MIT](./LICENSE) diff --git a/README_zh-hans.md b/README_zh-hans.md index a332503..dd0ac82 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -235,5 +235,14 @@ func NumTokensFromMessages(messages []openai.ChatCompletionMessage, model string 如果你有更好的测试方法,或者说你想添加在你机器上的测试结果,欢迎提PR。 +新的 `o200k_base` 编码, 看起来比 `cl100k_base` 慢. 
在以下硬件上,tiktoken-go 比 tiktoken 略慢。 + +| name | encoding | time/op | os | cpu | text | times | +| ----------- | ------- | ------- | ---------- | -------- | -------------------------------- | ------ | +| tiktoken-go | o200k_base | 108522 ns | Ubuntu 22.04 | AMD Ryzen 9 5900HS | [UDHR](http://research.ics.aalto.fi/cog/data/udhr/) | 100000 | +| tiktoken | o200k_base | 70198 ns | Ubuntu 22.04 | AMD Ryzen 9 5900HS | [UDHR](http://research.ics.aalto.fi/cog/data/udhr/) | 100000 | +| tiktoken-go | cl100k_base | 94502 ns | Ubuntu 22.04 | AMD Ryzen 9 5900HS | [UDHR](http://research.ics.aalto.fi/cog/data/udhr/) | 100000 | +| tiktoken | cl100k_base | 54642 ns | Ubuntu 22.04 | AMD Ryzen 9 5900HS | [UDHR](http://research.ics.aalto.fi/cog/data/udhr/) | 100000 | + # License [MIT](./LICENSE) diff --git a/benchmark_test.go b/benchmark_test.go index 45171ba..5ff82a7 100644 --- a/benchmark_test.go +++ b/benchmark_test.go @@ -20,7 +20,7 @@ func BenchmarkEncoding(b *testing.B) { panic(err) } - tkm, err := EncodingForModel("gpt-4") + tkm, err := EncodingForModel("gpt-4o") if err != nil { panic(err) } diff --git a/doc/test_result.md b/doc/test_result.md index e908e8f..2760446 100644 --- a/doc/test_result.md +++ b/doc/test_result.md @@ -1,45 +1,59 @@ # Encoding Test Result | python tiktoken | golang tiktoken-go | | :------------------------------------------------------- | :------------------------------------------------------- | +| text: hallo world!, encoding: o200k_base, token: 4 | text: hallo world!, encoding: o200k_base, token: 4 | | text: hallo world!, encoding: cl100k_base, token: 4 | text: hallo world!, encoding: cl100k_base, token: 4 | | text: hallo world!, encoding: p50k_base, token: 4 | text: hallo world!, encoding: p50k_base, token: 4 | | text: hallo world!, encoding: r50k_base, token: 4 | text: hallo world!, encoding: r50k_base, token: 4 | +| text: 你好世界!, encoding: o200k_base, token: 3 | text: 你好世界!, encoding: o200k_base, token: 3 | | text: 你好世界!, encoding: cl100k_base, token: 6 | text: 你好世界!, encoding: cl100k_base, token: 6 | | text: 你好世界!, encoding: p50k_base, token: 11 | text: 你好世界!, encoding: p50k_base, token: 11 | | text: 你好世界!, encoding: r50k_base, token: 11 | text: 你好世界!, encoding: r50k_base, token: 11 | +| text: こんにちは世界!, encoding: o200k_base, token: 3 | text: こんにちは世界!, encoding: o200k_base, token: 3 | | text: こんにちは世界!, encoding: cl100k_base, token: 5 | text: こんにちは世界!, encoding: cl100k_base, token: 5 | | text: こんにちは世界!, encoding: p50k_base, token: 13 | text: こんにちは世界!, encoding: p50k_base, token: 13 | | text: こんにちは世界!, encoding: r50k_base, token: 13 | text: こんにちは世界!, encoding: r50k_base, token: 13 | +| text: 안녕하세요 세계!, encoding: o200k_base, token: 4 | text: 안녕하세요 세계!, encoding: o200k_base, token: 4 | | text: 안녕하세요 세계!, encoding: cl100k_base, token: 10 | text: 안녕하세요 세계!, encoding: cl100k_base, token: 10 | | text: 안녕하세요 세계!, encoding: p50k_base, token: 21 | text: 안녕하세요 세계!, encoding: p50k_base, token: 21 | | text: 안녕하세요 세계!, encoding: r50k_base, token: 21 | text: 안녕하세요 세계!, encoding: r50k_base, token: 21 | +| text: Привет мир!, encoding: o200k_base, token: 4 | text: Привет мир!, encoding: o200k_base, token: 4 | | text: Привет мир!, encoding: cl100k_base, token: 6 | text: Привет мир!, encoding: cl100k_base, token: 6 | | text: Привет мир!, encoding: p50k_base, token: 12 | text: Привет мир!, encoding: p50k_base, token: 12 | | text: Привет мир!, encoding: r50k_base, token: 12 | text: Привет мир!, encoding: r50k_base, token: 12 | +| text: ¡Hola mundo!, encoding: o200k_base, token: 4 | text: ¡Hola 
mundo!, encoding: o200k_base, token: 4 | | text: ¡Hola mundo!, encoding: cl100k_base, token: 4 | text: ¡Hola mundo!, encoding: cl100k_base, token: 4 | | text: ¡Hola mundo!, encoding: p50k_base, token: 7 | text: ¡Hola mundo!, encoding: p50k_base, token: 7 | | text: ¡Hola mundo!, encoding: r50k_base, token: 7 | text: ¡Hola mundo!, encoding: r50k_base, token: 7 | +| text: Hallo Welt!, encoding: o200k_base, token: 3 | text: Hallo Welt!, encoding: o200k_base, token: 3 | | text: Hallo Welt!, encoding: cl100k_base, token: 3 | text: Hallo Welt!, encoding: cl100k_base, token: 3 | | text: Hallo Welt!, encoding: p50k_base, token: 5 | text: Hallo Welt!, encoding: p50k_base, token: 5 | | text: Hallo Welt!, encoding: r50k_base, token: 5 | text: Hallo Welt!, encoding: r50k_base, token: 5 | +| text: Bonjour le monde!, encoding: o200k_base, token: 4 | text: Bonjour le monde!, encoding: o200k_base, token: 4 | | text: Bonjour le monde!, encoding: cl100k_base, token: 4 | text: Bonjour le monde!, encoding: cl100k_base, token: 4 | | text: Bonjour le monde!, encoding: p50k_base, token: 7 | text: Bonjour le monde!, encoding: p50k_base, token: 7 | | text: Bonjour le monde!, encoding: r50k_base, token: 7 | text: Bonjour le monde!, encoding: r50k_base, token: 7 | +| text: Ciao mondo!, encoding: o200k_base, token: 4 | text: Ciao mondo!, encoding: o200k_base, token: 4 | | text: Ciao mondo!, encoding: cl100k_base, token: 4 | text: Ciao mondo!, encoding: cl100k_base, token: 4 | | text: Ciao mondo!, encoding: p50k_base, token: 5 | text: Ciao mondo!, encoding: p50k_base, token: 5 | | text: Ciao mondo!, encoding: r50k_base, token: 5 | text: Ciao mondo!, encoding: r50k_base, token: 5 | +| text: Hej världen!, encoding: o200k_base, token: 3 | text: Hej världen!, encoding: o200k_base, token: 3 | | text: Hej världen!, encoding: cl100k_base, token: 7 | text: Hej världen!, encoding: cl100k_base, token: 7 | | text: Hej världen!, encoding: p50k_base, token: 8 | text: Hej världen!, encoding: p50k_base, token: 8 | | text: Hej världen!, encoding: r50k_base, token: 8 | text: Hej världen!, encoding: r50k_base, token: 8 | +| text: Hallo wereld!, encoding: o200k_base, token: 3 | text: Hallo wereld!, encoding: o200k_base, token: 3 | | text: Hallo wereld!, encoding: cl100k_base, token: 3 | text: Hallo wereld!, encoding: cl100k_base, token: 3 | | text: Hallo wereld!, encoding: p50k_base, token: 5 | text: Hallo wereld!, encoding: p50k_base, token: 5 | | text: Hallo wereld!, encoding: r50k_base, token: 5 | text: Hallo wereld!, encoding: r50k_base, token: 5 | +| text: Hallo verden!, encoding: o200k_base, token: 3 | text: Hallo verden!, encoding: o200k_base, token: 3 | | text: Hallo verden!, encoding: cl100k_base, token: 4 | text: Hallo verden!, encoding: cl100k_base, token: 4 | | text: Hallo verden!, encoding: p50k_base, token: 5 | text: Hallo verden!, encoding: p50k_base, token: 5 | | text: Hallo verden!, encoding: r50k_base, token: 5 | text: Hallo verden!, encoding: r50k_base, token: 5 | +| text: Hallo wereld!, encoding: o200k_base, token: 3 | text: Hallo wereld!, encoding: o200k_base, token: 3 | | text: Hallo wereld!, encoding: cl100k_base, token: 3 | text: Hallo wereld!, encoding: cl100k_base, token: 3 | | text: Hallo wereld!, encoding: p50k_base, token: 5 | text: Hallo wereld!, encoding: p50k_base, token: 5 | | text: Hallo wereld!, encoding: r50k_base, token: 5 | text: Hallo wereld!, encoding: r50k_base, token: 5 | +| text: Hallo verden!, encoding: o200k_base, token: 3 | text: Hallo verden!, encoding: o200k_base, token: 3 | | text: Hallo 
verden!, encoding: cl100k_base, token: 4 | text: Hallo verden!, encoding: cl100k_base, token: 4 | | text: Hallo verden!, encoding: p50k_base, token: 5 | text: Hallo verden!, encoding: p50k_base, token: 5 | | text: Hallo verden!, encoding: r50k_base, token: 5 | text: Hallo verden!, encoding: r50k_base, token: 5 | @@ -47,6 +61,7 @@ # Model Test Result | python tiktoken | golang tiktoken-go | | --------------------------------------------------------------------- | --------------------------------------------------------------------- | +| text: hallo world!, model: gpt-4o, token: 4 | text: hallo world!, model: gpt-4o, token: 4 | | text: hallo world!, model: gpt-4, token: 4 | text: hallo world!, model: gpt-4, token: 4 | | text: hallo world!, model: gpt-3.5-turbo, token: 4 | text: hallo world!, model: gpt-3.5-turbo, token: 4 | | text: hallo world!, model: text-davinci-003, token: 4 | text: hallo world!, model: text-davinci-003, token: 4 | @@ -69,6 +84,7 @@ | text: hallo world!, model: code-davinci-edit-001, token: 4 | text: hallo world!, model: code-davinci-edit-001, token: 4 | | text: hallo world!, model: text-embedding-ada-002, token: 4 | text: hallo world!, model: text-embedding-ada-002, token: 4 | | text: hallo world!, model: text-similarity-davinci-001, token: 4 | text: hallo world!, model: text-similarity-davinci-001, token: 4 | +| text: 你好世界!, model: gpt-4o, token: 3 | text: 你好世界!, model: gpt-4o, token: 3 | | text: 你好世界!, model: gpt-4, token: 6 | text: 你好世界!, model: gpt-4, token: 6 | | text: 你好世界!, model: gpt-3.5-turbo, token: 6 | text: 你好世界!, model: gpt-3.5-turbo, token: 6 | | text: 你好世界!, model: text-davinci-003, token: 11 | text: 你好世界!, model: text-davinci-003, token: 11 | @@ -91,6 +107,7 @@ | text: 你好世界!, model: code-davinci-edit-001, token: 11 | text: 你好世界!, model: code-davinci-edit-001, token: 11 | | text: 你好世界!, model: text-embedding-ada-002, token: 6 | text: 你好世界!, model: text-embedding-ada-002, token: 6 | | text: 你好世界!, model: text-similarity-davinci-001, token: 11 | text: 你好世界!, model: text-similarity-davinci-001, token: 11 | +| text: こんにちは世界!, model: gpt-4o, token: 3 | text: こんにちは世界!, model: gpt-4o, token: 3 | | text: こんにちは世界!, model: gpt-4, token: 5 | text: こんにちは世界!, model: gpt-4, token: 5 | | text: こんにちは世界!, model: gpt-3.5-turbo, token: 5 | text: こんにちは世界!, model: gpt-3.5-turbo, token: 5 | | text: こんにちは世界!, model: text-davinci-003, token: 13 | text: こんにちは世界!, model: text-davinci-003, token: 13 | @@ -113,6 +130,7 @@ | text: こんにちは世界!, model: code-davinci-edit-001, token: 13 | text: こんにちは世界!, model: code-davinci-edit-001, token: 13 | | text: こんにちは世界!, model: text-embedding-ada-002, token: 5 | text: こんにちは世界!, model: text-embedding-ada-002, token: 5 | | text: こんにちは世界!, model: text-similarity-davinci-001, token: 13 | text: こんにちは世界!, model: text-similarity-davinci-001, token: 13 | +| text: 안녕하세요 세계!, model: gpt-4o, token: 4 | text: 안녕하세요 세계!, model: gpt-4o, token: 4 | | text: 안녕하세요 세계!, model: gpt-4, token: 10 | text: 안녕하세요 세계!, model: gpt-4, token: 10 | | text: 안녕하세요 세계!, model: gpt-3.5-turbo, token: 10 | text: 안녕하세요 세계!, model: gpt-3.5-turbo, token: 10 | | text: 안녕하세요 세계!, model: text-davinci-003, token: 21 | text: 안녕하세요 세계!, model: text-davinci-003, token: 21 | @@ -135,6 +153,7 @@ | text: 안녕하세요 세계!, model: code-davinci-edit-001, token: 21 | text: 안녕하세요 세계!, model: code-davinci-edit-001, token: 21 | | text: 안녕하세요 세계!, model: text-embedding-ada-002, token: 10 | text: 안녕하세요 세계!, model: text-embedding-ada-002, token: 10 | | text: 안녕하세요 세계!, model: text-similarity-davinci-001, token: 
21 | text: 안녕하세요 세계!, model: text-similarity-davinci-001, token: 21 | +| text: Привет мир!, model: gpt-4o, token: 4 | text: Привет мир!, model: gpt-4o, token: 4 | | text: Привет мир!, model: gpt-4, token: 6 | text: Привет мир!, model: gpt-4, token: 6 | | text: Привет мир!, model: gpt-3.5-turbo, token: 6 | text: Привет мир!, model: gpt-3.5-turbo, token: 6 | | text: Привет мир!, model: text-davinci-003, token: 12 | text: Привет мир!, model: text-davinci-003, token: 12 | @@ -157,6 +176,7 @@ | text: Привет мир!, model: code-davinci-edit-001, token: 12 | text: Привет мир!, model: code-davinci-edit-001, token: 12 | | text: Привет мир!, model: text-embedding-ada-002, token: 6 | text: Привет мир!, model: text-embedding-ada-002, token: 6 | | text: Привет мир!, model: text-similarity-davinci-001, token: 12 | text: Привет мир!, model: text-similarity-davinci-001, token: 12 | +| text: ¡Hola mundo!, model: gpt-4o, token: 4 | text: ¡Hola mundo!, model: gpt-4o, token: 4 | | text: ¡Hola mundo!, model: gpt-4, token: 4 | text: ¡Hola mundo!, model: gpt-4, token: 4 | | text: ¡Hola mundo!, model: gpt-3.5-turbo, token: 4 | text: ¡Hola mundo!, model: gpt-3.5-turbo, token: 4 | | text: ¡Hola mundo!, model: text-davinci-003, token: 7 | text: ¡Hola mundo!, model: text-davinci-003, token: 7 | @@ -179,6 +199,7 @@ | text: ¡Hola mundo!, model: code-davinci-edit-001, token: 7 | text: ¡Hola mundo!, model: code-davinci-edit-001, token: 7 | | text: ¡Hola mundo!, model: text-embedding-ada-002, token: 4 | text: ¡Hola mundo!, model: text-embedding-ada-002, token: 4 | | text: ¡Hola mundo!, model: text-similarity-davinci-001, token: 7 | text: ¡Hola mundo!, model: text-similarity-davinci-001, token: 7 | +| text: Hallo Welt!, model: gpt-4o, token: 3 | text: Hallo Welt!, model: gpt-4o, token: 3 | | text: Hallo Welt!, model: gpt-4, token: 3 | text: Hallo Welt!, model: gpt-4, token: 3 | | text: Hallo Welt!, model: gpt-3.5-turbo, token: 3 | text: Hallo Welt!, model: gpt-3.5-turbo, token: 3 | | text: Hallo Welt!, model: text-davinci-003, token: 5 | text: Hallo Welt!, model: text-davinci-003, token: 5 | @@ -201,6 +222,7 @@ | text: Hallo Welt!, model: code-davinci-edit-001, token: 5 | text: Hallo Welt!, model: code-davinci-edit-001, token: 5 | | text: Hallo Welt!, model: text-embedding-ada-002, token: 3 | text: Hallo Welt!, model: text-embedding-ada-002, token: 3 | | text: Hallo Welt!, model: text-similarity-davinci-001, token: 5 | text: Hallo Welt!, model: text-similarity-davinci-001, token: 5 | +| text: Bonjour le monde!, model: gpt-4o, token: 4 | text: Bonjour le monde!, model: gpt-4o, token: 4 | | text: Bonjour le monde!, model: gpt-4, token: 4 | text: Bonjour le monde!, model: gpt-4, token: 4 | | text: Bonjour le monde!, model: gpt-3.5-turbo, token: 4 | text: Bonjour le monde!, model: gpt-3.5-turbo, token: 4 | | text: Bonjour le monde!, model: text-davinci-003, token: 7 | text: Bonjour le monde!, model: text-davinci-003, token: 7 | @@ -223,6 +245,7 @@ | text: Bonjour le monde!, model: code-davinci-edit-001, token: 7 | text: Bonjour le monde!, model: code-davinci-edit-001, token: 7 | | text: Bonjour le monde!, model: text-embedding-ada-002, token: 4 | text: Bonjour le monde!, model: text-embedding-ada-002, token: 4 | | text: Bonjour le monde!, model: text-similarity-davinci-001, token: 7 | text: Bonjour le monde!, model: text-similarity-davinci-001, token: 7 | +| text: Ciao mondo!, model: gpt-4o, token: 4 | text: Ciao mondo!, model: gpt-4o, token: 4 | | text: Ciao mondo!, model: gpt-4, token: 4 | text: Ciao mondo!, model: gpt-4, 
token: 4 | | text: Ciao mondo!, model: gpt-3.5-turbo, token: 4 | text: Ciao mondo!, model: gpt-3.5-turbo, token: 4 | | text: Ciao mondo!, model: text-davinci-003, token: 5 | text: Ciao mondo!, model: text-davinci-003, token: 5 | @@ -245,6 +268,7 @@ | text: Ciao mondo!, model: code-davinci-edit-001, token: 5 | text: Ciao mondo!, model: code-davinci-edit-001, token: 5 | | text: Ciao mondo!, model: text-embedding-ada-002, token: 4 | text: Ciao mondo!, model: text-embedding-ada-002, token: 4 | | text: Ciao mondo!, model: text-similarity-davinci-001, token: 5 | text: Ciao mondo!, model: text-similarity-davinci-001, token: 5 | +| text: Hej världen!, model: gpt-4o, token: 3 | text: Hej världen!, model: gpt-4o, token: 3 | | text: Hej världen!, model: gpt-4, token: 7 | text: Hej världen!, model: gpt-4, token: 7 | | text: Hej världen!, model: gpt-3.5-turbo, token: 7 | text: Hej världen!, model: gpt-3.5-turbo, token: 7 | | text: Hej världen!, model: text-davinci-003, token: 8 | text: Hej världen!, model: text-davinci-003, token: 8 | @@ -267,6 +291,7 @@ | text: Hej världen!, model: code-davinci-edit-001, token: 8 | text: Hej världen!, model: code-davinci-edit-001, token: 8 | | text: Hej världen!, model: text-embedding-ada-002, token: 7 | text: Hej världen!, model: text-embedding-ada-002, token: 7 | | text: Hej världen!, model: text-similarity-davinci-001, token: 8 | text: Hej världen!, model: text-similarity-davinci-001, token: 8 | +| text: Hallo wereld!, model: gpt-4o, token: 3 | text: Hallo wereld!, model: gpt-4o, token: 3 | | text: Hallo wereld!, model: gpt-4, token: 3 | text: Hallo wereld!, model: gpt-4, token: 3 | | text: Hallo wereld!, model: gpt-3.5-turbo, token: 3 | text: Hallo wereld!, model: gpt-3.5-turbo, token: 3 | | text: Hallo wereld!, model: text-davinci-003, token: 5 | text: Hallo wereld!, model: text-davinci-003, token: 5 | @@ -289,6 +314,7 @@ | text: Hallo wereld!, model: code-davinci-edit-001, token: 5 | text: Hallo wereld!, model: code-davinci-edit-001, token: 5 | | text: Hallo wereld!, model: text-embedding-ada-002, token: 3 | text: Hallo wereld!, model: text-embedding-ada-002, token: 3 | | text: Hallo wereld!, model: text-similarity-davinci-001, token: 5 | text: Hallo wereld!, model: text-similarity-davinci-001, token: 5 | +| text: Hallo verden!, model: gpt-4o, token: 3 | text: Hallo verden!, model: gpt-4o, token: 3 | | text: Hallo verden!, model: gpt-4, token: 4 | text: Hallo verden!, model: gpt-4, token: 4 | | text: Hallo verden!, model: gpt-3.5-turbo, token: 4 | text: Hallo verden!, model: gpt-3.5-turbo, token: 4 | | text: Hallo verden!, model: text-davinci-003, token: 5 | text: Hallo verden!, model: text-davinci-003, token: 5 | @@ -311,6 +337,7 @@ | text: Hallo verden!, model: code-davinci-edit-001, token: 5 | text: Hallo verden!, model: code-davinci-edit-001, token: 5 | | text: Hallo verden!, model: text-embedding-ada-002, token: 4 | text: Hallo verden!, model: text-embedding-ada-002, token: 4 | | text: Hallo verden!, model: text-similarity-davinci-001, token: 5 | text: Hallo verden!, model: text-similarity-davinci-001, token: 5 | +| text: Hallo wereld!, model: gpt-4o, token: 3 | text: Hallo wereld!, model: gpt-4o, token: 3 | | text: Hallo wereld!, model: gpt-4, token: 3 | text: Hallo wereld!, model: gpt-4, token: 3 | | text: Hallo wereld!, model: gpt-3.5-turbo, token: 3 | text: Hallo wereld!, model: gpt-3.5-turbo, token: 3 | | text: Hallo wereld!, model: text-davinci-003, token: 5 | text: Hallo wereld!, model: text-davinci-003, token: 5 | @@ -333,6 +360,7 @@ | text: Hallo 
wereld!, model: code-davinci-edit-001, token: 5 | text: Hallo wereld!, model: code-davinci-edit-001, token: 5 | | text: Hallo wereld!, model: text-embedding-ada-002, token: 3 | text: Hallo wereld!, model: text-embedding-ada-002, token: 3 | | text: Hallo wereld!, model: text-similarity-davinci-001, token: 5 | text: Hallo wereld!, model: text-similarity-davinci-001, token: 5 | +| text: Hallo verden!, model: gpt-4o, token: 3 | text: Hallo verden!, model: gpt-4o, token: 3 | | text: Hallo verden!, model: gpt-4, token: 4 | text: Hallo verden!, model: gpt-4, token: 4 | | text: Hallo verden!, model: gpt-3.5-turbo, token: 4 | text: Hallo verden!, model: gpt-3.5-turbo, token: 4 | | text: Hallo verden!, model: text-davinci-003, token: 5 | text: Hallo verden!, model: text-davinci-003, token: 5 | diff --git a/encoding.go b/encoding.go index b8106e3..601713b 100644 --- a/encoding.go +++ b/encoding.go @@ -2,6 +2,7 @@ package tiktoken import ( "errors" + "strings" "sync" ) @@ -12,6 +13,7 @@ const FIM_SUFFIX string = "<|fim_suffix|>" const ENDOFPROMPT string = "<|endofprompt|>" const ( + MODEL_O200K_BASE string = "o200k_base" MODEL_CL100K_BASE string = "cl100k_base" MODEL_P50K_BASE string = "p50k_base" MODEL_P50K_EDIT string = "p50k_edit" @@ -20,6 +22,7 @@ const ( var MODEL_TO_ENCODING = map[string]string{ // chat + "gpt-4o": MODEL_O200K_BASE, "gpt-4": MODEL_CL100K_BASE, "gpt-3.5-turbo": MODEL_CL100K_BASE, // text @@ -62,6 +65,7 @@ var MODEL_TO_ENCODING = map[string]string{ var MODEL_PREFIX_TO_ENCODING = map[string]string{ // chat + "gpt-4o-": MODEL_O200K_BASE, // e.g., gpt-4o-2024-05-13, etc. "gpt-4-": MODEL_CL100K_BASE, // e.g., gpt-4-0314, etc., plus gpt-4-32k "gpt-3.5-turbo-": MODEL_CL100K_BASE, // e.g, gpt-3.5-turbo-0301, -0401, etc. } @@ -98,6 +102,8 @@ func getEncoding(encodingName string) (*Encoding, error) { func initEncoding(encodingName string) (*Encoding, error) { switch encodingName { + case MODEL_O200K_BASE: + return o200k_base() case MODEL_CL100K_BASE: return cl100k_base() case MODEL_P50K_BASE: @@ -111,6 +117,32 @@ func initEncoding(encodingName string) (*Encoding, error) { } } +func o200k_base() (*Encoding, error) { + ranks, err := bpeLoader.LoadTiktokenBpe("https://openaipublic.blob.core.windows.net/encodings/o200k_base.tiktoken") + if err != nil { + return nil, err + } + special_tokens := map[string]int{ + ENDOFTEXT: 199999, + ENDOFPROMPT: 200018, + } + pats := []string{ + `[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?`, + `[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?`, + `\p{N}{1,3}`, + ` ?[^\s\p{L}\p{N}]+[\r\n/]*`, + `\s*[\r\n]+`, + `\s+(?!\S)`, + `\s+`, + } + return &Encoding{ + Name: MODEL_O200K_BASE, + PatStr: strings.Join(pats, "|"), + MergeableRanks: ranks, + SpecialTokens: special_tokens, + }, nil +} + func cl100k_base() (*Encoding, error) { ranks, err := bpeLoader.LoadTiktokenBpe("https://openaipublic.blob.core.windows.net/encodings/cl100k_base.tiktoken") if err != nil { diff --git a/test/benchmark.py b/test/benchmark.py index 8c648d1..cb9ba5b 100644 --- a/test/benchmark.py +++ b/test/benchmark.py @@ -1,5 +1,4 @@ import tiktoken as tk -import requests import time def benchmark_test(text_list,enc): @@ -8,16 +7,15 @@ def benchmark_test(text_list,enc): :return: None """ start = time.perf_counter_ns() - for index in range(100000): - text = text_list[index] + n = 100000 + for i in range(n): + text = text_list[i % len(text_list)] num_tokens = len(enc.encode(text)) 
end = time.perf_counter_ns() - print('benchmark test: {} ns/op'.format((end - start)/100000)) + print('benchmark test: {} ns/op'.format((end - start)/n)) if __name__ == '__main__': - r = requests.get('https://unicode.org/udhr/assemblies/full_all.txt') - text_list = r.text.splitlines() - cursor = 0 - enc=tk.get_encoding('cl100k_base') - benchmark_test(text_list,enc) - + with open('/tmp/udhr.txt','r') as f: + text_list = f.readlines() + enc=tk.get_encoding('o200k_base') + benchmark_test(text_list,enc) diff --git a/test/benchmark_test.go b/test/benchmark_test.go index de364d3..126956e 100644 --- a/test/benchmark_test.go +++ b/test/benchmark_test.go @@ -1,32 +1,24 @@ package main import ( - "io" "log" - "net/http" + "os" "strings" "testing" "github.com/pkoukk/tiktoken-go" ) -func BenchmarkEncodingInFullLanguage(b *testing.B) { - // Universal Declaration of Human Rights in all languages - url := "https://unicode.org/udhr/assemblies/full_all.txt" - response, err := http.Get(url) - if err != nil { - log.Fatal(err) - } - defer response.Body.Close() +// go test -benchmem -run=^$ -bench ^BenchmarkEncodingInFullLanguage$ -benchtime=100000x github.com/pkoukk/tiktoken-go/test - responseData, err := io.ReadAll(response.Body) +func BenchmarkEncodingInFullLanguage(b *testing.B) { + data, err := os.ReadFile("/tmp/udhr.txt") if err != nil { log.Fatal(err) } - responseString := string(responseData) - lines := strings.Split(responseString, "\n") - tkm, err := tiktoken.EncodingForModel("gpt-4") + lines := strings.Split(string(data), "\n") + tkm, err := tiktoken.EncodingForModel("gpt-4o") lineCount := len(lines) if err != nil { log.Fatal(err) diff --git a/test/get_udhr.py b/test/get_udhr.py new file mode 100644 index 0000000..badceef --- /dev/null +++ b/test/get_udhr.py @@ -0,0 +1,27 @@ +import os +import tarfile +import urllib.request + +url = "http://research.ics.aalto.fi/cog/data/udhr/udhr_txt_20100325.tar.gz" +file_name = "/tmp/udhr_txt_20100325.tar.gz" + +def download_file(url, file_name): + urllib.request.urlretrieve(url, file_name) + +def merge_files(source_dir, output_file): + with open(output_file, 'wb') as outfile: + for filename in os.listdir(source_dir): + if os.path.isfile(os.path.join(source_dir, filename)): + with open(os.path.join(source_dir, filename), 'rb') as infile: + outfile.write(infile.read()) + outfile.write(b'\n') + +def untar(dest, file_name): + with tarfile.open(file_name, "r:gz") as tar: + tar.extractall(path=dest) + +# download_file(url, file_name) + +untar('/tmp', file_name) + +merge_files('/tmp/udhr/txt', '/tmp/udhr.txt') diff --git a/test/test.txt b/test/test.txt index 7191e19..2a8a999 100644 --- a/test/test.txt +++ b/test/test.txt @@ -1,3 +1,3 @@ hallo world!,你好世界!,こんにちは世界!,안녕하세요 세계!,Привет мир!,¡Hola mundo!,Hallo Welt!,Bonjour le monde!,Ciao mondo!,Hej världen!,Hallo wereld!,Hallo verden!,Hallo wereld!,Hallo verden! 
-gpt-4,gpt-3.5-turbo,text-davinci-003,text-davinci-002,text-davinci-001,text-curie-001,text-babbage-001,text-ada-001,davinci,curie,babbage,ada,code-davinci-002,code-davinci-001,code-cushman-002,code-cushman-001,davinci-codex,cushman-codex,text-davinci-edit-001,code-davinci-edit-001,text-embedding-ada-002,text-similarity-davinci-001 -cl100k_base,p50k_base,r50k_base \ No newline at end of file +gpt-4o,gpt-4,gpt-3.5-turbo,text-davinci-003,text-davinci-002,text-davinci-001,text-curie-001,text-babbage-001,text-ada-001,davinci,curie,babbage,ada,code-davinci-002,code-davinci-001,code-cushman-002,code-cushman-001,davinci-codex,cushman-codex,text-davinci-edit-001,code-davinci-edit-001,text-embedding-ada-002,text-similarity-davinci-001 +o200k_base,cl100k_base,p50k_base,r50k_base \ No newline at end of file diff --git a/test/token_num.go b/test/token_num.go index cbd12b0..0947fd4 100644 --- a/test/token_num.go +++ b/test/token_num.go @@ -74,7 +74,7 @@ func getTokenByEncoding(text string, encoding string) (num_tokens int) { func testTokenByModel(textList []string, modelList []string) { for i := 0; i < len(textList); i++ { for j := 0; j < len(modelList); j++ { - fmt.Printf("text: %s, model: %s, token: %d \n", textList[i], modelList[j], getTokenByModel(textList[i], modelList[j])) + fmt.Printf("text: %s, model: %s, token: %d\n", textList[i], modelList[j], getTokenByModel(textList[i], modelList[j])) } } } @@ -83,7 +83,7 @@ func testTokenByModel(textList []string, modelList []string) { func testTokenByEncoding(textList []string, encodingList []string) { for i := 0; i < len(textList); i++ { for j := 0; j < len(encodingList); j++ { - fmt.Printf("text: %s, encoding: %s, token: %d \n", textList[i], encodingList[j], getTokenByEncoding(textList[i], encodingList[j])) + fmt.Printf("text: %s, encoding: %s, token: %d\n", textList[i], encodingList[j], getTokenByEncoding(textList[i], encodingList[j])) } } } diff --git a/test/token_num.py b/test/token_num.py index f160c2d..8cedec8 100644 --- a/test/token_num.py +++ b/test/token_num.py @@ -64,7 +64,5 @@ def test_token_by_encoding(text_list, encoding_list): if __name__ == '__main__': text_list, model_list, encoding_list = read_data_from_file('test/test.txt') test_token_by_model(text_list, model_list) - print("=====================================") + print("=========================================") test_token_by_encoding(text_list, encoding_list) - - \ No newline at end of file From b660fb8b24585eb859a9bf13f1dd14c7b83de45b Mon Sep 17 00:00:00 2001 From: WqyJh <781345688@qq.com> Date: Wed, 15 May 2024 19:26:12 +0800 Subject: [PATCH 2/2] Update docs --- README.md | 1 + README_zh-hans.md | 2 ++ 2 files changed, 3 insertions(+) diff --git a/README.md b/README.md index 51130d7..ed36ac7 100644 --- a/README.md +++ b/README.md @@ -176,6 +176,7 @@ func NumTokensFromMessages(messages []openai.ChatCompletionMessage, model string # Available Encodings | Encoding name | OpenAI models | | ----------------------- | ---------------------------------------------------- | + | `o200k_base` | `gpt-4o` | | `cl100k_base` | `gpt-4`, `gpt-3.5-turbo`, `text-embedding-ada-002` | | `p50k_base` | Codex models, `text-davinci-002`, `text-davinci-003` | | `r50k_base` (or `gpt2`) | GPT-3 models like `davinci` | diff --git a/README_zh-hans.md b/README_zh-hans.md index dd0ac82..628fdc0 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -169,6 +169,7 @@ func NumTokensFromMessages(messages []openai.ChatCompletionMessage, model string # available encodings | Encoding name | OpenAI models | | 
----------------------- | ---------------------------------------------------- | + | `o200k_base` | `gpt-4o` | | `cl100k_base` | `gpt-4`, `gpt-3.5-turbo`, `text-embedding-ada-002` | | `p50k_base` | Codex models, `text-davinci-002`, `text-davinci-003` | | `r50k_base` (or `gpt2`) | GPT-3 models like `davinci` | @@ -177,6 +178,7 @@ func NumTokensFromMessages(messages []openai.ChatCompletionMessage, model string # available models | Model name | OpenAI models | | ---------------------------- | ------------- | +| gpt-4o-* | o200k_base | | gpt-4 | cl100k_base | | gpt-4-* | cl100k_base | | gpt-3.5-turbo | cl100k_base |
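
As a quick illustration of what this patch enables, below is a minimal Go sketch — not part of the patch itself — that resolves the new `o200k_base` encoding both by encoding name and through the `gpt-4o` model mapping added in `encoding.go`. It assumes the package's `GetEncoding` / `EncodingForModel` / `Encode` API as used elsewhere in this patch and in the existing README examples; the sample text is arbitrary.

```go
package main

import (
	"fmt"

	"github.com/pkoukk/tiktoken-go"
)

func main() {
	text := "hallo world!"

	// Look up the new encoding directly by name.
	enc, err := tiktoken.GetEncoding("o200k_base")
	if err != nil {
		panic(err)
	}
	fmt.Println("o200k_base tokens:", len(enc.Encode(text, nil, nil)))

	// Or resolve it through the gpt-4o model mapping added in encoding.go.
	tkm, err := tiktoken.EncodingForModel("gpt-4o")
	if err != nil {
		panic(err)
	}
	fmt.Println("gpt-4o tokens:", len(tkm.Encode(text, nil, nil)))
}
```

Both paths load the same `o200k_base` ranks; dated model names such as `gpt-4o-2024-05-13` would resolve through the new `gpt-4o-` prefix entry in `MODEL_PREFIX_TO_ENCODING`, matching the `gpt-4o-*` rows added to the README tables.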