diff --git a/README.md b/README.md
index 53c43c2..f6d727c 100644
--- a/README.md
+++ b/README.md
@@ -20,6 +20,7 @@ Python 3 and C++ compiler required. The command will download the model and the
 | Llama 3 8B Instruct Q40 | Chat, API | 6.32 GB | `python launch.py llama3_8b_instruct_q40` |
 | Llama 3.1 8B Instruct Q40 | Chat, API | 6.32 GB | `python launch.py llama3_1_8b_instruct_q40` |
 | Llama 3.1 405B Instruct Q40 | Chat, API | 238 GB | `python launch.py llama3_1_405b_instruct_q40` |
+| Llama 3.2 1B Instruct Q40 | Chat, API | 1.7 GB | `python launch.py llama3_2_1b_instruct_q40` |
 
 ### 🛠️ Convert Model Manually
 
diff --git a/launch.py b/launch.py
index 364923f..469704f 100644
--- a/launch.py
+++ b/launch.py
@@ -37,6 +37,11 @@ def parts(length):
         'https://huggingface.co/b4rtaz/Llama-3_1-405B-Q40-Instruct-Distributed-Llama/resolve/main/dllama_tokenizer_llama_3_1.t?download=true',
         'q40', 'q80', 'chat'
     ],
+    'llama3_2_1b_instruct_q40': [
+        ['https://huggingface.co/b4rtaz/Llama-3_2-1B-Q40-Instruct-Distributed-Llama/resolve/main/dllama_model_llama3.2-1b-instruct_q40.m?download=true'],
+        'https://huggingface.co/b4rtaz/Llama-3_2-1B-Q40-Instruct-Distributed-Llama/resolve/main/dllama_tokenizer_llama3_2.t?download=true',
+        'q40', 'q80', 'chat', '--max-seq-len 8192'
+    ],
 }
 
 def downloadFile(urls: str, path: str):
@@ -107,6 +112,8 @@ def printUsage():
     else:
         command = './dllama inference --steps 64 --prompt "Hello world"'
     command += f' --model {modelPath} --tokenizer {tokenizerPath} --buffer-float-type {model[3]} --nthreads 4'
+    if (len(model) > 5):
+        command += f' {model[5]}'
 
     print('To run Distributed Llama you need to execute:')
     print('--- copy start ---')
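For context, a minimal standalone sketch of how the new optional sixth list element (`'--max-seq-len 8192'`) flows into the generated command. The `modelPath` and `tokenizerPath` values below are hypothetical placeholders, and the comments on the first three list slots are inferred from how the patch uses them; only the entry's shape and the `len(model) > 5` check come from the patch itself:

```python
# Hypothetical standalone sketch mirroring the launch.py logic in this patch.
model = [
    # model part URLs
    ['https://huggingface.co/b4rtaz/Llama-3_2-1B-Q40-Instruct-Distributed-Llama/resolve/main/dllama_model_llama3.2-1b-instruct_q40.m?download=true'],
    # tokenizer URL
    'https://huggingface.co/b4rtaz/Llama-3_2-1B-Q40-Instruct-Distributed-Llama/resolve/main/dllama_tokenizer_llama3_2.t?download=true',
    'q40',                 # weights float type (inferred)
    'q80',                 # buffer float type, passed as --buffer-float-type
    'chat',                # run mode
    '--max-seq-len 8192',  # optional extra CLI arguments (new in this patch)
]

# Hypothetical download locations; the real paths come from downloadFile().
modelPath = 'models/llama3_2_1b_instruct_q40/dllama_model_llama3_2_1b_instruct_q40.m'
tokenizerPath = 'models/llama3_2_1b_instruct_q40/dllama_tokenizer_llama3_2.t'

# Build the command exactly as the patched printUsage() path does.
command = './dllama chat' if model[4] == 'chat' \
    else './dllama inference --steps 64 --prompt "Hello world"'
command += f' --model {modelPath} --tokenizer {tokenizerPath} --buffer-float-type {model[3]} --nthreads 4'
if len(model) > 5:
    # Entries may now carry extra CLI arguments as a sixth element;
    # for the 1B model this caps the context window at 8192 tokens.
    command += f' {model[5]}'
print(command)
```

Keeping the extra arguments as a trailing optional element means all existing five-element entries keep working unchanged; only entries that opt in pay for the new behavior.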