From 51f38c5cdac0139fc6d297f8e93752f7edc6d8ab Mon Sep 17 00:00:00 2001
From: Arnav Garg <106701836+arnavgarg1@users.noreply.github.com>
Date: Tue, 6 Feb 2024 14:57:29 -0500
Subject: [PATCH] Add example for base model dequantization/upscaling (#3924)

---
 .../llm_base_model_dequantization/README.md | 74 +++++++++++++++++++
 .../phi_2_dequantization.py                 | 45 +++++++++++
 2 files changed, 119 insertions(+)
 create mode 100644 examples/llm_base_model_dequantization/README.md
 create mode 100644 examples/llm_base_model_dequantization/phi_2_dequantization.py

diff --git a/examples/llm_base_model_dequantization/README.md b/examples/llm_base_model_dequantization/README.md
new file mode 100644
index 00000000000..6e053361dee
--- /dev/null
+++ b/examples/llm_base_model_dequantization/README.md
@@ -0,0 +1,74 @@
# Convert quantized base model to fp16

Ludwig provides utility functions to convert nf4-quantized bitsandbytes base models back to fp16
for more efficient inference. This is desirable because inference with bitsandbytes quantized weights is slow:
every forward pass has to dequantize each layer's weights from nf4 to fp16, run the layer, and then
re-quantize the weights back to nf4 to keep memory usage constant.

By dequantizing the base model to fp16 upfront, you keep the same weight values the quantized model would use
while avoiding the per-layer dequantize/re-quantize overhead at inference time.

## Visual Illustration

### Without dequantization upfront

| **Request 1:**                             | **Request 2:**                             | **Request 3:**                             |
| ------------------------------------------ | ------------------------------------------ | ------------------------------------------ |
| - Quantized bitsandbytes model             | - Quantized bitsandbytes model             | - Quantized bitsandbytes model             |
| - Dequantization of layer 1 (nf4 to fp16)  | - Dequantization of layer 1 (nf4 to fp16)  | - Dequantization of layer 1 (nf4 to fp16)  |
| - Forward Pass (using dequantized weights) | - Forward Pass (using dequantized weights) | - Forward Pass (using dequantized weights) |
| - Quantization of layer 1 (fp16 to nf4)    | - Quantization of layer 1 (fp16 to nf4)    | - Quantization of layer 1 (fp16 to nf4)    |
| - Dequantization of layer 2 (nf4 to fp16)  | - Dequantization of layer 2 (nf4 to fp16)  | - Dequantization of layer 2 (nf4 to fp16)  |
| - Forward Pass (using dequantized weights) | - Forward Pass (using dequantized weights) | - Forward Pass (using dequantized weights) |
| - Quantization of layer 2 (fp16 to nf4)    | - Quantization of layer 2 (fp16 to nf4)    | - Quantization of layer 2 (fp16 to nf4)    |
| - ...                                      | - ...                                      | - ...                                      |
| - Final Output                             | - Final Output                             | - Final Output                             |

### With dequantization upfront

| **Request 1:**                   | **Request 2:**                   | **Request 3:**                   |
| -------------------------------- | -------------------------------- | -------------------------------- |
| - Dequantized base model in fp16 | - Dequantized base model in fp16 | - Dequantized base model in fp16 |
| - Forward pass through layer 1   | - Forward pass through layer 1   | - Forward pass through layer 1   |
| - Forward pass through layer 2   | - Forward pass through layer 2   | - Forward pass through layer 2   |
| - ...                            | - ...                            | - ...                            |
| - Final Output                   | - Final Output                   | - Final Output                   |

## Running the example script

The example `phi_2_dequantization.py` shows how you can quantize and then dequantize Phi-2. The same process
works for any other base model supported by Ludwig that is quantized with 4-bit nf4 bitsandbytes quantization.
You will need a GPU to run the script successfully.
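To make the layer-by-layer conversion concrete, the sketch below shows roughly what upfront dequantization
looks like when done by hand with the bitsandbytes API. This is an illustrative sketch only, not Ludwig's
actual implementation: the `dequantize_linear4bit_layers` helper, the module traversal, and the direct use of
`bitsandbytes.functional.dequantize_4bit` are assumptions made for the example. Ludwig's
`save_dequantized_base_model` (used in the script) handles the equivalent work for you.

```python
# Illustrative sketch only. Assumes the model's linear layers were loaded as
# bitsandbytes nf4 `Linear4bit` modules on a GPU.
import torch
import torch.nn as nn
import bitsandbytes as bnb
import bitsandbytes.functional as bnbF


def dequantize_linear4bit_layers(module: nn.Module) -> nn.Module:
    """Recursively replace nf4 Linear4bit layers with plain fp16 nn.Linear layers."""
    for name, child in module.named_children():
        if isinstance(child, bnb.nn.Linear4bit):
            # Unpack the nf4-packed weight back to fp16 using its stored quantization state.
            fp16_weight = bnbF.dequantize_4bit(child.weight.data, child.weight.quant_state).to(torch.float16)
            replacement = nn.Linear(child.in_features, child.out_features, bias=child.bias is not None)
            replacement.weight = nn.Parameter(fp16_weight, requires_grad=False)
            if child.bias is not None:
                replacement.bias = nn.Parameter(child.bias.data.to(torch.float16), requires_grad=False)
            setattr(module, name, replacement)
        else:
            dequantize_linear4bit_layers(child)
    return module
```

Doing this once, before serving, trades higher memory usage (fp16 weights instead of nf4) for much cheaper
forward passes, which is the trade-off illustrated in the tables above.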
Under the hood, the example script:

1. Loads the base model with 4-bit nf4 quantization.
1. Dequantizes the model layer by layer back into fp16, in place.
1. Writes the new dequantized weights to disk at `save_path`.
1. Writes the tokenizer to disk at `save_path`.

Make sure you update the variables at the top of the file for the base model, save path, and Hugging Face Hub repo ID!

## Bonus

If desired, you can also use Ludwig to push the new dequantized model weights straight to the Hugging Face Hub!

```python
from ludwig.utils.hf_utils import upload_folder_to_hfhub

upload_folder_to_hfhub(repo_id=hfhub_repo_id, folder_path=save_path)
```

### Dequantized base models already on Hugging Face Hub

- [CodeLlama 7b Instruct](https://huggingface.co/arnavgrg/codallama-7b-instruct-nf4-fp16-upscaled)
- [CodeLlama 13b Instruct](https://huggingface.co/arnavgrg/codellama-13b-instruct-nf4-fp16-upscaled)
- [CodeLlama 70b Instruct](https://huggingface.co/arnavgrg/codellama-70b-instruct-nf4-fp16-upscaled)
- [Llama 2 7b](https://huggingface.co/arnavgrg/llama-2-7b-nf4-fp16-upscaled)
- [Llama 2 7b Chat](https://huggingface.co/arnavgrg/llama-2-7b-chat-nf4-fp16-upscaled)
- [Llama 2 13b Chat](https://huggingface.co/arnavgrg/llama-2-13b-chat-nf4-fp16-upscaled)
- [Llama 2 70b Chat](https://huggingface.co/arnavgrg/llama-2-70b-chat-nf4-fp16-upscaled)
- [Mistral 7b](https://huggingface.co/arnavgrg/mistral-7b-nf4-fp16-upscaled)
- [Mistral 7b Instruct](https://huggingface.co/arnavgrg/mistral-7b-instruct-nf4-fp16-upscaled)
- [NousResearch Yarn Mistral 7b 128k](https://huggingface.co/arnavgrg/NousResearch-Yarn-Mistral-7b-128k-nf4-fp16-upscaled)
- [Microsoft Phi-2](https://huggingface.co/arnavgrg/phi-2-nf4-fp16-upscaled)
- [Zephyr 7b Beta](https://huggingface.co/arnavgrg/zephyr-7b-beta-nf4-fp16-upscaled)

diff --git a/examples/llm_base_model_dequantization/phi_2_dequantization.py b/examples/llm_base_model_dequantization/phi_2_dequantization.py
new file mode 100644
index 00000000000..7b818cf344b
--- /dev/null
+++ b/examples/llm_base_model_dequantization/phi_2_dequantization.py
@@ -0,0 +1,45 @@
import logging

import yaml
from huggingface_hub import whoami

from ludwig.api import LudwigModel
from ludwig.utils.hf_utils import upload_folder_to_hfhub

# Update these for your own base model, save location, and Hugging Face Hub repo.
hf_username = whoami().get("name")
base_model_name = "microsoft/phi-2"
dequantized_path = "microsoft-phi-2-dequantized"
save_path = "/home/ray/" + dequantized_path
hfhub_repo_id = hf_username + "/" + dequantized_path

# Minimal Ludwig LLM config: load the base model with 4-bit (nf4) quantization and skip training.
config = yaml.safe_load(
    f"""
    model_type: llm
    base_model: {base_model_name}

    quantization:
      bits: 4

    input_features:
      - name: instruction
        type: text

    output_features:
      - name: output
        type: text

    trainer:
      type: none

    backend:
      type: local
    """
)

# Define the Ludwig model object that loads the quantized base model.
model = LudwigModel(config=config, logging_level=logging.INFO)

# Dequantize the base model layer by layer back to fp16 and write it (plus the tokenizer) to disk.
model.save_dequantized_base_model(save_path=save_path)

# Optional: upload the dequantized model to the Hugging Face Hub.
upload_folder_to_hfhub(repo_id=hfhub_repo_id, folder_path=save_path)
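
# Optional sanity check (not part of the original example; the prompt and generation settings
# below are illustrative assumptions): reload the dequantized fp16 checkpoint with plain
# `transformers` to confirm the exported weights work outside of Ludwig. Assumes a CUDA GPU.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(save_path)
dequantized_model = AutoModelForCausalLM.from_pretrained(
    save_path,
    torch_dtype=torch.float16,
    device_map="auto",
)

# Run a short generation with the fp16 weights; no per-layer dequantization happens here.
inputs = tokenizer("Instruct: Write a haiku about quantization.\nOutput:", return_tensors="pt").to(
    dequantized_model.device
)
outputs = dequantized_model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))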