From 9b77d17ee5cf7c2129701f46150d1dfb1c52f4f0 Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Tue, 3 Sep 2024 08:30:00 +0000 Subject: [PATCH 01/15] update llama for multi devices --- projects/Llama/pipeline.py | 59 +++++++++++++--------------- projects/Llama/tokenizer.py | 4 +- projects/Llama/utils/llama_loader.py | 4 ++ 3 files changed, 34 insertions(+), 33 deletions(-) diff --git a/projects/Llama/pipeline.py b/projects/Llama/pipeline.py index bea4a2f56..99c5153e8 100644 --- a/projects/Llama/pipeline.py +++ b/projects/Llama/pipeline.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import click from libai.inference.basic import BasePipeline from libai.utils import distributed as dist @@ -30,11 +31,7 @@ def load_pretrain_weight(self, libai_cfg_model, model_path, mode="huggingface"): if mode == "huggingface": from projects.Llama.utils.llama_loader import LlamaLoaderHuggerFace - model_loader = LlamaLoaderHuggerFace( - libai_cfg_model, - libai_cfg_model.cfg, - model_path, - ) + model_loader = LlamaLoaderHuggerFace(libai_cfg_model, libai_cfg_model.cfg, model_path,) model = model_loader.load() model.eval() return model @@ -42,11 +39,7 @@ def load_pretrain_weight(self, libai_cfg_model, model_path, mode="huggingface"): elif mode == "libai": from projects.Llama.utils.llama_loader import LlamaLoaderLiBai - model_loader = LlamaLoaderLiBai( - libai_cfg_model, - libai_cfg_model.cfg, - model_path, - ) + model_loader = LlamaLoaderLiBai(libai_cfg_model, libai_cfg_model.cfg, model_path,) model = model_loader.load() model.eval() return model @@ -67,7 +60,7 @@ def _parse_parameters(self, **pipeline_parameters): def preprocess(self, inputs, **kwargs) -> dict: # tokenizer encoderW - inputs = self.tokenizer.tokenize(inputs, add_bos=True, padding=True) + inputs = self.tokenizer.tokenize(inputs, add_bos=True, padding=True, device=self.device) inputs = { "input_ids": inputs, } @@ -87,31 +80,31 @@ def postprocess(self, model_output_dict, **kwargs) -> dict: return records -if __name__ == "__main__": - # ----- load huggingface checkpoint ----- - # pipeline = TextGenerationPipeline( - # "projects/Llama/configs/llama_config.py", - # data_parallel=1, - # tensor_parallel=1, - # pipeline_parallel=1, - # pipeline_num_layers=32, - # model_path="", - # mode="huggingface", - # ) - - # output = pipeline(inputs=text) - # if dist.is_main_process(): - # print(output) - - # ----- load libai checkpoint ----- +@click.command() +@click.option( + "--config_file", + default="projects/Llama/configs/llama_config.py", + help="Path to the configuration file.", +) +@click.option("--model_path", default="", help="Path to the model checkpoint.") +@click.option( + "--mode", + default="libai", + help="Mode for the dataloader pipeline, e.g., 'libai' or 'huggingface'.", +) +@click.option( + "--device", default="cuda", help="Device to run the model on, e.g., 'cuda', 'xpu', 'npu'." 
+) +def main(config_file, model_path, mode, device): pipeline = TextGenerationPipeline( - "projects/Llama/configs/llama_config.py", + config_file, data_parallel=1, tensor_parallel=1, pipeline_parallel=1, pipeline_num_layers=32, - model_path="", - mode="libai", + model_path=model_path, + mode=mode, + device=device, ) text = [ @@ -120,3 +113,7 @@ def postprocess(self, model_output_dict, **kwargs) -> dict: output = pipeline(inputs=text) if dist.is_main_process(): print(output) + + +if __name__ == "__main__": + main() diff --git a/projects/Llama/tokenizer.py b/projects/Llama/tokenizer.py index 56aca8336..1598a1dbe 100644 --- a/projects/Llama/tokenizer.py +++ b/projects/Llama/tokenizer.py @@ -75,9 +75,9 @@ def tokenize( if add_eos: tokens = [token + [self.eos_token_id] for token in tokens] - if device == "cuda": + if device: sbp = kwargs.get("sbp", dist.get_nd_sbp([flow.sbp.broadcast, flow.sbp.broadcast])) - placement = kwargs.get("placement", flow.placement("cuda", [0])) + placement = kwargs.get("placement", flow.placement(device, [0])) return_token_ids = flow.tensor(tokens, sbp=sbp, placement=placement, dtype=flow.long) else: return_token_ids = flow.tensor(tokens, dtype=flow.long) diff --git a/projects/Llama/utils/llama_loader.py b/projects/Llama/utils/llama_loader.py index 20b9ba258..c46cb480a 100644 --- a/projects/Llama/utils/llama_loader.py +++ b/projects/Llama/utils/llama_loader.py @@ -26,6 +26,8 @@ def __init__(self, model, libai_cfg, pretrained_model_path, **kwargs): self.base_model_prefix_1 = "model" self.base_model_prefix_2 = "model" + if not pretrained_model_path: + self.pretrained_model_path = libai_cfg.pretrained_model_path def _convert_state_dict(self, flow_state_dict, cfg): """Convert state_dict's keys to match model. @@ -104,3 +106,5 @@ class LlamaLoaderLiBai(ModelLoaderLiBai): def __init__(self, model, libai_cfg, pretrained_model_path, **kwargs): super().__init__(model, libai_cfg, pretrained_model_path, **kwargs) self.base_model_prefix_2 = "model" + if not pretrained_model_path: + self.pretrained_model_path = libai_cfg.pretrained_model_path From 87b2c4182aa32650baa88adfcbd33c00cda62031 Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Tue, 3 Sep 2024 08:31:53 +0000 Subject: [PATCH 02/15] xpu and npu config files --- projects/Llama/configs/llama_config_npu.py | 64 ++++++++++++++++++++++ projects/Llama/configs/llama_config_xpu.py | 64 ++++++++++++++++++++++ 2 files changed, 128 insertions(+) create mode 100644 projects/Llama/configs/llama_config_npu.py create mode 100644 projects/Llama/configs/llama_config_xpu.py diff --git a/projects/Llama/configs/llama_config_npu.py b/projects/Llama/configs/llama_config_npu.py new file mode 100644 index 000000000..1d959cc2e --- /dev/null +++ b/projects/Llama/configs/llama_config_npu.py @@ -0,0 +1,64 @@ +from omegaconf import DictConfig, OmegaConf + +from libai.config import LazyCall +from projects.Llama.llama import LlamaForCausalLM +from projects.Llama.tokenizer import LlamaTokenizer +from configs.common.train import train + +import oneflow_npu + +cfg = dict( + # Model + hidden_act="silu", + hidden_size=4096, + initializer_range=0.02, + intermediate_size=11008, + max_position_embeddings=2048, + num_attention_heads=32, + hidden_layers=32, + pretraining_tp=1, + rms_norm_eps=1e-05, + rope_scaling=None, + tie_word_embeddings=False, + vocab_size=32000, + use_scaled_init_for_output_weights=False, + scale_mask_softmax_fusion=False, + amp_enabled=True, + # Inference + is_encoder_decoder=False, + max_length=256, + min_length=0, + do_sample=False, + 
early_stopping=False, + num_beams=1, + num_beam_groups=1, + diversity_penalty=0.0, + temperature=0.9, + top_k=50, + top_p=0.6, + typical_p=1.0, + repetition_penalty=1.0, + length_penalty=1.0, + no_repeat_ngram_size=0, + encoder_no_repeat_ngram_size=0, + num_return_sequences=1, + chunk_size_feed_forward=0, + output_scores=False, + use_cache=True, + bos_token_id=1, + eos_token_id=2, + pad_token_id=0, + # train + #pretrained_model_path="meta-llama/Llama-2-7b-hf", + pretrained_model_path="/root/models/Llama-2-7b-chat-hf", +) + +cfg = DictConfig(cfg) + +model = LazyCall(LlamaForCausalLM)(cfg=cfg) +tokenization = OmegaConf.create() +tokenization.make_vocab_size_divisible_by = 1 +tokenization.tokenizer = LazyCall(LlamaTokenizer)( + #pretrained_model_path="meta-llama/Llama-2-7b-hf/tokenizer.model" + pretrained_model_path="/root/models/Llama-2-7b-chat-hf/tokenizer.model" +) diff --git a/projects/Llama/configs/llama_config_xpu.py b/projects/Llama/configs/llama_config_xpu.py new file mode 100644 index 000000000..48f103293 --- /dev/null +++ b/projects/Llama/configs/llama_config_xpu.py @@ -0,0 +1,64 @@ +from omegaconf import DictConfig, OmegaConf + +from libai.config import LazyCall +from projects.Llama.llama import LlamaForCausalLM +from projects.Llama.tokenizer import LlamaTokenizer +from configs.common.train import train + +import oneflow_xpu + +cfg = dict( + # Model + hidden_act="silu", + hidden_size=4096, + initializer_range=0.02, + intermediate_size=11008, + max_position_embeddings=2048, + num_attention_heads=32, + hidden_layers=32, + pretraining_tp=1, + rms_norm_eps=1e-05, + rope_scaling=None, + tie_word_embeddings=False, + vocab_size=32000, + use_scaled_init_for_output_weights=False, + scale_mask_softmax_fusion=False, + amp_enabled=True, + # Inference + is_encoder_decoder=False, + max_length=256, + min_length=0, + do_sample=False, + early_stopping=False, + num_beams=1, + num_beam_groups=1, + diversity_penalty=0.0, + temperature=0.9, + top_k=50, + top_p=0.6, + typical_p=1.0, + repetition_penalty=1.0, + length_penalty=1.0, + no_repeat_ngram_size=0, + encoder_no_repeat_ngram_size=0, + num_return_sequences=1, + chunk_size_feed_forward=0, + output_scores=False, + use_cache=True, + bos_token_id=1, + eos_token_id=2, + pad_token_id=0, + # train + #pretrained_model_path="meta-llama/Llama-2-7b-hf", + pretrained_model_path="/root/models/Llama-2-7b-chat-hf", +) + +cfg = DictConfig(cfg) + +model = LazyCall(LlamaForCausalLM)(cfg=cfg) +tokenization = OmegaConf.create() +tokenization.make_vocab_size_divisible_by = 1 +tokenization.tokenizer = LazyCall(LlamaTokenizer)( + #pretrained_model_path="meta-llama/Llama-2-7b-hf/tokenizer.model" + pretrained_model_path="/root/models/Llama-2-7b-chat-hf/tokenizer.model" +) From f87b71331ed339f50277e9123583954433a54d17 Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Tue, 3 Sep 2024 08:35:22 +0000 Subject: [PATCH 03/15] update device for inference --- libai/inference/basic.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/libai/inference/basic.py b/libai/inference/basic.py index 94d3f1781..53933e094 100644 --- a/libai/inference/basic.py +++ b/libai/inference/basic.py @@ -43,6 +43,7 @@ def __init__( pipeline_num_layers=None, model_path=None, mode="libai", + device="cuda", **kwargs, ): # init cfg @@ -60,6 +61,7 @@ def __init__( pipeline_stage_id, pipeline_num_layers, ) + self.device = device dist.setup_dist_util(self.cfg.train.dist) logger.info(self.cfg.train.dist) @@ -104,10 +106,7 @@ def update_cfg( ), "cfg.train.dist.pipeline_num_layers must be 
set when run pipeline parallel" def load_pretrain_weight( - self, - libai_cfg_model, - model_path, - mode="libai", + self, libai_cfg_model, model_path, mode="libai", ): """load pretrained model. @@ -167,7 +166,9 @@ def to_local(self, model_outputs_dict): for key, value in model_outputs_dict.items(): if isinstance(value, flow.Tensor) and value.is_global: model_outputs_dict[key] = dist.ttol( - value, ranks=[0] if value.placement.ranks.ndim == 1 else [[0]] + value, + device=self.device, + ranks=[0] if value.placement.ranks.ndim == 1 else [[0]], ) if flow.cuda.is_available(): dist.synchronize() From 9ba5a652a6983d7ec3cac81cb5a556c205d15a04 Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Tue, 3 Sep 2024 09:22:39 +0000 Subject: [PATCH 04/15] update --- libai/inference/basic.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libai/inference/basic.py b/libai/inference/basic.py index 53933e094..e923bda99 100644 --- a/libai/inference/basic.py +++ b/libai/inference/basic.py @@ -62,6 +62,8 @@ def __init__( pipeline_num_layers, ) self.device = device + if device: + self.cfg.train.dist.device_type = device dist.setup_dist_util(self.cfg.train.dist) logger.info(self.cfg.train.dist) From 336c48122b7f4e6953d4c63e345a692068345f03 Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Tue, 3 Sep 2024 10:07:42 +0000 Subject: [PATCH 05/15] update --- projects/Llama/configs/llama_config_npu.py | 4 ++-- projects/Llama/pipeline.py | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/projects/Llama/configs/llama_config_npu.py b/projects/Llama/configs/llama_config_npu.py index 1d959cc2e..5bdb059a9 100644 --- a/projects/Llama/configs/llama_config_npu.py +++ b/projects/Llama/configs/llama_config_npu.py @@ -50,7 +50,7 @@ pad_token_id=0, # train #pretrained_model_path="meta-llama/Llama-2-7b-hf", - pretrained_model_path="/root/models/Llama-2-7b-chat-hf", + pretrained_model_path="/data0/hf_models/Llama-2-7b-chat-hf", ) cfg = DictConfig(cfg) @@ -60,5 +60,5 @@ tokenization.make_vocab_size_divisible_by = 1 tokenization.tokenizer = LazyCall(LlamaTokenizer)( #pretrained_model_path="meta-llama/Llama-2-7b-hf/tokenizer.model" - pretrained_model_path="/root/models/Llama-2-7b-chat-hf/tokenizer.model" + pretrained_model_path="/data0/hf_models/Llama-2-7b-chat-hf/tokenizer.model" ) diff --git a/projects/Llama/pipeline.py b/projects/Llama/pipeline.py index 99c5153e8..5c5acc357 100644 --- a/projects/Llama/pipeline.py +++ b/projects/Llama/pipeline.py @@ -96,6 +96,8 @@ def postprocess(self, model_output_dict, **kwargs) -> dict: "--device", default="cuda", help="Device to run the model on, e.g., 'cuda', 'xpu', 'npu'." ) def main(config_file, model_path, mode, device): + if model_path: + print("Note: The '--model_path' option is for the model checkpoint only. 
Please configure 'tokenization.tokenizer.pretrained_model_path' directly in the config file.") pipeline = TextGenerationPipeline( config_file, data_parallel=1, From 24e9c1af0a45caaa946e43aa383d8b3538435429 Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Tue, 3 Sep 2024 10:16:00 +0000 Subject: [PATCH 06/15] update README --- projects/Llama/{readme.md => README.md} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename projects/Llama/{readme.md => README.md} (100%) diff --git a/projects/Llama/readme.md b/projects/Llama/README.md similarity index 100% rename from projects/Llama/readme.md rename to projects/Llama/README.md From 032664a65ac76b945e922f603695c9cf59ddb307 Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Tue, 3 Sep 2024 10:25:34 +0000 Subject: [PATCH 07/15] update --- projects/Llama/README.md | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/projects/Llama/README.md b/projects/Llama/README.md index 9adb3d925..8c6dd862d 100644 --- a/projects/Llama/README.md +++ b/projects/Llama/README.md @@ -44,4 +44,17 @@ python projects/Llama/utils/eval_adapter.py - Adjust the parameters in the `projects/Llama/pipeline.py`, and running: ```bash bash tools/infer.sh projects/Llama/pipeline.py 8 -``` \ No newline at end of file +``` + +## npu/xpu example + +- npu +```bash +python projects/Llama/pipeline.py --device=npu --mode=huggingface --config_file=projects/Llama/configs/llama_config_npu.py +``` + +- xpu +```bash +python projects/Llama/pipeline.py --device=xpu --mode=huggingface --config_file=projects/Llama/configs/llama_config_xpu.py +``` + From 6f921cb899f4638e767be5cf3450c28d33f5bfc6 Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Wed, 4 Sep 2024 06:59:02 +0000 Subject: [PATCH 08/15] format --- libai/inference/basic.py | 5 ++++- projects/Llama/configs/llama_config_npu.py | 4 ++-- projects/Llama/configs/llama_config_xpu.py | 4 ++-- projects/Llama/pipeline.py | 16 +++++++++++++--- 4 files changed, 21 insertions(+), 8 deletions(-) diff --git a/libai/inference/basic.py b/libai/inference/basic.py index e923bda99..752f4578a 100644 --- a/libai/inference/basic.py +++ b/libai/inference/basic.py @@ -108,7 +108,10 @@ def update_cfg( ), "cfg.train.dist.pipeline_num_layers must be set when run pipeline parallel" def load_pretrain_weight( - self, libai_cfg_model, model_path, mode="libai", + self, + libai_cfg_model, + model_path, + mode="libai", ): """load pretrained model. 
diff --git a/projects/Llama/configs/llama_config_npu.py b/projects/Llama/configs/llama_config_npu.py index 5bdb059a9..e06ca1bee 100644 --- a/projects/Llama/configs/llama_config_npu.py +++ b/projects/Llama/configs/llama_config_npu.py @@ -49,7 +49,7 @@ eos_token_id=2, pad_token_id=0, # train - #pretrained_model_path="meta-llama/Llama-2-7b-hf", + # pretrained_model_path="meta-llama/Llama-2-7b-hf", pretrained_model_path="/data0/hf_models/Llama-2-7b-chat-hf", ) @@ -59,6 +59,6 @@ tokenization = OmegaConf.create() tokenization.make_vocab_size_divisible_by = 1 tokenization.tokenizer = LazyCall(LlamaTokenizer)( - #pretrained_model_path="meta-llama/Llama-2-7b-hf/tokenizer.model" + # pretrained_model_path="meta-llama/Llama-2-7b-hf/tokenizer.model" pretrained_model_path="/data0/hf_models/Llama-2-7b-chat-hf/tokenizer.model" ) diff --git a/projects/Llama/configs/llama_config_xpu.py b/projects/Llama/configs/llama_config_xpu.py index 48f103293..0f9fa66c2 100644 --- a/projects/Llama/configs/llama_config_xpu.py +++ b/projects/Llama/configs/llama_config_xpu.py @@ -49,7 +49,7 @@ eos_token_id=2, pad_token_id=0, # train - #pretrained_model_path="meta-llama/Llama-2-7b-hf", + # pretrained_model_path="meta-llama/Llama-2-7b-hf", pretrained_model_path="/root/models/Llama-2-7b-chat-hf", ) @@ -59,6 +59,6 @@ tokenization = OmegaConf.create() tokenization.make_vocab_size_divisible_by = 1 tokenization.tokenizer = LazyCall(LlamaTokenizer)( - #pretrained_model_path="meta-llama/Llama-2-7b-hf/tokenizer.model" + # pretrained_model_path="meta-llama/Llama-2-7b-hf/tokenizer.model" pretrained_model_path="/root/models/Llama-2-7b-chat-hf/tokenizer.model" ) diff --git a/projects/Llama/pipeline.py b/projects/Llama/pipeline.py index 5c5acc357..f0cd302de 100644 --- a/projects/Llama/pipeline.py +++ b/projects/Llama/pipeline.py @@ -31,7 +31,11 @@ def load_pretrain_weight(self, libai_cfg_model, model_path, mode="huggingface"): if mode == "huggingface": from projects.Llama.utils.llama_loader import LlamaLoaderHuggerFace - model_loader = LlamaLoaderHuggerFace(libai_cfg_model, libai_cfg_model.cfg, model_path,) + model_loader = LlamaLoaderHuggerFace( + libai_cfg_model, + libai_cfg_model.cfg, + model_path, + ) model = model_loader.load() model.eval() return model @@ -39,7 +43,11 @@ def load_pretrain_weight(self, libai_cfg_model, model_path, mode="huggingface"): elif mode == "libai": from projects.Llama.utils.llama_loader import LlamaLoaderLiBai - model_loader = LlamaLoaderLiBai(libai_cfg_model, libai_cfg_model.cfg, model_path,) + model_loader = LlamaLoaderLiBai( + libai_cfg_model, + libai_cfg_model.cfg, + model_path, + ) model = model_loader.load() model.eval() return model @@ -97,7 +105,9 @@ def postprocess(self, model_output_dict, **kwargs) -> dict: ) def main(config_file, model_path, mode, device): if model_path: - print("Note: The '--model_path' option is for the model checkpoint only. Please configure 'tokenization.tokenizer.pretrained_model_path' directly in the config file.") + print( + "Note: The '--model_path' option is for the model checkpoint only. Please configure 'tokenization.tokenizer.pretrained_model_path' directly in the config file." 
+ ) pipeline = TextGenerationPipeline( config_file, data_parallel=1, From a238a4b78063bee3e0e9929060f93113740d1243 Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Wed, 4 Sep 2024 07:07:40 +0000 Subject: [PATCH 09/15] format --- projects/Llama/pipeline.py | 1 + 1 file changed, 1 insertion(+) diff --git a/projects/Llama/pipeline.py b/projects/Llama/pipeline.py index f0cd302de..2374b0ee4 100644 --- a/projects/Llama/pipeline.py +++ b/projects/Llama/pipeline.py @@ -14,6 +14,7 @@ # limitations under the License. import click + from libai.inference.basic import BasePipeline from libai.utils import distributed as dist From d4bd6dbcfd7632ea4933ede833ab802d60f7961d Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Wed, 4 Sep 2024 07:17:41 +0000 Subject: [PATCH 10/15] fix --- projects/Llama/pipeline.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/projects/Llama/pipeline.py b/projects/Llama/pipeline.py index 2374b0ee4..3014f6d40 100644 --- a/projects/Llama/pipeline.py +++ b/projects/Llama/pipeline.py @@ -107,7 +107,9 @@ def postprocess(self, model_output_dict, **kwargs) -> dict: def main(config_file, model_path, mode, device): if model_path: print( - "Note: The '--model_path' option is for the model checkpoint only. Please configure 'tokenization.tokenizer.pretrained_model_path' directly in the config file." + "Note: The '--model_path' option is for the model checkpoint only. " + "Please configure 'tokenization.tokenizer.pretrained_model_path' " + "directly in the config file." ) pipeline = TextGenerationPipeline( config_file, From 593937f553e0181c6ac5c65d57450c116c2e6901 Mon Sep 17 00:00:00 2001 From: Qunhong Zeng <871206929@qq.com> Date: Wed, 4 Sep 2024 16:41:18 +0800 Subject: [PATCH 11/15] feat: support third-party oneflow device extension (#549) * feat: support third-party device oneflow extentions also, refactor the build process of model and tokenizer using pretrained_model_path cofnig * refactor: remove unnecessary config and warnings * docs: update readme for commands to run llama on npu and xpu --- libai/inference/basic.py | 20 ++++++- libai/utils/distributed.py | 16 ++++++ projects/Llama/README.md | 4 +- projects/Llama/configs/llama_config.py | 2 +- projects/Llama/configs/llama_config_npu.py | 64 ---------------------- projects/Llama/configs/llama_config_xpu.py | 64 ---------------------- projects/Llama/pipeline.py | 8 +-- 7 files changed, 38 insertions(+), 140 deletions(-) delete mode 100644 projects/Llama/configs/llama_config_npu.py delete mode 100644 projects/Llama/configs/llama_config_xpu.py diff --git a/libai/inference/basic.py b/libai/inference/basic.py index 752f4578a..4de70d010 100644 --- a/libai/inference/basic.py +++ b/libai/inference/basic.py @@ -16,6 +16,7 @@ import logging from abc import ABCMeta, abstractmethod from typing import Any, Dict +from pathlib import Path import oneflow as flow @@ -62,12 +63,20 @@ def __init__( pipeline_num_layers, ) self.device = device - if device: - self.cfg.train.dist.device_type = device + self.cfg.train.dist.device_type = device dist.setup_dist_util(self.cfg.train.dist) logger.info(self.cfg.train.dist) # initial and load model + self.model_path = model_path + if self.model_path is not None: + # If a model_path is provided in BasePipeline, + # we use it with priority, overwrite the pretrained_model_path in config + self.cfg.model.cfg.pretrained_model_path = self.model_path + else: + # If the model_path in BasePipeline is None, then use the one from the config + assert "pretrained_model_path" in self.cfg.model.cfg + 
self.model_path = self.cfg.model.cfg.pretrained_model_path self.model = self.load_pretrain_weight(self.cfg.model, model_path, mode=mode) self.model._apply(dist.convert_to_distributed_default_setting) @@ -138,6 +147,13 @@ def load_pretrain_weight( def build_tokenizer(self, cfg): tokenizer = None if try_get_key(cfg, "tokenization") is not None: + tokenizer_cfg = cfg.tokenization.tokenizer + if "pretrained_model_path" not in tokenizer_cfg: + # If "pretrained_model_path" does not exist in the tokenizer's config, + # set it to default as f"{model_path}/tokenizer.model" + tokenizer_cfg.pretrained_model_path = str( + Path(self.model_path).joinpath("tokenizer.model") + ) tokenizer = DefaultTrainer.build_tokenizer(cfg) return tokenizer diff --git a/libai/utils/distributed.py b/libai/utils/distributed.py index f84313fd7..5fab501de 100644 --- a/libai/utils/distributed.py +++ b/libai/utils/distributed.py @@ -72,6 +72,22 @@ def _init_distributed_env(self, cfg): # Add set device type self._device_type = try_get_key(cfg, "device_type", default="cuda") + if self._device_type == "npu": + try: + import oneflow_npu + except ImportError: + raise ImportError( + "The module 'oneflow_npu' is not installed. Please install it to use NPU devices." + ) + elif self._device_type == "xpu": + try: + import oneflow_xpu + except ImportError: + raise ImportError( + "The module 'oneflow_xpu' is not installed. Please install it to use XPU devices." + ) + elif self._device_type not in ("cuda", "npu", "xpu", "cpu"): + raise NotImplementedError(f"Unsupported device {self._device_type}") def _init_parallel_size(self, cfg): diff --git a/projects/Llama/README.md b/projects/Llama/README.md index 8c6dd862d..f58e416c1 100644 --- a/projects/Llama/README.md +++ b/projects/Llama/README.md @@ -50,11 +50,11 @@ bash tools/infer.sh projects/Llama/pipeline.py 8 - npu ```bash -python projects/Llama/pipeline.py --device=npu --mode=huggingface --config_file=projects/Llama/configs/llama_config_npu.py +python projects/Llama/pipeline.py --device=npu --mode=huggingface --model_path /your/model/path ``` - xpu ```bash -python projects/Llama/pipeline.py --device=xpu --mode=huggingface --config_file=projects/Llama/configs/llama_config_xpu.py +python projects/Llama/pipeline.py --device=xpu --mode=huggingface --model_path /your/model/path ``` diff --git a/projects/Llama/configs/llama_config.py b/projects/Llama/configs/llama_config.py index 01d208016..36f95d126 100644 --- a/projects/Llama/configs/llama_config.py +++ b/projects/Llama/configs/llama_config.py @@ -57,5 +57,5 @@ tokenization = OmegaConf.create() tokenization.make_vocab_size_divisible_by = 1 tokenization.tokenizer = LazyCall(LlamaTokenizer)( - pretrained_model_path="meta-llama/Llama-2-7b-hf/tokenizer.model" + # pretrained_model_path="meta-llama/Llama-2-7b-hf/tokenizer.model" ) diff --git a/projects/Llama/configs/llama_config_npu.py b/projects/Llama/configs/llama_config_npu.py deleted file mode 100644 index e06ca1bee..000000000 --- a/projects/Llama/configs/llama_config_npu.py +++ /dev/null @@ -1,64 +0,0 @@ -from omegaconf import DictConfig, OmegaConf - -from libai.config import LazyCall -from projects.Llama.llama import LlamaForCausalLM -from projects.Llama.tokenizer import LlamaTokenizer -from configs.common.train import train - -import oneflow_npu - -cfg = dict( - # Model - hidden_act="silu", - hidden_size=4096, - initializer_range=0.02, - intermediate_size=11008, - max_position_embeddings=2048, - num_attention_heads=32, - hidden_layers=32, - pretraining_tp=1, - rms_norm_eps=1e-05, - 
rope_scaling=None, - tie_word_embeddings=False, - vocab_size=32000, - use_scaled_init_for_output_weights=False, - scale_mask_softmax_fusion=False, - amp_enabled=True, - # Inference - is_encoder_decoder=False, - max_length=256, - min_length=0, - do_sample=False, - early_stopping=False, - num_beams=1, - num_beam_groups=1, - diversity_penalty=0.0, - temperature=0.9, - top_k=50, - top_p=0.6, - typical_p=1.0, - repetition_penalty=1.0, - length_penalty=1.0, - no_repeat_ngram_size=0, - encoder_no_repeat_ngram_size=0, - num_return_sequences=1, - chunk_size_feed_forward=0, - output_scores=False, - use_cache=True, - bos_token_id=1, - eos_token_id=2, - pad_token_id=0, - # train - # pretrained_model_path="meta-llama/Llama-2-7b-hf", - pretrained_model_path="/data0/hf_models/Llama-2-7b-chat-hf", -) - -cfg = DictConfig(cfg) - -model = LazyCall(LlamaForCausalLM)(cfg=cfg) -tokenization = OmegaConf.create() -tokenization.make_vocab_size_divisible_by = 1 -tokenization.tokenizer = LazyCall(LlamaTokenizer)( - # pretrained_model_path="meta-llama/Llama-2-7b-hf/tokenizer.model" - pretrained_model_path="/data0/hf_models/Llama-2-7b-chat-hf/tokenizer.model" -) diff --git a/projects/Llama/configs/llama_config_xpu.py b/projects/Llama/configs/llama_config_xpu.py deleted file mode 100644 index 0f9fa66c2..000000000 --- a/projects/Llama/configs/llama_config_xpu.py +++ /dev/null @@ -1,64 +0,0 @@ -from omegaconf import DictConfig, OmegaConf - -from libai.config import LazyCall -from projects.Llama.llama import LlamaForCausalLM -from projects.Llama.tokenizer import LlamaTokenizer -from configs.common.train import train - -import oneflow_xpu - -cfg = dict( - # Model - hidden_act="silu", - hidden_size=4096, - initializer_range=0.02, - intermediate_size=11008, - max_position_embeddings=2048, - num_attention_heads=32, - hidden_layers=32, - pretraining_tp=1, - rms_norm_eps=1e-05, - rope_scaling=None, - tie_word_embeddings=False, - vocab_size=32000, - use_scaled_init_for_output_weights=False, - scale_mask_softmax_fusion=False, - amp_enabled=True, - # Inference - is_encoder_decoder=False, - max_length=256, - min_length=0, - do_sample=False, - early_stopping=False, - num_beams=1, - num_beam_groups=1, - diversity_penalty=0.0, - temperature=0.9, - top_k=50, - top_p=0.6, - typical_p=1.0, - repetition_penalty=1.0, - length_penalty=1.0, - no_repeat_ngram_size=0, - encoder_no_repeat_ngram_size=0, - num_return_sequences=1, - chunk_size_feed_forward=0, - output_scores=False, - use_cache=True, - bos_token_id=1, - eos_token_id=2, - pad_token_id=0, - # train - # pretrained_model_path="meta-llama/Llama-2-7b-hf", - pretrained_model_path="/root/models/Llama-2-7b-chat-hf", -) - -cfg = DictConfig(cfg) - -model = LazyCall(LlamaForCausalLM)(cfg=cfg) -tokenization = OmegaConf.create() -tokenization.make_vocab_size_divisible_by = 1 -tokenization.tokenizer = LazyCall(LlamaTokenizer)( - # pretrained_model_path="meta-llama/Llama-2-7b-hf/tokenizer.model" - pretrained_model_path="/root/models/Llama-2-7b-chat-hf/tokenizer.model" -) diff --git a/projects/Llama/pipeline.py b/projects/Llama/pipeline.py index 3014f6d40..4b65d2895 100644 --- a/projects/Llama/pipeline.py +++ b/projects/Llama/pipeline.py @@ -95,7 +95,7 @@ def postprocess(self, model_output_dict, **kwargs) -> dict: default="projects/Llama/configs/llama_config.py", help="Path to the configuration file.", ) -@click.option("--model_path", default="", help="Path to the model checkpoint.") +@click.option("--model_path", default=None, help="Path to the model checkpoint.") @click.option( "--mode", 
default="libai", @@ -105,12 +105,6 @@ def postprocess(self, model_output_dict, **kwargs) -> dict: "--device", default="cuda", help="Device to run the model on, e.g., 'cuda', 'xpu', 'npu'." ) def main(config_file, model_path, mode, device): - if model_path: - print( - "Note: The '--model_path' option is for the model checkpoint only. " - "Please configure 'tokenization.tokenizer.pretrained_model_path' " - "directly in the config file." - ) pipeline = TextGenerationPipeline( config_file, data_parallel=1, From a030a1b0cc8c12827edf458a090bc0c710960859 Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Wed, 4 Sep 2024 08:45:56 +0000 Subject: [PATCH 12/15] fix import order --- libai/inference/basic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libai/inference/basic.py b/libai/inference/basic.py index 4de70d010..b869e56cc 100644 --- a/libai/inference/basic.py +++ b/libai/inference/basic.py @@ -15,8 +15,8 @@ import logging from abc import ABCMeta, abstractmethod -from typing import Any, Dict from pathlib import Path +from typing import Any, Dict import oneflow as flow From 942556f157360116797afc6b80998abf5f1e7fc8 Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Wed, 4 Sep 2024 08:51:51 +0000 Subject: [PATCH 13/15] update --- libai/utils/distributed.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libai/utils/distributed.py b/libai/utils/distributed.py index 5fab501de..7fd9996d0 100644 --- a/libai/utils/distributed.py +++ b/libai/utils/distributed.py @@ -77,14 +77,14 @@ def _init_distributed_env(self, cfg): import oneflow_npu except ImportError: raise ImportError( - "The module 'oneflow_npu' is not installed. Please install it to use NPU devices." + "'oneflow_npu' is missing. Install it to use NPU devices." ) elif self._device_type == "xpu": try: import oneflow_xpu except ImportError: raise ImportError( - "The module 'oneflow_xpu' is not installed. Please install it to use XPU devices." + "'oneflow_xpu' is missing. Install it to use NPU devices." ) elif self._device_type not in ("cuda", "npu", "xpu", "cpu"): raise NotImplementedError(f"Unsupported device {self._device_type}") From 8cfd032816a446b3b2283c43597df7d1417fbae5 Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Wed, 4 Sep 2024 08:54:37 +0000 Subject: [PATCH 14/15] update --- libai/utils/distributed.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/libai/utils/distributed.py b/libai/utils/distributed.py index 7fd9996d0..8b22c8a4c 100644 --- a/libai/utils/distributed.py +++ b/libai/utils/distributed.py @@ -76,16 +76,12 @@ def _init_distributed_env(self, cfg): try: import oneflow_npu except ImportError: - raise ImportError( - "'oneflow_npu' is missing. Install it to use NPU devices." - ) + raise ImportError("'oneflow_npu' is missing. Install it to use NPU devices.") elif self._device_type == "xpu": try: import oneflow_xpu except ImportError: - raise ImportError( - "'oneflow_xpu' is missing. Install it to use NPU devices." - ) + raise ImportError("'oneflow_xpu' is missing. 
Install it to use XPU devices.")
         elif self._device_type not in ("cuda", "npu", "xpu", "cpu"):
             raise NotImplementedError(f"Unsupported device {self._device_type}")

From 9aa8b065e2000c15ce1e1919dda7cf0de1da4acc Mon Sep 17 00:00:00 2001
From: 0x404 <871206929@qq.com>
Date: Wed, 4 Sep 2024 09:20:11 +0000
Subject: [PATCH 15/15] fix: skip lint on oneflow third-party imports

---
 libai/utils/distributed.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libai/utils/distributed.py b/libai/utils/distributed.py
index 8b22c8a4c..f64479210 100644
--- a/libai/utils/distributed.py
+++ b/libai/utils/distributed.py
@@ -74,12 +74,12 @@ def _init_distributed_env(self, cfg):
         self._device_type = try_get_key(cfg, "device_type", default="cuda")
         if self._device_type == "npu":
             try:
-                import oneflow_npu
+                import oneflow_npu  # noqa: F401
             except ImportError:
                 raise ImportError("'oneflow_npu' is missing. Install it to use NPU devices.")
         elif self._device_type == "xpu":
             try:
-                import oneflow_xpu
+                import oneflow_xpu  # noqa: F401
             except ImportError:
                 raise ImportError("'oneflow_xpu' is missing. Install it to use XPU devices.")
         elif self._device_type not in ("cuda", "npu", "xpu", "cpu"):
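
Usage reference for the series: besides the click CLI shown in the README ("python projects/Llama/pipeline.py --device=npu ..."), the updated `TextGenerationPipeline` can be driven programmatically. The sketch below mirrors the `main()` added to `projects/Llama/pipeline.py`; it is not part of the patches above, and the checkpoint path and prompt are placeholders. `device` may be "cuda", "npu", or "xpu", provided the matching extension (`oneflow_npu` / `oneflow_xpu`) is installed.

```python
# Minimal sketch, assuming a local Llama-2 checkpoint directory (placeholder path).
from libai.utils import distributed as dist
from projects.Llama.pipeline import TextGenerationPipeline

pipeline = TextGenerationPipeline(
    "projects/Llama/configs/llama_config.py",
    data_parallel=1,
    tensor_parallel=1,
    pipeline_parallel=1,
    pipeline_num_layers=32,
    model_path="/path/to/Llama-2-7b-chat-hf",  # placeholder checkpoint directory
    mode="huggingface",
    device="xpu",  # or "cuda" / "npu", matching the installed oneflow extension
)

text = ["Give three tips for staying healthy."]  # placeholder prompt
output = pipeline(inputs=text)
if dist.is_main_process():
    print(output)
```

With `device` set, `BasePipeline` overrides `cfg.train.dist.device_type` before `dist.setup_dist_util` runs, so the same config file works across backends; if `model_path` is omitted, the `pretrained_model_path` from the config is used instead.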