From 24f1c2de0e1ec94c521571d4c9b684f5f1cc67dc Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Wed, 18 Sep 2024 05:56:44 +0000 Subject: [PATCH 01/16] update --- projects/Qwen/pipeline.py | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/projects/Qwen/pipeline.py b/projects/Qwen/pipeline.py index f9628df4b..f192780a9 100644 --- a/projects/Qwen/pipeline.py +++ b/projects/Qwen/pipeline.py @@ -87,16 +87,31 @@ def postprocess(self, model_output_dict, **kwargs) -> dict: return records -if __name__ == "__main__": - # ----- load huggingface checkpoint ----- +@click.command() +@click.option( + "--config_file", + default="projects/Qwen/config/qwen_config.py", + help="Path to the configuration file.", +) +@click.option("--model_path", default=None, help="Path to the model checkpoint.") +@click.option( + "--mode", + default="libai", + help="Mode for the dataloader pipeline, e.g., 'libai' or 'huggingface'.", +) +@click.option( + "--device", default="cuda", help="Device to run the model on, e.g., 'cuda', 'xpu', 'npu'." +) +def main(config_file, model_path, mode, device): pipeline = TextGenerationPipeline( - "projects/Qwen/config/qwen_config.py", + config_file, data_parallel=1, tensor_parallel=1, pipeline_parallel=1, pipeline_num_layers=32, - model_path="/data/home/xiezipeng/hf_models/Qwen/Qwen1.5-7B", - mode="huggingface", + model_path=model_path, + mode=mode, + device=device, ) text = ["给出3点关于保持身体健康的意见。"] @@ -104,3 +119,6 @@ def postprocess(self, model_output_dict, **kwargs) -> dict: output = pipeline(inputs=text) if dist.is_main_process(): print(output) + +if __name__ == "__main__": + main() From 3703ff6ad408ea5c2b8640962623e611717d3bd4 Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Wed, 18 Sep 2024 06:54:40 +0000 Subject: [PATCH 02/16] format --- libai/tokenizer/tokenization_base.py | 6 +++- projects/Qwen/config/qwen_config.py | 9 +++-- projects/Qwen/config/qwen_sft.py | 22 +++++------- projects/Qwen/pipeline.py | 26 +++++++++++++- projects/Qwen/tokenizer.py | 26 ++++++++------ projects/Qwen/utils/data_process.py | 51 +++++++++++----------------- projects/Qwen/utils/qwen2_loader.py | 2 +- 7 files changed, 78 insertions(+), 64 deletions(-) diff --git a/libai/tokenizer/tokenization_base.py b/libai/tokenizer/tokenization_base.py index e5e5f121d..18aaef8e6 100644 --- a/libai/tokenizer/tokenization_base.py +++ b/libai/tokenizer/tokenization_base.py @@ -827,7 +827,11 @@ def encode(self, text, return_tensors=None, is_global=False, device="cuda", **kw self.build_inputs_with_special_tokens(token_ids) for token_ids in token_ids_list ] token_ids_list = self.convert_to_tensors( - token_ids_list, return_tensors=return_tensors, is_global=is_global, **kwargs + token_ids_list, + return_tensors=return_tensors, + is_global=is_global, + device=device, + **kwargs, ) return token_ids_list elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int): diff --git a/projects/Qwen/config/qwen_config.py b/projects/Qwen/config/qwen_config.py index 20381a5fd..740d0adec 100644 --- a/projects/Qwen/config/qwen_config.py +++ b/projects/Qwen/config/qwen_config.py @@ -1,10 +1,9 @@ from omegaconf import DictConfig, OmegaConf +from configs.common.train import train from libai.config import LazyCall from projects.Qwen.qwen2 import Qwen2ForCausalLM from projects.Qwen.tokenizer import Qwen2Tokenizer -from configs.common.train import train - cfg = dict( # Model @@ -49,7 +48,7 @@ eos_token_id=151645, pad_token_id=151643, # train - 
pretrained_model_path="/data/home/xiezipeng/hf_models/Qwen/Qwen1.5-7B", + pretrained_model_path="/root/models/Qwen1.5-7B-Chat", ) cfg = DictConfig(cfg) @@ -58,6 +57,6 @@ tokenization = OmegaConf.create() tokenization.make_vocab_size_divisible_by = 1 tokenization.tokenizer = LazyCall(Qwen2Tokenizer)( - vocab_file="/data/home/xiezipeng/hf_models/Qwen/Qwen1.5-7B/vocab.json", - merges_file="/data/home/xiezipeng/hf_models/Qwen/Qwen1.5-7B/merges.txt", + # vocab_file="/root/models/Qwen1.5-7B/vocab.json", + # merges_file="/root/models/Qwen/Qwen1.5-7B/merges.txt", ) diff --git a/projects/Qwen/config/qwen_sft.py b/projects/Qwen/config/qwen_sft.py index 93ccf5ca9..028291065 100644 --- a/projects/Qwen/config/qwen_sft.py +++ b/projects/Qwen/config/qwen_sft.py @@ -1,20 +1,18 @@ import os + from omegaconf import OmegaConf +from configs.common.models.graph import graph +from configs.common.optim import optim +from configs.common.train import train from libai.config import LazyCall +from libai.data.build import build_nlp_test_loader, build_nlp_train_loader from libai.evaluation import PPLEvaluator from libai.scheduler import WarmupExponentialLR -from libai.data.build import build_nlp_test_loader, build_nlp_train_loader - -from configs.common.train import train -from configs.common.models.graph import graph -from configs.common.optim import optim - from projects.Qwen.config.qwen_config import cfg -from projects.Qwen.utils.qwen_dataset import QwenDataset -from projects.Qwen.tokenizer import Qwen2Tokenizer from projects.Qwen.qwen2 import Qwen2ForCausalLM - +from projects.Qwen.tokenizer import Qwen2Tokenizer +from projects.Qwen.utils.qwen_dataset import QwenDataset # Hyperparameters weight_decay = 0.1 @@ -46,11 +44,7 @@ # datasets dataloader = OmegaConf.create() dataloader.train = LazyCall(build_nlp_train_loader)( - dataset=[ - LazyCall(QwenDataset)( - path=dataset_path, tokenizer=tokenization.tokenizer - ) - ], + dataset=[LazyCall(QwenDataset)(path=dataset_path, tokenizer=tokenization.tokenizer)], ) train.update( diff --git a/projects/Qwen/pipeline.py b/projects/Qwen/pipeline.py index f192780a9..02e8ecc3d 100644 --- a/projects/Qwen/pipeline.py +++ b/projects/Qwen/pipeline.py @@ -13,6 +13,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from pathlib import Path + +import click + +from libai.config import try_get_key +from libai.engine import DefaultTrainer from libai.inference.basic import BasePipeline from libai.utils import distributed as dist @@ -67,7 +73,9 @@ def _parse_parameters(self, **pipeline_parameters): def preprocess(self, inputs, **kwargs) -> dict: # tokenizer encoderW - inputs = self.tokenizer.encode(inputs, return_tensors='of', is_global=True) + inputs = self.tokenizer.encode( + inputs, return_tensors="of", is_global=True, device=self.device + ) inputs = { "input_ids": inputs, } @@ -86,6 +94,21 @@ def postprocess(self, model_output_dict, **kwargs) -> dict: ] return records + def build_tokenizer(self, cfg): + tokenizer = None + if try_get_key(cfg, "tokenization") is not None: + tokenizer_cfg = cfg.tokenization.tokenizer + if "vocab_file" not in tokenizer_cfg: + # If "vocab_file" does not exist in the tokenizer's config, + # set it to default as f"{model_path}/vocab.json" + tokenizer_cfg.vocab_file = str(Path(self.model_path).joinpath("vocab.json")) + if "merges_file" not in tokenizer_cfg: + # If "merges_file" does not exist in the tokenizer's config, + # set it to default as f"{model_path}/merges.txt" + tokenizer_cfg.merges_file = str(Path(self.model_path).joinpath("merges.txt")) + tokenizer = DefaultTrainer.build_tokenizer(cfg) + return tokenizer + @click.command() @click.option( @@ -120,5 +143,6 @@ def main(config_file, model_path, mode, device): if dist.is_main_process(): print(output) + if __name__ == "__main__": main() diff --git a/projects/Qwen/tokenizer.py b/projects/Qwen/tokenizer.py index ba6e3eb05..acbb22f27 100644 --- a/projects/Qwen/tokenizer.py +++ b/projects/Qwen/tokenizer.py @@ -22,8 +22,8 @@ from io import open from typing import List, Optional -import regex as re import oneflow as flow +import regex as re import libai.utils.distributed as dist from libai.tokenizer.tokenization_base import PreTrainedTokenizer @@ -36,8 +36,12 @@ } PRETRAINED_VOCAB_FILES_MAP = { - "vocab_file": {"qwen/qwen-tokenizer": "https://huggingface.co/qwen/qwen-tokenizer/resolve/main/vocab.json"}, - "merges_file": {"qwen/qwen-tokenizer": "https://huggingface.co/qwen/qwen-tokenizer/resolve/main/merges.txt"}, + "vocab_file": { + "qwen/qwen-tokenizer": "https://huggingface.co/qwen/qwen-tokenizer/resolve/main/vocab.json" + }, + "merges_file": { + "qwen/qwen-tokenizer": "https://huggingface.co/qwen/qwen-tokenizer/resolve/main/merges.txt" + }, } MAX_MODEL_INPUT_SIZES = {"qwen/qwen-tokenizer": 32768} @@ -48,14 +52,16 @@ @lru_cache() def bytes_to_unicode(): bs = ( - list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1)) + list(range(ord("!"), ord("~") + 1)) + + list(range(ord("¡"), ord("¬") + 1)) + + list(range(ord("®"), ord("ÿ") + 1)) ) cs = bs[:] n = 0 - for b in range(2**8): + for b in range(2 ** 8): if b not in bs: bs.append(b) - cs.append(2**8 + n) + cs.append(2 ** 8 + n) n += 1 cs = [chr(n) for n in cs] return dict(zip(bs, cs)) @@ -113,11 +119,11 @@ def __init__( self.pat = re.compile(PRETOKENIZE_REGEX) super(Qwen2Tokenizer, self).__init__( - bos_token=bos_token, - eos_token=eos_token, - unk_token=unk_token, + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, pad_token=pad_token, - **kwargs + **kwargs, ) @property diff --git a/projects/Qwen/utils/data_process.py b/projects/Qwen/utils/data_process.py index 4c9d1946f..1f3755156 100644 --- a/projects/Qwen/utils/data_process.py +++ b/projects/Qwen/utils/data_process.py @@ -1,21 +1,23 @@ -import os 
import json -from tqdm import tqdm +import os import random import oneflow as flow - +from tqdm import tqdm IGNORE_TOKEN_ID = -100 data = { - 'id': 'i6IyJda_0', - 'conversations': [ - {'from': 'human', 'value': 'How to tell if a customer segment is well segmented? In 3 bullet points.'}, - {'from': 'gpt', 'value': '1. Homogeneity \n2. Distinctiveness \n3. Stability'}, - {'from': 'human', 'value': 'Thank you'}, - {'from': 'gpt', 'value': 'you are welcome'}, - ] + "id": "i6IyJda_0", + "conversations": [ + { + "from": "human", + "value": "How to tell if a customer segment is well segmented? In 3 bullet points.", + }, + {"from": "gpt", "value": "1. Homogeneity \n2. Distinctiveness \n3. Stability"}, + {"from": "human", "value": "Thank you"}, + {"from": "gpt", "value": "you are welcome"}, + ], } @@ -41,17 +43,9 @@ def qwen2_data_process( source = source[1:] input_id, target = [], [] - system = ( - [im_start] - + _system - + tokenizer(system_message).input_ids - + [im_end] - + nl_tokens - ) + system = [im_start] + _system + tokenizer(system_message).input_ids + [im_end] + nl_tokens input_id += system - target += ( - [im_start] + [IGNORE_TOKEN_ID] * (len(system) - 3) + [im_end] + nl_tokens - ) + target += [im_start] + [IGNORE_TOKEN_ID] * (len(system) - 3) + [im_end] + nl_tokens assert len(input_id) == len(target) for j, sentence in enumerate(source): role = roles[sentence["from"]] @@ -65,10 +59,7 @@ def qwen2_data_process( input_id += _input_id if role == "<|im_start|>user": _target = ( - [im_start] - + [IGNORE_TOKEN_ID] * (len(_input_id) - 3) - + [im_end] - + nl_tokens + [im_start] + [IGNORE_TOKEN_ID] * (len(_input_id) - 3) + [im_end] + nl_tokens ) elif role == "<|im_start|>assistant": _target = ( @@ -109,21 +100,17 @@ def preprocess(input_file, targe_file, shuffle=False, tokenizer=None): if __name__ == "__main__": - + from projects.mock_transformers.mock_tokenization import Qwen2Tokenizer input_file = "/data/home/xiezipeng/libai/projects/Qwen/subset.json" target_file = "/data/home/xiezipeng/libai/projects/Qwen" model_file = "/data/home/xiezipeng/hf_models/Qwen/Qwen1.5-7B" - + tokenizer = Qwen2Tokenizer.from_pretrained(model_file) tokenizer.model_max_length = 2048 - preprocess( - input_file=input_file, - targe_file=target_file, - tokenizer=tokenizer - ) + preprocess(input_file=input_file, targe_file=target_file, tokenizer=tokenizer) # res = qwen2_data_process([data["conversations"]], tokenizer) # input_ids = res["input_ids"] @@ -136,6 +123,6 @@ def preprocess(input_file, targe_file, shuffle=False, tokenizer=None): # labels = labels[0] # labels[labels==IGNORE_TOKEN_ID] = 151643 - + # print("input text:\n",tokenizer.decode(input_ids[0].tolist())) # print("labels text: \n",tokenizer.decode(labels.tolist())) diff --git a/projects/Qwen/utils/qwen2_loader.py b/projects/Qwen/utils/qwen2_loader.py index 75cf970c9..8668bdca5 100644 --- a/projects/Qwen/utils/qwen2_loader.py +++ b/projects/Qwen/utils/qwen2_loader.py @@ -63,7 +63,7 @@ def _convert_state_dict(self, flow_state_dict, cfg): oneflow_state_dict.pop(query_w) oneflow_state_dict.pop(key_w) oneflow_state_dict.pop(value_w) - + query_b = old_key_qkv_b.format(layer_idx, "q_proj") key_b = old_key_qkv_b.format(layer_idx, "k_proj") value_b = old_key_qkv_b.format(layer_idx, "v_proj") From 476a6778c52e630cc5016c166b23312ba5aacc74 Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Wed, 18 Sep 2024 06:59:45 +0000 Subject: [PATCH 03/16] readme --- projects/Qwen/README.md | 97 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 
100644 projects/Qwen/README.md

diff --git a/projects/Qwen/README.md b/projects/Qwen/README.md
new file mode 100644
index 000000000..b84656930
--- /dev/null
+++ b/projects/Qwen/README.md
@@ -0,0 +1,97 @@
+
+### Inference
+
+- cuda TODO
+
+```bash
+python projects/Qwen/pipeline.py --model_path=/root/models/Aquila-7B --mode=huggingface
+```
+
+- npu TODO
+
+```bash
+python projects/Qwen/pipeline.py --model_path=/data0/hf_models/Qwen1.5-7B-Chat --mode=huggingface --device=npu
+```
+
+- xpu
+
+
+```bash
+python projects/Qwen/pipeline.py --model_path=/root/models/Qwen1.5-7B-Chat --mode=huggingface --device=xpu
+```
+
+Current error:
+```bash
+Traceback (most recent call last):
+  File "projects/Qwen/pipeline.py", line 144, in <module>
+
+  File "/root/miniconda/envs/python38_torch201_cuda/lib/python3.8/site-packages/click/core.py", line 1126, in __call__
+    return self.main(*args, **kwargs)
+  File "/root/miniconda/envs/python38_torch201_cuda/lib/python3.8/site-packages/click/core.py", line 1051, in main
+    rv = self.invoke(ctx)
+  File "/root/miniconda/envs/python38_torch201_cuda/lib/python3.8/site-packages/click/core.py", line 1393, in invoke
+    return ctx.invoke(self.callback, **ctx.params)
+  File "/root/miniconda/envs/python38_torch201_cuda/lib/python3.8/site-packages/click/core.py", line 752, in invoke
+    return __callback(*args, **kwargs)
+  File "projects/Qwen/pipeline.py", line 139, in main
+    text = ["给出3点关于保持身体健康的意见。"]
+  File "/workspace/git-repos/libai/libai/inference/basic.py", line 180, in __call__
+    outputs_dict = self.postprocess(model_outputs_dict, **postprocess_params)
+  File "projects/Qwen/pipeline.py", line 88, in postprocess
+    def postprocess(self, model_output_dict, **kwargs) -> dict:
+  File "projects/Qwen/pipeline.py", line 89, in <listcomp>
+    return_ids = model_output_dict["return_ids"]
+  File "/workspace/git-repos/libai/projects/Qwen/tokenizer.py", line 202, in decode
+    token_ids,
+  File "/workspace/git-repos/libai/libai/tokenizer/tokenization_base.py", line 930, in decode
+    sub_texts.append(token)
+  File "/workspace/git-repos/libai/projects/Qwen/tokenizer.py", line 190, in convert_tokens_to_string
+    def _convert_id_to_token(self, index):
+TypeError: sequence item 85: expected str instance, NoneType found
+```
+
+### Training TODO
+
+- data preparation
+
+```bash
+python projects/Aquila/utils/data_prepare.py
+```
+
+- cuda passes
+
+```bash
+export NUM_GPUS=4
+python3 -m oneflow.distributed.launch \
+    --nproc_per_node ${NUM_GPUS} \
+    --nnodes 1 \
+    --node_rank 0 \
+    --master_addr 127.0.0.1 \
+    --master_port 12345 \
+    tools/train_net.py --config-file=projects/Aquila/configs/aquila_sft.py \
+    graph.enabled=True \
+    train.input_placement_device="cuda" \
+    train.dist.device_type="cuda" \
+    train.dist.pipeline_parallel_size=${NUM_GPUS}
+```
+
+- xpu passes iter 0, then OOM
+
+```bash
+export NUM_GPUS=1
+python3 -m oneflow.distributed.launch \
+    --nproc_per_node ${NUM_GPUS} \
+    --nnodes 1 \
+    --node_rank 0 \
+    --master_addr 127.0.0.1 \
+    --master_port 12345 \
+    tools/train_net.py --config-file=projects/Aquila/configs/aquila_sft.py \
+    graph.enabled=True \
+    train.input_placement_device="xpu" \
+    train.dist.device_type="xpu" \
+    train.dist.pipeline_parallel_size=${NUM_GPUS}
+```
+
+- npu not tested, probably won't work
+
+
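Everything in the README above goes through the click CLI that PATCH 01 added to `projects/Qwen/pipeline.py`. For readers who prefer Python over the shell, here is a minimal programmatic sketch of the same flow, using only the constructor arguments visible in PATCH 01; the model path is a placeholder:

```python
# Sketch: programmatic equivalent of the README commands, based on PATCH 01.
# The model path is a placeholder; point it at a local Qwen1.5 checkpoint.
from libai.utils import distributed as dist
from projects.Qwen.pipeline import TextGenerationPipeline

pipeline = TextGenerationPipeline(
    "projects/Qwen/config/qwen_config.py",  # default config path at this point in the series
    data_parallel=1,
    tensor_parallel=1,
    pipeline_parallel=1,
    pipeline_num_layers=32,
    model_path="/path/to/Qwen1.5-7B-Chat",  # placeholder
    mode="huggingface",
    device="cuda",  # or "xpu" / "npu", matching the --device flag
)

output = pipeline(inputs=["给出3点关于保持身体健康的意见。"])
if dist.is_main_process():
    print(output)
```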
From b68f0053136a85b2d124da57ed34a5090d4d81bd Mon Sep 17 00:00:00 2001
From: ShawnXuan
Date: Wed, 18 Sep 2024 07:15:54 +0000
Subject: [PATCH 04/16] update README

---
 projects/Qwen/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/projects/Qwen/README.md b/projects/Qwen/README.md
index b84656930..70e8842a9 100644
--- a/projects/Qwen/README.md
+++ b/projects/Qwen/README.md
@@ -4,13 +4,13 @@
 - cuda TODO
 
 ```bash
-python projects/Qwen/pipeline.py --model_path=/root/models/Aquila-7B --mode=huggingface
+python projects/Qwen/pipeline.py --model_path=/root/models/Qwen1.5-7B-Chat --mode=huggingface
 ```
 
 - npu TODO
 
 ```bash
-python projects/Qwen/pipeline.py --model_path=/data0/hf_models/Qwen1.5-7B-Chat --mode=huggingface --device=npu
+python projects/Qwen/pipeline.py --model_path=/data0/hf_models/qwen2/Qwen1.5-7B-Chat --mode=huggingface --device=npu
 ```
 
 - xpu

From 3a5978e252087d74e62350eb3b18dfbd8a1374ad Mon Sep 17 00:00:00 2001
From: ShawnXuan
Date: Thu, 19 Sep 2024 02:24:53 +0000
Subject: [PATCH 05/16] fix

---
 projects/Qwen/pipeline.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/projects/Qwen/pipeline.py b/projects/Qwen/pipeline.py
index 02e8ecc3d..48ba68127 100644
--- a/projects/Qwen/pipeline.py
+++ b/projects/Qwen/pipeline.py
@@ -73,9 +73,9 @@ def _parse_parameters(self, **pipeline_parameters):
 
     def preprocess(self, inputs, **kwargs) -> dict:
         # tokenizer encoderW
-        inputs = self.tokenizer.encode(
-            inputs, return_tensors="of", is_global=True, device=self.device
-        )
+        import oneflow as flow
+        inputs = flow.tensor(self.tokenizer.encode(inputs, add_bos=True, padding=True))
+
         inputs = {
             "input_ids": inputs,
         }
@@ -83,7 +83,8 @@ def preprocess(self, inputs, **kwargs) -> dict:
         return inputs
 
     def forward(self, inputs, **kwargs) -> dict:
-        outputs = self.model.generate(inputs["input_ids"], max_length=100, **kwargs)
+        inputs = dist.convert_to_distributed_default_setting(inputs["input_ids"])
+        outputs = self.model.generate(inputs, max_length=50, **kwargs)
         return {"return_ids": outputs}

From 45531e84f8bd0f04cad134ff194b1f76b98801f9 Mon Sep 17 00:00:00 2001
From: ShawnXuan
Date: Thu, 19 Sep 2024 02:45:20 +0000
Subject: [PATCH 06/16] update readme

---
 projects/Qwen/README.md | 33 +--------------------------------
 1 file changed, 1 insertion(+), 32 deletions(-)

diff --git a/projects/Qwen/README.md b/projects/Qwen/README.md
index 70e8842a9..2e07fd851 100644
--- a/projects/Qwen/README.md
+++ b/projects/Qwen/README.md
@@ -7,7 +7,7 @@ python projects/Qwen/pipeline.py --model_path=/root/models/Qwen1.5-7B-Chat --mode=huggingface
 ```
 
-- npu TODO
+- npu
 
 ```bash
 python projects/Qwen/pipeline.py --model_path=/data0/hf_models/qwen2/Qwen1.5-7B-Chat --mode=huggingface --device=npu
 ```
@@ -15,41 +15,10 @@ python projects/Qwen/pipeline.py --model_path=/data0/hf_models/qwen2/Qwen1.5-7B-
 
 - xpu
 
-
 ```bash
 python projects/Qwen/pipeline.py --model_path=/root/models/Qwen1.5-7B-Chat --mode=huggingface --device=xpu
 ```
 
-Current error:
-```bash
-Traceback (most recent call last):
-  File "projects/Qwen/pipeline.py", line 144, in <module>
-
-  File "/root/miniconda/envs/python38_torch201_cuda/lib/python3.8/site-packages/click/core.py", line 1126, in __call__
-    return self.main(*args, **kwargs)
-  File "/root/miniconda/envs/python38_torch201_cuda/lib/python3.8/site-packages/click/core.py", line 1051, in main
-    rv = self.invoke(ctx)
-  File "/root/miniconda/envs/python38_torch201_cuda/lib/python3.8/site-packages/click/core.py", line 1393, in invoke
-    return ctx.invoke(self.callback, **ctx.params)
-  File "/root/miniconda/envs/python38_torch201_cuda/lib/python3.8/site-packages/click/core.py", line 752, in invoke
-    return __callback(*args, **kwargs)
-  File "projects/Qwen/pipeline.py", line 139, in main
-    text = ["给出3点关于保持身体健康的意见。"]
"/workspace/git-repos/libai/libai/inference/basic.py", line 180, in __call__ - outputs_dict = self.postprocess(model_outputs_dict, **postprocess_params) - File "projects/Qwen/pipeline.py", line 88, in postprocess - def postprocess(self, model_output_dict, **kwargs) -> dict: - File "projects/Qwen/pipeline.py", line 89, in - return_ids = model_output_dict["return_ids"] - File "/workspace/git-repos/libai/projects/Qwen/tokenizer.py", line 202, in decode - token_ids, - File "/workspace/git-repos/libai/libai/tokenizer/tokenization_base.py", line 930, in decode - sub_texts.append(token) - File "/workspace/git-repos/libai/projects/Qwen/tokenizer.py", line 190, in convert_tokens_to_string - def _convert_id_to_token(self, index): -TypeError: sequence item 85: expected str instance, NoneType found -``` - ### 训练 TODO - data preparation From ad9f49dac6e63361672437b06fe27540c8f02a89 Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Thu, 19 Sep 2024 03:29:37 +0000 Subject: [PATCH 07/16] config -> configs --- projects/Qwen/configs/qwen_config.py | 62 ++++++++++++++++++ projects/Qwen/configs/qwen_sft.py | 97 ++++++++++++++++++++++++++++ 2 files changed, 159 insertions(+) create mode 100644 projects/Qwen/configs/qwen_config.py create mode 100644 projects/Qwen/configs/qwen_sft.py diff --git a/projects/Qwen/configs/qwen_config.py b/projects/Qwen/configs/qwen_config.py new file mode 100644 index 000000000..740d0adec --- /dev/null +++ b/projects/Qwen/configs/qwen_config.py @@ -0,0 +1,62 @@ +from omegaconf import DictConfig, OmegaConf + +from configs.common.train import train +from libai.config import LazyCall +from projects.Qwen.qwen2 import Qwen2ForCausalLM +from projects.Qwen.tokenizer import Qwen2Tokenizer + +cfg = dict( + # Model + vocab_size=151936, + hidden_size=4096, + intermediate_size=22016, + hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=32, + hidden_act="silu", + max_position_embeddings=32768, + initializer_range=0.02, + rms_norm_eps=1e-06, + rope_theta=10000.0, + attention_dropout=0.0, + tie_word_embeddings=False, + use_scaled_init_for_output_weights=False, + scale_mask_softmax_fusion=False, + amp_enabled=True, + # Inference + is_encoder_decoder=False, + max_length=256, + min_length=0, + do_sample=False, + early_stopping=False, + num_beams=1, + num_beam_groups=1, + diversity_penalty=0.0, + temperature=0.7, + top_k=20, + top_p=0.8, + typical_p=1.0, + repetition_penalty=1.05, + length_penalty=1.0, + no_repeat_ngram_size=0, + encoder_no_repeat_ngram_size=0, + num_return_sequences=1, + chunk_size_feed_forward=0, + output_scores=False, + use_cache=True, + bos_token_id=151643, + eos_token_id=151645, + pad_token_id=151643, + # train + pretrained_model_path="/root/models/Qwen1.5-7B-Chat", +) + +cfg = DictConfig(cfg) + +model = LazyCall(Qwen2ForCausalLM)(cfg=cfg) +tokenization = OmegaConf.create() +tokenization.make_vocab_size_divisible_by = 1 +tokenization.tokenizer = LazyCall(Qwen2Tokenizer)( + # vocab_file="/root/models/Qwen1.5-7B/vocab.json", + # merges_file="/root/models/Qwen/Qwen1.5-7B/merges.txt", +) diff --git a/projects/Qwen/configs/qwen_sft.py b/projects/Qwen/configs/qwen_sft.py new file mode 100644 index 000000000..80cdd36be --- /dev/null +++ b/projects/Qwen/configs/qwen_sft.py @@ -0,0 +1,97 @@ +import os + +from omegaconf import OmegaConf + +from configs.common.models.graph import graph +from configs.common.optim import optim +from configs.common.train import train +from libai.config import LazyCall +from libai.data.build import build_nlp_test_loader, build_nlp_train_loader +from 
From ad9f49dac63361672437b06fe27540c8f02a89 Mon Sep 17 00:00:00 2001
From: ShawnXuan
Date: Thu, 19 Sep 2024 03:29:37 +0000
Subject: [PATCH 07/16] config -> configs

---
 projects/Qwen/configs/qwen_config.py | 62 ++++++++++++++++++
 projects/Qwen/configs/qwen_sft.py    | 97 ++++++++++++++++++++++++++++
 2 files changed, 159 insertions(+)
 create mode 100644 projects/Qwen/configs/qwen_config.py
 create mode 100644 projects/Qwen/configs/qwen_sft.py

diff --git a/projects/Qwen/configs/qwen_config.py b/projects/Qwen/configs/qwen_config.py
new file mode 100644
index 000000000..740d0adec
--- /dev/null
+++ b/projects/Qwen/configs/qwen_config.py
@@ -0,0 +1,62 @@
+from omegaconf import DictConfig, OmegaConf
+
+from configs.common.train import train
+from libai.config import LazyCall
+from projects.Qwen.qwen2 import Qwen2ForCausalLM
+from projects.Qwen.tokenizer import Qwen2Tokenizer
+
+cfg = dict(
+    # Model
+    vocab_size=151936,
+    hidden_size=4096,
+    intermediate_size=22016,
+    hidden_layers=32,
+    num_attention_heads=32,
+    num_key_value_heads=32,
+    hidden_act="silu",
+    max_position_embeddings=32768,
+    initializer_range=0.02,
+    rms_norm_eps=1e-06,
+    rope_theta=10000.0,
+    attention_dropout=0.0,
+    tie_word_embeddings=False,
+    use_scaled_init_for_output_weights=False,
+    scale_mask_softmax_fusion=False,
+    amp_enabled=True,
+    # Inference
+    is_encoder_decoder=False,
+    max_length=256,
+    min_length=0,
+    do_sample=False,
+    early_stopping=False,
+    num_beams=1,
+    num_beam_groups=1,
+    diversity_penalty=0.0,
+    temperature=0.7,
+    top_k=20,
+    top_p=0.8,
+    typical_p=1.0,
+    repetition_penalty=1.05,
+    length_penalty=1.0,
+    no_repeat_ngram_size=0,
+    encoder_no_repeat_ngram_size=0,
+    num_return_sequences=1,
+    chunk_size_feed_forward=0,
+    output_scores=False,
+    use_cache=True,
+    bos_token_id=151643,
+    eos_token_id=151645,
+    pad_token_id=151643,
+    # train
+    pretrained_model_path="/root/models/Qwen1.5-7B-Chat",
+)
+
+cfg = DictConfig(cfg)
+
+model = LazyCall(Qwen2ForCausalLM)(cfg=cfg)
+tokenization = OmegaConf.create()
+tokenization.make_vocab_size_divisible_by = 1
+tokenization.tokenizer = LazyCall(Qwen2Tokenizer)(
+    # vocab_file="/root/models/Qwen1.5-7B/vocab.json",
+    # merges_file="/root/models/Qwen/Qwen1.5-7B/merges.txt",
+)
diff --git a/projects/Qwen/configs/qwen_sft.py b/projects/Qwen/configs/qwen_sft.py
new file mode 100644
index 000000000..80cdd36be
--- /dev/null
+++ b/projects/Qwen/configs/qwen_sft.py
@@ -0,0 +1,97 @@
+import os
+
+from omegaconf import OmegaConf
+
+from configs.common.models.graph import graph
+from configs.common.optim import optim
+from configs.common.train import train
+from libai.config import LazyCall
+from libai.data.build import build_nlp_test_loader, build_nlp_train_loader
+from libai.evaluation import PPLEvaluator
+from libai.scheduler import WarmupExponentialLR
+from projects.Qwen.configs.qwen_config import cfg
+from projects.Qwen.qwen2 import Qwen2ForCausalLM
+from projects.Qwen.tokenizer import Qwen2Tokenizer
+from projects.Qwen.qwen_dataset import QwenDataset
+
+# Hyperparameters
+weight_decay = 0.1
+learning_rate = 5e-5
+dataset_path = "./alpaca_data"
+pretrained_model_path = "/root/models/Qwen1.5-7B-Chat"
+
+# graph & optim
+graph["enabled"] = False
+optim.update(
+    dict(
+        lr=learning_rate,
+        weight_decay=weight_decay,
+    )
+)
+
+# tokenize
+tokenization = OmegaConf.create()
+tokenization.make_vocab_size_divisible_by = 1
+tokenization.tokenizer = LazyCall(Qwen2Tokenizer)(
+    vocab_file=pretrained_model_path+"/vocab.json",
+    merges_file=pretrained_model_path+"/merges.txt",
+)
+
+
+# model
+cfg.pretrained_model_path = pretrained_model_path
+model = LazyCall(Qwen2ForCausalLM)(cfg=cfg)
+
+# datasets
+dataloader = OmegaConf.create()
+dataloader.train = LazyCall(build_nlp_train_loader)(
+    dataset=[
+        LazyCall(QwenDataset)(
+            path=os.path.join(dataset_path, "train"), tokenizer=tokenization.tokenizer
+        )
+    ],
+)
+dataloader.test = [
+    LazyCall(build_nlp_test_loader)(
+        dataset=LazyCall(QwenDataset)(
+            path=os.path.join(dataset_path, "test"), tokenizer=tokenization.tokenizer
+        ),
+    ),
+]
+
+train.update(
+    dict(
+        output_dir="./sft_result",
+        train_micro_batch_size=1,
+        test_micro_batch_size=1,
+        train_epoch=3,
+        train_iter=1,
+        log_period=10,
+        warmup_ratio=1 / 3,
+        num_accumulation_steps=8,
+        rdma_enabled=False,
+        amp=dict(enabled=True),
+        activation_checkpoint=dict(enabled=True),
+        checkpointer=dict(
+            period=5000,
+            max_to_keep=20,
+        ),
+        dist=dict(
+            data_parallel_size=1,
+            tensor_parallel_size=1,
+            pipeline_parallel_size=8,
+            pipeline_num_layers=cfg.hidden_layers,
+        ),
+        evaluation=dict(
+            enabled=False,
+            evaluator=LazyCall(PPLEvaluator)(),
+            eval_period=1000,
+            eval_iter=1e5,
+        ),
+        scheduler=LazyCall(WarmupExponentialLR)(
+            warmup_factor=0.0,
+            gamma=1.0,
+            warmup_method="linear",
+        ),
+    )
+)

From 078a1db770c255e7ed99c77b2c879f50c29493ce Mon Sep 17 00:00:00 2001
From: ShawnXuan
Date: Thu, 19 Sep 2024 03:30:53 +0000
Subject: [PATCH 08/16] update sft

---
 projects/Qwen/README.md               |  14 +--
 projects/Qwen/config/qwen_config.py   |  62 -----------
 projects/Qwen/config/qwen_sft.py      |  85 --------------
 projects/Qwen/pipeline.py             |   2 +-
 projects/Qwen/qwen_dataset.py         |  19 ++++
 projects/Qwen/utils/prepare_alpaca.py | 163 ++++++++++++++++++++++++++
 6 files changed, 190 insertions(+), 155 deletions(-)
 delete mode 100644 projects/Qwen/config/qwen_config.py
 delete mode 100644 projects/Qwen/config/qwen_sft.py
 create mode 100644 projects/Qwen/qwen_dataset.py
 create mode 100644 projects/Qwen/utils/prepare_alpaca.py

diff --git a/projects/Qwen/README.md b/projects/Qwen/README.md
index 2e07fd851..aaad3ed31 100644
--- a/projects/Qwen/README.md
+++ b/projects/Qwen/README.md
@@ -19,15 +19,15 @@ python projects/Qwen/pipeline.py --model_path=/root/models/Qwen1.5-7B-Chat --mod
 python projects/Qwen/pipeline.py --model_path=/root/models/Qwen1.5-7B-Chat --mode=huggingface --device=xpu
 ```
 
-### Training TODO
+### Training
 
 - data preparation
 
 ```bash
-python projects/Aquila/utils/data_prepare.py
+python projects/Qwen/utils/prepare_alpaca.py
 ```
 
-- cuda passes
+- cuda TODO
 
 ```bash
 export NUM_GPUS=4
 python3 -m oneflow.distributed.launch \
     --nproc_per_node ${NUM_GPUS} \
     --nnodes 1 \
     --node_rank 0 \
     --master_addr 127.0.0.1 \
     --master_port 12345 \
-    tools/train_net.py --config-file=projects/Aquila/configs/aquila_sft.py \
+    tools/train_net.py --config-file=projects/Qwen/configs/qwen_sft.py \
     graph.enabled=True \
     train.input_placement_device="cuda" \
     train.dist.device_type="cuda" \
     train.dist.pipeline_parallel_size=${NUM_GPUS}
 ```
 
-- xpu passes iter 0, then OOM
+- xpu TODO
 
 ```bash
 export NUM_GPUS=1
 python3 -m oneflow.distributed.launch \
     --nproc_per_node ${NUM_GPUS} \
     --nnodes 1 \
     --node_rank 0 \
     --master_addr 127.0.0.1 \
     --master_port 12345 \
-    tools/train_net.py --config-file=projects/Aquila/configs/aquila_sft.py \
-    graph.enabled=True \
+    tools/train_net.py --config-file=projects/Qwen/configs/qwen_sft.py \
+    graph.enabled=False \
     train.input_placement_device="xpu" \
     train.dist.device_type="xpu" \
     train.dist.pipeline_parallel_size=${NUM_GPUS}
 ```
 
 - npu not tested, probably won't work

diff --git a/projects/Qwen/config/qwen_config.py b/projects/Qwen/config/qwen_config.py
deleted file mode 100644
index 740d0adec..000000000
--- a/projects/Qwen/config/qwen_config.py
+++ /dev/null
@@ -1,62 +0,0 @@
-from omegaconf import DictConfig, OmegaConf
-
-from configs.common.train import train
-from libai.config import LazyCall
-from projects.Qwen.qwen2 import Qwen2ForCausalLM
-from projects.Qwen.tokenizer import Qwen2Tokenizer
-
-cfg = dict(
-    # Model
-    vocab_size=151936,
-    hidden_size=4096,
-    intermediate_size=22016,
-    hidden_layers=32,
-    num_attention_heads=32,
-    num_key_value_heads=32,
-    hidden_act="silu",
-    max_position_embeddings=32768,
-    initializer_range=0.02,
-    rms_norm_eps=1e-06,
-    rope_theta=10000.0,
-    attention_dropout=0.0,
-    tie_word_embeddings=False,
-    use_scaled_init_for_output_weights=False,
-    scale_mask_softmax_fusion=False,
-    amp_enabled=True,
-    # Inference
-    is_encoder_decoder=False,
-    max_length=256,
-    min_length=0,
-    do_sample=False,
-    early_stopping=False,
-    num_beams=1,
-    num_beam_groups=1,
-    diversity_penalty=0.0,
-    temperature=0.7,
-    top_k=20,
-    top_p=0.8,
-    typical_p=1.0,
-    repetition_penalty=1.05,
-    length_penalty=1.0,
-    no_repeat_ngram_size=0,
-    encoder_no_repeat_ngram_size=0,
-    num_return_sequences=1,
-    chunk_size_feed_forward=0,
-    output_scores=False,
-    use_cache=True,
-    bos_token_id=151643,
-    eos_token_id=151645,
-    pad_token_id=151643,
-    # train
-    pretrained_model_path="/root/models/Qwen1.5-7B-Chat",
-)
-
-cfg = DictConfig(cfg)
-
-model = LazyCall(Qwen2ForCausalLM)(cfg=cfg)
-tokenization = OmegaConf.create()
-tokenization.make_vocab_size_divisible_by = 1
-tokenization.tokenizer = LazyCall(Qwen2Tokenizer)(
-    # vocab_file="/root/models/Qwen1.5-7B/vocab.json",
-    # merges_file="/root/models/Qwen/Qwen1.5-7B/merges.txt",
-)
diff --git a/projects/Qwen/config/qwen_sft.py b/projects/Qwen/config/qwen_sft.py
deleted file mode 100644
index 028291065..000000000
--- a/projects/Qwen/config/qwen_sft.py
+++ /dev/null
@@ -1,85 +0,0 @@
-import os
-
-from omegaconf import OmegaConf
-
-from configs.common.models.graph import graph
-from configs.common.optim import optim
-from configs.common.train import train
-from libai.config import LazyCall
-from libai.data.build import build_nlp_test_loader, build_nlp_train_loader
-from libai.evaluation import PPLEvaluator
-from libai.scheduler import WarmupExponentialLR
-from projects.Qwen.config.qwen_config import cfg
-from projects.Qwen.qwen2 import Qwen2ForCausalLM
-from projects.Qwen.tokenizer import Qwen2Tokenizer
-from projects.Qwen.utils.qwen_dataset import QwenDataset
-
-# Hyperparameters
-weight_decay = 0.1
-learning_rate = 5e-5
-dataset_path = "/data/home/xiezipeng/libai/projects/Qwen/train_set"
-pretrained_model_path = "/data/home/xiezipeng/hf_models/Qwen/Qwen1.5-7B"
-
-# graph & optim
-graph["enabled"] = False
-optim.update(
-    dict(
-        lr=learning_rate,
weight_decay=weight_decay, - ) -) - -# tokenize -tokenization = OmegaConf.create() -tokenization.make_vocab_size_divisible_by = 1 -tokenization.tokenizer = LazyCall(Qwen2Tokenizer)( - vocab_file="/data/home/xiezipeng/hf_models/Qwen/Qwen1.5-7B/vocab.json", - merges_file="/data/home/xiezipeng/hf_models/Qwen/Qwen1.5-7B/merges.txt", -) - - -# model -model = LazyCall(Qwen2ForCausalLM)(cfg=cfg) - -# datasets -dataloader = OmegaConf.create() -dataloader.train = LazyCall(build_nlp_train_loader)( - dataset=[LazyCall(QwenDataset)(path=dataset_path, tokenizer=tokenization.tokenizer)], -) - -train.update( - dict( - output_dir="./sft_result", - train_micro_batch_size=1, - test_micro_batch_size=1, - train_epoch=3, - train_iter=1, - log_period=10, - warmup_ratio=1 / 3, - num_accumulation_steps=8, - rdma_enabled=False, - amp=dict(enabled=True), - activation_checkpoint=dict(enabled=True), - checkpointer=dict( - period=5000, - max_to_keep=20, - ), - dist=dict( - data_parallel_size=1, - tensor_parallel_size=1, - pipeline_parallel_size=8, - pipeline_num_layers=cfg.hidden_layers, - ), - evaluation=dict( - enabled=False, - evaluator=LazyCall(PPLEvaluator)(), - eval_period=1000, - eval_iter=1e5, - ), - scheduler=LazyCall(WarmupExponentialLR)( - warmup_factor=0.0, - gamma=1.0, - warmup_method="linear", - ), - ) -) diff --git a/projects/Qwen/pipeline.py b/projects/Qwen/pipeline.py index 48ba68127..413577384 100644 --- a/projects/Qwen/pipeline.py +++ b/projects/Qwen/pipeline.py @@ -114,7 +114,7 @@ def build_tokenizer(self, cfg): @click.command() @click.option( "--config_file", - default="projects/Qwen/config/qwen_config.py", + default="projects/Qwen/configs/qwen_config.py", help="Path to the configuration file.", ) @click.option("--model_path", default=None, help="Path to the model checkpoint.") diff --git a/projects/Qwen/qwen_dataset.py b/projects/Qwen/qwen_dataset.py new file mode 100644 index 000000000..c7c412a01 --- /dev/null +++ b/projects/Qwen/qwen_dataset.py @@ -0,0 +1,19 @@ +import oneflow as flow +from oneflow.utils.data import Dataset + +from libai.data.structures import DistTensorData, Instance + + +class QwenDataset(Dataset): + def __init__(self, path, tokenizer): + self.data = flow.load(path) + self.tokenizer = tokenizer + + def __len__(self): + return len(self.data) + + def __getitem__(self, index): + return Instance( + input_ids=DistTensorData(self.data[index]["input_ids"]), + labels=DistTensorData(self.data[index]["labels"]), + ) diff --git a/projects/Qwen/utils/prepare_alpaca.py b/projects/Qwen/utils/prepare_alpaca.py new file mode 100644 index 000000000..b02f9f472 --- /dev/null +++ b/projects/Qwen/utils/prepare_alpaca.py @@ -0,0 +1,163 @@ +"""Implementation derived from https://github.com/tloen/alpaca-lora""" +import copy +import json +import math +import os +from pathlib import Path +from typing import Optional + +import oneflow as flow +import requests +from oneflow.utils.data import random_split +from tqdm import tqdm + +from libai.config import instantiate +from libai.utils.logger import setup_logger +from projects.Qwen.configs.qwen_config import tokenization + +logger = setup_logger() + + +def prepare( + destination_path: Path = Path("./data/libai_xpu_alpaca"), + checkpoint_dir: Path = Path("/root/models/Qwen1.5-7B-Chat"), + test_split_fraction: float = 0.03865, # to get exactly 2000 test samples, + seed: int = 42, + mask_inputs: bool = False, # as in alpaca-lora + data_file_name: str = "alpaca_data_cleaned_archive.json", + data_file_url: str = 
"https://raw.githubusercontent.com/tloen/alpaca-lora/main/alpaca_data_cleaned_archive.json", # noqa + ignore_index: int = -100, + max_seq_length: Optional[int] = 512, +) -> None: + """Prepare the Alpaca dataset for instruction tuning. + The output is a training and test dataset saved as `train.pt` and `test.pt`, + which stores the preprocessed and tokenized prompts and labels. + """ + if max_seq_length is None: + with open(os.path.join(checkpoint_dir, "config.json"), "r", encoding="utf-8") as file: + config = json.load(file) + max_seq_length = config["max_position_embeddings"] + + destination_path.mkdir(parents=True, exist_ok=True) + data_file_path = destination_path / data_file_name + logger.info("Loading data file...") + download_if_missing(data_file_path, data_file_url) + with open(data_file_path, "r", encoding="utf-8") as file: + data = json.load(file) + + logger.info("Loading tokenizer...") + tokenizer = instantiate(tokenization.tokenizer) + + # Partition the dataset into train and test + num_of_test_samples = math.floor(test_split_fraction * len(data)) + num_of_train_samples = len(data) - num_of_test_samples + train_set, test_set = random_split( + data, + [num_of_train_samples, num_of_test_samples], + generator=flow.Generator().manual_seed(seed), + ) + train_set, test_set = list(train_set), list(test_set) + + logger.info(f"train has {len(train_set):,} samples") + logger.info(f"test has {len(test_set):,} samples") + + logger.info("Processing train split ...") + train_set = [ + prepare_sample( + example=sample, + tokenizer=tokenizer, + max_length=max_seq_length, + ) + for sample in tqdm(train_set) + ] + flow.save(train_set, destination_path / "train") + + logger.info("Processing test split ...") + test_set = [ + prepare_sample( + example=sample, + tokenizer=tokenizer, + max_length=max_seq_length, + ) + for sample in tqdm(test_set) + ] + flow.save(test_set, destination_path / "test") + + max_length = max([i["input_ids"].shape[0] for i in train_set]) + logger.info("Max length of training dataset: {}".format(max_length)) + + +def download_if_missing(file_path: Path, file_url: str) -> None: + """Downloads the raw json data file and saves it in the given destination.""" + if file_path.exists() and file_path.stat().st_size > 0: + return + with open(file_path, "w", encoding="utf-8") as f: + f.write(requests.get(file_url).text) + + +def prepare_sample(example: dict, tokenizer, max_length: int) -> dict: + """Processes a single sample. + Each sample in the dataset consists of: + - instruction: A string describing the task + - input: A string holding a special input value for the instruction. + This only applies to some samples, and in others this is empty. + - output: The response string + This function processes this data to produce a prompt text and a label for + supervised training. The prompt text is formed as a single message including both + the instruction and the input. The label/target is the same message but with the + response attached. + Finally, both the prompt and the label get tokenized. If desired, all tokens + in the label that correspond to the original input prompt get masked out (default). 
+    """
+    full_prompt = generate_prompt(example)
+    full_prompt_and_response = full_prompt + example["output"]
+    
+    prompt = tokenizer.encode(full_prompt, device="cpu")
+    prompt = flow.tensor(prompt, dtype=flow.int, device="cpu")
+    example = tokenizer.encode(
+        full_prompt_and_response, device="cpu"
+    )
+    example = flow.tensor(example, dtype=flow.int, device="cpu")
+
+    padding = max_length - example.shape[0]
+    if padding > 0:
+        example = flow.cat((example, flow.zeros(padding, dtype=flow.long) - 1))
+    elif padding < 0:
+        example = example[:max_length]
+    labels = copy.deepcopy(example)
+    labels[: len(prompt)] = -1
+    example_mask = example.ge(0)
+    label_mask = labels.ge(0)
+    example[~example_mask] = 0
+    labels[~label_mask] = -1
+    example = example[:-1]
+    labels = labels[1:]
+    example_mask = flow.where(
+        example_mask, flow.tensor(0, dtype=flow.float), flow.tensor(-float("inf"))
+    )
+    example_mask = example_mask[:-1]
+    return {
+        "input_ids": example,
+        "labels": labels,
+    }
+
+
+def generate_prompt(example: dict) -> str:
+    """Generates a standardized message to prompt the model with an instruction, optional input and a
+    'response' field."""
+
+    if example["input"]:
+        return (
+            "Below is an instruction that describes a task, paired with an input that provides further context. "  # noqa
+            "Write a response that appropriately completes the request.\n\n"
+            f"### Instruction:\n{example['instruction']}\n\n### Input:\n{example['input']}\n\n### Response:"  # noqa
+        )
+    return (
+        "Below is an instruction that describes a task. "
+        "Write a response that appropriately completes the request.\n\n"
+        f"### Instruction:\n{example['instruction']}\n\n### Response:"
+    )
+
+
+if __name__ == "__main__":
+    prepare()

From 195cdb91d5769eea659c19eba59c5d22199cb7ea Mon Sep 17 00:00:00 2001
From: ShawnXuan
Date: Thu, 19 Sep 2024 03:41:21 +0000
Subject: [PATCH 09/16] update

---
 projects/Qwen/configs/qwen_sft.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/projects/Qwen/configs/qwen_sft.py b/projects/Qwen/configs/qwen_sft.py
index 80cdd36be..95e60d818 100644
--- a/projects/Qwen/configs/qwen_sft.py
+++ b/projects/Qwen/configs/qwen_sft.py
@@ -64,11 +64,11 @@
         output_dir="./sft_result",
         train_micro_batch_size=1,
         test_micro_batch_size=1,
-        train_epoch=3,
+        train_epoch=1,
         train_iter=1,
-        log_period=10,
+        log_period=1,
         warmup_ratio=1 / 3,
-        num_accumulation_steps=8,
+        num_accumulation_steps=1,
         rdma_enabled=False,

From 240c01bf0dad7001d76d7f2d9ec47857c321719d Mon Sep 17 00:00:00 2001
From: ShawnXuan
Date: Thu, 19 Sep 2024 03:45:08 +0000
Subject: [PATCH 10/16] update

---
 projects/Qwen/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/projects/Qwen/README.md b/projects/Qwen/README.md
index aaad3ed31..ab57712f9 100644
--- a/projects/Qwen/README.md
+++ b/projects/Qwen/README.md
@@ -44,7 +44,7 @@ python3 -m oneflow.distributed.launch \
     train.dist.pipeline_parallel_size=${NUM_GPUS}
 ```
 
-- xpu TODO
+- xpu OOM
 
 ```bash
 export NUM_GPUS=1

From 1362ca1120de5e3ee32c03e6a31e175e561ac93d Mon Sep 17 00:00:00 2001
From: ShawnXuan
Date: Thu, 19 Sep 2024 19:15:12 +0800
Subject: [PATCH 11/16] update

---
 projects/Qwen/README.md | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/projects/Qwen/README.md b/projects/Qwen/README.md
index ab57712f9..65333c443 100644
--- a/projects/Qwen/README.md
+++ b/projects/Qwen/README.md
@@ -1,19 +1,19 @@
 
 ### Inference
 
-- cuda TODO
+- cuda PASS
 
 ```bash
 python projects/Qwen/pipeline.py --model_path=/root/models/Qwen1.5-7B-Chat --mode=huggingface
 ```
 
-- npu
+- npu PASS
 
 ```bash
 python projects/Qwen/pipeline.py --model_path=/data0/hf_models/qwen2/Qwen1.5-7B-Chat --mode=huggingface --device=npu
 ```
 
-- xpu
+- xpu PASS
 
 ```bash
 python projects/Qwen/pipeline.py --model_path=/root/models/Qwen1.5-7B-Chat --mode=huggingface --device=xpu
 ```
@@ -27,10 +27,10 @@ python projects/Qwen/pipeline.py --model_path=/root/models/Qwen1.5-7B-Chat --mod
 python projects/Qwen/utils/prepare_alpaca.py
 ```
 
-- cuda TODO
+- cuda PASS
 
 ```bash
-export NUM_GPUS=4
+export NUM_GPUS=8
 python3 -m oneflow.distributed.launch \
     --nproc_per_node ${NUM_GPUS} \
     --nnodes 1 \
     --node_rank 0 \
     --master_addr 127.0.0.1 \
     --master_port 12345 \
     tools/train_net.py --config-file=projects/Qwen/configs/qwen_sft.py \
     graph.enabled=True \
     train.input_placement_device="cuda" \
     train.dist.device_type="cuda" \
     train.dist.pipeline_parallel_size=${NUM_GPUS}
 ```
+A100-PCIE-40GB x 4 OOM
 
 - xpu OOM

From acb337a33acc81ddd34b197cf40ce1f968840eb3 Mon Sep 17 00:00:00 2001
From: ShawnXuan
Date: Thu, 19 Sep 2024 11:19:27 +0000
Subject: [PATCH 12/16] black format

---
 projects/Qwen/configs/qwen_sft.py     | 4 ++--
 projects/Qwen/pipeline.py             | 1 +
 projects/Qwen/utils/prepare_alpaca.py | 6 ++----
 3 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/projects/Qwen/configs/qwen_sft.py b/projects/Qwen/configs/qwen_sft.py
index 95e60d818..05f506d97 100644
--- a/projects/Qwen/configs/qwen_sft.py
+++ b/projects/Qwen/configs/qwen_sft.py
@@ -33,8 +33,8 @@
 tokenization = OmegaConf.create()
 tokenization.make_vocab_size_divisible_by = 1
 tokenization.tokenizer = LazyCall(Qwen2Tokenizer)(
-    vocab_file=pretrained_model_path+"/vocab.json",
-    merges_file=pretrained_model_path+"/merges.txt",
+    vocab_file=pretrained_model_path + "/vocab.json",
+    merges_file=pretrained_model_path + "/merges.txt",
 )

diff --git a/projects/Qwen/pipeline.py b/projects/Qwen/pipeline.py
index 413577384..75c702a95 100644
--- a/projects/Qwen/pipeline.py
+++ b/projects/Qwen/pipeline.py
@@ -74,6 +74,7 @@ def _parse_parameters(self, **pipeline_parameters):
     def preprocess(self, inputs, **kwargs) -> dict:
         # tokenizer encoderW
         import oneflow as flow
+
         inputs = flow.tensor(self.tokenizer.encode(inputs, add_bos=True, padding=True))
 
         inputs = {
diff --git a/projects/Qwen/utils/prepare_alpaca.py b/projects/Qwen/utils/prepare_alpaca.py
index b02f9f472..ce422c96b 100644
--- a/projects/Qwen/utils/prepare_alpaca.py
+++ b/projects/Qwen/utils/prepare_alpaca.py
@@ -111,12 +111,10 @@ def prepare_sample(example: dict, tokenizer, max_length: int) -> dict:
     """
     full_prompt = generate_prompt(example)
     full_prompt_and_response = full_prompt + example["output"]
-    
+
     prompt = tokenizer.encode(full_prompt, device="cpu")
     prompt = flow.tensor(prompt, dtype=flow.int, device="cpu")
-    example = tokenizer.encode(
-        full_prompt_and_response, device="cpu"
-    )
+    example = tokenizer.encode(full_prompt_and_response, device="cpu")
     example = flow.tensor(example, dtype=flow.int, device="cpu")
 
     padding = max_length - example.shape[0]

From 400dcab68073d921a2d3abbc40956bd185634413 Mon Sep 17 00:00:00 2001
From: ShawnXuan
Date: Fri, 20 Sep 2024 01:08:51 +0000
Subject: [PATCH 13/16] format and isort

---
 libai/models/utils/model_loader/base_loader.py  | 1 -
 projects/mock_transformers/mock_tokenization.py | 3 ---
 2 files changed, 4 deletions(-)

diff --git a/libai/models/utils/model_loader/base_loader.py b/libai/models/utils/model_loader/base_loader.py
index 5222e9f06..e12294cd3 100644
--- a/libai/models/utils/model_loader/base_loader.py
+++ b/libai/models/utils/model_loader/base_loader.py
@@ -22,7 +22,6 @@
 import oneflow as flow
 from safetensors import safe_open
 from termcolor import colored
-from safetensors import safe_open
 
 import libai.utils.distributed as dist
 from libai.config import LazyCall
diff --git a/projects/mock_transformers/mock_tokenization.py b/projects/mock_transformers/mock_tokenization.py
index 22f42e693..b28cebeb9 100644
--- a/projects/mock_transformers/mock_tokenization.py
+++ b/projects/mock_transformers/mock_tokenization.py
@@ -19,7 +19,6 @@
 import oneflow.mock_torch as mock
 
 from libai.utils import distributed as dist
-import oneflow.mock_torch as mock
 
 with mock.enable(lazy=True):
     from transformers import (  # noqa
@@ -33,7 +32,6 @@
     from transformers.utils import generic  # noqa
     from transformers.utils.generic import TensorType  # noqa
 
-
     # ---------------- mock TensorType ------------------
     class TensorType(ExplicitEnum):  # noqa
         PYTORCH = "pt"
@@ -145,5 +143,4 @@ def flow_convert_to_tensors(self, tensor_type=None, prepend_batch_axis=False):
                 self[k] = v.to_global(sbp=sbp, placement=dist.get_layer_placement(0))
         return self
 
-
 BatchEncoding.convert_to_tensors = flow_convert_to_tensors  # noqa

From 88fcc1052ee6b93c0158f379982ce5e2a656022e Mon Sep 17 00:00:00 2001
From: ShawnXuan
Date: Fri, 20 Sep 2024 01:11:59 +0000
Subject: [PATCH 14/16] fix imports

---
 projects/Qwen/tokenizer.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/projects/Qwen/tokenizer.py b/projects/Qwen/tokenizer.py
index acbb22f27..2fd570a6d 100644
--- a/projects/Qwen/tokenizer.py
+++ b/projects/Qwen/tokenizer.py
@@ -20,12 +20,10 @@
 import unicodedata
 from functools import lru_cache
 from io import open
-from typing import List, Optional
+from typing import Optional
 
-import oneflow as flow
 import regex as re
 
-import libai.utils.distributed as dist
 from libai.tokenizer.tokenization_base import PreTrainedTokenizer
 
 logger = logging.getLogger(__name__)

From f568e2f8387fc31ccecafcf25b890dd738002b61 Mon Sep 17 00:00:00 2001
From: ShawnXuan
Date: Fri, 20 Sep 2024 01:21:58 +0000
Subject: [PATCH 15/16] rm useless files

---
 projects/Qwen/test.py               |  37 --------
 projects/Qwen/utils/data_process.py | 128 ----------------------------
 2 files changed, 165 deletions(-)
 delete mode 100644 projects/Qwen/test.py
 delete mode 100644 projects/Qwen/utils/data_process.py

diff --git a/projects/Qwen/test.py b/projects/Qwen/test.py
deleted file mode 100644
index 8fb4e574a..000000000
--- a/projects/Qwen/test.py
+++ /dev/null
@@ -1,37 +0,0 @@
-# from transformers import Qwen2Tokenizer as T2
-# from projects.Qwen.tokenizer import Qwen2Tokenizer as T1
-
-
-# tokenizer1 = T1(
-#     vocab_file="/data/home/xiezipeng/hf_models/Qwen/Qwen1.5-7B/vocab.json",
-#     merges_file="/data/home/xiezipeng/hf_models/Qwen/Qwen1.5-7B/merges.txt"
-# )
-# tokenizer2 = T2.from_pretrained("/data/home/xiezipeng/hf_models/Qwen/Qwen1.5-7B")
-
-# text = [
-#     "清晨的阳光洒落在树叶上,露珠在叶片上闪烁着晶莹的光泽。微风拂过,树枝微微摇曳,像是在向大自然问好。泥土的芳香弥漫在空气中,一只小鸟欢快地啾啾鸣叫,这是一个美好的新的一天。",
-#     "书本总是向我们敞开怀抱,蕴藏着无穷无尽的智慧和知识。当我打开一本书时,仿佛走进了一个全新的世界。字里行间娓娓道来着作者的心血和思想,让我如痴如醉地沉浸其中,收获了许多启迪和感悟。",
-#     "夜幕低垂,城市璀璨的灯火像是一颗颗明亮的星星。街道上来来往往的行人、川流不息的车辆,构成了一幅生动活泼的都市夜景。霓虹灯的光影闪烁,将这座城市渲染得更加缤纷多彩。",
-#     "The morning dew glistened on the blades of grass, each droplet reflecting the warm rays of the rising sun. A gentle breeze carried the sweet scent of flowers, and birds serenaded the new day with their cheerful melodies. It was a picture-perfect start to what promised to be a beautiful day.",
-#     "As I turned the pages of the worn leather-bound book, I found myself transported to distant lands and bygone eras. 
The author's words painted vivid scenes that danced across my mind's eye, inviting me to explore the depths of human experience and emotion. Reading has always been an escape, a journey without ever leaving my chair.", -# ] - -# for i in text: -# print(i) -# res1 = tokenizer1.encode(text) -# # res2 = tokenizer2.tokenize(i) -# print(res1) -# # assert res1 == res2 - -from transformers import AutoModelForCausalLM, AutoTokenizer - -# model = AutoModelForCausalLM.from_pretrained("/data/home/xiezipeng/hf_models/Qwen/Qwen1.5-7B") -t = AutoTokenizer.from_pretrained("/data/home/xiezipeng/hf_models/Qwen/Qwen1.5-7B") -print(t.encode("<|endoftext|>")) -print(t.pad_token_id) - -# text = "给出3点关于保持身体健康的意见。" -# input_ids = t.encode(text, return_tensors='pt') -# res = model.generate(input_ids, max_new_tokens=30) -# res = t.decode(res[0]) -# print(res) diff --git a/projects/Qwen/utils/data_process.py b/projects/Qwen/utils/data_process.py deleted file mode 100644 index 1f3755156..000000000 --- a/projects/Qwen/utils/data_process.py +++ /dev/null @@ -1,128 +0,0 @@ -import json -import os -import random - -import oneflow as flow -from tqdm import tqdm - -IGNORE_TOKEN_ID = -100 - -data = { - "id": "i6IyJda_0", - "conversations": [ - { - "from": "human", - "value": "How to tell if a customer segment is well segmented? In 3 bullet points.", - }, - {"from": "gpt", "value": "1. Homogeneity \n2. Distinctiveness \n3. Stability"}, - {"from": "human", "value": "Thank you"}, - {"from": "gpt", "value": "you are welcome"}, - ], -} - - -def qwen2_data_process( - sources, - tokenizer, - system_message: str = "You are a helpful assistant.", -): - max_len = tokenizer.model_max_length - roles = {"user": "<|im_start|>user", "assistant": "<|im_start|>assistant"} - - im_start = tokenizer.encode("<|im_start|>")[0] - im_end = tokenizer.encode("<|im_end|>")[0] - nl_tokens = tokenizer("\n").input_ids - _system = tokenizer("system").input_ids + nl_tokens - _user = tokenizer("user").input_ids + nl_tokens - _assistant = tokenizer("assistant").input_ids + nl_tokens - - # Apply prompt templates - input_ids, targets = [], [] - for i, source in enumerate(sources): - if roles[source[0]["from"]] != roles["user"]: - source = source[1:] - - input_id, target = [], [] - system = [im_start] + _system + tokenizer(system_message).input_ids + [im_end] + nl_tokens - input_id += system - target += [im_start] + [IGNORE_TOKEN_ID] * (len(system) - 3) + [im_end] + nl_tokens - assert len(input_id) == len(target) - for j, sentence in enumerate(source): - role = roles[sentence["from"]] - _input_id = ( - tokenizer(role).input_ids - + nl_tokens - + tokenizer(sentence["value"]).input_ids - + [im_end] - + nl_tokens - ) - input_id += _input_id - if role == "<|im_start|>user": - _target = ( - [im_start] + [IGNORE_TOKEN_ID] * (len(_input_id) - 3) + [im_end] + nl_tokens - ) - elif role == "<|im_start|>assistant": - _target = ( - [im_start] - + [IGNORE_TOKEN_ID] * (len(tokenizer(role).input_ids) - 1) - + _input_id[len(tokenizer(role).input_ids) : -2] - + [im_end] - + nl_tokens - ) - else: - raise NotImplementedError - target += _target - assert len(input_id) == len(target) - input_id += [tokenizer.pad_token_id] * (max_len - len(input_id)) - target += [IGNORE_TOKEN_ID] * (max_len - len(target)) - input_ids.append(input_id[:max_len]) - targets.append(target[:max_len]) - input_ids = flow.tensor(input_ids, dtype=flow.int, device="cpu") - targets = flow.tensor(targets, dtype=flow.long, device="cpu") - attention_mask = input_ids.ne(tokenizer.pad_token_id) - attention_mask = 
flow.where(attention_mask, flow.tensor(0.0), flow.tensor(-float("Inf"))) - - return dict( - input_ids=input_ids[0], - labels=targets[0], - attention_mask=attention_mask[0], - ) - - -def preprocess(input_file, targe_file, shuffle=False, tokenizer=None): - file = open(input_file, "r") - data = json.load(file) - if shuffle: - random.shuffle(data) - train_set = [qwen2_data_process([sample["conversations"]], tokenizer) for sample in tqdm(data)] - flow.save(train_set, os.path.join(targe_file, "train_set")) - print("training dataset saved in {}\n".format(os.path.join(targe_file, "train_set"))) - - -if __name__ == "__main__": - - from projects.mock_transformers.mock_tokenization import Qwen2Tokenizer - - input_file = "/data/home/xiezipeng/libai/projects/Qwen/subset.json" - target_file = "/data/home/xiezipeng/libai/projects/Qwen" - model_file = "/data/home/xiezipeng/hf_models/Qwen/Qwen1.5-7B" - - tokenizer = Qwen2Tokenizer.from_pretrained(model_file) - tokenizer.model_max_length = 2048 - - preprocess(input_file=input_file, targe_file=target_file, tokenizer=tokenizer) - - # res = qwen2_data_process([data["conversations"]], tokenizer) - # input_ids = res["input_ids"] - # labels = res["labels"] - # attention_mask = res["attention_mask"] - - # print(input_ids[0]) - # print(labels) - # print(attention_mask) - - # labels = labels[0] - # labels[labels==IGNORE_TOKEN_ID] = 151643 - - # print("input text:\n",tokenizer.decode(input_ids[0].tolist())) - # print("labels text: \n",tokenizer.decode(labels.tolist())) From 974a3707e47aebb31a2847bcb6817169324f8ef4 Mon Sep 17 00:00:00 2001 From: ShawnXuan Date: Fri, 20 Sep 2024 01:24:51 +0000 Subject: [PATCH 16/16] format --- projects/Qwen/tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/Qwen/tokenizer.py b/projects/Qwen/tokenizer.py index 2fd570a6d..cc00fa800 100644 --- a/projects/Qwen/tokenizer.py +++ b/projects/Qwen/tokenizer.py @@ -44,7 +44,7 @@ MAX_MODEL_INPUT_SIZES = {"qwen/qwen-tokenizer": 32768} -PRETOKENIZE_REGEX = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+""" +PRETOKENIZE_REGEX = r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+""" # noqa: E501 @lru_cache()
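After PATCH 16 the series leaves three SFT building blocks in place: `utils/prepare_alpaca.py`, `qwen_dataset.py`, and `tokenizer.py`. A short sketch of how they fit together (the destination directory is `prepare_alpaca.py`'s default, the checkpoint paths are placeholders, and `prepare()` reads its tokenizer from `configs/qwen_config.py`, whose vocab/merges entries must be filled in first):

```python
# Sketch: the SFT data flow added in PATCH 08, as it stands after PATCH 16.
# Placeholder paths; ./data/libai_xpu_alpaca is prepare_alpaca.py's default destination.
from projects.Qwen.qwen_dataset import QwenDataset
from projects.Qwen.tokenizer import Qwen2Tokenizer
from projects.Qwen.utils.prepare_alpaca import prepare

prepare()  # downloads Alpaca, tokenizes it, and saves train/test splits via flow.save

tokenizer = Qwen2Tokenizer(
    vocab_file="/path/to/Qwen1.5-7B-Chat/vocab.json",   # placeholder
    merges_file="/path/to/Qwen1.5-7B-Chat/merges.txt",  # placeholder
)
dataset = QwenDataset("./data/libai_xpu_alpaca/train", tokenizer)
sample = dataset[0]  # libai Instance with DistTensorData fields input_ids and labels
```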