diff --git a/dbgpt_hub/configs/train_args.py b/dbgpt_hub/configs/train_args.py
index 1004cd7..f79b048 100644
--- a/dbgpt_hub/configs/train_args.py
+++ b/dbgpt_hub/configs/train_args.py
@@ -1,14 +1,16 @@
 from dataclasses import dataclass, field
 from typing import Optional
 from transformers import TrainingArguments
+from transformers import Seq2SeqTrainingArguments
 
 
 @dataclass
-class TrainingArguments(transformers.Seq2SeqTrainingArguments):
+class TrainingArguments(Seq2SeqTrainingArguments):
     cache_dir: Optional[str] = field(default=None)
     train_on_source: Optional[bool] = field(
         default=False,
-        metadata={"help": "Whether to train on the input in addition to the target text."}
+        metadata={
+            "help": "Whether to train on the input in addition to the target text."}
     )
     full_finetune: bool = field(
         default=False,
@@ -43,17 +45,32 @@ class TrainingArguments(transformers.Seq2SeqTrainingArguments):
             'Maximum sequence length. Sequences will be right padded (and possibly truncated).'
         },
     )
-    output_dir: str = field(default='./adapter', metadata={"help": 'The output dir for logs and checkpoints'})
-    per_device_train_batch_size: int = field(default=1, metadata={"help": 'The training batch size per GPU. Increase for better speed.'})
-    gradient_accumulation_steps: int = field(default=16, metadata={"help": 'How many gradients to accumulate before to perform an optimizer step'})
-    max_steps: int = field(default=10000, metadata={"help": 'How many optimizer update steps to take'})
-    weight_decay: float = field(default=0.0, metadata={"help": 'The L2 weight decay rate of AdamW'}) # use lora dropout instead for regularization if needed
-    learning_rate: float = field(default=0.0002, metadata={"help": 'The learnign rate'})
-    remove_unused_columns: bool = field(default=False, metadata={"help": 'Removed unused columns. Needed to make this codebase work.'})
-    lr_scheduler_type: str = field(default='constant', metadata={"help": 'Learning rate schedule. Constant a bit better than cosine, and has advantage for analysis'})
-    warmup_ratio: float = field(default=0.03, metadata={"help": 'Fraction of steps to do a warmup for'})
-    logging_steps: int = field(default=10, metadata={"help": 'The frequency of update steps after which to log the loss'})
-    group_by_length: bool = field(default=True, metadata={"help": 'Group sequences into batches with same length. Saves memory and speeds up training considerably.'})
-    save_strategy: str = field(default='steps', metadata={"help": 'When to save checkpoints'})
-    save_steps: int = field(default=250, metadata={"help": 'How often to save a model'})
-    save_total_limit: int = field(default=40, metadata={"help": 'How many checkpoints to save before the oldest is overwritten'})
+    output_dir: str = field(
+        default='./adapter', metadata={"help": 'The output dir for logs and checkpoints'})
+    per_device_train_batch_size: int = field(default=1, metadata={
+        "help": 'The training batch size per GPU. Increase for better speed.'})
+    gradient_accumulation_steps: int = field(default=16, metadata={
+        "help": 'How many gradients to accumulate before performing an optimizer step'})
+    max_steps: int = field(default=10000, metadata={
+        "help": 'How many optimizer update steps to take'})
+    # use lora dropout instead for regularization if needed
+    weight_decay: float = field(default=0.0, metadata={
+        "help": 'The L2 weight decay rate of AdamW'})
+    learning_rate: float = field(default=0.0002, metadata={
+        "help": 'The learning rate'})
+    remove_unused_columns: bool = field(default=False, metadata={
+        "help": 'Remove unused columns. Needed to make this codebase work.'})
+    lr_scheduler_type: str = field(default='constant', metadata={
+        "help": 'Learning rate schedule. Constant is a bit better than cosine and has an advantage for analysis'})
+    warmup_ratio: float = field(default=0.03, metadata={
+        "help": 'Fraction of steps to do a warmup for'})
+    logging_steps: int = field(default=10, metadata={
+        "help": 'The frequency of update steps after which to log the loss'})
+    group_by_length: bool = field(default=True, metadata={
+        "help": 'Group sequences into batches with the same length. Saves memory and speeds up training considerably.'})
+    save_strategy: str = field(default='steps', metadata={
+        "help": 'When to save checkpoints'})
+    save_steps: int = field(default=250, metadata={
+        "help": 'How often to save a model'})
+    save_total_limit: int = field(default=40, metadata={
+        "help": 'How many checkpoints to save before the oldest is overwritten'})
diff --git a/predict_lora.py b/predict_lora.py
index bb6b0ab..83e0a6a 100644
--- a/predict_lora.py
+++ b/predict_lora.py
@@ -5,25 +5,40 @@
 import transformers
 from transformers import AutoTokenizer
 from transformers import (AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer)
-from dbgpt_hub.configs import (GenerationArguments,ModelInferenceArguments)
+from dbgpt_hub.configs import (GenerationArguments, ModelInferenceArguments)
 from datasets import load_dataset, Dataset
 from dbgpt_hub.utils.model_utils import get_logits_processor
 from dbgpt_hub.utils.model_utils import smart_tokenizer_and_embedding_resize
+from peft import PeftModel
+import argparse
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--base_model_name_or_path", type=str,
+                        default="./model")
+    parser.add_argument("--peft_ckpt_path", type=str,
+                        default="Your lora ckpt path")
+    parser.add_argument("--input_data_json", type=str, default="dev_sql.json")
+    parser.add_argument("--output_name", type=str,
+                        default="./data/out_pred/pre_lora_8_lr_2e4_drop1e1.sql")
+    return parser.parse_args()
+
+
+local_parser = get_args()
+# print(f"loca {local_parser.base_model_name_or_path}")
+
 
 SQL_PROMPT_DICT = {
     "prompt_input": (
-        "I want you to act as a SQL terminal in front of an example database, \
-        you need only to return the sql command to me.Below is an instruction that describes a task, \
-        Write a response that appropriately completes the request. \
-        The instruction is {instruction}, So please tell me {input}, ###Response:"
+        "I want you to act as a SQL terminal in front of an example database, you need to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request. The instruction is {instruction}, So please tell me {input} Response:"
     ),
     "prompt_no_input": (
-        "I want you to act as a SQL terminal in front of an example database, \
-        you need only to return the sql command to me.Below is an instruction that describes a task, \
-        Write a response that appropriately completes the request. \
-        The instruction is {instruction}, ###Response:"
+        "I want you to act as a SQL terminal in front of an example database, you need to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request. The instruction is {instruction}, Response:"
     ),
 }
+
+
 def extract_sql_dataset(example):
     if example.get("input", "") != "":
         prompt_format = SQL_PROMPT_DICT["prompt_input"]
@@ -33,21 +48,40 @@ def extract_sql_dataset(example):
 
 def predict():
-    # parameters 
+    # parameters
     parser = transformers.HfArgumentParser(
         (ModelInferenceArguments, GenerationArguments))
     model_server_args, generation_args = parser.parse_args_into_dataclasses()
 
     device = 'cuda' if torch.cuda.is_available() else 'cpu'
 
-    model = AutoModelForCausalLM.from_pretrained(
-        model_server_args.model_name_or_path,
+    print(f"Loading base model: {model_server_args.model_name_or_path}")
+
+    base_model = AutoModelForCausalLM.from_pretrained(
+        local_parser.base_model_name_or_path,
         trust_remote_code=True,
         low_cpu_mem_usage=True,
         torch_dtype=torch.float16,
-        device_map={"":0})
+        device_map={"": 0})
+
+    print(f"Loading PEFT LoRA: {local_parser.peft_ckpt_path}")
+    model = PeftModel.from_pretrained(base_model, local_parser.peft_ckpt_path)
+
+    # args = get_args()
+
+    # print(f"Loading base model: {args.base_model_name_or_path}")
+    # base_model = AutoModelForCausalLM.from_pretrained(
+    #     args.base_model_name_or_path,
+    #     return_dict=True,
+    #     torch_dtype=torch.float16,
+    #     trust_remote_code=True
+    # )
+
+    # print(f"Loading PEFT: {args.peft_model_path}")
+    # model = PeftModel.from_pretrained(base_model, checkpoint_dir)
+    # model.to(args.device)
 
     tokenizer = AutoTokenizer.from_pretrained(
-        model_server_args.model_name_or_path,
+        local_parser.base_model_name_or_path,
         trust_remote_code=True,
         use_fast=False,
     )
@@ -64,27 +98,26 @@ def predict():
     # Note also that `model.config.pad_token_id` is 0 which corresponds to `` token.
     print('Adding special tokens.')
     tokenizer.add_special_tokens({
-        "eos_token": tokenizer.convert_ids_to_tokens(model.config.eos_token_id),
-        "bos_token": tokenizer.convert_ids_to_tokens(model.config.bos_token_id),
-        "unk_token": tokenizer.convert_ids_to_tokens(
-            model.config.pad_token_id if model.config.pad_token_id != -1 else tokenizer.pad_token_id
-        ),
+        "eos_token": tokenizer.convert_ids_to_tokens(model.config.eos_token_id),
+        "bos_token": tokenizer.convert_ids_to_tokens(model.config.bos_token_id),
+        "unk_token": tokenizer.convert_ids_to_tokens(
+            model.config.pad_token_id if model.config.pad_token_id != -
+            1 else tokenizer.pad_token_id
+        ),
     })
 
 
     model.config.use_cache = False
     # model.to(device)
-
-    # Load dataset. 
- dataset = load_dataset("json", data_files="dev_sql.json") + dataset = load_dataset("json", data_files=local_parser.input_data_json) dataset = dataset.map(extract_sql_dataset, remove_columns=['instruction']) - # dataset_labels = dataset["train"]["output"] + # dataset_labels = dataset["train"]["output"] dataset = dataset["train"]["input"] result = [] predict_batchsize = 1 idx = 0 - nums_examples =len(dataset) + nums_examples = len(dataset) while idx < nums_examples: if idx + predict_batchsize < nums_examples: inputs = dataset[idx: idx+predict_batchsize] @@ -92,52 +125,54 @@ def predict(): else: inputs = dataset[idx: nums_examples] idx = nums_examples - encoded_inputs = tokenizer.batch_encode_plus(inputs, - return_tensors="pt", - padding=True, truncation=True, + encoded_inputs = tokenizer.batch_encode_plus(inputs, + return_tensors="pt", + padding=True, truncation=True, max_length=512 - ) - encoded_inputs = {name: tensor.to(device) for name, tensor in encoded_inputs.items()} + ) + encoded_inputs = {name: tensor.to(device) + for name, tensor in encoded_inputs.items()} outputs = model.generate( - **encoded_inputs, - **generation_args.to_dict(), - logits_processor=get_logits_processor() - ) - # ## support different type LLM + **encoded_inputs, + **generation_args.to_dict(), + logits_processor=get_logits_processor() + ) + # ## support different type LLM # if re.search(r'(?i)falcon', model_path): # generate_kwargs = { - # "input_ids": encoded_inputs["input_ids"], + # "input_ids": encoded_inputs["input_ids"], # "attention_mask": encoded_inputs["attention_mask"] # } # outputs = model.generate(**generate_kwargs, max_length=512) # elif re.search(r'(?i)llama', model_path): # outputs = model.generate( - # **encoded_inputs, - # max_new_tokens=512, - # generation_config = training_args.generation_config, + # **encoded_inputs, + # max_new_tokens=512, + # generation_config = training_args.generation_config, # logits_processor=get_logits_processor() # ) # else: # print("right now,not support well") - ## support the compared format directly ,like origin inputs: \n orgin outputs labels \n predict; - for output in outputs: + # support the compared format directly ,like origin inputs: \n orgin outputs labels \n predict; + for output in outputs: prediction = tokenizer.decode(output, skip_special_tokens=True) response = re.split(r"Response:\s*", prediction)[-1] result.append(response) print(response) print(idx) - ## origin only predict format + # origin only predict format # for output in outputs: # prediction = tokenizer.decode(output, skip_special_tokens=True) # response = re.split(r"Response:\s*", prediction)[-1] # result.append(response.replace("\n", "")) return result + if __name__ == "__main__": result = predict() - with open('dev_pred.sql', 'w') as f: + with open(local_parser.output_name, 'w') as f: for p in result: f.write(p + "\n") diff --git a/predict_qlora.py b/predict_qlora.py index 94a5915..37db6f2 100644 --- a/predict_qlora.py +++ b/predict_qlora.py @@ -10,12 +10,30 @@ ) from dbgpt_hub.configs import (DataArguments, GenerationArguments, - LoraArguments, ModelArguments, QuantArguments, - TrainingArguments) -from dbgpt_hub.model import get_accelerate_model + LoraArguments, ModelArguments, QuantArguments, + TrainingArguments) +from dbgpt_hub.model import get_accelerate_model from datasets import load_dataset, Dataset import pandas as pd +from peft import PeftModel +import argparse + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--base_model_name_or_path", type=str, + 
default="/home/model_files/Llama-2-13b-chat-hf") + parser.add_argument("--peft_ckpt_path", type=str, + default="Your peft qlora ckpt path") + parser.add_argument("--input_data_json", type=str, default="dev_sql.json") + parser.add_argument("--output_name", type=str, + default="./data/out_pred/qlora_8_lr_2e4_drop1e1.sql") + + return parser.parse_args() + + +local_parser = get_args() ALPACA_PROMPT_DICT = { @@ -52,6 +70,7 @@ def extract_alpaca_dataset(example): prompt_format = ALPACA_PROMPT_DICT["prompt_no_input"] return {'input': prompt_format.format(**example)} + def extract_sql_dataset(example): if example.get("input", "") != "": prompt_format = SQL_PROMPT_DICT["prompt_input"] @@ -59,27 +78,9 @@ def extract_sql_dataset(example): prompt_format = SQL_PROMPT_DICT["prompt_no_input"] return {'input': prompt_format.format(**example)} -def local_dataset(dataset_name): - if dataset_name.endswith('.json'): - full_dataset = Dataset.from_json(path_or_paths=dataset_name) - elif dataset_name.endswith('.jsonl'): - full_dataset = Dataset.from_json(filename=dataset_name, format='jsonlines') - elif dataset_name.endswith('.csv'): - full_dataset = Dataset.from_pandas(pd.read_csv(dataset_name)) - elif dataset_name.endswith('.tsv'): - full_dataset = Dataset.from_pandas(pd.read_csv(dataset_name, delimiter='\t')) - else: - raise ValueError(f"Unsupported dataset format: {dataset_name}") - - split_dataset = full_dataset.train_test_split(test_size=0.1) - return split_dataset - - - - def predict(): - # parameters + # parameters parser = transformers.HfArgumentParser( (ModelArguments, DataArguments, TrainingArguments, LoraArguments, QuantArguments, GenerationArguments)) @@ -93,51 +94,21 @@ def predict(): **vars(training_args), **vars(lora_args), **vars(quant_args)) - - # device = torch.device("cuda:0") - model,tokenizer = get_accelerate_model(args) + device = 'cuda' if torch.cuda.is_available() else 'cpu' + model, tokenizer = get_accelerate_model(args, local_parser.peft_ckpt_path) model.config.use_cache = False - # model.to(device) - - # Load dataset. 
- def load_data(dataset_name): - if dataset_name == 'alpaca': - return load_dataset("tatsu-lab/alpaca") - elif dataset_name == 'alpaca-clean': - return load_dataset("yahma/alpaca-cleaned") - elif dataset_name == 'chip2': - return load_dataset("laion/OIG", data_files='unified_chip2.jsonl') - elif dataset_name == 'self-instruct': - return load_dataset("yizhongw/self_instruct", name='self_instruct') - elif dataset_name == 'hh-rlhf': - return load_dataset("Anthropic/hh-rlhf") - elif dataset_name == 'longform': - return load_dataset("akoksal/LongForm") - elif dataset_name == 'oasst1': - return load_dataset("timdettmers/openassistant-guanaco") - elif dataset_name == 'vicuna': - raise NotImplementedError("Vicuna data was not released.") - elif dataset_name == 'spider': - return load_dataset("json", data_files="sql_finetune_data.json") - else: - if os.path.exists(dataset_name): - try: - args.dataset_format = args.dataset_format if args.dataset_format else "input-output" - full_dataset = local_dataset(dataset_name) - return full_dataset - except: - raise ValueError(f"Error loading dataset from {dataset_name}") - else: - raise NotImplementedError(f"Dataset {dataset_name} not implemented yet.") def format_dataset(dataset, dataset_format): if ( - dataset_format == 'alpaca' or dataset_format == 'alpaca-clean' or - (dataset_format is None and args.dataset in ['alpaca', 'alpaca-clean']) + dataset_format == 'alpaca' or dataset_format == 'alpaca-clean' or + (dataset_format is None and args.dataset in [ + 'alpaca', 'alpaca-clean']) ): - dataset = dataset.map(extract_alpaca_dataset, remove_columns=['instruction']) + dataset = dataset.map(extract_alpaca_dataset, + remove_columns=['instruction']) elif dataset_format == 'spider': - dataset = dataset.map(extract_sql_dataset, remove_columns=['instruction']) + dataset = dataset.map(extract_sql_dataset, + remove_columns=['instruction']) elif dataset_format == 'chip2' or (dataset_format is None and args.dataset == 'chip2'): dataset = dataset.map(lambda x: { 'input': x['text'].split('\n: ')[0].replace(': ', ''), @@ -159,21 +130,25 @@ def format_dataset(dataset, dataset_format): elif dataset_format == 'input-output': pass dataset = dataset.remove_columns( - [col for col in dataset.column_names['train'] if col not in ['input', 'output']] + [col for col in dataset.column_names['train'] + if col not in ['input', 'output']] ) return dataset # Load dataset. 
-    dataset = load_data(args.dataset)
+    dataset = load_dataset("json", data_files=local_parser.input_data_json)
     dataset = format_dataset(dataset, args.dataset_format)
-    dataset_labels = dataset["train"]["output"] 
+    dataset_labels = dataset["train"]["output"]
     dataset = dataset["train"]["input"]
 
     result = []
-    predict_batchsize = 24
     idx = 0
-    nums_examples =len(dataset)
+    predict_batchsize = 2
+    nums_examples = len(dataset)
+    # if nums_examples > 6:
+    #     nums_examples = 6
+    print(f"just test {nums_examples} examples\n")
     while idx < nums_examples:
         if idx + predict_batchsize < nums_examples:
             inputs = dataset[idx: idx+predict_batchsize]
@@ -181,50 +156,46 @@ def format_dataset(dataset, dataset_format):
         else:
             inputs = dataset[idx: nums_examples]
             idx = nums_examples
-        encoded_inputs = tokenizer.batch_encode_plus(inputs, 
-                                                     return_tensors="pt", 
-                                                     padding=True, truncation=True, 
+        encoded_inputs = tokenizer.batch_encode_plus(inputs,
+                                                     return_tensors="pt",
+                                                     padding=True, truncation=True,
                                                      max_length=512
-                                                     ) 
-        # encoded_inputs = {name: tensor.to(device) for name, tensor in encoded_inputs.items()}
-        encoded_inputs = {name: tensor for name, tensor in encoded_inputs.items()}
-
-        ## support different type LLM
-        if re.search(r'(?i)falcon', model_path):
+                                                     )
+        encoded_inputs = {name: tensor.to(device)
+                          for name, tensor in encoded_inputs.items()}
+
+        # support different type LLM
+        if re.search(r'(?i)falcon', local_parser.base_model_name_or_path):
             generate_kwargs = {
-                "input_ids": encoded_inputs["input_ids"], 
-                "attention_mask": encoded_inputs["attention_mask"] 
+                "input_ids": encoded_inputs["input_ids"],
+                "attention_mask": encoded_inputs["attention_mask"]
             }
             outputs = model.generate(**generate_kwargs, max_length=512)
-        elif re.search(r'(?i)llama', model_path):
+        elif re.search(r'(?i)llama', local_parser.base_model_name_or_path):
            outputs = model.generate(**encoded_inputs, max_length=512)
         else:
             print("right now,not support well")
 
-        ## support the compared format directly ,like origin inputs: \n orgin outputs labels \n predict;
-        for i,output in enumerate(outputs):
+        # support the compared format directly, like: origin inputs \n origin output labels \n predict
+        for i, output in enumerate(outputs):
             input_idx = idx-predict_batchsize+i
             prediction = tokenizer.decode(output, skip_special_tokens=True)
             response = re.split(r"Response:\s*", prediction)[-1]
-            compose_i = "origin inputs:\t"+ dataset[input_idx].replace("\n", "") + "\n"+"orgin outputs labels:\t" + dataset_labels[input_idx].replace("\n", "") + "\n"+"predict outputs labels:\t"+ response.replace("\n", "")
+            # compose_i = "origin inputs:\t" + dataset[input_idx].replace("\n", "") + "\n"+"origin output labels:\t" + dataset_labels[input_idx].replace(
+            #     "\n", "") + "\n"+"predict outputs labels:\t" + response.replace("\n", "")
+            # test
+            compose_i = response.replace("\n", "")
+            print(f'compose_i \t {compose_i}')
             result.append(compose_i)
-        ## origin only predict format
-        # for output in outputs:
-        #     prediction = tokenizer.decode(output, skip_special_tokens=True)
-        #     response = re.split(r"Response:\s*", prediction)[-1]
-        #     result.append(response.replace("\n", ""))
         print(result)
         print(idx)
 
     return args.dataset, result
 
-
-
-
 if __name__ == "__main__":
     dataset_name, result = predict()
-    with open('data/'+ dataset_name +'/dev_pred.sql', 'w') as f:
+    with open(local_parser.output_name, 'w') as f:
         for p in result:
             f.write(p + "\n")
diff --git a/scripts/get_predict.sh b/scripts/get_predict_lora.sh
similarity index 59%
rename from scripts/get_predict.sh
rename to scripts/get_predict_lora.sh
index ba49f0a..fc06fb4 100644
--- a/scripts/get_predict.sh
+++ b/scripts/get_predict_lora.sh
@@ -1,9 +1,11 @@
 #!/bin/bash
 
+# prepare dev data
 python dbgpt_hub/utils/sql_data_process.py \
     --data_filepaths data/spider/dev.json \
     --output_file dev_sql.json \
-
-python predict.py \
-    --model_name_or_path merged_model \
+
+# get lora predict
+CUDA_VISIBLE_DEVICES=3 python ./predict_lora.py \
+
diff --git a/scripts/get_predict_qlora.sh b/scripts/get_predict_qlora.sh
index bb70369..2526659 100644
--- a/scripts/get_predict_qlora.sh
+++ b/scripts/get_predict_qlora.sh
@@ -5,5 +5,4 @@ python dbgpt_hub/utils/sql_data_process.py \
     --output_file dev_sql.json \
 
 CUDA_VISIBLE_DEVICES=2,3 python ./../predict_qlora.py \
-    --model_name_or_path merged_model \
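
Note (not part of the patch): a minimal usage sketch for the renamed LoRA prediction flow. It assumes the argparse defaults added in predict_lora.py are meant to be overridden from the command line; every path value below is a placeholder, not something taken from this diff.

    # prepare dev data and run LoRA prediction via the renamed script
    sh scripts/get_predict_lora.sh
    # or call the prediction script directly, overriding the new flags
    CUDA_VISIBLE_DEVICES=0 python predict_lora.py \
        --base_model_name_or_path /path/to/base_model \
        --peft_ckpt_path /path/to/lora_adapter \
        --input_data_json dev_sql.json \
        --output_name ./data/out_pred/dev_pred.sql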