diff --git a/dbgpt_hub/configs/__init__.py b/dbgpt_hub/configs/__init__.py
index 08898db..94add8b 100644
--- a/dbgpt_hub/configs/__init__.py
+++ b/dbgpt_hub/configs/__init__.py
@@ -1,5 +1,6 @@
 from .data_args import DataArguments
 from .gen_args import GenerationArguments
+from .infer_args import ModelInferenceArguments
 from .lora_args import LoraArguments
 from .model_args import ModelArguments
 from .quant_args import QuantArguments
@@ -7,5 +8,6 @@
 
 __all__ = [
     'DataArguments', 'GenerationArguments', 'ModelArguments',
-    'TrainingArguments', 'LoraArguments','QuantArguments'
-    ]
\ No newline at end of file
+    'TrainingArguments', 'ModelInferenceArguments', 'LoraArguments',
+    'QuantArguments'
+]
\ No newline at end of file
diff --git a/dbgpt_hub/configs/gen_args.py b/dbgpt_hub/configs/gen_args.py
index 7fe09d0..1d0f012 100644
--- a/dbgpt_hub/configs/gen_args.py
+++ b/dbgpt_hub/configs/gen_args.py
@@ -16,7 +16,7 @@ class GenerationArguments:
         default=None,
         metadata={"help": "Minimum number of new tokens to generate."}
     )
-    
+
     # Generation strategy
     do_sample: Optional[bool] = field(default=False)
     num_beams: Optional[int] = field(default=1)
diff --git a/dbgpt_hub/configs/infer_args.py b/dbgpt_hub/configs/infer_args.py
new file mode 100644
index 0000000..9f169ad
--- /dev/null
+++ b/dbgpt_hub/configs/infer_args.py
@@ -0,0 +1,28 @@
+from dataclasses import dataclass, field
+from typing import Optional
+import os
+
+model_path = os.path.join("./model", os.listdir("model")[1])
+
+@dataclass
+class ModelInferenceArguments:
+    cache_dir: Optional[str] = field(default=None)
+    model_name_or_path: Optional[str] = field(
+        default=model_path,
+        metadata={'help': 'Path to pre-trained model'})
+    model_max_length: int = field(
+        default=1024,
+        metadata={
+            'help':
+            'Maximum sequence length. Sequences will be right padded (and possibly truncated).'
+        },
+    )
+    prompt_template: str = field(
+        default='default',
+        metadata={
+            'help':
+            'Prompt template name. Such as vanilla, alpaca, llama2, vicuna..., etc.'
+        })
+    source_prefix: Optional[str] = field(
+        default=None,
+        metadata={'help': 'Prefix to prepend to every source text.'})
diff --git a/dbgpt_hub/configs/train_args.py b/dbgpt_hub/configs/train_args.py
index d8fe0e0..aa517fc 100644
--- a/dbgpt_hub/configs/train_args.py
+++ b/dbgpt_hub/configs/train_args.py
@@ -4,7 +4,7 @@
 
 
 @dataclass
-class TrainingArguments(TrainingArguments):
+class TrainingArguments(Seq2SeqTrainingArguments):
     cache_dir: Optional[str] = field(default=None)
     train_on_source: Optional[bool] = field(
         default=False,
diff --git a/dbgpt_hub/data/data_module.py b/dbgpt_hub/data/data_module.py
index a29dc8e..39e71a7 100644
--- a/dbgpt_hub/data/data_module.py
+++ b/dbgpt_hub/data/data_module.py
@@ -80,14 +80,14 @@ def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
 
 SQL_PROMPT_DICT = {
     "prompt_input": (
-        "I want you to act as a SQL terminal in front of an example database. "
-        "Below is an instruction that describes a task, Write a response that appropriately completes the request.\n\n"
-        "###Instruction:\n{instruction}\n\n###Input:\n{input}\n\n###Response: "
+        "I want you to act as a SQL terminal in front of an example database, "
+        "you need to return the sql command to me. Below is an instruction that describes a task, "
+        "Write a response that appropriately completes the request. The instruction is {instruction}, So please tell me {input} Response:"
     ),
     "prompt_no_input": (
-        "I want you to act as a SQL terminal in front of an example database. "
" - "Below is an instruction that describes a task, Write a response that appropriately completes the request.\n\n" - "###Instruction:\n{instruction}\n\n### Response: " + "I want you to act as a SQL terminal in front of an example database, + you need to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request. + The instruction is {instruction}, Response:" ), } @@ -226,4 +226,4 @@ def format_dataset(dataset, dataset_format): eval_dataset=eval_dataset if args.do_eval else None, predict_dataset=eval_dataset if args.do_predict else None, data_collator=data_collator - ) \ No newline at end of file + ) diff --git a/predict_lora.py b/predict_lora.py index 589cfc3..a3cb482 100644 --- a/predict_lora.py +++ b/predict_lora.py @@ -1,64 +1,27 @@ import re import os import torch +import pandas as pd import transformers from transformers import AutoTokenizer -import logging -from typing import Dict - -# LoraArguments need to import from train_lora,otherwise report bug -from train_qlora import \ -( - DataArguments, ModelArguments,TrainingArguments, GenerationArguments, get_accelerate_model -) -from train_lora import load_model_tokenizer,LoraArguments -## todo merge different LoraArguments definition, and adapte default value for two qlora and lora case - - +from transformers import (AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer) +from dbgpt_hub.configs import (GenerationArguments,ModelInferenceArguments) from datasets import load_dataset, Dataset - -# LLM path -model_path = os.path.join("./model", os.listdir("model")[1]) -# qlora adapter outputer path -checkpoint_dir = [os.path.join("./adapter", folder) for folder in os.listdir("./adapter") if "checkpoint" in folder][0] -dataset_format = "spider" -dataset = "spider" - - -ALPACA_PROMPT_DICT = { - "prompt_input": ( - "Below is an instruction that describes a task, paired with an input that provides further context. " - "Write a response that appropriately completes the request.\n\n" - "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response: " - ), - "prompt_no_input": ( - "Below is an instruction that describes a task. " - "Write a response that appropriately completes the request.\n\n" - "### Instruction:\n{instruction}\n\n### Response: " - ), -} +from dbgpt_hub.utils.model_utils import get_logits_processor +from dbgpt_hub.utils.model_utils import smart_tokenizer_and_embedding_resize SQL_PROMPT_DICT = { "prompt_input": ( - "I want you to act as a SQL terminal in front of an example database. " - "Below is an instruction that describes a task, Write a response that appropriately completes the request.\n\n" - "###Instruction:\n{instruction}\n\n###Input:\n{input}\n\n###Response: " + "I want you to act as a SQL terminal in front of an example database, + you need to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request. + The instruction is {instruction}, So please tell me {input} Response:" ), "prompt_no_input": ( - "I want you to act as a SQL terminal in front of an example database. " - "Below is an instruction that describes a task, Write a response that appropriately completes the request.\n\n" - "###Instruction:\n{instruction}\n\n### Response: " + "I want you to act as a SQL terminal in front of an example database, + you need to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request. 
+        "Write a response that appropriately completes the request. The instruction is {instruction}, Response:"
    ),
 }
-
-
-def extract_alpaca_dataset(example):
-    if example.get("input", "") != "":
-        prompt_format = ALPACA_PROMPT_DICT["prompt_input"]
-    else:
-        prompt_format = ALPACA_PROMPT_DICT["prompt_no_input"]
-    return {'input': prompt_format.format(**example)}
-
 def extract_sql_dataset(example):
     if example.get("input", "") != "":
         prompt_format = SQL_PROMPT_DICT["prompt_input"]
@@ -66,173 +29,60 @@ def format_dataset(dataset, dataset_format):
         prompt_format = SQL_PROMPT_DICT["prompt_no_input"]
     return {'input': prompt_format.format(**example)}
 
-def local_dataset(dataset_name):
-    if dataset_name.endswith('.json'):
-        full_dataset = Dataset.from_json(path_or_paths=dataset_name)
-    elif dataset_name.endswith('.jsonl'):
-        full_dataset = Dataset.from_json(filename=dataset_name, format='jsonlines')
-    elif dataset_name.endswith('.csv'):
-        full_dataset = Dataset.from_pandas(pd.read_csv(dataset_name))
-    elif dataset_name.endswith('.tsv'):
-        full_dataset = Dataset.from_pandas(pd.read_csv(dataset_name, delimiter='\t'))
-    else:
-        raise ValueError(f"Unsupported dataset format: {dataset_name}")
-
-    split_dataset = full_dataset.train_test_split(test_size=0.1)
-    return split_dataset
-
-
-
-
-
-
-
-def smart_tokenizer_and_embedding_resize(
-    special_tokens_dict: Dict,
-    tokenizer: transformers.PreTrainedTokenizer,
-    model: transformers.PreTrainedModel,
-):
-    """Resize tokenizer and embedding.
-
-    Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
-    """
-    num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
-    model.resize_token_embeddings(len(tokenizer))
-
-    if num_new_tokens > 0:
-        input_embeddings_data = model.get_input_embeddings().weight.data
-        output_embeddings_data = model.get_output_embeddings().weight.data
-
-        input_embeddings_avg = input_embeddings_data[:-num_new_tokens].mean(dim=0, keepdim=True)
-        output_embeddings_avg = output_embeddings_data[:-num_new_tokens].mean(dim=0, keepdim=True)
-
-        input_embeddings_data[-num_new_tokens:] = input_embeddings_avg
-        output_embeddings_data[-num_new_tokens:] = output_embeddings_avg
-
 def predict():
-    # Parse arguments
-    hfparser = transformers.HfArgumentParser((
-        ModelArguments, DataArguments, TrainingArguments, GenerationArguments,LoraArguments
-    ))
-    model_args, data_args, training_args, generation_args, lora_args= \
-        hfparser.parse_args_into_dataclasses()
-    training_args.generation_config = transformers.GenerationConfig(**vars(generation_args))
-    import argparse
-    args = argparse.Namespace(
-        **vars(model_args), **vars(data_args), **vars(training_args), **vars(generation_args),**vars(lora_args)
+    # parameters
+    parser = transformers.HfArgumentParser(
+        (ModelInferenceArguments, GenerationArguments))
+    model_server_args, generation_args = parser.parse_args_into_dataclasses()
+
+    device = 'cuda' if torch.cuda.is_available() else 'cpu'
+    model = AutoModelForCausalLM.from_pretrained(
+        model_server_args.model_name_or_path,
+        trust_remote_code=True,
+        low_cpu_mem_usage=True,
+        torch_dtype=torch.float16,
+        device_map={"":0})
+
+    tokenizer = AutoTokenizer.from_pretrained(
+        model_server_args.model_name_or_path,
+        trust_remote_code=True,
+        use_fast=False,
    )
-    # device = torch.device("cuda:0")
-    # model,tokenizer = get_accelerate_model(args, checkpoint_dir)  # qlora的
-    # model.config.use_cache = False
+    if tokenizer._pad_token is None:
+        smart_tokenizer_and_embedding_resize(
+            special_tokens_dict=dict(pad_token='[PAD]'),
+            tokenizer=tokenizer,
+            model=model,
+        )
+    if 'llama' in model_server_args.model_name_or_path or isinstance(tokenizer, LlamaTokenizer):
+        # LLaMA tokenizer may not have correct special tokens set.
+        # Check and add them if missing to prevent them from being parsed into different tokens.
+        # Note that these are present in the vocabulary.
+        # Note also that `model.config.pad_token_id` is 0 which corresponds to `<unk>` token.
+        print('Adding special tokens.')
+        tokenizer.add_special_tokens({
+            "eos_token": tokenizer.convert_ids_to_tokens(model.config.eos_token_id),
+            "bos_token": tokenizer.convert_ids_to_tokens(model.config.bos_token_id),
+            "unk_token": tokenizer.convert_ids_to_tokens(
+                model.config.pad_token_id if model.config.pad_token_id != -1 else tokenizer.pad_token_id
+            ),
+        })
+
     model.config.use_cache = False
     # model.to(device)
-    # load model and tokenizer
-    model, tokenizer = load_model_tokenizer(args=args)
-    # tokenizer.add_special_tokens({'pad_token': '[PAD]'})
-    # tokenizer.pad_token = tokenizer.eos_token
-
-    if 'llama' in args.model_name_or_path and tokenizer.pad_token is None:
-        tokenizer.add_special_tokens({"pad_token": "[PAD]"})
-    # # ## todo带测试
-    # DEFAULT_PAD_TOKEN = "[PAD]"
-    # if tokenizer._pad_token is None:
-    #     smart_tokenizer_and_embedding_resize(
-    #         special_tokens_dict=dict(pad_token=DEFAULT_PAD_TOKEN),
-    #         tokenizer=tokenizer,
-    #         model=model,
-    #     )
-
-    logging.warning('Successfully loaded model and tokenizer.')
-    model.config.use_cache = False
-    # Load dataset.
-    def load_data(dataset_name):
-        if dataset_name == 'alpaca':
-            return load_dataset("tatsu-lab/alpaca")
-        elif dataset_name == 'alpaca-clean':
-            return load_dataset("yahma/alpaca-cleaned")
-        elif dataset_name == 'chip2':
-            return load_dataset("laion/OIG", data_files='unified_chip2.jsonl')
-        elif dataset_name == 'self-instruct':
-            return load_dataset("yizhongw/self_instruct", name='self_instruct')
-        elif dataset_name == 'hh-rlhf':
-            return load_dataset("Anthropic/hh-rlhf")
-        elif dataset_name == 'longform':
-            return load_dataset("akoksal/LongForm")
-        elif dataset_name == 'oasst1':
-            return load_dataset("timdettmers/openassistant-guanaco")
-        elif dataset_name == 'vicuna':
-            raise NotImplementedError("Vicuna data was not released.")
-        elif dataset_name == 'spider':
-            # return load_dataset("json", data_files="sql_fintune_data.json")
-            return load_dataset("json", data_files="sql_finetune_data.json")
-        else:
-            if os.path.exists(dataset_name):
-                try:
-                    args.dataset_format = args.dataset_format if args.dataset_format else "input-output"
-                    full_dataset = local_dataset(dataset_name)
-                    return full_dataset
-                except:
-                    raise ValueError(f"Error loading dataset from {dataset_name}")
-            else:
-                raise NotImplementedError(f"Dataset {dataset_name} not implemented yet.")
-
-
-    def format_dataset(dataset, dataset_format):
-        if (
-            dataset_format == 'alpaca' or dataset_format == 'alpaca-clean' or
-            (dataset_format is None and args.dataset in ['alpaca', 'alpaca-clean'])
-        ):
-            dataset = dataset.map(extract_alpaca_dataset, remove_columns=['instruction'])
-        elif dataset_format == 'spider':
-            dataset = dataset.map(extract_sql_dataset, remove_columns=['instruction'])
-        elif dataset_format == 'chip2' or (dataset_format is None and args.dataset == 'chip2'):
-            dataset = dataset.map(lambda x: {
-                'input': x['text'].split('\n<bot>: ')[0].replace('<human>: ', ''),
-                'output': x['text'].split('\n<bot>: ')[1],
-            })
-        elif dataset_format == 'self-instruct' or (dataset_format is None and args.dataset == 'self-instruct'):
-            for old, new in [["prompt", "input"], ["completion", "output"]]:
-                dataset = dataset.rename_column(old, new)
-        elif dataset_format == 'hh-rlhf' or (dataset_format is None and args.dataset == 'hh-rlhf'):
-            dataset = dataset.map(lambda x: {
-                'input': '',
-                'output': x['chosen']
-            })
-        elif dataset_format == 'oasst1' or (dataset_format is None and args.dataset == 'oasst1'):
-            dataset = dataset.map(lambda x: {
-                'input': '',
-                'output': x['text'],
-            })
-        elif dataset_format == 'input-output':
-            pass
-        dataset = dataset.remove_columns(
-            [col for col in dataset.column_names['train'] if col not in ['input', 'output']]
-        )
-        return dataset
-
-
-    # Load dataset.
-    dataset = load_data(args.dataset)
-    dataset = format_dataset(dataset, args.dataset_format)
-    dataset_labels = dataset["train"]["output"]  #self
-
+    dataset = load_dataset("json", data_files="dev_sql.json")
+    dataset = dataset.map(extract_sql_dataset, remove_columns=['instruction'])
+    # dataset_labels = dataset["train"]["output"]
     dataset = dataset["train"]["input"]
-    ## test
+
     result = []
-    predict_batchsize = 2
+    predict_batchsize = 1
     idx = 0
-    print("just test show ,limit 100 examples\n")
     nums_examples =len(dataset)
-    if nums_examples >10:
-        nums_examples =10
-    # print("an example as follows ")
-
-    dataset_labels = dataset_labels[:nums_examples]  #self zw
-
     while idx < nums_examples:
         if idx + predict_batchsize < nums_examples:
             inputs = dataset[idx: idx+predict_batchsize]
@@ -246,42 +96,50 @@ def format_dataset(dataset, dataset_format):
             max_length=512
         )
         encoded_inputs = {name: tensor.to(device) for name, tensor in encoded_inputs.items()}
-
-        ## add branch for different type model ,
-        if re.search(r'(?i)falcon', model_path) in model_path:
-            generate_kwargs = {
-                "input_ids": encoded_inputs["input_ids"],
-                "attention_mask": encoded_inputs["attention_mask"]
-            }
-            outputs = model.generate(**generate_kwargs, max_length=512)
-        elif re.search(r'(?i)llama', model_path):
-            outputs = model.generate(**encoded_inputs, max_length=512)
-        else:
-            print("right now,not support well")
+        outputs = model.generate(
+            **encoded_inputs,
+            **generation_args.to_dict(),
+            logits_processor=get_logits_processor()
+        )
+        # ## support different type LLM
+        # if re.search(r'(?i)falcon', model_path):
+        #     generate_kwargs = {
+        #         "input_ids": encoded_inputs["input_ids"],
+        #         "attention_mask": encoded_inputs["attention_mask"]
+        #     }
+        #     outputs = model.generate(**generate_kwargs, max_length=512)
+        # elif re.search(r'(?i)llama', model_path):
+        #     outputs = model.generate(
+        #         **encoded_inputs,
+        #         max_new_tokens=512,
+        #         generation_config = training_args.generation_config,
+        #         logits_processor=get_logits_processor()
+        #     )
+        # else:
+        #     print("right now,not support well")
 
         ## support the compared format directly ,like origin inputs: \n orgin outputs labels \n predict;
-        for i,output in enumerate(outputs):
-            input_idx = idx-predict_batchsize+i
+        for output in outputs:
             prediction = tokenizer.decode(output, skip_special_tokens=True)
             response = re.split(r"Response:\s*", prediction)[-1]
-            compose_i = "origin inputs:\t"+ dataset[input_idx].replace("\n", "") + "\n"+"orgin outputs labels:\t" + dataset_labels[input_idx].replace("\n", "") + "\n"+"predict outputs labels:\t"+ response.replace("\n", "")
-            result.append(compose_i)
+            result.append(response)
+            print(response)
+            print(idx)
 
         ## origin only predict format
         # for output in outputs:
        #     prediction = tokenizer.decode(output, skip_special_tokens=True)
        #     response = re.split(r"Response:\s*", prediction)[-1]
        #     result.append(response.replace("\n", ""))
-        print(result)
-        print(idx)
 
-    return args.dataset, result
+    return result
+
 
 if __name__ == "__main__":
-    dataset_name, result = predict()
+    result = predict()
 
-    with open('data/'+ dataset_name +'/Llama2_dev_lora_pred_test6.sql', 'w') as f:
+    with open('dev_pred.sql', 'w') as f:
         for p in result:
             f.write(p + "\n")
diff --git a/predict_qlora.py b/predict_qlora.py
index 9e3daff..94a5915 100644
--- a/predict_qlora.py
+++ b/predict_qlora.py
@@ -18,14 +18,6 @@
 
 import pandas as pd
 
-# LLM path
-model_path = os.path.join("./model", os.listdir("model")[1])
-# qlora adapter outputer path
-checkpoint_dir = [os.path.join("./adapter", folder) for folder in os.listdir("./adapter") if "checkpoint" in folder][0]
-dataset_format = "spider"
-dataset = "spider"
-
-
 ALPACA_PROMPT_DICT = {
     "prompt_input": (
         "Below is an instruction that describes a task, paired with an input that provides further context. "
         "Write a response that appropriately completes the request.\n\n"
         "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response: "
     ),
     "prompt_no_input": (
         "Below is an instruction that describes a task. "
         "Write a response that appropriately completes the request.\n\n"
         "### Instruction:\n{instruction}\n\n### Response: "
     ),
 }
@@ -103,7 +95,7 @@ def predict():
 
     # device = torch.device("cuda:0")
-    model,tokenizer = get_accelerate_model(args, checkpoint_dir)
+    model,tokenizer = get_accelerate_model(args)
     model.config.use_cache = False
     # model.to(device)
 
diff --git a/scripts/get_predict.sh b/scripts/get_predict.sh
index 067a21d..ba49f0a 100644
--- a/scripts/get_predict.sh
+++ b/scripts/get_predict.sh
@@ -1,8 +1,9 @@
 #!/bin/bash
 
+python dbgpt_hub/utils/sql_data_process.py \
+    --data_filepaths data/spider/dev.json \
+    --output_file dev_sql.json \
+
+python predict.py \
+    --model_name_or_path merged_model \
-python src/predict.py \
-    --model_name_or_path model/vicuna-7b-delta-v1.3 \
-    --checkpoint_dir adapter/checkpoint-7000 \
-    --dataset_format spider \
-    --dataset spider \
 
diff --git a/scripts/get_predict_qlora.sh b/scripts/get_predict_qlora.sh
index 84fcc81..bb70369 100644
--- a/scripts/get_predict_qlora.sh
+++ b/scripts/get_predict_qlora.sh
@@ -1,8 +1,9 @@
 #!/bin/bash
 
+python dbgpt_hub/utils/sql_data_process.py \
+    --data_filepaths data/spider/dev.json \
+    --output_file dev_sql.json \
 
 CUDA_VISIBLE_DEVICES=2,3 python ./../predict_qlora.py \
-    --model_name_or_path model/vicuna-7b-delta-v1.3 \
-    --checkpoint_dir adapter/checkpoint-7000 \
-    --dataset_format spider \
-    --dataset spider \
+    --model_name_or_path merged_model \
+
 
diff --git a/scripts/lora/lora.sh b/scripts/lora/lora.sh
index 8fd6ac9..5606439 100644
--- a/scripts/lora/lora.sh
+++ b/scripts/lora/lora.sh
@@ -1,9 +1,5 @@
 python dbgpt_hub/utils/sql_data_process.py 
 
-python dbgpt_hub/utils/sql_data_process.py \
-    --data_filepaths data/spider/dev.json \
-    --output_file dev_sql.json \
-
 python train_lora.py \
     --dataset_name spider \
     --output_dir adapter \
diff --git a/scripts/qlora/qlora.sh b/scripts/qlora/qlora.sh
index 33a7c02..dac987b 100644
--- a/scripts/qlora/qlora.sh
+++ b/scripts/qlora/qlora.sh
@@ -2,10 +2,6 @@
 
 python dbgpt_hub/utils/sql_data_process.py 
 
-python dbgpt_hub/utils/sql_data_process.py \
-    --data_filepaths data/spider/dev.json \
-    --output_file dev_sql.json \
-
 python train_qlora.py \
     --model_name_or_path meta-llama/Llama-2-7b-hf \
     --output_dir adapter \
diff --git a/train_lora.py b/train_lora.py
index 5c2ec99..13043a8 100644
--- a/train_lora.py
+++ b/train_lora.py
@@ -11,7 +11,7 @@
 from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
 from transformers import (AutoModelForCausalLM, AutoTokenizer,
                           BitsAndBytesConfig, HfArgumentParser,
-                          PreTrainedModel, PreTrainedTokenizer, Trainer,
+                          PreTrainedModel, PreTrainedTokenizer, Seq2SeqTrainer,
                           deepspeed)
 
 from dbgpt_hub.configs import DataArguments, ModelArguments, TrainingArguments
@@ -239,10 +239,10 @@ def train() -> None:
 
     # Create a Trainer object and start training
     logging.warning('Creating a Trainer...')
-    trainer = Trainer(model=model,
-                      tokenizer=tokenizer,
-                      args=training_args,
-                      **data_module)
+    trainer = Seq2SeqTrainer(model=model,
+                             tokenizer=tokenizer,
+                             args=training_args,
+                             **data_module)
 
     logging.warning('Starting training...')
     if training_args.resume_from_checkpoint and list(