Commit

Merge pull request #35 from wangzaistone/main
Solve some bugs, decouple ckpt and lora weights
wangzaistone authored Aug 4, 2023
2 parents 1840e79 + 1e0eb07 commit e6ecf5b
Showing 5 changed files with 176 additions and 152 deletions.
49 changes: 33 additions & 16 deletions dbgpt_hub/configs/train_args.py
@@ -1,14 +1,16 @@
from dataclasses import dataclass, field
from typing import Optional
from transformers import TrainingArguments
from transformers import Seq2SeqTrainingArguments


@dataclass
class TrainingArguments(transformers.Seq2SeqTrainingArguments):
class TrainingArguments(Seq2SeqTrainingArguments):
cache_dir: Optional[str] = field(default=None)
train_on_source: Optional[bool] = field(
default=False,
metadata={"help": "Whether to train on the input in addition to the target text."}
metadata={
"help": "Whether to train on the input in addition to the target text."}
)
full_finetune: bool = field(
default=False,
@@ -43,17 +45,32 @@ class TrainingArguments(transformers.Seq2SeqTrainingArguments):
'Maximum sequence length. Sequences will be right padded (and possibly truncated).'
},
)
output_dir: str = field(default='./adapter', metadata={"help": 'The output dir for logs and checkpoints'})
per_device_train_batch_size: int = field(default=1, metadata={"help": 'The training batch size per GPU. Increase for better speed.'})
gradient_accumulation_steps: int = field(default=16, metadata={"help": 'How many gradients to accumulate before to perform an optimizer step'})
max_steps: int = field(default=10000, metadata={"help": 'How many optimizer update steps to take'})
weight_decay: float = field(default=0.0, metadata={"help": 'The L2 weight decay rate of AdamW'}) # use lora dropout instead for regularization if needed
learning_rate: float = field(default=0.0002, metadata={"help": 'The learnign rate'})
remove_unused_columns: bool = field(default=False, metadata={"help": 'Removed unused columns. Needed to make this codebase work.'})
lr_scheduler_type: str = field(default='constant', metadata={"help": 'Learning rate schedule. Constant a bit better than cosine, and has advantage for analysis'})
warmup_ratio: float = field(default=0.03, metadata={"help": 'Fraction of steps to do a warmup for'})
logging_steps: int = field(default=10, metadata={"help": 'The frequency of update steps after which to log the loss'})
group_by_length: bool = field(default=True, metadata={"help": 'Group sequences into batches with same length. Saves memory and speeds up training considerably.'})
save_strategy: str = field(default='steps', metadata={"help": 'When to save checkpoints'})
save_steps: int = field(default=250, metadata={"help": 'How often to save a model'})
save_total_limit: int = field(default=40, metadata={"help": 'How many checkpoints to save before the oldest is overwritten'})
output_dir: str = field(
default='./adapter', metadata={"help": 'The output dir for logs and checkpoints'})
per_device_train_batch_size: int = field(default=1, metadata={
"help": 'The training batch size per GPU. Increase for better speed.'})
gradient_accumulation_steps: int = field(default=16, metadata={
"help": 'How many gradients to accumulate before to perform an optimizer step'})
max_steps: int = field(default=10000, metadata={
"help": 'How many optimizer update steps to take'})
# use lora dropout instead for regularization if needed
weight_decay: float = field(default=0.0, metadata={
"help": 'The L2 weight decay rate of AdamW'})
learning_rate: float = field(default=0.0002, metadata={
"help": 'The learnign rate'})
remove_unused_columns: bool = field(default=False, metadata={
"help": 'Removed unused columns. Needed to make this codebase work.'})
lr_scheduler_type: str = field(default='constant', metadata={
"help": 'Learning rate schedule. Constant a bit better than cosine, and has advantage for analysis'})
warmup_ratio: float = field(default=0.03, metadata={
"help": 'Fraction of steps to do a warmup for'})
logging_steps: int = field(default=10, metadata={
"help": 'The frequency of update steps after which to log the loss'})
group_by_length: bool = field(default=True, metadata={
"help": 'Group sequences into batches with same length. Saves memory and speeds up training considerably.'})
save_strategy: str = field(default='steps', metadata={
"help": 'When to save checkpoints'})
save_steps: int = field(default=250, metadata={
"help": 'How often to save a model'})
save_total_limit: int = field(default=40, metadata={
"help": 'How many checkpoints to save before the oldest is overwritten'})
119 changes: 77 additions & 42 deletions predict_lora.py
@@ -5,25 +5,40 @@
import transformers
from transformers import AutoTokenizer
from transformers import (AutoModelForCausalLM, AutoTokenizer, LlamaTokenizer)
from dbgpt_hub.configs import (GenerationArguments,ModelInferenceArguments)
from dbgpt_hub.configs import (GenerationArguments, ModelInferenceArguments)
from datasets import load_dataset, Dataset
from dbgpt_hub.utils.model_utils import get_logits_processor
from dbgpt_hub.utils.model_utils import smart_tokenizer_and_embedding_resize
from peft import PeftModel
import argparse


def get_args():
parser = argparse.ArgumentParser()
parser.add_argument("--base_model_name_or_path", type=str,
default="./model")
parser.add_argument("--peft_ckpt_path", type=str,
default="Your lora ckpt path")
parser.add_argument("--input_data_json", type=str, default="dev_sql.json")
parser.add_argument("--output_name", type=str,
default="./data/out_pred/pre_lora_8_lr_2e4_drop1e1.sql")
return parser.parse_args()


local_parser = get_args()
# print(f"loca {local_parser.base_model_name_or_path}")


SQL_PROMPT_DICT = {
"prompt_input": (
"I want you to act as a SQL terminal in front of an example database, \
you need only to return the sql command to me.Below is an instruction that describes a task, \
Write a response that appropriately completes the request. \
The instruction is {instruction}, So please tell me {input}, ###Response:"
"I want you to act as a SQL terminal in front of an example database, you need to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request. The instruction is {instruction}, So please tell me {input} Response:"
),
"prompt_no_input": (
"I want you to act as a SQL terminal in front of an example database, \
you need only to return the sql command to me.Below is an instruction that describes a task, \
Write a response that appropriately completes the request. \
The instruction is {instruction}, ###Response:"
"I want you to act as a SQL terminal in front of an example database, you need to return the sql command to me.Below is an instruction that describes a task, Write a response that appropriately completes the request. The instruction is {instruction}, Response:"
),
}


def extract_sql_dataset(example):
if example.get("input", "") != "":
prompt_format = SQL_PROMPT_DICT["prompt_input"]
@@ -33,21 +48,40 @@ def extract_sql_dataset(example):


def predict():
# parameters
# parameters
parser = transformers.HfArgumentParser(
(ModelInferenceArguments, GenerationArguments))
model_server_args, generation_args = parser.parse_args_into_dataclasses()

device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = AutoModelForCausalLM.from_pretrained(
model_server_args.model_name_or_path,
print(f"Loading base model: {model_server_args.model_name_or_path}")

base_model = AutoModelForCausalLM.from_pretrained(
local_parser.base_model_name_or_path,
trust_remote_code=True,
low_cpu_mem_usage=True,
torch_dtype=torch.float16,
device_map={"":0})
device_map={"": 0})

print(f"Loading PEFT LoRA: {local_parser.peft_ckpt_path}")
model = PeftModel.from_pretrained(base_model, local_parser.peft_ckpt_path)

# args = get_args()

# print(f"Loading base model: {args.base_model_name_or_path}")
# base_model = AutoModelForCausalLM.from_pretrained(
# args.base_model_name_or_path,
# return_dict=True,
# torch_dtype=torch.float16,
# trust_remote_code=True
# )

# print(f"Loading PEFT: {args.peft_model_path}")
# model = PeftModel.from_pretrained(base_model, checkpoint_dir)
# model.to(args.device)

tokenizer = AutoTokenizer.from_pretrained(
model_server_args.model_name_or_path,
local_parser.base_model_name_or_path,
trust_remote_code=True,
use_fast=False,
)
@@ -64,80 +98,81 @@ def predict():
# Note also that `model.config.pad_token_id` is 0 which corresponds to `<unk>` token.
print('Adding special tokens.')
tokenizer.add_special_tokens({
"eos_token": tokenizer.convert_ids_to_tokens(model.config.eos_token_id),
"bos_token": tokenizer.convert_ids_to_tokens(model.config.bos_token_id),
"unk_token": tokenizer.convert_ids_to_tokens(
model.config.pad_token_id if model.config.pad_token_id != -1 else tokenizer.pad_token_id
),
"eos_token": tokenizer.convert_ids_to_tokens(model.config.eos_token_id),
"bos_token": tokenizer.convert_ids_to_tokens(model.config.bos_token_id),
"unk_token": tokenizer.convert_ids_to_tokens(
model.config.pad_token_id if model.config.pad_token_id != -
1 else tokenizer.pad_token_id
),
})
model.config.use_cache = False
# model.to(device)



# Load dataset.
dataset = load_dataset("json", data_files="dev_sql.json")
dataset = load_dataset("json", data_files=local_parser.input_data_json)
dataset = dataset.map(extract_sql_dataset, remove_columns=['instruction'])
# dataset_labels = dataset["train"]["output"]
# dataset_labels = dataset["train"]["output"]
dataset = dataset["train"]["input"]

result = []
predict_batchsize = 1
idx = 0
nums_examples =len(dataset)
nums_examples = len(dataset)
while idx < nums_examples:
if idx + predict_batchsize < nums_examples:
inputs = dataset[idx: idx+predict_batchsize]
idx += predict_batchsize
else:
inputs = dataset[idx: nums_examples]
idx = nums_examples
encoded_inputs = tokenizer.batch_encode_plus(inputs,
return_tensors="pt",
padding=True, truncation=True,
encoded_inputs = tokenizer.batch_encode_plus(inputs,
return_tensors="pt",
padding=True, truncation=True,
max_length=512
)
encoded_inputs = {name: tensor.to(device) for name, tensor in encoded_inputs.items()}
)
encoded_inputs = {name: tensor.to(device)
for name, tensor in encoded_inputs.items()}
outputs = model.generate(
**encoded_inputs,
**generation_args.to_dict(),
logits_processor=get_logits_processor()
)
# ## support different type LLM
**encoded_inputs,
**generation_args.to_dict(),
logits_processor=get_logits_processor()
)
# ## support different type LLM
# if re.search(r'(?i)falcon', model_path):
# generate_kwargs = {
# "input_ids": encoded_inputs["input_ids"],
# "input_ids": encoded_inputs["input_ids"],
# "attention_mask": encoded_inputs["attention_mask"]
# }
# outputs = model.generate(**generate_kwargs, max_length=512)
# elif re.search(r'(?i)llama', model_path):
# outputs = model.generate(
# **encoded_inputs,
# max_new_tokens=512,
# generation_config = training_args.generation_config,
# **encoded_inputs,
# max_new_tokens=512,
# generation_config = training_args.generation_config,
# logits_processor=get_logits_processor()
# )
# else:
# print("right now,not support well")

## support the compared format directly ,like origin inputs: \n orgin outputs labels \n predict;
for output in outputs:
# support the compared format directly ,like origin inputs: \n orgin outputs labels \n predict;
for output in outputs:
prediction = tokenizer.decode(output, skip_special_tokens=True)
response = re.split(r"Response:\s*", prediction)[-1]
result.append(response)
print(response)
print(idx)
## origin only predict format
# origin only predict format
# for output in outputs:
# prediction = tokenizer.decode(output, skip_special_tokens=True)
# response = re.split(r"Response:\s*", prediction)[-1]
# result.append(response.replace("\n", ""))
return result


if __name__ == "__main__":

result = predict()

with open('dev_pred.sql', 'w') as f:
with open(local_parser.output_name, 'w') as f:
for p in result:
f.write(p + "\n")
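
Taken together, the diff above is what decouples the frozen base checkpoint from the LoRA adapter weights: the base model is loaded once from --base_model_name_or_path, and the adapter from --peft_ckpt_path is attached on top via PeftModel. A condensed sketch of that loading pattern, using placeholder paths rather than real checkpoints:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_path = "./model"      # would come from --base_model_name_or_path
lora_path = "./adapter"    # would come from --peft_ckpt_path

# Load the frozen base weights once ...
base_model = AutoModelForCausalLM.from_pretrained(
    base_path, torch_dtype=torch.float16, trust_remote_code=True, device_map={"": 0})
# ... then attach the LoRA adapter produced by fine-tuning.
model = PeftModel.from_pretrained(base_model, lora_path)
tokenizer = AutoTokenizer.from_pretrained(base_path, trust_remote_code=True, use_fast=False)

On the command line the same decoupling is exposed through the new argparse flags, e.g. python predict_lora.py --base_model_name_or_path ./model --peft_ckpt_path <your lora ckpt> --input_data_json dev_sql.json --output_name ./data/out_pred/pred.sql, with flag names as defined in get_args above.
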
