Commit: wip

andreea-popescu-reef committed Sep 6, 2024
1 parent acd0157 commit 05865a6
Showing 8 changed files with 152 additions and 80 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build_push_image.yml
@@ -44,7 +44,7 @@ jobs:
IMAGE_NAME="${DOCKER_REPO_NAME}:${TAG_VERSION}"
cd src/compute_horde_prompt_gen
python download_model.py --huggingface_token "${{ secrets.HUGGINGFACE_API_KEY }}"
python download_model.py --model_name phi3 --huggingface_token "${{ secrets.HUGGINGFACE_API_KEY }}"
ls
docker build -t $IMAGE_NAME .
2 changes: 1 addition & 1 deletion .github/workflows/smoke_test.yml
@@ -27,7 +27,7 @@ jobs:
run: |
cd src/compute_horde_prompt_gen
python3 run.py --mock_model --number_of_batches 5 --number_of_prompts_per_batch 20 --uuids uuid1,uuid2,uuid3,uuid4,uuid5
python3 run.py --model_name mock --number_of_batches 5 --number_of_prompts_per_batch 20 --uuids uuid1,uuid2,uuid3,uuid4,uuid5
# mkdir saved_models/
# mkdir output/
7 changes: 5 additions & 2 deletions README.md
@@ -12,10 +12,12 @@ The generated prompts will be saved in `<output_folder_path>/prompts_<uuid>.txt`


```bash
# download the model data from huggingface
cd src/compute_horde_prompt_gen

# download model data
python3 download_model.py --huggingface_token <API_KEY>

cd src/compute_horde_prompt_gen
# build the image
docker build -t compute-horde-prompt-gen .
```

@@ -27,6 +29,7 @@ docker run -v ./output/:/app/output/ compute-horde-prompt-gen --number_of_batche

### testing
```bash
cd src/compute_horde_prompt_gen
python3 run.py --mock_model --number_of_batches 3 --number_of_prompts_per_batch 4 --uuids uuid1,uuid2,uuid3
```

29 changes: 21 additions & 8 deletions download_model.py → ...ompute_horde_prompt_gen/download_model.py
@@ -1,9 +1,15 @@
import os
import argparse
from transformers import (
AutoTokenizer,
AutoModelForCausalLM,
)

MODEL_PATHS = {
"llama3": "meta-llama/Meta-Llama-3.1-8B-Instruct",
"phi3": "microsoft/Phi-3.5-mini-instruct",
}

if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Save huggingface model")
parser.add_argument(
@@ -15,24 +21,31 @@
parser.add_argument(
"--model_name",
type=str,
default="meta-llama/Meta-Llama-3.1-8B-Instruct",
help="Model name to use",
choices=["llama3", "phi3"],
required=True,
help="Model to use - options are llama3 or phi3",
)
parser.add_argument(
"--model_path",
"--save_path",
type=str,
default="./src/compute_horde_prompt_gen/saved_models/",
default="./saved_models/",
help="Path to save the model and tokenizer to",
)

args = parser.parse_args()
save_path = os.path.join(args.save_path, args.model_name)
model_name = MODEL_PATHS[args.model_name]
print(f"Saving {model_name} model to {save_path}")

model = AutoModelForCausalLM.from_pretrained(
args.model_name,
model_name,
# either give token directly or assume logged in with huggingface-cli
token=args.huggingface_token or True,
)
model.save_pretrained(args.model_path)
model.save_pretrained(save_path)

tokenizer = AutoTokenizer.from_pretrained(args.model_name)
tokenizer.save_pretrained(args.model_path)
tokenizer = AutoTokenizer.from_pretrained(
model_name,
token=args.huggingface_token or True,
)
tokenizer.save_pretrained(save_path)
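
For reference, here is a minimal sketch (not part of the commit) of how the new `--model_name` choice resolves to a Hugging Face repo id and a local save directory; the names mirror the script above, and the example values are simply the defaults.

```python
# Illustrative sketch only: mirrors the mapping used in download_model.py above.
import os

MODEL_PATHS = {
    "llama3": "meta-llama/Meta-Llama-3.1-8B-Instruct",
    "phi3": "microsoft/Phi-3.5-mini-instruct",
}

model_name_arg = "phi3"          # value passed via --model_name
save_root = "./saved_models/"    # default --save_path

repo_id = MODEL_PATHS[model_name_arg]                 # microsoft/Phi-3.5-mini-instruct
save_path = os.path.join(save_root, model_name_arg)   # ./saved_models/phi3
```
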
91 changes: 78 additions & 13 deletions src/compute_horde_prompt_gen/model.py
@@ -1,19 +1,22 @@
import logging

from prompt import PROMPT_ENDING
import io

log = logging.getLogger(__name__)


def strip_input(output: str, ending: str) -> str:
# input prompt is repeated in the output, so we need to remove it
idx = output.find(ending) + len(ending)
return output[idx:].strip()


class MockModel:
def __init__(self):
pass

def generate(self, prompts: list[str], num_return_sequences: int, **_kwargs):
return [1 for _ in range(len(prompts) * num_return_sequences)]

def decode(self, _output):
return f"COPY PASTE INPUT PROMPT {PROMPT_ENDING} Here is the list of prompts:\nHow are you?\nDescribe something\nCount to ten\n"
content = f"Here is the list of prompts:\nHow are you?\nDescribe something\nCount to ten\n"
return [content for _ in range(len(prompts) * num_return_sequences)]


class GenerativeModel:
@@ -45,26 +48,88 @@ def __init__(self, model_path: str, quantize: bool = False):
model_path,
local_files_only=True,
)
# set default padding token
self.tokenizer.pad_token = self.tokenizer.eos_token

# def tokenize(self, prompts: list[str], role: str) -> str:
# pass

def decode(self, output) -> list[str]:
pass

def generate(
self,
prompts: list[str],
role: str,
num_return_sequences: int,
max_new_tokens: int,
temperature: float,
):
# encode the prompts
inputs = self.tokenizer(prompts, return_tensors="pt", padding=True).to("cuda")
inputs = self.tokenize(prompts, role)

return self.model.generate(
**inputs,
output = self.model.generate(
inputs,
max_new_tokens=max_new_tokens,
temperature=temperature,
num_return_sequences=num_return_sequences,
do_sample=True, # use sampling-based decoding
)

def decode(self, output):
return self.tokenizer.decode(output, skip_special_tokens=True)
return self.decode(output)

def tokenize(self, prompts: list[str], role: str) -> str:
# set default padding token
self.tokenizer.pad_token = self.tokenizer.eos_token

role_templates = {
"system": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n{{{{ {} }}}}<|eot_id|>",
"user": "<|start_header_id|>user<|end_header_id|>\n{{{{ {} }}}}<|eot_id|>",
"assistant": "<|start_header_id|>assistant<|end_header_id|>\n{{{{ {} }}}}<|eot_id|>",
"end": "<|start_header_id|>assistant<|end_header_id|>",
}

def tokenize(prompt: str) -> str:
msgs = [
{"role": "system", "content": role},
{"role": "user", "content": prompt},
]
full_prompt = io.StringIO()
for msg in msgs:
full_prompt.write(role_templates[msg["role"]].format(msg["content"]))
full_prompt.write(role_templates["end"])
return full_prompt.getvalue()

inputs = [tokenize(prompt) for prompt in prompts]
inputs = self.tokenizer(inputs, return_tensors="pt", padding=True).to("cuda")
return inputs


class Phi3(GenerativeModel):
def decode(self, output) -> list[str]:
print(f"\nraw_output: {output}\n")
# return [
# strip_input(x, "<|assistant|>") for x in self.tokenizer.batch_decode(output)
# ]
return [
strip_input(
self.tokenizer.decode(x, skip_special_tokens=True), " }}assistant"
)
for x in output
]

# def tokenize_phi3(self, prompts: list[str], role: str) -> str:
# inputs = [{"role": "user", "content": prompt} for prompt in prompts]
# print(f"\ninputs: {inputs}\n")
# inputs = self.tokenizer.apply_chat_template(
# inputs, add_generation_prompt=True, return_tensors="pt"
# ).to("cuda")
# return inputs


class Llama3(GenerativeModel):
def decode(self, output) -> list[str]:
return [
strip_input(
self.tokenizer.decode(x, skip_special_tokens=True), " }}assistant"
)
for x in output
]
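
To make the tokenize/decode round trip above easier to follow, this illustrative sketch (not part of the commit) spells out the string that `GenerativeModel.tokenize()` builds for a single prompt using the Llama 3 chat markers from the code, and what the decoders strip back off.

```python
# Illustrative only: the wrapped prompt that tokenize() hands to the HF tokenizer.
role = "You are a prompt engineer ..."         # from PromptGeneratingPrompt.generate_role()
prompt = "Generate a list of 5 questions ..."  # from PromptGeneratingPrompt.generate_prompt()

full_prompt = (
    f"<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n{{{{ {role} }}}}<|eot_id|>"
    f"<|start_header_id|>user<|end_header_id|>\n{{{{ {prompt} }}}}<|eot_id|>"
    "<|start_header_id|>assistant<|end_header_id|>"
)
# decode() removes the special tokens, and strip_input() then drops everything up
# to and including " }}assistant", keeping only the newly generated text.
```
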
43 changes: 11 additions & 32 deletions src/compute_horde_prompt_gen/prompt.py
@@ -1,9 +1,6 @@
import io
import random
from seeds import THEMES, ABILITIES, FORMATS

PROMPT_ENDING = " }}assistant"


class PromptGeneratingPrompt:
def random_select(self, arr: list[str], num: int = 5) -> str:
@@ -18,39 +15,21 @@ def generate_prompt(self) -> str:
abilities = self.random_select(ABILITIES, num=4)
formats = self.random_select(FORMATS, num=5)

# prompt = (
# f"Generate a list of 5 concise prompts (questions or instruct tasks) that cover a wide range of skills and knowledge areas related to the themes of {themes}. "
# f"Each of these prompts should: "
# f"\n- have a complexity level of {complexity_level} out of 20 and a relevance level to the theme of {relevance_level} out of 20"
# f"\n- test various cognitive abilities ({abilities}) and require different types of writting formats ({formats})"
# f"\n- challenge the model's ability to understand and respond appropriately"
# f"\n- varyingly explore the {themes} in a manner that is consistent with their assigned complexity and relevance levels to the theme"
# f"\nOutput each prompt on a new line without any extra commentary or special characters."
# )
prompt = (
f"Generate a list of 5 complex prompts (questions or instruct tasks) that cover a wide range of skills and knowledge areas related to the themes of {themes}. "
f"Each of these prompts should: "
f"\n- have a complexity level of {complexity_level} out of 20 and a relevance level to the theme of {relevance_level} out of 20"
f"\n- test various cognitive abilities ({abilities}) and require different types of writting formats ({formats})"
f"\n- challenge the model's ability to understand and respond appropriately"
f"\n- varyingly explore the {themes} in a manner that is consistent with their assigned complexity and relevance levels to the theme"
f"\nOutput each prompt on a new line without any extra commentary or special characters."
f"Generate a list of 5 questions or instruct tasks related to the themes of {themes}. "
f"Output each prompt on a new line without any extra commentary or special characters."
)
return prompt

def generate_role(self) -> str:
role = "You are a prompt engineer tasked with prompts of varying complexity to test the capabilities of a new language model. For each prompt, consider what aspect of the language model's capabilities it is designed to test and ensure that the set of prompts covers a broad spectrum of potential use cases for the language model. Only output the prompts, one per line without any extra commentary. Do not use any special characters or formatting, numbering or styling in the output."
return role

def tokenize(self, prompt: str, role: str) -> str:
role_templates = {
"system": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n{{{{ {} }}}}<|eot_id|>",
"user": "<|start_header_id|>user<|end_header_id|>\n{{{{ {} }}}}<|eot_id|>",
"assistant": "<|start_header_id|>assistant<|end_header_id|>\n{{{{ {} }}}}<|eot_id|>",
"end": "<|start_header_id|>assistant<|end_header_id|>",
}
msgs = [
{"role": "system", "content": role},
{"role": "user", "content": prompt},
]
full_prompt = io.StringIO()
for msg in msgs:
full_prompt.write(role_templates[msg["role"]].format(msg["content"]))
full_prompt.write(role_templates["end"])
return full_prompt.getvalue()

def generate(self):
prompt = self.generate_prompt()
role = self.generate_role()
return self.tokenize(prompt, role)
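
With the chat-template code moved into `model.py`, the prompt generator is now a plain string factory. A short sketch (not part of the commit; module names as in this repo) of how run.py consumes it after this change:

```python
# Illustrative sketch only: run.py now calls these two methods directly and
# leaves the chat-template wrapping to the model's tokenize().
from prompt import PromptGeneratingPrompt

generator = PromptGeneratingPrompt()
prompts = [generator.generate_prompt() for _ in range(4)]  # one batch of raw instructions
role = generator.generate_role()                           # shared system prompt
```
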
52 changes: 35 additions & 17 deletions src/compute_horde_prompt_gen/run.py
@@ -4,9 +4,10 @@
import argparse

from prompt import PromptGeneratingPrompt
from model import MockModel, GenerativeModel
from model import MockModel, Llama3, Phi3
from utils import parse_output, append_to_file

logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)


@@ -24,25 +25,31 @@ def generate_prompts(
i = -1
while total_prompts > 0:
i += 1
prompts = [prompt_generator.generate() for _ in range(batch_size)]
prompts = [prompt_generator.generate_prompt() for _ in range(batch_size)]
role = prompt_generator.generate_role()

start_ts = datetime.datetime.now()
sequences = model.generate(
num_return_sequences=num_return_sequences,
prompts=prompts,
role=role,
max_new_tokens=max_new_tokens,
temperature=temperature,
)
print(f"\nsequences: {sequences}\n")

seconds_taken = (datetime.datetime.now() - start_ts).total_seconds()
log.info(f"{i=} generation took {seconds_taken:.2f}s")
return 0

new_prompts = []
for j, sequence in enumerate(sequences):
output = model.decode(sequence)
generated_prompts = parse_output(output)
log.debug(f"{i=} sequence={j} {generated_prompts=} from {output=}")
# output = model.decode(sequence)
# log.info(f"\n\n{i=} output={output}\n\n")
generated_prompts = parse_output(sequence)
log.debug(f"{i=} sequence={j} {generated_prompts=} from {sequence=}")

log.info(f"{i=} {sequence=} generated {len(generated_prompts)} prompts")
log.info(f"{i=} sequence={j} generated {len(generated_prompts)} prompts")
new_prompts.extend(generated_prompts)

# check_prompts_quality(new_prompts)
@@ -92,6 +99,13 @@ def generate_prompts(
default=1.0,
help="Temperature",
)
parser.add_argument(
"--model_name",
type=str,
choices=["llama3", "phi3", "mock"],
required=True,
help="Model to use - options are llama3 or phi3",
)
parser.add_argument(
"--model_path",
type=str,
@@ -116,12 +130,6 @@ def generate_prompts(
required=True,
help="Comma separated list of uuids, used as file names of output batches, i.e. `output/prompts_{uuid}.txt`",
)
parser.add_argument(
"--mock_model",
action="store_true",
default=False,
help="Mock llama3 model for testing purposes only",
)
parser.add_argument(
"--output_folder_path",
type=str,
@@ -138,11 +146,21 @@ def generate_prompts(
len(uuids) == args.number_of_batches
), "Number of uuids should be equal to number of batches requested"

model = (
GenerativeModel(model_path=args.model_path, quantize=args.quantize)
if not args.mock_model
else MockModel()
)
model_path = os.path.join(args.model_path, args.model_name)
if args.model_name == "mock":
model = MockModel()
elif args.model_name == "llama3":
model = Llama3(
model_path=model_path,
quantize=args.quantize,
)
elif args.model_name == "phi3":
model = Phi3(
model_path=model_path,
quantize=args.quantize,
)
else:
raise ValueError(f"Invalid model name: {args.model_name}")

for uuid in uuids:
start_ts = datetime.datetime.now()
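
A condensed sketch (not part of the commit; module layout and defaults assumed from the diffs above) of how the removed `--mock_model` flag is replaced: `--model_name` both picks the model class and the subdirectory under `--model_path` where `download_model.py` saved the weights.

```python
# Illustrative sketch only: mirrors the model selection added to run.py above.
import os
from model import MockModel, Llama3, Phi3

def build_model(model_name: str, model_root: str, quantize: bool = False):
    model_path = os.path.join(model_root, model_name)
    if model_name == "mock":
        return MockModel()
    if model_name == "llama3":
        return Llama3(model_path=model_path, quantize=quantize)
    if model_name == "phi3":
        return Phi3(model_path=model_path, quantize=quantize)
    raise ValueError(f"Invalid model name: {model_name}")

# e.g. build_model("phi3", "./saved_models/") expects weights under
# ./saved_models/phi3, matching download_model.py's default save path.
```
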