diff --git a/.github/workflows/build_push_image.yml b/.github/workflows/build_push_image.yml
index 8fe3a8c..74be7cc 100644
--- a/.github/workflows/build_push_image.yml
+++ b/.github/workflows/build_push_image.yml
@@ -44,7 +44,7 @@ jobs:
           IMAGE_NAME="${DOCKER_REPO_NAME}:${TAG_VERSION}"
 
           cd src/compute_horde_prompt_gen
-          python download_model.py --huggingface_token "${{ secrets.HUGGINGFACE_API_KEY }}"
+          python download_model.py --model_name phi3 --huggingface_token "${{ secrets.HUGGINGFACE_API_KEY }}"
           ls
           docker build -t $IMAGE_NAME .
 
diff --git a/.github/workflows/smoke_test.yml b/.github/workflows/smoke_test.yml
index da9979a..fa1062a 100644
--- a/.github/workflows/smoke_test.yml
+++ b/.github/workflows/smoke_test.yml
@@ -27,7 +27,7 @@ jobs:
         run: |
           cd src/compute_horde_prompt_gen
-          python3 run.py --mock_model --number_of_batches 5 --number_of_prompts_per_batch 20 --uuids uuid1,uuid2,uuid3,uuid4,uuid5
+          python3 run.py --model_name mock --number_of_batches 5 --number_of_prompts_per_batch 20 --uuids uuid1,uuid2,uuid3,uuid4,uuid5
 
           # mkdir saved_models/
           # mkdir output/
diff --git a/README.md b/README.md
index 87612b8..4b95e41 100644
--- a/README.md
+++ b/README.md
@@ -12,10 +12,12 @@ The generated prompts will be saved in `/prompts_.txt`
 
 ```bash
-# download the model data from huggingface
+cd src/compute_horde_prompt_gen
+
+# download model data
 python3 download_model.py --huggingface_token
 
-cd src/compute_horde_prompt_gen
+# build the image
 docker build -t compute-horde-prompt-gen .
 ```
 
@@ -27,6 +29,7 @@ docker run -v ./output/:/app/output/ compute-horde-prompt-gen --number_of_batche
 ### testint
 
 ```bash
+cd src/compute_horde_prompt_gen
 python3 run.py --mock_model --number_of_batches 3 --number_of_prompts_per_batch 4 --uuids uuid1,uuid2,uuid3
 ```
diff --git a/download_model.py b/src/compute_horde_prompt_gen/download_model.py
similarity index 51%
rename from download_model.py
rename to src/compute_horde_prompt_gen/download_model.py
index 0336dd7..7516d76 100644
--- a/download_model.py
+++ b/src/compute_horde_prompt_gen/download_model.py
@@ -1,9 +1,15 @@
+import os
 import argparse
 
 from transformers import (
     AutoTokenizer,
     AutoModelForCausalLM,
 )
+MODEL_PATHS = {
+    "llama3": "meta-llama/Meta-Llama-3.1-8B-Instruct",
+    "phi3": "microsoft/Phi-3.5-mini-instruct",
+}
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Save huggingface model")
     parser.add_argument(
@@ -15,24 +21,31 @@
     parser.add_argument(
         "--model_name",
         type=str,
-        default="meta-llama/Meta-Llama-3.1-8B-Instruct",
-        help="Model name to use",
+        choices=["llama3", "phi3"],
+        required=True,
+        help="Model to use - options are llama3 or phi3",
     )
     parser.add_argument(
-        "--model_path",
+        "--save_path",
         type=str,
-        default="./src/compute_horde_prompt_gen/saved_models/",
+        default="./saved_models/",
         help="Path to save the model and tokenizer to",
     )
     args = parser.parse_args()
 
+    save_path = os.path.join(args.save_path, args.model_name)
+    model_name = MODEL_PATHS[args.model_name]
+    print(f"Saving {model_name} model to {save_path}")
     model = AutoModelForCausalLM.from_pretrained(
-        args.model_name,
+        model_name,
         # either give token directly or assume logged in with huggingface-cli
        token=args.huggingface_token or True,
     )
-    model.save_pretrained(args.model_path)
+    model.save_pretrained(save_path)
 
-    tokenizer = AutoTokenizer.from_pretrained(args.model_name)
-    tokenizer.save_pretrained(args.model_path)
+    tokenizer = AutoTokenizer.from_pretrained(
+        model_name,
+        token=args.huggingface_token or True,
+    )
+    tokenizer.save_pretrained(save_path)
diff --git a/src/compute_horde_prompt_gen/model.py b/src/compute_horde_prompt_gen/model.py
index 65f1c4f..9cb1c30 100644
--- a/src/compute_horde_prompt_gen/model.py
+++ b/src/compute_horde_prompt_gen/model.py
@@ -1,19 +1,22 @@
 import logging
-
-from prompt import PROMPT_ENDING
+import io
 
 log = logging.getLogger(__name__)
 
 
+def strip_input(output: str, ending: str) -> str:
+    # input prompt is repeated in the output, so we need to remove it
+    idx = output.find(ending) + len(ending)
+    return output[idx:].strip()
+
+
 class MockModel:
     def __init__(self):
         pass
 
     def generate(self, prompts: list[str], num_return_sequences: int, **_kwargs):
-        return [1 for _ in range(len(prompts) * num_return_sequences)]
-
-    def decode(self, _output):
-        return f"COPY PASTE INPUT PROMPT {PROMPT_ENDING} Here is the list of prompts:\nHow are you?\nDescribe something\nCount to ten\n"
+        content = f"Here is the list of prompts:\nHow are you?\nDescribe something\nCount to ten\n"
+        return [content for _ in range(len(prompts) * num_return_sequences)]
 
 
 class GenerativeModel:
@@ -45,26 +48,88 @@ def __init__(self, model_path: str, quantize: bool = False):
             model_path,
             local_files_only=True,
         )
-        # set default padding token
-        self.tokenizer.pad_token = self.tokenizer.eos_token
+
+    # def tokenize(self, prompts: list[str], role: str) -> str:
+    #     pass
+
+    def decode(self, output) -> list[str]:
+        pass
 
     def generate(
         self,
         prompts: list[str],
+        role: str,
         num_return_sequences: int,
         max_new_tokens: int,
         temperature: float,
     ):
         # encode the prompts
-        inputs = self.tokenizer(prompts, return_tensors="pt", padding=True).to("cuda")
+        inputs = self.tokenize(prompts, role)
 
-        return self.model.generate(
-            **inputs,
+        output = self.model.generate(
+            inputs,
             max_new_tokens=max_new_tokens,
             temperature=temperature,
             num_return_sequences=num_return_sequences,
             do_sample=True,  # use sampling-based decoding
         )
 
-    def decode(self, output):
-        return self.tokenizer.decode(output, skip_special_tokens=True)
+        return self.decode(output)
+
+    def tokenize(self, prompts: list[str], role: str) -> str:
+        # set default padding token
+        self.tokenizer.pad_token = self.tokenizer.eos_token
+
+        role_templates = {
+            "system": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n{{{{ {} }}}}<|eot_id|>",
+            "user": "<|start_header_id|>user<|end_header_id|>\n{{{{ {} }}}}<|eot_id|>",
+            "assistant": "<|start_header_id|>assistant<|end_header_id|>\n{{{{ {} }}}}<|eot_id|>",
+            "end": "<|start_header_id|>assistant<|end_header_id|>",
+        }
+
+        def tokenize(prompt: str) -> str:
+            msgs = [
+                {"role": "system", "content": role},
+                {"role": "user", "content": prompt},
+            ]
+            full_prompt = io.StringIO()
+            for msg in msgs:
+                full_prompt.write(role_templates[msg["role"]].format(msg["content"]))
+            full_prompt.write(role_templates["end"])
+            return full_prompt.getvalue()
+
+        inputs = [tokenize(prompt) for prompt in prompts]
+        inputs = self.tokenizer(inputs, return_tensors="pt", padding=True).to("cuda")
+        return inputs
+
+
+class Phi3(GenerativeModel):
+    def decode(self, output) -> list[str]:
+        print(f"\nraw_output: {output}\n")
+        # return [
+        #     strip_input(x, "<|assistant|>") for x in self.tokenizer.batch_decode(output)
+        # ]
+        return [
+            strip_input(
+                self.tokenizer.decode(x, skip_special_tokens=True), " }}assistant"
+            )
+            for x in output
+        ]
+
+    # def tokenize_phi3(self, prompts: list[str], role: str) -> str:
+    #     inputs = [{"role": "user", "content": prompt} for prompt in prompts]
+    #     print(f"\ninputs: {inputs}\n")
+    #     inputs = self.tokenizer.apply_chat_template(
+    #         inputs, add_generation_prompt=True, return_tensors="pt"
+    #     ).to("cuda")
+    #     return inputs
+
+
+class Llama3(GenerativeModel):
+    def decode(self, output) -> list[str]:
+        return [
+            strip_input(
+                self.tokenizer.decode(x, skip_special_tokens=True), " }}assistant"
+            )
+            for x in output
+        ]
diff --git a/src/compute_horde_prompt_gen/prompt.py b/src/compute_horde_prompt_gen/prompt.py
index d97f049..075b9f3 100644
--- a/src/compute_horde_prompt_gen/prompt.py
+++ b/src/compute_horde_prompt_gen/prompt.py
@@ -1,9 +1,6 @@
-import io
 import random
 
 from seeds import THEMES, ABILITIES, FORMATS
 
-PROMPT_ENDING = " }}assistant"
-
 class PromptGeneratingPrompt:
     def random_select(self, arr: list[str], num: int = 5) -> str:
@@ -18,39 +15,21 @@ def generate_prompt(self) -> str:
         abilities = self.random_select(ABILITIES, num=4)
         formats = self.random_select(FORMATS, num=5)
 
+        # prompt = (
+        #     f"Generate a list of 5 concise prompts (questions or instruct tasks) that cover a wide range of skills and knowledge areas related to the themes of {themes}. "
+        #     f"Each of these prompts should: "
+        #     f"\n- have a complexity level of {complexity_level} out of 20 and a relevance level to the theme of {relevance_level} out of 20"
+        #     f"\n- test various cognitive abilities ({abilities}) and require different types of writting formats ({formats})"
+        #     f"\n- challenge the model's ability to understand and respond appropriately"
+        #     f"\n- varyingly explore the {themes} in a manner that is consistent with their assigned complexity and relevance levels to the theme"
+        #     f"\nOutput each prompt on a new line without any extra commentary or special characters."
+        # )
         prompt = (
-            f"Generate a list of 5 complex prompts (questions or instruct tasks) that cover a wide range of skills and knowledge areas related to the themes of {themes}. "
-            f"Each of these prompts should: "
-            f"\n- have a complexity level of {complexity_level} out of 20 and a relevance level to the theme of {relevance_level} out of 20"
-            f"\n- test various cognitive abilities ({abilities}) and require different types of writting formats ({formats})"
-            f"\n- challenge the model's ability to understand and respond appropriately"
-            f"\n- varyingly explore the {themes} in a manner that is consistent with their assigned complexity and relevance levels to the theme"
-            f"\nOutput each prompt on a new line without any extra commentary or special characters."
+            f"Generate a list of 5 questions or instruct tasks related to the themes of {themes}. "
+            f"Output each prompt on a new line without any extra commentary or special characters."
         )
         return prompt
 
     def generate_role(self) -> str:
         role = "You are a prompt engineer tasked with prompts of varying complexity to test the capabilities of a new language model. For each prompt, consider what aspect of the language model's capabilities it is designed to test and ensure that the set of prompts covers a broad spectrum of potential use cases for the language model. Only output the prompts, one per line without any extra commentary. Do not use any special characters or formatting, numbering or styling in the output."
         return role
-
-    def tokenize(self, prompt: str, role: str) -> str:
-        role_templates = {
-            "system": "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n{{{{ {} }}}}<|eot_id|>",
-            "user": "<|start_header_id|>user<|end_header_id|>\n{{{{ {} }}}}<|eot_id|>",
-            "assistant": "<|start_header_id|>assistant<|end_header_id|>\n{{{{ {} }}}}<|eot_id|>",
-            "end": "<|start_header_id|>assistant<|end_header_id|>",
-        }
-        msgs = [
-            {"role": "system", "content": role},
-            {"role": "user", "content": prompt},
-        ]
-        full_prompt = io.StringIO()
-        for msg in msgs:
-            full_prompt.write(role_templates[msg["role"]].format(msg["content"]))
-        full_prompt.write(role_templates["end"])
-        return full_prompt.getvalue()
-
-    def generate(self):
-        prompt = self.generate_prompt()
-        role = self.generate_role()
-        return self.tokenize(prompt, role)
diff --git a/src/compute_horde_prompt_gen/run.py b/src/compute_horde_prompt_gen/run.py
index 4b12eaf..baed85a 100644
--- a/src/compute_horde_prompt_gen/run.py
+++ b/src/compute_horde_prompt_gen/run.py
@@ -4,9 +4,10 @@
 import argparse
 
 from prompt import PromptGeneratingPrompt
-from model import MockModel, GenerativeModel
+from model import MockModel, Llama3, Phi3
 from utils import parse_output, append_to_file
 
+logging.basicConfig(level=logging.INFO)
 log = logging.getLogger(__name__)
 
 
@@ -24,25 +25,30 @@ def generate_prompts(
     i = -1
     while total_prompts > 0:
         i += 1
-        prompts = [prompt_generator.generate() for _ in range(batch_size)]
+        prompts = [prompt_generator.generate_prompt() for _ in range(batch_size)]
+        role = prompt_generator.generate_role()
         start_ts = datetime.datetime.now()
         sequences = model.generate(
             num_return_sequences=num_return_sequences,
             prompts=prompts,
+            role=role,
             max_new_tokens=max_new_tokens,
             temperature=temperature,
         )
+        print(f"\nsequences: {sequences}\n")
+
         seconds_taken = (datetime.datetime.now() - start_ts).total_seconds()
         log.info(f"{i=} generation took {seconds_taken:.2f}s")
 
         new_prompts = []
         for j, sequence in enumerate(sequences):
-            output = model.decode(sequence)
-            generated_prompts = parse_output(output)
-            log.debug(f"{i=} sequence={j} {generated_prompts=} from {output=}")
+            # output = model.decode(sequence)
+            # log.info(f"\n\n{i=} output={output}\n\n")
+            generated_prompts = parse_output(sequence)
+            log.debug(f"{i=} sequence={j} {generated_prompts=} from {sequence=}")
 
-            log.info(f"{i=} {sequence=} generated {len(generated_prompts)} prompts")
+            log.info(f"{i=} sequence={j} generated {len(generated_prompts)} prompts")
             new_prompts.extend(generated_prompts)
 
         # check_prompts_quality(new_prompts)
@@ -92,6 +99,13 @@
         default=1.0,
         help="Temperature",
     )
+    parser.add_argument(
+        "--model_name",
+        type=str,
+        choices=["llama3", "phi3", "mock"],
+        required=True,
+        help="Model to use - options are llama3, phi3 or mock",
+    )
     parser.add_argument(
         "--model_path",
         type=str,
@@ -116,12 +130,6 @@
         required=True,
         help="Comma separated list of uuids, used as file names of output batches, i.e. `output/prompts_{uuid}.txt`",
     )
-    parser.add_argument(
-        "--mock_model",
-        action="store_true",
-        default=False,
-        help="Mock llama3 model for testing purposes only",
-    )
     parser.add_argument(
         "--output_folder_path",
         type=str,
@@ -138,11 +146,21 @@
         len(uuids) == args.number_of_batches
     ), "Number of uuids should be equal to number of batches requested"
 
-    model = (
-        GenerativeModel(model_path=args.model_path, quantize=args.quantize)
-        if not args.mock_model
-        else MockModel()
-    )
+    model_path = os.path.join(args.model_path, args.model_name)
+    if args.model_name == "mock":
+        model = MockModel()
+    elif args.model_name == "llama3":
+        model = Llama3(
+            model_path=model_path,
+            quantize=args.quantize,
+        )
+    elif args.model_name == "phi3":
+        model = Phi3(
+            model_path=model_path,
+            quantize=args.quantize,
+        )
+    else:
+        raise ValueError(f"Invalid model name: {args.model_name}")
 
     for uuid in uuids:
         start_ts = datetime.datetime.now()
diff --git a/src/compute_horde_prompt_gen/utils.py b/src/compute_horde_prompt_gen/utils.py
index 64ca608..5825cab 100644
--- a/src/compute_horde_prompt_gen/utils.py
+++ b/src/compute_horde_prompt_gen/utils.py
@@ -3,8 +3,6 @@
 import logging
 import collections
 
-from prompt import PROMPT_ENDING
-
 log = logging.getLogger(__name__)
 
 
@@ -16,10 +14,6 @@ def clean_line(line: str) -> str:
 
 
 def parse_output(output: str) -> list[str]:
-    # input prompt is repeated in the output, so we need to remove it
-    idx = output.find(PROMPT_ENDING) + len(PROMPT_ENDING)
-    output = output[idx:].strip()
-
     # split into lines and clean them
     lines = output.split("\n")
     lines = [clean_line(line) for line in lines]
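With this change, both scripts take a required `--model_name` argument (`llama3`/`phi3` for the real models, plus `mock` in `run.py`) instead of the old `--mock_model` flag. A minimal usage sketch based on the commands shown in the workflows and README above; the Hugging Face token value is a placeholder:

```bash
cd src/compute_horde_prompt_gen

# download weights for one of the supported models (saved under ./saved_models/<model_name>/)
python3 download_model.py --model_name phi3 --huggingface_token "<HF_TOKEN>"

# quick local check with the mock model, mirroring the smoke test
python3 run.py --model_name mock --number_of_batches 3 --number_of_prompts_per_batch 4 --uuids uuid1,uuid2,uuid3
```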