From e415cd303e2b0b83a2780885714c40b0aa34cfb4 Mon Sep 17 00:00:00 2001
From: Matthew Kotila
Date: Thu, 14 Dec 2023 18:15:43 -0800
Subject: [PATCH] Add profile subcommand to run perf analyzer

---
 src/triton_cli/parser.py   |  16 +-
 src/triton_cli/profiler.py | 612 +++++++++++++++++++++++++++++++++++++
 2 files changed, 627 insertions(+), 1 deletion(-)
 create mode 100644 src/triton_cli/profiler.py

diff --git a/src/triton_cli/parser.py b/src/triton_cli/parser.py
index d06d5a6..2180c73 100644
--- a/src/triton_cli/parser.py
+++ b/src/triton_cli/parser.py
@@ -9,6 +9,7 @@
 from triton_cli.metrics import MetricsClient
 from triton_cli.repository import ModelRepository
 from triton_cli.server.server_factory import TritonServerFactory
+from triton_cli.profiler import Profiler
 
 logger = logging.getLogger("triton")
 
@@ -84,6 +85,17 @@ def handle_model(args: argparse.Namespace):
 
     if args.subcommand == "infer":
         client.infer(args.model, args.data, args.prompt)
+    elif args.subcommand == "profile":
+        # TODO: run PA LLM benchmark script
+        print("pull engine()")
+        print("run_server()")
+        print("profile()")
+        Profiler.profile(
+            model=args.model,
+            batch_size=64,
+            url="localhost:8001",
+            input_length=2048,
+        )
     elif args.subcommand == "config":
         config = client.get_model_config(args.model)
         if config:
@@ -160,11 +172,13 @@
         default=None,
         help="Text input to LLM-like models. Required for inference on LLMs, optional otherwise.",
     )
+    profile = model_commands.add_parser("profile", help="Run Perf Analyzer")
+    profile.add_argument("-m", "--model", type=str, required=True, help="Model name")
     config = model_commands.add_parser("config", help="Get config for model")
     config.add_argument("-m", "--model", type=str, required=True, help="Model name")
     metrics = model_commands.add_parser("metrics", help="Get metrics for model")
     metrics.add_argument("-m", "--model", type=str, required=True, help="Model name")
 
-    add_client_args([infer, config, metrics])
+    add_client_args([infer, profile, config, metrics])
 
     return model
diff --git a/src/triton_cli/profiler.py b/src/triton_cli/profiler.py
new file mode 100644
index 0000000..9da7fd2
--- /dev/null
+++ b/src/triton_cli/profiler.py
@@ -0,0 +1,612 @@
+# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#  * Redistributions of source code must retain the above copyright
+#    notice, this list of conditions and the following disclaimer.
+#  * Redistributions in binary form must reproduce the above copyright
+#    notice, this list of conditions and the following disclaimer in the
+#    documentation and/or other materials provided with the distribution.
+#  * Neither the name of NVIDIA CORPORATION nor the names of its
+#    contributors may be used to endorse or promote products derived
+#    from this software without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
+# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
+# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+import json
+import subprocess
+from dataclasses import dataclass
+from itertools import pairwise
+from pathlib import Path
+from typing import Optional
+
+import numpy as np
+
+INPUT_FILENAME = "generated_input_data.json"
+METRIC_FIELDS = {
+    "max_first_token_latency": ("Max first token latency", "ms"),
+    "min_first_token_latency": ("Min first token latency", "ms"),
+    "avg_first_token_latency": ("Avg first token latency", "ms"),
+    "p50_first_token_latency": ("p50 first token latency", "ms"),
+    "p90_first_token_latency": ("p90 first token latency", "ms"),
+    "p95_first_token_latency": ("p95 first token latency", "ms"),
+    "p99_first_token_latency": ("p99 first token latency", "ms"),
+    "max_gen_latency": ("Max generation latency", "ms"),
+    "min_gen_latency": ("Min generation latency", "ms"),
+    "avg_gen_latency": ("Avg generation latency", "ms"),
+    "p50_gen_latency": ("p50 generation latency", "ms"),
+    "p90_gen_latency": ("p90 generation latency", "ms"),
+    "p95_gen_latency": ("p95 generation latency", "ms"),
+    "p99_gen_latency": ("p99 generation latency", "ms"),
+    "avg_output_token_latency": ("Avg output token latency", "ms/output token"),
+    "avg_total_t2t_latency": ("Avg total token-to-token latency", "ms"),
+    "max_e2e_latency": ("Max end-to-end latency", "ms"),
+    "min_e2e_latency": ("Min end-to-end latency", "ms"),
+    "avg_e2e_latency": ("Avg end-to-end latency", "ms"),
+    "p50_e2e_latency": ("p50 end-to-end latency", "ms"),
+    "p90_e2e_latency": ("p90 end-to-end latency", "ms"),
+    "p95_e2e_latency": ("p95 end-to-end latency", "ms"),
+    "p99_e2e_latency": ("p99 end-to-end latency", "ms"),
+    "max_e2e_throughput": ("Max end-to-end throughput", "tokens/s"),
+    "min_e2e_throughput": ("Min end-to-end throughput", "tokens/s"),
+    "avg_e2e_throughput": ("Avg end-to-end throughput", "tokens/s"),
+    "p50_e2e_throughput": ("p50 end-to-end throughput", "tokens/s"),
+    "p90_e2e_throughput": ("p90 end-to-end throughput", "tokens/s"),
+    "p95_e2e_throughput": ("p95 end-to-end throughput", "tokens/s"),
+    "p99_e2e_throughput": ("p99 end-to-end throughput", "tokens/s"),
+    "max_gen_throughput": ("Max generation throughput", "output tokens/s"),
+    "min_gen_throughput": ("Min generation throughput", "output tokens/s"),
+    "avg_gen_throughput": ("Avg generation throughput", "output tokens/s"),
+    "p50_gen_throughput": ("p50 generation throughput", "output tokens/s"),
+    "p90_gen_throughput": ("p90 generation throughput", "output tokens/s"),
+    "p95_gen_throughput": ("p95 generation throughput", "output tokens/s"),
+    "p99_gen_throughput": ("p99 generation throughput", "output tokens/s"),
+}
+
+
+@dataclass
+class ProfileResults:
+    prompt_size: int
+    max_first_token_latency: Optional[float] = None
+    min_first_token_latency: Optional[float] = None
+    avg_first_token_latency: Optional[float] = None
+    p50_first_token_latency: Optional[float] = None
+    p90_first_token_latency: Optional[float] = None
+    p95_first_token_latency: Optional[float] = None
+    p99_first_token_latency: Optional[float] = None
+    max_gen_latency: Optional[float] = None
+    min_gen_latency: Optional[float] = None
+    avg_gen_latency: Optional[float] = None
+    p50_gen_latency: Optional[float] = None
+    p90_gen_latency: Optional[float] = None
+    p95_gen_latency: Optional[float] = None
+    p99_gen_latency: Optional[float] = None
+    avg_output_token_latency: Optional[float] = None
+    avg_total_t2t_latency: Optional[float] = None
+    avg_periodic_t2t_latencies: Optional[list[float]] = None
+    max_e2e_latency: Optional[float] = None
+    min_e2e_latency: Optional[float] = None
+    avg_e2e_latency: Optional[float] = None
+    p50_e2e_latency: Optional[float] = None
+    p90_e2e_latency: Optional[float] = None
+    p95_e2e_latency: Optional[float] = None
+    p99_e2e_latency: Optional[float] = None
+    max_e2e_throughput: Optional[float] = None
+    min_e2e_throughput: Optional[float] = None
+    avg_e2e_throughput: Optional[float] = None
+    p50_e2e_throughput: Optional[float] = None
+    p90_e2e_throughput: Optional[float] = None
+    p95_e2e_throughput: Optional[float] = None
+    p99_e2e_throughput: Optional[float] = None
+    max_gen_throughput: Optional[float] = None
+    min_gen_throughput: Optional[float] = None
+    avg_gen_throughput: Optional[float] = None
+    p50_gen_throughput: Optional[float] = None
+    p90_gen_throughput: Optional[float] = None
+    p95_gen_throughput: Optional[float] = None
+    p99_gen_throughput: Optional[float] = None
+
+
+def load_json_data(filename):
+    with open(filename) as f:
+        return json.load(f)
+
+
+def save_json_data(data, filename):
+    with open(filename, "w") as f:
+        json.dump(data, f)
+
+
+def get_postfix(args, prompt_size):
+    """Generate postfix for profile export filename and plot.
+
+    e.g.
+    - trtllm-ensemble-online-prompt100-maxtokens256
+    - trtllm-ensemble-online-prompt100-periodic1_100_1-period32-maxtokens1024
+    """
+    stream_type = "offline" if args.offline else "online"
+    postfix = f"{args.backend}-{args.model}-{stream_type}-prompt{prompt_size}-"
+    if args.periodic_concurrency_range:
+        start, end, step = args.periodic_concurrency_range
+        postfix += f"periodic{start}_{end}_{step}-period{args.request_period}-"
+    postfix += f"maxtokens{args.max_tokens}"
+    return postfix
+
+
+def get_export_filename(args, prompt_size):
+    postfix = get_postfix(args, prompt_size)
+    filename = f"profile_export-{postfix}.json"
+    return filename
+
+
+def get_plot_filename(args, prompt_size):
+    postfix = get_postfix(args, prompt_size)
+    filename = f"inflight_batching_benchmark-{postfix}.png"
+    return filename
+
+
+def print_benchmark_summary(profile_results):
+    print("[ BENCHMARK SUMMARY ]")
+    for pr in profile_results:
+        print(f"Prompt size: {pr.prompt_size}")
+        for metric, (name, unit) in METRIC_FIELDS.items():
+            if getattr(pr, metric) is not None:
+                print(f" * {name}: {getattr(pr, metric):.4f} {unit}")
+        print("")
+
+
+def plot_results(latencies, filename="inflight_batching_benchmark.png"):
+    """Plot in-flight batching LLM benchmark results."""
+    import matplotlib.pyplot as plt  # Lazy import
+
+    periods = np.arange(1, len(latencies) + 1)
+    fig, ax = plt.subplots()
+    ax.plot(periods, latencies)
+
+    # Set pyplot parameters
+    ax.grid(linestyle="--")
+    ax.set_xlabel("i-th Request Period", fontsize=12)
+    ax.set_ylabel("Avg Token-to-Token Latency (ms)", fontsize=12)
+    ax.set_title("In-Flight Batching Benchmark Summary", fontsize=14)
+    ax.set_ylim(bottom=0.0)
+
+    fig.savefig(filename, dpi=300)
+
+
+def add_latencies_to_bins(bins, pos, responses, request_period):
+    """Add token-to-token latencies into the corresponding bin.
+
+    Given the responses of a single request, calculate the token-to-token
+    latency for each consecutive pair of responses and add it to the current
+    bin. Advance to the next bin after every request period.
+    """
+    for response_id, (prev_res, res) in enumerate(pairwise(responses)):
+        bins[pos].append(res - prev_res)
+        if (response_id + 1) % request_period == 0:
+            pos += 1
+
+
+def update_start_position(request_id, start_pos, initial_requests, step):
+    """Shift the start position of the bin.
+
+    Once all of the initial requests have been iterated through, shift the
+    start position, and then shift it again after every `step` requests.
+    """
+    if (request_id + 1) >= initial_requests:
+        num_requests_after_start = request_id + 1 - initial_requests
+        if num_requests_after_start % step == 0:
+            start_pos += 1
+    return start_pos
+
+
+def collect_periodic_latencies(args, export_data):
+    """Split the entire benchmark result into segments of one request period
+    each and collect the latencies for each segment.
+    """
+    start, end, step = args.periodic_concurrency_range
+
+    num_bins = args.max_tokens // args.request_period + (end - start) // step
+    if args.max_tokens % args.request_period != 0:
+        num_bins += 1  # extra bin
+
+    bins = [[] for _ in range(num_bins)]
+    bin_start_position = 0
+    requests = export_data["experiments"][0]["requests"]
+
+    for i, r in enumerate(requests):
+        add_latencies_to_bins(
+            bins=bins,
+            pos=bin_start_position,
+            responses=r["response_timestamps"],
+            request_period=args.request_period,
+        )
+        bin_start_position = update_start_position(
+            request_id=i,
+            start_pos=bin_start_position,
+            initial_requests=start,
+            step=step,
+        )
+    return bins
+
+
+def calculate_avg_periodic_latencies(args, profile_result, export_data):
+    """Calculate average token-to-token latency for each request period."""
+    bins = collect_periodic_latencies(args, export_data)
+
+    latencies = []
+    for bin in bins:
+        latencies.append(np.mean(bin) / 1_000_000)
+
+    profile_result.avg_periodic_t2t_latencies = latencies
+
+
+def collect_online_metrics(export_data, output_tokens):
+    # Example json demonstrating format:
+    # see client/src/c++/perf_analyzer/docs/examples/decoupled_output_file.json
+    first_token_latencies = []
+    generation_latencies = []
+    token_to_token_latencies = []
+    generation_throughputs = []
+    requests = export_data["experiments"][0]["requests"]
+
+    for r in requests:
+        init_request, responses = r["timestamp"], r["response_timestamps"]
+        first_token_latency = (responses[0] - init_request) / 1_000_000
+        first_token_latencies.append(first_token_latency)
+        if output_tokens > 1:
+            generation_latency_ms = (responses[-1] - responses[0]) / 1_000_000  # msec
+            generation_latency_s = (responses[-1] - responses[0]) / 1_000_000_000  # sec
+            generation_latencies.append(generation_latency_ms)
+            generation_throughputs.append(output_tokens / generation_latency_s)
+            for prev_res, res in pairwise(responses):
+                token_to_token_latencies.append((res - prev_res) / 1_000_000)
+    return (
+        first_token_latencies,
+        generation_latencies,
+        token_to_token_latencies,
+        generation_throughputs,
+    )
+
+
+def calculate_online_metrics(args, profile_result, export_data):
+    """Calculate online metrics for more fine-grained performance information."""
+    latencies = collect_online_metrics(export_data, args.max_tokens)
+    (
+        first_token_latencies,
+        generation_latencies,
+        token_to_token_latencies,
+        generation_throughputs,
+    ) = latencies
+
+    profile_result.max_first_token_latency = max(first_token_latencies)
+    profile_result.min_first_token_latency = min(first_token_latencies)
+    profile_result.avg_first_token_latency = np.mean(first_token_latencies)
+    profile_result.p50_first_token_latency = np.percentile(
+        first_token_latencies, 50, method="lower"
+    )
+    profile_result.p90_first_token_latency = np.percentile(
+        first_token_latencies, 90, method="lower"
+    )
+    profile_result.p95_first_token_latency = np.percentile(
+        first_token_latencies, 95, method="lower"
+    )
+    profile_result.p99_first_token_latency = np.percentile(
+        first_token_latencies, 99, method="lower"
+    )
+
+    if args.max_tokens > 1:
+        profile_result.avg_total_t2t_latency = np.mean(token_to_token_latencies)
+
+        profile_result.max_gen_latency = max(generation_latencies)
+        profile_result.min_gen_latency = min(generation_latencies)
+        profile_result.avg_gen_latency = np.mean(generation_latencies)
+        profile_result.p50_gen_latency = np.percentile(
+            generation_latencies, 50, method="lower"
+        )
+        profile_result.p90_gen_latency = np.percentile(
+            generation_latencies, 90, method="lower"
+        )
+        profile_result.p95_gen_latency = np.percentile(
+            generation_latencies, 95, method="lower"
+        )
+        profile_result.p99_gen_latency = np.percentile(
+            generation_latencies, 99, method="lower"
+        )
+
+        token_latencies = [t / args.max_tokens for t in generation_latencies]
+        profile_result.avg_output_token_latency = np.mean(token_latencies)
+
+        profile_result.max_gen_throughput = max(generation_throughputs)
+        profile_result.min_gen_throughput = min(generation_throughputs)
+        profile_result.avg_gen_throughput = np.mean(generation_throughputs)
+        profile_result.p50_gen_throughput = np.percentile(
+            generation_throughputs, 50, method="lower"
+        )
+        profile_result.p90_gen_throughput = np.percentile(
+            generation_throughputs, 90, method="lower"
+        )
+        profile_result.p95_gen_throughput = np.percentile(
+            generation_throughputs, 95, method="lower"
+        )
+        profile_result.p99_gen_throughput = np.percentile(
+            generation_throughputs, 99, method="lower"
+        )
+
+
+def collect_offline_metrics(export_data, sequence_len):
+    latencies = []
+    throughputs = []
+    requests = export_data["experiments"][0]["requests"]
+
+    for request in requests:
+        total_time = request["response_timestamps"][-1] - request["timestamp"]
+        time_s = total_time / 1_000_000_000  # sec
+        time_ms = total_time / 1_000_000  # msec
+        latencies.append(time_ms)
+        throughputs.append(sequence_len / time_s)
+    return throughputs, latencies
+
+
+def calculate_offline_metrics(args, profile_result, export_data):
+    """Calculate offline metrics that show end-to-end performance."""
+    throughputs, latencies = collect_offline_metrics(
+        export_data, sequence_len=profile_result.prompt_size + args.max_tokens
+    )
+
+    profile_result.max_e2e_latency = max(latencies)
+    profile_result.min_e2e_latency = min(latencies)
+    profile_result.avg_e2e_latency = np.mean(latencies)
+    profile_result.p50_e2e_latency = np.percentile(latencies, 50, method="lower")
+    profile_result.p90_e2e_latency = np.percentile(latencies, 90, method="lower")
+    profile_result.p95_e2e_latency = np.percentile(latencies, 95, method="lower")
+    profile_result.p99_e2e_latency = np.percentile(latencies, 99, method="lower")
+
+    profile_result.max_e2e_throughput = max(throughputs)
+    profile_result.min_e2e_throughput = min(throughputs)
+    profile_result.avg_e2e_throughput = np.mean(throughputs)
+    profile_result.p50_e2e_throughput = np.percentile(throughputs, 50, method="lower")
+    profile_result.p90_e2e_throughput = np.percentile(throughputs, 90, method="lower")
+    profile_result.p95_e2e_throughput = np.percentile(throughputs, 95, method="lower")
+    profile_result.p99_e2e_throughput = np.percentile(throughputs, 99, method="lower")
+
+
+def calculate_metrics(args, profile_result, export_data):
+    calculate_offline_metrics(args, profile_result, export_data)
+    if not args.offline:
+        calculate_online_metrics(args, profile_result, export_data)
+
+    if args.periodic_concurrency_range:
+        calculate_avg_periodic_latencies(args, profile_result, export_data)
+        plot_results(
+            latencies=profile_result.avg_periodic_t2t_latencies,
+            filename=get_plot_filename(args, profile_result.prompt_size),
+        )
+
+
+def summarize_profile_results(args, prompts):
+    results = []
+    for prompt in prompts:
+        prompt_size = len(prompt.split())
+        export_file = get_export_filename(args, prompt_size)
+        export_data = load_json_data(export_file)
+
+        profile_result = ProfileResults(prompt_size=prompt_size)
+        calculate_metrics(args, profile_result, export_data)
+        results.append(profile_result)
+
+    print_benchmark_summary(results)
+    if args.periodic_concurrency_range:
+        print(
+            "Saved in-flight batching benchmark plots "
+            "@ 'inflight_batching_benchmark-*.png'."
+        )
+
+
+def profile(args, export_file):
+    command = (
+        f"perf_analyzer -m {args.model} -i grpc --async --streaming "
+        f"-u {args.url} "
+        f"--input-data={INPUT_FILENAME} "
+        f"--profile-export-file={export_file} "
+    )
+    if args.periodic_concurrency_range:
+        start, end, step = args.periodic_concurrency_range
+        command += (
+            f"--periodic-concurrency-range={start}:{end}:{step} "
+            f"--request-period={args.request_period}"
+        )
+    else:
+        command += (
+            "--measurement-mode=count_windows "
+            "--measurement-request-count=10 "
+            "--stability-percentage=999 "
+            f"--concurrency-range={args.concurrency}"
+        )
+
+    subprocess.run(args=[command], shell=True, stdout=subprocess.DEVNULL)
+
+
+def prepare_export_file(args, prompt):
+    prompt_size = len(prompt.split())
+    filename = get_export_filename(args, prompt_size)
+
+    # Clean up any existing export file from a previous run
+    export_file = Path(filename)
+    export_file.unlink(missing_ok=True)
+    return export_file
+
+
+def prepare_input_data(input_data, prompt):
+    """Insert the prompt to send into input JSON data."""
+    input_data["data"][0]["prompt"] = [prompt]
+    save_json_data(input_data, INPUT_FILENAME)
+
+
+def generate_prompts(args, input_data):
+    """Generate dummy prompts if not specified by input JSON file."""
+    prompt = input_data["data"][0]["prompt"][0]
+
+    if not prompt:  # Generate dummy prompt
+        assert args.prompt_size_range, "Must specify --prompt-size-range."
+        start, end, step = args.prompt_size_range
+        return [" ".join(["hi"] * size) for size in range(start, end + 1, step)]
+    return [prompt]
+
+
+def construct_vllm_input_data(args):
+    """Construct input data that contains input tensors and parameters for vLLM.
+
+    Parse the input JSON file (if provided) to construct the input data.
+    Parameters set on the command line overwrite the parameters set by the
+    input JSON file.
+    """
+    # Default sampling parameters
+    sampling_params = {
+        "max_tokens": 256,
+        "ignore_eos": False,
+    }
+
+    if args.input_data:
+        input_data = load_json_data(filename=args.input_data)
+        if "sampling_parameters" in input_data["data"][0]:
+            loaded_params = input_data["data"][0]["sampling_parameters"][0]
+            loaded_params = json.loads(loaded_params or "null")
+            sampling_params = loaded_params if loaded_params else sampling_params
+    else:
+        # Default input JSON
+        input_data = {
+            "data": [
+                {
+                    "text_input": [""],
+                    "stream": [True],
+                    "sampling_parameters": [""],
+                }
+            ]
+        }
+
+    # If command line option is specified, overwrite
+    if args.offline:
+        input_data["data"][0]["stream"] = [False]
+    elif not input_data["data"][0]["stream"]:
+        args.offline = True
+
+    if args.max_tokens:
+        sampling_params["max_tokens"] = args.max_tokens
+    elif "max_tokens" in sampling_params:
+        args.max_tokens = sampling_params["max_tokens"]
+    else:
+        args.max_tokens = 256  # default
+        sampling_params["max_tokens"] = args.max_tokens
+
+    if args.ignore_eos:
+        sampling_params["ignore_eos"] = args.ignore_eos
+    elif "ignore_eos" in sampling_params:
+        args.ignore_eos = sampling_params["ignore_eos"]
+    else:
+        args.ignore_eos = False  # default
+        sampling_params["ignore_eos"] = args.ignore_eos
+
+    input_data["data"][0]["sampling_parameters"] = [json.dumps(sampling_params)]
+    return input_data
+
+
+def construct_trtllm_input_data(args):
+    """Construct input data that contains input tensors and parameters for TRT-LLM.
+
+    Parse the input JSON file (if provided) to construct the input data.
+    Parameters set on the command line overwrite the parameters set by the
+    input JSON file.
+    """
+    if args.input_data:
+        input_data = load_json_data(filename=args.input_data)
+    else:
+        # Default input JSON
+        input_data = {
+            "data": [
+                {
+                    "prompt": [""],
+                    "max_tokens": [128],
+                    "temperature": [0.2],
+                    "top_p": [0.7],
+                }
+            ]
+        }
+
+    # If command line option is specified, overwrite
+    args.offline = False  # always streaming
+
+    if args.max_tokens:
+        input_data["data"][0]["max_tokens"] = [args.max_tokens]
+    else:
+        args.max_tokens = input_data["data"][0]["max_tokens"][0]
+
+    return input_data
+
+
+def main(args, should_summarize=True):
+    if args.backend == "trtllm":
+        input_data = construct_trtllm_input_data(args)
+    elif args.backend == "vllm":
+        input_data = construct_vllm_input_data(args)
+    else:
+        raise ValueError(
+            "Unknown backend specified. Supported backend types are: 'trtllm' "
+            "and 'vllm'."
+        )
+
+    prompts = generate_prompts(args, input_data)
+
+    for prompt in prompts:
+        prepare_input_data(input_data, prompt)
+        export_file = prepare_export_file(args, prompt)
+
+        # Run Perf Analyzer
+        profile(args, export_file)
+
+    if should_summarize:
+        summarize_profile_results(args, prompts)
+
+
+class Args:
+    backend = "trtllm"
+    model = ""
+    periodic_concurrency_range = []
+    request_period = None
+    max_tokens = 128
+    prompt_size_range = [2048, 2048, 1]
+    input_data = ""
+    ignore_eos = True
+    offline = False
+    url = "localhost:8001"
+    concurrency = 1
+
+
+class Profiler:
+    @staticmethod
+    def profile(model, batch_size, url, input_length):
+        args = Args()
+        args.model = model
+        args.concurrency = batch_size  # inflight batch size
+        args.url = url
+        args.prompt_size_range = [input_length, input_length, 1]
+
+        print("Warming up...")
+        main(args, should_summarize=False)  # warm-up
+
+        print("Warmed up, profiling now...")
+        main(args)
+
+        # get only avg first token and avg
+        #
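
A rough usage sketch (not part of the diff): the call below mirrors what the new subcommand wires up in handle_model, with the values it currently hard-codes. It assumes a Triton server is already running with the model loaded at the given gRPC endpoint, that perf_analyzer is on PATH, and that the model name "my-llm" is a hypothetical placeholder; the CLI equivalent would be something like `triton model profile -m my-llm`.

    from triton_cli.profiler import Profiler

    # batch_size becomes perf_analyzer's --concurrency-range, url is the gRPC
    # endpoint passed to -u, and input_length sets the dummy prompt size in
    # whitespace-delimited words via prompt_size_range.
    Profiler.profile(
        model="my-llm",        # hypothetical model name
        batch_size=64,
        url="localhost:8001",
        input_length=2048,
    )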
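
For reference, when invoked as in the sketch above (trtllm backend, online streaming, no periodic concurrency range, max_tokens left at the Args default of 128), profile() assembles a command string along these lines; every flag comes straight from the f-strings in profile(), and the export filename follows get_export_filename():

    perf_analyzer -m my-llm -i grpc --async --streaming -u localhost:8001 \
        --input-data=generated_input_data.json \
        --profile-export-file=profile_export-trtllm-my-llm-online-prompt2048-maxtokens128.json \
        --measurement-mode=count_windows --measurement-request-count=10 \
        --stability-percentage=999 --concurrency-range=64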
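
The metric collectors only read a small part of the perf_analyzer profile export (see the decoupled_output_file.json example referenced in collect_online_metrics). A minimal sketch of that shape, reduced to the fields this script touches and using illustrative nanosecond timestamps rather than real measurements:

    export_data = {
        "experiments": [
            {
                "requests": [
                    {
                        "timestamp": 1_000_000_000,  # when the request was sent (ns)
                        "response_timestamps": [
                            1_050_000_000,  # first response -> first-token latency
                            1_060_000_000,  # later responses -> token-to-token latencies
                        ],
                    },
                ],
            },
        ],
    }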