Commit
Catch errors and improve logging in Profiler (#23)
nv-hwoo authored Jan 14, 2024
1 parent cfb4bbb commit bd76c71
Showing 2 changed files with 60 additions and 11 deletions.
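
For context, the heart of this commit replaces a fire-and-forget subprocess call with one that captures Perf Analyzer's output, logs stdout when verbose mode is on, and raises on a non-zero exit code instead of failing silently. A minimal standalone sketch of that pattern, using a placeholder logger name rather than the CLI's actual LOGGER_NAME:

import logging
import subprocess

logger = logging.getLogger("profiler-example")  # placeholder; the CLI uses its own LOGGER_NAME


def run_perf_analyzer(command: str, verbose: bool = False) -> None:
    # Capture stdout/stderr instead of discarding them with DEVNULL.
    proc = subprocess.run(args=[command], shell=True, capture_output=True)

    if verbose:
        logger.info(f"Perf Analyzer output:\n{proc.stdout.decode('utf-8')}")
    if proc.returncode:
        # Surface the tool's stderr so the failure is actionable.
        raise RuntimeError(
            "Encountered the following error while running Perf Analyzer:\n"
            f"{proc.stderr.decode('utf-8').rstrip()}"
        )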
1 change: 1 addition & 0 deletions src/triton_cli/parser.py
@@ -438,6 +438,7 @@ def profile_model(args: argparse.Namespace, client: TritonClient):
url=f"{args.url}:{args.port}",
input_length=args.input_length,
output_length=args.output_length,
verbose=args.verbose,
)


70 changes: 59 additions & 11 deletions src/triton_cli/profiler.py
@@ -25,6 +25,7 @@
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import json
import logging
import subprocess
from dataclasses import dataclass
from itertools import tee
@@ -33,6 +34,11 @@

import numpy as np

from triton_cli.constants import LOGGER_NAME

logger = logging.getLogger(LOGGER_NAME)


INPUT_FILENAME = "generated_input_data.json"
METRIC_FIELDS = {
# "max_first_token_latency": ("Max first token latency", "ms"),
@@ -396,6 +402,28 @@ def calculate_offline_metrics(args, profile_result, export_data):


def calculate_metrics(args, profile_result, export_data):
# Sanity check the number of responses received from backend
if args.ignore_eos:
requests = export_data["experiments"][0]["requests"]
for request in requests:
if len(request["response_timestamps"]) == args.max_tokens:
# Assume FINAL flag is returned with final token response
pass
elif len(request["response_timestamps"]) == args.max_tokens + 1:
# Assume FINAL flag was returned with an empty response after
# the final token
logger.warning(
"Received an extra response from the backend. This may be "
"due to the backend sending an 'empty final response'."
)
else:
raise ValueError(
f"Expecting {args.max_tokens} tokens but received "
f"{len(request['response_timestamps'])} tokens. "
f"This could be due to an unsupported sequence length. "
f"Please double check the input and output length."
)

calculate_offline_metrics(args, profile_result, export_data)
if not args.offline:
calculate_online_metrics(args, profile_result, export_data)
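
The sanity check above walks export_data["experiments"][0]["requests"] and counts response_timestamps per request. A rough illustration of the shape it assumes, with made-up timestamp values (the real export file is produced by Perf Analyzer and contains more fields):

# Illustrative only: with ignore_eos and max_tokens=3, each request should carry
# 3 timestamps, or 3 + 1 when the backend appends an empty final response.
export_data = {
    "experiments": [
        {
            "requests": [
                {"response_timestamps": [1000, 1010, 1025]},        # exactly max_tokens: OK
                {"response_timestamps": [1000, 1012, 1030, 1031]},  # max_tokens + 1: logs a warning
            ]
        }
    ]
}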
@@ -455,7 +483,15 @@ def profile(args, export_file):
f"--concurrency-range={args.concurrency}"
)

subprocess.run(args=[command], shell=True, stdout=subprocess.DEVNULL)
proc = subprocess.run(args=[command], shell=True, capture_output=True)

if args.verbose:
logger.info(f"Perf Analyzer output:\n{proc.stdout.decode('utf-8')}")
if proc.returncode:
raise RuntimeError(
"Encountered the following error while running Perf Analyzer:\n"
f"{proc.stderr.decode('utf-8').rstrip()}"
)


def prepare_export_file(args, prompt):
@@ -615,31 +651,43 @@ class Args:
offline = False
url = "localhost:8001"
concurrency = 1
verbose = False


class Profiler:
@staticmethod
def profile(model, backend, batch_size, url, input_length=128, output_length=128):
def profile(
model,
backend,
batch_size,
url,
input_length=128,
output_length=128,
verbose=False,
):
args = Args()
args.model = model
args.backend = backend
args.concurrency = batch_size # inflight batch size
args.url = url
args.prompt_size_range = [input_length, input_length, 1]
args.max_tokens = output_length
args.verbose = verbose

start, end, step = args.prompt_size_range
assert start == end and step == 1 # no sweeping for now

print("Warming up...")
logger.info("Warming up...")
main(args, should_summarize=False) # warm-up

print("Warmed up, profiling now...\n")
print("[ PROFILE CONFIGURATIONS ]")
print(f" * Model: {args.model}")
print(f" * Backend: {args.backend}")
print(f" * Batch size: {args.concurrency}")
print(f" * Input tokens: {args.prompt_size_range[0]}")
print(f" * Output tokens: {args.max_tokens}")
print("")
logger.info(
"Warmed up, profiling now...\n"
"[ PROFILE CONFIGURATIONS ]\n"
f" * Model: {args.model}\n"
f" * Backend: {args.backend}\n"
f" * Batch size: {args.concurrency}\n"
f" * Input tokens: {args.prompt_size_range[0]}\n"
f" * Output tokens: {args.max_tokens}\n"
""
)
main(args)
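
Taken together, a caller would presumably use the new flag along these lines; the import path follows the file layout in this diff, and the model and backend values are placeholders:

from triton_cli.profiler import Profiler

# Placeholder model and backend names; verbose=True forwards Perf Analyzer's
# stdout to the CLI logger, and a failed run now raises RuntimeError.
Profiler.profile(
    model="my_model",
    backend="tensorrtllm",
    batch_size=1,
    url="localhost:8001",
    input_length=128,
    output_length=128,
    verbose=True,
)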
