diff --git a/benchmark/benchmark_latency.py b/benchmark/benchmark_latency.py index 7b607cef51..20c96ae04e 100644 --- a/benchmark/benchmark_latency.py +++ b/benchmark/benchmark_latency.py @@ -17,7 +17,7 @@ import logging import random import time -from typing import List, Tuple +from typing import List, Tuple, Optional import numpy as np from utils import get_tokenizer, sample_requests, send_request @@ -32,6 +32,7 @@ async def benchmark( api_url: str, model_uid: str, input_requests: List[Tuple[str, int, int]], + api_key: Optional[str] = None, ) -> None: for request in input_requests: prompt, prompt_len, output_len = request @@ -60,6 +61,7 @@ def main(args: argparse.Namespace): api_url, model_uid, input_requests, + api_key=args.api_key, ) ) @@ -106,6 +108,9 @@ def main(args: argparse.Namespace): help="Trust remote code from huggingface.", ) parser.add_argument("--model-uid", type=str, help="Xinference model UID.") + parser.add_argument( + "--api-key", type=str, default=None, help="Authorization api key", + ) args = parser.parse_args() main(args) diff --git a/benchmark/benchmark_long.py b/benchmark/benchmark_long.py index 13a899dea2..531a8c6858 100644 --- a/benchmark/benchmark_long.py +++ b/benchmark/benchmark_long.py @@ -17,7 +17,7 @@ import logging import random import time -from typing import List, Tuple +from typing import List, Tuple, Optional import numpy as np @@ -38,6 +38,7 @@ def __init__( model_uid: str, input_requests: List[Tuple[str, int, int]], concurrency: int, + api_key: Optional[str]=None, ): self.api_url = api_url @@ -46,6 +47,7 @@ def __init__( self.concurrency = concurrency self.sent = 0 self.left = len(input_requests) + self.api_key = api_key async def run(self): tasks = [] @@ -68,6 +70,7 @@ async def worker(self, i: int): prompt_len, output_len, REQUEST_LATENCY, + api_key=self.api_key, ) self.left -= 1 # pring longer space to overwrite the previous when left decrease @@ -101,6 +104,7 @@ def main(args: argparse.Namespace): model_uid, input_requests, concurrency=args.concurrency, + api_key=args.api_key, ) asyncio.run(benchmark.run()) benchmark_end_time = time.time() @@ -160,5 +164,8 @@ def main(args: argparse.Namespace): help="Trust remote code from huggingface.", ) parser.add_argument("--model-uid", type=str, help="Xinference model UID.") + parser.add_argument( + "--api-key", type=str, default=None, help="Authorization api key", + ) args = parser.parse_args() main(args) diff --git a/benchmark/benchmark_rerank.py b/benchmark/benchmark_rerank.py index 0b7a2438e6..8e6e416fac 100644 --- a/benchmark/benchmark_rerank.py +++ b/benchmark/benchmark_rerank.py @@ -18,7 +18,7 @@ import random import time import aiohttp -from typing import List, Dict +from typing import List, Dict, Optional from datasets import load_dataset @@ -36,6 +36,7 @@ def __init__( input_requests: List[Dict], top_n: int, concurrency: int, + api_key: Optional[str]=None, ): self.api_url = api_url self.model_uid = model_uid @@ -44,6 +45,7 @@ def __init__( self.concurrency = concurrency self.sent = 0 self.left = len(input_requests) + self.api_key = api_key async def run(self): tasks = [] @@ -73,7 +75,8 @@ async def worker(self, i: int): print("") async def send_request( - self, api_url: str, model_uid: str, prompt: str, documents: List[str] + self, api_url: str, model_uid: str, prompt: str, documents: List[str], + api_key: Optional[str]=None, ): request_start_time = time.time() @@ -85,6 +88,8 @@ async def send_request( } headers = {"User-Agent": "Benchmark Client"} + if api_key: + headers["Authorization"] = f"Bearer {api_key}" timeout = aiohttp.ClientTimeout(total=3 * 3600) async with aiohttp.ClientSession(timeout=timeout) as session: @@ -121,6 +126,7 @@ def main(args: argparse.Namespace): input_requests, top_n=args.top_n, concurrency=args.concurrency, + api_key=args.api_key, ) asyncio.run(benchmark.run()) benchmark_end_time = time.time() @@ -161,5 +167,8 @@ def main(args: argparse.Namespace): help="Trust remote code from huggingface.", ) parser.add_argument("--model-uid", type=str, help="Xinference model UID.") + parser.add_argument( + "--api-key", type=str, default=None, help="Authorization api key", + ) args = parser.parse_args() main(args) diff --git a/benchmark/benchmark_serving.py b/benchmark/benchmark_serving.py index 0440e0a722..d60b6da7bb 100644 --- a/benchmark/benchmark_serving.py +++ b/benchmark/benchmark_serving.py @@ -17,7 +17,7 @@ import logging import random import time -from typing import List, Tuple +from typing import List, Tuple, Optional import numpy as np @@ -39,6 +39,7 @@ def __init__( input_requests: List[Tuple[str, int, int]], request_rate: float, concurrency: int, + api_key: Optional[str] = None, ): self.api_url = api_url @@ -48,6 +49,7 @@ def __init__( self.request_rate = request_rate self.queue = asyncio.Queue(concurrency or 100) self.left = len(input_requests) + self.api_key = api_key async def run(self): tasks = [] @@ -78,6 +80,7 @@ async def worker(self): prompt_len, output_len, REQUEST_LATENCY, + api_key=self.api_key, ) self.left -= 1 # pring longer space to overwrite the previous when left decrease @@ -111,6 +114,7 @@ def main(args: argparse.Namespace): input_requests, request_rate=args.request_rate, concurrency=args.concurrency, + api_key=args.api_key, ) asyncio.run(benchmark.run()) benchmark_end_time = time.time() @@ -156,6 +160,9 @@ def main(args: argparse.Namespace): parser.add_argument( "--prompt-len-limit", type=int, default=1024, help="Prompt length limitation." ) + parser.add_argument( + "--api-key", type=str, default=None, help="Authorization api key", + ) parser.add_argument( "--concurrency", "-c", diff --git a/benchmark/utils.py b/benchmark/utils.py index 3a393adc62..345dcaa437 100644 --- a/benchmark/utils.py +++ b/benchmark/utils.py @@ -17,7 +17,7 @@ import logging import random import time -from typing import TYPE_CHECKING, List, Tuple +from typing import TYPE_CHECKING, List, Tuple, Optional import openai from transformers import AutoTokenizer, PreTrainedTokenizerFast @@ -173,6 +173,7 @@ async def send_request( prompt_len: int, output_len: int, stats: List[Tuple[int, int, float]], # output. + api_key: Optional[str]=None, ) -> None: request_start_time = time.time() @@ -187,6 +188,8 @@ async def send_request( } headers = {"User-Agent": "Benchmark Client"} + if api_key: + headers["Authorization"] = f"Bearer {api_key}" timeout = aiohttp.ClientTimeout(total=3 * 3600) async with aiohttp.ClientSession(timeout=timeout) as session: