Commit eb48aca: merge main

Valdanitooooo committed Aug 15, 2024
2 parents ce2d6ba + b6d655e

Showing 156 changed files with 5,377 additions and 1,460 deletions.

4 changes: 3 additions & 1 deletion .github/workflows/issue.yaml
@@ -2,6 +2,7 @@ name: Close inactive issues
on:
schedule:
- cron: "0 19 * * *"
+workflow_dispatch:

jobs:
close-issues:
@@ -10,7 +11,7 @@ jobs:
issues: write
pull-requests: write
steps:
-- uses: actions/stale@v5
+- uses: actions/stale@v9
with:
days-before-issue-stale: 7
days-before-issue-close: 5
@@ -19,4 +20,5 @@ jobs:
close-issue-message: "This issue was closed because it has been inactive for 5 days since being marked as stale."
days-before-pr-stale: -1
days-before-pr-close: -1
+operations-per-run: 500
repo-token: ${{ secrets.GITHUB_TOKEN }}
8 changes: 6 additions & 2 deletions .github/workflows/python.yaml
@@ -141,7 +141,7 @@ jobs:
MODULE: ${{ matrix.module }}
run: |
if [ "$MODULE" == "gpu" ]; then
${{ env.SELF_HOST_PYTHON }} -m pip install -U "openai>1"
${{ env.SELF_HOST_PYTHON }} -m pip install -U "openai>1,<1.40"
${{ env.SELF_HOST_PYTHON }} -m pip install -U modelscope
${{ env.SELF_HOST_PYTHON }} -m pip install -U sse_starlette
${{ env.SELF_HOST_PYTHON }} -m pip install -U xoscar
@@ -152,6 +152,7 @@ jobs:
${{ env.SELF_HOST_PYTHON }} -m pip install -U "transformers"
${{ env.SELF_HOST_CONDA }} install -c conda-forge pynini=2.1.5
${{ env.SELF_HOST_CONDA }} install -c conda-forge "ffmpeg<7"
+${{ env.SELF_HOST_PYTHON }} -m pip install -U funasr
${{ env.SELF_HOST_PYTHON }} -m pip install -U nemo_text_processing
${{ env.SELF_HOST_PYTHON }} -m pip install -U omegaconf~=2.3.0
${{ env.SELF_HOST_PYTHON }} -m pip install -U WeTextProcessing
@@ -169,6 +170,9 @@ jobs:
${{ env.SELF_HOST_PYTHON }} -m pytest --timeout=1500 \
-W ignore::PendingDeprecationWarning \
--cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/model/audio/tests/test_whisper.py
+${{ env.SELF_HOST_PYTHON }} -m pytest --timeout=1500 \
+-W ignore::PendingDeprecationWarning \
+--cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/model/audio/tests/test_funasr.py
${{ env.SELF_HOST_PYTHON }} -m pytest --timeout=1500 \
-W ignore::PendingDeprecationWarning \
--cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/model/audio/tests/test_chattts.py
@@ -185,6 +189,6 @@ jobs:
--cov-config=setup.cfg --cov-report=xml --cov=xinference xinference/client/tests/test_client.py
pytest --timeout=1500 \
-W ignore::PendingDeprecationWarning \
---cov-config=setup.cfg --cov-report=xml --cov=xinference --ignore xinference/client/tests/test_client.py --ignore xinference/model/image/tests/test_stable_diffusion.py --ignore xinference/model/audio/tests/test_whisper.py --ignore xinference/model/audio/tests/test_chattts.py --ignore xinference/model/audio/tests/test_cosyvoice.py xinference
+--cov-config=setup.cfg --cov-report=xml --cov=xinference --ignore xinference/client/tests/test_client.py --ignore xinference/model/image/tests/test_stable_diffusion.py --ignore xinference/model/audio/tests xinference
fi
working-directory: .
30 changes: 24 additions & 6 deletions README.md
@@ -34,14 +34,14 @@ potential of cutting-edge AI models.
- Support speech recognition model: [#929](https://github.com/xorbitsai/inference/pull/929)
- Metrics support: [#906](https://github.com/xorbitsai/inference/pull/906)
### New Models
+- Built-in support for [CogVideoX](https://github.com/THUDM/CogVideo): [#2049](https://github.com/xorbitsai/inference/pull/2049)
+- Built-in support for [flux.1-schnell & flux.1-dev](https://www.basedlabs.ai/tools/flux1): [#2007](https://github.com/xorbitsai/inference/pull/2007)
+- Built-in support for [MiniCPM-V 2.6](https://github.com/OpenBMB/MiniCPM-V): [#2031](https://github.com/xorbitsai/inference/pull/2031)
+- Built-in support for [Kolors](https://huggingface.co/Kwai-Kolors/Kolors): [#2028](https://github.com/xorbitsai/inference/pull/2028)
+- Built-in support for [SenseVoice](https://github.com/FunAudioLLM/SenseVoice): [#2008](https://github.com/xorbitsai/inference/pull/2008)
- Built-in support for [Mistral Large 2](https://mistral.ai/news/mistral-large-2407/): [#1944](https://github.com/xorbitsai/inference/pull/1944)
- Built-in support for [llama3.1](https://ai.meta.com/blog/meta-llama-3-1/): [#1932](https://github.com/xorbitsai/inference/pull/1932)
- Built-in support for [Mistral Nemo](https://mistral.ai/news/mistral-nemo/): [#1936](https://github.com/xorbitsai/inference/pull/1936)
-- Built-in support for [CosyVoice](https://github.com/FunAudioLLM/CosyVoice): [#1881](https://github.com/xorbitsai/inference/pull/1881)
-- Built-in support for [codegeex4](https://github.com/THUDM/CodeGeeX4): [#1888](https://github.com/xorbitsai/inference/pull/1888)
-- Built-in support for [Gemma-2-it](https://huggingface.co/blog/gemma2): [#1774](https://github.com/xorbitsai/inference/pull/1774)
-- Built-in support for [jina-reranker-v2](https://huggingface.co/jinaai/jina-reranker-v2-base-multilingual): [#1733](https://github.com/xorbitsai/inference/pull/1733)
-- Built-in support for [Qwen2](https://github.com/QwenLM/Qwen2): [#1509](https://github.com/xorbitsai/inference/pull/1597)
### Integrations
- [Dify](https://docs.dify.ai/advanced/model-configuration/xinference): an LLMOps platform that enables developers (and even non-developers) to quickly build useful applications based on large language models, ensuring they are visual, operable, and improvable.
- [FastGPT](https://github.com/labring/FastGPT): a knowledge-based platform built on the LLM, offers out-of-the-box data processing and model invocation capabilities, allows for workflow orchestration through Flow visualization.
@@ -97,7 +97,7 @@ with popular third-party libraries including [LangChain](https://python.langchai

### Jupyter Notebook

-The lightest way to experience Xinference is to try our [Juypter Notebook on Google Colab](https://colab.research.google.com/github/xorbitsai/inference/blob/main/examples/Xinference_Quick_Start.ipynb).
+The lightest way to experience Xinference is to try our [Jupyter Notebook on Google Colab](https://colab.research.google.com/github/xorbitsai/inference/blob/main/examples/Xinference_Quick_Start.ipynb).

### Docker

@@ -107,6 +107,24 @@ Nvidia GPU users can start Xinference server using [Xinference Docker Image](htt
```
docker run --name xinference -d -p 9997:9997 -e XINFERENCE_HOME=/data -v </on/your/host>:/data --gpus all xprobe/xinference:latest xinference-local -H 0.0.0.0
```
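
Once the container is running, it can help to verify that the server responds before launching any models. The snippet below is a minimal sketch, not part of the README: it assumes the default port mapping above and the `xinference-client` package (`pip install xinference-client`).

```
# Minimal connectivity check (illustrative sketch).
# Assumes the server started above is reachable at the default port 9997.
from xinference.client import Client

client = Client("http://localhost:9997")
# List the models currently running on the server; empty right after startup.
print(client.list_models())
```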

### K8s via Helm

Ensure that you have GPU support in your Kubernetes cluster, then install as follows.

```
# add repo
helm repo add xinference https://xorbitsai.github.io/xinference-helm-charts
# update indexes and query xinference versions
helm repo update xinference
helm search repo xinference/xinference --devel --versions
# install xinference
helm install xinference xinference/xinference -n xinference --version 0.0.1-v<xinference_release_version>
```

For more customized installation methods on K8s, please refer to the [documentation](https://inference.readthedocs.io/en/latest/getting_started/using_kubernetes.html).

### Quick Start

Install Xinference by using pip as follows. (For more options, see [Installation page](https://inference.readthedocs.io/en/latest/getting_started/installation.html).)
28 changes: 23 additions & 5 deletions README_zh_CN.md
@@ -31,14 +31,14 @@ Xorbits Inference (Xinference) is a powerful and comprehensive distributed
- Support speech recognition model: [#929](https://github.com/xorbitsai/inference/pull/929)
- Metrics support: [#906](https://github.com/xorbitsai/inference/pull/906)
### New Models
+- Built-in support for [CogVideoX](https://github.com/THUDM/CogVideo): [#2049](https://github.com/xorbitsai/inference/pull/2049)
+- Built-in support for [flux.1-schnell & flux.1-dev](https://www.basedlabs.ai/tools/flux1): [#2007](https://github.com/xorbitsai/inference/pull/2007)
+- Built-in support for [MiniCPM-V 2.6](https://github.com/OpenBMB/MiniCPM-V): [#2031](https://github.com/xorbitsai/inference/pull/2031)
+- Built-in support for [Kolors](https://huggingface.co/Kwai-Kolors/Kolors): [#2028](https://github.com/xorbitsai/inference/pull/2028)
+- Built-in support for [SenseVoice](https://github.com/FunAudioLLM/SenseVoice): [#2008](https://github.com/xorbitsai/inference/pull/2008)
- Built-in support for [Mistral Large 2](https://mistral.ai/news/mistral-large-2407/): [#1944](https://github.com/xorbitsai/inference/pull/1944)
- Built-in support for [llama3.1](https://ai.meta.com/blog/meta-llama-3-1/): [#1932](https://github.com/xorbitsai/inference/pull/1932)
- Built-in support for [Mistral Nemo](https://mistral.ai/news/mistral-nemo/): [#1936](https://github.com/xorbitsai/inference/pull/1936)
-- Built-in support for [CosyVoice](https://github.com/FunAudioLLM/CosyVoice): [#1881](https://github.com/xorbitsai/inference/pull/1881)
-- Built-in support for [codegeex4](https://github.com/THUDM/CodeGeeX4): [#1888](https://github.com/xorbitsai/inference/pull/1888)
-- Built-in support for [Gemma-2-it](https://huggingface.co/blog/gemma2): [#1774](https://github.com/xorbitsai/inference/pull/1774)
-- Built-in support for [jina-reranker-v2](https://huggingface.co/jinaai/jina-reranker-v2-base-multilingual): [#1733](https://github.com/xorbitsai/inference/pull/1733)
-- Built-in support for [Qwen2](https://github.com/QwenLM/Qwen2): [#1509](https://github.com/xorbitsai/inference/pull/1597)
### Integrations
- [FastGPT](https://doc.fastai.site/docs/development/custom-models/xinference/): an open-source AI knowledge-base platform built on LLMs, offering out-of-the-box data processing, model invocation, RAG retrieval, and visual AI workflow orchestration to help you easily build complex Q&A scenarios.
- [Dify](https://docs.dify.ai/advanced/model-configuration/xinference): an LLMOps platform covering the development, deployment, maintenance, and optimization of large language models.
@@ -91,6 +91,24 @@ Xorbits Inference (Xinference) is a powerful and comprehensive distributed

Nvidia GPU users can start the Xinference server with the [Xinference Docker image](https://inference.readthedocs.io/zh-cn/latest/getting_started/using_docker_image.html). Before running the installation command, make sure [Docker](https://docs.docker.com/get-docker/) and [CUDA](https://developer.nvidia.com/cuda-downloads) are installed on your system.

### Kubernetes

Make sure GPU support is enabled in your Kubernetes cluster, then install via `helm` as follows.

```
# add the xinference repo
helm repo add xinference https://xorbitsai.github.io/xinference-helm-charts
# update the repo and list the installable versions
helm repo update xinference
helm search repo xinference/xinference --devel --versions
# install xinference into the K8s cluster
helm install xinference xinference/xinference -n xinference --version 0.0.1-v<xinference_release_version>
```

For more customized installation options, please refer to the [documentation](https://inference.readthedocs.io/en/latest/getting_started/using_kubernetes.html).

### Quick Start

Install Xinference via pip as follows. (For more options, see the [installation page](https://inference.readthedocs.io/zh-cn/latest/getting_started/installation.html).)
75 changes: 30 additions & 45 deletions benchmark/benchmark_latency.py
@@ -16,29 +16,27 @@
import asyncio
import logging
import random
-import time
-from typing import List, Tuple, Optional

-import numpy as np
-from utils import get_tokenizer, sample_requests, send_request
+from utils import get_tokenizer, sample_requests
+from benchmark_runner import BenchmarkRunner


logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

-REQUEST_LATENCY: List[Tuple[int, int, float]] = []


-async def benchmark(
-    api_url: str,
-    model_uid: str,
-    input_requests: List[Tuple[str, int, int]],
-    api_key: Optional[str] = None,
-) -> None:
-    for request in input_requests:
-        prompt, prompt_len, output_len = request
-        await send_request(
-            api_url, model_uid, prompt, prompt_len, output_len, REQUEST_LATENCY
-        )
+class LatencyBenchmarkRunner(BenchmarkRunner):
+    async def _run(self):
+        total_requests = len(self.input_requests)
+        for i, request in enumerate(self.input_requests):
+            await self.send_request(request)
+            remaining = total_requests - (i + 1)
+            print(
+                f"\rProcessed {i + 1}/{total_requests} requests, {remaining} remaining.",
+                end="",
+            )
+        print("")


def main(args: argparse.Namespace):
@@ -54,36 +52,17 @@ def main(args: argparse.Namespace):
    input_requests = sample_requests(args.dataset, args.num_prompts, tokenizer)

    logger.info("Benchmark starts.")
-    benchmark_start_time = time.time()
-
-    asyncio.run(
-        benchmark(
-            api_url,
-            model_uid,
-            input_requests,
-            api_key=args.api_key,
-        )
-    )
-
-    benchmark_end_time = time.time()
-    benchmark_time = benchmark_end_time - benchmark_start_time
-    print(f"Total time: {benchmark_time:.2f} s")
-    print(f"Throughput: {len(REQUEST_LATENCY) / benchmark_time:.2f} requests/s")
-
-    # Compute the latency statistics.
-    avg_latency = np.mean([latency for _, _, latency in REQUEST_LATENCY])
-    print(f"Average latency: {avg_latency:.2f} s")
-    avg_per_token_latency = np.mean(
-        [
-            latency / (prompt_len + output_len)
-            for prompt_len, output_len, latency in REQUEST_LATENCY
-        ]
-    )
-    print(f"Average latency per token: {avg_per_token_latency:.2f} s")
-    avg_per_output_token_latency = np.mean(
-        [latency / output_len for _, output_len, latency in REQUEST_LATENCY]
-    )
-    print("Average latency per output token: " f"{avg_per_output_token_latency:.2f} s")
+    benchmark = LatencyBenchmarkRunner(
+        api_url,
+        model_uid,
+        input_requests,
+        args.stream,
+        args.api_key,
+    )
+    asyncio.run(benchmark.run())
+
+    benchmark.print_stats()


if __name__ == "__main__":
@@ -109,7 +88,13 @@ def main(args: argparse.Namespace):
)
parser.add_argument("--model-uid", type=str, help="Xinference model UID.")
    parser.add_argument(
-        "--api-key", type=str, default=None, help="Authorization api key",
+        "--stream", action="store_true", help="Enable streaming responses."
+    )
+    parser.add_argument(
+        "--api-key",
+        type=str,
+        default=None,
+        help="Authorization api key",
    )

args = parser.parse_args()
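
Note: the rewritten script delegates timing and statistics to a `BenchmarkRunner` base class imported from `benchmark_runner`, which this diff does not show. The sketch below is hypothetical, with attribute and method names inferred from the call sites above rather than taken from the actual `benchmark/benchmark_runner.py`:

```
# Hypothetical sketch of the BenchmarkRunner interface implied by the diff;
# the real benchmark_runner.py may differ in names and behavior.
import time
from typing import List, Optional, Tuple


class BenchmarkRunner:
    def __init__(
        self,
        api_url: str,
        model_uid: str,
        input_requests: List[Tuple[str, int, int]],
        stream: bool = False,
        api_key: Optional[str] = None,
    ):
        self.api_url = api_url
        self.model_uid = model_uid
        self.input_requests = input_requests
        self.stream = stream
        self.api_key = api_key
        # One (prompt_len, output_len, latency) tuple per completed request.
        self.request_latency: List[Tuple[int, int, float]] = []

    async def send_request(self, request: Tuple[str, int, int]) -> None:
        # A real implementation would POST the prompt to the server
        # (optionally streaming) and append to self.request_latency.
        raise NotImplementedError

    async def _run(self) -> None:
        # Subclasses define the request schedule, e.g. strictly sequential
        # requests for a latency benchmark.
        raise NotImplementedError

    async def run(self) -> None:
        # Time the subclass-specific loop so print_stats() can report totals.
        self._start = time.time()
        await self._run()
        self._elapsed = time.time() - self._start

    def print_stats(self) -> None:
        print(f"Total time: {self._elapsed:.2f} s")
        throughput = len(self.request_latency) / self._elapsed
        print(f"Throughput: {throughput:.2f} requests/s")
```

With the new `--stream` flag, an invocation might look like `python benchmark_latency.py --dataset <path> --model-uid <uid> --stream`.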