Compare commits

..

293 Commits

Author SHA1 Message Date
gitlawr 8fe557148d fix: qwen3-coder param
4 months ago
gitlawr dfdafd036d docs: update huggingface_token config example
4 months ago
yxf 26e81dc700 feat: Add support for Nvidia MIG detection in containerized environments.
4 months ago
gitlawr b24d08bbf1 chore: update auth config
4 months ago
thxCode f7636a5f63 ci(docker): lock ray version
4 months ago
thxCode 80509b5900 ci(docker/npu): fix different torch library
4 months ago
gitlawr 4535fb182d docs: update sso cli flags
4 months ago
gitlawr e91da52145 feat: make JWT expiration configurable
4 months ago
gitlawr 67b93d156e feat: add user avatar
4 months ago
gitlawr 30533f7275 feat: improve SSO
4 months ago
gitlawr e627a4c79e fix: update pipx envs
4 months ago
gitlawr f7fdcdb9d0 chore: remove legacy models
4 months ago
gitlawr 0d77f69e99 feat: add GLM4.5, Qwen3-Coder, Qwen3-2507 and gpt-oss models
4 months ago
ZhouForrest 54d75c7a45
Add SSO Authentication (#2658)
5 months ago
thxCode ea6e5ca9dc ci(docker/cuda): fix failed on llama-box starting
5 months ago
yxf df31850efc fix: Enhance download log display logic
5 months ago
yxf 67df35a364 fix: Update progress bar styling for frontend formatting compatibility.
5 months ago
yxf 10eaab6f90 feat: Optimize dependency management for specified backend versions
5 months ago
gitlawr fac6ed8d25 chore: bump llama-box to v0.0.169
5 months ago
thxCode 7e08098981 ci(docker/cpu): bump python version
5 months ago
thxCode bf6287c069 ci(docker/npu): bump vllm version
5 months ago
thxCode e54e269f58 ci(docker/cuda): build flashinfer from source
5 months ago
thxCode 5dd58c72cb ci(docker): support git-lfs checkout
5 months ago
thxCode a6441ff92e ci(docker/dcu): refer base image from gpustack acr
5 months ago
thxCode e2c0a7ccdc ci(docker/corex): refer base image from gpustack acr
5 months ago
thxCode cb14580408 ci(docker/npu): bump mindie version to 2.1.rc1
5 months ago
thxCode 359dc3c6e2 chore(docker): collect dockerfile together
5 months ago
linyinli 8ff76ebfee docs: update wechat qrcode
5 months ago
linyinli b5b2272e12 docs: update wechat qrcode
5 months ago
thxCode 53d255cba6 refactor(box): version candidate selecting
5 months ago
cyx 036c40753f fix: proxy request fail when no reposnse within 300s
5 months ago
yxf 7331059605 feat: Log the download progress in the model instance's log file.
5 months ago
linyinli 5e22fe99ca docs: update wechat qrcode
5 months ago
cyx 2232e4baf6 chore: update gguf-parser to 0.22.0
5 months ago
Yuxing Deng 527f7b3644 fix: skip headers in proxy response
5 months ago
gitlawr 35b570459b chore: update cosyvoice huggingface repos
5 months ago
yxf a436eff8d3 fix: Fix potential division-by-zero exceptions.
5 months ago
thxCode 6168713804 fix(ray): conflict in random runtime env agent port
5 months ago
Xiaodong Ye 309c27a9ef llama-box: bump version
5 months ago
gitlawr 307dadbbd5 ci: skip corex docker build
5 months ago
yxf 74c937acc4 fix: Enhance multi-GPU scheduling tests and improve attention heads validation messages
5 months ago
gitlawr b0a638711b fix: none distributed servers on upgrade
5 months ago
cyx 14a60fd640 fix: the environment variable configuration "HTTP_PROXY HTTPS_PROXY" is invalid
5 months ago
rushyrush 34c21e055f fix: broken links in documentation
5 months ago
yxf 775fc18d5d fix: Exclude vision_config.num_attention_heads from the num_attention_heads validation check.
5 months ago
yxf eef43b3426 fix: Specify the Triton version only for platform amd64
5 months ago
thxCode 976b4ccf1f ci(npu): export hccl lib default
5 months ago
gitlawr b24cb7466d fix: update broken tests
5 months ago
gitlawr a035f94425 fix: update qwen3 vllm parameters
5 months ago
gitlawr 0d2a42b932 chore: update vLLM to v0.10.0
5 months ago
thxCode be05857963 fix(mindie): panic with quantization config
5 months ago
Xiaodong Ye f4fc88f114 Update TBD
5 months ago
Xiaodong Ye fed3bff688 musa: restore rc in docker image tag
5 months ago
Xiaodong Ye ffeeb4830d musa: upgrade musa sdk to 4.2.0
5 months ago
gitlawr b193ae59fc docs: update JP readme
5 months ago
linyinli 18dd045598 docs: update readme
5 months ago
linyinli 13f96d7e6f docs: corrections and additions
5 months ago
gitlawr c97693ec3f chore: update parser
5 months ago
linyinli 1703ed9af4 docs: update architecture diagrams and cli options
5 months ago
yxf aebae09a10 fix: Specify the Triton version to enable DIA model execution on RTX 5090 GPUs.
5 months ago
yxf 17ff2c4b56 revert: Force reinstall a standalone torch for vox-box in docker
5 months ago
linyinli eec744793b docs: update installation and faq docs
5 months ago
thxCode 0d4bf13f4c ci(docker): fix failed to confirm build cache
5 months ago
gitlawr 8118f25941 fix: set instance starting with message reset
5 months ago
gitlawr 9c89d04008 fix: wrong allocatable from subordinate workers
5 months ago
gitlawr 4f2ae77577 fix: skip download files for llama-box subordinate workers
5 months ago
gitlawr f5b70329dc chore: bump parser
5 months ago
yxf 0a28b2a4e0 fix: Remove the TORCH_CUDA_ARCH_LIST environment variable to prevent runtime errors.
5 months ago
yxf f017065122 fix: Force reinstall a standalone torch for vox-box in docker
5 months ago
linyinli bf33757623 docs: update installer url
5 months ago
linyinli 8b8b859fb6 docs: update troubleshooting docs
5 months ago
Joe_Z 6ea4988572 docs: add GPUStack integration note for Docker Desktop installed Dify
5 months ago
thxCode e45a4e9961 refactor(vllm/ray): receive ray port setting
5 months ago
thxCode 6208c7c3a5 refactor(vllm/ray): parameterize all ports as we can
5 months ago
gitlawr 9b3850fe7c fix: remove ernie from catalog
5 months ago
thxCode 6c7728d4f3 ci(cuda): reinstall vox-box
5 months ago
thxCode feba7ecb51 ci(cuda): reinstall flashinfer
5 months ago
thxCode c3a34f3f09 ci(cuda): tidy up dockerfile
5 months ago
gitlawr 733f2c1482 docs: use deployments page
5 months ago
gitlawr a888661621 docs: update script installation deprecation
5 months ago
gitlawr 65591e10ea docs: desktop installation rephrasing
5 months ago
gitlawr 4d30c23d8f docs: update quickstart
5 months ago
yxf 9f9e0217d6 fix: improve error handling for missing pretrained configuration files
5 months ago
cyx a16e78dc32 fix: evaluation model failure in air-gapped environment
5 months ago
Yuxing Deng 0aae3fa0a6 fix: model instance not found after restart gpustack
5 months ago
thxCode 0749e40d1b fix(vllm/ascend/ray): failed on picking several nodes from the whole group 2
5 months ago
KodeMunkie b1015f353a Corrected spelling of retrieving
5 months ago
thxCode 12a4a4ef9f refactor(ray): extend ray worker port range
5 months ago
thxCode 8872d0687b fix(vllm/ascend/ray): failed on picking several nodes from the whole group
5 months ago
gitlawr 17a7c4b6e1 fix: Broken pipe on high concurrency
5 months ago
Yuxing Deng fc4aa6b6c2 docs: add installer docs
5 months ago
thxCode 16b5333fcb ci(docker): enhance cache reusing
5 months ago
gitlawr 3836fd34bb fix: only skip worker proxy for mindIE backend
5 months ago
jialin d79343b324 docs: update quickstart
5 months ago
jialin f8cc92ab0b docs: update quickstart and upgrade docs
5 months ago
gitlawr 0c760c34b5 chore: update ernie logo
5 months ago
yxf fef108171c feat: add CLI arguments for backend version switching
5 months ago
thxCode b0457b7adb chore(tool): bump gguf-parser version
5 months ago
linyinli e9f17063ca docs: fix incorrect info and image
5 months ago
cyx 57b55a8207 refactor: use --cache-expiration option can configure repo file cache expirationand
5 months ago
cyx c2b4b95824 fix: fail evaluation when single-node multi-GPU VRAM utilization
5 months ago
gitlawr ba411603ac chore: update vox-box to 0.0.19
5 months ago
gitlawr 1aeb35352a Revert "fix: reset file lock on retry download"
5 months ago
gitlawr ba5a49edd3 fix: allow nullable finish_reason for streaming legacy completions
5 months ago
gitlawr 2c16096055 fix: reset file lock on retry download
5 months ago
gitlawr a57b9c1768 feat: sqlite tuning
5 months ago
gitlawr 53be75c23c refactor: streaming with db engine
5 months ago
gitlawr 9787b28a44 fix: catch no response runtime error
5 months ago
thxCode 345024182b refactor(vllm): show process running envs in debug mode
5 months ago
thxCode b7bfda3b1d fix(vllm/ascend): failed on running across multiple devices
5 months ago
thxCode 78eda5b83a style: rename func for getting visible devices env name
5 months ago
thxCode 70b1f8486d refactor(npu): default mindie and vllm parameters in 310p
5 months ago
thxCode 76bd6838e5 ci(npu): fix vllm in 310p
5 months ago
cyx 554f5c0180 chore: update start.md
5 months ago
Yuxing Deng 7ddb9edb86 deps: relaxing transformers version restrictions
5 months ago
cyx 3fa5bf01fe chore: update gguf-parse to v0.21.1
5 months ago
yxf 1b3a192d07 feat: add installation of flashinfer-python JIT for vllm on linux/amd64 platform
5 months ago
thxCode 152e57f8a7 test(npu-smi): refine example and test case
5 months ago
thxCode 089b7fa813 ci(npu): check mindie turbo and vllm compatibility
5 months ago
thxCode c7ee3661ac refactor(npu-smi): gather 310p chip
5 months ago
yxf c52b0c1c44 feat: Isolate virtual environments for vox-box and gpustack to prevent transformer version conflicts.
5 months ago
cyx 4b30a6b743 refactor: always use cached model metadata
5 months ago
jialin e254d7bfd8 Revert "docs: update quick start"
5 months ago
jialin b0c85bb603 Revert "docs(upgrade): update upgrade steps for deprecated installation script"
5 months ago
jialin 5ffdefdbae docs(upgrade): update upgrade steps for deprecated installation script
5 months ago
jialin 10c2ae6fbc docs: update quick start
5 months ago
gitlawr 23c0b77850 chore: suppress ollama-library-url arg
5 months ago
thxCode 7eca3cabe1 refactor(mindie): validation
5 months ago
jialin eef0700bdc docs: update screenshots, add desktop installer
5 months ago
thxCode 5d65dcc0a7 refactor(selector/mindie): return object instead of tuple in _estimate_usage
5 months ago
thxCode a4cdf37379 fix(selector/mindie): failed on recoginizing n_groups=1 mla model
5 months ago
gitlawr a54ff956a1 fix: incorrect file path for qwen3 embedding from modelscope
5 months ago
thxCode 91dbbd31af chore(npu): bump vllm version
5 months ago
yxf 58af72bf08 fix:remove the redundant formatting issues in schedule_cycle prompt messages; optimize compatibility check text.
5 months ago
thxCode 8cb1a38013 fix(selector/gguf): incorrect partial offloading in uma
5 months ago
gitlawr 4aea7bcdf9 feat: apply qwen3 reranker templates
6 months ago
gitlawr b8910817b6 fix: respect vram utilization for qwen3 embedding
6 months ago
thxCode eae62e02fc chore(tool): bump version
6 months ago
thxCode d46932ae80 chore(mindie): fix typo in lable matching filter
6 months ago
gitlawr a4da5fae63 refactor: install custom-version backends under data-dir
6 months ago
thxCode 83146853ac fix(selector/mindie): failed on evaluating mulitmodal models
6 months ago
Yuxing Deng d094c2e008 fix: should use single quote instead of double quote
6 months ago
yxf f16718c776 fix: correct compatibility check messages
6 months ago
thxCode a368ac1000 docs(mindie): introduce distributed serving guide
6 months ago
cyx 8e8d41b24c fix: failed on db connecting with special chars
6 months ago
gitlawr d95edf3e29 chore: update catalog
6 months ago
gitlawr e1a4a3fca2 chore: update transformers
6 months ago
cyx 8d5c70a2ac chore: optimize error prompts when GPUDetectExepction
6 months ago
thxCode 3b8ef1c3e7 refactor(calculator): gguf-parser receive --rope-* params
6 months ago
thxCode 8b0c521458 chore(tool): bump version
6 months ago
cyx 330694ea9a fix: not providing sufficient information for Nvidia error
6 months ago
gitlawr 8d04eb7797 fix: missing distributed server allocation in dashboard
6 months ago
gitlawr 4939b4a1f7 fix: include subworkers in worker allocated summary
6 months ago
gitlawr a0e4f0c889 feat: add worker_name to subworkers
6 months ago
gitlawr c34cd72c36 fix: reject distributed vllm with custom version
6 months ago
cyx 2f50cc60ae fix: /v1/workers returns incorrect values for the "allocated" field
6 months ago
gitlawr da3e8060c6 chore: update vox-box to 0.0.18
6 months ago
thxCode f196b844ae chore(tool): bump llama-box version
6 months ago
gitlawr bca385f869 chore: update vllm to v0.9.2
6 months ago
yxf 013fc3d510 feat: skip select_multi_workers by distributed_inference_across_workers in MindIE
6 months ago
gitlawr a3cb2445c5 fix: capitalize dataclass
6 months ago
gitlawr ccf4b6d660 refactor: migration script
6 months ago
gitlawr 5a816afe2b refactor: migrate subordinate_workers
6 months ago
gitlawr c371216700 fix: missing subordinate_workers resource claim in evaluation results
6 months ago
cyx 8e5bfca828 fix: password reset takes a while to take effect
6 months ago
yxf 9647cd4237 feat: Update compatibility check messages for multi-worker/multi-GPU scenarios and refactor vllm_selector code structure
6 months ago
yxf 6812d39148 feat: enhance mindie and vllm scheduling messages
6 months ago
yxf 6c27bb8945 feat: clarifying compatibility check messages
6 months ago
thxCode 96f673b7f6 chore(tool): bump gguf-parser version
6 months ago
thxCode 3d3dde0db8 fix(scheduler): allow unsupported gguf model in evaluation
6 months ago
thxCode e6f4dd03f7 ci(npu): avoid docker build cache invalid
6 months ago
gitlawr 2083ce0778 fix: duplicate user message
6 months ago
thxCode cee7492913 chore(tool): bump gguf-parser version
6 months ago
gitlawr 501231471e chore: update box to v0.0.161
6 months ago
linyinli 0b90cf50c0 docs: fix typos and minor corrections
6 months ago
thxCode b54c72a7c1 chore(tool): bump llama-box version
6 months ago
yxf f01a4bb491 fix: improve path handling for incomplete model file deletions
6 months ago
thxCode c27a504a8a chore(logging): unify tools downloading log prefix
6 months ago
thxCode b692aeb0e5 refactor(llama-box): static link bin dectect
6 months ago
thxCode 15b796288b refactor(llama-box): tidy up rpc-server launching logic
6 months ago
gitlawr ac42c6b21d fix: flexible vendor validation
6 months ago
gitlawr 9c9caa848e fix: bencharm embedding without multiplier
6 months ago
cyx 890f15faed fix:rpc-server-args recognization
6 months ago
thxCode ce868a7a79 refactor(npu-smi): pring warning in trace
6 months ago
thxCode 250723f9a1 fix(calculator): missing default --image-vae-tiling
6 months ago
thxCode 5bbb027dbe refactor(calculator): gguf-parser receive -ot param
6 months ago
thxCode c2b7456269 chore(tool): bump gguf-parser version
6 months ago
cyx 2916a306d9 feat: environment variable `GPUSTACK_DISABLE_DYNAMIC_LINK_LLAMA_BOX` apply to default llama-box version
6 months ago
gitlawr 5bc81bbb9e fix: add ray pg readiness check
6 months ago
kekao e7f41af249 feat: upgrade dcu vllm to 0.8.5
6 months ago
gitlawr 2a02ee689a fix: empty params evaluation matches catalog specs
6 months ago
gitlawr 9fba121898 fix: model usage for images
6 months ago
thxCode b1543211a8 fix(mindie): failed on moe-tp less than local world size
6 months ago
thxCode 6f72944222 refactor(mindie): validate moe tp/ep in selector
6 months ago
thxCode 1b6760169a feat(mindie): support mtp
6 months ago
thxCode 498f143944 refactor(mindie): tidy up params
6 months ago
cyx 1d7bb41776 feat(config): Add DISABLE_DYNAMIC_LINK_LLAMA_BOX env var for llama-box deployment control
6 months ago
linyinli 2bd6648236 fix: allow custom vLLM on arm64 platform
6 months ago
gitlawr 3f8d45ded1 fix: update trust-remote-code value error message
6 months ago
gitlawr 6714afa2ca refactor: simplify test fixtures
6 months ago
gitlawr 48f10bcf1b chore: update llama-box to v0.0.159
6 months ago
gitlawr acf4ad2917 chore: default usage sort
6 months ago
thxCode 4e1d6d900c fix(mindie): invalid world size in manual selection
6 months ago
thxCode c9f44f33aa test(mindie): simplify fixture import
6 months ago
Yuxing Deng cc3fc57bc1 fix: simplify llama-box download logic
6 months ago
thxCode 777fd3d0cb refactor(mindie): refine parallelism params
6 months ago
gitlawr 3f34bdd216 fix: handle form file and stream chunks
6 months ago
gitlawr b054dc00b5 refactor: update benchmark script
6 months ago
gitlawr 94e515c844 fix: set tcp connector limit
6 months ago
thxCode cc0a88f88f refactor(bus): default queue maxsize to 256
6 months ago
thxCode 6d7d666f1e refactor(logging): print milliseconds
6 months ago
thxCode 1eda8a7810 refactor(proxy): transfer request without /proxy route
6 months ago
thxCode d12f1e9fee refactor(mindie): support distributed deployment
6 months ago
thxCode afc1023922 refactor(mindie): allow distributed model download
6 months ago
thxCode 3c5c9feb6c feat(util): introduce attribute path operator
6 months ago
thxCode a9fb44a43d fix(mindie): resource fit selection
6 months ago
thxCode ed8e4ba51f chore(detector): degrade npu-smi log level
6 months ago
thxCode feaa894000 style(mindie): chore name changed
6 months ago
yxf 62f78c10cc fix: refactor model file size handling to return file info and paths
6 months ago
cyx 85e5f717ba fix(llama-box): failed on rpc server and linux/cann starting
6 months ago
cyx bec76bac22 refactor: Llama box multi version directory structure changes
6 months ago
yxf 323d76df4c feat: enhance num_attention_heads retrieval from model configurations
6 months ago
Yuxing Deng 45b8018f82 fix: only import resource in *nix platform
6 months ago
gitlawr 38d2e31f9f feat: add usage query apis
6 months ago
yxf 0dfdafcd16 fix: handle case for empty filename in size computation
6 months ago
yxf 9272d5a9c7 fix: fix size computation for models with mmproj files
6 months ago
yxf 340b4d8d90 fix: correct model file path during ModelScope incomplete model deletion
6 months ago
yxf fab2813d56 feat: add methods for deleting incomplete model file
6 months ago
hello-daydayup d38a120a3d delete mlu comments
6 months ago
thxCode 56bf30f199 refactor(mindie): introduce candidate selectors
6 months ago
thxCode 7804c89f1e chore(base): declare abstract method get_messages
6 months ago
thxCode 383baac2e0 chore(selectors): support import from package
6 months ago
thxCode 884c94f63f refactor(worker): introduce subordinate workers
6 months ago
thxCode 7b648ce634 refactor(mindie): parameters parsing
6 months ago
yxf 6f3fab1051 feat: add disable_logger_names
6 months ago
hello-daydayup 64a7d18cf8 add Cambricon MLU
6 months ago
thxCode 7c5927db99 ci(musa): build amd64 only
6 months ago
thxCode 0b8f24f993 chore(llama-box): bump version
6 months ago
yxf 9addaad168 fix: update VRAM calc for gpu-memory-utilization
6 months ago
yxf f268725d8f feat: set default ulimit
6 months ago
cyx 68a3915e4e fix: allow override --rpc-server-cache-dir
6 months ago
cyx 5927e924cb fix: resolve intermittent error logs during rapid batch model start/stop operations
6 months ago
yxf 7c26f6f012 feat: add support for enabling Hugging Face Xet in configuration
6 months ago
yxf 3bf76a5a6e feat: add worker_uuid for name conflict detection
7 months ago
cyx 39d8e7ab0f refactor:The 'backend-parameters' configuration does not support the '--port' argument/option.
7 months ago
Yuxing Deng f4f84e0931 fix: the llama-box-rpc-server brokes with pre-downloaded llama-box
7 months ago
cyx d006860a38 refactor: enable rpc server cache
7 months ago
Yuxing Deng 37f23c2549 refactor: replace sysconfig with get_command_path
7 months ago
gitlawr b69a785f41 chore: remove pydantic workaround
7 months ago
gitlawr a06319d76c chore: update vllm to v0.9.1
7 months ago
thxCode c7d77e6e95 refactor(mindie): idempotent multi-version installation
7 months ago
yxf 296d6b5d32 chore: remove hf-xet dependency from pyproject.toml
7 months ago
gitlawr 7f247a6400 fix: disallow abbrev parsing backend parameters
7 months ago
thxCode d9996538f9 refactor(npu): tidy up mindie and vllm
7 months ago
gitlawr ec9e646105 docs: remove invalid link
7 months ago
gitlawr 8e44612eb8 ci: seperate docker cache per branch
7 months ago
Yuxing Deng 0810e2319d fix: multiple AMD GPU are detected when multiple driver installed
7 months ago
gitlawr 00e858f40f refactor: use aiohttp in proxy
7 months ago
gitlawr 7a2187aaa3 feat: cache secret hashing
7 months ago
gitlawr d9b67ff2eb feat: benchmark embeddings
7 months ago
gitlawr cc2e0bbe78 chore: update llama-box to v0.0.154
7 months ago
gitlawr 88253b59c3 chore: bump llama-box to v0.0.153
7 months ago
gitlawr 7b5078856c fix: exception on log file not ready
7 months ago
gitlawr 555d0c81ba docs: update description for start flags
7 months ago
Yuxing Deng 03cd217189 feat: support using full path of llama-box
7 months ago
linyinli bef3b521ef fix: remove incompatible Qwen3 config for vLLM Ascend
7 months ago
gitlawr b99bd3f6de chore: bump backends
7 months ago
gitlawr 157ad58ee3 feat: add r1 0528 to catalog
7 months ago
thxCode a95650f3e1 chore(tool): bump version
7 months ago
gitlawr d3c908ea07 ci: drop cuda11.8
7 months ago
linyinli 0e34f487a7 feat: support vllm ascend
7 months ago
Terada Kousuke 211132d172 Add Japanese README
7 months ago
gitlawr 45812787a0 ci: update branches
7 months ago
Yuxing Deng e39a103b79 feat: add support for built binary to run mulitprocessing
7 months ago
peiyuan.zhang 27ee8566cb remove Installation Script
7 months ago
peiyuan.zhang c060a51f76 support iluvatar
7 months ago
thxCode a964ad0816 refactor(llama-box): enable max projected cache by default
7 months ago
thxCode bee9da9c65 chore(tool): bump version
7 months ago
thxCode 457e418cd5 refactor(scheduler): params processing in gguf-parser
7 months ago
thxCode bd1313f0ef chore(tool): bump version
7 months ago
thxCode 443466e740 refactor(catalog): enable visual max image size in pixtral
7 months ago
gitlawr fc92106506 chore: update vox-box
7 months ago
gitlawr 2f42df90ad feat: add glibc check
7 months ago
gitlawr ea34a10f26 ci: use tag ui on release
7 months ago
thxCode b6f847024f refactor(mindie): enhance features
7 months ago
thxCode b2844560ff chore(tool): bump version
7 months ago
thxCode 549750b739 refactor(detectors): collect device_index/device_chip_index
7 months ago
thxCode 7222a07049 refactor(llama-box): get downloaded platform name
7 months ago
thxCode faec4babe3 fix(stmt): failed to fetch network
7 months ago
linyinli 7819ce076f docs: add FAQ for viewing Ray logs
7 months ago
thxCode 2a6af6062e ci: docker build cache
7 months ago
thxCode 14b70d472d test(detectors): npu-smi adjust
7 months ago
thxCode a7ac3f52a4 refactor(detectors): npu-smi collect
7 months ago

.gitattributes

@ -1,3 +1,3 @@
install.ps1.sha256sum text eol=lf
* text=auto eol=lf
*.tar.gz filter=lfs diff=lfs merge=lfs -text
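The new `.gitattributes` rule routes `*.tar.gz` archives through Git LFS, which is why the workflow's checkout step below now enables `lfs: true`. As a minimal local sketch (assuming the `git-lfs` CLI is installed), fetching those archives after a plain clone would look like:

```bash
# One-time setup: install the Git LFS hooks for the current user
git lfs install

# Inside a clone of the repository, download the LFS-tracked *.tar.gz payloads
git lfs pull
```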

@ -60,7 +60,7 @@ jobs:
retention-days: 5
- name: Release GitHub Assets
uses: softprops/action-gh-release@v2.2.2
uses: softprops/action-gh-release@v2
if: startsWith(github.ref, 'refs/tags/') && matrix.python-version == '3.11' && matrix.os == 'linux'
with:
# Draft for official releases to prepare and review release notes before publishing

@ -15,18 +15,15 @@ on:
- "**.png"
- "**.jpg"
- "**.gif"
- "Dockerfile.rocm.base"
- "Dockerfile.dcu.base"
- "pack/**.base"
pull_request:
branches:
- main
- "v*-dev"
paths:
- "Dockerfile"
- "Dockerfile.*"
- ".github/workflows/docker-ci.yaml"
- "!Dockerfile.rocm.base"
- "!Dockerfile.dcu.base"
- "pack/**"
- "!pack/**.base"
jobs:
publish-docker:
@ -48,12 +45,16 @@ jobs:
tag_suffix: ""
build_args:
- "CUDA_VERSION=12.4.1"
- "CUDA_DEVEL_VERSION=12.6.3"
- "FLASHINFER_BUILD_MAX_JOBS=1"
- device: cuda
dockerfile: "Dockerfile"
platforms: "linux/amd64,linux/arm64"
tag_suffix: "-cuda12.8"
build_args:
- "CUDA_VERSION=12.8.1"
- "CUDA_DEVEL_VERSION=12.8.1"
- "FLASHINFER_BUILD_MAX_JOBS=1"
#
# HIP RoCM
#
@ -83,7 +84,7 @@ jobs:
#
- device: musa
dockerfile: "Dockerfile.musa"
platforms: "linux/amd64,linux/arm64"
platforms: "linux/amd64"
tag_suffix: "-musa"
build_args: []
#
@ -115,6 +116,7 @@ jobs:
- name: Checkout
uses: actions/checkout@v4
with:
lfs: true
fetch-depth: 1
persist-credentials: false
- name: Maximize Docker Build Space
@ -159,12 +161,66 @@ jobs:
echo "$arg" >> $GITHUB_OUTPUT
done
echo "EOF" >> $GITHUB_OUTPUT
- name: Get Cache Ref
id: cache-ref
run: |
#
# Use different cache ref for different branches.
#
# Examples:
# CACHE_FROM_REF
# - vX.Y.Z -> "gpustack/build-cache:gpustack${TAG_SUFFIX}-${VERSION|DEFAULT_BRANCH}"
# - PR/PUSH to branch -> "gpustack/build-cache:gpustack${TAG_SUFFIX}-${BRANCH|DEFAULT_BRANCH}"
# CACHE_TO_REF
# - vX.Y.Z -> "gpustack/build-cache:gpustack${TAG_SUFFIX}-${VERSION}"
# - PUSH to branch -> "gpustack/build-cache:gpustack${TAG_SUFFIX}-${BRANCH}"
#
# Stories(device cpu):
# CACHE_FROM_REF
# - Release tag v0.7.0rc1 -> gpustack/build-cache:gpustack-cpu-v0.7, if not found, fallback to gpustack/build-cache:gpustack-cpu-main
# - Release tag v0.7.0 -> gpustack/build-cache:gpustack-cpu-v0.7
# - PR to "main" branch -> gpustack/build-cache:gpustack-cpu-main
# - PR to "v0.7-dev" branch -> gpustack/build-cache:gpustack-cpu-v0.7, if not found, fallback to gpustack/build-cache:gpustack-cpu-main
# - Push to "main" branch -> gpustack/build-cache:gpustack-cpu-main
# - Push to "v0.7-dev" branch -> gpustack/build-cache:gpustack-cpu-v0.7, if not found, fallback to gpustack/build-cache:gpustack-cpu-main
# CACHE_TO_REF
# - Release tag v0.7.0rc1 -> gpustack/build-cache:gpustack-cpu-v0.7
# - Release tag v0.7.0 -> gpustack/build-cache:gpustack-cpu-v0.7
# - PR to "main" branch -> gpustack/build-cache:gpustack-cpu-main
# - PR to "v0.7-dev" branch -> gpustack/build-cache:gpustack-cpu-v0.7
# - Push to "main" branch -> gpustack/build-cache:gpustack-cpu-main
# - Push to "v0.7-dev" branch -> gpustack/build-cache:gpustack-cpu-v0.7
DEFAULT_BRANCH="main"
TAG_SUFFIX="${{ matrix.tag_suffix }}"
if [[ "${GITHUB_REF}" == refs/tags/* ]]; then
REF="${GITHUB_REF#refs/tags/}"
IFS="." read -r VERSION_MAJOR VERSION_MINOR VERSION_PATCH <<< "${REF}"
VERSION="${VERSION_MAJOR}.${VERSION_MINOR}"
CACHE_FROM_REF="gpustack/build-cache:gpustack${TAG_SUFFIX}-${VERSION}"
CACHE_TO_REF="${CACHE_FROM_REF}"
else
REF="${GITHUB_BASE_REF:-${GITHUB_REF}}"
BRANCH="${REF#refs/heads/}"
BRANCH="${BRANCH%-dev}"
CACHE_FROM_REF="gpustack/build-cache:gpustack${TAG_SUFFIX}-${BRANCH}"
CACHE_TO_REF="${CACHE_FROM_REF}"
fi
if ! docker manifest inspect "${CACHE_FROM_REF}" >/dev/null 2>&1; then
CACHE_FROM_REF="gpustack/build-cache:gpustack${TAG_SUFFIX}-${DEFAULT_BRANCH}"
fi
echo "CACHE_FROM_REF=${CACHE_FROM_REF}" >> $GITHUB_ENV
echo "CACHE_TO_REF=${CACHE_TO_REF}" >> $GITHUB_ENV
echo "DEBUG: GITHUB_BASE_REF=${GITHUB_BASE_REF}"
echo "DEBUG: GITHUB_REF=${GITHUB_REF}"
echo "DEBUG: TAG_SUFFIX=${TAG_SUFFIX}"
echo "DEBUG: CACHE_FROM_REF=${CACHE_FROM_REF}"
echo "DEBUG: CACHE_TO_REF=${CACHE_TO_REF}"
- name: Package
uses: docker/build-push-action@v6
id: package
with:
push: ${{ github.event_name != 'pull_request' }}
file: ${{ github.workspace }}/${{ matrix.dockerfile }}
file: ${{ github.workspace }}/pack/${{ matrix.dockerfile }}
context: ${{ github.workspace }}
platforms: ${{ matrix.platforms }}
tags: ${{ steps.metadata.outputs.tags }}
@ -174,6 +230,6 @@ jobs:
build-args: |
${{ steps.build-args.outputs.BUILD_ARGS }}
cache-from: |
type=registry,ref=gpustack/build-cache:gpustack${{ matrix.tag_suffix }}
type=registry,ref=${{ env.CACHE_FROM_REF }}
cache-to: |
${{ github.event_name != 'pull_request' && format('type=registry,mode=max,compression=gzip,ref=gpustack/build-cache:gpustack{0},ignore-error=true', matrix.tag_suffix) || '' }}
${{ github.event_name != 'pull_request' && format('type=registry,mode=max,compression=gzip,ref={0},ignore-error=true', env.CACHE_TO_REF) || '' }}
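Outside of GitHub Actions, the same registry-backed cache wiring can be exercised directly with `docker buildx build`. A rough sketch, using illustrative values that mirror the cpu-device refs from the comment block above (the image tag is hypothetical):

```bash
# Build with a remote registry cache; push an updated cache on success
docker buildx build \
  --file pack/Dockerfile \
  --cache-from type=registry,ref=gpustack/build-cache:gpustack-cpu-main \
  --cache-to type=registry,mode=max,compression=gzip,ignore-error=true,ref=gpustack/build-cache:gpustack-cpu-main \
  --tag gpustack/gpustack:dev \
  .
```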

@ -4,12 +4,14 @@ on:
push:
branches:
- main
- "v*-dev"
paths:
- "install.ps1"
- ".github/workflows/install-script-windows.yml"
pull_request:
branches:
- main
- "v*-dev"
paths:
- "install.ps1"
- ".github/workflows/install-script-windows.yml"
@ -74,7 +76,7 @@ jobs:
$env:INSTALL_PACKAGE_SPEC = [System.IO.Path]::Combine("dist", $env:whlPackageName)
Write-Host "INSTALL_PACKAGE_SPEC: $env:INSTALL_PACKAGE_SPEC"
Write-Host "AppData $env:APPDATA"
# Use port 8080 since 80 is occupied by the System
./install.ps1 -ServerPort 8080
@ -100,8 +102,7 @@ jobs:
Start-Sleep -Seconds $retryDelaySeconds
}
}
if ($responseCode -ne 200) {
Write-Host "All retry attempts failed. Last error: $lastError"
}

@ -4,12 +4,14 @@ on:
push:
branches:
- main
- "v*-dev"
paths:
- "install.sh"
- ".github/workflows/install-script.yml"
pull_request:
branches:
- main
- "v*-dev"
paths:
- "install.sh"
- ".github/workflows/install-script.yml"

@ -11,6 +11,7 @@ repos:
hooks:
- id: flake8
exclude: ".*/migrations"
args: [--max-complexity=15]
- repo: https://github.com/psf/black
rev: 24.4.2
hooks:

@ -1,44 +0,0 @@
ARG CUDA_VERSION=12.4.1
ARG CUDA_TAG_SUFFIX=-cudnn-runtime-ubuntu22.04
FROM nvidia/cuda:${CUDA_VERSION}${CUDA_TAG_SUFFIX}
ARG TARGETPLATFORM
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update && apt-get install -y \
git \
curl \
wget \
tzdata \
iproute2 \
python3 \
python3-pip \
python3-venv \
tini \
&& rm -rf /var/lib/apt/lists/*
COPY . /workspace/gpustack
RUN cd /workspace/gpustack && \
make build
ARG VLLM_VERSION=0.8.5.post1
RUN <<EOF
if [ "$TARGETPLATFORM" = "linux/amd64" ]; then
# Install vllm dependencies for x86_64
WHEEL_PACKAGE="$(ls /workspace/gpustack/dist/*.whl)[all]";
else
WHEEL_PACKAGE="$(ls /workspace/gpustack/dist/*.whl)[audio]";
fi
pip install pipx
pip install $WHEEL_PACKAGE
pip cache purge
rm -rf /workspace/gpustack
EOF
RUN gpustack download-tools
# Download dac weights used by audio models like Dia
RUN python3 -m dac download
ENTRYPOINT [ "tini", "--", "gpustack", "start" ]

@ -1,32 +0,0 @@
FROM ubuntu:22.04
ARG TARGETPLATFORM
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update && apt-get install -y \
git \
curl \
wget \
tzdata \
iproute2 \
python3 \
python3-pip \
python3-venv \
tini \
&& rm -rf /var/lib/apt/lists/*
COPY . /workspace/gpustack
RUN cd /workspace/gpustack && \
make build && \
WHEEL_PACKAGE="$(ls /workspace/gpustack/dist/*.whl)[audio]" && \
pip install pipx && \
pip install $WHEEL_PACKAGE && \
pip cache purge && \
rm -rf /workspace/gpustack
RUN gpustack download-tools
# Download dac weights used by audio models like Dia
RUN python3 -m dac download
ENTRYPOINT [ "tini", "--", "gpustack", "start" ]

@ -10,7 +10,7 @@
<img alt="Documentation" src="https://img.shields.io/badge/Docs-GPUStack-blue?logo=readthedocs&logoColor=white"></a>
<a href="./LICENSE" target="_blank">
<img alt="License" src="https://img.shields.io/github/license/gpustack/gpustack?logo=github&logoColor=white&label=License&color=blue"></a>
<a href="./docs/assets/wechat-assistant.png" target="_blank">
<a href="./docs/assets/wechat-group-qrcode.jpg" target="_blank">
<img alt="WeChat" src="https://img.shields.io/badge/微信群-GPUStack-blue?logo=wechat&logoColor=white"></a>
<a href="https://discord.gg/VXYJzuaqwD" target="_blank">
<img alt="Discord" src="https://img.shields.io/badge/Discord-GPUStack-blue?logo=discord&logoColor=white"></a>
@ -35,7 +35,7 @@ GPUStack is an open-source GPU cluster manager for running AI models.
- **Broad GPU Compatibility:** Seamlessly supports GPUs from various vendors across Apple Macs, Windows PCs, and Linux servers.
- **Extensive Model Support:** Supports a wide range of models including LLMs, VLMs, image models, audio models, embedding models, and rerank models.
- **Flexible Inference Backends:** Flexibly integrates with multiple inference backends including llama-box (llama.cpp & stable-diffusion.cpp), vox-box, vLLM and Ascend MindIE.
- **Flexible Inference Backends:** Flexibly integrates with multiple inference backends including vLLM, Ascend MindIE, llama-box (llama.cpp & stable-diffusion.cpp) and vox-box.
- **Multi-Version Backend Support:** Run multiple versions of inference backends concurrently to meet the diverse runtime requirements of different models.
- **Distributed Inference:** Supports single-node and multi-node multi-GPU inference, including heterogeneous GPUs across vendors and runtime environments.
- **Scalable GPU Architecture:** Easily scale up by adding more GPUs or nodes to your infrastructure.
@ -50,83 +50,71 @@ GPUStack is an open-source GPU cluster manager for running AI models.
## Installation
### Linux or macOS
### Linux
GPUStack provides a script to install it as a service on systemd or launchd based systems with default port 80. To install GPUStack using this method, just run:
If you are using NVIDIA GPUs, ensure [Docker](https://docs.docker.com/engine/install/) and [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) are installed on your system. Then, run the following command to start the GPUStack server.
```bash
curl -sfL https://get.gpustack.ai | sh -s -
docker run -d --name gpustack \
--restart=unless-stopped \
--gpus all \
--network=host \
--ipc=host \
-v gpustack-data:/var/lib/gpustack \
gpustack/gpustack
```
### Windows
For more details on the installation or other GPU hardware platforms, please refer to the [Installation Documentation](docs/installation/installation-requirements.md).
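If the container does not come up as expected, the standard Docker CLI (nothing GPUStack-specific) can be used to check its status and follow the startup logs:

```bash
# Confirm the container is running
docker ps --filter name=gpustack

# Follow the server logs while it initializes
docker logs -f gpustack
```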
Run PowerShell as administrator (**avoid** using PowerShell ISE), then run the following command to install GPUStack:
```powershell
Invoke-Expression (Invoke-WebRequest -Uri "https://get.gpustack.ai" -UseBasicParsing).Content
```
### Other Installation Methods
For manual installation, docker installation or detailed configuration options, please refer to the [Installation Documentation](https://docs.gpustack.ai/latest/installation/installation-script/).
## Getting Started
1. Run and chat with the **llama3.2** model:
After the server starts, run the following command to get the default admin password:
```bash
gpustack chat llama3.2 "tell me a joke."
docker exec gpustack cat /var/lib/gpustack/initial_admin_password
```
2. Run and generate an image with the **stable-diffusion-v3-5-large-turbo** model:
Open your browser and navigate to `http://your_host_ip` to access the GPUStack UI. Use the default username `admin` and the password you retrieved above to log in.
> ### 💡 Tip
>
> This command downloads the model (~12GB) from Hugging Face. The download time depends on your network speed. Ensure you have enough disk space and VRAM (12GB) to run the model. If you encounter issues, you can skip this step and move to the next one.
### macOS & Windows
```bash
gpustack draw hf.co/gpustack/stable-diffusion-v3-5-large-turbo-GGUF:stable-diffusion-v3-5-large-turbo-Q4_0.gguf \
"A minion holding a sign that says 'GPUStack'. The background is filled with futuristic elements like neon lights, circuit boards, and holographic displays. The minion is wearing a tech-themed outfit, possibly with LED lights or digital patterns. The sign itself has a sleek, modern design with glowing edges. The overall atmosphere is high-tech and vibrant, with a mix of dark and neon colors." \
--sample-steps 5 --show
```
A desktop installer is available for macOS and Windows — see the [documentation](https://docs.gpustack.ai/latest/installation/desktop-installer/) for installation details.
Once the command completes, the generated image will appear in the default viewer. You can experiment with the prompt and CLI options to customize the output.
## Deploy a Model
![Generated Image](https://raw.githubusercontent.com/gpustack/gpustack/main/docs/assets/quickstart-minion.png)
1. Navigate to the `Catalog` page in the GPUStack UI.
3. Open `http://your_host_ip` in the browser to access the GPUStack UI. Log in to GPUStack with username `admin` and the default password. You can run the following command to get the password for the default setup:
2. Select the `Qwen3` model from the list of available models.
**Linux or macOS**
3. After the deployment compatibility checks pass, click the `Save` button to deploy the model.
```bash
cat /var/lib/gpustack/initial_admin_password
```
![deploy qwen3 from catalog](docs/assets/quick-start/quick-start-qwen3.png)
**Windows**
4. GPUStack will start downloading the model files and deploying the model. When the deployment status shows `Running`, the model has been deployed successfully.
```powershell
Get-Content -Path "$env:APPDATA\gpustack\initial_admin_password" -Raw
```
![model is running](docs/assets/quick-start/model-running.png)
5. Click `Playground - Chat` in the navigation menu, check that the model `qwen3` is selected from the top-right `Model` dropdown. Now you can chat with the model in the UI playground.
4. Click `Playground - Chat` in the navigation menu. Now you can chat with the LLM in the UI playground.
![quick chat](docs/assets/quick-start/quick-chat.png)
![Playground Screenshot](https://raw.githubusercontent.com/gpustack/gpustack/main/docs/assets/playground-screenshot.png)
## Use the model via API
5. Click `API Keys` in the navigation menu, then click the `New API Key` button.
1. Hover over the user avatar and navigate to the `API Keys` page, then click the `New API Key` button.
6. Fill in the `Name` and click the `Save` button.
2. Fill in the `Name` and click the `Save` button.
7. Copy the generated API key and save it somewhere safe. Please note that you can only see it once on creation.
3. Copy the generated API key and save it somewhere safe. Please note that you can only see it once on creation.
8. Now you can use the API key to access the OpenAI-compatible API. For example, use curl as the following:
4. You can now use the API key to access the OpenAI-compatible API endpoints provided by GPUStack. For example, use curl as the following:
```bash
# Replace `your_api_key` and `your_gpustack_server_url`
# with your actual API key and GPUStack server URL.
export GPUSTACK_API_KEY=your_api_key
curl http://your_gpustack_server_url/v1-openai/chat/completions \
curl http://your_gpustack_server_url/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer $GPUSTACK_API_KEY" \
-d '{
"model": "llama3.2",
"model": "qwen3",
"messages": [
{
"role": "system",
@ -134,7 +122,7 @@ curl http://your_gpustack_server_url/v1-openai/chat/completions \
},
{
"role": "user",
"content": "Hello!"
"content": "Tell me a joke."
}
],
"stream": true
@ -143,8 +131,8 @@ curl http://your_gpustack_server_url/v1-openai/chat/completions \
## Supported Platforms
- [x] macOS
- [x] Linux
- [x] macOS
- [x] Windows
## Supported Accelerators
@ -156,15 +144,11 @@ curl http://your_gpustack_server_url/v1-openai/chat/completions \
- [x] Hygon DTK
- [x] Moore Threads MUSA
- [x] Iluvatar Corex
We plan to support the following accelerators in future releases.
- [ ] Intel oneAPI
- [ ] Qualcomm AI Engine
- [x] Cambricon MLU
## Supported Models
GPUStack uses [llama-box](https://github.com/gpustack/llama-box) (bundled [llama.cpp](https://github.com/ggml-org/llama.cpp) and [stable-diffusion.cpp](https://github.com/leejet/stable-diffusion.cpp) server), [vLLM](https://github.com/vllm-project/vllm), [Ascend MindIE](https://www.hiascend.com/en/software/mindie) and [vox-box](https://github.com/gpustack/vox-box) as the backends and supports a wide range of models. Models from the following sources are supported:
GPUStack uses [vLLM](https://github.com/vllm-project/vllm), [Ascend MindIE](https://www.hiascend.com/en/software/mindie), [llama-box](https://github.com/gpustack/llama-box) (bundled [llama.cpp](https://github.com/ggml-org/llama.cpp) and [stable-diffusion.cpp](https://github.com/leejet/stable-diffusion.cpp) server) and [vox-box](https://github.com/gpustack/vox-box) as the backends and supports a wide range of models. Models from the following sources are supported:
1. [Hugging Face](https://huggingface.co/)
@ -172,16 +156,16 @@ GPUStack uses [llama-box](https://github.com/gpustack/llama-box) (bundled [llama
3. Local File Path
### Example Models:
### Example Models
| **Category** | **Models** |
| -------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| **Large Language Models(LLMs)** | [Qwen](https://huggingface.co/models?search=Qwen/Qwen), [LLaMA](https://huggingface.co/meta-llama), [Mistral](https://huggingface.co/mistralai), [DeepSeek](https://huggingface.co/models?search=deepseek-ai/deepseek), [Phi](https://huggingface.co/models?search=microsoft/phi), [Gemma](https://huggingface.co/models?search=Google/gemma) |
| **Vision Language Models(VLMs)** | [Llama3.2-Vision](https://huggingface.co/models?pipeline_tag=image-text-to-text&search=llama3.2), [Pixtral](https://huggingface.co/models?search=pixtral) , [Qwen2.5-VL](https://huggingface.co/models?search=Qwen/Qwen2.5-VL), [LLaVA](https://huggingface.co/models?search=llava), [InternVL2.5](https://huggingface.co/models?search=internvl2_5) |
| **Diffusion Models** | [Stable Diffusion](https://huggingface.co/models?search=gpustack/stable-diffusion), [FLUX](https://huggingface.co/models?search=gpustack/flux) |
| **Embedding Models** | [BGE](https://huggingface.co/gpustack/bge-m3-GGUF), [BCE](https://huggingface.co/gpustack/bce-embedding-base_v1-GGUF), [Jina](https://huggingface.co/models?search=gpustack/jina-embeddings) |
| **Reranker Models** | [BGE](https://huggingface.co/gpustack/bge-reranker-v2-m3-GGUF), [BCE](https://huggingface.co/gpustack/bce-reranker-base_v1-GGUF), [Jina](https://huggingface.co/models?search=gpustack/jina-reranker) |
| **Audio Models** | [Whisper](https://huggingface.co/models?search=Systran/faster) (Speech-to-Text), [CosyVoice](https://huggingface.co/models?search=FunAudioLLM/CosyVoice) (Text-to-Speech) |
| **Category** | **Models** |
| -------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| **Large Language Models(LLMs)** | [Qwen](https://huggingface.co/models?search=Qwen/Qwen), [LLaMA](https://huggingface.co/meta-llama), [Mistral](https://huggingface.co/mistralai), [DeepSeek](https://huggingface.co/models?search=deepseek-ai/deepseek), [Phi](https://huggingface.co/models?search=microsoft/phi), [Gemma](https://huggingface.co/models?search=Google/gemma) |
| **Vision Language Models(VLMs)** | [Llama3.2-Vision](https://huggingface.co/models?pipeline_tag=image-text-to-text&search=llama3.2), [Pixtral](https://huggingface.co/models?search=pixtral) , [Qwen2.5-VL](https://huggingface.co/models?search=Qwen/Qwen2.5-VL), [LLaVA](https://huggingface.co/models?search=llava), [InternVL3](https://huggingface.co/models?search=internvl3) |
| **Diffusion Models** | [Stable Diffusion](https://huggingface.co/models?search=gpustack/stable-diffusion), [FLUX](https://huggingface.co/models?search=gpustack/flux) |
| **Embedding Models** | [BGE](https://huggingface.co/gpustack/bge-m3-GGUF), [BCE](https://huggingface.co/gpustack/bce-embedding-base_v1-GGUF), [Jina](https://huggingface.co/models?search=gpustack/jina-embeddings), [Qwen3-Embedding](https://huggingface.co/models?search=qwen/qwen3-embedding) |
| **Reranker Models** | [BGE](https://huggingface.co/gpustack/bge-reranker-v2-m3-GGUF), [BCE](https://huggingface.co/gpustack/bce-reranker-base_v1-GGUF), [Jina](https://huggingface.co/models?search=gpustack/jina-reranker), [Qwen3-Reranker](https://huggingface.co/models?search=qwen/qwen3-reranker) |
| **Audio Models** | [Whisper](https://huggingface.co/models?search=Systran/faster) (Speech-to-Text), [CosyVoice](https://huggingface.co/models?search=FunAudioLLM/CosyVoice) (Text-to-Speech) |
For full list of supported models, please refer to the supported models section in the [inference backends](https://docs.gpustack.ai/latest/user-guide/inference-backends/) documentation.
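As a follow-up to the API example above: because the endpoints are OpenAI-compatible, listing the currently deployed models should work the same way. A minimal sketch, assuming the standard `/v1/models` route is exposed and reusing the placeholder key and server URL from the curl example:

```bash
# List deployed models through the OpenAI-compatible API
curl http://your_gpustack_server_url/v1/models \
  -H "Authorization: Bearer $GPUSTACK_API_KEY"
```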

@ -10,7 +10,7 @@
<img alt="Documentation" src="https://img.shields.io/badge/文档-GPUStack-blue?logo=readthedocs&logoColor=white"></a>
<a href="./LICENSE" target="_blank">
<img alt="License" src="https://img.shields.io/github/license/gpustack/gpustack?logo=github&logoColor=white&label=License&color=blue"></a>
<a href="./docs/assets/wechat-assistant.png" target="_blank">
<a href="./docs/assets/wechat-group-qrcode.jpg" target="_blank">
<img alt="WeChat" src="https://img.shields.io/badge/微信群-GPUStack-blue?logo=wechat&logoColor=white"></a>
<a href="https://discord.gg/VXYJzuaqwD" target="_blank">
<img alt="Discord" src="https://img.shields.io/badge/Discord-GPUStack-blue?logo=discord&logoColor=white"></a>
@ -35,7 +35,7 @@ GPUStack 是一个用于运行 AI 模型的开源 GPU 集群管理器。
- **广泛的 GPU 兼容性**:无缝支持 Apple Mac、Windows PC 和 Linux 服务器上各种供应商的 GPU。
- **广泛的模型支持**:支持各种模型,包括 LLM、多模态 VLM、图像模型、语音模型、文本嵌入模型和重排序模型。
- **灵活的推理后端**：支持与 llama-box（llama.cpp 和 stable-diffusion.cpp）、vox-box、vLLM 和 Ascend MindIE 等多种推理后端的灵活集成。
- **灵活的推理后端**：支持与 vLLM、Ascend MindIE、llama-box（llama.cpp 和 stable-diffusion.cpp）和 vox-box 等多种推理后端的灵活集成。
- **多版本后端支持**:同时运行推理后端的多个版本,以满足不同模型的不同运行依赖。
- **分布式推理**:支持单机和多机多卡并行推理,包括跨供应商和运行环境的异构 GPU。
- **可扩展的 GPU 架构**:通过向基础设施添加更多 GPU 或节点轻松进行扩展。
@ -50,84 +50,71 @@ GPUStack 是一个用于运行 AI 模型的开源 GPU 集群管理器。
## 安装
### Linux 或 macOS
### Linux
GPUStack 提供了安装脚本，可以将其安装为 Linux 的 systemd 服务或 macOS 的 launchd 服务，默认端口为 80。要使用此方法安装 GPUStack，执行以下命令：
如果你是 NVIDIA GPU 环境，请确保 [Docker](https://docs.docker.com/engine/install/) 和 [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) 都已经在系统中安装。然后，执行如下命令启动 GPUStack：
```bash
curl -sfL https://get.gpustack.ai | INSTALL_INDEX_URL=https://pypi.tuna.tsinghua.edu.cn/simple sh -s -
docker run -d --name gpustack \
--restart=unless-stopped \
--gpus all \
--network=host \
--ipc=host \
-v gpustack-data:/var/lib/gpustack \
gpustack/gpustack
```
### Windows
有关其它平台的安装或详细配置选项,请参考[安装文档](docs/installation/installation-requirements.md).
以管理员身份运行 PowerShell（**避免**使用 PowerShell ISE），然后执行以下命令安装 GPUStack：
```powershell
$env:INSTALL_INDEX_URL = "https://pypi.tuna.tsinghua.edu.cn/simple"
Invoke-Expression (Invoke-WebRequest -Uri "https://get.gpustack.ai" -UseBasicParsing).Content
```
### 其他安装方式
有关 pip 安装、Docker 安装或详细配置选项,请参考[安装文档](https://docs.gpustack.ai/latest/installation/installation-requirements/)。
## 新手入门
1. 在命令行运行 **llama3.2** 模型并进行对话:
容器正常运行后,执行以下命令获取默认密码:
```bash
gpustack chat llama3.2 "tell me a joke."
docker exec gpustack cat /var/lib/gpustack/initial_admin_password
```
2. 运行 **stable-diffusion-v3-5-large-turbo** 模型并生成图像:
在浏览器中打开 `http://your_host_ip`,访问 GPUStack 界面。使用 `admin` 用户名和默认密码登录 GPUStack。
> ### 💡 Tip
>
> 此命令将从 Hugging Face 下载模型（约 12GB）。下载时间取决于你的网络速度。确保你有足够的磁盘空间和 VRAM（12GB）来运行模型。如果遇到问题，你可以跳过此步骤并转到下一步。
### macOS & Windows
```bash
gpustack draw hf.co/gpustack/stable-diffusion-v3-5-large-turbo-GGUF:stable-diffusion-v3-5-large-turbo-Q4_0.gguf \
"A minion holding a sign that says 'GPUStack'. The background is filled with futuristic elements like neon lights, circuit boards, and holographic displays. The minion is wearing a tech-themed outfit, possibly with LED lights or digital patterns. The sign itself has a sleek, modern design with glowing edges. The overall atmosphere is high-tech and vibrant, with a mix of dark and neon colors." \
--sample-steps 5 --show
```
对于 macOS 和 Windows我们提供了桌面安装程序。请参阅[文档](https://docs.gpustack.ai/latest/installation/desktop-installer/)了解安装细节。
命令完成后,生成的图像将出现在默认查看器中。你可以尝试修改 prompt 和 CLI 参数来定制输出。
## 部署模型
![Generated Image](https://raw.githubusercontent.com/gpustack/gpustack/main/docs/assets/quickstart-minion.png)
1. 在 GPUStack 界面,在菜单中点击“模型库”。
3. 在浏览器中打开 `http://your_host_ip`，访问 GPUStack 界面。使用“admin”用户名和默认密码登录 GPUStack。可以执行以下命令获取默认密码：
2. 从模型列表中选择 `Qwen3` 模型。
**Linux 或 macOS**
3. 在部署兼容性检查通过之后,选择保存部署模型。
```bash
cat /var/lib/gpustack/initial_admin_password
```
![deploy qwen3 from catalog](docs/assets/quick-start/quick-start-qwen3.png)
**Windows**
4. GPUStack 将开始下载模型文件并部署模型。当部署状态显示为 `Running` 时,表示模型已成功部署。
```powershell
Get-Content -Path "$env:APPDATA\gpustack\initial_admin_password" -Raw
```
![model is running](docs/assets/quick-start/model-running.png)
5. 点击菜单中的“试验场 - 对话”,在右上方模型菜单中选择模型 `qwen3`。现在你可以在试验场中与 LLM 进行对话。
4. 在菜单中点击“试验场 - 对话”,现在你可以在试验场中与 LLM 进行对话。
![quick chat](docs/assets/quick-start/quick-chat.png)
![Playground Screenshot](https://raw.githubusercontent.com/gpustack/gpustack/main/docs/assets/playground-screenshot.png)
## 通过 API 使用模型
5. 在菜单中点击“API 秘钥”,然后点击“新建 API 秘钥”按钮。
1. 将鼠标移动到右下角的用户头像上选择“API 密钥”,然后点击“新建 API 秘钥”按钮。
6. 填写“名称”,然后点击“保存”按钮。
2. 填写“名称”,然后点击“保存”按钮。
7. 复制生成的 API 密钥并将其保存。请注意,秘钥只在创建时可见。
3. 复制生成的 API 密钥并将其保存。请注意,秘钥只在创建时可见。
8. 现在你可以使用 API 密钥访问 OpenAI 兼容 API。例如，curl 的用法如下：
4. 现在你可以使用 API 密钥访问 OpenAI 兼容 API。例如，curl 的用法如下：
```bash
# Replace `your_api_key` and `your_gpustack_server_url`
# with your actual API key and GPUStack server URL.
export GPUSTACK_API_KEY=your_api_key
curl http://your_gpustack_server_url/v1-openai/chat/completions \
curl http://your_gpustack_server_url/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer $GPUSTACK_API_KEY" \
-d '{
"model": "llama3.2",
"model": "qwen3",
"messages": [
{
"role": "system",
@ -135,7 +122,7 @@ curl http://your_gpustack_server_url/v1-openai/chat/completions \
},
{
"role": "user",
"content": "Hello!"
"content": "Tell me a joke."
}
],
"stream": true
@ -144,8 +131,8 @@ curl http://your_gpustack_server_url/v1-openai/chat/completions \
## 平台支持
- [x] macOS
- [x] Linux
- [x] macOS
- [x] Windows
## 加速框架支持
@ -157,15 +144,11 @@ curl http://your_gpustack_server_url/v1-openai/chat/completions \
- [x] 海光 DTK
- [x] 摩尔线程 MUSA
- [x] 天数智芯 Corex
我们计划在未来的版本中支持以下加速框架:
- [ ] Intel oneAPI
- [ ] Qualcomm AI Engine
- [x] 寒武纪 MLU
## 模型支持
GPUStack 使用 [llama-box](https://github.com/gpustack/llama-box)（基于 [llama.cpp](https://github.com/ggml-org/llama.cpp) 和 [stable-diffusion.cpp](https://github.com/leejet/stable-diffusion.cpp)）、[vLLM](https://github.com/vllm-project/vllm)、[Ascend MindIE](https://www.hiascend.com/en/software/mindie) 和 [vox-box](https://github.com/gpustack/vox-box) 作为后端并提供广泛的模型支持。支持从以下来源部署模型：
GPUStack 使用 [vLLM](https://github.com/vllm-project/vllm)、[Ascend MindIE](https://www.hiascend.com/en/software/mindie)、[llama-box](https://github.com/gpustack/llama-box)（基于 [llama.cpp](https://github.com/ggml-org/llama.cpp) 和 [stable-diffusion.cpp](https://github.com/leejet/stable-diffusion.cpp)）和 [vox-box](https://github.com/gpustack/vox-box) 作为后端并提供广泛的模型支持。支持从以下来源部署模型：
1. [Hugging Face](https://huggingface.co/)
@ -175,14 +158,14 @@ GPUStack 使用 [llama-box](https://github.com/gpustack/llama-box)(基于 [lla
### 示例模型
| **类别** | **模型** |
| ---------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| **大语言模型LLM** | [Qwen](https://huggingface.co/models?search=Qwen/Qwen), [LLaMA](https://huggingface.co/meta-llama), [Mistral](https://huggingface.co/mistralai), [DeepSeek](https://huggingface.co/models?search=deepseek-ai/deepseek), [Phi](https://huggingface.co/models?search=microsoft/phi), [Gemma](https://huggingface.co/models?search=Google/gemma) |
| **多模态模型VLM** | [Llama3.2-Vision](https://huggingface.co/models?pipeline_tag=image-text-to-text&search=llama3.2), [Pixtral](https://huggingface.co/models?search=pixtral) , [Qwen2.5-VL](https://huggingface.co/models?search=Qwen/Qwen2.5-VL), [LLaVA](https://huggingface.co/models?search=llava), [InternVL2.5](https://huggingface.co/models?search=internvl2_5) |
| **Diffusion 扩散模型** | [Stable Diffusion](https://huggingface.co/models?search=gpustack/stable-diffusion), [FLUX](https://huggingface.co/models?search=gpustack/flux) |
| **Embedding 模型** | [BGE](https://huggingface.co/gpustack/bge-m3-GGUF), [BCE](https://huggingface.co/gpustack/bce-embedding-base_v1-GGUF), [Jina](https://huggingface.co/models?search=gpustack/jina-embeddings) |
| **Reranker 模型** | [BGE](https://huggingface.co/gpustack/bge-reranker-v2-m3-GGUF), [BCE](https://huggingface.co/gpustack/bce-reranker-base_v1-GGUF), [Jina](https://huggingface.co/models?search=gpustack/jina-reranker) |
| **语音模型** | [Whisper](https://huggingface.co/models?search=Systran/faster) (Speech-to-Text), [CosyVoice](https://huggingface.co/models?search=FunAudioLLM/CosyVoice) (Text-to-Speech) |
| **类别** | **模型** |
| ---------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| **大语言模型LLM** | [Qwen](https://huggingface.co/models?search=Qwen/Qwen), [LLaMA](https://huggingface.co/meta-llama), [Mistral](https://huggingface.co/mistralai), [DeepSeek](https://huggingface.co/models?search=deepseek-ai/deepseek), [Phi](https://huggingface.co/models?search=microsoft/phi), [Gemma](https://huggingface.co/models?search=Google/gemma) |
| **多模态模型VLM** | [Llama3.2-Vision](https://huggingface.co/models?pipeline_tag=image-text-to-text&search=llama3.2), [Pixtral](https://huggingface.co/models?search=pixtral) , [Qwen2.5-VL](https://huggingface.co/models?search=Qwen/Qwen2.5-VL), [LLaVA](https://huggingface.co/models?search=llava), [InternVL3](https://huggingface.co/models?search=internvl3) |
| **Diffusion 扩散模型** | [Stable Diffusion](https://huggingface.co/models?search=gpustack/stable-diffusion), [FLUX](https://huggingface.co/models?search=gpustack/flux) |
| **Embedding 模型** | [BGE](https://huggingface.co/gpustack/bge-m3-GGUF), [BCE](https://huggingface.co/gpustack/bce-embedding-base_v1-GGUF), [Jina](https://huggingface.co/models?search=gpustack/jina-embeddings), [Qwen3-Embedding](https://huggingface.co/models?search=qwen/qwen3-embedding) |
| **Reranker 模型** | [BGE](https://huggingface.co/gpustack/bge-reranker-v2-m3-GGUF), [BCE](https://huggingface.co/gpustack/bce-reranker-base_v1-GGUF), [Jina](https://huggingface.co/models?search=gpustack/jina-reranker), [Qwen3-Reranker](https://huggingface.co/models?search=qwen/qwen3-reranker) |
| **语音模型** | [Whisper](https://huggingface.co/models?search=Systran/faster) (Speech-to-Text), [CosyVoice](https://huggingface.co/models?search=FunAudioLLM/CosyVoice) (Text-to-Speech) |
有关支持模型的完整列表,请参阅 [inference backends](https://docs.gpustack.ai/latest/user-guide/inference-backends/) 文档中的 Supported Models 部分。
@ -236,20 +219,20 @@ GPUStack 用户可以在 UI 中生成自己的 API 密钥。
## 加入社区
扫码添加 GPUStack 微信小助手加入社区群:
扫码加入社区群:
<p align="left">
<img alt="Wechat-assistant" src="./docs/assets/wechat-assistant.png" width="300px"/>
<img alt="Wechat-group" src="./docs/assets/wechat-group-qrcode.jpg" width="300px"/>
</p>
## License
版权所有 (c) 2024 GPUStack 作者
本项目基于 Apache-2.0 许可证(以下简称“许可证”)授权。
您只能在遵守许可证条款的前提下使用本项目。
本项目基于 Apache-2.0 许可证(以下简称“许可证”)授权。
您只能在遵守许可证条款的前提下使用本项目。
许可证的完整内容请参阅 [LICENSE](./LICENSE) 文件。
除非适用法律另有规定或双方另有书面约定,依据许可证分发的软件按“原样”提供,
除非适用法律另有规定或双方另有书面约定,依据许可证分发的软件按“原样”提供,
不附带任何明示或暗示的保证或条件。
有关许可证规定的具体权利和限制,请参阅许可证了解更多详细信息。

@ -10,7 +10,7 @@
<img alt="Documentation" src="https://img.shields.io/badge/ドキュメント-GPUStack-blue?logo=readthedocs&logoColor=white"></a>
<a href="./LICENSE" target="_blank">
<img alt="License" src="https://img.shields.io/github/license/gpustack/gpustack?logo=github&logoColor=white&label=License&color=blue"></a>
<a href="./docs/assets/wechat-assistant.png" target="_blank">
<a href="./docs/assets/wechat-group-qrcode.jpg" target="_blank">
<img alt="WeChat" src="https://img.shields.io/badge/微信群-GPUStack-blue?logo=wechat&logoColor=white"></a>
<a href="https://discord.gg/VXYJzuaqwD" target="_blank">
<img alt="Discord" src="https://img.shields.io/badge/Discord-GPUStack-blue?logo=discord&logoColor=white"></a>
@ -35,7 +35,7 @@ GPUStack は、AI モデルを実行するためのオープンソース GPU ク
- **幅広い GPU 互換性:** Apple Mac、Windows PC、Linux サーバー上のさまざまなベンダーの GPU をシームレスにサポート。
- **豊富なモデルサポート:** LLM、VLM、画像モデル、音声モデル、埋め込みモデル、リランクモデルを含む幅広いモデルをサポート。
- **柔軟な推論バックエンド:** llama-box（llama.cpp と stable-diffusion.cpp）、vox-box、vLLM、Ascend MindIE と統合。
- **柔軟な推論バックエンド:** vLLM、Ascend MindIE、llama-box（llama.cpp と stable-diffusion.cpp）、vox-box と統合。
- **マルチバージョンバックエンドサポート:** 異なるモデルの多様なランタイム要件を満たすために、推論バックエンドの複数バージョンを同時実行。
- **分散推論:** ベンダーやランタイム環境をまたぐ異種 GPU を含む、シングルノードおよびマルチノードのマルチ GPU 推論をサポート。
- **スケーラブルな GPU アーキテクチャ:** インフラストラクチャに GPU やノードを追加することで簡単にスケールアップ。
@ -50,83 +50,71 @@ GPUStack は、AI モデルを実行するためのオープンソース GPU ク
## インストール
### Linux または macOS
### Linux
GPUStack は、systemd または launchd ベースのシステムでサービスとしてインストールするスクリプトを提供しており、デフォルトポートは 80 です。この方法で GPUStack をインストールするには、以下を実行します:
NVIDIA GPU を使用している場合は、Docker と NVIDIA Container Toolkit をインストールしてください。その後、以下のコマンドで GPUStack サーバーを起動します:
```bash
curl -sfL https://get.gpustack.ai | sh -s -
docker run -d --name gpustack \
--restart=unless-stopped \
--gpus all \
--network=host \
--ipc=host \
-v gpustack-data:/var/lib/gpustack \
gpustack/gpustack
```
### Windows
詳細なインストール手順やその他の GPU ハードウェアプラットフォームについては、[インストールドキュメント](docs/installation/installation-requirements.md)を参照してください。
管理者として PowerShell を実行しPowerShell ISE の使用は**避けてください**)、以下のコマンドを実行して GPUStack をインストールします:
```powershell
Invoke-Expression (Invoke-WebRequest -Uri "https://get.gpustack.ai" -UseBasicParsing).Content
```
### Other Installation Methods
For manual installation, Docker installation, or detailed configuration options, refer to the [installation documentation](https://docs.gpustack.ai/latest/installation/installation-script/).
## Getting Started
1. Run and chat with the **llama3.2** model:
After the server starts, you can retrieve the default admin password with the following command:
```bash
gpustack chat llama3.2 "tell me a joke."
cat /var/lib/gpustack/initial_admin_password
```
2. Generate an image with the **stable-diffusion-v3-5-large-turbo** model:
Open http://your_host_ip in your browser and log in with the username admin and the password you retrieved.
> ### 💡 Tip
>
> This command downloads the model (about 12GB) from Hugging Face, so download time depends on your network speed. Make sure you have enough disk space and VRAM (12GB) to run the model. If you run into problems, you can skip this step and move on to the next one.
### macOS & Windows
```bash
gpustack draw hf.co/gpustack/stable-diffusion-v3-5-large-turbo-GGUF:stable-diffusion-v3-5-large-turbo-Q4_0.gguf \
"A minion holding a sign that says 'GPUStack'. The background is filled with futuristic elements like neon lights, circuit boards, and holographic displays. The minion is wearing a tech-themed outfit, possibly with LED lights or digital patterns. The sign itself has a sleek, modern design with glowing edges. The overall atmosphere is high-tech and vibrant, with a mix of dark and neon colors." \
--sample-steps 5 --show
```
Desktop installers are available for macOS and Windows. See the [documentation](https://docs.gpustack.ai/latest/installation/desktop-installer/) for installation details.
When the command completes, the generated image is shown in your default viewer. You can experiment with the prompt and CLI options to customize the output.
## Deploying a Model
![Generated Image](https://raw.githubusercontent.com/gpustack/gpustack/main/docs/assets/quickstart-minion.png)
1. Navigate to the Catalog page in the GPUStack UI.
3. Open `http://your_host_ip` in your browser to access the GPUStack UI. Log in to GPUStack with the username `admin` and the default password. To retrieve the password for the default setup, run the following command:
2. Select the Qwen3 model from the model list.
**Linux or macOS**
3. After the deployment compatibility check completes, click the Save button to deploy.
```bash
cat /var/lib/gpustack/initial_admin_password
```
![deploy qwen3 from catalog](docs/assets/quick-start/quick-start-qwen3.png)
**Windows**
4. The model download and deployment will start. When the status turns to Running, the deployment has succeeded.
```powershell
Get-Content -Path "$env:APPDATA\gpustack\initial_admin_password" -Raw
```
![model is running](docs/assets/quick-start/model-running.png)
5. Select Playground - Chat from the navigation menu, make sure qwen3 is selected in the Model dropdown at the top right, and start chatting.
4. Click `Playground - Chat` in the navigation menu. You can now chat with the LLM in the UI playground.
![quick chat](docs/assets/quick-start/quick-chat.png)
![Playground Screenshot](https://raw.githubusercontent.com/gpustack/gpustack/main/docs/assets/playground-screenshot.png)
## Using the Model via the API
5. Click `API Keys` in the navigation menu, then click the `New API Key` button.
1. Hover over the user avatar, navigate to the API Keys page, and click New API Key.
6. Enter a `Name` and click the `Save` button.
2. Enter a Name and click Save.
7. Copy the generated API key and store it somewhere safe. Note that it is only shown once, at creation time.
3. Copy the generated API key and keep it somewhere safe (it is only shown once).
8. You can now use the API key to access the OpenAI-compatible API. For example, with curl:
4. You can now access the OpenAI-compatible endpoints. For example:
```bash
# Replace `your_api_key` and `your_gpustack_server_url`
# with your actual API key and GPUStack server URL.
export GPUSTACK_API_KEY=your_api_key
curl http://your_gpustack_server_url/v1-openai/chat/completions \
curl http://your_gpustack_server_url/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer $GPUSTACK_API_KEY" \
-d '{
"model": "llama3.2",
"model": "qwen3",
"messages": [
{
"role": "system",
@ -134,7 +122,7 @@ curl http://your_gpustack_server_url/v1-openai/chat/completions \
},
{
"role": "user",
"content": "Hello!"
"content": "Tell me a joke."
}
],
"stream": true
@ -143,8 +131,8 @@ curl http://your_gpustack_server_url/v1-openai/chat/completions \
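The same endpoint also works with any OpenAI-compatible client library. Below is a minimal sketch using the official `openai` Python package; the server URL, API key, and model name (`qwen3`) are placeholders matching the curl example above, not values from this repository:

```python
from openai import OpenAI

# Placeholders: replace with your GPUStack server URL and API key.
client = OpenAI(
    base_url="http://your_gpustack_server_url/v1",
    api_key="your_api_key",
)

# Stream a chat completion from the deployed qwen3 model.
stream = client.chat.completions.create(
    model="qwen3",
    messages=[{"role": "user", "content": "Tell me a joke."}],
    stream=True,
)
for chunk in stream:
    if chunk.choices and chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)
print()
```

Non-streaming requests work the same way: omit `stream=True` and read `response.choices[0].message.content` instead.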
## Supported Platforms
- [x] macOS
- [x] Linux
- [x] macOS
- [x] Windows
## Supported Accelerators
@ -156,15 +144,11 @@ curl http://your_gpustack_server_url/v1-openai/chat/completions \
- [x] Hygon DTK
- [x] Moore Threads MUSA
- [x] Iluvatar Corex
The following accelerators are planned for support in future releases.
- [ ] Intel oneAPI
- [ ] Qualcomm AI Engine
- [x] Cambricon MLU
## Supported Models
GPUStack uses [llama-box](https://github.com/gpustack/llama-box) (a bundled [llama.cpp](https://github.com/ggml-org/llama.cpp) and [stable-diffusion.cpp](https://github.com/leejet/stable-diffusion.cpp) server), [vLLM](https://github.com/vllm-project/vllm), [Ascend MindIE](https://www.hiascend.com/en/software/mindie), and [vox-box](https://github.com/gpustack/vox-box) as backends and supports a wide range of models. Models from the following sources are supported:
GPUStack uses [vLLM](https://github.com/vllm-project/vllm), [Ascend MindIE](https://www.hiascend.com/en/software/mindie), [llama-box](https://github.com/gpustack/llama-box) (a bundled [llama.cpp](https://github.com/ggml-org/llama.cpp) and [stable-diffusion.cpp](https://github.com/leejet/stable-diffusion.cpp) server), and [vox-box](https://github.com/gpustack/vox-box) as backends and supports a wide range of models. Models from the following sources are supported:
1. [Hugging Face](https://huggingface.co/)
@ -172,16 +156,16 @@ GPUStack uses [llama-box](https://github.com/gpustack/llama-box) (a bundled
3. Local file paths
### Example Models
| **Category** | **Models** |
| --- | --- |
| **Large Language Models (LLMs)** | [Qwen](https://huggingface.co/models?search=Qwen/Qwen), [LLaMA](https://huggingface.co/meta-llama), [Mistral](https://huggingface.co/mistralai), [DeepSeek](https://huggingface.co/models?search=deepseek-ai/deepseek), [Phi](https://huggingface.co/models?search=microsoft/phi), [Gemma](https://huggingface.co/models?search=Google/gemma) |
| **Vision Language Models (VLMs)** | [Llama3.2-Vision](https://huggingface.co/models?pipeline_tag=image-text-to-text&search=llama3.2), [Pixtral](https://huggingface.co/models?search=pixtral), [Qwen2.5-VL](https://huggingface.co/models?search=Qwen/Qwen2.5-VL), [LLaVA](https://huggingface.co/models?search=llava), [InternVL2.5](https://huggingface.co/models?search=internvl2_5) |
| **Diffusion Models** | [Stable Diffusion](https://huggingface.co/models?search=gpustack/stable-diffusion), [FLUX](https://huggingface.co/models?search=gpustack/flux) |
| **Embedding Models** | [BGE](https://huggingface.co/gpustack/bge-m3-GGUF), [BCE](https://huggingface.co/gpustack/bce-embedding-base_v1-GGUF), [Jina](https://huggingface.co/models?search=gpustack/jina-embeddings) |
| **Reranker Models** | [BGE](https://huggingface.co/gpustack/bge-reranker-v2-m3-GGUF), [BCE](https://huggingface.co/gpustack/bce-reranker-base_v1-GGUF), [Jina](https://huggingface.co/models?search=gpustack/jina-reranker) |
| **Audio Models** | [Whisper](https://huggingface.co/models?search=Systran/faster) (speech-to-text), [CosyVoice](https://huggingface.co/models?search=FunAudioLLM/CosyVoice) (text-to-speech) |
| **Category** | **Models** |
| --- | --- |
| **Large Language Models (LLMs)** | [Qwen](https://huggingface.co/models?search=Qwen/Qwen), [LLaMA](https://huggingface.co/meta-llama), [Mistral](https://huggingface.co/mistralai), [DeepSeek](https://huggingface.co/models?search=deepseek-ai/deepseek), [Phi](https://huggingface.co/models?search=microsoft/phi), [Gemma](https://huggingface.co/models?search=Google/gemma) |
| **Vision Language Models (VLMs)** | [Llama3.2-Vision](https://huggingface.co/models?pipeline_tag=image-text-to-text&search=llama3.2), [Pixtral](https://huggingface.co/models?search=pixtral), [Qwen2.5-VL](https://huggingface.co/models?search=Qwen/Qwen2.5-VL), [LLaVA](https://huggingface.co/models?search=llava), [InternVL3](https://huggingface.co/models?search=internvl3) |
| **Diffusion Models** | [Stable Diffusion](https://huggingface.co/models?search=gpustack/stable-diffusion), [FLUX](https://huggingface.co/models?search=gpustack/flux) |
| **Embedding Models** | [BGE](https://huggingface.co/gpustack/bge-m3-GGUF), [BCE](https://huggingface.co/gpustack/bce-embedding-base_v1-GGUF), [Jina](https://huggingface.co/models?search=gpustack/jina-embeddings), [Qwen3-Embedding](https://huggingface.co/models?search=qwen/qwen3-embedding) |
| **Reranker Models** | [BGE](https://huggingface.co/gpustack/bge-reranker-v2-m3-GGUF), [BCE](https://huggingface.co/gpustack/bce-reranker-base_v1-GGUF), [Jina](https://huggingface.co/models?search=gpustack/jina-reranker), [Qwen3-Reranker](https://huggingface.co/models?search=qwen/qwen3-reranker) |
| **Audio Models** | [Whisper](https://huggingface.co/models?search=Systran/faster) (speech-to-text), [CosyVoice](https://huggingface.co/models?search=FunAudioLLM/CosyVoice) (text-to-speech) |
For a complete list of supported models, see the Supported Models section of the [inference backends](https://docs.gpustack.ai/latest/user-guide/inference-backends/) documentation.

@ -1,346 +0,0 @@
import asyncio
import time
import httpx
import numpy
import logging
import argparse
import json
import random
from openai import AsyncOpenAI
logging.basicConfig(
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
# Avoid client side connection error: https://github.com/encode/httpx/discussions/3084
http_client = httpx.AsyncClient(
limits=httpx.Limits(
max_connections=10000, max_keepalive_connections=10000, keepalive_expiry=30
)
)
SAMPLE_PROMPTS = [
"Explain how blockchain technology works, and provide a real-world example of its application outside of cryptocurrency.",
"Compare and contrast the philosophies of Nietzsche and Kant, including their views on morality and human nature.",
"Imagine you're a travel blogger. Write a detailed post describing a week-long adventure through rural Japan.",
"Write a fictional letter from Albert Einstein to a modern-day physicist, discussing the current state of quantum mechanics.",
"Provide a comprehensive explanation of how transformers work in machine learning, including attention mechanisms and positional encoding.",
"Draft a business proposal for launching a new AI-powered productivity app, including target audience, key features, and a monetization strategy.",
"Simulate a panel discussion between Elon Musk, Marie Curie, and Sun Tzu on the topic of 'Leadership in Times of Crisis'.",
"Describe the process of photosynthesis in depth, and explain its importance in the global carbon cycle.",
"Analyze the impact of social media on political polarization, citing relevant studies or historical examples.",
"Write a short science fiction story where humans discover a parallel universe that operates under different physical laws.",
"Explain the role of the Federal Reserve in the U.S. economy and how it manages inflation and unemployment.",
"Describe the architecture of a modern web application, from frontend to backend, including databases, APIs, and deployment.",
"Write an essay discussing whether artificial general intelligence (AGI) poses an existential threat to humanity.",
"Summarize the key events and consequences of the Cuban Missile Crisis, and reflect on lessons for modern diplomacy.",
"Create a guide for beginners on how to train a custom LLM using open-source tools and publicly available datasets.",
]
async def process_stream(stream):
first_token_time = None
total_tokens = 0
async for chunk in stream:
if first_token_time is None:
first_token_time = time.time()
if chunk.choices[0].delta.content:
total_tokens += 1
if chunk.choices[0].finish_reason is not None:
break
return first_token_time, total_tokens
async def make_request(
client: AsyncOpenAI, model, max_completion_tokens, request_timeout
):
start_time = time.time()
content = random.choice(SAMPLE_PROMPTS)
try:
stream = await client.chat.completions.create(
model=model,
messages=[{"role": "user", "content": content}],
max_completion_tokens=max_completion_tokens,
stream=True,
)
first_token_time, total_tokens = await asyncio.wait_for(
process_stream(stream), timeout=request_timeout
)
end_time = time.time()
elapsed_time = end_time - start_time
ttft = first_token_time - start_time if first_token_time else None
tokens_per_second = total_tokens / elapsed_time if elapsed_time > 0 else 0
return total_tokens, elapsed_time, tokens_per_second, ttft
except asyncio.TimeoutError:
logging.warning(f"Request timed out after {request_timeout} seconds")
return None
except Exception as e:
logging.error(f"Error during request: {str(e)}")
return None
async def worker(
client,
model,
semaphore,
queue,
results,
max_completion_tokens,
request_timeout,
):
while True:
async with semaphore:
task_id = await queue.get()
if task_id is None:
queue.task_done()
break
logging.info(f"Starting request {task_id}")
result = await make_request(
client, model, max_completion_tokens, request_timeout
)
if result:
results.append(result)
else:
logging.warning(f"Request {task_id} failed")
queue.task_done()
logging.info(f"Finished request {task_id}")
def calculate_percentile(values, percentile, reverse=False):
if not values:
return None
if reverse:
return numpy.percentile(values, 100 - percentile)
return numpy.percentile(values, percentile)
async def preflight_check(client, model) -> bool:
result = await make_request(client, model, 16, 60)
return result is not None
async def main(
model,
num_requests,
concurrency,
request_timeout,
max_completion_tokens,
server_url,
api_key,
):
client = AsyncOpenAI(
base_url=f"{server_url}/v1",
api_key=api_key,
http_client=http_client,
max_retries=0,
)
if not await preflight_check(client, model):
logging.error(
"Preflight check failed. Please check configuration and the service status."
)
return
semaphore = asyncio.Semaphore(concurrency)
queue = asyncio.Queue()
results = []
# Add tasks to the queue
for i in range(num_requests):
await queue.put(i)
# Add sentinel values to stop workers
for _ in range(concurrency):
await queue.put(None)
# Create worker tasks
workers = [
asyncio.create_task(
worker(
client,
model,
semaphore,
queue,
results,
max_completion_tokens,
request_timeout,
)
)
for _ in range(concurrency)
]
start_time = time.time()
# Wait for all tasks to complete
await queue.join()
await asyncio.gather(*workers)
end_time = time.time()
# Calculate metrics
total_elapsed_time = end_time - start_time
total_tokens = sum(tokens for tokens, _, _, _ in results if tokens is not None)
latencies = [
elapsed_time for _, elapsed_time, _, _ in results if elapsed_time is not None
]
tokens_per_second_list = [tps for _, _, tps, _ in results if tps is not None]
ttft_list = [ttft for _, _, _, ttft in results if ttft is not None]
successful_requests = len(results)
success_rate = successful_requests / num_requests if num_requests > 0 else 0
requests_per_second = (
successful_requests / total_elapsed_time if total_elapsed_time > 0 else 0
)
avg_latency = sum(latencies) / len(latencies) if latencies else 0
avg_tokens_per_second = (
sum(tokens_per_second_list) / len(tokens_per_second_list)
if tokens_per_second_list
else 0
)
overall_tokens_per_second = (
total_tokens / total_elapsed_time if total_elapsed_time > 0 else 0
)
avg_ttft = sum(ttft_list) / len(ttft_list) if ttft_list else 0
# Calculate percentiles
percentiles = [50, 95, 99]
latency_percentiles = [calculate_percentile(latencies, p) for p in percentiles]
tps_percentiles = [
calculate_percentile(tokens_per_second_list, p, reverse=True)
for p in percentiles
]
ttft_percentiles = [calculate_percentile(ttft_list, p) for p in percentiles]
return {
"model": model,
"total_requests": num_requests,
"successful_requests": successful_requests,
"success_rate": success_rate,
"concurrency": concurrency,
"request_timeout": request_timeout,
"max_completion_tokens": max_completion_tokens,
"total_time": total_elapsed_time,
"requests_per_second": requests_per_second,
"total_completion_tokens": total_tokens,
"latency": {
"average": avg_latency,
"p50": latency_percentiles[0],
"p95": latency_percentiles[1],
"p99": latency_percentiles[2],
},
"tokens_per_second": {
"overall": overall_tokens_per_second,
"average": avg_tokens_per_second,
"p50": tps_percentiles[0],
"p95": tps_percentiles[1],
"p99": tps_percentiles[2],
},
"time_to_first_token": {
"average": avg_ttft,
"p50": ttft_percentiles[0],
"p95": ttft_percentiles[1],
"p99": ttft_percentiles[2],
},
}
def output_results(results, result_file=None):
# Round all floats in results to two decimal places for output
def _round_floats(obj, ndigits=2):
if isinstance(obj, dict):
return {k: _round_floats(v, ndigits) for k, v in obj.items()}
if isinstance(obj, list):
return [_round_floats(v, ndigits) for v in obj]
if isinstance(obj, float):
return round(obj, ndigits)
return obj
formatted_results = _round_floats(results, 2)
if result_file:
with open(result_file, "w") as f:
json.dump(formatted_results, f, indent=2)
logging.info(f"Results saved to {result_file}")
else:
print(json.dumps(formatted_results, indent=2))
def set_http_client(args):
if args.headers:
for header in args.headers:
if ":" not in header:
parser.error(f"Invalid header format: {header}. Expected Key:Value")
key, value = header.split(":", 1)
http_client.headers[key.strip()] = value.strip()
http_client.timeout = args.request_timeout
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Benchmark Chat Completions API")
parser.add_argument(
"-m", "--model", type=str, required=True, help="Name of the model"
)
parser.add_argument(
"-n",
"--num-requests",
type=int,
default=100,
help="Number of requests to make (default: 100)",
)
parser.add_argument(
"-c",
"--concurrency",
type=int,
default=10,
help="Number of concurrent requests (default: 10)",
)
parser.add_argument(
"--request-timeout",
type=int,
default=300,
help="Timeout for each request in seconds (default: 300)",
)
parser.add_argument(
"--max-completion-tokens",
type=int,
default=1024,
help="Maximum number of tokens in the completion (default: 1024)",
)
parser.add_argument(
"--server-url",
type=str,
default="http://127.0.0.1",
help="URL of the GPUStack server",
)
parser.add_argument("--api-key", type=str, default="fake", help="GPUStack API key")
parser.add_argument(
"--result-file",
type=str,
help="Result file path to save benchmark json results",
)
parser.add_argument(
"-H",
"--header",
action="append",
dest="headers",
help="Custom HTTP header in Key:Value format. May be specified multiple times.",
)
args = parser.parse_args()
set_http_client(args)
results = asyncio.run(
main(
args.model,
args.num_requests,
args.concurrency,
args.request_timeout,
args.max_completion_tokens,
args.server_url,
args.api_key,
)
)
output_results(results, args.result_file)

@ -0,0 +1,654 @@
import asyncio
from dataclasses import asdict, dataclass, is_dataclass
import time
from typing import List, Optional
import aiohttp
import numpy
import logging
import argparse
import json
import random
from openai import APIConnectionError, AsyncOpenAI
from aiohttp import ClientSession
from httpx_aiohttp import AiohttpTransport
from openai import DefaultAsyncHttpxClient
from openai.types.chat import (
ChatCompletionStreamOptionsParam,
)
from tqdm import tqdm
logging.basicConfig(
level=logging.WARNING, format="%(asctime)s - %(levelname)s - %(message)s"
)
SAMPLE_PROMPTS = [
"Explain how blockchain technology works, and provide a real-world example of its application outside of cryptocurrency.",
"Compare and contrast the philosophies of Nietzsche and Kant, including their views on morality and human nature.",
"Imagine you're a travel blogger. Write a detailed post describing a week-long adventure through rural Japan.",
"Write a fictional letter from Albert Einstein to a modern-day physicist, discussing the current state of quantum mechanics.",
"Provide a comprehensive explanation of how transformers work in machine learning, including attention mechanisms and positional encoding.",
"Draft a business proposal for launching a new AI-powered productivity app, including target audience, key features, and a monetization strategy.",
"Simulate a panel discussion between Elon Musk, Marie Curie, and Sun Tzu on the topic of 'Leadership in Times of Crisis'.",
"Describe the process of photosynthesis in depth, and explain its importance in the global carbon cycle.",
"Analyze the impact of social media on political polarization, citing relevant studies or historical examples.",
"Write a short science fiction story where humans discover a parallel universe that operates under different physical laws.",
"Explain the role of the Federal Reserve in the U.S. economy and how it manages inflation and unemployment.",
"Describe the architecture of a modern web application, from frontend to backend, including databases, APIs, and deployment.",
"Write an essay discussing whether artificial general intelligence (AGI) poses an existential threat to humanity.",
"Summarize the key events and consequences of the Cuban Missile Crisis, and reflect on lessons for modern diplomacy.",
"Create a guide for beginners on how to train a custom LLM using open-source tools and publicly available datasets.",
]
@dataclass
class PercentileResults:
average: float
p50: float
p95: float
p99: float
@dataclass
class BenchmarkResults:
model: str
total_requests: int
successful_requests: int
success_rate: float
concurrency: int
request_timeout: int
max_completion_tokens: int
total_time: float
requests_per_second: float
total_tokens: int
total_prompt_tokens: int
total_completion_tokens: int
total_tokens_per_second: float
total_prompt_tokens_per_second: float
total_completion_tokens_per_second: float
latency: PercentileResults
completion_tokens_per_second: PercentileResults
time_to_first_token: PercentileResults
async def process_stream(stream):
first_token_time = None
async for chunk in stream:
if first_token_time is None:
first_token_time = time.time()
if chunk.usage:
return first_token_time, chunk.usage
return first_token_time, None
def get_random_prompt(prompt_multiplier):
"""
Returns a random prompt from the SAMPLE_PROMPTS list, repeated prompt_multiplier times.
"""
# Add a random prefix to avoid prefix cache hits
random_prefix = str(random.randint(100000, 999999))
return (
random_prefix + " " + (random.choice(SAMPLE_PROMPTS) + " ") * prompt_multiplier
)
async def make_chat_completion_request(
client: AsyncOpenAI,
model,
max_completion_tokens,
ignore_eos,
request_timeout,
prompt_multiplier,
):
start_time = time.time()
content = get_random_prompt(prompt_multiplier)
try:
stream = await client.chat.completions.create(
model=model,
messages=[{"role": "user", "content": content}],
max_completion_tokens=max_completion_tokens,
stream=True,
stream_options=ChatCompletionStreamOptionsParam(include_usage=True),
extra_body={"ignore_eos": ignore_eos} if ignore_eos else None,
)
first_token_time, usage = await asyncio.wait_for(
process_stream(stream), timeout=request_timeout
)
end_time = time.time()
elapsed_time = end_time - start_time
ttft = (first_token_time - start_time) * 1000 if first_token_time else None
return usage, elapsed_time, ttft
except asyncio.TimeoutError:
logging.warning(f"Request timed out after {request_timeout} seconds")
return None
except APIConnectionError as e:
logging.error(f"API connection error: {str(e)}")
return None
except Exception as e:
logging.error(f"Error during request: {str(e)}")
return None
async def make_embedding_request(
client: AsyncOpenAI,
model,
request_timeout,
prompt_multiplier=1,
):
start_time = time.time()
content = get_random_prompt(prompt_multiplier)
try:
response = await asyncio.wait_for(
client.embeddings.create(model=model, input=content),
timeout=request_timeout,
)
end_time = time.time()
elapsed_time = end_time - start_time
ttft = None # Embeddings do not have a time to first token in the same way as chat completions
return response.usage, elapsed_time, ttft
except asyncio.TimeoutError:
logging.warning(f"Embedding request timed out after {request_timeout} seconds")
return None
except Exception as e:
logging.error(f"Error during embedding request: {str(e)}")
return None
async def worker(
client,
model,
semaphore,
queue,
results,
max_completion_tokens,
ignore_eos,
request_timeout,
embeddings=False,
prompt_multiplier=1,
pbar=None,
):
while True:
async with semaphore:
task_id = await queue.get()
if task_id is None:
queue.task_done()
break
logging.debug(f"Starting request {task_id}")
if embeddings:
result = await make_embedding_request(
client, model, request_timeout, prompt_multiplier
)
else:
result = await make_chat_completion_request(
client,
model,
max_completion_tokens,
ignore_eos,
request_timeout,
prompt_multiplier,
)
if result:
results.append(result)
else:
logging.warning(f"Request {task_id} failed")
queue.task_done()
if pbar:
pbar.update(1)
logging.debug(f"Finished request {task_id}")
def calculate_percentile(values, percentile, reverse=False):
if not values:
return None
if reverse:
return numpy.percentile(values, 100 - percentile)
return numpy.percentile(values, percentile)
async def preflight_check(client, model, embeddings=False) -> bool:
if embeddings:
result = await make_embedding_request(client, model, 16)
else:
result = await make_chat_completion_request(client, model, 16, False, 60, 1)
return result is not None
def set_headers(aiohttp_session: ClientSession, headers: Optional[List[str]]):
if headers:
for header in headers:
if ":" not in header:
raise ValueError(f"Invalid header format: {header}. Expected Key:Value")
key, value = header.split(":", 1)
aiohttp_session.headers[key.strip()] = value.strip()
async def main(
model,
num_requests,
concurrency,
request_timeout,
max_completion_tokens,
ignore_eos,
server_url,
api_key,
headers=None,
embeddings=False,
prompt_multiplier=1,
) -> Optional[BenchmarkResults]:
connector = aiohttp.TCPConnector(
limit=2000,
force_close=True,
)
async with ClientSession(connector=connector, trust_env=True) as aiohttp_session:
if headers:
set_headers(aiohttp_session, headers)
transport = AiohttpTransport(client=aiohttp_session)
httpx_client = DefaultAsyncHttpxClient(
transport=transport, timeout=request_timeout
)
client = AsyncOpenAI(
base_url=f"{server_url}/v1",
api_key=api_key,
http_client=httpx_client,
max_retries=0,
)
if not await preflight_check(client, model, embeddings=embeddings):
raise Exception(
"Preflight check failed. Please check configuration and the service status."
)
semaphore = asyncio.Semaphore(concurrency)
queue = asyncio.Queue()
results = []
# Add tasks to the queue
for i in range(num_requests):
await queue.put(i)
# Add sentinel values to stop workers
for _ in range(concurrency):
await queue.put(None)
pbar = tqdm(
total=num_requests,
desc="Running Benchmark requests",
unit="request",
dynamic_ncols=True,
)
# Create worker tasks
workers = [
asyncio.create_task(
worker(
client,
model,
semaphore,
queue,
results,
max_completion_tokens,
ignore_eos,
request_timeout,
embeddings,
prompt_multiplier,
pbar=pbar,
)
)
for _ in range(concurrency)
]
start_time = time.time()
# Wait for all tasks to complete
await queue.join()
await asyncio.gather(*workers)
end_time = time.time()
total_elapsed_time = end_time - start_time
return calculate_results(
model,
concurrency,
request_timeout,
max_completion_tokens,
total_elapsed_time,
num_requests,
results,
)
def calculate_results(
model,
concurrency,
request_timeout,
max_completion_tokens,
total_elapsed_time,
num_requests,
results,
):
# Calculate metrics
total_tokens = 0
prompt_tokens = 0
completion_tokens = 0
tokens_per_second_list = []
prompt_tokens_per_second_list = []
completion_tokens_per_second_list = []
for usage, elapsed_time, _ in results:
if usage is not None:
total_tokens += usage.total_tokens
prompt_tokens += usage.prompt_tokens
completion_tokens += usage.completion_tokens
prompt_tokens_per_second = (
usage.prompt_tokens / elapsed_time if elapsed_time > 0 else 0
)
completion_tokens_per_second = (
usage.completion_tokens / elapsed_time if elapsed_time > 0 else 0
)
tokens_per_second = (
usage.total_tokens / elapsed_time if elapsed_time > 0 else 0
)
tokens_per_second_list.append(tokens_per_second)
prompt_tokens_per_second_list.append(prompt_tokens_per_second)
completion_tokens_per_second_list.append(completion_tokens_per_second)
latencies = [
elapsed_time for _, elapsed_time, _ in results if elapsed_time is not None
]
ttft_list = [ttft for _, _, ttft in results if ttft is not None]
successful_requests = len(results)
success_rate = successful_requests / num_requests if num_requests > 0 else 0
requests_per_second = (
successful_requests / total_elapsed_time if total_elapsed_time > 0 else 0
)
avg_latency = sum(latencies) / len(latencies) if latencies else 0
avg_completion_tokens_per_second = (
sum(completion_tokens_per_second_list) / len(completion_tokens_per_second_list)
if completion_tokens_per_second_list
else 0
)
total_tokens_per_second = (
total_tokens / total_elapsed_time if total_elapsed_time > 0 else 0
)
total_prompt_tokens_per_second = (
prompt_tokens / total_elapsed_time if total_elapsed_time > 0 else 0
)
total_completion_tokens_per_second = (
completion_tokens / total_elapsed_time if total_elapsed_time > 0 else 0
)
avg_ttft = sum(ttft_list) / len(ttft_list) if ttft_list else 0
# Calculate percentiles
percentiles = [50, 95, 99]
latency_percentiles = [calculate_percentile(latencies, p) for p in percentiles]
completion_tps_percentiles = [
calculate_percentile(completion_tokens_per_second_list, p, reverse=True)
for p in percentiles
]
ttft_percentiles = [calculate_percentile(ttft_list, p) for p in percentiles]
return BenchmarkResults(
model=model,
total_requests=num_requests,
successful_requests=successful_requests,
success_rate=success_rate,
concurrency=concurrency,
request_timeout=request_timeout,
max_completion_tokens=max_completion_tokens,
total_time=total_elapsed_time,
requests_per_second=requests_per_second,
total_tokens=total_tokens,
total_prompt_tokens=prompt_tokens,
total_completion_tokens=completion_tokens,
total_tokens_per_second=total_tokens_per_second,
total_prompt_tokens_per_second=total_prompt_tokens_per_second,
total_completion_tokens_per_second=total_completion_tokens_per_second,
latency=PercentileResults(
average=avg_latency,
p50=latency_percentiles[0],
p95=latency_percentiles[1],
p99=latency_percentiles[2],
),
completion_tokens_per_second=PercentileResults(
average=avg_completion_tokens_per_second,
p50=completion_tps_percentiles[0],
p95=completion_tps_percentiles[1],
p99=completion_tps_percentiles[2],
),
time_to_first_token=PercentileResults(
average=avg_ttft,
p50=ttft_percentiles[0],
p95=ttft_percentiles[1],
p99=ttft_percentiles[2],
),
)
def fmt_line(label, *values, width=40):
label_part = f"{label:<{width}}"
value_part = " ".join(str(v) for v in values)
return f"{label_part}{value_part}"
def fmt_float(v, suffix=""):
return f"{v:.2f}{suffix}"
def output_benchmark_results_pretty(
results: BenchmarkResults, file: str = None, embeddings: bool = False
):
lines = []
lines.append("============== Serving Benchmark Result ===============")
lines.append(fmt_line("Model:", results.model))
lines.append(
fmt_line(
"Total requests:",
f"{results.successful_requests}/{results.total_requests}({results.success_rate:.2%})",
)
)
lines.append(fmt_line("Concurrency:", results.concurrency))
lines.append(fmt_line("Benchmark duration (s):", fmt_float(results.total_time)))
lines.append(
fmt_line("Request throughput (req/s):", fmt_float(results.requests_per_second))
)
lines.append(fmt_line("Total input tokens:", results.total_prompt_tokens))
if not embeddings:
lines.append(fmt_line("Total output tokens:", results.total_completion_tokens))
output_tok_per_sec = (
results.total_completion_tokens / results.total_time
if results.total_time > 0
else 0
)
total_tok_per_sec = (
results.total_tokens / results.total_time if results.total_time > 0 else 0
)
if not embeddings:
lines.append(
fmt_line("Output token throughput (tok/s):", fmt_float(output_tok_per_sec))
)
lines.append(
fmt_line("Total token throughput (tok/s):", fmt_float(total_tok_per_sec))
)
lines.append("------------------- Request Latency -------------------")
lines.append(fmt_line("Average latency (s):", fmt_float(results.latency.average)))
lines.append(fmt_line("P50 latency (s):", fmt_float(results.latency.p50)))
lines.append(fmt_line("P95 latency (s):", fmt_float(results.latency.p95)))
lines.append(fmt_line("P99 latency (s):", fmt_float(results.latency.p99)))
if not embeddings:
lines.append("--------------- Output Token Per Second ---------------")
lines.append(
fmt_line(
"Average TPS (tok/s):",
fmt_float(results.completion_tokens_per_second.average),
)
)
lines.append(
fmt_line(
"P50 TPS (tok/s):", fmt_float(results.completion_tokens_per_second.p50)
)
)
lines.append(
fmt_line(
"P95 TPS (tok/s):", fmt_float(results.completion_tokens_per_second.p95)
)
)
lines.append(
fmt_line(
"P99 TPS (tok/s):", fmt_float(results.completion_tokens_per_second.p99)
)
)
lines.append("----------------- Time to First Token -----------------")
lines.append(
fmt_line(
"Average TTFT (ms):", fmt_float(results.time_to_first_token.average)
)
)
lines.append(
fmt_line("P50 TTFT (ms):", fmt_float(results.time_to_first_token.p50))
)
lines.append(
fmt_line("P95 TTFT (ms):", fmt_float(results.time_to_first_token.p95))
)
lines.append(
fmt_line("P99 TTFT (ms):", fmt_float(results.time_to_first_token.p99))
)
lines.append("=" * 55)
output = "\n".join(lines)
if file:
with open(file, "w") as f:
f.write(output + "\n")
logging.info(f"Pretty benchmark results saved to {file}")
else:
print(output)
def output_benchmark_results_json(
results: BenchmarkResults, result_file=None, embeddings: bool = False
):
# Round all floats in results to two decimal places for output
def _round_floats(obj, ndigits=2):
if is_dataclass(obj):
obj = asdict(obj)
if isinstance(obj, dict):
return {k: _round_floats(v, ndigits) for k, v in obj.items()}
if isinstance(obj, list):
return [_round_floats(v, ndigits) for v in obj]
if isinstance(obj, float):
return round(obj, ndigits)
return obj
formatted_results = _round_floats(results, 2)
if result_file:
with open(result_file, "w") as f:
json.dump(formatted_results, f, indent=2)
logging.info(f"Results saved to {result_file}")
else:
print(json.dumps(formatted_results, indent=2))
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Benchmark Chat Completions API")
parser.add_argument(
"-m", "--model", type=str, required=True, help="Name of the model"
)
parser.add_argument(
"-n",
"--num-requests",
type=int,
default=100,
help="Number of requests to make (default: 100)",
)
parser.add_argument(
"-c",
"--concurrency",
type=int,
default=10,
help="Number of concurrent requests (default: 10)",
)
parser.add_argument(
"--request-timeout",
type=int,
default=300,
help="Timeout for each request in seconds (default: 300)",
)
parser.add_argument(
"--max-completion-tokens",
type=int,
default=1024,
help="Maximum number of tokens in the completion (default: 1024)",
)
parser.add_argument(
"--prompt-multiplier",
type=int,
default=1,
help="Repeat the randomly selected prompt N times to create longer inputs",
)
parser.add_argument(
'--ignore-eos',
action='store_true',
help='Set ignore_eos flag when sending the benchmark request. This will not stop the stream when the model generates an EOS token.',
)
parser.add_argument(
"--server-url",
type=str,
default="http://127.0.0.1",
help="URL of the GPUStack server",
)
parser.add_argument("--api-key", type=str, default="fake", help="GPUStack API key")
parser.add_argument(
"--result-file",
type=str,
help="Result file path to save benchmark json results",
)
parser.add_argument(
"-H",
"--header",
action="append",
dest="headers",
help="Custom HTTP header in Key:Value format. May be specified multiple times.",
)
parser.add_argument(
'--embeddings',
action='store_true',
help='Run embedding benchmark instead of chat completions',
)
parser.add_argument(
'--json',
action='store_true',
help='Output results in JSON format instead of pretty format',
)
args = parser.parse_args()
try:
results = asyncio.run(
main(
args.model,
args.num_requests,
args.concurrency,
args.request_timeout,
args.max_completion_tokens,
args.ignore_eos,
args.server_url,
args.api_key,
args.headers,
args.embeddings,
args.prompt_multiplier,
)
)
if args.json:
output_benchmark_results_json(
results, args.result_file, embeddings=args.embeddings
)
else:
output_benchmark_results_pretty(
results, args.result_file, embeddings=args.embeddings
)
except Exception as e:
logging.error(f"Benchmarking failed: {str(e)}")
exit(1)
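Purely as an illustration of the interface above, here is a sketch of driving the benchmark programmatically rather than through its CLI; the module name `benchmark`, the model name, the server URL, and the API key are assumptions, not part of this diff:

```python
import asyncio

# Assumes the script above is saved as benchmark.py and is importable.
from benchmark import main, output_benchmark_results_pretty

results = asyncio.run(
    main(
        model="qwen3",
        num_requests=50,
        concurrency=5,
        request_timeout=300,
        max_completion_tokens=256,
        ignore_eos=False,
        server_url="http://your_gpustack_server_url",
        api_key="your_api_key",
    )
)
output_benchmark_results_pretty(results)
```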

@ -0,0 +1,26 @@
aiohappyeyeballs==2.6.1
aiohttp==3.12.13
aiosignal==1.3.2
annotated-types==0.7.0
anyio==4.9.0
attrs==25.3.0
certifi==2025.6.15
distro==1.9.0
frozenlist==1.7.0
h11==0.16.0
httpcore==1.0.9
httpx==0.28.1
httpx-aiohttp==0.1.6
idna==3.10
jiter==0.10.0
multidict==6.5.1
numpy==2.3.1
openai==1.92.2
propcache==0.3.2
pydantic==2.11.7
pydantic_core==2.33.2
sniffio==1.3.1
tqdm==4.67.1
typing-inspection==0.4.1
typing_extensions==4.14.0
yarl==1.20.1

@ -0,0 +1,21 @@
import shutil
import tempfile
import pytest
from gpustack.config.config import Config, set_global_config
@pytest.fixture(scope="module", autouse=True)
def temp_dir():
tmp_dir = tempfile.mkdtemp()
print(f"Created temporary directory: {tmp_dir}")
yield tmp_dir
shutil.rmtree(tmp_dir)
@pytest.fixture(scope="module", autouse=True)
def config(temp_dir):
cfg = Config(
token="test", jwt_secret_key="test", data_dir=temp_dir, enable_ray=True
)
set_global_config(cfg)
return cfg
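As an illustration of how these fixtures might be consumed, a hypothetical test (not part of this diff) could request the `config` fixture explicitly; this assumes `Config` exposes its constructor arguments as attributes:

```python
# In a test module alongside this conftest.py; the module-scoped, autouse
# fixtures are applied automatically, and `config` can also be injected.
def test_global_config_is_set(config):
    assert config.token == "test"
    assert config.enable_ray is True
    assert config.data_dir  # the temporary directory created by temp_dir
```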

@ -27,7 +27,7 @@ The GPUStack server connects to a SQL database as the datastore. GPUStack uses S
### Inference Server
Inference servers are the backends that perform the inference tasks. GPUStack supports [llama-box](https://github.com/gpustack/llama-box), [vLLM](https://github.com/vllm-project/vllm), [Ascend MindIE](https://www.hiascend.com/en/software/mindie) and [vox-box](https://github.com/gpustack/vox-box) as inference servers.
Inference servers are the backends that perform the inference tasks. GPUStack supports [vLLM](https://github.com/vllm-project/vllm), [Ascend MindIE](https://www.hiascend.com/en/software/mindie), [llama-box](https://github.com/gpustack/llama-box) and [vox-box](https://github.com/gpustack/vox-box) as inference servers.
### RPC Server

(Binary image files changed; file names and contents are not shown in this diff.)
Some files were not shown because too many files have changed in this diff.