Compare commits
293 Commits
| Author | SHA1 | Date |
|---|---|---|
| | 8fe557148d | 4 months ago |
| | dfdafd036d | 4 months ago |
| | 26e81dc700 | 4 months ago |
| | b24d08bbf1 | 4 months ago |
| | f7636a5f63 | 4 months ago |
| | 80509b5900 | 4 months ago |
| | 4535fb182d | 4 months ago |
| | e91da52145 | 4 months ago |
| | 67b93d156e | 4 months ago |
| | 30533f7275 | 4 months ago |
| | e627a4c79e | 4 months ago |
| | f7fdcdb9d0 | 4 months ago |
| | 0d77f69e99 | 4 months ago |
| | 54d75c7a45 | 5 months ago |
| | ea6e5ca9dc | 5 months ago |
| | df31850efc | 5 months ago |
| | 67df35a364 | 5 months ago |
| | 10eaab6f90 | 5 months ago |
| | fac6ed8d25 | 5 months ago |
| | 7e08098981 | 5 months ago |
| | bf6287c069 | 5 months ago |
| | e54e269f58 | 5 months ago |
| | 5dd58c72cb | 5 months ago |
| | a6441ff92e | 5 months ago |
| | e2c0a7ccdc | 5 months ago |
| | cb14580408 | 5 months ago |
| | 359dc3c6e2 | 5 months ago |
| | 8ff76ebfee | 5 months ago |
| | b5b2272e12 | 5 months ago |
| | 53d255cba6 | 5 months ago |
| | 036c40753f | 5 months ago |
| | 7331059605 | 5 months ago |
| | 5e22fe99ca | 5 months ago |
| | 2232e4baf6 | 5 months ago |
| | 527f7b3644 | 5 months ago |
| | 35b570459b | 5 months ago |
| | a436eff8d3 | 5 months ago |
| | 6168713804 | 5 months ago |
| | 309c27a9ef | 5 months ago |
| | 307dadbbd5 | 5 months ago |
| | 74c937acc4 | 5 months ago |
| | b0a638711b | 5 months ago |
| | 14a60fd640 | 5 months ago |
| | 34c21e055f | 5 months ago |
| | 775fc18d5d | 5 months ago |
| | eef43b3426 | 5 months ago |
| | 976b4ccf1f | 5 months ago |
| | b24cb7466d | 5 months ago |
| | a035f94425 | 5 months ago |
| | 0d2a42b932 | 5 months ago |
| | be05857963 | 5 months ago |
| | f4fc88f114 | 5 months ago |
| | fed3bff688 | 5 months ago |
| | ffeeb4830d | 5 months ago |
| | b193ae59fc | 5 months ago |
| | 18dd045598 | 5 months ago |
| | 13f96d7e6f | 5 months ago |
| | c97693ec3f | 5 months ago |
| | 1703ed9af4 | 5 months ago |
| | aebae09a10 | 5 months ago |
| | 17ff2c4b56 | 5 months ago |
| | eec744793b | 5 months ago |
| | 0d4bf13f4c | 5 months ago |
| | 8118f25941 | 5 months ago |
| | 9c89d04008 | 5 months ago |
| | 4f2ae77577 | 5 months ago |
| | f5b70329dc | 5 months ago |
| | 0a28b2a4e0 | 5 months ago |
| | f017065122 | 5 months ago |
| | bf33757623 | 5 months ago |
| | 8b8b859fb6 | 5 months ago |
| | 6ea4988572 | 5 months ago |
| | e45a4e9961 | 5 months ago |
| | 6208c7c3a5 | 5 months ago |
| | 9b3850fe7c | 5 months ago |
| | 6c7728d4f3 | 5 months ago |
| | feba7ecb51 | 5 months ago |
| | c3a34f3f09 | 5 months ago |
| | 733f2c1482 | 5 months ago |
| | a888661621 | 5 months ago |
| | 65591e10ea | 5 months ago |
| | 4d30c23d8f | 5 months ago |
| | 9f9e0217d6 | 5 months ago |
| | a16e78dc32 | 5 months ago |
| | 0aae3fa0a6 | 5 months ago |
| | 0749e40d1b | 5 months ago |
| | b1015f353a | 5 months ago |
| | 12a4a4ef9f | 5 months ago |
| | 8872d0687b | 5 months ago |
| | 17a7c4b6e1 | 5 months ago |
| | fc4aa6b6c2 | 5 months ago |
| | 16b5333fcb | 5 months ago |
| | 3836fd34bb | 5 months ago |
| | d79343b324 | 5 months ago |
| | f8cc92ab0b | 5 months ago |
| | 0c760c34b5 | 5 months ago |
| | fef108171c | 5 months ago |
| | b0457b7adb | 5 months ago |
| | e9f17063ca | 5 months ago |
| | 57b55a8207 | 5 months ago |
| | c2b4b95824 | 5 months ago |
| | ba411603ac | 5 months ago |
| | 1aeb35352a | 5 months ago |
| | ba5a49edd3 | 5 months ago |
| | 2c16096055 | 5 months ago |
| | a57b9c1768 | 5 months ago |
| | 53be75c23c | 5 months ago |
| | 9787b28a44 | 5 months ago |
| | 345024182b | 5 months ago |
| | b7bfda3b1d | 5 months ago |
| | 78eda5b83a | 5 months ago |
| | 70b1f8486d | 5 months ago |
| | 76bd6838e5 | 5 months ago |
| | 554f5c0180 | 5 months ago |
| | 7ddb9edb86 | 5 months ago |
| | 3fa5bf01fe | 5 months ago |
| | 1b3a192d07 | 5 months ago |
| | 152e57f8a7 | 5 months ago |
| | 089b7fa813 | 5 months ago |
| | c7ee3661ac | 5 months ago |
| | c52b0c1c44 | 5 months ago |
| | 4b30a6b743 | 5 months ago |
| | e254d7bfd8 | 5 months ago |
| | b0c85bb603 | 5 months ago |
| | 5ffdefdbae | 5 months ago |
| | 10c2ae6fbc | 5 months ago |
| | 23c0b77850 | 5 months ago |
| | 7eca3cabe1 | 5 months ago |
| | eef0700bdc | 5 months ago |
| | 5d65dcc0a7 | 5 months ago |
| | a4cdf37379 | 5 months ago |
| | a54ff956a1 | 5 months ago |
| | 91dbbd31af | 5 months ago |
| | 58af72bf08 | 5 months ago |
| | 8cb1a38013 | 5 months ago |
| | 4aea7bcdf9 | 6 months ago |
| | b8910817b6 | 6 months ago |
| | eae62e02fc | 6 months ago |
| | d46932ae80 | 6 months ago |
| | a4da5fae63 | 6 months ago |
| | 83146853ac | 6 months ago |
| | d094c2e008 | 6 months ago |
| | f16718c776 | 6 months ago |
| | a368ac1000 | 6 months ago |
| | 8e8d41b24c | 6 months ago |
| | d95edf3e29 | 6 months ago |
| | e1a4a3fca2 | 6 months ago |
| | 8d5c70a2ac | 6 months ago |
| | 3b8ef1c3e7 | 6 months ago |
| | 8b0c521458 | 6 months ago |
| | 330694ea9a | 6 months ago |
| | 8d04eb7797 | 6 months ago |
| | 4939b4a1f7 | 6 months ago |
| | a0e4f0c889 | 6 months ago |
| | c34cd72c36 | 6 months ago |
| | 2f50cc60ae | 6 months ago |
| | da3e8060c6 | 6 months ago |
| | f196b844ae | 6 months ago |
| | bca385f869 | 6 months ago |
| | 013fc3d510 | 6 months ago |
| | a3cb2445c5 | 6 months ago |
| | ccf4b6d660 | 6 months ago |
| | 5a816afe2b | 6 months ago |
| | c371216700 | 6 months ago |
| | 8e5bfca828 | 6 months ago |
| | 9647cd4237 | 6 months ago |
| | 6812d39148 | 6 months ago |
| | 6c27bb8945 | 6 months ago |
| | 96f673b7f6 | 6 months ago |
| | 3d3dde0db8 | 6 months ago |
| | e6f4dd03f7 | 6 months ago |
| | 2083ce0778 | 6 months ago |
| | cee7492913 | 6 months ago |
| | 501231471e | 6 months ago |
| | 0b90cf50c0 | 6 months ago |
| | b54c72a7c1 | 6 months ago |
| | f01a4bb491 | 6 months ago |
| | c27a504a8a | 6 months ago |
| | b692aeb0e5 | 6 months ago |
| | 15b796288b | 6 months ago |
| | ac42c6b21d | 6 months ago |
| | 9c9caa848e | 6 months ago |
| | 890f15faed | 6 months ago |
| | ce868a7a79 | 6 months ago |
| | 250723f9a1 | 6 months ago |
| | 5bbb027dbe | 6 months ago |
| | c2b7456269 | 6 months ago |
| | 2916a306d9 | 6 months ago |
| | 5bc81bbb9e | 6 months ago |
| | e7f41af249 | 6 months ago |
| | 2a02ee689a | 6 months ago |
| | 9fba121898 | 6 months ago |
| | b1543211a8 | 6 months ago |
| | 6f72944222 | 6 months ago |
| | 1b6760169a | 6 months ago |
| | 498f143944 | 6 months ago |
| | 1d7bb41776 | 6 months ago |
| | 2bd6648236 | 6 months ago |
| | 3f8d45ded1 | 6 months ago |
| | 6714afa2ca | 6 months ago |
| | 48f10bcf1b | 6 months ago |
| | acf4ad2917 | 6 months ago |
| | 4e1d6d900c | 6 months ago |
| | c9f44f33aa | 6 months ago |
| | cc3fc57bc1 | 6 months ago |
| | 777fd3d0cb | 6 months ago |
| | 3f34bdd216 | 6 months ago |
| | b054dc00b5 | 6 months ago |
| | 94e515c844 | 6 months ago |
| | cc0a88f88f | 6 months ago |
| | 6d7d666f1e | 6 months ago |
| | 1eda8a7810 | 6 months ago |
| | d12f1e9fee | 6 months ago |
| | afc1023922 | 6 months ago |
| | 3c5c9feb6c | 6 months ago |
| | a9fb44a43d | 6 months ago |
| | ed8e4ba51f | 6 months ago |
| | feaa894000 | 6 months ago |
| | 62f78c10cc | 6 months ago |
| | 85e5f717ba | 6 months ago |
| | bec76bac22 | 6 months ago |
| | 323d76df4c | 6 months ago |
| | 45b8018f82 | 6 months ago |
| | 38d2e31f9f | 6 months ago |
| | 0dfdafcd16 | 6 months ago |
| | 9272d5a9c7 | 6 months ago |
| | 340b4d8d90 | 6 months ago |
| | fab2813d56 | 6 months ago |
| | d38a120a3d | 6 months ago |
| | 56bf30f199 | 6 months ago |
| | 7804c89f1e | 6 months ago |
| | 383baac2e0 | 6 months ago |
| | 884c94f63f | 6 months ago |
| | 7b648ce634 | 6 months ago |
| | 6f3fab1051 | 6 months ago |
| | 64a7d18cf8 | 6 months ago |
| | 7c5927db99 | 6 months ago |
| | 0b8f24f993 | 6 months ago |
| | 9addaad168 | 6 months ago |
| | f268725d8f | 6 months ago |
| | 68a3915e4e | 6 months ago |
| | 5927e924cb | 6 months ago |
| | 7c26f6f012 | 6 months ago |
| | 3bf76a5a6e | 7 months ago |
| | 39d8e7ab0f | 7 months ago |
| | f4f84e0931 | 7 months ago |
| | d006860a38 | 7 months ago |
| | 37f23c2549 | 7 months ago |
| | b69a785f41 | 7 months ago |
| | a06319d76c | 7 months ago |
| | c7d77e6e95 | 7 months ago |
| | 296d6b5d32 | 7 months ago |
| | 7f247a6400 | 7 months ago |
| | d9996538f9 | 7 months ago |
| | ec9e646105 | 7 months ago |
| | 8e44612eb8 | 7 months ago |
| | 0810e2319d | 7 months ago |
| | 00e858f40f | 7 months ago |
| | 7a2187aaa3 | 7 months ago |
| | d9b67ff2eb | 7 months ago |
| | cc2e0bbe78 | 7 months ago |
| | 88253b59c3 | 7 months ago |
| | 7b5078856c | 7 months ago |
| | 555d0c81ba | 7 months ago |
| | 03cd217189 | 7 months ago |
| | bef3b521ef | 7 months ago |
| | b99bd3f6de | 7 months ago |
| | 157ad58ee3 | 7 months ago |
| | a95650f3e1 | 7 months ago |
| | d3c908ea07 | 7 months ago |
| | 0e34f487a7 | 7 months ago |
| | 211132d172 | 7 months ago |
| | 45812787a0 | 7 months ago |
| | e39a103b79 | 7 months ago |
| | 27ee8566cb | 7 months ago |
| | c060a51f76 | 7 months ago |
| | a964ad0816 | 7 months ago |
| | bee9da9c65 | 7 months ago |
| | 457e418cd5 | 7 months ago |
| | bd1313f0ef | 7 months ago |
| | 443466e740 | 7 months ago |
| | fc92106506 | 7 months ago |
| | 2f42df90ad | 7 months ago |
| | ea34a10f26 | 7 months ago |
| | b6f847024f | 7 months ago |
| | b2844560ff | 7 months ago |
| | 549750b739 | 7 months ago |
| | 7222a07049 | 7 months ago |
| | faec4babe3 | 7 months ago |
| | 7819ce076f | 7 months ago |
| | 2a6af6062e | 7 months ago |
| | 14b70d472d | 7 months ago |
| | a7ac3f52a4 | 7 months ago |
@@ -1,3 +1,3 @@
```
install.ps1.sha256sum text eol=lf
* text=auto eol=lf
*.tar.gz filter=lfs diff=lfs merge=lfs -text
```
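These attribute rules can be sanity-checked locally. A minimal sketch, assuming only that `git` is installed, which recreates the three rules in a throwaway repository (not this repo) and asks git which attributes apply to two representative paths:

```shell
# Hypothetical throwaway repo; the rules are copied from the hunk above.
tmp="$(mktemp -d)"
cd "$tmp"
git init -q .
cat > .gitattributes <<'ATTRS'
install.ps1.sha256sum text eol=lf
* text=auto eol=lf
*.tar.gz filter=lfs diff=lfs merge=lfs -text
ATTRS
# Later rules override earlier ones per attribute, so a tarball keeps
# eol=lf from the `*` rule but gains filter=lfs and loses `text`.
git check-attr text eol filter -- install.ps1.sha256sum model.tar.gz
```

Note that `-text` in the last rule unsets `text` for tarballs without touching `eol`, which is why LFS rules usually pair `-text` with the filter settings.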
@@ -1,48 +0,0 @@
```dockerfile
ARG CUDA_VERSION=12.4.1
ARG CUDA_TAG_SUFFIX=-cudnn-runtime-ubuntu22.04

FROM nvidia/cuda:${CUDA_VERSION}${CUDA_TAG_SUFFIX}

ARG TARGETPLATFORM
ENV DEBIAN_FRONTEND=noninteractive

RUN apt-get update && apt-get install -y \
    git \
    curl \
    wget \
    tzdata \
    iproute2 \
    python3 \
    python3-pip \
    python3-venv \
    tini \
    && rm -rf /var/lib/apt/lists/*

COPY . /workspace/gpustack
RUN cd /workspace/gpustack && \
    make build

ARG VLLM_VERSION=0.8.5.post1
RUN <<EOF
if [ "$TARGETPLATFORM" = "linux/amd64" ]; then
    # Install vllm dependencies for x86_64
    if [ "$(echo "${CUDA_VERSION}" | cut -d. -f1,2)" = "11.8" ]; then
        pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu118-cp38-abi3-manylinux1_x86_64.whl \
            --extra-index-url https://download.pytorch.org/whl/cu118;
    fi;
    WHEEL_PACKAGE="$(ls /workspace/gpustack/dist/*.whl)[all]";
else
    WHEEL_PACKAGE="$(ls /workspace/gpustack/dist/*.whl)[audio]";
fi
pip install pipx
pip install $WHEEL_PACKAGE
pip cache purge
rm -rf /workspace/gpustack
EOF

RUN gpustack download-tools

# Download dac weights used by audio models like Dia
RUN python3 -m dac download

ENTRYPOINT [ "tini", "--", "gpustack", "start" ]
```
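The vLLM install step above branches on the `major.minor` part of `CUDA_VERSION`; the `cut` expression it relies on can be exercised on its own:

```shell
# Extract major.minor from a dotted version string, as the Dockerfile does.
CUDA_VERSION=12.4.1
echo "${CUDA_VERSION}" | cut -d. -f1,2    # prints 12.4

# A two-component version passes through unchanged, so the "11.8" comparison
# works whether or not a patch version is supplied.
CUDA_VERSION=11.8
echo "${CUDA_VERSION}" | cut -d. -f1,2    # prints 11.8
```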
@@ -1,32 +0,0 @@
```dockerfile
FROM ubuntu:22.04

ARG TARGETPLATFORM
ENV DEBIAN_FRONTEND=noninteractive

RUN apt-get update && apt-get install -y \
    git \
    curl \
    wget \
    tzdata \
    iproute2 \
    python3 \
    python3-pip \
    python3-venv \
    tini \
    && rm -rf /var/lib/apt/lists/*

COPY . /workspace/gpustack
RUN cd /workspace/gpustack && \
    make build && \
    WHEEL_PACKAGE="$(ls /workspace/gpustack/dist/*.whl)[audio]" && \
    pip install pipx && \
    pip install $WHEEL_PACKAGE && \
    pip cache purge && \
    rm -rf /workspace/gpustack

RUN gpustack download-tools

# Download dac weights used by audio models like Dia
RUN python3 -m dac download

ENTRYPOINT [ "tini", "--", "gpustack", "start" ]
```
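Both images install the built wheel with a pip extras suffix (`[all]` or `[audio]`) appended to the wheel path. A small sketch of how that string is assembled; the wheel filename below is a placeholder, not the real build artifact:

```shell
# Placeholder wheel standing in for the output of `make build`.
mkdir -p dist
touch dist/gpustack-0.0.0-py3-none-any.whl

# `ls` expands to the single wheel path; the extras suffix is appended
# verbatim, so pip later installs the wheel plus its [audio] extras.
WHEEL_PACKAGE="$(ls dist/*.whl)[audio]"
echo "$WHEEL_PACKAGE"    # prints dist/gpustack-0.0.0-py3-none-any.whl[audio]
```

This pattern only works because the `dist/` directory contains exactly one wheel; with several wheels the `ls` expansion would produce a broken requirement string.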
@@ -1,454 +0,0 @@
```dockerfile
# Packaging logic:
# 1. base target:
#    - Install tools, including Python, GCC, CMake, Make, SCCache and dependencies.
#    - Install specific version Ascend CANN according to the chip, including Toolkit and Kernels.
# 2. mindie-install target:
#    - Install specific version Ascend CANN NNAL.
#    - Copy and install the ATB models from a fixed image.
#    - Install required dependencies.
#    - Install specific version MindIE.
# 3. gpustack target (final):
#    - Install GPUStack, and override the required dependencies after installation.
#    - Set up the environment for CANN, NNAL and ATB models.
#    - Set up the entrypoint to start GPUStack.

# Arguments description:
# - CANN_VERSION is the version of Ascend CANN,
#   which is used to install the Ascend CANN Toolkit, Kernels and NNAL.
# - CANN_CHIP is the chip version of Ascend CANN,
#   which is used to install the Ascend CANN Kernels.
# - MINDIE_VERSION is the version of Ascend MindIE,
#   which is used to install the Ascend MindIE;
#   please check https://www.hiascend.com/developer/download/community/result?module=ie%2Bpt%2Bcann for details.
# - PYTHON_VERSION is the version of Python,
#   which should be properly set; it must be 3.x.

ARG CANN_VERSION=8.1.rc1.beta1
ARG CANN_CHIP=910b
ARG MINDIE_VERSION=2.0.rc1
ARG PYTHON_VERSION=3.11

#
# Stage Base
#
# Example build command:
#   docker build --tag=gpustack/gpustack:npu-base --file=Dockerfile.npu --target base --progress=plain .
#

FROM ubuntu:20.04 AS base
SHELL ["/bin/bash", "-eo", "pipefail", "-c"]

ARG TARGETPLATFORM
ARG TARGETOS
ARG TARGETARCH

## Install tools

ARG PYTHON_VERSION

ENV DEBIAN_FRONTEND=noninteractive \
    PYTHON_VERSION=${PYTHON_VERSION}

RUN <<EOF
# Refresh
apt-get update -y && apt-get install -y --no-install-recommends \
    software-properties-common apt-transport-https \
    && add-apt-repository -y ppa:ubuntu-toolchain-r/test \
    && add-apt-repository -y ppa:deadsnakes/ppa \
    && apt-get update -y

# Install
apt-get install -y --no-install-recommends \
    ca-certificates build-essential binutils bash openssl \
    curl wget aria2 \
    git git-lfs \
    unzip xz-utils \
    tzdata locales \
    iproute2 iputils-ping ifstat net-tools dnsutils pciutils ipmitool \
    procps sysstat htop \
    tini vim jq bc tree

# Update python
PYTHON="python${PYTHON_VERSION}"
apt-get install -y --no-install-recommends \
    ${PYTHON} ${PYTHON}-dev ${PYTHON}-distutils ${PYTHON}-venv ${PYTHON}-lib2to3
if [ -f /etc/alternatives/python ]; then update-alternatives --remove-all python; fi; update-alternatives --install /usr/bin/python python /usr/bin/${PYTHON} 10
if [ -f /etc/alternatives/python3 ]; then update-alternatives --remove-all python3; fi; update-alternatives --install /usr/bin/python3 python3 /usr/bin/${PYTHON} 10
curl -sS https://bootstrap.pypa.io/get-pip.py | ${PYTHON}

# Update locale
localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8

# Cleanup
rm -rf /var/tmp/* \
    && rm -rf /tmp/* \
    && rm -rf /var/cache/apt \
    && pip cache purge
EOF

ENV LANG='en_US.UTF-8' \
    LANGUAGE='en_US:en' \
    LC_ALL='en_US.UTF-8'

## Install GCC

RUN <<EOF
# GCC

# Install
apt-get install -y --no-install-recommends \
    gcc-11 g++-11 gfortran-11 gfortran

# Update alternatives
if [ -f /etc/alternatives/gcov-dump ]; then update-alternatives --remove-all gcov-dump; fi; update-alternatives --install /usr/bin/gcov-dump gcov-dump /usr/bin/gcov-dump-11 10
if [ -f /etc/alternatives/lto-dump ]; then update-alternatives --remove-all lto-dump; fi; update-alternatives --install /usr/bin/lto-dump lto-dump /usr/bin/lto-dump-11 10
if [ -f /etc/alternatives/gcov ]; then update-alternatives --remove-all gcov; fi; update-alternatives --install /usr/bin/gcov gcov /usr/bin/gcov-11 10
if [ -f /etc/alternatives/gcc ]; then update-alternatives --remove-all gcc; fi; update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 10
if [ -f /etc/alternatives/gcc-nm ]; then update-alternatives --remove-all gcc-nm; fi; update-alternatives --install /usr/bin/gcc-nm gcc-nm /usr/bin/gcc-nm-11 10
if [ -f /etc/alternatives/cpp ]; then update-alternatives --remove-all cpp; fi; update-alternatives --install /usr/bin/cpp cpp /usr/bin/cpp-11 10
if [ -f /etc/alternatives/g++ ]; then update-alternatives --remove-all g++; fi; update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 10
if [ -f /etc/alternatives/gcc-ar ]; then update-alternatives --remove-all gcc-ar; fi; update-alternatives --install /usr/bin/gcc-ar gcc-ar /usr/bin/gcc-ar-11 10
if [ -f /etc/alternatives/gcov-tool ]; then update-alternatives --remove-all gcov-tool; fi; update-alternatives --install /usr/bin/gcov-tool gcov-tool /usr/bin/gcov-tool-11 10
if [ -f /etc/alternatives/gcc-ranlib ]; then update-alternatives --remove-all gcc-ranlib; fi; update-alternatives --install /usr/bin/gcc-ranlib gcc-ranlib /usr/bin/gcc-ranlib-11 10
if [ -f /etc/alternatives/gfortran ]; then update-alternatives --remove-all gfortran; fi; update-alternatives --install /usr/bin/gfortran gfortran /usr/bin/gfortran-11 10

# Cleanup
rm -rf /var/tmp/* \
    && rm -rf /tmp/* \
    && rm -rf /var/cache/apt
EOF

## Install CMake/Make/SCCache

RUN <<EOF
# CMake/Make/SCCache

# Install
apt-get install -y --no-install-recommends \
    pkg-config make
curl -sL "https://github.com/Kitware/CMake/releases/download/v3.22.1/cmake-3.22.1-linux-$(uname -m).tar.gz" | tar -zx -C /usr --strip-components 1
curl -sL "https://github.com/mozilla/sccache/releases/download/v0.10.0/sccache-v0.10.0-$(uname -m)-unknown-linux-musl.tar.gz" | tar -zx -C /usr/bin --strip-components 1

# Cleanup
rm -rf /var/tmp/* \
    && rm -rf /tmp/* \
    && rm -rf /var/cache/apt
EOF

## Install Dependencies

RUN <<EOF
# Dependencies

# Install
apt-get install -y --no-install-recommends \
    zlib1g zlib1g-dev libbz2-dev liblzma-dev libffi-dev openssl libssl-dev libsqlite3-dev \
    libblas-dev liblapack-dev libopenblas-dev libblas3 liblapack3 gfortran libhdf5-dev \
    libxml2 libxslt1-dev libgl1-mesa-glx libgmpxx4ldbl

# Cleanup
rm -rf /var/tmp/* \
    && rm -rf /tmp/* \
    && rm -rf /var/cache/apt
EOF

ARG CANN_VERSION
ARG CANN_CHIP

ENV CANN_VERSION=${CANN_VERSION} \
    CANN_CHIP=${CANN_CHIP} \
    CANN_HOME="/usr/local/Ascend"

## Install CANN Toolkit

RUN <<EOF
# CANN Toolkit

OS="$(uname -s | tr '[:upper:]' '[:lower:]')"
ARCH="$(uname -m)"
DOWNLOAD_VERSION="$(echo ${CANN_VERSION%\.beta1} | tr '[:lower:]' '[:upper:]')"
URL_PREFIX="https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%20${DOWNLOAD_VERSION}"
URL_SUFFIX="response-content-type=application/octet-stream"

# Install dependencies
python3 -m pip install --no-cache-dir --root-user-action ignore --upgrade pip
pip install --no-cache-dir --root-user-action ignore \
    attrs cython numpy==1.26.4 decorator sympy cffi pyyaml pathlib2 psutil protobuf scipy requests absl-py

# Install toolkit
TOOLKIT_FILE="Ascend-cann-toolkit_${DOWNLOAD_VERSION}_${OS}-${ARCH}.run"
TOOLKIT_PATH="/tmp/${TOOLKIT_FILE}"
TOOLKIT_URL="${URL_PREFIX}/${TOOLKIT_FILE}?${URL_SUFFIX}"
curl -H 'Referer: https://www.hiascend.com/' --retry 3 --retry-connrefused -fL -o "${TOOLKIT_PATH}" "${TOOLKIT_URL}"
chmod a+x "${TOOLKIT_PATH}"
printf "Y\n" | "${TOOLKIT_PATH}" --install --install-for-all --install-path="${CANN_HOME}"

# Cleanup
rm -f "${TOOLKIT_PATH}" \
    && rm -rf /var/log/ascend \
    && rm -rf /var/log/ascend_seclog \
    && pip cache purge
EOF

## Install CANN Kernels

RUN <<EOF
# CANN Kernels

OS="$(uname -s | tr '[:upper:]' '[:lower:]')"
ARCH="$(uname -m)"
DOWNLOAD_VERSION="$(echo ${CANN_VERSION%\.beta1} | tr '[:lower:]' '[:upper:]')"
URL_PREFIX="https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%20${DOWNLOAD_VERSION}"
URL_SUFFIX="response-content-type=application/octet-stream"

# Prepare environment
source ${CANN_HOME}/ascend-toolkit/set_env.sh

# Install kernels
KERNELS_FILE="Ascend-cann-kernels-${CANN_CHIP}_${DOWNLOAD_VERSION}_${OS}-${ARCH}.run"
if ! curl -H 'Referer: https://www.hiascend.com/' --retry 3 --retry-connrefused -fsSIL "${URL_PREFIX}/${KERNELS_FILE}?${URL_SUFFIX}" >/dev/null 2>&1; then
    # Fallback to generic kernels
    KERNELS_FILE="Ascend-cann-kernels-${CANN_CHIP}_${DOWNLOAD_VERSION}_${OS}.run"
fi
KERNELS_PATH="/tmp/${KERNELS_FILE}"
KERNELS_URL="${URL_PREFIX}/${KERNELS_FILE}?${URL_SUFFIX}"
curl -H 'Referer: https://www.hiascend.com/' --retry 3 --retry-connrefused -fL -o "${KERNELS_PATH}" "${KERNELS_URL}"
chmod a+x "${KERNELS_PATH}"
printf "Y\n" | "${KERNELS_PATH}" --install --install-for-all --install-path="${CANN_HOME}"

# Cleanup
rm -f "${KERNELS_PATH}" \
    && rm -rf /var/log/ascend \
    && rm -rf /var/log/ascend_seclog \
    && pip cache purge
EOF

#
# Stage MindIE Install
#
# Example build command:
#   docker build --tag=gpustack/gpustack:npu-mindie-install --file=Dockerfile.npu --target mindie-install --progress=plain .
#

FROM base AS mindie-install

## Install NNAL

RUN <<EOF
# CANN NNAL

OS="$(uname -s | tr '[:upper:]' '[:lower:]')"
ARCH="$(uname -m)"
DOWNLOAD_VERSION="$(echo ${CANN_VERSION%\.beta1} | tr '[:lower:]' '[:upper:]')"
URL_PREFIX="https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%20${DOWNLOAD_VERSION}"
URL_SUFFIX="response-content-type=application/octet-stream"

# Prepare environment
source ${CANN_HOME}/ascend-toolkit/set_env.sh

# Install NNAL
NNAL_FILE="Ascend-cann-nnal_${DOWNLOAD_VERSION}_${OS}-${ARCH}.run"
NNAL_PATH="/tmp/${NNAL_FILE}"
NNAL_URL="${URL_PREFIX}/${NNAL_FILE}?${URL_SUFFIX}"
curl -H 'Referer: https://www.hiascend.com/' --retry 3 --retry-connrefused -fL -o "${NNAL_PATH}" "${NNAL_URL}"
chmod a+x "${NNAL_PATH}"
printf "Y\n" | "${NNAL_PATH}" --install --install-path="${CANN_HOME}"

# Cleanup
rm -f "${NNAL_PATH}" \
    && rm -rf /var/log/ascend_seclog \
    && rm -rf /var/log/cann_atb_log \
    && pip cache purge
EOF

COPY --from=thxcode/mindie:2.0.T17-800I-A2-py311-openeuler24.03-lts --chown=root:root ${CANN_HOME}/atb-models ${CANN_HOME}/atb-models
RUN <<EOF
# ATB Models

# Install
pip install --no-cache-dir --root-user-action ignore ${CANN_HOME}/atb-models/*.whl

# Cleanup
rm -rf /var/log/ascend_seclog \
    && rm -rf /var/log/cann_atb_log \
    && pip cache purge
EOF

## Install MindIE

ARG MINDIE_VERSION

ENV MINDIE_VERSION=${MINDIE_VERSION}

RUN <<EOF
# MindIE

OS="$(uname -s | tr '[:upper:]' '[:lower:]')"
ARCH="$(uname -m)"
DOWNLOAD_VERSION="$(echo ${MINDIE_VERSION%\.beta1} | tr '[:lower:]' '[:upper:]')"
URL_PREFIX="https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/MindIE/MindIE%20${DOWNLOAD_VERSION}"
URL_SUFFIX="response-content-type=application/octet-stream"

# Prepare environment
source ${CANN_HOME}/ascend-toolkit/set_env.sh
source ${CANN_HOME}/nnal/atb/set_env.sh

# Install dependencies,
# which, according to the Ascend Extension installation guide, must match CANN_VERSION;
# please check https://www.hiascend.com/document/detail/zh/Pytorch/700/configandinstg/instg/insg_0004.html for details.
if [ ${ARCH} == "x86_64" ]; then
    pip install --no-cache-dir --root-user-action ignore torch==2.1.0+cpu --index-url https://download.pytorch.org/whl/cpu
else
    pip install --no-cache-dir --root-user-action ignore torch==2.1.0
fi
pip install --no-cache-dir --root-user-action ignore torch-npu==2.1.0.post12 torchvision==0.16.0
cat <<EOT >/tmp/requirements.txt
av==14.3.0
absl-py==2.2.2
attrs==24.3.0
certifi==2024.8.30
cloudpickle==3.0.0
einops==0.8.1
easydict==1.13
frozenlist==1.6.0
gevent==24.2.1
geventhttpclient==2.3.1
greenlet==3.2.1
grpcio==1.71.0
icetk==0.0.4
idna==2.8
jsonlines==4.0.0
jsonschema==4.23.0
jsonschema-specifications==2025.4.1
Jinja2==3.1.6
loguru==0.7.2
matplotlib==3.9.2
ml_dtypes==0.5.0
multidict==6.4.3
nltk==3.9.1
numba==0.61.2
numpy==1.26.4
pandas==2.2.3
pillow==11.2.1
prettytable==3.11.0
pyarrow==19.0.1
pydantic==2.9.2
pydantic_core==2.23.4
python-rapidjson==1.20
requests==2.32.3
sacrebleu==2.4.3
tornado==6.4.2
transformers==4.46.3
tiktoken==0.7.0
typing_extensions==4.13.2
tzdata==2024.2
tqdm==4.67.1
thefuzz==0.22.1
urllib3==2.4.0
zope.event==5.0
zope.interface==7.0.3
EOT
pip install --no-cache-dir --root-user-action ignore -r /tmp/requirements.txt

# Install MindIE
MINDIE_FILE="Ascend-mindie_${DOWNLOAD_VERSION}_${OS}-${ARCH}.run"
MINDIE_PATH="/tmp/${MINDIE_FILE}"
MINDIE_URL="${URL_PREFIX}/${MINDIE_FILE}?${URL_SUFFIX}"
curl -H 'Referer: https://www.hiascend.com/' --retry 3 --retry-connrefused -fL -o "${MINDIE_PATH}" "${MINDIE_URL}"
chmod a+x "${MINDIE_PATH}"
printf "Y\n" | "${MINDIE_PATH}" --install --install-path="${CANN_HOME}"

# Post process
chmod +w "${CANN_HOME}/mindie/latest/mindie-service/conf"

# Review
pip freeze \
    && python -m site

# Cleanup
rm -f "${MINDIE_PATH}" \
    && rm -rf /var/log/mindie_log \
    && rm -rf ~/log \
    && rm -rf /tmp/* \
    && pip cache purge
EOF

#
# Stage GPUStack
#
# Example build command:
#   docker build --tag=gpustack/gpustack:npu --file=Dockerfile.npu --progress=plain .
#

FROM mindie-install AS gpustack

## Install GPUStack

RUN --mount=type=bind,target=/workspace/gpustack,rw <<EOF
# Build
cd /workspace/gpustack \
    && make build

# Install,
# vox-box relies on PyTorch 2.7, which is not compatible with MindIE.
WHEEL_PACKAGE="$(ls /workspace/gpustack/dist/*.whl)"
pip install --no-cache-dir --root-user-action ignore $WHEEL_PACKAGE

# Download tools
gpustack download-tools --device npu

# Post-process,
# override the required dependencies after installation.
cat <<EOT >/tmp/requirements.txt
pipx==1.7.1
EOT
pip install --no-cache-dir --root-user-action ignore -r /tmp/requirements.txt

# Set up environment
mkdir -p /var/lib/gpustack \
    && chmod -R 0755 /var/lib/gpustack

# Review
pip freeze \
    && python -m site

# Cleanup
rm -rf /workspace/gpustack/dist \
    && rm -rf /tmp/* \
    && pip cache purge
EOF

## Setup environment

RUN <<EOF
# Export CANN driver lib
EXPORT_DRIVER_LIB="export LD_LIBRARY_PATH=${CANN_HOME}/driver/lib64/common:${CANN_HOME}/driver/lib64/driver:\${LD_LIBRARY_PATH}"
echo "${EXPORT_DRIVER_LIB}" >> /etc/profile
echo "${EXPORT_DRIVER_LIB}" >> ~/.bashrc

# Source CANN Toolkit environment
SOURCE_TOOLKIT_ENV="source ${CANN_HOME}/ascend-toolkit/set_env.sh"
echo "${SOURCE_TOOLKIT_ENV}" >> /etc/profile
echo "${SOURCE_TOOLKIT_ENV}" >> ~/.bashrc

# Source CANN NNAL environment
SOURCE_NNAL_ENV="source ${CANN_HOME}/nnal/atb/set_env.sh"
echo "${SOURCE_NNAL_ENV}" >> /etc/profile
echo "${SOURCE_NNAL_ENV}" >> ~/.bashrc

# Source ATB model environment
SOURCE_ATB_MODEL_ENV="source ${CANN_HOME}/atb-models/set_env.sh"
echo "${SOURCE_ATB_MODEL_ENV}" >> /etc/profile
echo "${SOURCE_ATB_MODEL_ENV}" >> ~/.bashrc

# Export Driver Tools
EXPORT_DRIVER_TOOLS="export PATH=${CANN_HOME}/driver/tools:\${PATH}"
echo "${EXPORT_DRIVER_TOOLS}" >> /etc/profile
echo "${EXPORT_DRIVER_TOOLS}" >> ~/.bashrc

# NB(thxCode): To support specific MindIE versions,
# we need to finish setting up the environment during GPUStack deployment.
EOF

ENTRYPOINT [ "tini", "--", "/usr/bin/bash", "-c", "source /etc/profile && exec gpustack start \"$@\"", "--" ]
```
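Several download steps above derive `DOWNLOAD_VERSION` from `CANN_VERSION` (or `MINDIE_VERSION`) by stripping a trailing `.beta1` suffix and upper-casing the rest; that transformation in isolation:

```shell
# Suffix-strip with ${var%pattern}, then upper-case, as the Dockerfile does.
CANN_VERSION=8.1.rc1.beta1
DOWNLOAD_VERSION="$(echo ${CANN_VERSION%\.beta1} | tr '[:lower:]' '[:upper:]')"
echo "$DOWNLOAD_VERSION"    # prints 8.1.RC1

# A version without the suffix passes through the %-expansion unchanged.
MINDIE_VERSION=2.0.rc1
echo "$(echo ${MINDIE_VERSION%\.beta1} | tr '[:lower:]' '[:upper:]')"    # prints 2.0.RC1
```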
@@ -1,346 +0,0 @@
```python
import asyncio
import time
import httpx
import numpy
import logging
import argparse
import json
import random
from openai import AsyncOpenAI

logging.basicConfig(
    level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)


# Avoid client side connection error: https://github.com/encode/httpx/discussions/3084
http_client = httpx.AsyncClient(
    limits=httpx.Limits(
        max_connections=10000, max_keepalive_connections=10000, keepalive_expiry=30
    )
)

SAMPLE_PROMPTS = [
    "Explain how blockchain technology works, and provide a real-world example of its application outside of cryptocurrency.",
    "Compare and contrast the philosophies of Nietzsche and Kant, including their views on morality and human nature.",
    "Imagine you're a travel blogger. Write a detailed post describing a week-long adventure through rural Japan.",
    "Write a fictional letter from Albert Einstein to a modern-day physicist, discussing the current state of quantum mechanics.",
    "Provide a comprehensive explanation of how transformers work in machine learning, including attention mechanisms and positional encoding.",
    "Draft a business proposal for launching a new AI-powered productivity app, including target audience, key features, and a monetization strategy.",
    "Simulate a panel discussion between Elon Musk, Marie Curie, and Sun Tzu on the topic of 'Leadership in Times of Crisis'.",
    "Describe the process of photosynthesis in depth, and explain its importance in the global carbon cycle.",
    "Analyze the impact of social media on political polarization, citing relevant studies or historical examples.",
    "Write a short science fiction story where humans discover a parallel universe that operates under different physical laws.",
    "Explain the role of the Federal Reserve in the U.S. economy and how it manages inflation and unemployment.",
    "Describe the architecture of a modern web application, from frontend to backend, including databases, APIs, and deployment.",
    "Write an essay discussing whether artificial general intelligence (AGI) poses an existential threat to humanity.",
    "Summarize the key events and consequences of the Cuban Missile Crisis, and reflect on lessons for modern diplomacy.",
    "Create a guide for beginners on how to train a custom LLM using open-source tools and publicly available datasets.",
]


async def process_stream(stream):
    first_token_time = None
    total_tokens = 0
    async for chunk in stream:
        if first_token_time is None:
            first_token_time = time.time()
        if chunk.choices[0].delta.content:
            total_tokens += 1
        if chunk.choices[0].finish_reason is not None:
            break
    return first_token_time, total_tokens


async def make_request(
    client: AsyncOpenAI, model, max_completion_tokens, request_timeout
):
    start_time = time.time()
    content = random.choice(SAMPLE_PROMPTS)

    try:
        stream = await client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": content}],
            max_completion_tokens=max_completion_tokens,
            stream=True,
        )
        first_token_time, total_tokens = await asyncio.wait_for(
            process_stream(stream), timeout=request_timeout
        )

        end_time = time.time()
        elapsed_time = end_time - start_time
        ttft = first_token_time - start_time if first_token_time else None
        tokens_per_second = total_tokens / elapsed_time if elapsed_time > 0 else 0
        return total_tokens, elapsed_time, tokens_per_second, ttft

    except asyncio.TimeoutError:
        logging.warning(f"Request timed out after {request_timeout} seconds")
        return None
    except Exception as e:
        logging.error(f"Error during request: {str(e)}")
        return None


async def worker(
    client,
    model,
    semaphore,
    queue,
```
results,
|
||||
max_completion_tokens,
|
||||
request_timeout,
|
||||
):
|
||||
while True:
|
||||
async with semaphore:
|
||||
task_id = await queue.get()
|
||||
if task_id is None:
|
||||
queue.task_done()
|
||||
break
|
||||
logging.info(f"Starting request {task_id}")
|
||||
result = await make_request(
|
||||
client, model, max_completion_tokens, request_timeout
|
||||
)
|
||||
if result:
|
||||
results.append(result)
|
||||
else:
|
||||
logging.warning(f"Request {task_id} failed")
|
||||
queue.task_done()
|
||||
logging.info(f"Finished request {task_id}")
|
||||
|
||||
|
||||
def calculate_percentile(values, percentile, reverse=False):
|
||||
if not values:
|
||||
return None
|
||||
if reverse:
|
||||
return numpy.percentile(values, 100 - percentile)
|
||||
return numpy.percentile(values, percentile)
|
||||
|
||||
|
||||
async def preflight_check(client, model) -> bool:
|
||||
result = await make_request(client, model, 16, 60)
|
||||
return result is not None
|
||||
|
||||
|
||||
async def main(
|
||||
model,
|
||||
num_requests,
|
||||
concurrency,
|
||||
request_timeout,
|
||||
max_completion_tokens,
|
||||
server_url,
|
||||
api_key,
|
||||
):
|
||||
client = AsyncOpenAI(
|
||||
base_url=f"{server_url}/v1",
|
||||
api_key=api_key,
|
||||
http_client=http_client,
|
||||
max_retries=0,
|
||||
)
|
||||
|
||||
if not await preflight_check(client, model):
|
||||
logging.error(
|
||||
"Preflight check failed. Please check configuration and the service status."
|
||||
)
|
||||
return
|
||||
|
||||
semaphore = asyncio.Semaphore(concurrency)
|
||||
queue = asyncio.Queue()
|
||||
results = []
|
||||
|
||||
# Add tasks to the queue
|
||||
for i in range(num_requests):
|
||||
await queue.put(i)
|
||||
|
||||
# Add sentinel values to stop workers
|
||||
for _ in range(concurrency):
|
||||
await queue.put(None)
|
||||
|
||||
# Create worker tasks
|
||||
workers = [
|
||||
asyncio.create_task(
|
||||
worker(
|
||||
client,
|
||||
model,
|
||||
semaphore,
|
||||
queue,
|
||||
results,
|
||||
max_completion_tokens,
|
||||
request_timeout,
|
||||
)
|
||||
)
|
||||
for _ in range(concurrency)
|
||||
]
|
||||
|
||||
start_time = time.time()
|
||||
|
||||
# Wait for all tasks to complete
|
||||
await queue.join()
|
||||
await asyncio.gather(*workers)
|
||||
|
||||
end_time = time.time()
|
||||
|
||||
# Calculate metrics
|
||||
total_elapsed_time = end_time - start_time
|
||||
total_tokens = sum(tokens for tokens, _, _, _ in results if tokens is not None)
|
||||
latencies = [
|
||||
elapsed_time for _, elapsed_time, _, _ in results if elapsed_time is not None
|
||||
]
|
||||
tokens_per_second_list = [tps for _, _, tps, _ in results if tps is not None]
|
||||
ttft_list = [ttft for _, _, _, ttft in results if ttft is not None]
|
||||
|
||||
successful_requests = len(results)
|
||||
success_rate = successful_requests / num_requests if num_requests > 0 else 0
|
||||
requests_per_second = (
|
||||
successful_requests / total_elapsed_time if total_elapsed_time > 0 else 0
|
||||
)
|
||||
avg_latency = sum(latencies) / len(latencies) if latencies else 0
|
||||
avg_tokens_per_second = (
|
||||
sum(tokens_per_second_list) / len(tokens_per_second_list)
|
||||
if tokens_per_second_list
|
||||
else 0
|
||||
)
|
||||
overall_tokens_per_second = (
|
||||
total_tokens / total_elapsed_time if total_elapsed_time > 0 else 0
|
||||
)
|
||||
avg_ttft = sum(ttft_list) / len(ttft_list) if ttft_list else 0
|
||||
|
||||
# Calculate percentiles
|
||||
percentiles = [50, 95, 99]
|
||||
latency_percentiles = [calculate_percentile(latencies, p) for p in percentiles]
|
||||
tps_percentiles = [
|
||||
calculate_percentile(tokens_per_second_list, p, reverse=True)
|
||||
for p in percentiles
|
||||
]
|
||||
ttft_percentiles = [calculate_percentile(ttft_list, p) for p in percentiles]
|
||||
|
||||
return {
|
||||
"model": model,
|
||||
"total_requests": num_requests,
|
||||
"successful_requests": successful_requests,
|
||||
"success_rate": success_rate,
|
||||
"concurrency": concurrency,
|
||||
"request_timeout": request_timeout,
|
||||
"max_completion_tokens": max_completion_tokens,
|
||||
"total_time": total_elapsed_time,
|
||||
"requests_per_second": requests_per_second,
|
||||
"total_completion_tokens": total_tokens,
|
||||
"latency": {
|
||||
"average": avg_latency,
|
||||
"p50": latency_percentiles[0],
|
||||
"p95": latency_percentiles[1],
|
||||
"p99": latency_percentiles[2],
|
||||
},
|
||||
"tokens_per_second": {
|
||||
"overall": overall_tokens_per_second,
|
||||
"average": avg_tokens_per_second,
|
||||
"p50": tps_percentiles[0],
|
||||
"p95": tps_percentiles[1],
|
||||
"p99": tps_percentiles[2],
|
||||
},
|
||||
"time_to_first_token": {
|
||||
"average": avg_ttft,
|
||||
"p50": ttft_percentiles[0],
|
||||
"p95": ttft_percentiles[1],
|
||||
"p99": ttft_percentiles[2],
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
def output_results(results, result_file=None):
|
||||
# Round all floats in results to two decimal places for output
|
||||
def _round_floats(obj, ndigits=2):
|
||||
if isinstance(obj, dict):
|
||||
return {k: _round_floats(v, ndigits) for k, v in obj.items()}
|
||||
if isinstance(obj, list):
|
||||
return [_round_floats(v, ndigits) for v in obj]
|
||||
if isinstance(obj, float):
|
||||
return round(obj, ndigits)
|
||||
return obj
|
||||
|
||||
formatted_results = _round_floats(results, 2)
|
||||
if result_file:
|
||||
with open(result_file, "w") as f:
|
||||
json.dump(formatted_results, f, indent=2)
|
||||
logging.info(f"Results saved to {result_file}")
|
||||
else:
|
||||
print(json.dumps(formatted_results, indent=2))
|
||||
|
||||
|
||||
def set_http_client(args):
|
||||
if args.headers:
|
||||
for header in args.headers:
|
||||
if ":" not in header:
|
||||
parser.error(f"Invalid header format: {header}. Expected Key:Value")
|
||||
key, value = header.split(":", 1)
|
||||
http_client.headers[key.strip()] = value.strip()
|
||||
|
||||
http_client.timeout = args.request_timeout
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser(description="Benchmark Chat Completions API")
|
||||
parser.add_argument(
|
||||
"-m", "--model", type=str, required=True, help="Name of the model"
|
||||
)
|
||||
parser.add_argument(
|
||||
"-n",
|
||||
"--num-requests",
|
||||
type=int,
|
||||
default=100,
|
||||
help="Number of requests to make (default: 100)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-c",
|
||||
"--concurrency",
|
||||
type=int,
|
||||
default=10,
|
||||
help="Number of concurrent requests (default: 10)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--request-timeout",
|
||||
type=int,
|
||||
default=300,
|
||||
help="Timeout for each request in seconds (default: 300)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--max-completion-tokens",
|
||||
type=int,
|
||||
default=1024,
|
||||
help="Maximum number of tokens in the completion (default: 1024)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--server-url",
|
||||
type=str,
|
||||
default="http://127.0.0.1",
|
||||
help="URL of the GPUStack server",
|
||||
)
|
||||
parser.add_argument("--api-key", type=str, default="fake", help="GPUStack API key")
|
||||
parser.add_argument(
|
||||
"--result-file",
|
||||
type=str,
|
||||
help="Result file path to save benchmark json results",
|
||||
)
|
||||
parser.add_argument(
|
||||
"-H",
|
||||
"--header",
|
||||
action="append",
|
||||
dest="headers",
|
||||
help="Custom HTTP header in Key:Value format. May be specified multiple times.",
|
||||
)
|
||||
args = parser.parse_args()
|
||||
set_http_client(args)
|
||||
|
||||
results = asyncio.run(
|
||||
main(
|
||||
args.model,
|
||||
args.num_requests,
|
||||
args.concurrency,
|
||||
args.request_timeout,
|
||||
args.max_completion_tokens,
|
||||
args.server_url,
|
||||
args.api_key,
|
||||
)
|
||||
)
|
||||
output_results(results, args.result_file)
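The concurrency machinery above is a sentinel-shutdown worker pool: workers pull task IDs from an `asyncio.Queue` and exit on `None`, so `main` only needs to enqueue one sentinel per worker before `queue.join()`. A minimal sketch of that pattern in isolation (the names and the doubling "work" are illustrative, not from the script):

```python
import asyncio


async def worker(queue: asyncio.Queue, results: list) -> None:
    while True:
        task_id = await queue.get()
        if task_id is None:  # sentinel: no more work for this worker
            queue.task_done()
            break
        results.append(task_id * 2)  # stand-in for a real request
        queue.task_done()


async def run(num_tasks: int, concurrency: int) -> list:
    queue: asyncio.Queue = asyncio.Queue()
    results: list = []
    for i in range(num_tasks):
        await queue.put(i)
    for _ in range(concurrency):
        await queue.put(None)  # exactly one sentinel per worker
    workers = [
        asyncio.create_task(worker(queue, results)) for _ in range(concurrency)
    ]
    await queue.join()  # every item, including sentinels, acknowledged
    await asyncio.gather(*workers)
    return results


print(sorted(asyncio.run(run(5, 2))))  # → [0, 2, 4, 6, 8]
```

Because each sentinel is consumed by exactly one worker, `gather` returns only after every worker has observed end-of-input, which is why the scripts can safely read `results` afterwards.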
@@ -0,0 +1,654 @@
import asyncio
from dataclasses import asdict, dataclass, is_dataclass
import time
from typing import List, Optional
import aiohttp
import numpy
import logging
import argparse
import json
import random
from openai import APIConnectionError, AsyncOpenAI
from aiohttp import ClientSession
from httpx_aiohttp import AiohttpTransport
from openai import DefaultAsyncHttpxClient
from openai.types.chat import (
    ChatCompletionStreamOptionsParam,
)
from tqdm import tqdm

logging.basicConfig(
    level=logging.WARNING, format="%(asctime)s - %(levelname)s - %(message)s"
)


SAMPLE_PROMPTS = [
    "Explain how blockchain technology works, and provide a real-world example of its application outside of cryptocurrency.",
    "Compare and contrast the philosophies of Nietzsche and Kant, including their views on morality and human nature.",
    "Imagine you're a travel blogger. Write a detailed post describing a week-long adventure through rural Japan.",
    "Write a fictional letter from Albert Einstein to a modern-day physicist, discussing the current state of quantum mechanics.",
    "Provide a comprehensive explanation of how transformers work in machine learning, including attention mechanisms and positional encoding.",
    "Draft a business proposal for launching a new AI-powered productivity app, including target audience, key features, and a monetization strategy.",
    "Simulate a panel discussion between Elon Musk, Marie Curie, and Sun Tzu on the topic of 'Leadership in Times of Crisis'.",
    "Describe the process of photosynthesis in depth, and explain its importance in the global carbon cycle.",
    "Analyze the impact of social media on political polarization, citing relevant studies or historical examples.",
    "Write a short science fiction story where humans discover a parallel universe that operates under different physical laws.",
    "Explain the role of the Federal Reserve in the U.S. economy and how it manages inflation and unemployment.",
    "Describe the architecture of a modern web application, from frontend to backend, including databases, APIs, and deployment.",
    "Write an essay discussing whether artificial general intelligence (AGI) poses an existential threat to humanity.",
    "Summarize the key events and consequences of the Cuban Missile Crisis, and reflect on lessons for modern diplomacy.",
    "Create a guide for beginners on how to train a custom LLM using open-source tools and publicly available datasets.",
]


@dataclass
class PercentileResults:
    average: float
    p50: float
    p95: float
    p99: float


@dataclass
class BenchmarkResults:
    model: str
    total_requests: int
    successful_requests: int
    success_rate: float
    concurrency: int
    request_timeout: int
    max_completion_tokens: int
    total_time: float
    requests_per_second: float
    total_tokens: int
    total_prompt_tokens: int
    total_completion_tokens: int
    total_tokens_per_second: float
    total_prompt_tokens_per_second: float
    total_completion_tokens_per_second: float
    latency: PercentileResults
    completion_tokens_per_second: PercentileResults
    time_to_first_token: PercentileResults


async def process_stream(stream):
    first_token_time = None
    async for chunk in stream:
        if first_token_time is None:
            first_token_time = time.time()
        if chunk.usage:
            return first_token_time, chunk.usage
    return first_token_time, None


def get_random_prompt(prompt_multiplier):
    """
    Returns a random prompt from the SAMPLE_PROMPTS list, repeated prompt_multiplier times.
    """
    # Add a random prefix to avoid prefix cache hits
    random_prefix = str(random.randint(100000, 999999))
    return (
        random_prefix + " " + (random.choice(SAMPLE_PROMPTS) + " ") * prompt_multiplier
    )


async def make_chat_completion_request(
    client: AsyncOpenAI,
    model,
    max_completion_tokens,
    ignore_eos,
    request_timeout,
    prompt_multiplier,
):
    start_time = time.time()
    content = get_random_prompt(prompt_multiplier)
    try:
        stream = await client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": content}],
            max_completion_tokens=max_completion_tokens,
            stream=True,
            stream_options=ChatCompletionStreamOptionsParam(include_usage=True),
            extra_body={"ignore_eos": ignore_eos} if ignore_eos else None,
        )
        first_token_time, usage = await asyncio.wait_for(
            process_stream(stream), timeout=request_timeout
        )

        end_time = time.time()
        elapsed_time = end_time - start_time
        ttft = (first_token_time - start_time) * 1000 if first_token_time else None
        return usage, elapsed_time, ttft
    except asyncio.TimeoutError:
        logging.warning(f"Request timed out after {request_timeout} seconds")
        return None
    except APIConnectionError as e:
        logging.error(f"API connection error: {str(e)}")
        return None
    except Exception as e:
        logging.error(f"Error during request: {str(e)}")
        return None


async def make_embedding_request(
    client: AsyncOpenAI,
    model,
    request_timeout,
    prompt_multiplier=1,
):
    start_time = time.time()
    content = get_random_prompt(prompt_multiplier)
    try:
        response = await asyncio.wait_for(
            client.embeddings.create(model=model, input=content),
            timeout=request_timeout,
        )
        end_time = time.time()
        elapsed_time = end_time - start_time
        ttft = None  # Embeddings do not have a time to first token in the same way as chat completions

        return response.usage, elapsed_time, ttft
    except asyncio.TimeoutError:
        logging.warning(f"Embedding request timed out after {request_timeout} seconds")
        return None
    except Exception as e:
        logging.error(f"Error during embedding request: {str(e)}")
        return None


async def worker(
    client,
    model,
    semaphore,
    queue,
    results,
    max_completion_tokens,
    ignore_eos,
    request_timeout,
    embeddings=False,
    prompt_multiplier=1,
    pbar=None,
):
    while True:
        async with semaphore:
            task_id = await queue.get()
            if task_id is None:
                queue.task_done()
                break
            logging.debug(f"Starting request {task_id}")
            if embeddings:
                result = await make_embedding_request(
                    client, model, request_timeout, prompt_multiplier
                )
            else:
                result = await make_chat_completion_request(
                    client,
                    model,
                    max_completion_tokens,
                    ignore_eos,
                    request_timeout,
                    prompt_multiplier,
                )
            if result:
                results.append(result)
            else:
                logging.warning(f"Request {task_id} failed")
            queue.task_done()
            if pbar:
                pbar.update(1)
            logging.debug(f"Finished request {task_id}")


def calculate_percentile(values, percentile, reverse=False):
    if not values:
        return None
    if reverse:
        return numpy.percentile(values, 100 - percentile)
    return numpy.percentile(values, percentile)


async def preflight_check(client, model, embeddings=False) -> bool:
    if embeddings:
        result = await make_embedding_request(client, model, 16)
    else:
        result = await make_chat_completion_request(client, model, 16, False, 60, 1)
    return result is not None


def set_headers(aiohttp_session: ClientSession, headers: Optional[List[str]]):
    if headers:
        for header in headers:
            if ":" not in header:
                raise ValueError(f"Invalid header format: {header}. Expected Key:Value")
            key, value = header.split(":", 1)
            aiohttp_session.headers[key.strip()] = value.strip()


async def main(
    model,
    num_requests,
    concurrency,
    request_timeout,
    max_completion_tokens,
    ignore_eos,
    server_url,
    api_key,
    headers=None,
    embeddings=False,
    prompt_multiplier=1,
) -> Optional[BenchmarkResults]:
    connector = aiohttp.TCPConnector(
        limit=2000,
        force_close=True,
    )
    async with ClientSession(connector=connector, trust_env=True) as aiohttp_session:
        if headers:
            set_headers(aiohttp_session, headers)
        transport = AiohttpTransport(client=aiohttp_session)
        httpx_client = DefaultAsyncHttpxClient(
            transport=transport, timeout=request_timeout
        )
        client = AsyncOpenAI(
            base_url=f"{server_url}/v1",
            api_key=api_key,
            http_client=httpx_client,
            max_retries=0,
        )

        if not await preflight_check(client, model, embeddings=embeddings):
            raise Exception(
                "Preflight check failed. Please check configuration and the service status."
            )

        semaphore = asyncio.Semaphore(concurrency)
        queue = asyncio.Queue()
        results = []

        # Add tasks to the queue
        for i in range(num_requests):
            await queue.put(i)

        # Add sentinel values to stop workers
        for _ in range(concurrency):
            await queue.put(None)

        pbar = tqdm(
            total=num_requests,
            desc="Running Benchmark requests",
            unit="request",
            dynamic_ncols=True,
        )

        # Create worker tasks
        workers = [
            asyncio.create_task(
                worker(
                    client,
                    model,
                    semaphore,
                    queue,
                    results,
                    max_completion_tokens,
                    ignore_eos,
                    request_timeout,
                    embeddings,
                    prompt_multiplier,
                    pbar=pbar,
                )
            )
            for _ in range(concurrency)
        ]

        start_time = time.time()

        # Wait for all tasks to complete
        await queue.join()
        await asyncio.gather(*workers)
        pbar.close()

        end_time = time.time()
        total_elapsed_time = end_time - start_time
        return calculate_results(
            model,
            concurrency,
            request_timeout,
            max_completion_tokens,
            total_elapsed_time,
            num_requests,
            results,
        )


def calculate_results(
    model,
    concurrency,
    request_timeout,
    max_completion_tokens,
    total_elapsed_time,
    num_requests,
    results,
):
    # Calculate metrics
    total_tokens = 0
    prompt_tokens = 0
    completion_tokens = 0
    tokens_per_second_list = []
    prompt_tokens_per_second_list = []
    completion_tokens_per_second_list = []
    for usage, elapsed_time, _ in results:
        if usage is not None:
            # Embedding usage objects carry no completion_tokens; default to 0.
            usage_completion_tokens = getattr(usage, "completion_tokens", 0)
            total_tokens += usage.total_tokens
            prompt_tokens += usage.prompt_tokens
            completion_tokens += usage_completion_tokens
            prompt_tokens_per_second = (
                usage.prompt_tokens / elapsed_time if elapsed_time > 0 else 0
            )
            completion_tokens_per_second = (
                usage_completion_tokens / elapsed_time if elapsed_time > 0 else 0
            )
            tokens_per_second = (
                usage.total_tokens / elapsed_time if elapsed_time > 0 else 0
            )
            tokens_per_second_list.append(tokens_per_second)
            prompt_tokens_per_second_list.append(prompt_tokens_per_second)
            completion_tokens_per_second_list.append(completion_tokens_per_second)

    latencies = [
        elapsed_time for _, elapsed_time, _ in results if elapsed_time is not None
    ]
    ttft_list = [ttft for _, _, ttft in results if ttft is not None]

    successful_requests = len(results)
    success_rate = successful_requests / num_requests if num_requests > 0 else 0
    requests_per_second = (
        successful_requests / total_elapsed_time if total_elapsed_time > 0 else 0
    )
    avg_latency = sum(latencies) / len(latencies) if latencies else 0
    avg_completion_tokens_per_second = (
        sum(completion_tokens_per_second_list) / len(completion_tokens_per_second_list)
        if completion_tokens_per_second_list
        else 0
    )
    total_tokens_per_second = (
        total_tokens / total_elapsed_time if total_elapsed_time > 0 else 0
    )
    total_prompt_tokens_per_second = (
        prompt_tokens / total_elapsed_time if total_elapsed_time > 0 else 0
    )
    total_completion_tokens_per_second = (
        completion_tokens / total_elapsed_time if total_elapsed_time > 0 else 0
    )
    avg_ttft = sum(ttft_list) / len(ttft_list) if ttft_list else 0

    # Calculate percentiles
    percentiles = [50, 95, 99]
    latency_percentiles = [calculate_percentile(latencies, p) for p in percentiles]
    completion_tps_percentiles = [
        calculate_percentile(completion_tokens_per_second_list, p, reverse=True)
        for p in percentiles
    ]
    ttft_percentiles = [calculate_percentile(ttft_list, p) for p in percentiles]

    return BenchmarkResults(
        model=model,
        total_requests=num_requests,
        successful_requests=successful_requests,
        success_rate=success_rate,
        concurrency=concurrency,
        request_timeout=request_timeout,
        max_completion_tokens=max_completion_tokens,
        total_time=total_elapsed_time,
        requests_per_second=requests_per_second,
        total_tokens=total_tokens,
        total_prompt_tokens=prompt_tokens,
        total_completion_tokens=completion_tokens,
        total_tokens_per_second=total_tokens_per_second,
        total_prompt_tokens_per_second=total_prompt_tokens_per_second,
        total_completion_tokens_per_second=total_completion_tokens_per_second,
        latency=PercentileResults(
            average=avg_latency,
            p50=latency_percentiles[0],
            p95=latency_percentiles[1],
            p99=latency_percentiles[2],
        ),
        completion_tokens_per_second=PercentileResults(
            average=avg_completion_tokens_per_second,
            p50=completion_tps_percentiles[0],
            p95=completion_tps_percentiles[1],
            p99=completion_tps_percentiles[2],
        ),
        time_to_first_token=PercentileResults(
            average=avg_ttft,
            p50=ttft_percentiles[0],
            p95=ttft_percentiles[1],
            p99=ttft_percentiles[2],
        ),
    )


def fmt_line(label, *values, width=40):
    label_part = f"{label:<{width}}"
    value_part = " ".join(str(v) for v in values)
    return f"{label_part}{value_part}"


def fmt_float(v, suffix=""):
    return f"{v:.2f}{suffix}"


def output_benchmark_results_pretty(
    results: BenchmarkResults, file: Optional[str] = None, embeddings: bool = False
):
    lines = []
    lines.append("============== Serving Benchmark Result ===============")
    lines.append(fmt_line("Model:", results.model))
    lines.append(
        fmt_line(
            "Total requests:",
            f"{results.successful_requests}/{results.total_requests}({results.success_rate:.2%})",
        )
    )
    lines.append(fmt_line("Concurrency:", results.concurrency))
    lines.append(fmt_line("Benchmark duration (s):", fmt_float(results.total_time)))
    lines.append(
        fmt_line("Request throughput (req/s):", fmt_float(results.requests_per_second))
    )
    lines.append(fmt_line("Total input tokens:", results.total_prompt_tokens))
    if not embeddings:
        lines.append(fmt_line("Total output tokens:", results.total_completion_tokens))

    output_tok_per_sec = (
        results.total_completion_tokens / results.total_time
        if results.total_time > 0
        else 0
    )
    total_tok_per_sec = (
        results.total_tokens / results.total_time if results.total_time > 0 else 0
    )
    if not embeddings:
        lines.append(
            fmt_line("Output token throughput (tok/s):", fmt_float(output_tok_per_sec))
        )
    lines.append(
        fmt_line("Total token throughput (tok/s):", fmt_float(total_tok_per_sec))
    )
    lines.append("------------------- Request Latency -------------------")
    lines.append(fmt_line("Average latency (s):", fmt_float(results.latency.average)))
    lines.append(fmt_line("P50 latency (s):", fmt_float(results.latency.p50)))
    lines.append(fmt_line("P95 latency (s):", fmt_float(results.latency.p95)))
    lines.append(fmt_line("P99 latency (s):", fmt_float(results.latency.p99)))
    if not embeddings:
        lines.append("--------------- Output Token Per Second ---------------")
        lines.append(
            fmt_line(
                "Average TPS (tok/s):",
                fmt_float(results.completion_tokens_per_second.average),
            )
        )
        lines.append(
            fmt_line(
                "P50 TPS (tok/s):", fmt_float(results.completion_tokens_per_second.p50)
            )
        )
        lines.append(
            fmt_line(
                "P95 TPS (tok/s):", fmt_float(results.completion_tokens_per_second.p95)
            )
        )
        lines.append(
            fmt_line(
                "P99 TPS (tok/s):", fmt_float(results.completion_tokens_per_second.p99)
            )
        )

        # Embeddings have no TTFT, so the percentiles would be None; skip the section.
        lines.append("----------------- Time to First Token -----------------")
        lines.append(
            fmt_line(
                "Average TTFT (ms):", fmt_float(results.time_to_first_token.average)
            )
        )
        lines.append(
            fmt_line("P50 TTFT (ms):", fmt_float(results.time_to_first_token.p50))
        )
        lines.append(
            fmt_line("P95 TTFT (ms):", fmt_float(results.time_to_first_token.p95))
        )
        lines.append(
            fmt_line("P99 TTFT (ms):", fmt_float(results.time_to_first_token.p99))
        )
    lines.append("=" * 55)

    output = "\n".join(lines)

    if file:
        with open(file, "w") as f:
            f.write(output + "\n")
        logging.info(f"Pretty benchmark results saved to {file}")
    else:
        print(output)


def output_benchmark_results_json(
    results: BenchmarkResults, result_file=None, embeddings: bool = False
):
    # Round all floats in results to two decimal places for output
    def _round_floats(obj, ndigits=2):
        if is_dataclass(obj):
            obj = asdict(obj)
        if isinstance(obj, dict):
            return {k: _round_floats(v, ndigits) for k, v in obj.items()}
        if isinstance(obj, list):
            return [_round_floats(v, ndigits) for v in obj]
        if isinstance(obj, float):
            return round(obj, ndigits)
        return obj

    formatted_results = _round_floats(results, 2)
    if result_file:
        with open(result_file, "w") as f:
            json.dump(formatted_results, f, indent=2)
        logging.info(f"Results saved to {result_file}")
    else:
        print(json.dumps(formatted_results, indent=2))


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Benchmark Chat Completions API")
    parser.add_argument(
        "-m", "--model", type=str, required=True, help="Name of the model"
    )
    parser.add_argument(
        "-n",
        "--num-requests",
        type=int,
        default=100,
        help="Number of requests to make (default: 100)",
    )
    parser.add_argument(
        "-c",
        "--concurrency",
        type=int,
        default=10,
        help="Number of concurrent requests (default: 10)",
    )
    parser.add_argument(
        "--request-timeout",
        type=int,
        default=300,
        help="Timeout for each request in seconds (default: 300)",
    )
    parser.add_argument(
        "--max-completion-tokens",
        type=int,
        default=1024,
        help="Maximum number of tokens in the completion (default: 1024)",
    )
    parser.add_argument(
        "--prompt-multiplier",
        type=int,
        default=1,
        help="Repeat the randomly selected prompt N times to create longer inputs",
    )
    parser.add_argument(
        "--ignore-eos",
        action="store_true",
        help="Set ignore_eos flag when sending the benchmark request. This will not stop the stream when the model generates an EOS token.",
    )
    parser.add_argument(
        "--server-url",
        type=str,
        default="http://127.0.0.1",
        help="URL of the GPUStack server",
    )
    parser.add_argument("--api-key", type=str, default="fake", help="GPUStack API key")
    parser.add_argument(
        "--result-file",
        type=str,
        help="Result file path to save benchmark json results",
    )
    parser.add_argument(
        "-H",
        "--header",
        action="append",
        dest="headers",
        help="Custom HTTP header in Key:Value format. May be specified multiple times.",
    )
    parser.add_argument(
        "--embeddings",
        action="store_true",
        help="Run embedding benchmark instead of chat completions",
    )
    parser.add_argument(
        "--json",
        action="store_true",
        help="Output results in JSON format instead of pretty format",
    )
    args = parser.parse_args()

    try:
        results = asyncio.run(
            main(
                args.model,
                args.num_requests,
                args.concurrency,
                args.request_timeout,
                args.max_completion_tokens,
                args.ignore_eos,
                args.server_url,
                args.api_key,
                args.headers,
                args.embeddings,
                args.prompt_multiplier,
            )
        )
        if args.json:
            output_benchmark_results_json(
                results, args.result_file, embeddings=args.embeddings
            )
        else:
            output_benchmark_results_pretty(
                results, args.result_file, embeddings=args.embeddings
            )
    except Exception as e:
        logging.error(f"Benchmarking failed: {str(e)}")
        exit(1)
|
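Because `-H/--header` uses `action="append"`, repeated flags accumulate into a list of raw `Key:Value` strings that the script must split itself. A minimal sketch of that parsing step; `parse_headers` is a hypothetical helper for illustration, not part of the source script:

```python
import argparse


def parse_headers(pairs):
    # Hypothetical helper: turn ["Key: Value", ...] into a dict.
    headers = {}
    for item in pairs or []:  # argparse leaves dest as None if -H is never given
        key, sep, value = item.partition(":")
        if not sep:
            raise ValueError(f"Invalid header (expected Key:Value): {item!r}")
        headers[key.strip()] = value.strip()
    return headers


parser = argparse.ArgumentParser()
parser.add_argument("-H", "--header", action="append", dest="headers")
# Each -H occurrence appends another raw string to args.headers.
args = parser.parse_args(["-H", "Authorization: Bearer token", "-H", "X-Trace: 1"])
headers = parse_headers(args.headers)
```

`partition(":")` splits only on the first colon, so values containing colons (e.g. URLs) survive intact.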
@@ -0,0 +1,26 @@
aiohappyeyeballs==2.6.1
aiohttp==3.12.13
aiosignal==1.3.2
annotated-types==0.7.0
anyio==4.9.0
attrs==25.3.0
certifi==2025.6.15
distro==1.9.0
frozenlist==1.7.0
h11==0.16.0
httpcore==1.0.9
httpx==0.28.1
httpx-aiohttp==0.1.6
idna==3.10
jiter==0.10.0
multidict==6.5.1
numpy==2.3.1
openai==1.92.2
propcache==0.3.2
pydantic==2.11.7
pydantic_core==2.33.2
sniffio==1.3.1
tqdm==4.67.1
typing-inspection==0.4.1
typing_extensions==4.14.0
yarl==1.20.1
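The pinned list above is a standard pip requirements file; a minimal setup sketch, assuming it is saved as `requirements.txt` next to the benchmark script:

```shell
# Create an isolated environment and install the exact pinned versions.
python -m venv .venv
. .venv/bin/activate
pip install -r requirements.txt
```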
@@ -0,0 +1,21 @@
import shutil
import tempfile

import pytest

from gpustack.config.config import Config, set_global_config


@pytest.fixture(scope="module", autouse=True)
def temp_dir():
    tmp_dir = tempfile.mkdtemp()
    print(f"Created temporary directory: {tmp_dir}")
    yield tmp_dir
    shutil.rmtree(tmp_dir)


@pytest.fixture(scope="module", autouse=True)
def config(temp_dir):
    cfg = Config(
        token="test", jwt_secret_key="test", data_dir=temp_dir, enable_ray=True
    )
    set_global_config(cfg)
    return cfg
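The module-scoped `temp_dir` fixture above creates the directory once per test module; everything after `yield` runs as teardown after the module's last test. The same setup/teardown lifecycle can be sketched without pytest as a plain context manager:

```python
import contextlib
import os
import shutil
import tempfile


@contextlib.contextmanager
def temp_dir():
    # Setup: create the directory (runs once, like the fixture body before yield).
    path = tempfile.mkdtemp()
    try:
        yield path
    finally:
        # Teardown: remove it when the block exits (like the code after yield).
        shutil.rmtree(path)


with temp_dir() as d:
    exists_inside = os.path.isdir(d)  # True while "tests" run
exists_after = os.path.isdir(d)  # False once teardown has run
```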
(Binary image changes: numerous documentation screenshots were added or updated in this comparison; the diff records only before/after file sizes, not filenames.)