Compare commits

..

29 Commits

Author SHA1 Message Date
gitlawr 899bea6697 ci: pin action-gh-release version
7 months ago
thxCode ef03d971e3 ci(npu): adjust processing
7 months ago
gitlawr 3891135a95 fix: disallow abbrev parsing backend parameters
7 months ago
thxCode 79eef89ca3 refactor(npu): tidy up mindie and vllm
7 months ago
thxCode 95f77e4921 refactor: mindie turbo support
7 months ago
gitlawr bdebba4215 chore: update llama-box to v0.0.154
7 months ago
gitlawr 78d71882fa chore: update llama-box to v0.0.153
7 months ago
linyinli a63683c768 fix: distributed inference for vLLM Ascend
7 months ago
gitlawr 5688ffb457 fix: exception on log file not ready
7 months ago
linyinli ace9f8451f fix: remove incompatible Qwen3 config for vLLM Ascend
7 months ago
gitlawr f459045cc3 feat: add r1 0528 to catalog
7 months ago
thxCode 234f5049be chore(tool): bump version
7 months ago
gitlawr c356d546a0 ci: drop cuda11.8
7 months ago
linyinli 5341d63230 feat: support vllm ascend
7 months ago
Terada Kousuke 8ee9099856 Add Japanese README
7 months ago
Yuxing Deng 588063fbfe feat: add support for built binary to run mulitprocessing
7 months ago
peiyuan.zhang a3af7dda24 remove Installation Script
7 months ago
peiyuan.zhang 8a41aaa6a9 support iluvatar
7 months ago
thxCode 12abfbe858 refactor(llama-box): enable max projected cache by default
7 months ago
thxCode 940c2cdfc7 chore(tool): bump version
7 months ago
thxCode 6296336bcb refactor(scheduler): params processing in gguf-parser
7 months ago
thxCode dd879a988c chore(tool): bump version
7 months ago
thxCode 13c75b5bdd refactor(catalog): enable visual max image size in pixtral
7 months ago
thxCode c25236e7a0 chore(tool): bump version
7 months ago
thxCode 4ed6e1a223 refactor(llama-box): get downloaded platform name
7 months ago
thxCode 42919d734a ci: docker build cache
7 months ago
gitlawr 77b21c09cc ci: update pr tigger branches
7 months ago
gitlawr 5a063e1c91 chore: update vox-box
7 months ago
gitlawr 91695f48f3 ci: use tag ui on release
7 months ago

.gitattributes

@ -1,3 +1,3 @@
install.ps1.sha256sum text eol=lf
* text=auto eol=lf
*.tar.gz filter=lfs diff=lfs merge=lfs -text
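For reference, `git check-attr` shows how attribute patterns like the ones above resolve for a concrete path; a minimal sketch (the tarball name is a made-up example, not a file from this repository):

```bash
# List every attribute that applies to the given paths under the current .gitattributes.
# Expect the eol=lf setting on text files and the lfs filter/diff/merge on *.tar.gz.
git check-attr -a -- install.ps1.sha256sum example-archive.tar.gz
```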

@ -60,7 +60,7 @@ jobs:
retention-days: 5
- name: Release GitHub Assets
uses: softprops/action-gh-release@v2
uses: softprops/action-gh-release@v2.2.2
if: startsWith(github.ref, 'refs/tags/') && matrix.python-version == '3.11' && matrix.os == 'linux'
with:
# Draft for official releases to prepare and review release notes before publishing

@ -15,15 +15,18 @@ on:
- "**.png"
- "**.jpg"
- "**.gif"
- "pack/**.base"
- "Dockerfile.rocm.base"
- "Dockerfile.dcu.base"
pull_request:
branches:
- main
- "v*-dev"
paths:
- "Dockerfile"
- "Dockerfile.*"
- ".github/workflows/docker-ci.yaml"
- "pack/**"
- "!pack/**.base"
- "!Dockerfile.rocm.base"
- "!Dockerfile.dcu.base"
jobs:
publish-docker:
@ -45,16 +48,12 @@ jobs:
tag_suffix: ""
build_args:
- "CUDA_VERSION=12.4.1"
- "CUDA_DEVEL_VERSION=12.6.3"
- "FLASHINFER_BUILD_MAX_JOBS=1"
- device: cuda
dockerfile: "Dockerfile"
platforms: "linux/amd64,linux/arm64"
tag_suffix: "-cuda12.8"
build_args:
- "CUDA_VERSION=12.8.1"
- "CUDA_DEVEL_VERSION=12.8.1"
- "FLASHINFER_BUILD_MAX_JOBS=1"
#
# HIP RoCM
#
@ -84,7 +83,7 @@ jobs:
#
- device: musa
dockerfile: "Dockerfile.musa"
platforms: "linux/amd64"
platforms: "linux/amd64,linux/arm64"
tag_suffix: "-musa"
build_args: []
#
@ -116,7 +115,6 @@ jobs:
- name: Checkout
uses: actions/checkout@v4
with:
lfs: true
fetch-depth: 1
persist-credentials: false
- name: Maximize Docker Build Space
@ -161,66 +159,12 @@ jobs:
echo "$arg" >> $GITHUB_OUTPUT
done
echo "EOF" >> $GITHUB_OUTPUT
- name: Get Cache Ref
id: cache-ref
run: |
#
# Use different cache ref for different branches.
#
# Examples:
# CACHE_FROM_REF
# - vX.Y.Z -> "gpustack/build-cache:gpustack${TAG_SUFFIX}-${VERSION|DEFAULT_BRANCH}"
# - PR/PUSH to branch -> "gpustack/build-cache:gpustack${TAG_SUFFIX}-${BRANCH|DEFAULT_BRANCH}"
# CACHE_TO_REF
# - vX.Y.Z -> "gpustack/build-cache:gpustack${TAG_SUFFIX}-${VERSION}"
# - PUSH to branch -> "gpustack/build-cache:gpustack${TAG_SUFFIX}-${BRANCH}"
#
# Stories(device cpu):
# CACHE_FROM_REF
# - Release tag v0.7.0rc1 -> gpustack/build-cache:gpustack-cpu-v0.7, if not found, fallback to gpustack/build-cache:gpustack-cpu-main
# - Release tag v0.7.0 -> gpustack/build-cache:gpustack-cpu-v0.7
# - PR to "main" branch -> gpustack/build-cache:gpustack-cpu-main
# - PR to "v0.7-dev" branch -> gpustack/build-cache:gpustack-cpu-v0.7, if not found, fallback to gpustack/build-cache:gpustack-cpu-main
# - Push to "main" branch -> gpustack/build-cache:gpustack-cpu-main
# - Push to "v0.7-dev" branch -> gpustack/build-cache:gpustack-cpu-v0.7, if not found, fallback to gpustack/build-cache:gpustack-cpu-main
# CACHE_TO_REF
# - Release tag v0.7.0rc1 -> gpustack/build-cache:gpustack-cpu-v0.7
# - Release tag v0.7.0 -> gpustack/build-cache:gpustack-cpu-v0.7
# - PR to "main" branch -> gpustack/build-cache:gpustack-cpu-main
# - PR to "v0.7-dev" branch -> gpustack/build-cache:gpustack-cpu-v0.7
# - Push to "main" branch -> gpustack/build-cache:gpustack-cpu-main
# - Push to "v0.7-dev" branch -> gpustack/build-cache:gpustack-cpu-v0.7
DEFAULT_BRANCH="main"
TAG_SUFFIX="${{ matrix.tag_suffix }}"
if [[ "${GITHUB_REF}" == refs/tags/* ]]; then
REF="${GITHUB_REF#refs/tags/}"
IFS="." read -r VERSION_MAJOR VERSION_MINOR VERSION_PATCH <<< "${REF}"
VERSION="${VERSION_MAJOR}.${VERSION_MINOR}"
CACHE_FROM_REF="gpustack/build-cache:gpustack${TAG_SUFFIX}-${VERSION}"
CACHE_TO_REF="${CACHE_FROM_REF}"
else
REF="${GITHUB_BASE_REF:-${GITHUB_REF}}"
BRANCH="${REF#refs/heads/}"
BRANCH="${BRANCH%-dev}"
CACHE_FROM_REF="gpustack/build-cache:gpustack${TAG_SUFFIX}-${BRANCH}"
CACHE_TO_REF="${CACHE_FROM_REF}"
fi
if ! docker manifest inspect "${CACHE_FROM_REF}" >/dev/null 2>&1; then
CACHE_FROM_REF="gpustack/build-cache:gpustack${TAG_SUFFIX}-${DEFAULT_BRANCH}"
fi
echo "CACHE_FROM_REF=${CACHE_FROM_REF}" >> $GITHUB_ENV
echo "CACHE_TO_REF=${CACHE_TO_REF}" >> $GITHUB_ENV
echo "DEBUG: GITHUB_BASE_REF=${GITHUB_BASE_REF}"
echo "DEBUG: GITHUB_REF=${GITHUB_REF}"
echo "DEBUG: TAG_SUFFIX=${TAG_SUFFIX}"
echo "DEBUG: CACHE_FROM_REF=${CACHE_FROM_REF}"
echo "DEBUG: CACHE_TO_REF=${CACHE_TO_REF}"
- name: Package
uses: docker/build-push-action@v6
id: package
with:
push: ${{ github.event_name != 'pull_request' }}
file: ${{ github.workspace }}/pack/${{ matrix.dockerfile }}
file: ${{ github.workspace }}/${{ matrix.dockerfile }}
context: ${{ github.workspace }}
platforms: ${{ matrix.platforms }}
tags: ${{ steps.metadata.outputs.tags }}
@ -230,6 +174,6 @@ jobs:
build-args: |
${{ steps.build-args.outputs.BUILD_ARGS }}
cache-from: |
type=registry,ref=${{ env.CACHE_FROM_REF }}
type=registry,ref=gpustack/build-cache:gpustack${{ matrix.tag_suffix }}
cache-to: |
${{ github.event_name != 'pull_request' && format('type=registry,mode=max,compression=gzip,ref={0},ignore-error=true', env.CACHE_TO_REF) || '' }}
${{ github.event_name != 'pull_request' && format('type=registry,mode=max,compression=gzip,ref=gpustack/build-cache:gpustack{0},ignore-error=true', matrix.tag_suffix) || '' }}
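For context, the cache settings above map directly onto plain `docker buildx build` flags; a rough sketch of the equivalent manual invocation for the cuda12.8 matrix entry (the image tag and the push behavior are assumptions for illustration):

```bash
# Registry-backed build cache keyed only by the tag suffix, as in the workflow above.
docker buildx build \
  --file Dockerfile \
  --platform linux/amd64,linux/arm64 \
  --build-arg CUDA_VERSION=12.8.1 \
  --cache-from type=registry,ref=gpustack/build-cache:gpustack-cuda12.8 \
  --cache-to type=registry,mode=max,compression=gzip,ref=gpustack/build-cache:gpustack-cuda12.8,ignore-error=true \
  --tag gpustack/gpustack:dev-cuda12.8 \
  --push .
```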

@ -4,14 +4,12 @@ on:
push:
branches:
- main
- "v*-dev"
paths:
- "install.ps1"
- ".github/workflows/install-script-windows.yml"
pull_request:
branches:
- main
- "v*-dev"
paths:
- "install.ps1"
- ".github/workflows/install-script-windows.yml"
@ -76,7 +74,7 @@ jobs:
$env:INSTALL_PACKAGE_SPEC = [System.IO.Path]::Combine("dist", $env:whlPackageName)
Write-Host "INSTALL_PACKAGE_SPEC: $env:INSTALL_PACKAGE_SPEC"
Write-Host "AppData $env:APPDATA"
# Use port 8080 since 80 is occupied by the System
./install.ps1 -ServerPort 8080
@ -102,7 +100,8 @@ jobs:
Start-Sleep -Seconds $retryDelaySeconds
}
}
if ($responseCode -ne 200) {
Write-Host "All retry attempts failed. Last error: $lastError"
}

@ -4,14 +4,12 @@ on:
push:
branches:
- main
- "v*-dev"
paths:
- "install.sh"
- ".github/workflows/install-script.yml"
pull_request:
branches:
- main
- "v*-dev"
paths:
- "install.sh"
- ".github/workflows/install-script.yml"

@ -11,7 +11,6 @@ repos:
hooks:
- id: flake8
exclude: ".*/migrations"
args: [--max-complexity=15]
- repo: https://github.com/psf/black
rev: 24.4.2
hooks:

@ -0,0 +1,44 @@
ARG CUDA_VERSION=12.4.1
ARG CUDA_TAG_SUFFIX=-cudnn-runtime-ubuntu22.04
FROM nvidia/cuda:${CUDA_VERSION}${CUDA_TAG_SUFFIX}
ARG TARGETPLATFORM
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update && apt-get install -y \
git \
curl \
wget \
tzdata \
iproute2 \
python3 \
python3-pip \
python3-venv \
tini \
&& rm -rf /var/lib/apt/lists/*
COPY . /workspace/gpustack
RUN cd /workspace/gpustack && \
make build
ARG VLLM_VERSION=0.8.5.post1
RUN <<EOF
if [ "$TARGETPLATFORM" = "linux/amd64" ]; then
# Install vllm dependencies for x86_64
WHEEL_PACKAGE="$(ls /workspace/gpustack/dist/*.whl)[all]";
else
WHEEL_PACKAGE="$(ls /workspace/gpustack/dist/*.whl)[audio]";
fi
pip install pipx
pip install $WHEEL_PACKAGE
pip cache purge
rm -rf /workspace/gpustack
EOF
RUN gpustack download-tools
# Download dac weights used by audio models like Dia
RUN python3 -m dac download
ENTRYPOINT [ "tini", "--", "gpustack", "start" ]

@ -1,13 +1,13 @@
FROM crpi-thyzhdzt86bexebt.cn-hangzhou.personal.cr.aliyuncs.com/gpustack_ai/gpustack:iluvator-corex4.2.0-vllm0.8.3-py3.10 AS build
FROM crpi-92uj7jb20gffz04j.cn-guangzhou.personal.cr.aliyuncs.com/iluvatar_common/vllm0.8.3-4.2.0:v1 AS build
RUN apt-get update && apt-get install -y \
git \
curl
COPY .. /workspace/gpustack
COPY . /workspace/gpustack
RUN cd /workspace/gpustack && make build
FROM crpi-thyzhdzt86bexebt.cn-hangzhou.personal.cr.aliyuncs.com/gpustack_ai/gpustack:iluvator-corex4.2.0-vllm0.8.3-py3.10 AS runtime
FROM crpi-92uj7jb20gffz04j.cn-guangzhou.personal.cr.aliyuncs.com/iluvatar_common/vllm0.8.3-4.2.0:v1 AS runtime
RUN apt-get update && apt-get install -y \
python3 \

@ -0,0 +1,32 @@
FROM ubuntu:22.04
ARG TARGETPLATFORM
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update && apt-get install -y \
git \
curl \
wget \
tzdata \
iproute2 \
python3 \
python3-pip \
python3-venv \
tini \
&& rm -rf /var/lib/apt/lists/*
COPY . /workspace/gpustack
RUN cd /workspace/gpustack && \
make build && \
WHEEL_PACKAGE="$(ls /workspace/gpustack/dist/*.whl)[audio]" && \
pip install pipx && \
pip install $WHEEL_PACKAGE && \
pip cache purge && \
rm -rf /workspace/gpustack
RUN gpustack download-tools
# Download dac weights used by audio models like Dia
RUN python3 -m dac download
ENTRYPOINT [ "tini", "--", "gpustack", "start" ]

@ -1,10 +1,13 @@
FROM crpi-thyzhdzt86bexebt.cn-hangzhou.personal.cr.aliyuncs.com/gpustack_ai/gpustack:hygon-dtk25.04.1-vllm0.8.5-py3.10
FROM image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.4.1-ubuntu22.04-dtk25.04-py3.10-fixpy
ENV PATH="/root/.local/bin:$PATH"
ENV DEBIAN_FRONTEND=noninteractive
RUN pip install https://download.sourcefind.cn:65024/file/4/lmslim/DAS1.5/lmslim-0.2.1+das.dtk2504-cp310-cp310-manylinux_2_28_x86_64.whl \
&& pip install https://download.sourcefind.cn:65024/file/4/vllm/DAS1.5/vllm-0.7.2+das.opt1.dtk2504-cp310-cp310-manylinux_2_28_x86_64.whl \
&& pip cache purge
RUN apt-get update && apt-get install -y \
git git-lfs \
python3-venv \
tzdata \
iproute2 \

@ -1,7 +1,7 @@
ARG UBUNTU_VERSION=22.04
ARG MUSA_VERSION=rc4.2.0
ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}-amd64
ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}-amd64
ARG MUSA_VERSION=rc4.0.1
ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-mudnn-devel-ubuntu${UBUNTU_VERSION}
ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-mudnn-runtime-ubuntu${UBUNTU_VERSION}
FROM ${BASE_MUSA_DEV_CONTAINER} AS build
@ -9,10 +9,10 @@ ARG TARGETPLATFORM
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update && apt-get install -y \
git git-lfs \
git \
curl
COPY .. /workspace/gpustack
COPY . /workspace/gpustack
RUN cd /workspace/gpustack && make build
FROM ${BASE_MUSA_RUN_CONTAINER} AS runtime

@ -1,20 +1,17 @@
# Packaging logic:
# 1. base target:
# - Install tools.
# - Upgrade GCC if needed.
# - Install C buildkit.
# - Upgrade Python if needed.
# - Install Python buildkit.
# - Install CANN buildkit if needed.
# - Install/Upgrade tools, including Python, GCC[optional], CMake, Make, SCCache and dependencies.
# - Install specific version Ascend CANN according to the chip, including Toolkit, Kernels and NNAL.
# 2.1. mindie-install target:
# - Add ATB models.
# - Copy ATB models from a fixed image.
# - Install dependencies for MindIE into system site packages, including Torch, Torch-NPU and TorchVision,
# which is used to support multi-versions of MindIE.
# - Create a virtual environment to place MindIE: /opt/venvs/mindie.
# - Create a virtual environment to place MindIE: $(pipx environment --value PIPX_LOCAL_VENVS)/mindie.
# - Install specific version MindIE.
# 2.2. vllm-install target (parallel against mindie-install):
# - Create a virtual environment to place vLLM: /opt/venvs/vllm.
# - Create a virtual environment to place vLLM: $(pipx environment --value PIPX_LOCAL_VENVS)/vllm.
# - Install specific version Torch, Torch-NPU and TorchVision.
# - Install specific version MindIE Turbo.
# - Install specific version vLLM and vLLM Ascend.
# 3. gpustack target (final):
# - Install GPUStack, and override the required dependencies after installed.
@ -29,43 +26,38 @@
# - MINDIE_VERSION is the version of Ascend MindIE,
# which is used to install the Ascend MindIE,
# please check https://www.hiascend.com/developer/download/community/result?module=ie%2Bpt%2Bcann for details.
# - MINDIE_TORCH_VERSION is the version of Torch used by Ascend MindIE.
# - VLLM_VERSION is the version of vLLM,
# which is used to install the vLLM.
# - VLLM_TORCH_VERSION is the version of Torch used by vLLM.
# which is used to install the vLLM,
# - VLLM_ASCEND_VERSION is the version of vLLM Ascend,
# which is used to install the vLLM Ascend,
# please check https://vllm-ascend.readthedocs.io/en/stable/installation.html for details.
# - PYTHON_VERSION is the version of Python,
# which should be properly set, it must be 3.x.
ARG CANN_VERSION=8.2.rc1
ARG CANN_VERSION=8.1.rc1.beta1
ARG CANN_CHIP=910b
ARG MINDIE_VERSION=2.1.rc1
ARG MINDIE_TORCH_VERSION=2.1.0
ARG VLLM_VERSION=0.10.0
ARG VLLM_TORCH_VERSION=2.7.1
ARG VLLM_ASCEND_VERSION=0.10.0rc1
ARG MINDIE_VERSION=2.0.rc1
ARG VLLM_VERSION=0.7.3
ARG VLLM_ASCEND_VERSION=0.7.3.post1
ARG PYTHON_VERSION=3.11
#
# Stage Base
#
# Example build command:
# docker build --tag=gpustack/gpustack:npu-base --file=pack/Dockerfile.npu --target base --progress=plain .
# docker build --tag=gpustack/gpustack:npu-base --file=Dockerfile.npu --target base --progress=plain .
#
FROM quay.io/ascend/cann:${CANN_VERSION}-${CANN_CHIP}-ubuntu22.04-py${PYTHON_VERSION} AS base
FROM ubuntu:22.04 AS base
SHELL ["/bin/bash", "-eo", "pipefail", "-c"]
ARG TARGETPLATFORM
ARG TARGETOS
ARG TARGETARCH
ENV DEBIAN_FRONTEND=noninteractive \
LANG='en_US.UTF-8' \
LANGUAGE='en_US:en' \
LC_ALL='en_US.UTF-8'
## Install Tools
ENV DEBIAN_FRONTEND=noninteractive
RUN <<EOF
# Tools
@ -92,46 +84,58 @@ RUN <<EOF
# Update locale
localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8
# Update timezone
rm -f /etc/localtime \
&& ln -sf /usr/share/zoneinfo/Asia/Shanghai /etc/localtime \
&& echo "Asia/Shanghai" > /etc/timezone \
&& dpkg-reconfigure --frontend noninteractive tzdata
# Cleanup
rm -rf /var/tmp/* \
&& rm -rf /tmp/* \
&& rm -rf /var/cache/apt
EOF
## Upgrade GCC if needed
ENV LANG='en_US.UTF-8' \
LANGUAGE='en_US:en' \
LC_ALL='en_US.UTF-8'
## Install GCC
RUN <<EOF
# GCC
# Upgrade GCC if the Ubuntu version is lower than 21.04.
# NB(thxCode): Upgrade GCC if the Ubuntu version is lower than 21.04.
source /etc/os-release
if (( $(echo "${VERSION_ID} > 21.04" | bc -l) )); then
echo "Skipping GCC upgrade for ${VERSION_ID}..."
exit 0
if (( $(echo "${VERSION_ID} < 21.04" | bc -l) )); then
# Install
apt-get install -y --no-install-recommends \
gcc-11 g++-11 gfortran-11 gfortran
# Update alternatives
if [ -f /etc/alternatives/gcov-dump ]; then update-alternatives --remove-all gcov-dump; fi; update-alternatives --install /usr/bin/gcov-dump gcov-dump /usr/bin/gcov-dump-11 10
if [ -f /etc/alternatives/lto-dump ]; then update-alternatives --remove-all lto-dump; fi; update-alternatives --install /usr/bin/lto-dump lto-dump /usr/bin/lto-dump-11 10
if [ -f /etc/alternatives/gcov ]; then update-alternatives --remove-all gcov; fi; update-alternatives --install /usr/bin/gcov gcov /usr/bin/gcov-11 10
if [ -f /etc/alternatives/gcc ]; then update-alternatives --remove-all gcc; fi; update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 10
if [ -f /etc/alternatives/gcc-nm ]; then update-alternatives --remove-all gcc-nm; fi; update-alternatives --install /usr/bin/gcc-nm gcc-nm /usr/bin/gcc-nm-11 10
if [ -f /etc/alternatives/cpp ]; then update-alternatives --remove-all cpp; fi; update-alternatives --install /usr/bin/cpp cpp /usr/bin/cpp-11 10
if [ -f /etc/alternatives/g++ ]; then update-alternatives --remove-all g++; fi; update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 10
if [ -f /etc/alternatives/gcc-ar ]; then update-alternatives --remove-all gcc-ar; fi; update-alternatives --install /usr/bin/gcc-ar gcc-ar /usr/bin/gcc-ar-11 10
if [ -f /etc/alternatives/gcov-tool ]; then update-alternatives --remove-all gcov-tool; fi; update-alternatives --install /usr/bin/gcov-tool gcov-tool /usr/bin/gcov-tool-11 10
if [ -f /etc/alternatives/gcc-ranlib ]; then update-alternatives --remove-all gcc-ranlib; fi; update-alternatives --install /usr/bin/gcc-ranlib gcc-ranlib /usr/bin/gcc-ranlib-11 10
if [ -f /etc/alternatives/gfortran ]; then update-alternatives --remove-all gfortran; fi; update-alternatives --install /usr/bin/gfortran gfortran /usr/bin/gfortran-11 10
# Cleanup
rm -rf /var/tmp/* \
&& rm -rf /tmp/* \
&& rm -rf /var/cache/apt
fi
EOF
## Install CMake/Make/SCCache
RUN <<EOF
# CMake/Make/SCCache
# Install
apt-get install -y --no-install-recommends \
gcc-11 g++-11 gfortran-11 gfortran
# Update alternatives
if [[ -f /etc/alternatives/gcov-dump ]]; then update-alternatives --remove-all gcov-dump; fi; update-alternatives --install /usr/bin/gcov-dump gcov-dump /usr/bin/gcov-dump-11 10
if [[ -f /etc/alternatives/lto-dump ]]; then update-alternatives --remove-all lto-dump; fi; update-alternatives --install /usr/bin/lto-dump lto-dump /usr/bin/lto-dump-11 10
if [[ -f /etc/alternatives/gcov ]]; then update-alternatives --remove-all gcov; fi; update-alternatives --install /usr/bin/gcov gcov /usr/bin/gcov-11 10
if [[ -f /etc/alternatives/gcc ]]; then update-alternatives --remove-all gcc; fi; update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 10
if [[ -f /etc/alternatives/gcc-nm ]]; then update-alternatives --remove-all gcc-nm; fi; update-alternatives --install /usr/bin/gcc-nm gcc-nm /usr/bin/gcc-nm-11 10
if [[ -f /etc/alternatives/cpp ]]; then update-alternatives --remove-all cpp; fi; update-alternatives --install /usr/bin/cpp cpp /usr/bin/cpp-11 10
if [[ -f /etc/alternatives/g++ ]]; then update-alternatives --remove-all g++; fi; update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 10
if [[ -f /etc/alternatives/gcc-ar ]]; then update-alternatives --remove-all gcc-ar; fi; update-alternatives --install /usr/bin/gcc-ar gcc-ar /usr/bin/gcc-ar-11 10
if [[ -f /etc/alternatives/gcov-tool ]]; then update-alternatives --remove-all gcov-tool; fi; update-alternatives --install /usr/bin/gcov-tool gcov-tool /usr/bin/gcov-tool-11 10
if [[ -f /etc/alternatives/gcc-ranlib ]]; then update-alternatives --remove-all gcc-ranlib; fi; update-alternatives --install /usr/bin/gcc-ranlib gcc-ranlib /usr/bin/gcc-ranlib-11 10
if [[ -f /etc/alternatives/gfortran ]]; then update-alternatives --remove-all gfortran; fi; update-alternatives --install /usr/bin/gfortran gfortran /usr/bin/gfortran-11 10
pkg-config make
curl --retry 3 --retry-connrefused -fL "https://github.com/Kitware/CMake/releases/download/v3.22.1/cmake-3.22.1-linux-$(uname -m).tar.gz" | tar -zx -C /usr --strip-components 1
curl --retry 3 --retry-connrefused -fL "https://github.com/mozilla/sccache/releases/download/v0.10.0/sccache-v0.10.0-$(uname -m)-unknown-linux-musl.tar.gz" | tar -zx -C /usr/bin --strip-components 1
# Cleanup
rm -rf /var/tmp/* \
@ -139,27 +143,20 @@ RUN <<EOF
&& rm -rf /var/cache/apt
EOF
## Install C buildkit
## Install Compile Dependencies
RUN <<EOF
# C buildkit
# Dependencies
# Install
apt-get install -y --no-install-recommends \
make ninja-build pkg-config ccache
curl --retry 3 --retry-connrefused -fL "https://github.com/Kitware/CMake/releases/download/v3.31.7/cmake-3.31.7-linux-$(uname -m).tar.gz" | tar -zx -C /usr --strip-components 1
# Install dependencies
apt-get install -y --no-install-recommends \
perl-openssl-defaults perl yasm \
zlib1g zlib1g-dev libbz2-dev libffi-dev libgdbm-dev libgdbm-compat-dev \
openssl libssl-dev libsqlite3-dev lcov libomp-dev \
libblas-dev liblapack-dev libopenblas-dev libblas3 liblapack3 libhdf5-dev \
libxml2 libxslt1-dev libgl1-mesa-glx libgmpxx4ldbl \
libncurses5-dev libreadline6-dev libsqlite3-dev \
libncurses5-dev libreadline6-dev libsqlite3-dev libssl-dev \
liblzma-dev lzma lzma-dev tk-dev uuid-dev libmpdec-dev \
ffmpeg libjpeg-dev libpng-dev libtiff-dev libwebp-dev \
libnuma-dev libjemalloc-dev
libnuma-dev
# Cleanup
rm -rf /var/tmp/* \
@ -167,7 +164,7 @@ RUN <<EOF
&& rm -rf /var/cache/apt
EOF
## Upgrade Python if needed
## Install Python
ARG PYTHON_VERSION
@ -176,83 +173,52 @@ ENV PYTHON_VERSION=${PYTHON_VERSION}
RUN <<EOF
# Python
if (( $(echo "$(python3 --version | cut -d' ' -f2 | cut -d'.' -f1,2) == ${PYTHON_VERSION}" | bc -l) )); then
echo "Skipping Python upgrade for ${PYTHON_VERSION}..."
if [[ -z "$(ldconfig -v 2>/dev/null | grep libpython${PYTHON_VERSION})" ]]; then
PYTHON_LIB_PREFIX=$(python3 -c "import sys; print(sys.base_prefix);")
echo "${PYTHON_LIB_PREFIX}/lib" >> /etc/ld.so.conf.d/python3.conf
echo "${PYTHON_LIB_PREFIX}/lib64" >> /etc/ld.so.conf.d/python3.conf
fi
exit 0
fi
# Add deadsnakes PPA for Python versions
for i in 1 2 3; do
add-apt-repository -y ppa:deadsnakes/ppa && break || { echo "Attempt $i failed, retrying in 5s..."; sleep 5; }
done
apt-get update -y
# Install
apt-get install -y --no-install-recommends \
python${PYTHON_VERSION} \
python${PYTHON_VERSION}-dev \
python${PYTHON_VERSION}-venv \
python${PYTHON_VERSION}-distutils \
python${PYTHON_VERSION}-lib2to3 \
python${PYTHON_VERSION}-gdbm \
python${PYTHON_VERSION}-tk \
libibverbs-dev
# Update alternatives
if [[ -f /etc/alternatives/python3 ]]; then update-alternatives --remove-all python3; fi; update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1
if [[ -f /etc/alternatives/python ]]; then update-alternatives --remove-all python; fi; update-alternatives --install /usr/bin/python python /usr/bin/python${PYTHON_VERSION} 1
curl -sS "https://bootstrap.pypa.io/get-pip.py" | python${PYTHON_VERSION}
if [[ -f /etc/alternatives/2to3 ]]; then update-alternatives --remove-all 2to3; fi; update-alternatives --install /usr/bin/2to3 2to3 /usr/bin/2to3${PYTHON_VERSION} 1 || true
if [[ -f /etc/alternatives/pydoc3 ]]; then update-alternatives --remove-all pydoc3; fi; update-alternatives --install /usr/bin/pydoc3 pydoc3 /usr/bin/pydoc${PYTHON_VERSION} 1 || true
if [[ -f /etc/alternatives/idle3 ]]; then update-alternatives --remove-all idle3; fi; update-alternatives --install /usr/bin/idle3 idle3 /usr/bin/idle${PYTHON_VERSION} 1 || true
if [[ -f /etc/alternatives/python3-config ]]; then update-alternatives --remove-all python3-config; fi; update-alternatives --install /usr/bin/python3-config python3-config /usr/bin/python${PYTHON_VERSION}-config 1 || true
# Cleanup
rm -rf /var/tmp/* \
&& rm -rf /tmp/* \
&& rm -rf /var/cache/apt \
&& pip cache purge
EOF
## Install Python buildkit
ENV PIP_NO_CACHE_DIR=1 \
PIP_DISABLE_PIP_VERSION_CHECK=1 \
PIP_ROOT_USER_ACTION=ignore
RUN <<EOF
# Buildkit
# Download
PYTHON_INSTALL_DIR="/tmp/Python-${PYTHON_VERSION}"
mkdir -p ${PYTHON_INSTALL_DIR}
PYTHON_LATEST_VERSION=$(curl -s https://repo.huaweicloud.com/python/ | grep -oE "${PYTHON_VERSION}\.[0-9]+" | sort -V | tail -n 1)
curl -H 'Referer: https://repo.huaweicloud.com/' --retry 3 --retry-connrefused -fL "https://repo.huaweicloud.com/python/${PYTHON_LATEST_VERSION}/Python-${PYTHON_LATEST_VERSION}.tgz" | tar -zx -C ${PYTHON_INSTALL_DIR} --strip-components 1
# Build
pushd ${PYTHON_INSTALL_DIR}
./configure \
--prefix=/usr \
--enable-optimizations \
--enable-shared \
--enable-ipv6 \
--enable-loadable-sqlite-extensions \
--with-lto=full \
--with-ensurepip=install \
--with-computed-gotos
make -j$(nproc) && make altinstall
popd
# Link
ln -vsf /usr/bin/python${PYTHON_VERSION} /usr/bin/python3
ln -vsf /usr/bin/python${PYTHON_VERSION} /usr/bin/python
ln -vsf /usr/bin/pip${PYTHON_VERSION} /usr/bin/pip3
ln -vsf /usr/bin/pip${PYTHON_VERSION} /usr/bin/pip
ln -vsf /usr/bin/2to3-${PYTHON_VERSION} /usr/bin/2to3
ln -vsf /usr/bin/pydoc${PYTHON_VERSION} /usr/bin/pydoc3
ln -vsf /usr/bin/idle${PYTHON_VERSION} /usr/bin/idle3
# Install packages
cat <<EOT >/tmp/requirements.txt
build
cmake<4
ninja<1.11
setuptools<80
setuptools-scm
packaging<25
wheel
pybind11
Cython
psutil==7.0.0
setuptools==80.7.1
pipx==1.7.1
EOT
pip install -r /tmp/requirements.txt
pip install --disable-pip-version-check --no-cache-dir --root-user-action ignore -r /tmp/requirements.txt
# Cleanup
rm -rf /var/tmp/* \
&& rm -rf /tmp/*
&& rm -rf /tmp/* \
&& rm -rf /var/cache/apt \
&& pip cache purge
EOF
## Preset this to simplify configuration.
ENV LOCAL_VENVS=/opt/venvs \
USE_EMOJI="false"
## Install CANN buildkit if needed
## Preset this to simplify configuration,
## it is the output of $(pipx environment --value PIPX_LOCAL_VENVS).
ENV PIPX_LOCAL_VENVS=/root/.local/share/pipx/venvs
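Since this preset must stay in sync with what pipx reports, a small sanity check along these lines could be run inside the image (illustrative only; `pipx environment --value` is the same query referenced elsewhere in this Dockerfile):

```bash
# Warn if the preset ever drifts from the location pipx actually uses.
test "$(pipx environment --value PIPX_LOCAL_VENVS)" = "${PIPX_LOCAL_VENVS}" \
  || echo "WARNING: PIPX_LOCAL_VENVS preset differs from pipx's reported path" >&2
```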
ARG CANN_VERSION
ARG CANN_CHIP
@ -261,6 +227,8 @@ ENV CANN_VERSION=${CANN_VERSION} \
CANN_CHIP=${CANN_CHIP} \
CANN_HOME="/usr/local/Ascend"
## Install CANN Toolkit
RUN <<EOF
# CANN Toolkit
@ -270,11 +238,6 @@ RUN <<EOF
URL_PREFIX="https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%20${DOWNLOAD_VERSION}"
URL_SUFFIX="response-content-type=application/octet-stream"
if [[ -d "${CANN_HOME}/ascend-toolkit/${DOWNLOAD_VERSION}" ]]; then
echo "Skipping CANN Toolkit upgrade for ${DOWNLOAD_VERSION}..."
exit 0
fi
# Install dependencies
cat <<EOT >/tmp/requirements.txt
attrs==24.3.0
@ -290,7 +253,7 @@ scipy==1.15.3
requests==2.32.3
absl-py==2.2.2
EOT
pip install -r /tmp/requirements.txt
pip install --disable-pip-version-check --no-cache-dir --root-user-action ignore -r /tmp/requirements.txt
# Install toolkit
TOOLKIT_FILE="Ascend-cann-toolkit_${DOWNLOAD_VERSION}_${OS}-${ARCH}.run"
@ -305,9 +268,12 @@ EOT
&& rm -rf /tmp/* \
&& rm -rf /var/cache/apt \
&& rm -rf /var/log/ascend \
&& rm -rf /var/log/ascend_seclog
&& rm -rf /var/log/ascend_seclog \
&& pip cache purge
EOF
## Install CANN Kernels
RUN <<EOF
# CANN Kernels
@ -317,11 +283,6 @@ RUN <<EOF
URL_PREFIX="https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%20${DOWNLOAD_VERSION}"
URL_SUFFIX="response-content-type=application/octet-stream"
if [[ -d "${CANN_HOME}/ascend-toolkit/${DOWNLOAD_VERSION}" ]]; then
echo "Skipping CANN Kernels upgrade for ${DOWNLOAD_VERSION}..."
exit 0
fi
# Prepare environment
source ${CANN_HOME}/ascend-toolkit/set_env.sh
@ -342,9 +303,12 @@ RUN <<EOF
&& rm -rf /tmp/* \
&& rm -rf /var/cache/apt \
&& rm -rf /var/log/ascend \
&& rm -rf /var/log/ascend_seclog
&& rm -rf /var/log/ascend_seclog \
&& pip cache purge
EOF
## Install NNAL
RUN <<EOF
# CANN NNAL
@ -354,11 +318,6 @@ RUN <<EOF
URL_PREFIX="https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%20${DOWNLOAD_VERSION}"
URL_SUFFIX="response-content-type=application/octet-stream"
if [[ -d "${CANN_HOME}/ascend-toolkit/${DOWNLOAD_VERSION}" ]] && [[ -d "${CANN_HOME}/nnal" ]]; then
echo "Skipping CANN NNAL upgrade for ${DOWNLOAD_VERSION}..."
exit 0
fi
# Prepare environment
source ${CANN_HOME}/ascend-toolkit/set_env.sh
@ -375,32 +334,26 @@ RUN <<EOF
&& rm -rf /tmp/* \
&& rm -rf /var/cache/apt \
&& rm -rf /var/log/ascend_seclog \
&& rm -rf /var/log/cann_atb_log
&& rm -rf /var/log/cann_atb_log \
&& pip cache purge
EOF
#
# Stage MindIE Install
#
# Example build command:
# docker build --tag=gpustack/gpustack:npu-mindie-install --file=pack/Dockerfile.npu --target mindie-install --progress=plain .
# docker build --tag=gpustack/gpustack:npu-mindie-install --file=Dockerfile.npu --target mindie-install --progress=plain .
#
FROM base AS mindie-install
SHELL ["/bin/bash", "-eo", "pipefail", "-c"]
ARG TARGETPLATFORM
ARG TARGETOS
ARG TARGETARCH
## Install MindIE
ARG MINDIE_VERSION
ARG MINDIE_TORCH_VERSION
ENV MINDIE_VERSION=${MINDIE_VERSION} \
MINDIE_TORCH_VERSION=${MINDIE_TORCH_VERSION}
ENV MINDIE_VERSION=${MINDIE_VERSION}
ADD --chown=root:root pack/mindie-atb-models_${MINDIE_VERSION}_${TARGETOS}-${TARGETARCH}_py${PYTHON_VERSION}_torch${MINDIE_TORCH_VERSION}-abi0.tar.gz ${CANN_HOME}/atb-models/
COPY --from=thxcode/mindie:2.0.T17-800I-A2-py311-openeuler24.03-lts --chown=root:root ${CANN_HOME}/atb-models ${CANN_HOME}/atb-models
RUN <<EOF
# MindIE
@ -413,109 +366,75 @@ RUN <<EOF
# Install Torch, Torch-npu, TorchVision,
# according to Ascend Extension Installation, have the mapping requirements for the CANN_VERSION,
# please check https://www.hiascend.com/developer/download/community/result?module=ie%2Bpt%2Bcann for details.
cat <<EOT >/tmp/requirements.txt
torch==${MINDIE_TORCH_VERSION}
torchvision
torchaudio
EOT
if [[ "${TARGETARCH}" == "amd64" ]]; then
pip install --extra-index-url https://download.pytorch.org/whl/cpu/ \
-r /tmp/requirements.txt
if [ ${ARCH} == "x86_64" ]; then
pip install --disable-pip-version-check --no-cache-dir --root-user-action ignore torch==2.1.0+cpu --index-url https://download.pytorch.org/whl/cpu
else
pip install \
-r /tmp/requirements.txt
pip install --disable-pip-version-check --no-cache-dir --root-user-action ignore torch==2.1.0
fi
pip install torch-npu==${MINDIE_TORCH_VERSION}.*
pip install --disable-pip-version-check --no-cache-dir --root-user-action ignore torch-npu==2.1.0.post12 torchvision==0.16.0
# Install dependencies.
cat <<EOT >/tmp/requirements.txt
av==14.3.0
absl-py==2.2.2
accelerate==0.34.2
aiohappyeyeballs==2.6.1
aiohttp==3.11.18
aiosignal==1.3.2
attrs==24.3.0
av==14.3.0
certifi==2024.8.30
cloudpickle==3.0.0
cpm-kernels==1.0.11
decorator==5.2.1
easydict==1.13
einops==0.8.1
et-xmlfile==1.1.0
easydict==1.13
frozenlist==1.6.0
fuzzywuzzy==0.18.0
gevent==24.2.1
geventhttpclient==2.3.1
greenlet==3.2.1
grpcio==1.71.0
icetk==0.0.4
idna==2.8
jieba==0.42.1
Jinja2==3.1.6
jsonlines==4.0.0
jsonschema-specifications==2025.4.1
jsonschema==4.23.0
latex2mathml==3.77.0
jsonschema-specifications==2025.4.1
Jinja2==3.1.6
loguru==0.7.2
Markdown==3.7
matplotlib==3.9.2
mdtex2html==1.3.0
ml_dtypes==0.5.0
multidict==6.4.3
nltk==3.9.1
numba==0.61.2
numpy==1.26.4
onnx==1.17.0
openpyxl==3.1.5
pandas==2.2.3
pillow==11.2.1
prettytable==3.11.0
propcache==0.3.1
psutil==7.0.0
pyarrow==19.0.1
pydantic_core==2.23.4
pydantic==2.9.2
pydantic_core==2.23.4
python-rapidjson==1.20
requests==2.32.3
rouge-score==0.1.2
rouge==1.0.1
sacrebleu==2.4.3
scipy==1.15.3
text-generation==0.7.0
thefuzz==0.22.1
tiktoken==0.7.0
tornado==6.4.2
tqdm==4.67.1
transformers==4.52.3
safetensors==0.5.3
tritonclient==2.49.0
tiktoken==0.7.0
typing_extensions==4.13.2
tzdata==2024.2
tqdm==4.67.1
thefuzz==0.22.1
urllib3==2.4.0
yarl==1.20.0
zope.event==5.0
zope.interface==7.0.3
EOT
pip install -r /tmp/requirements.txt
pip install --disable-pip-version-check --no-cache-dir --root-user-action ignore -r /tmp/requirements.txt
# Install MindIE ATB models
pip install ${CANN_HOME}/atb-models/*.whl
pip install --disable-pip-version-check --no-cache-dir --root-user-action ignore ${CANN_HOME}/atb-models/*.whl
# Pre process
# - Create virtual environment to place MindIE
python -m venv --system-site-packages ${LOCAL_VENVS}/mindie
python -m venv --system-site-packages ${PIPX_LOCAL_VENVS}/mindie
# - Prepare environment
source ${CANN_HOME}/ascend-toolkit/set_env.sh
source ${CANN_HOME}/nnal/atb/set_env.sh
source ${LOCAL_VENVS}/mindie/bin/activate
source ${PIPX_LOCAL_VENVS}/mindie/bin/activate
# Install MindIE
MINDIE_FILE="Ascend-mindie_${DOWNLOAD_VERSION}_${OS}-${ARCH}.run"
IFS="." read -r MINDIE_MAJOR MINDIE_MINOR MINDIE_PATCH <<< "${MINDIE_VERSION}"
if (( $(echo "${MINDIE_MAJOR}.${MINDIE_MINOR} >= 2.1" | bc -l) )); then
MINDIE_FILE="Ascend-mindie_${DOWNLOAD_VERSION}_${OS}-${ARCH}_abi0.run"
fi
MINDIE_PATH="/tmp/${MINDIE_FILE}"
MINDIE_URL="${URL_PREFIX}/${MINDIE_FILE}?${URL_SUFFIX}"
curl -H 'Referer: https://www.hiascend.com/' --retry 3 --retry-connrefused -fL -o "${MINDIE_PATH}" "${MINDIE_URL}"
@ -529,12 +448,13 @@ EOT
cat <<EOT >>"${CANN_HOME}/mindie/${DOWNLOAD_VERSION}/mindie-service/set_env.sh"
# NB(thxCode): This is a workaround for GPUStack to activate MindIE.
source ${LOCAL_VENVS}/mindie/bin/activate || true
source ${PIPX_LOCAL_VENVS}/mindie/bin/activate || true
EOT
chmod -w "${CANN_HOME}/mindie/${DOWNLOAD_VERSION}/mindie-service/set_env.sh"
deactivate
# Review
pip freeze
pipx runpip mindie freeze
# Cleanup
rm -rf /var/tmp/* \
@ -543,32 +463,28 @@ EOT
&& rm -rf /var/log/ascend_seclog \
&& rm -rf /var/log/cann_atb_log \
&& rm -rf /var/log/mindie_log \
&& rm -rf ~/log
&& rm -rf ~/log \
&& pip cache purge
EOF
#
# Stage vLLM Install
#
# Example build command:
# docker build --tag=gpustack/gpustack:npu-vllm-install --file=pack/Dockerfile.npu --target vllm-install --progress=plain .
# docker build --tag=gpustack/gpustack:npu-vllm-install --file=Dockerfile.npu --target vllm-install --progress=plain .
#
FROM base AS vllm-install
SHELL ["/bin/bash", "-eo", "pipefail", "-c"]
ARG TARGETPLATFORM
ARG TARGETOS
ARG TARGETARCH
## Install vLLM (Ascend)
## Install vLLM Ascend
ARG VLLM_VERSION
ARG VLLM_TORCH_VERSION
ARG VLLM_ASCEND_VERSION
ARG MINDIE_VERSION
ENV VLLM_VERSION=${VLLM_VERSION} \
VLLM_TORCH_VERSION=${VLLM_TORCH_VERSION} \
VLLM_ASCEND_VERSION=${VLLM_ASCEND_VERSION}
VLLM_ASCEND_VERSION=${VLLM_ASCEND_VERSION} \
MINDIE_VERSION=${MINDIE_VERSION}
RUN <<EOF
# vLLM
@ -581,11 +497,21 @@ RUN <<EOF
# Pre process
# - Create virtual environment to place vLLM
python -m venv --system-site-packages ${LOCAL_VENVS}/vllm
python -m venv --system-site-packages ${PIPX_LOCAL_VENVS}/vllm
# - Prepare environment
source ${CANN_HOME}/ascend-toolkit/set_env.sh
source ${CANN_HOME}/nnal/atb/set_env.sh
source ${LOCAL_VENVS}/vllm/bin/activate
source ${PIPX_LOCAL_VENVS}/vllm/bin/activate
# Install Torch, Torch-npu, TorchVision,
# according to Ascend Extension Installation, have the mapping requirements for the CANN_VERSION,
# please check https://www.hiascend.com/developer/download/community/result?module=ie%2Bpt%2Bcann for details.
if [ ${ARCH} == "x86_64" ]; then
pip install --disable-pip-version-check --no-cache-dir --root-user-action ignore torch==2.5.1+cpu --index-url https://download.pytorch.org/whl/cpu
else
pip install --disable-pip-version-check --no-cache-dir --root-user-action ignore torch==2.5.1
fi
pip install --disable-pip-version-check --no-cache-dir --root-user-action ignore torch-npu==2.5.1 torchvision==0.20.1
# Install dependencies.
cat <<EOT >/tmp/requirements.txt
@ -599,86 +525,54 @@ rouge_score==0.1.2
pybind11==2.13.6
pytest==8.4.0
cloudpickle==3.0.0
ray[default]==2.47.1
protobuf>3.20.0
grpcio==1.71.0
ray[client]==2.43.0
EOT
pip install -r /tmp/requirements.txt
# Install vLLM
if [[ "${TARGETARCH}" == "amd64" ]]; then
pip install --extra-index-url https://download.pytorch.org/whl/cpu/ \
vllm==${VLLM_VERSION}
else
pip install \
vllm==${VLLM_VERSION}
fi
# Fix conflicting packages.
# - In x86, triton will be installed, triton doesn't work correctly in Ascend, we need to uninstall it.
pip uninstall -y triton || true
# Fix conflicting packages.
# - In Ascend, opencv-python-headless requires numpy>2.0.0, which is conflicting with Ascend, we need to reinstall it.
pip uninstall -y numpy || true
pip install --disable-pip-version-check --no-cache-dir --root-user-action ignore -r /tmp/requirements.txt
# Install Torch, Torch-npu, TorchVision,
# according to Ascend Extension Installation, have the mapping requirements for the CANN_VERSION,
# please check https://www.hiascend.com/developer/download/community/result?module=ie%2Bpt%2Bcann for details.
# Install vLLM & vLLM-Ascend
cat <<EOT >/tmp/requirements.txt
torch==${VLLM_TORCH_VERSION}
torchvision
torchaudio
vllm==${VLLM_VERSION}
vllm-ascend==${VLLM_ASCEND_VERSION}
EOT
if [[ "${TARGETARCH}" == "amd64" ]]; then
pip install --extra-index-url https://download.pytorch.org/whl/cpu/ \
-r /tmp/requirements.txt
if [ ${ARCH} == "x86_64" ]; then
pip install --disable-pip-version-check --no-cache-dir --root-user-action ignore -r /tmp/requirements.txt --extra-index-url https://download.pytorch.org/whl/cpu
else
pip install \
-r /tmp/requirements.txt
pip install --disable-pip-version-check --no-cache-dir --root-user-action ignore -r /tmp/requirements.txt
fi
pip install torch-npu==${VLLM_TORCH_VERSION}.* --pre --extra-index-url https://mirrors.huaweicloud.com/ascend/repos/pypi
# Install vLLM Ascend
pip install vllm-ascend==${VLLM_ASCEND_VERSION} --pre --extra-index-url https://mirrors.huaweicloud.com/ascend/repos/pypi
# Fix SoC version.
if [[ "${CANN_CHIP}" == "310p" ]]; then
sed -i "s/^__soc_version__.*/__soc_version__ = 'ASCEND310P3'/" ${LOCAL_VENVS}/vllm/lib/python3.11/site-packages/vllm_ascend/_build_info.py
fi
# Install MindIE Turbo
MINDIE_TURBO_FILE="Ascend-mindie-turbo_${DOWNLOAD_VERSION}_py${PYTHON_VERSION//./}_${OS}_${ARCH}.tar.gz"
MINDIE_TURBO_URL="${URL_PREFIX}/${MINDIE_TURBO_FILE}?${URL_SUFFIX}"
curl -H 'Referer: https://www.hiascend.com/' --retry 3 --retry-connrefused -fL "${MINDIE_TURBO_URL}" | tar -zx -C /tmp --strip-components 1
WHEEL_PACKAGE="$(ls /tmp/Ascend-mindie-turbo_${DOWNLOAD_VERSION}_py${PYTHON_VERSION//./}_${OS}_${ARCH}/*.whl)"
pip install --disable-pip-version-check --no-cache-dir --root-user-action ignore ${WHEEL_PACKAGE}
# Post process
deactivate
# Review
pip freeze
pipx runpip vllm freeze
# Cleanup
rm -rf /var/tmp/* \
&& rm -rf /tmp/* \
&& rm -rf /var/cache/apt \
&& rm -rf ~/log
&& rm -rf ~/log \
&& pip cache purge
EOF
#
# Stage GPUStack
#
# Example build command:
# docker build --tag=gpustack/gpustack:npu --file=pack/Dockerfile.npu --progress=plain .
# docker build --tag=gpustack/gpustack:npu --file=Dockerfile.npu --progress=plain .
#
FROM mindie-install AS gpustack
SHELL ["/bin/bash", "-eo", "pipefail", "-c"]
ARG TARGETPLATFORM
ARG TARGETOS
ARG TARGETARCH
## Copy vLLM from vllm-install stage
COPY --from=vllm-install ${LOCAL_VENVS}/vllm ${LOCAL_VENVS}/vllm
RUN --mount=type=bind,target=/workspace/gpustack,rw <<EOF
# Patch vLLM
for dir in lib lib64; do
if [ -d "${LOCAL_VENVS}/vllm/${dir}/python${PYTHON_VERSION}/site-packages/vllm" ]; then
cp /workspace/gpustack/gpustack/_sitecustomize.py ${LOCAL_VENVS}/vllm/${dir}/python${PYTHON_VERSION}/site-packages/sitecustomize.py
fi
done
EOF
COPY --from=vllm-install ${PIPX_LOCAL_VENVS}/vllm ${PIPX_LOCAL_VENVS}/vllm
## Install GPUStack
@ -686,66 +580,57 @@ RUN --mount=type=bind,target=/workspace/gpustack,rw <<EOF
# GPUStack
# Build GPUStack
export PATH="$(pipx environment --value PIPX_BIN_DIR):${PATH}"
export PATH="${HOME}/.local/bin:${PATH}"
cd /workspace/gpustack \
&& git config --global --add safe.directory /workspace/gpustack \
&& make build
# Pre process
# - Create virtual environment to place gpustack
python -m venv --system-site-packages ${LOCAL_VENVS}/gpustack
python -m venv --system-site-packages ${PIPX_LOCAL_VENVS}/gpustack
# - Prepare environment
source ${LOCAL_VENVS}/gpustack/bin/activate
source ${PIPX_LOCAL_VENVS}/gpustack/bin/activate
# Install GPUStack,
# vox-box relies on PyTorch 2.7, which is not compatible with MindIE.
WHEEL_PACKAGE="$(ls /workspace/gpustack/dist/*.whl)"
pip install ${WHEEL_PACKAGE} \
&& ln -vsf ${LOCAL_VENVS}/gpustack/bin/gpustack /usr/local/bin/gpustack
pip install --disable-pip-version-check --no-cache-dir --root-user-action ignore ${WHEEL_PACKAGE} \
&& ln -vsf ${PIPX_LOCAL_VENVS}/gpustack/bin/gpustack /usr/local/bin/gpustack
# Download tools
gpustack download-tools --device npu
tree -hs "$(pip show gpustack | grep Location: | head -n 1 | cut -d" " -f 2)/gpustack/third_party"
# Activate MindIE
# MindIE is composed of many components and conflicts with vLLM,
# so we need to activate MindIE manually in GPUStack.
# Activate vLLM
ln -vsf ${LOCAL_VENVS}/vllm/bin/vllm ${LOCAL_VENVS}/gpustack/bin/vllm
ln -vsf ${PIPX_LOCAL_VENVS}/vllm/bin/vllm ${PIPX_LOCAL_VENVS}/gpustack/bin/vllm
# - Redirect RAY.
rm -rf ${LOCAL_VENVS}/gpustack/bin/ray \
&& ln -vsf ${LOCAL_VENVS}/vllm/bin/ray ${LOCAL_VENVS}/gpustack/bin/ray
rm -rf ${PIPX_LOCAL_VENVS}/gpustack/bin/ray \
&& ln -vsf ${PIPX_LOCAL_VENVS}/vllm/bin/ray ${PIPX_LOCAL_VENVS}/gpustack/bin/ray
# Set up environment
mkdir -p /var/lib/gpustack \
&& chmod -R 0755 /var/lib/gpustack
# Post process
deactivate
# Review
pip freeze
pipx runpip gpustack freeze
# Cleanup
rm -rf /var/tmp/* \
&& rm -rf /tmp/* \
&& rm -rf /var/cache/apt \
&& rm -rf /workspace/gpustack/dist
&& rm -rf /workspace/gpustack/dist \
&& pip cache purge
EOF
## Setup environment
RUN <<EOF
# Export Python lib
PYTHON_LIB_PREFIX=$(python3 -c "import sys; print(sys.base_prefix);")
EXPORT_PYTHON_LIB="export LD_LIBRARY_PATH=${PYTHON_LIB_PREFIX}/lib:${PYTHON_LIB_PREFIX}/lib64:\${LD_LIBRARY_PATH}"
echo "${EXPORT_PYTHON_LIB}" >> /etc/profile
echo "${EXPORT_PYTHON_LIB}" >> ~/.bashrc
# Export CANN lib
CANN_LIB_PREFIX="${CANN_HOME}/ascend-toolkit/$(echo ${CANN_VERSION%\.beta1} | tr '[:lower:]' '[:upper:]')/$(uname -m)-linux"
EXPORT_CANN_LIB="export LD_LIBRARY_PATH=${CANN_LIB_PREFIX}/lib:${CANN_LIB_PREFIX}/lib64:\${LD_LIBRARY_PATH}"
echo "${EXPORT_CANN_LIB}" >> /etc/profile
echo "${EXPORT_CANN_LIB}" >> ~/.bashrc
# Export CANN driver lib
EXPORT_DRIVER_LIB="export LD_LIBRARY_PATH=${CANN_HOME}/driver/lib64/common:${CANN_HOME}/driver/lib64/driver:\${LD_LIBRARY_PATH}"
echo "${EXPORT_DRIVER_LIB}" >> /etc/profile
@ -777,8 +662,4 @@ RUN <<EOF
# NB(thxCode): Any tuning environment variables should NOT be set here.
EOF
# Persist pipx venvs in the data directory
ENV PIPX_HOME=/var/lib/gpustack/pipx \
PIPX_BIN_DIR=/var/lib/gpustack/bin
ENTRYPOINT [ "tini", "--", "/usr/bin/bash", "-c", "source /etc/profile && exec gpustack start \"$@\"", "--" ]

@ -10,11 +10,10 @@ RUN apt-get update && apt-get install -y \
tzdata \
iproute2 \
build-essential \
git git-lfs \
tini \
&& apt-get clean && rm -rf /var/lib/apt/lists/*
COPY .. /workspace/gpustack
COPY . /workspace/gpustack
RUN cd /workspace/gpustack && make build
# Install GPUStack

@ -10,7 +10,7 @@
<img alt="Documentation" src="https://img.shields.io/badge/Docs-GPUStack-blue?logo=readthedocs&logoColor=white"></a>
<a href="./LICENSE" target="_blank">
<img alt="License" src="https://img.shields.io/github/license/gpustack/gpustack?logo=github&logoColor=white&label=License&color=blue"></a>
<a href="./docs/assets/wechat-group-qrcode.jpg" target="_blank">
<a href="./docs/assets/wechat-assistant.png" target="_blank">
<img alt="WeChat" src="https://img.shields.io/badge/微信群-GPUStack-blue?logo=wechat&logoColor=white"></a>
<a href="https://discord.gg/VXYJzuaqwD" target="_blank">
<img alt="Discord" src="https://img.shields.io/badge/Discord-GPUStack-blue?logo=discord&logoColor=white"></a>
@ -35,7 +35,7 @@ GPUStack is an open-source GPU cluster manager for running AI models.
- **Broad GPU Compatibility:** Seamlessly supports GPUs from various vendors across Apple Macs, Windows PCs, and Linux servers.
- **Extensive Model Support:** Supports a wide range of models including LLMs, VLMs, image models, audio models, embedding models, and rerank models.
- **Flexible Inference Backends:** Flexibly integrates with multiple inference backends including vLLM, Ascend MindIE, llama-box (llama.cpp & stable-diffusion.cpp) and vox-box.
- **Flexible Inference Backends:** Flexibly integrates with multiple inference backends including llama-box (llama.cpp & stable-diffusion.cpp), vox-box, vLLM and Ascend MindIE.
- **Multi-Version Backend Support:** Run multiple versions of inference backends concurrently to meet the diverse runtime requirements of different models.
- **Distributed Inference:** Supports single-node and multi-node multi-GPU inference, including heterogeneous GPUs across vendors and runtime environments.
- **Scalable GPU Architecture:** Easily scale up by adding more GPUs or nodes to your infrastructure.
@ -50,71 +50,83 @@ GPUStack is an open-source GPU cluster manager for running AI models.
## Installation
### Linux
### Linux or macOS
If you are using NVIDIA GPUs, ensure [Docker](https://docs.docker.com/engine/install/) and [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) are installed on your system. Then, run the following command to start the GPUStack server.
GPUStack provides a script to install it as a service on systemd or launchd based systems with default port 80. To install GPUStack using this method, just run:
```bash
docker run -d --name gpustack \
--restart=unless-stopped \
--gpus all \
--network=host \
--ipc=host \
-v gpustack-data:/var/lib/gpustack \
gpustack/gpustack
curl -sfL https://get.gpustack.ai | sh -s -
```
For more details on the installation or other GPU hardware platforms, please refer to the [Installation Documentation](docs/installation/installation-requirements.md).
### Windows
After the server starts, run the following command to get the default admin password:
Run PowerShell as administrator (**avoid** using PowerShell ISE), then run the following command to install GPUStack:
```bash
docker exec gpustack cat /var/lib/gpustack/initial_admin_password
```powershell
Invoke-Expression (Invoke-WebRequest -Uri "https://get.gpustack.ai" -UseBasicParsing).Content
```
Open your browser and navigate to `http://your_host_ip` to access the GPUStack UI. Use the default username `admin` and the password you retrieved above to log in.
### Other Installation Methods
For manual installation, docker installation or detailed configuration options, please refer to the [Installation Documentation](https://docs.gpustack.ai/latest/installation/installation-script/).
## Getting Started
1. Run and chat with the **llama3.2** model:
### macOS & Windows
```bash
gpustack chat llama3.2 "tell me a joke."
```
A desktop installer is available for macOS and Windows — see the [documentation](https://docs.gpustack.ai/latest/installation/desktop-installer/) for installation details.
2. Run and generate an image with the **stable-diffusion-v3-5-large-turbo** model:
## Deploy a Model
> ### 💡 Tip
>
> This command downloads the model (~12GB) from Hugging Face. The download time depends on your network speed. Ensure you have enough disk space and VRAM (12GB) to run the model. If you encounter issues, you can skip this step and move to the next one.
```bash
gpustack draw hf.co/gpustack/stable-diffusion-v3-5-large-turbo-GGUF:stable-diffusion-v3-5-large-turbo-Q4_0.gguf \
"A minion holding a sign that says 'GPUStack'. The background is filled with futuristic elements like neon lights, circuit boards, and holographic displays. The minion is wearing a tech-themed outfit, possibly with LED lights or digital patterns. The sign itself has a sleek, modern design with glowing edges. The overall atmosphere is high-tech and vibrant, with a mix of dark and neon colors." \
--sample-steps 5 --show
```
1. Navigate to the `Catalog` page in the GPUStack UI.
Once the command completes, the generated image will appear in the default viewer. You can experiment with the prompt and CLI options to customize the output.
2. Select the `Qwen3` model from the list of available models.
![Generated Image](https://raw.githubusercontent.com/gpustack/gpustack/main/docs/assets/quickstart-minion.png)
3. After the deployment compatibility checks pass, click the `Save` button to deploy the model.
3. Open `http://your_host_ip` in the browser to access the GPUStack UI. Log in to GPUStack with username `admin` and the default password. You can run the following command to get the password for the default setup:
![deploy qwen3 from catalog](docs/assets/quick-start/quick-start-qwen3.png)
**Linux or macOS**
4. GPUStack will start downloading the model files and deploying the model. When the deployment status shows `Running`, the model has been deployed successfully.
```bash
cat /var/lib/gpustack/initial_admin_password
```
![model is running](docs/assets/quick-start/model-running.png)
**Windows**
5. Click `Playground - Chat` in the navigation menu, check that the model `qwen3` is selected from the top-right `Model` dropdown. Now you can chat with the model in the UI playground.
```powershell
Get-Content -Path "$env:APPDATA\gpustack\initial_admin_password" -Raw
```
![quick chat](docs/assets/quick-start/quick-chat.png)
4. Click `Playground - Chat` in the navigation menu. Now you can chat with the LLM in the UI playground.
## Use the model via API
![Playground Screenshot](https://raw.githubusercontent.com/gpustack/gpustack/main/docs/assets/playground-screenshot.png)
1. Hover over the user avatar and navigate to the `API Keys` page, then click the `New API Key` button.
5. Click `API Keys` in the navigation menu, then click the `New API Key` button.
2. Fill in the `Name` and click the `Save` button.
6. Fill in the `Name` and click the `Save` button.
3. Copy the generated API key and save it somewhere safe. Please note that you can only see it once on creation.
7. Copy the generated API key and save it somewhere safe. Please note that you can only see it once on creation.
4. You can now use the API key to access the OpenAI-compatible API endpoints provided by GPUStack. For example, use curl as the following:
8. Now you can use the API key to access the OpenAI-compatible API. For example, use curl as the following:
```bash
# Replace `your_api_key` and `your_gpustack_server_url`
# with your actual API key and GPUStack server URL.
export GPUSTACK_API_KEY=your_api_key
curl http://your_gpustack_server_url/v1/chat/completions \
curl http://your_gpustack_server_url/v1-openai/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer $GPUSTACK_API_KEY" \
-d '{
"model": "qwen3",
"model": "llama3.2",
"messages": [
{
"role": "system",
@ -122,7 +134,7 @@ curl http://your_gpustack_server_url/v1/chat/completions \
},
{
"role": "user",
"content": "Tell me a joke."
"content": "Hello!"
}
],
"stream": true
@ -131,8 +143,8 @@ curl http://your_gpustack_server_url/v1/chat/completions \
## Supported Platforms
- [x] Linux
- [x] macOS
- [x] Linux
- [x] Windows
## Supported Accelerators
@ -144,11 +156,15 @@ curl http://your_gpustack_server_url/v1/chat/completions \
- [x] Hygon DTK
- [x] Moore Threads MUSA
- [x] Iluvatar Corex
- [x] Cambricon MLU
We plan to support the following accelerators in future releases.
- [ ] Intel oneAPI
- [ ] Qualcomm AI Engine
## Supported Models
GPUStack uses [vLLM](https://github.com/vllm-project/vllm), [Ascend MindIE](https://www.hiascend.com/en/software/mindie), [llama-box](https://github.com/gpustack/llama-box) (bundled [llama.cpp](https://github.com/ggml-org/llama.cpp) and [stable-diffusion.cpp](https://github.com/leejet/stable-diffusion.cpp) server) and [vox-box](https://github.com/gpustack/vox-box) as the backends and supports a wide range of models. Models from the following sources are supported:
GPUStack uses [llama-box](https://github.com/gpustack/llama-box) (bundled [llama.cpp](https://github.com/ggml-org/llama.cpp) and [stable-diffusion.cpp](https://github.com/leejet/stable-diffusion.cpp) server), [vLLM](https://github.com/vllm-project/vllm), [Ascend MindIE](https://www.hiascend.com/en/software/mindie) and [vox-box](https://github.com/gpustack/vox-box) as the backends and supports a wide range of models. Models from the following sources are supported:
1. [Hugging Face](https://huggingface.co/)
@ -156,16 +172,16 @@ GPUStack uses [vLLM](https://github.com/vllm-project/vllm), [Ascend MindIE](http
3. Local File Path
### Example Models
### Example Models:
| **Category** | **Models** |
| -------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| **Large Language Models(LLMs)** | [Qwen](https://huggingface.co/models?search=Qwen/Qwen), [LLaMA](https://huggingface.co/meta-llama), [Mistral](https://huggingface.co/mistralai), [DeepSeek](https://huggingface.co/models?search=deepseek-ai/deepseek), [Phi](https://huggingface.co/models?search=microsoft/phi), [Gemma](https://huggingface.co/models?search=Google/gemma) |
| **Vision Language Models(VLMs)** | [Llama3.2-Vision](https://huggingface.co/models?pipeline_tag=image-text-to-text&search=llama3.2), [Pixtral](https://huggingface.co/models?search=pixtral) , [Qwen2.5-VL](https://huggingface.co/models?search=Qwen/Qwen2.5-VL), [LLaVA](https://huggingface.co/models?search=llava), [InternVL3](https://huggingface.co/models?search=internvl3) |
| **Diffusion Models** | [Stable Diffusion](https://huggingface.co/models?search=gpustack/stable-diffusion), [FLUX](https://huggingface.co/models?search=gpustack/flux) |
| **Embedding Models** | [BGE](https://huggingface.co/gpustack/bge-m3-GGUF), [BCE](https://huggingface.co/gpustack/bce-embedding-base_v1-GGUF), [Jina](https://huggingface.co/models?search=gpustack/jina-embeddings), [Qwen3-Embedding](https://huggingface.co/models?search=qwen/qwen3-embedding) |
| **Reranker Models** | [BGE](https://huggingface.co/gpustack/bge-reranker-v2-m3-GGUF), [BCE](https://huggingface.co/gpustack/bce-reranker-base_v1-GGUF), [Jina](https://huggingface.co/models?search=gpustack/jina-reranker), [Qwen3-Reranker](https://huggingface.co/models?search=qwen/qwen3-reranker) |
| **Audio Models** | [Whisper](https://huggingface.co/models?search=Systran/faster) (Speech-to-Text), [CosyVoice](https://huggingface.co/models?search=FunAudioLLM/CosyVoice) (Text-to-Speech) |
| **Category** | **Models** |
| -------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| **Large Language Models(LLMs)** | [Qwen](https://huggingface.co/models?search=Qwen/Qwen), [LLaMA](https://huggingface.co/meta-llama), [Mistral](https://huggingface.co/mistralai), [DeepSeek](https://huggingface.co/models?search=deepseek-ai/deepseek), [Phi](https://huggingface.co/models?search=microsoft/phi), [Gemma](https://huggingface.co/models?search=Google/gemma) |
| **Vision Language Models(VLMs)** | [Llama3.2-Vision](https://huggingface.co/models?pipeline_tag=image-text-to-text&search=llama3.2), [Pixtral](https://huggingface.co/models?search=pixtral) , [Qwen2.5-VL](https://huggingface.co/models?search=Qwen/Qwen2.5-VL), [LLaVA](https://huggingface.co/models?search=llava), [InternVL2.5](https://huggingface.co/models?search=internvl2_5) |
| **Diffusion Models** | [Stable Diffusion](https://huggingface.co/models?search=gpustack/stable-diffusion), [FLUX](https://huggingface.co/models?search=gpustack/flux) |
| **Embedding Models** | [BGE](https://huggingface.co/gpustack/bge-m3-GGUF), [BCE](https://huggingface.co/gpustack/bce-embedding-base_v1-GGUF), [Jina](https://huggingface.co/models?search=gpustack/jina-embeddings) |
| **Reranker Models** | [BGE](https://huggingface.co/gpustack/bge-reranker-v2-m3-GGUF), [BCE](https://huggingface.co/gpustack/bce-reranker-base_v1-GGUF), [Jina](https://huggingface.co/models?search=gpustack/jina-reranker) |
| **Audio Models** | [Whisper](https://huggingface.co/models?search=Systran/faster) (Speech-to-Text), [CosyVoice](https://huggingface.co/models?search=FunAudioLLM/CosyVoice) (Text-to-Speech) |
For full list of supported models, please refer to the supported models section in the [inference backends](https://docs.gpustack.ai/latest/user-guide/inference-backends/) documentation.

@ -10,7 +10,7 @@
<img alt="Documentation" src="https://img.shields.io/badge/文档-GPUStack-blue?logo=readthedocs&logoColor=white"></a>
<a href="./LICENSE" target="_blank">
<img alt="License" src="https://img.shields.io/github/license/gpustack/gpustack?logo=github&logoColor=white&label=License&color=blue"></a>
<a href="./docs/assets/wechat-group-qrcode.jpg" target="_blank">
<a href="./docs/assets/wechat-assistant.png" target="_blank">
<img alt="WeChat" src="https://img.shields.io/badge/微信群-GPUStack-blue?logo=wechat&logoColor=white"></a>
<a href="https://discord.gg/VXYJzuaqwD" target="_blank">
<img alt="Discord" src="https://img.shields.io/badge/Discord-GPUStack-blue?logo=discord&logoColor=white"></a>
@ -35,7 +35,7 @@ GPUStack 是一个用于运行 AI 模型的开源 GPU 集群管理器。
- **广泛的 GPU 兼容性**:无缝支持 Apple Mac、Windows PC 和 Linux 服务器上各种供应商的 GPU。
- **广泛的模型支持**:支持各种模型,包括 LLM、多模态 VLM、图像模型、语音模型、文本嵌入模型和重排序模型。
- **灵活的推理后端**：支持与 vLLM、Ascend MindIE、llama-box（llama.cpp 和 stable-diffusion.cpp）和 vox-box 等多种推理后端的灵活集成。
- **灵活的推理后端**：支持与 llama-box（llama.cpp 和 stable-diffusion.cpp）、vox-box、vLLM 和 Ascend MindIE 等多种推理后端的灵活集成。
- **多版本后端支持**:同时运行推理后端的多个版本,以满足不同模型的不同运行依赖。
- **分布式推理**:支持单机和多机多卡并行推理,包括跨供应商和运行环境的异构 GPU。
- **可扩展的 GPU 架构**:通过向基础设施添加更多 GPU 或节点轻松进行扩展。
@ -50,71 +50,84 @@ GPUStack 是一个用于运行 AI 模型的开源 GPU 集群管理器。
## 安装
### Linux
### Linux 或 macOS
如果你是 NVIDIA GPU 环境,请确保 [Docker](https://docs.docker.com/engine/install/) 和 [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) 都已经在系统中安装。 然后,执行如下命令启动 GPUStack
GPUStack 提供了安装脚本,可以将其安装为 Linux 的 systemd 服务或 macOS 的 launchd 服务,默认端口为 80。要使用此方法安装 GPUStack执行以下命令
```bash
docker run -d --name gpustack \
--restart=unless-stopped \
--gpus all \
--network=host \
--ipc=host \
-v gpustack-data:/var/lib/gpustack \
gpustack/gpustack
curl -sfL https://get.gpustack.ai | INSTALL_INDEX_URL=https://pypi.tuna.tsinghua.edu.cn/simple sh -s -
```
有关其它平台的安装或详细配置选项，请参考[安装文档](docs/installation/installation-requirements.md)。
### Windows
容器正常运行后,执行以下命令获取默认密码
以管理员身份运行 PowerShell**避免**使用 PowerShell ISE然后执行以下命令安装 GPUStack
```bash
docker exec gpustack cat /var/lib/gpustack/initial_admin_password
```

```powershell
$env:INSTALL_INDEX_URL = "https://pypi.tuna.tsinghua.edu.cn/simple"
Invoke-Expression (Invoke-WebRequest -Uri "https://get.gpustack.ai" -UseBasicParsing).Content
```
在浏览器中打开 `http://your_host_ip`,访问 GPUStack 界面。使用 `admin` 用户名和默认密码登录 GPUStack。
### 其他安装方式
有关 pip 安装、Docker 安装或详细配置选项,请参考[安装文档](https://docs.gpustack.ai/latest/installation/installation-requirements/)。
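For quick reference, a minimal sketch of the pip-based route is shown below; it assumes a prepared Python/GPU environment and the standard `gpustack` CLI entrypoint, so treat the linked installation docs as the authoritative steps for your platform.

```bash
# Assumption: Python 3.10+ with GPU drivers/toolkits already set up.
pip install gpustack

# Start the server (with a built-in worker) on the current machine.
gpustack start
```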
## 新手入门
1. 在命令行运行 **llama3.2** 模型并进行对话:
### macOS & Windows
```bash
gpustack chat llama3.2 "tell me a joke."
```
对于 macOS 和 Windows我们提供了桌面安装程序。请参阅[文档](https://docs.gpustack.ai/latest/installation/desktop-installer/)了解安装细节。
2. 运行 **stable-diffusion-v3-5-large-turbo** 模型并生成图像:
## 部署模型
> ### 💡 Tip
>
> 此命令将从 Hugging Face 下载模型(约 12GB。下载时间取决于你的网络速度。确保你有足够的磁盘空间和 VRAM12GB来运行模型。如果遇到问题你可以跳过此步骤并转到下一步。
```bash
gpustack draw hf.co/gpustack/stable-diffusion-v3-5-large-turbo-GGUF:stable-diffusion-v3-5-large-turbo-Q4_0.gguf \
"A minion holding a sign that says 'GPUStack'. The background is filled with futuristic elements like neon lights, circuit boards, and holographic displays. The minion is wearing a tech-themed outfit, possibly with LED lights or digital patterns. The sign itself has a sleek, modern design with glowing edges. The overall atmosphere is high-tech and vibrant, with a mix of dark and neon colors." \
--sample-steps 5 --show
```
1. 在 GPUStack 界面,在菜单中点击“模型库”。
命令完成后,生成的图像将出现在默认查看器中。你可以尝试修改 prompt 和 CLI 参数来定制输出
2. 从模型列表中选择 `Qwen3` 模型。
![Generated Image](https://raw.githubusercontent.com/gpustack/gpustack/main/docs/assets/quickstart-minion.png)
3. 在部署兼容性检查通过之后，点击保存按钮部署模型。
3. 在浏览器中打开 `http://your_host_ip`,访问 GPUStack 界面。使用“admin”用户名和默认密码登录 GPUStack。可以执行以下命令获取默认密码
![deploy qwen3 from catalog](docs/assets/quick-start/quick-start-qwen3.png)
**Linux 或 macOS**
4. GPUStack 将开始下载模型文件并部署模型。当部署状态显示为 `Running` 时,表示模型已成功部署。
```bash
cat /var/lib/gpustack/initial_admin_password
```
![model is running](docs/assets/quick-start/model-running.png)
**Windows**
5. 点击菜单中的“试验场 - 对话”,在右上方模型菜单中选择模型 `qwen3`。现在你可以在试验场中与 LLM 进行对话。
```powershell
Get-Content -Path "$env:APPDATA\gpustack\initial_admin_password" -Raw
```
![quick chat](docs/assets/quick-start/quick-chat.png)
4. 在菜单中点击“试验场 - 对话”,现在你可以在试验场中与 LLM 进行对话。
## 通过 API 使用模型
![Playground Screenshot](https://raw.githubusercontent.com/gpustack/gpustack/main/docs/assets/playground-screenshot.png)
1. 将鼠标移动到右下角的用户头像上，选择“API 密钥”，然后点击“新建 API 密钥”按钮。
5. 在菜单中点击“API 密钥”，然后点击“新建 API 密钥”按钮。
2. 填写“名称”,然后点击“保存”按钮。
6. 填写“名称”,然后点击“保存”按钮。
3. 复制生成的 API 密钥并将其保存。请注意，密钥只在创建时可见。
7. 复制生成的 API 密钥并将其保存。请注意，密钥只在创建时可见。
4. 现在你可以使用 API 密钥访问 OpenAI 兼容 API。例如curl 的用法如下:
8. 现在你可以使用 API 密钥访问 OpenAI 兼容 API。例如curl 的用法如下:
```bash
# Replace `your_api_key` and `your_gpustack_server_url`
# with your actual API key and GPUStack server URL.
export GPUSTACK_API_KEY=your_api_key
curl http://your_gpustack_server_url/v1/chat/completions \
curl http://your_gpustack_server_url/v1-openai/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer $GPUSTACK_API_KEY" \
-d '{
"model": "qwen3",
"model": "llama3.2",
"messages": [
{
"role": "system",
@ -122,7 +135,7 @@ curl http://your_gpustack_server_url/v1/chat/completions \
},
{
"role": "user",
"content": "Tell me a joke."
"content": "Hello!"
}
],
"stream": true
@ -131,8 +144,8 @@ curl http://your_gpustack_server_url/v1/chat/completions \
## 平台支持
- [x] Linux
- [x] macOS
- [x] Windows
## 加速框架支持
@ -144,11 +157,15 @@ curl http://your_gpustack_server_url/v1/chat/completions \
- [x] 海光 DTK
- [x] 摩尔线程 MUSA
- [x] 天数智芯 Corex
- [x] 寒武纪 MLU
我们计划在未来的版本中支持以下加速框架:
- [ ] Intel oneAPI
- [ ] Qualcomm AI Engine
## 模型支持
GPUStack 使用 [vLLM](https://github.com/vllm-project/vllm)、 [Ascend MindIE](https://www.hiascend.com/en/software/mindie)、[llama-box](https://github.com/gpustack/llama-box)(基于 [llama.cpp](https://github.com/ggml-org/llama.cpp) 和 [stable-diffusion.cpp](https://github.com/leejet/stable-diffusion.cpp))和 [vox-box](https://github.com/gpustack/vox-box) 作为后端并提供广泛的模型支持。支持从以下来源部署模型:
GPUStack 使用 [llama-box](https://github.com/gpustack/llama-box)(基于 [llama.cpp](https://github.com/ggml-org/llama.cpp) 和 [stable-diffusion.cpp](https://github.com/leejet/stable-diffusion.cpp)、[vLLM](https://github.com/vllm-project/vllm)、 [Ascend MindIE](https://www.hiascend.com/en/software/mindie) 和 [vox-box](https://github.com/gpustack/vox-box) 作为后端并提供广泛的模型支持。支持从以下来源部署模型:
1. [Hugging Face](https://huggingface.co/)
@ -158,14 +175,14 @@ GPUStack 使用 [vLLM](https://github.com/vllm-project/vllm)、 [Ascend MindIE](
### 示例模型
| **类别** | **模型** |
| ---------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| **大语言模型LLM** | [Qwen](https://huggingface.co/models?search=Qwen/Qwen), [LLaMA](https://huggingface.co/meta-llama), [Mistral](https://huggingface.co/mistralai), [DeepSeek](https://huggingface.co/models?search=deepseek-ai/deepseek), [Phi](https://huggingface.co/models?search=microsoft/phi), [Gemma](https://huggingface.co/models?search=Google/gemma) |
| **多模态模型VLM** | [Llama3.2-Vision](https://huggingface.co/models?pipeline_tag=image-text-to-text&search=llama3.2), [Pixtral](https://huggingface.co/models?search=pixtral) , [Qwen2.5-VL](https://huggingface.co/models?search=Qwen/Qwen2.5-VL), [LLaVA](https://huggingface.co/models?search=llava), [InternVL3](https://huggingface.co/models?search=internvl3) |
| **Diffusion 扩散模型** | [Stable Diffusion](https://huggingface.co/models?search=gpustack/stable-diffusion), [FLUX](https://huggingface.co/models?search=gpustack/flux) |
| **Embedding 模型** | [BGE](https://huggingface.co/gpustack/bge-m3-GGUF), [BCE](https://huggingface.co/gpustack/bce-embedding-base_v1-GGUF), [Jina](https://huggingface.co/models?search=gpustack/jina-embeddings), [Qwen3-Embedding](https://huggingface.co/models?search=qwen/qwen3-embedding) |
| **Reranker 模型** | [BGE](https://huggingface.co/gpustack/bge-reranker-v2-m3-GGUF), [BCE](https://huggingface.co/gpustack/bce-reranker-base_v1-GGUF), [Jina](https://huggingface.co/models?search=gpustack/jina-reranker), [Qwen3-Reranker](https://huggingface.co/models?search=qwen/qwen3-reranker) |
| **语音模型** | [Whisper](https://huggingface.co/models?search=Systran/faster) (Speech-to-Text), [CosyVoice](https://huggingface.co/models?search=FunAudioLLM/CosyVoice) (Text-to-Speech) |
| **类别** | **模型** |
| ---------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| **大语言模型LLM** | [Qwen](https://huggingface.co/models?search=Qwen/Qwen), [LLaMA](https://huggingface.co/meta-llama), [Mistral](https://huggingface.co/mistralai), [DeepSeek](https://huggingface.co/models?search=deepseek-ai/deepseek), [Phi](https://huggingface.co/models?search=microsoft/phi), [Gemma](https://huggingface.co/models?search=Google/gemma) |
| **多模态模型VLM** | [Llama3.2-Vision](https://huggingface.co/models?pipeline_tag=image-text-to-text&search=llama3.2), [Pixtral](https://huggingface.co/models?search=pixtral) , [Qwen2.5-VL](https://huggingface.co/models?search=Qwen/Qwen2.5-VL), [LLaVA](https://huggingface.co/models?search=llava), [InternVL2.5](https://huggingface.co/models?search=internvl2_5) |
| **Diffusion 扩散模型** | [Stable Diffusion](https://huggingface.co/models?search=gpustack/stable-diffusion), [FLUX](https://huggingface.co/models?search=gpustack/flux) |
| **Embedding 模型** | [BGE](https://huggingface.co/gpustack/bge-m3-GGUF), [BCE](https://huggingface.co/gpustack/bce-embedding-base_v1-GGUF), [Jina](https://huggingface.co/models?search=gpustack/jina-embeddings) |
| **Reranker 模型** | [BGE](https://huggingface.co/gpustack/bge-reranker-v2-m3-GGUF), [BCE](https://huggingface.co/gpustack/bce-reranker-base_v1-GGUF), [Jina](https://huggingface.co/models?search=gpustack/jina-reranker) |
| **语音模型** | [Whisper](https://huggingface.co/models?search=Systran/faster) (Speech-to-Text), [CosyVoice](https://huggingface.co/models?search=FunAudioLLM/CosyVoice) (Text-to-Speech) |
有关支持模型的完整列表,请参阅 [inference backends](https://docs.gpustack.ai/latest/user-guide/inference-backends/) 文档中的 Supported Models 部分。
@ -219,20 +236,20 @@ GPUStack 用户可以在 UI 中生成自己的 API 密钥。
## 加入社区
扫码加入社区群:
扫码添加 GPUStack 微信小助手加入社区群:
<p align="left">
<img alt="Wechat-group" src="./docs/assets/wechat-group-qrcode.jpg" width="300px"/>
<img alt="Wechat-assistant" src="./docs/assets/wechat-assistant.png" width="300px"/>
</p>
## License
版权所有 (c) 2024 GPUStack 作者
本项目基于 Apache-2.0 许可证（以下简称“许可证”）授权。
您只能在遵守许可证条款的前提下使用本项目。
许可证的完整内容请参阅 [LICENSE](./LICENSE) 文件。
除非适用法律另有规定或双方另有书面约定，依据许可证分发的软件按“原样”提供，
不附带任何明示或暗示的保证或条件。
有关许可证规定的具体权利和限制,请参阅许可证了解更多详细信息。

@ -10,7 +10,7 @@
<img alt="Documentation" src="https://img.shields.io/badge/ドキュメント-GPUStack-blue?logo=readthedocs&logoColor=white"></a>
<a href="./LICENSE" target="_blank">
<img alt="License" src="https://img.shields.io/github/license/gpustack/gpustack?logo=github&logoColor=white&label=License&color=blue"></a>
<a href="./docs/assets/wechat-group-qrcode.jpg" target="_blank">
<a href="./docs/assets/wechat-assistant.png" target="_blank">
<img alt="WeChat" src="https://img.shields.io/badge/微信群-GPUStack-blue?logo=wechat&logoColor=white"></a>
<a href="https://discord.gg/VXYJzuaqwD" target="_blank">
<img alt="Discord" src="https://img.shields.io/badge/Discord-GPUStack-blue?logo=discord&logoColor=white"></a>
@ -35,7 +35,7 @@ GPUStack は、AI モデルを実行するためのオープンソース GPU ク
- **幅広い GPU 互換性:** Apple Mac、Windows PC、Linux サーバー上のさまざまなベンダーの GPU をシームレスにサポート。
- **豊富なモデルサポート:** LLM、VLM、画像モデル、音声モデル、埋め込みモデル、リランクモデルを含む幅広いモデルをサポート。
- **柔軟な推論バックエンド:** vLLM、Ascend MindIE、llama-box（llama.cpp と stable-diffusion.cpp）、vox-box と統合。
- **柔軟な推論バックエンド:** llama-box（llama.cpp と stable-diffusion.cpp）、vox-box、vLLM、Ascend MindIE と統合。
- **マルチバージョンバックエンドサポート:** 異なるモデルの多様なランタイム要件を満たすために、推論バックエンドの複数バージョンを同時実行。
- **分散推論:** ベンダーやランタイム環境をまたぐ異種 GPU を含む、シングルノードおよびマルチノードのマルチ GPU 推論をサポート。
- **スケーラブルな GPU アーキテクチャ:** インフラストラクチャに GPU やノードを追加することで簡単にスケールアップ。
@ -50,71 +50,83 @@ GPUStack は、AI モデルを実行するためのオープンソース GPU ク
## インストール
### Linux
### Linux または macOS
NVIDIA GPU を使用している場合は、Docker と NVIDIA Container Toolkit をインストールしてください。その後、以下のコマンドで GPUStack サーバーを起動します:
GPUStack は、systemd または launchd ベースのシステムでサービスとしてインストールするスクリプトを提供しており、デフォルトポートは 80 です。この方法で GPUStack をインストールするには、以下を実行します:
```bash
docker run -d --name gpustack \
--restart=unless-stopped \
--gpus all \
--network=host \
--ipc=host \
-v gpustack-data:/var/lib/gpustack \
gpustack/gpustack
curl -sfL https://get.gpustack.ai | sh -s -
```
詳細なインストール手順やその他の GPU ハードウェアプラットフォームについては、インストールドキュメントを参照してください。
### Windows
サーバー起動後、次のコマンドでデフォルト管理者パスワードを取得できます:
管理者として PowerShell を実行しPowerShell ISE の使用は**避けてください**)、以下のコマンドを実行して GPUStack をインストールします:
```bash
cat /var/lib/gpustack/initial_admin_password
```

```powershell
Invoke-Expression (Invoke-WebRequest -Uri "https://get.gpustack.ai" -UseBasicParsing).Content
```
ブラウザで http://your_host_ip にアクセスし、ユーザー名 admin と取得したパスワードでログインします。
### その他のインストール方法
手動インストール、Docker インストール、または詳細な構成オプションについては、[インストールドキュメント](https://docs.gpustack.ai/latest/installation/installation-script/)を参照してください。
### macOS & Windows
## はじめに
macOS および Windows 向けにデスクトップインストーラーが用意されています。インストールの詳細は [ドキュメント](https://docs.gpustack.ai/latest/installation/desktop-installer/) をご覧ください。
1. **llama3.2**モデルを実行してチャットする:
## モデルのデプロイ
```bash
gpustack chat llama3.2 "tell me a joke."
```
2. **stable-diffusion-v3-5-large-turbo**モデルで画像を生成する:
> ### 💡 ヒント
>
> このコマンドは Hugging Face からモデル(約 12GBをダウンロードします。ダウンロード時間はネットワーク速度に依存します。モデルを実行するために十分なディスクスペースと VRAM12GBがあることを確認してください。問題が発生した場合は、このステップをスキップして次に進むことができます。
```bash
gpustack draw hf.co/gpustack/stable-diffusion-v3-5-large-turbo-GGUF:stable-diffusion-v3-5-large-turbo-Q4_0.gguf \
"A minion holding a sign that says 'GPUStack'. The background is filled with futuristic elements like neon lights, circuit boards, and holographic displays. The minion is wearing a tech-themed outfit, possibly with LED lights or digital patterns. The sign itself has a sleek, modern design with glowing edges. The overall atmosphere is high-tech and vibrant, with a mix of dark and neon colors." \
--sample-steps 5 --show
```
1. GPUStack UI の Catalog ページに移動します。
コマンドが完了すると、生成された画像がデフォルトビューアに表示されます。プロンプトと CLI オプションを実験して出力をカスタマイズできます。
2. モデルリストから Qwen3 モデルを選択します。
![Generated Image](https://raw.githubusercontent.com/gpustack/gpustack/main/docs/assets/quickstart-minion.png)
3. デプロイ互換性チェックが完了したら、Save ボタンをクリックしてデプロイします。
3. ブラウザで`http://your_host_ip`を開いて GPUStack UI にアクセスします。ユーザー名`admin`とデフォルトパスワードで GPUStack にログインします。デフォルト設定のパスワードを取得するには、以下のコマンドを実行します:
![deploy qwen3 from catalog](docs/assets/quick-start/quick-start-qwen3.png)
**Linux または macOS**
4. モデルのダウンロードとデプロイが開始されます。ステータスが Running になると、デプロイ成功です。
```bash
cat /var/lib/gpustack/initial_admin_password
```
![model is running](docs/assets/quick-start/model-running.png)
**Windows**
5. ナビゲーションメニューから Playground - Chat を選択し、右上の Model ドロップダウンで qwen3 が選択されていることを確認してチャットを開始します。
```powershell
Get-Content -Path "$env:APPDATA\gpustack\initial_admin_password" -Raw
```
![quick chat](docs/assets/quick-start/quick-chat.png)
4. ナビゲーションメニューで`Playground - Chat`をクリックします。これで UI プレイグラウンドで LLM とチャットできます。
## API でモデルを使用する
![Playground Screenshot](https://raw.githubusercontent.com/gpustack/gpustack/main/docs/assets/playground-screenshot.png)
1. ユーザーアバターをホバーし、API Keys ページに移動後、New API Key をクリックします。
5. ナビゲーションメニューで`API Keys`をクリックし、`New API Key`ボタンをクリックします。
2. Name を入力し、Save をクリックします。
6. `Name`を入力し、`Save`ボタンをクリックします。
3. 生成された API キーをコピーして安全な場所に保管してください(一度しか表示されません)
7. 生成された API キーをコピーして安全な場所に保存します。作成時にのみ一度だけ表示されることに注意してください
4. OpenAI 互換エンドポイントにアクセスできます。例
8. これで API キーを使用して OpenAI 互換 API にアクセスできます。例えば、curl を使用する場合
```bash
# Replace `your_api_key` and `your_gpustack_server_url`
# with your actual API key and GPUStack server URL.
export GPUSTACK_API_KEY=your_api_key
curl http://your_gpustack_server_url/v1/chat/completions \
curl http://your_gpustack_server_url/v1-openai/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer $GPUSTACK_API_KEY" \
-d '{
"model": "qwen3",
"model": "llama3.2",
"messages": [
{
"role": "system",
@ -122,7 +134,7 @@ curl http://your_gpustack_server_url/v1/chat/completions \
},
{
"role": "user",
"content": "Tell me a joke."
"content": "Hello!"
}
],
"stream": true
@ -131,8 +143,8 @@ curl http://your_gpustack_server_url/v1/chat/completions \
## サポートされているプラットフォーム
- [x] Linux
- [x] macOS
- [x] Windows
## サポートされているアクセラレータ
@ -144,11 +156,15 @@ curl http://your_gpustack_server_url/v1/chat/completions \
- [x] Hygon DTK
- [x] Moore Threads MUSA
- [x] Iluvatar Corex
- [x] Cambricon MLU
以下のアクセラレータは将来のリリースでサポートする予定です。
- [ ] Intel oneAPI
- [ ] Qualcomm AI Engine
## サポートされているモデル
GPUStack は[vLLM](https://github.com/vllm-project/vllm)、[Ascend MindIE](https://www.hiascend.com/en/software/mindie)、[llama-box](https://github.com/gpustack/llama-box)(バンドルされた[llama.cpp](https://github.com/ggml-org/llama.cpp)と[stable-diffusion.cpp](https://github.com/leejet/stable-diffusion.cpp)サーバー)、[vox-box](https://github.com/gpustack/vox-box)をバックエンドとして使用し、幅広いモデルをサポートしています。以下のソースからのモデルがサポートされています:
GPUStack は[llama-box](https://github.com/gpustack/llama-box)(バンドルされた[llama.cpp](https://github.com/ggml-org/llama.cpp)と[stable-diffusion.cpp](https://github.com/leejet/stable-diffusion.cpp)サーバー)、[vLLM](https://github.com/vllm-project/vllm)、[Ascend MindIE](https://www.hiascend.com/en/software/mindie)、[vox-box](https://github.com/gpustack/vox-box)をバックエンドとして使用し、幅広いモデルをサポートしています。以下のソースからのモデルがサポートされています:
1. [Hugging Face](https://huggingface.co/)
@ -156,16 +172,16 @@ GPUStack は[vLLM](https://github.com/vllm-project/vllm)、[Ascend MindIE](https
3. ローカルファイルパス
### モデル例
| **カテゴリ** | **モデル** |
| ----------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| **大規模言語モデルLLM** | [Qwen](https://huggingface.co/models?search=Qwen/Qwen), [LLaMA](https://huggingface.co/meta-llama), [Mistral](https://huggingface.co/mistralai), [DeepSeek](https://huggingface.co/models?search=deepseek-ai/deepseek), [Phi](https://huggingface.co/models?search=microsoft/phi), [Gemma](https://huggingface.co/models?search=Google/gemma) |
| **ビジョン言語モデルVLM** | [Llama3.2-Vision](https://huggingface.co/models?pipeline_tag=image-text-to-text&search=llama3.2), [Pixtral](https://huggingface.co/models?search=pixtral) , [Qwen2.5-VL](https://huggingface.co/models?search=Qwen/Qwen2.5-VL), [LLaVA](https://huggingface.co/models?search=llava), [InternVL3](https://huggingface.co/models?search=internvl3) |
| **拡散モデル** | [Stable Diffusion](https://huggingface.co/models?search=gpustack/stable-diffusion), [FLUX](https://huggingface.co/models?search=gpustack/flux) |
| **埋め込みモデル** | [BGE](https://huggingface.co/gpustack/bge-m3-GGUF), [BCE](https://huggingface.co/gpustack/bce-embedding-base_v1-GGUF), [Jina](https://huggingface.co/models?search=gpustack/jina-embeddings), [Qwen3-Embedding](https://huggingface.co/models?search=qwen/qwen3-embedding) |
| **リランカーモデル** | [BGE](https://huggingface.co/gpustack/bge-reranker-v2-m3-GGUF), [BCE](https://huggingface.co/gpustack/bce-reranker-base_v1-GGUF), [Jina](https://huggingface.co/models?search=gpustack/jina-reranker), [Qwen3-Reranker](https://huggingface.co/models?search=qwen/qwen3-reranker) |
| **音声モデル** | [Whisper](https://huggingface.co/models?search=Systran/faster)(音声認識)、[CosyVoice](https://huggingface.co/models?search=FunAudioLLM/CosyVoice)(音声合成) |
| **カテゴリ** | **モデル** |
| ----------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| **大規模言語モデルLLM** | [Qwen](https://huggingface.co/models?search=Qwen/Qwen), [LLaMA](https://huggingface.co/meta-llama), [Mistral](https://huggingface.co/mistralai), [DeepSeek](https://huggingface.co/models?search=deepseek-ai/deepseek), [Phi](https://huggingface.co/models?search=microsoft/phi), [Gemma](https://huggingface.co/models?search=Google/gemma) |
| **ビジョン言語モデルVLM** | [Llama3.2-Vision](https://huggingface.co/models?pipeline_tag=image-text-to-text&search=llama3.2), [Pixtral](https://huggingface.co/models?search=pixtral) , [Qwen2.5-VL](https://huggingface.co/models?search=Qwen/Qwen2.5-VL), [LLaVA](https://huggingface.co/models?search=llava), [InternVL2.5](https://huggingface.co/models?search=internvl2_5) |
| **拡散モデル** | [Stable Diffusion](https://huggingface.co/models?search=gpustack/stable-diffusion), [FLUX](https://huggingface.co/models?search=gpustack/flux) |
| **埋め込みモデル** | [BGE](https://huggingface.co/gpustack/bge-m3-GGUF), [BCE](https://huggingface.co/gpustack/bce-embedding-base_v1-GGUF), [Jina](https://huggingface.co/models?search=gpustack/jina-embeddings) |
| **リランカーモデル** | [BGE](https://huggingface.co/gpustack/bge-reranker-v2-m3-GGUF), [BCE](https://huggingface.co/gpustack/bce-reranker-base_v1-GGUF), [Jina](https://huggingface.co/models?search=gpustack/jina-reranker) |
| **音声モデル** | [Whisper](https://huggingface.co/models?search=Systran/faster)(音声認識)、[CosyVoice](https://huggingface.co/models?search=FunAudioLLM/CosyVoice)(音声合成) |
サポートされているモデルの完全なリストについては、[推論バックエンド](https://docs.gpustack.ai/latest/user-guide/inference-backends/)ドキュメントのサポートされているモデルセクションを参照してください。

@ -0,0 +1,346 @@
import asyncio
import time
import httpx
import numpy
import logging
import argparse
import json
import random
from openai import AsyncOpenAI
logging.basicConfig(
level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
)
# Avoid client side connection error: https://github.com/encode/httpx/discussions/3084
http_client = httpx.AsyncClient(
limits=httpx.Limits(
max_connections=10000, max_keepalive_connections=10000, keepalive_expiry=30
)
)
SAMPLE_PROMPTS = [
"Explain how blockchain technology works, and provide a real-world example of its application outside of cryptocurrency.",
"Compare and contrast the philosophies of Nietzsche and Kant, including their views on morality and human nature.",
"Imagine you're a travel blogger. Write a detailed post describing a week-long adventure through rural Japan.",
"Write a fictional letter from Albert Einstein to a modern-day physicist, discussing the current state of quantum mechanics.",
"Provide a comprehensive explanation of how transformers work in machine learning, including attention mechanisms and positional encoding.",
"Draft a business proposal for launching a new AI-powered productivity app, including target audience, key features, and a monetization strategy.",
"Simulate a panel discussion between Elon Musk, Marie Curie, and Sun Tzu on the topic of 'Leadership in Times of Crisis'.",
"Describe the process of photosynthesis in depth, and explain its importance in the global carbon cycle.",
"Analyze the impact of social media on political polarization, citing relevant studies or historical examples.",
"Write a short science fiction story where humans discover a parallel universe that operates under different physical laws.",
"Explain the role of the Federal Reserve in the U.S. economy and how it manages inflation and unemployment.",
"Describe the architecture of a modern web application, from frontend to backend, including databases, APIs, and deployment.",
"Write an essay discussing whether artificial general intelligence (AGI) poses an existential threat to humanity.",
"Summarize the key events and consequences of the Cuban Missile Crisis, and reflect on lessons for modern diplomacy.",
"Create a guide for beginners on how to train a custom LLM using open-source tools and publicly available datasets.",
]
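# Note: each streamed chunk that carries delta content is counted as one token,
# so the throughput numbers below are an approximation of true token counts.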
async def process_stream(stream):
first_token_time = None
total_tokens = 0
async for chunk in stream:
if first_token_time is None:
first_token_time = time.time()
if chunk.choices[0].delta.content:
total_tokens += 1
if chunk.choices[0].finish_reason is not None:
break
return first_token_time, total_tokens
async def make_request(
client: AsyncOpenAI, model, max_completion_tokens, request_timeout
):
start_time = time.time()
content = random.choice(SAMPLE_PROMPTS)
try:
stream = await client.chat.completions.create(
model=model,
messages=[{"role": "user", "content": content}],
max_completion_tokens=max_completion_tokens,
stream=True,
)
first_token_time, total_tokens = await asyncio.wait_for(
process_stream(stream), timeout=request_timeout
)
end_time = time.time()
elapsed_time = end_time - start_time
ttft = first_token_time - start_time if first_token_time else None
tokens_per_second = total_tokens / elapsed_time if elapsed_time > 0 else 0
return total_tokens, elapsed_time, tokens_per_second, ttft
except asyncio.TimeoutError:
logging.warning(f"Request timed out after {request_timeout} seconds")
return None
except Exception as e:
logging.error(f"Error during request: {str(e)}")
return None
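# Worker coroutine: pulls task ids from the shared queue until it receives a
# None sentinel; the semaphore bounds the number of in-flight requests.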
async def worker(
client,
model,
semaphore,
queue,
results,
max_completion_tokens,
request_timeout,
):
while True:
async with semaphore:
task_id = await queue.get()
if task_id is None:
queue.task_done()
break
logging.info(f"Starting request {task_id}")
result = await make_request(
client, model, max_completion_tokens, request_timeout
)
if result:
results.append(result)
else:
logging.warning(f"Request {task_id} failed")
queue.task_done()
logging.info(f"Finished request {task_id}")
def calculate_percentile(values, percentile, reverse=False):
if not values:
return None
if reverse:
return numpy.percentile(values, 100 - percentile)
return numpy.percentile(values, percentile)
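# Fire one small request (16 completion tokens, 60s timeout) to verify the
# model is reachable before launching the full benchmark.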
async def preflight_check(client, model) -> bool:
result = await make_request(client, model, 16, 60)
return result is not None
async def main(
model,
num_requests,
concurrency,
request_timeout,
max_completion_tokens,
server_url,
api_key,
):
client = AsyncOpenAI(
base_url=f"{server_url}/v1",
api_key=api_key,
http_client=http_client,
max_retries=0,
)
if not await preflight_check(client, model):
logging.error(
"Preflight check failed. Please check configuration and the service status."
)
return
semaphore = asyncio.Semaphore(concurrency)
queue = asyncio.Queue()
results = []
# Add tasks to the queue
for i in range(num_requests):
await queue.put(i)
# Add sentinel values to stop workers
for _ in range(concurrency):
await queue.put(None)
# Create worker tasks
workers = [
asyncio.create_task(
worker(
client,
model,
semaphore,
queue,
results,
max_completion_tokens,
request_timeout,
)
)
for _ in range(concurrency)
]
start_time = time.time()
# Wait for all tasks to complete
await queue.join()
await asyncio.gather(*workers)
end_time = time.time()
# Calculate metrics
total_elapsed_time = end_time - start_time
total_tokens = sum(tokens for tokens, _, _, _ in results if tokens is not None)
latencies = [
elapsed_time for _, elapsed_time, _, _ in results if elapsed_time is not None
]
tokens_per_second_list = [tps for _, _, tps, _ in results if tps is not None]
ttft_list = [ttft for _, _, _, ttft in results if ttft is not None]
successful_requests = len(results)
success_rate = successful_requests / num_requests if num_requests > 0 else 0
requests_per_second = (
successful_requests / total_elapsed_time if total_elapsed_time > 0 else 0
)
avg_latency = sum(latencies) / len(latencies) if latencies else 0
avg_tokens_per_second = (
sum(tokens_per_second_list) / len(tokens_per_second_list)
if tokens_per_second_list
else 0
)
overall_tokens_per_second = (
total_tokens / total_elapsed_time if total_elapsed_time > 0 else 0
)
avg_ttft = sum(ttft_list) / len(ttft_list) if ttft_list else 0
# Calculate percentiles
percentiles = [50, 95, 99]
latency_percentiles = [calculate_percentile(latencies, p) for p in percentiles]
tps_percentiles = [
calculate_percentile(tokens_per_second_list, p, reverse=True)
for p in percentiles
]
ttft_percentiles = [calculate_percentile(ttft_list, p) for p in percentiles]
return {
"model": model,
"total_requests": num_requests,
"successful_requests": successful_requests,
"success_rate": success_rate,
"concurrency": concurrency,
"request_timeout": request_timeout,
"max_completion_tokens": max_completion_tokens,
"total_time": total_elapsed_time,
"requests_per_second": requests_per_second,
"total_completion_tokens": total_tokens,
"latency": {
"average": avg_latency,
"p50": latency_percentiles[0],
"p95": latency_percentiles[1],
"p99": latency_percentiles[2],
},
"tokens_per_second": {
"overall": overall_tokens_per_second,
"average": avg_tokens_per_second,
"p50": tps_percentiles[0],
"p95": tps_percentiles[1],
"p99": tps_percentiles[2],
},
"time_to_first_token": {
"average": avg_ttft,
"p50": ttft_percentiles[0],
"p95": ttft_percentiles[1],
"p99": ttft_percentiles[2],
},
}
def output_results(results, result_file=None):
# Round all floats in results to two decimal places for output
def _round_floats(obj, ndigits=2):
if isinstance(obj, dict):
return {k: _round_floats(v, ndigits) for k, v in obj.items()}
if isinstance(obj, list):
return [_round_floats(v, ndigits) for v in obj]
if isinstance(obj, float):
return round(obj, ndigits)
return obj
formatted_results = _round_floats(results, 2)
if result_file:
with open(result_file, "w") as f:
json.dump(formatted_results, f, indent=2)
logging.info(f"Results saved to {result_file}")
else:
print(json.dumps(formatted_results, indent=2))
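# Apply custom -H/--header values and the request timeout to the shared
# httpx client before the benchmark starts.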
def set_http_client(args):
if args.headers:
for header in args.headers:
if ":" not in header:
parser.error(f"Invalid header format: {header}. Expected Key:Value")
key, value = header.split(":", 1)
http_client.headers[key.strip()] = value.strip()
http_client.timeout = args.request_timeout
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Benchmark Chat Completions API")
parser.add_argument(
"-m", "--model", type=str, required=True, help="Name of the model"
)
parser.add_argument(
"-n",
"--num-requests",
type=int,
default=100,
help="Number of requests to make (default: 100)",
)
parser.add_argument(
"-c",
"--concurrency",
type=int,
default=10,
help="Number of concurrent requests (default: 10)",
)
parser.add_argument(
"--request-timeout",
type=int,
default=300,
help="Timeout for each request in seconds (default: 300)",
)
parser.add_argument(
"--max-completion-tokens",
type=int,
default=1024,
help="Maximum number of tokens in the completion (default: 1024)",
)
parser.add_argument(
"--server-url",
type=str,
default="http://127.0.0.1",
help="URL of the GPUStack server",
)
parser.add_argument("--api-key", type=str, default="fake", help="GPUStack API key")
parser.add_argument(
"--result-file",
type=str,
help="Result file path to save benchmark json results",
)
parser.add_argument(
"-H",
"--header",
action="append",
dest="headers",
help="Custom HTTP header in Key:Value format. May be specified multiple times.",
)
args = parser.parse_args()
set_http_client(args)
results = asyncio.run(
main(
args.model,
args.num_requests,
args.concurrency,
args.request_timeout,
args.max_completion_tokens,
args.server_url,
args.api_key,
)
)
output_results(results, args.result_file)
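
A usage sketch for the script added above; the file name is not visible in this diff, so `benchmark.py` below is only a placeholder, while the flags mirror the argparse definitions in the script.

```bash
# Hypothetical file name; replace with the script's actual path.
python benchmark.py \
  -m qwen3 \
  -n 200 -c 20 \
  --max-completion-tokens 512 \
  --server-url http://your_gpustack_server_url \
  --api-key your_api_key \
  --result-file results.json
```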

@ -1,654 +0,0 @@
import asyncio
from dataclasses import asdict, dataclass, is_dataclass
import time
from typing import List, Optional
import aiohttp
import numpy
import logging
import argparse
import json
import random
from openai import APIConnectionError, AsyncOpenAI
from aiohttp import ClientSession
from httpx_aiohttp import AiohttpTransport
from openai import DefaultAsyncHttpxClient
from openai.types.chat import (
ChatCompletionStreamOptionsParam,
)
from tqdm import tqdm
logging.basicConfig(
level=logging.WARNING, format="%(asctime)s - %(levelname)s - %(message)s"
)
SAMPLE_PROMPTS = [
"Explain how blockchain technology works, and provide a real-world example of its application outside of cryptocurrency.",
"Compare and contrast the philosophies of Nietzsche and Kant, including their views on morality and human nature.",
"Imagine you're a travel blogger. Write a detailed post describing a week-long adventure through rural Japan.",
"Write a fictional letter from Albert Einstein to a modern-day physicist, discussing the current state of quantum mechanics.",
"Provide a comprehensive explanation of how transformers work in machine learning, including attention mechanisms and positional encoding.",
"Draft a business proposal for launching a new AI-powered productivity app, including target audience, key features, and a monetization strategy.",
"Simulate a panel discussion between Elon Musk, Marie Curie, and Sun Tzu on the topic of 'Leadership in Times of Crisis'.",
"Describe the process of photosynthesis in depth, and explain its importance in the global carbon cycle.",
"Analyze the impact of social media on political polarization, citing relevant studies or historical examples.",
"Write a short science fiction story where humans discover a parallel universe that operates under different physical laws.",
"Explain the role of the Federal Reserve in the U.S. economy and how it manages inflation and unemployment.",
"Describe the architecture of a modern web application, from frontend to backend, including databases, APIs, and deployment.",
"Write an essay discussing whether artificial general intelligence (AGI) poses an existential threat to humanity.",
"Summarize the key events and consequences of the Cuban Missile Crisis, and reflect on lessons for modern diplomacy.",
"Create a guide for beginners on how to train a custom LLM using open-source tools and publicly available datasets.",
]
@dataclass
class PercentileResults:
average: float
p50: float
p95: float
p99: float
@dataclass
class BenchmarkResults:
model: str
total_requests: int
successful_requests: int
success_rate: float
concurrency: int
request_timeout: int
max_completion_tokens: int
total_time: float
requests_per_second: float
total_tokens: int
total_prompt_tokens: int
total_completion_tokens: int
total_tokens_per_second: float
total_prompt_tokens_per_second: float
total_completion_tokens_per_second: float
latency: PercentileResults
completion_tokens_per_second: PercentileResults
time_to_first_token: PercentileResults
async def process_stream(stream):
first_token_time = None
async for chunk in stream:
if first_token_time is None:
first_token_time = time.time()
if chunk.usage:
return first_token_time, chunk.usage
return first_token_time, None
def get_random_prompt(prompt_multiplier):
"""
Returns a random prompt from the SAMPLE_PROMPTS list, repeated prompt_multiplier times.
"""
# Add a random prefix to avoid prefix cache hits
random_prefix = str(random.randint(100000, 999999))
return (
random_prefix + " " + (random.choice(SAMPLE_PROMPTS) + " ") * prompt_multiplier
)
async def make_chat_completion_request(
client: AsyncOpenAI,
model,
max_completion_tokens,
ignore_eos,
request_timeout,
prompt_multiplier,
):
start_time = time.time()
content = get_random_prompt(prompt_multiplier)
try:
stream = await client.chat.completions.create(
model=model,
messages=[{"role": "user", "content": content}],
max_completion_tokens=max_completion_tokens,
stream=True,
stream_options=ChatCompletionStreamOptionsParam(include_usage=True),
extra_body={"ignore_eos": ignore_eos} if ignore_eos else None,
)
first_token_time, usage = await asyncio.wait_for(
process_stream(stream), timeout=request_timeout
)
end_time = time.time()
elapsed_time = end_time - start_time
ttft = (first_token_time - start_time) * 1000 if first_token_time else None
return usage, elapsed_time, ttft
except asyncio.TimeoutError:
logging.warning(f"Request timed out after {request_timeout} seconds")
return None
except APIConnectionError as e:
logging.error(f"API connection error: {str(e)}")
return None
except Exception as e:
logging.error(f"Error during request: {str(e)}")
return None
async def make_embedding_request(
client: AsyncOpenAI,
model,
request_timeout,
prompt_multiplier=1,
):
start_time = time.time()
content = get_random_prompt(prompt_multiplier)
try:
response = await asyncio.wait_for(
client.embeddings.create(model=model, input=content),
timeout=request_timeout,
)
end_time = time.time()
elapsed_time = end_time - start_time
ttft = None # Embeddings do not have a time to first token in the same way as chat completions
return response.usage, elapsed_time, ttft
except asyncio.TimeoutError:
logging.warning(f"Embedding request timed out after {request_timeout} seconds")
return None
except Exception as e:
logging.error(f"Error during embedding request: {str(e)}")
return None
async def worker(
client,
model,
semaphore,
queue,
results,
max_completion_tokens,
ignore_eos,
request_timeout,
embeddings=False,
prompt_multiplier=1,
pbar=None,
):
while True:
async with semaphore:
task_id = await queue.get()
if task_id is None:
queue.task_done()
break
logging.debug(f"Starting request {task_id}")
if embeddings:
result = await make_embedding_request(
client, model, request_timeout, prompt_multiplier
)
else:
result = await make_chat_completion_request(
client,
model,
max_completion_tokens,
ignore_eos,
request_timeout,
prompt_multiplier,
)
if result:
results.append(result)
else:
logging.warning(f"Request {task_id} failed")
queue.task_done()
if pbar:
pbar.update(1)
logging.debug(f"Finished request {task_id}")
def calculate_percentile(values, percentile, reverse=False):
if not values:
return None
if reverse:
return numpy.percentile(values, 100 - percentile)
return numpy.percentile(values, percentile)
async def preflight_check(client, model, embeddings=False) -> bool:
if embeddings:
result = await make_embedding_request(client, model, 16)
else:
result = await make_chat_completion_request(client, model, 16, False, 60, 1)
return result is not None
def set_headers(aiohttp_session: ClientSession, headers: Optional[List[str]]):
if headers:
for header in headers:
if ":" not in header:
raise ValueError(f"Invalid header format: {header}. Expected Key:Value")
key, value = header.split(":", 1)
aiohttp_session.headers[key.strip()] = value.strip()
async def main(
model,
num_requests,
concurrency,
request_timeout,
max_completion_tokens,
ignore_eos,
server_url,
api_key,
headers=None,
embeddings=False,
prompt_multiplier=1,
) -> Optional[BenchmarkResults]:
connector = aiohttp.TCPConnector(
limit=2000,
force_close=True,
)
async with ClientSession(connector=connector, trust_env=True) as aiohttp_session:
if headers:
set_headers(aiohttp_session, headers)
transport = AiohttpTransport(client=aiohttp_session)
httpx_client = DefaultAsyncHttpxClient(
transport=transport, timeout=request_timeout
)
client = AsyncOpenAI(
base_url=f"{server_url}/v1",
api_key=api_key,
http_client=httpx_client,
max_retries=0,
)
if not await preflight_check(client, model, embeddings=embeddings):
raise Exception(
"Preflight check failed. Please check configuration and the service status."
)
semaphore = asyncio.Semaphore(concurrency)
queue = asyncio.Queue()
results = []
# Add tasks to the queue
for i in range(num_requests):
await queue.put(i)
# Add sentinel values to stop workers
for _ in range(concurrency):
await queue.put(None)
pbar = tqdm(
total=num_requests,
desc="Running Benchmark requests",
unit="request",
dynamic_ncols=True,
)
# Create worker tasks
workers = [
asyncio.create_task(
worker(
client,
model,
semaphore,
queue,
results,
max_completion_tokens,
ignore_eos,
request_timeout,
embeddings,
prompt_multiplier,
pbar=pbar,
)
)
for _ in range(concurrency)
]
start_time = time.time()
# Wait for all tasks to complete
await queue.join()
await asyncio.gather(*workers)
end_time = time.time()
total_elapsed_time = end_time - start_time
return calculate_results(
model,
concurrency,
request_timeout,
max_completion_tokens,
total_elapsed_time,
num_requests,
results,
)
def calculate_results(
model,
concurrency,
request_timeout,
max_completion_tokens,
total_elapsed_time,
num_requests,
results,
):
# Calculate metrics
total_tokens = 0
prompt_tokens = 0
completion_tokens = 0
tokens_per_second_list = []
prompt_tokens_per_second_list = []
completion_tokens_per_second_list = []
for usage, elapsed_time, _ in results:
if usage is not None:
total_tokens += usage.total_tokens
prompt_tokens += usage.prompt_tokens
completion_tokens += usage.completion_tokens
prompt_tokens_per_second = (
usage.prompt_tokens / elapsed_time if elapsed_time > 0 else 0
)
completion_tokens_per_second = (
usage.completion_tokens / elapsed_time if elapsed_time > 0 else 0
)
tokens_per_second = (
usage.total_tokens / elapsed_time if elapsed_time > 0 else 0
)
tokens_per_second_list.append(tokens_per_second)
prompt_tokens_per_second_list.append(prompt_tokens_per_second)
completion_tokens_per_second_list.append(completion_tokens_per_second)
latencies = [
elapsed_time for _, elapsed_time, _ in results if elapsed_time is not None
]
ttft_list = [ttft for _, _, ttft in results if ttft is not None]
successful_requests = len(results)
success_rate = successful_requests / num_requests if num_requests > 0 else 0
requests_per_second = (
successful_requests / total_elapsed_time if total_elapsed_time > 0 else 0
)
avg_latency = sum(latencies) / len(latencies) if latencies else 0
avg_completion_tokens_per_second = (
sum(completion_tokens_per_second_list) / len(completion_tokens_per_second_list)
if completion_tokens_per_second_list
else 0
)
total_tokens_per_second = (
total_tokens / total_elapsed_time if total_elapsed_time > 0 else 0
)
total_prompt_tokens_per_second = (
prompt_tokens / total_elapsed_time if total_elapsed_time > 0 else 0
)
total_completion_tokens_per_second = (
completion_tokens / total_elapsed_time if total_elapsed_time > 0 else 0
)
avg_ttft = sum(ttft_list) / len(ttft_list) if ttft_list else 0
# Calculate percentiles
percentiles = [50, 95, 99]
latency_percentiles = [calculate_percentile(latencies, p) for p in percentiles]
completion_tps_percentiles = [
calculate_percentile(completion_tokens_per_second_list, p, reverse=True)
for p in percentiles
]
ttft_percentiles = [calculate_percentile(ttft_list, p) for p in percentiles]
return BenchmarkResults(
model=model,
total_requests=num_requests,
successful_requests=successful_requests,
success_rate=success_rate,
concurrency=concurrency,
request_timeout=request_timeout,
max_completion_tokens=max_completion_tokens,
total_time=total_elapsed_time,
requests_per_second=requests_per_second,
total_tokens=total_tokens,
total_prompt_tokens=prompt_tokens,
total_completion_tokens=completion_tokens,
total_tokens_per_second=total_tokens_per_second,
total_prompt_tokens_per_second=total_prompt_tokens_per_second,
total_completion_tokens_per_second=total_completion_tokens_per_second,
latency=PercentileResults(
average=avg_latency,
p50=latency_percentiles[0],
p95=latency_percentiles[1],
p99=latency_percentiles[2],
),
completion_tokens_per_second=PercentileResults(
average=avg_completion_tokens_per_second,
p50=completion_tps_percentiles[0],
p95=completion_tps_percentiles[1],
p99=completion_tps_percentiles[2],
),
time_to_first_token=PercentileResults(
average=avg_ttft,
p50=ttft_percentiles[0],
p95=ttft_percentiles[1],
p99=ttft_percentiles[2],
),
)
def fmt_line(label, *values, width=40):
label_part = f"{label:<{width}}"
value_part = " ".join(str(v) for v in values)
return f"{label_part}{value_part}"
def fmt_float(v, suffix=""):
return f"{v:.2f}{suffix}"
def output_benchmark_results_pretty(
results: BenchmarkResults, file: str = None, embeddings: bool = False
):
lines = []
lines.append("============== Serving Benchmark Result ===============")
lines.append(fmt_line("Model:", results.model))
lines.append(
fmt_line(
"Total requests:",
f"{results.successful_requests}/{results.total_requests}({results.success_rate:.2%})",
)
)
lines.append(fmt_line("Concurrency:", results.concurrency))
lines.append(fmt_line("Benchmark duration (s):", fmt_float(results.total_time)))
lines.append(
fmt_line("Request throughput (req/s):", fmt_float(results.requests_per_second))
)
lines.append(fmt_line("Total input tokens:", results.total_prompt_tokens))
if not embeddings:
lines.append(fmt_line("Total output tokens:", results.total_completion_tokens))
output_tok_per_sec = (
results.total_completion_tokens / results.total_time
if results.total_time > 0
else 0
)
total_tok_per_sec = (
results.total_tokens / results.total_time if results.total_time > 0 else 0
)
if not embeddings:
lines.append(
fmt_line("Output token throughput (tok/s):", fmt_float(output_tok_per_sec))
)
lines.append(
fmt_line("Total token throughput (tok/s):", fmt_float(total_tok_per_sec))
)
lines.append("------------------- Request Latency -------------------")
lines.append(fmt_line("Average latency (s):", fmt_float(results.latency.average)))
lines.append(fmt_line("P50 latency (s):", fmt_float(results.latency.p50)))
lines.append(fmt_line("P95 latency (s):", fmt_float(results.latency.p95)))
lines.append(fmt_line("P99 latency (s):", fmt_float(results.latency.p99)))
if not embeddings:
lines.append("--------------- Output Token Per Second ---------------")
lines.append(
fmt_line(
"Average TPS (tok/s):",
fmt_float(results.completion_tokens_per_second.average),
)
)
lines.append(
fmt_line(
"P50 TPS (tok/s):", fmt_float(results.completion_tokens_per_second.p50)
)
)
lines.append(
fmt_line(
"P95 TPS (tok/s):", fmt_float(results.completion_tokens_per_second.p95)
)
)
lines.append(
fmt_line(
"P99 TPS (tok/s):", fmt_float(results.completion_tokens_per_second.p99)
)
)
lines.append("----------------- Time to First Token -----------------")
lines.append(
fmt_line(
"Average TTFT (ms):", fmt_float(results.time_to_first_token.average)
)
)
lines.append(
fmt_line("P50 TTFT (ms):", fmt_float(results.time_to_first_token.p50))
)
lines.append(
fmt_line("P95 TTFT (ms):", fmt_float(results.time_to_first_token.p95))
)
lines.append(
fmt_line("P99 TTFT (ms):", fmt_float(results.time_to_first_token.p99))
)
lines.append("=" * 55)
output = "\n".join(lines)
if file:
with open(file, "w") as f:
f.write(output + "\n")
logging.info(f"Pretty benchmark results saved to {file}")
else:
print(output)
def output_benchmark_results_json(
results: BenchmarkResults, result_file=None, embeddings: bool = False
):
# Round all floats in results to two decimal places for output
def _round_floats(obj, ndigits=2):
if is_dataclass(obj):
obj = asdict(obj)
if isinstance(obj, dict):
return {k: _round_floats(v, ndigits) for k, v in obj.items()}
if isinstance(obj, list):
return [_round_floats(v, ndigits) for v in obj]
if isinstance(obj, float):
return round(obj, ndigits)
return obj
formatted_results = _round_floats(results, 2)
if result_file:
with open(result_file, "w") as f:
json.dump(formatted_results, f, indent=2)
logging.info(f"Results saved to {result_file}")
else:
print(json.dumps(formatted_results, indent=2))
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Benchmark Chat Completions API")
parser.add_argument(
"-m", "--model", type=str, required=True, help="Name of the model"
)
parser.add_argument(
"-n",
"--num-requests",
type=int,
default=100,
help="Number of requests to make (default: 100)",
)
parser.add_argument(
"-c",
"--concurrency",
type=int,
default=10,
help="Number of concurrent requests (default: 10)",
)
parser.add_argument(
"--request-timeout",
type=int,
default=300,
help="Timeout for each request in seconds (default: 300)",
)
parser.add_argument(
"--max-completion-tokens",
type=int,
default=1024,
help="Maximum number of tokens in the completion (default: 1024)",
)
parser.add_argument(
"--prompt-multiplier",
type=int,
default=1,
help="Repeat the randomly selected prompt N times to create longer inputs",
)
parser.add_argument(
'--ignore-eos',
action='store_true',
help='Set ignore_eos flag when sending the benchmark request. This will not stop the stream when the model generates an EOS token.',
)
parser.add_argument(
"--server-url",
type=str,
default="http://127.0.0.1",
help="URL of the GPUStack server",
)
parser.add_argument("--api-key", type=str, default="fake", help="GPUStack API key")
parser.add_argument(
"--result-file",
type=str,
help="Result file path to save benchmark json results",
)
parser.add_argument(
"-H",
"--header",
action="append",
dest="headers",
help="Custom HTTP header in Key:Value format. May be specified multiple times.",
)
parser.add_argument(
'--embeddings',
action='store_true',
help='Run embedding benchmark instead of chat completions',
)
parser.add_argument(
'--json',
action='store_true',
help='Output results in JSON format instead of pretty format',
)
args = parser.parse_args()
try:
results = asyncio.run(
main(
args.model,
args.num_requests,
args.concurrency,
args.request_timeout,
args.max_completion_tokens,
args.ignore_eos,
args.server_url,
args.api_key,
args.headers,
args.embeddings,
args.prompt_multiplier,
)
)
if args.json:
output_benchmark_results_json(
results, args.result_file, embeddings=args.embeddings
)
else:
output_benchmark_results_pretty(
results, args.result_file, embeddings=args.embeddings
)
except Exception as e:
logging.error(f"Benchmarking failed: {str(e)}")
exit(1)

@ -1,26 +0,0 @@
aiohappyeyeballs==2.6.1
aiohttp==3.12.13
aiosignal==1.3.2
annotated-types==0.7.0
anyio==4.9.0
attrs==25.3.0
certifi==2025.6.15
distro==1.9.0
frozenlist==1.7.0
h11==0.16.0
httpcore==1.0.9
httpx==0.28.1
httpx-aiohttp==0.1.6
idna==3.10
jiter==0.10.0
multidict==6.5.1
numpy==2.3.1
openai==1.92.2
propcache==0.3.2
pydantic==2.11.7
pydantic_core==2.33.2
sniffio==1.3.1
tqdm==4.67.1
typing-inspection==0.4.1
typing_extensions==4.14.0
yarl==1.20.1

@ -1,21 +0,0 @@
import shutil
import tempfile
import pytest
from gpustack.config.config import Config, set_global_config
@pytest.fixture(scope="module", autouse=True)
def temp_dir():
tmp_dir = tempfile.mkdtemp()
print(f"Created temporary directory: {tmp_dir}")
yield tmp_dir
shutil.rmtree(tmp_dir)
@pytest.fixture(scope="module", autouse=True)
def config(temp_dir):
cfg = Config(
token="test", jwt_secret_key="test", data_dir=temp_dir, enable_ray=True
)
set_global_config(cfg)
return cfg

@ -27,7 +27,7 @@ The GPUStack server connects to a SQL database as the datastore. GPUStack uses S
### Inference Server
Inference servers are the backends that perform the inference tasks. GPUStack supports [vLLM](https://github.com/vllm-project/vllm), [Ascend MindIE](https://www.hiascend.com/en/software/mindie), [llama-box](https://github.com/gpustack/llama-box) and [vox-box](https://github.com/gpustack/vox-box) as inference servers.
Inference servers are the backends that perform the inference tasks. GPUStack supports [llama-box](https://github.com/gpustack/llama-box), [vLLM](https://github.com/vllm-project/vllm), [Ascend MindIE](https://www.hiascend.com/en/software/mindie) and [vox-box](https://github.com/gpustack/vox-box) as inference servers.
### RPC Server

(Omitted: a number of binary image assets were also modified in this diff; previews are not shown, and some additional changed files are not listed in this view.)