- install GCC if the Ubuntu version is older than 21.04
- link llama-box-rpc-server via directory-relative linking
- build Python from source instead of installing it
- install MindIE via a pipx venv
- use a multi-stage build to install vLLM in parallel
- remove the running envs of vLLM instances
Signed-off-by: thxCode <thxcode0824@gmail.com>
- accept new arguments: --swa-full, --max-projected-cache
- add exception handling for specific versions
Signed-off-by: thxCode <thxcode0824@gmail.com>
@@ -34,7 +35,7 @@ GPUStack is an open-source GPU cluster manager for running AI models.
- **Broad GPU Compatibility:** Seamlessly supports GPUs from various vendors across Apple Macs, Windows PCs, and Linux servers.
- **Extensive Model Support:** Supports a wide range of models including LLMs, VLMs, image models, audio models, embedding models, and rerank models.
- **Flexible Inference Backends:** Flexibly integrates with multiple inference backends including llama-box (llama.cpp & stable-diffusion.cpp), vox-box, vLLM, and Ascend MindIE.
- **Multi-Version Backend Support:** Run multiple versions of inference backends concurrently to meet the diverse runtime requirements of different models.
- **Distributed Inference:** Supports single-node and multi-node multi-GPU inference, including heterogeneous GPUs across vendors and runtime environments.
- **Scalable GPU Architecture:** Easily scale up by adding more GPUs or nodes to your infrastructure.
"A minion holding a sign that says 'GPUStack'. The background is filled with futuristic elements like neon lights, circuit boards, and holographic displays. The minion is wearing a tech-themed outfit, possibly with LED lights or digital patterns. The sign itself has a sleek, modern design with glowing edges. The overall atmosphere is high-tech and vibrant, with a mix of dark and neon colors." \
GPUStack supports a mix of Linux, Windows, and macOS nodes, as well as x86_64 and arm64 architectures. It also supports various GPUs, including NVIDIA, Apple Metal, AMD, Ascend, Hygon, and Moore Threads.
When running GPUStack with Docker, it works out of the box in an air-gapped environment as long as the Docker images are available. To do this, follow these steps:
1. Pull the GPUStack Docker image in an online environment:
```bash
docker pull gpustack/gpustack:latest-corex
```
If your online environment differs from the air-gapped environment in terms of OS or arch, specify the OS and arch of the air-gapped environment when pulling the image:
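For example, a sketch assuming the air-gapped hosts are arm64 Linux (substitute the platform of your own target environment):

```bash
docker pull --platform linux/arm64 gpustack/gpustack:latest-corex
```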
Run the following command to start the GPUStack server **and built-in worker** (host network mode is recommended):
```bash
docker run -d --name gpustack \
-v /lib/modules:/lib/modules \
-v /dev:/dev \
--privileged \
--cap-add=ALL \
--pid=host \
--restart=unless-stopped \
--network=host \
--ipc=host \
-v gpustack-data:/var/lib/gpustack \
gpustack/gpustack:latest-corex
```
If you need to change the default server port 80, please use the `--port` parameter:
```bash
docker run -d --name gpustack \
-v /lib/modules:/lib/modules \
-v /dev:/dev \
--privileged \
--cap-add=ALL \
--pid=host \
--restart=unless-stopped \
--network=host \
--ipc=host \
-v gpustack-data:/var/lib/gpustack \
gpustack/gpustack:latest-corex \
--port 9090
```
If other ports are in conflict, or if you want to customize startup options, refer to the [CLI Reference](../../cli-reference/start.md) for available flags and configuration instructions.
Check if the startup logs are normal:
```bash
docker logs -f gpustack
```
If the logs are normal, open `http://your_host_ip` in the browser to access the GPUStack UI. Log in to GPUStack with username `admin` and the default password. You can run the following command to get the password for the default setup:
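For example, assuming the container name and data volume from the commands above (the password file path may differ between GPUStack versions):

```bash
docker exec -it gpustack cat /var/lib/gpustack/initial_admin_password
```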
You can add more GPU nodes to GPUStack to form a GPU cluster. You need to add workers on other GPU nodes and specify the `--server-url` and `--token` parameters to join GPUStack.
To get the token used for adding workers, run the following command on the GPUStack **server node**:
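A sketch assuming the server container is named `gpustack` as above; the token file path under the data directory is an assumption and may vary by version:

```bash
docker exec -it gpustack cat /var/lib/gpustack/token
```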
To start GPUStack as a worker, and **register it with the GPUStack server**, run the following command on the **worker node**. Be sure to replace the URL and token with your specific values:
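A minimal sketch that mirrors the server command above, with the worker registration flags appended; `http://your_gpustack_url` and `your_gpustack_token` are placeholders:

```bash
docker run -d --name gpustack \
-v /lib/modules:/lib/modules \
-v /dev:/dev \
--privileged \
--cap-add=ALL \
--pid=host \
--restart=unless-stopped \
--network=host \
--ipc=host \
-v gpustack-data:/var/lib/gpustack \
gpustack/gpustack:latest-corex \
--server-url http://your_gpustack_url \
--token your_gpustack_token
```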
1. **Heterogeneous clusters are supported.** Regardless of the device type, you can add it to the current GPUStack as a worker by specifying the `--server-url` and `--token` parameters.
2. You can set additional flags for the `gpustack start` command by appending them to the `docker run` command.
For configuration details, please refer to the [CLI Reference](../../cli-reference/start.md).
@@ -8,7 +8,7 @@ GPUStack is an open-source GPU cluster manager for running AI models.
- **Broad GPU Compatibility:** Seamlessly supports GPUs from various vendors across Apple Macs, Windows PCs, and Linux servers.
- **Extensive Model Support:** Supports a wide range of models including LLMs, VLMs, image models, audio models, embedding models, and rerank models.
- **Flexible Inference Backends:** Flexibly integrates with multiple inference backends including llama-box (llama.cpp & stable-diffusion.cpp), vox-box, vLLM, and Ascend MindIE.
- **Multi-Version Backend Support:** Run multiple versions of inference backends concurrently to meet the diverse runtime requirements of different models.
- **Distributed Inference:** Supports single-node and multi-node multi-GPU inference, including heterogeneous GPUs across vendors and runtime environments.
- **Scalable GPU Architecture:** Easily scale up by adding more GPUs or nodes to your infrastructure.
@@ -35,6 +35,7 @@ GPUStack is an open-source GPU cluster manager for running AI models.
- [x] Ascend CANN
- [x] Hygon DTK
- [x] Moore Threads MUSA
- [x] Iluvatar Corex
We plan to support the following accelerators in future releases.
@@ -75,12 +75,12 @@ For more details, please refer to [vLLM documentation](https://docs.vllm.ai/en/s
### Supported Platforms
The vLLM backend works on Linux.
!!! Note
1. When users install GPUStack on amd64 Linux using the installation script, vLLM is automatically installed.
2. When users deploy a model using the vLLM backend, GPUStack sets worker label selectors to `{"os": "linux"}` by default to ensure the model instance is scheduled to proper workers. You can customize the worker label selectors in the model configuration.
description: DeepSeek-R1-0528-Qwen3-8B is a post-trained model derived by distilling the chain-of-thought reasoning patterns from DeepSeek-R1-0528 into the Qwen3 8B Base model. As a result, it achieves state-of-the-art (SOTA) performance among open-source models on the AIME 2024 benchmark, outperforming the original Qwen3 8B by 10.0% and reaching the level of Qwen3-235B-thinking.
description: DeepSeek-R1-0528 is a minor version of the DeepSeek R1 model that features enhanced reasoning depth and inference capabilities. These improvements are achieved through increased computational resources and algorithmic optimizations applied during post-training. The model delivers strong performance across a range of benchmark evaluations, including mathematics, programming, and general logic, with overall capabilities approaching those of leading models such as O3 and Gemini 2.5 Pro.
home: https://www.deepseek.com
icon: /static/catalog_icons/deepseek.png
categories:
  - llm
capabilities:
  - context/128K
sizes:
  - 671
licenses:
  - mit
release_date: "2025-05-28"
templates:
  - quantizations:
      - UD-IQ1_M
      - UD-IQ1_S
      - UD-Q2_K_XL
      - UD-Q3_K_XL
      - Q4_K_M
      - Q8_0
      - BF16
    source: huggingface
    huggingface_repo_id: unsloth/DeepSeek-R1-0528-GGUF
    huggingface_filename: "*-{quantization}*.gguf"
    replicas: 1
    backend: llama-box
    backend_parameters:
      # give R1 more default context to think
      - --ctx-size=32768
      # recommended temperature and top_p for R1
      - --temp=0.6
      - --top-p=0.95
    cpu_offloading: true
    distributed_inference_across_workers: true
  - quantizations: ["FP8"]
    source: huggingface
    huggingface_repo_id: deepseek-ai/DeepSeek-R1-0528
    replicas: 1
    backend: vllm
    backend_parameters:
      - --trust-remote-code
      - --max-model-len=32768
  - quantizations: ["BF16"]
    source: huggingface
    huggingface_repo_id: unsloth/DeepSeek-R1-0528-BF16
    replicas: 1
    backend: vllm
    backend_parameters:
      - --trust-remote-code
      - --max-model-len=32768
- name: Deepseek R1
  description: DeepSeek's first-generation reasoning model that delivers superior performance in math, code, and reasoning tasks. It effectively overcomes reasoning challenges and achieves performance comparable to OpenAI-o1 across various benchmarks. This includes six dense models distilled from DeepSeek-R1 based on Llama and Qwen.