Compare commits

...

29 Commits

Author SHA1 Message Date
gitlawr 899bea6697 ci: pin action-gh-release version
7 months ago
thxCode ef03d971e3 ci(npu): adjust processing
7 months ago
gitlawr 3891135a95 fix: disallow abbrev parsing backend parameters
7 months ago
thxCode 79eef89ca3 refactor(npu): tidy up mindie and vllm
7 months ago
thxCode 95f77e4921 refactor: mindie turbo support
7 months ago
gitlawr bdebba4215 chore: update llama-box to v0.0.154
7 months ago
gitlawr 78d71882fa chore: update llama-box to v0.0.153
7 months ago
linyinli a63683c768 fix: distributed inference for vLLM Ascend
7 months ago
gitlawr 5688ffb457 fix: exception on log file not ready
7 months ago
linyinli ace9f8451f fix: remove incompatible Qwen3 config for vLLM Ascend
7 months ago
gitlawr f459045cc3 feat: add r1 0528 to catalog
7 months ago
thxCode 234f5049be chore(tool): bump version
7 months ago
gitlawr c356d546a0 ci: drop cuda11.8
7 months ago
linyinli 5341d63230 feat: support vllm ascend
7 months ago
Terada Kousuke 8ee9099856 Add Japanese README
7 months ago
Yuxing Deng 588063fbfe feat: add support for built binary to run multiprocessing
7 months ago
peiyuan.zhang a3af7dda24 remove Installation Script
7 months ago
peiyuan.zhang 8a41aaa6a9 support iluvatar
7 months ago
thxCode 12abfbe858 refactor(llama-box): enable max projected cache by default
7 months ago
thxCode 940c2cdfc7 chore(tool): bump version
7 months ago
thxCode 6296336bcb refactor(scheduler): params processing in gguf-parser
7 months ago
thxCode dd879a988c chore(tool): bump version
7 months ago
thxCode 13c75b5bdd refactor(catalog): enable visual max image size in pixtral
7 months ago
thxCode c25236e7a0 chore(tool): bump version
7 months ago
thxCode 4ed6e1a223 refactor(llama-box): get downloaded platform name
7 months ago
thxCode 42919d734a ci: docker build cache
7 months ago
gitlawr 77b21c09cc ci: update pr trigger branches
7 months ago
gitlawr 5a063e1c91 chore: update vox-box
7 months ago
gitlawr 91695f48f3 ci: use tag ui on release
7 months ago

@ -5,3 +5,4 @@ dist/
.mypy_cache/
**/third_party/bin
*.ma

@ -3,7 +3,9 @@ name: CI
on:
workflow_dispatch:
push:
branches: [main]
branches:
- main
- "v*-dev"
tags: ["*.*.*"]
paths-ignore:
- "mkdocs.yml"
@ -58,7 +60,7 @@ jobs:
retention-days: 5
- name: Release GitHub Assets
uses: softprops/action-gh-release@v2
uses: softprops/action-gh-release@v2.2.2
if: startsWith(github.ref, 'refs/tags/') && matrix.python-version == '3.11' && matrix.os == 'linux'
with:
# Draft for official releases to prepare and review release notes before publishing

@ -3,7 +3,9 @@ name: Docker CI
on:
workflow_dispatch:
push:
branches: [main]
branches:
- main
- "v*-dev"
tags: ["*.*.*"]
paths-ignore:
- "mkdocs.yml"
@ -16,7 +18,9 @@ on:
- "Dockerfile.rocm.base"
- "Dockerfile.dcu.base"
pull_request:
branches: [main]
branches:
- main
- "v*-dev"
paths:
- "Dockerfile"
- "Dockerfile.*"
@ -50,13 +54,6 @@ jobs:
tag_suffix: "-cuda12.8"
build_args:
- "CUDA_VERSION=12.8.1"
- device: cuda
dockerfile: "Dockerfile"
platforms: "linux/amd64,linux/arm64"
tag_suffix: "-cuda11.8"
build_args:
- "CUDA_VERSION=11.8.0"
- "CUDA_TAG_SUFFIX=-cudnn8-runtime-ubuntu22.04"
#
# HIP RoCM
#
@ -105,6 +102,14 @@ jobs:
platforms: "linux/amd64,linux/arm64"
tag_suffix: "-cpu"
build_args: []
#
# Iluvatar Corex
#
- device: corex
dockerfile: "Dockerfile.corex"
platforms: "linux/amd64"
tag_suffix: "-corex"
build_args: []
steps:
- name: Checkout
@ -155,7 +160,7 @@ jobs:
done
echo "EOF" >> $GITHUB_OUTPUT
- name: Package
uses: docker/build-push-action@v5
uses: docker/build-push-action@v6
id: package
with:
push: ${{ github.event_name != 'pull_request' }}
@ -171,4 +176,4 @@ jobs:
cache-from: |
type=registry,ref=gpustack/build-cache:gpustack${{ matrix.tag_suffix }}
cache-to: |
${{ github.event_name != 'pull_request' && format('type=registry,mode=max,oci-mediatypes=false,compression=gzip,ref=gpustack/build-cache:gpustack{0},ignore-error=true', matrix.tag_suffix) || '' }}
${{ github.event_name != 'pull_request' && format('type=registry,mode=max,compression=gzip,ref=gpustack/build-cache:gpustack{0},ignore-error=true', matrix.tag_suffix) || '' }}

@ -2,7 +2,9 @@ name: PR Checking
on:
pull_request:
branches: [main]
branches:
- main
- "v*-dev"
paths-ignore:
- "mkdocs.yml"
- "docs/**"

1
.gitignore vendored

@ -155,6 +155,7 @@ __pycache__/
# GPUStack related
*/third_party/bin
*/ui/
*.ma
# macOS
.DS_Store

@ -26,10 +26,6 @@ ARG VLLM_VERSION=0.8.5.post1
RUN <<EOF
if [ "$TARGETPLATFORM" = "linux/amd64" ]; then
# Install vllm dependencies for x86_64
if [ "$(echo "${CUDA_VERSION}" | cut -d. -f1,2)" = "11.8" ]; then
pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu118-cp38-abi3-manylinux1_x86_64.whl \
--extra-index-url https://download.pytorch.org/whl/cu118;
fi;
WHEEL_PACKAGE="$(ls /workspace/gpustack/dist/*.whl)[all]";
else
WHEEL_PACKAGE="$(ls /workspace/gpustack/dist/*.whl)[audio]";

@ -0,0 +1,28 @@
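#
# Example build command (a sketch following the pattern of the other Dockerfiles in this repository;
# the image tag below is illustrative, not an official one):
#   docker build --tag=gpustack/gpustack:corex --file=Dockerfile.corex --progress=plain .
#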
FROM crpi-92uj7jb20gffz04j.cn-guangzhou.personal.cr.aliyuncs.com/iluvatar_common/vllm0.8.3-4.2.0:v1 AS build
RUN apt-get update && apt-get install -y \
git \
curl
COPY . /workspace/gpustack
RUN cd /workspace/gpustack && make build
FROM crpi-92uj7jb20gffz04j.cn-guangzhou.personal.cr.aliyuncs.com/iluvatar_common/vllm0.8.3-4.2.0:v1 AS runtime
RUN apt-get update && apt-get install -y \
python3 \
python3-pip \
wget \
tzdata \
iproute2 \
tini \
&& apt-get clean && rm -rf /var/lib/apt/lists/*
COPY --from=build /workspace/gpustack/dist/*.whl /dist/
RUN pip install /dist/*.whl && \
pip cache purge && \
rm -rf /dist
RUN gpustack download-tools
ENTRYPOINT [ "tini", "--", "gpustack", "start" ]

@ -1,7 +1,7 @@
ARG UBUNTU_VERSION=22.04
ARG MUSA_VERSION=rc3.1.1
ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-devel-ubuntu${UBUNTU_VERSION}
ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-runtime-ubuntu${UBUNTU_VERSION}
ARG MUSA_VERSION=rc4.0.1
ARG BASE_MUSA_DEV_CONTAINER=mthreads/musa:${MUSA_VERSION}-mudnn-devel-ubuntu${UBUNTU_VERSION}
ARG BASE_MUSA_RUN_CONTAINER=mthreads/musa:${MUSA_VERSION}-mudnn-runtime-ubuntu${UBUNTU_VERSION}
FROM ${BASE_MUSA_DEV_CONTAINER} AS build

@ -1,12 +1,18 @@
# Packaging logic:
# 1. base target:
# - Install tools, including Python, GCC, CMake, Make, SCCache and dependencies.
# - Install specific version Ascend CANN according to the chip, including Toolkit and Kernels.
# 2. mindie-install target:
# - Install specific version Ascend CANN NNAL.
# - Copy and install the ATB models from a fixed image.
# - Install required dependencies.
# - Install/Upgrade tools, including Python, GCC[optional], CMake, Make, SCCache and dependencies.
# - Install specific version Ascend CANN according to the chip, including Toolkit, Kernels and NNAL.
# 2.1. mindie-install target:
# - Copy ATB models from a fixed image.
# - Install dependencies for MindIE into system site packages, including Torch, Torch-NPU and TorchVision,
# which is used to support multi-versions of MindIE.
# - Create a virtual environment to place MindIE: $(pipx environment --value PIPX_LOCAL_VENVS)/mindie.
# - Install specific version MindIE.
# 2.2. vllm-install target (parallel against mindie-install):
# - Create a virtual environment to place vLLM: $(pipx environment --value PIPX_LOCAL_VENVS)/vllm.
# - Install specific version Torch, Torch-NPU and TorchVision.
# - Install specific version MindIE Turbo.
# - Install specific version vLLM and vLLM Ascend.
# 3. gpustack target (final):
# - Install GPUStack, and override the required dependencies after installation.
# - Set up the environment for CANN, NNAL and ATB models.
@ -20,12 +26,19 @@
# - MINDIE_VERSION is the version of Ascend MindIE,
# which is used to install the Ascend MindIE,
# please check https://www.hiascend.com/developer/download/community/result?module=ie%2Bpt%2Bcann for details.
# - VLLM_VERSION is the version of vLLM,
# which is used to install the vLLM,
# - VLLM_ASCEND_VERSION is the version of vLLM Ascend,
# which is used to install the vLLM Ascend,
# please check https://vllm-ascend.readthedocs.io/en/stable/installation.html for details.
# - PYTHON_VERSION is the version of Python,
# which should be properly set, it must be 3.x.
ARG CANN_VERSION=8.1.rc1.beta1
ARG CANN_CHIP=910b
ARG MINDIE_VERSION=2.0.rc1
ARG VLLM_VERSION=0.7.3
ARG VLLM_ASCEND_VERSION=0.7.3.post1
ARG PYTHON_VERSION=3.11
#
@ -35,26 +48,26 @@ ARG PYTHON_VERSION=3.11
# docker build --tag=gpustack/gpustack:npu-base --file=Dockerfile.npu --target base --progress=plain .
#
FROM ubuntu:20.04 AS base
FROM ubuntu:22.04 AS base
SHELL ["/bin/bash", "-eo", "pipefail", "-c"]
ARG TARGETPLATFORM
ARG TARGETOS
ARG TARGETARCH
## Install tools
## Install Tools
ARG PYTHON_VERSION
ENV DEBIAN_FRONTEND=noninteractive \
PYTHON_VERSION=${PYTHON_VERSION}
ENV DEBIAN_FRONTEND=noninteractive
RUN <<EOF
# Tools
# Refresh
apt-get update -y && apt-get install -y --no-install-recommends \
software-properties-common apt-transport-https \
ca-certificates gnupg2 lsb-release gnupg-agent \
&& apt-get update -y \
&& add-apt-repository -y ppa:ubuntu-toolchain-r/test \
&& add-apt-repository -y ppa:deadsnakes/ppa \
&& apt-get update -y
# Install
@ -68,22 +81,13 @@ RUN <<EOF
procps sysstat htop \
tini vim jq bc tree
# Update python
PYTHON="python${PYTHON_VERSION}"
apt-get install -y --no-install-recommends \
${PYTHON} ${PYTHON}-dev ${PYTHON}-distutils ${PYTHON}-venv ${PYTHON}-lib2to3
if [ -f /etc/alternatives/python ]; then update-alternatives --remove-all python; fi; update-alternatives --install /usr/bin/python python /usr/bin/${PYTHON} 10
if [ -f /etc/alternatives/python3 ]; then update-alternatives --remove-all python3; fi; update-alternatives --install /usr/bin/python3 python3 /usr/bin/${PYTHON} 10
curl -sS https://bootstrap.pypa.io/get-pip.py | ${PYTHON}
# Update locale
localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8
# Cleanup
rm -rf /var/tmp/* \
&& rm -rf /tmp/* \
&& rm -rf /var/cache/apt \
&& pip cache purge
&& rm -rf /var/cache/apt
EOF
ENV LANG='en_US.UTF-8' \
@ -95,27 +99,31 @@ ENV LANG='en_US.UTF-8' \
RUN <<EOF
# GCC
# Install
apt-get install -y --no-install-recommends \
gcc-11 g++-11 gfortran-11 gfortran
# Update alternatives
if [ -f /etc/alternatives/gcov-dump ]; then update-alternatives --remove-all gcov-dump; fi; update-alternatives --install /usr/bin/gcov-dump gcov-dump /usr/bin/gcov-dump-11 10
if [ -f /etc/alternatives/lto-dump ]; then update-alternatives --remove-all lto-dump; fi; update-alternatives --install /usr/bin/lto-dump lto-dump /usr/bin/lto-dump-11 10
if [ -f /etc/alternatives/gcov ]; then update-alternatives --remove-all gcov; fi; update-alternatives --install /usr/bin/gcov gcov /usr/bin/gcov-11 10
if [ -f /etc/alternatives/gcc ]; then update-alternatives --remove-all gcc; fi; update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 10
if [ -f /etc/alternatives/gcc-nm ]; then update-alternatives --remove-all gcc-nm; fi; update-alternatives --install /usr/bin/gcc-nm gcc-nm /usr/bin/gcc-nm-11 10
if [ -f /etc/alternatives/cpp ]; then update-alternatives --remove-all cpp; fi; update-alternatives --install /usr/bin/cpp cpp /usr/bin/cpp-11 10
if [ -f /etc/alternatives/g++ ]; then update-alternatives --remove-all g++; fi; update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 10
if [ -f /etc/alternatives/gcc-ar ]; then update-alternatives --remove-all gcc-ar; fi; update-alternatives --install /usr/bin/gcc-ar gcc-ar /usr/bin/gcc-ar-11 10
if [ -f /etc/alternatives/gcov-tool ]; then update-alternatives --remove-all gcov-tool; fi; update-alternatives --install /usr/bin/gcov-tool gcov-tool /usr/bin/gcov-tool-11 10
if [ -f /etc/alternatives/gcc-ranlib ]; then update-alternatives --remove-all gcc-ranlib; fi; update-alternatives --install /usr/bin/gcc-ranlib gcc-ranlib /usr/bin/gcc-ranlib-11 10
if [ -f /etc/alternatives/gfortran ]; then update-alternatives --remove-all gfortran; fi; update-alternatives --install /usr/bin/gfortran gfortran /usr/bin/gfortran-11 10
# Cleanup
rm -rf /var/tmp/* \
&& rm -rf /tmp/* \
&& rm -rf /var/cache/apt
# NB(thxCode): Upgrade GCC if the Ubuntu version is lower than 21.04.
source /etc/os-release
if (( $(echo "${VERSION_ID} < 21.04" | bc -l) )); then
# Install
apt-get install -y --no-install-recommends \
gcc-11 g++-11 gfortran-11 gfortran
# Update alternatives
if [ -f /etc/alternatives/gcov-dump ]; then update-alternatives --remove-all gcov-dump; fi; update-alternatives --install /usr/bin/gcov-dump gcov-dump /usr/bin/gcov-dump-11 10
if [ -f /etc/alternatives/lto-dump ]; then update-alternatives --remove-all lto-dump; fi; update-alternatives --install /usr/bin/lto-dump lto-dump /usr/bin/lto-dump-11 10
if [ -f /etc/alternatives/gcov ]; then update-alternatives --remove-all gcov; fi; update-alternatives --install /usr/bin/gcov gcov /usr/bin/gcov-11 10
if [ -f /etc/alternatives/gcc ]; then update-alternatives --remove-all gcc; fi; update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-11 10
if [ -f /etc/alternatives/gcc-nm ]; then update-alternatives --remove-all gcc-nm; fi; update-alternatives --install /usr/bin/gcc-nm gcc-nm /usr/bin/gcc-nm-11 10
if [ -f /etc/alternatives/cpp ]; then update-alternatives --remove-all cpp; fi; update-alternatives --install /usr/bin/cpp cpp /usr/bin/cpp-11 10
if [ -f /etc/alternatives/g++ ]; then update-alternatives --remove-all g++; fi; update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-11 10
if [ -f /etc/alternatives/gcc-ar ]; then update-alternatives --remove-all gcc-ar; fi; update-alternatives --install /usr/bin/gcc-ar gcc-ar /usr/bin/gcc-ar-11 10
if [ -f /etc/alternatives/gcov-tool ]; then update-alternatives --remove-all gcov-tool; fi; update-alternatives --install /usr/bin/gcov-tool gcov-tool /usr/bin/gcov-tool-11 10
if [ -f /etc/alternatives/gcc-ranlib ]; then update-alternatives --remove-all gcc-ranlib; fi; update-alternatives --install /usr/bin/gcc-ranlib gcc-ranlib /usr/bin/gcc-ranlib-11 10
if [ -f /etc/alternatives/gfortran ]; then update-alternatives --remove-all gfortran; fi; update-alternatives --install /usr/bin/gfortran gfortran /usr/bin/gfortran-11 10
# Cleanup
rm -rf /var/tmp/* \
&& rm -rf /tmp/* \
&& rm -rf /var/cache/apt
fi
EOF
## Install CMake/Make/SCCache
@ -126,8 +134,8 @@ RUN <<EOF
# Install
apt-get install -y --no-install-recommends \
pkg-config make
curl -sL "https://github.com/Kitware/CMake/releases/download/v3.22.1/cmake-3.22.1-linux-$(uname -m).tar.gz" | tar -zx -C /usr --strip-components 1
curl -sL "https://github.com/mozilla/sccache/releases/download/v0.10.0/sccache-v0.10.0-$(uname -m)-unknown-linux-musl.tar.gz" | tar -zx -C /usr/bin --strip-components 1
curl --retry 3 --retry-connrefused -fL "https://github.com/Kitware/CMake/releases/download/v3.22.1/cmake-3.22.1-linux-$(uname -m).tar.gz" | tar -zx -C /usr --strip-components 1
curl --retry 3 --retry-connrefused -fL "https://github.com/mozilla/sccache/releases/download/v0.10.0/sccache-v0.10.0-$(uname -m)-unknown-linux-musl.tar.gz" | tar -zx -C /usr/bin --strip-components 1
# Cleanup
rm -rf /var/tmp/* \
@ -135,16 +143,20 @@ RUN <<EOF
&& rm -rf /var/cache/apt
EOF
## Install Dependencies
## Install Compile Dependencies
RUN <<EOF
# Dependencies
# Install
apt-get install -y --no-install-recommends \
zlib1g zlib1g-dev libbz2-dev liblzma-dev libffi-dev openssl libssl-dev libsqlite3-dev \
libblas-dev liblapack-dev libopenblas-dev libblas3 liblapack3 gfortran libhdf5-dev \
libxml2 libxslt1-dev libgl1-mesa-glx libgmpxx4ldbl
zlib1g zlib1g-dev libbz2-dev libffi-dev libgdbm-dev libgdbm-compat-dev \
openssl libssl-dev libsqlite3-dev lcov libomp-dev \
libblas-dev liblapack-dev libopenblas-dev libblas3 liblapack3 libhdf5-dev \
libxml2 libxslt1-dev libgl1-mesa-glx libgmpxx4ldbl \
libncurses5-dev libreadline6-dev libsqlite3-dev libssl-dev \
liblzma-dev lzma lzma-dev tk-dev uuid-dev libmpdec-dev \
libnuma-dev
# Cleanup
rm -rf /var/tmp/* \
@ -152,6 +164,62 @@ RUN <<EOF
&& rm -rf /var/cache/apt
EOF
## Install Python
ARG PYTHON_VERSION
ENV PYTHON_VERSION=${PYTHON_VERSION}
RUN <<EOF
# Python
# Download
PYTHON_INSTALL_DIR="/tmp/Python-${PYTHON_VERSION}"
mkdir -p ${PYTHON_INSTALL_DIR}
PYTHON_LATEST_VERSION=$(curl -s https://repo.huaweicloud.com/python/ | grep -oE "${PYTHON_VERSION}\.[0-9]+" | sort -V | tail -n 1)
curl -H 'Referer: https://repo.huaweicloud.com/' --retry 3 --retry-connrefused -fL "https://repo.huaweicloud.com/python/${PYTHON_LATEST_VERSION}/Python-${PYTHON_LATEST_VERSION}.tgz" | tar -zx -C ${PYTHON_INSTALL_DIR} --strip-components 1
# Build
pushd ${PYTHON_INSTALL_DIR}
./configure \
--prefix=/usr \
--enable-optimizations \
--enable-shared \
--enable-ipv6 \
--enable-loadable-sqlite-extensions \
--with-lto=full \
--with-ensurepip=install \
--with-computed-gotos
make -j$(nproc) && make altinstall
popd
# Link
ln -vsf /usr/bin/python${PYTHON_VERSION} /usr/bin/python3
ln -vsf /usr/bin/python${PYTHON_VERSION} /usr/bin/python
ln -vsf /usr/bin/pip${PYTHON_VERSION} /usr/bin/pip3
ln -vsf /usr/bin/pip${PYTHON_VERSION} /usr/bin/pip
ln -vsf /usr/bin/2to3-${PYTHON_VERSION} /usr/bin/2to3
ln -vsf /usr/bin/pydoc${PYTHON_VERSION} /usr/bin/pydoc3
ln -vsf /usr/bin/idle${PYTHON_VERSION} /usr/bin/idle3
# Install packages
cat <<EOT >/tmp/requirements.txt
setuptools==80.7.1
pipx==1.7.1
EOT
pip install --disable-pip-version-check --no-cache-dir --root-user-action ignore -r /tmp/requirements.txt
# Cleanup
rm -rf /var/tmp/* \
&& rm -rf /tmp/* \
&& rm -rf /var/cache/apt \
&& pip cache purge
EOF
## Preset this to simplify configuration,
## it is the output of $(pipx environment --value PIPX_LOCAL_VENVS).
ENV PIPX_LOCAL_VENVS=/root/.local/share/pipx/venvs
ARG CANN_VERSION
ARG CANN_CHIP
@ -171,9 +239,21 @@ RUN <<EOF
URL_SUFFIX="response-content-type=application/octet-stream"
# Install dependencies
python3 -m pip install --no-cache-dir --root-user-action ignore --upgrade pip
pip install --no-cache-dir --root-user-action ignore \
attrs cython numpy==1.26.4 decorator sympy cffi pyyaml pathlib2 psutil protobuf scipy requests absl-py
cat <<EOT >/tmp/requirements.txt
attrs==24.3.0
numpy==1.26.4
decorator==5.2.1
sympy==1.14.0
cffi==1.17.1
PyYAML==6.0.2
pathlib2==2.3.7.post1
psutil==7.0.0
protobuf==6.31.0
scipy==1.15.3
requests==2.32.3
absl-py==2.2.2
EOT
pip install --disable-pip-version-check --no-cache-dir --root-user-action ignore -r /tmp/requirements.txt
# Install toolkit
TOOLKIT_FILE="Ascend-cann-toolkit_${DOWNLOAD_VERSION}_${OS}-${ARCH}.run"
@ -184,7 +264,9 @@ RUN <<EOF
printf "Y\n" | "${TOOLKIT_PATH}" --install --install-for-all --install-path="${CANN_HOME}"
# Cleanup
rm -f "${TOOLKIT_PATH}" \
rm -rf /var/tmp/* \
&& rm -rf /tmp/* \
&& rm -rf /var/cache/apt \
&& rm -rf /var/log/ascend \
&& rm -rf /var/log/ascend_seclog \
&& pip cache purge
@ -217,21 +299,14 @@ RUN <<EOF
printf "Y\n" |"${KERNELS_PATH}" --install --install-for-all --install-path="${CANN_HOME}"
# Cleanup
rm -f "${KERNELS_PATH}" \
rm -rf /var/tmp/* \
&& rm -rf /tmp/* \
&& rm -rf /var/cache/apt \
&& rm -rf /var/log/ascend \
&& rm -rf /var/log/ascend_seclog \
&& pip cache purge
EOF
#
# Stage MindIE Install
#
# Example build command:
# docker build --tag=gpustack/gpustack:npu-mindie-install --file=Dockerfile.npu --target mindie-install --progress=plain .
#
FROM base AS mindie-install
## Install NNAL
RUN <<EOF
@ -255,25 +330,22 @@ RUN <<EOF
printf "Y\n" | "${NNAL_PATH}" --install --install-path="${CANN_HOME}"
# Cleanup
rm -f "${NNAL_PATH}" \
rm -rf /var/tmp/* \
&& rm -rf /tmp/* \
&& rm -rf /var/cache/apt \
&& rm -rf /var/log/ascend_seclog \
&& rm -rf /var/log/cann_atb_log \
&& pip cache purge
EOF
COPY --from=thxcode/mindie:2.0.T17-800I-A2-py311-openeuler24.03-lts --chown=root:root ${CANN_HOME}/atb-models ${CANN_HOME}/atb-models
RUN <<EOF
# ATB Models
# Install
pip install --no-cache-dir --root-user-action ignore ${CANN_HOME}/atb-models/*.whl
#
# Stage MindIE Install
#
# Example build command:
# docker build --tag=gpustack/gpustack:npu-mindie-install --file=Dockerfile.npu --target mindie-install --progress=plain .
#
# Cleanup
rm -f "${NNAL_PATH}" \
&& rm -rf /var/log/ascend_seclog \
&& rm -rf /var/log/cann_atb_log \
&& pip cache purge
EOF
FROM base AS mindie-install
## Install MindIE
@ -281,6 +353,7 @@ ARG MINDIE_VERSION
ENV MINDIE_VERSION=${MINDIE_VERSION}
COPY --from=thxcode/mindie:2.0.T17-800I-A2-py311-openeuler24.03-lts --chown=root:root ${CANN_HOME}/atb-models ${CANN_HOME}/atb-models
RUN <<EOF
# MindIE
@ -290,19 +363,17 @@ RUN <<EOF
URL_PREFIX="https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/MindIE/MindIE%20${DOWNLOAD_VERSION}"
URL_SUFFIX="response-content-type=application/octet-stream"
# Prepare environment
source ${CANN_HOME}/ascend-toolkit/set_env.sh
source ${CANN_HOME}/nnal/atb/set_env.sh
# Install dependencies,
# Install Torch, Torch-npu, TorchVision,
# which, according to the Ascend Extension installation guide, must match the CANN_VERSION,
# please check https://www.hiascend.com/document/detail/zh/Pytorch/700/configandinstg/instg/insg_0004.html for details.
# please check https://www.hiascend.com/developer/download/community/result?module=ie%2Bpt%2Bcann for details.
if [ ${ARCH} == "x86_64" ]; then
pip install --no-cache-dir --root-user-action ignore torch==2.1.0+cpu --index-url https://download.pytorch.org/whl/cpu
pip install --disable-pip-version-check --no-cache-dir --root-user-action ignore torch==2.1.0+cpu --index-url https://download.pytorch.org/whl/cpu
else
pip install --no-cache-dir --root-user-action ignore torch==2.1.0
pip install --disable-pip-version-check --no-cache-dir --root-user-action ignore torch==2.1.0
fi
pip install --no-cache-dir --root-user-action ignore torch-npu==2.1.0.post12 torchvision==0.16.0
pip install --disable-pip-version-check --no-cache-dir --root-user-action ignore torch-npu==2.1.0.post12 torchvision==0.16.0
# Install dependencies.
cat <<EOT >/tmp/requirements.txt
av==14.3.0
absl-py==2.2.2
@ -339,7 +410,7 @@ python-rapidjson==1.20
requests==2.32.3
sacrebleu==2.4.3
tornado==6.4.2
transformers==4.46.3
transformers==4.52.3
tiktoken==0.7.0
typing_extensions==4.13.2
tzdata==2024.2
@ -349,7 +420,18 @@ urllib3==2.4.0
zope.event==5.0
zope.interface==7.0.3
EOT
pip install --no-cache-dir --root-user-action ignore -r /tmp/requirements.txt
pip install --disable-pip-version-check --no-cache-dir --root-user-action ignore -r /tmp/requirements.txt
# Install MindIE ATB models
pip install --disable-pip-version-check --no-cache-dir --root-user-action ignore ${CANN_HOME}/atb-models/*.whl
# Pre process
# - Create virtual environment to place MindIE
python -m venv --system-site-packages ${PIPX_LOCAL_VENVS}/mindie
# - Prepare environment
source ${CANN_HOME}/ascend-toolkit/set_env.sh
source ${CANN_HOME}/nnal/atb/set_env.sh
source ${PIPX_LOCAL_VENVS}/mindie/bin/activate
# Install MindIE
MINDIE_FILE="Ascend-mindie_${DOWNLOAD_VERSION}_${OS}-${ARCH}.run"
@ -360,17 +442,122 @@ EOT
printf "Y\n" | "${MINDIE_PATH}" --install --install-path="${CANN_HOME}"
# Post process
chmod +w "${CANN_HOME}/mindie/latest/mindie-service/conf"
# - Make MindIE service configuration writable
chmod +w "${CANN_HOME}/mindie/${DOWNLOAD_VERSION}/mindie-service/conf"
# - Tell GPUStack how to launch MindIE
cat <<EOT >>"${CANN_HOME}/mindie/${DOWNLOAD_VERSION}/mindie-service/set_env.sh"
# NB(thxCode): This is a workaround for GPUStack to activate MindIE.
source ${PIPX_LOCAL_VENVS}/mindie/bin/activate || true
EOT
chmod -w "${CANN_HOME}/mindie/${DOWNLOAD_VERSION}/mindie-service/set_env.sh"
deactivate
# Review
pip freeze \
&& python -m site
pipx runpip mindie freeze
# Cleanup
rm -f "${MINDIE_PATH}" \
rm -rf /var/tmp/* \
&& rm -rf /tmp/* \
&& rm -rf /var/cache/apt \
&& rm -rf /var/log/ascend_seclog \
&& rm -rf /var/log/cann_atb_log \
&& rm -rf /var/log/mindie_log \
&& rm -rf ~/log \
&& pip cache purge
EOF
#
# Stage vLLM Install
#
# Example build command:
# docker build --tag=gpustack/gpustack:npu-vllm-install --file=Dockerfile.npu --target vllm-install --progress=plain .
#
FROM base AS vllm-install
## Install vLLM Ascend
ARG VLLM_VERSION
ARG VLLM_ASCEND_VERSION
ARG MINDIE_VERSION
ENV VLLM_VERSION=${VLLM_VERSION} \
VLLM_ASCEND_VERSION=${VLLM_ASCEND_VERSION} \
MINDIE_VERSION=${MINDIE_VERSION}
RUN <<EOF
# vLLM
OS="$(uname -s | tr '[:upper:]' '[:lower:]')"
ARCH="$(uname -m)"
DOWNLOAD_VERSION="$(echo ${MINDIE_VERSION%\.beta1} | tr '[:lower:]' '[:upper:]')"
URL_PREFIX="https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/MindIE/MindIE%20${DOWNLOAD_VERSION}"
URL_SUFFIX="response-content-type=application/octet-stream"
# Pre process
# - Create virtual environment to place vLLM
python -m venv --system-site-packages ${PIPX_LOCAL_VENVS}/vllm
# - Prepare environment
source ${CANN_HOME}/ascend-toolkit/set_env.sh
source ${CANN_HOME}/nnal/atb/set_env.sh
source ${PIPX_LOCAL_VENVS}/vllm/bin/activate
# Install Torch, Torch-npu, TorchVision,
# which, according to the Ascend Extension installation guide, must match the CANN_VERSION,
# please check https://www.hiascend.com/developer/download/community/result?module=ie%2Bpt%2Bcann for details.
if [ ${ARCH} == "x86_64" ]; then
pip install --disable-pip-version-check --no-cache-dir --root-user-action ignore torch==2.5.1+cpu --index-url https://download.pytorch.org/whl/cpu
else
pip install --disable-pip-version-check --no-cache-dir --root-user-action ignore torch==2.5.1
fi
pip install --disable-pip-version-check --no-cache-dir --root-user-action ignore torch-npu==2.5.1 torchvision==0.20.1
# Install dependencies.
cat <<EOT >/tmp/requirements.txt
ml-dtypes==0.5.0
tornado==6.4.2
gevent==24.2.1
geventhttpclient==2.3.1
sacrebleu==2.4.3
pandas==2.2.3
rouge_score==0.1.2
pybind11==2.13.6
pytest==8.4.0
cloudpickle==3.0.0
ray[client]==2.43.0
EOT
pip install --disable-pip-version-check --no-cache-dir --root-user-action ignore -r /tmp/requirements.txt
# Install vLLM & vLLM-Ascend
cat <<EOT >/tmp/requirements.txt
vllm==${VLLM_VERSION}
vllm-ascend==${VLLM_ASCEND_VERSION}
EOT
if [ ${ARCH} == "x86_64" ]; then
pip install --disable-pip-version-check --no-cache-dir --root-user-action ignore -r /tmp/requirements.txt --extra-index-url https://download.pytorch.org/whl/cpu
else
pip install --disable-pip-version-check --no-cache-dir --root-user-action ignore -r /tmp/requirements.txt
fi
# Install MindIE Turbo
MINDIE_TURBO_FILE="Ascend-mindie-turbo_${DOWNLOAD_VERSION}_py${PYTHON_VERSION//./}_${OS}_${ARCH}.tar.gz"
MINDIE_TURBO_URL="${URL_PREFIX}/${MINDIE_TURBO_FILE}?${URL_SUFFIX}"
curl -H 'Referer: https://www.hiascend.com/' --retry 3 --retry-connrefused -fL "${MINDIE_TURBO_URL}" | tar -zx -C /tmp --strip-components 1
WHEEL_PACKAGE="$(ls /tmp/Ascend-mindie-turbo_${DOWNLOAD_VERSION}_py${PYTHON_VERSION//./}_${OS}_${ARCH}/*.whl)"
pip install --disable-pip-version-check --no-cache-dir --root-user-action ignore ${WHEEL_PACKAGE}
# Post process
deactivate
# Review
pipx runpip vllm freeze
# Cleanup
rm -rf /var/tmp/* \
&& rm -rf /tmp/* \
&& rm -rf /var/cache/apt \
&& rm -rf ~/log \
&& pip cache purge
EOF
@ -383,39 +570,61 @@ EOF
FROM mindie-install AS gpustack
## Copy vLLM from vllm-install stage
COPY --from=vllm-install ${PIPX_LOCAL_VENVS}/vllm ${PIPX_LOCAL_VENVS}/vllm
## Install GPUStack
RUN --mount=type=bind,target=/workspace/gpustack,rw <<EOF
# Build
# GPUStack
# Build GPUStack
export PATH="${HOME}/.local/bin:${PATH}"
cd /workspace/gpustack \
&& git config --global --add safe.directory /workspace/gpustack \
&& make build
# Install,
# Pre process
# - Create virtual environment to place gpustack
python -m venv --system-site-packages ${PIPX_LOCAL_VENVS}/gpustack
# - Prepare environment
source ${PIPX_LOCAL_VENVS}/gpustack/bin/activate
# Install GPUStack,
# vox-box relies on PyTorch 2.7, which is not compatible with MindIE.
WHEEL_PACKAGE="$(ls /workspace/gpustack/dist/*.whl)"
pip install --no-cache-dir --root-user-action ignore $WHEEL_PACKAGE
pip install --disable-pip-version-check --no-cache-dir --root-user-action ignore ${WHEEL_PACKAGE} \
&& ln -vsf ${PIPX_LOCAL_VENVS}/gpustack/bin/gpustack /usr/local/bin/gpustack
# Download tools
gpustack download-tools --device npu
# Post-process,
# override the required dependencies after installation.
cat <<EOT >/tmp/requirements.txt
pipx==1.7.1
EOT
pip install --no-cache-dir --root-user-action ignore -r /tmp/requirements.txt
# Activate MindIE
# MindIE bundles many components and conflicts with vLLM,
# so we need to activate MindIE manually in GPUStack.
# Activate vLLM
ln -vsf ${PIPX_LOCAL_VENVS}/vllm/bin/vllm ${PIPX_LOCAL_VENVS}/gpustack/bin/vllm
# - Redirect RAY.
rm -rf ${PIPX_LOCAL_VENVS}/gpustack/bin/ray \
&& ln -vsf ${PIPX_LOCAL_VENVS}/vllm/bin/ray ${PIPX_LOCAL_VENVS}/gpustack/bin/ray
# Set up environment
mkdir -p /var/lib/gpustack \
&& chmod -R 0755 /var/lib/gpustack
# Post process
deactivate
# Review
pip freeze \
&& python -m site
pipx runpip gpustack freeze
# Cleanup
rm -rf /workspace/gpustack/dist \
rm -rf /var/tmp/* \
&& rm -rf /tmp/* \
&& rm -rf /var/cache/apt \
&& rm -rf /workspace/gpustack/dist \
&& pip cache purge
EOF
@ -442,13 +651,15 @@ RUN <<EOF
echo "${SOURCE_ATB_MODEL_ENV}" >> /etc/profile
echo "${SOURCE_ATB_MODEL_ENV}" >> ~/.bashrc
# Export Driver Tools
# Export Driver tools
EXPORT_DRIVER_TOOLS="export PATH=${CANN_HOME}/driver/tools:\${PATH}"
echo "${EXPORT_DRIVER_TOOLS}" >> /etc/profile
echo "${EXPORT_DRIVER_TOOLS}" >> ~/.bashrc
# NB(thxCode): To support specific MindIE versions,
# we need to set up the environment during GPUStack deployment.
# NB(thxCode): Any tuning environment variables should NOT be set here.
EOF
ENTRYPOINT [ "tini", "--", "/usr/bin/bash", "-c", "source /etc/profile && exec gpustack start \"$@\"", "--" ]

@ -21,7 +21,8 @@
<p align="center">
<a href="./README.md">English</a> |
<a href="./README_CN.md">简体中文</a>
<a href="./README_CN.md">简体中文</a> |
<a href="./README_JP.md">日本語</a>
</p>
<br>
@ -34,7 +35,7 @@ GPUStack is an open-source GPU cluster manager for running AI models.
- **Broad GPU Compatibility:** Seamlessly supports GPUs from various vendors across Apple Macs, Windows PCs, and Linux servers.
- **Extensive Model Support:** Supports a wide range of models including LLMs, VLMs, image models, audio models, embedding models, and rerank models.
- **Flexible Inference Backends:** Integrates with llama-box (llama.cpp & stable-diffusion.cpp), vox-box, vLLM, and Ascend MindIE.
- **Flexible Inference Backends:** Flexibly integrates with multiple inference backends including llama-box (llama.cpp & stable-diffusion.cpp), vox-box, vLLM and Ascend MindIE.
- **Multi-Version Backend Support:** Run multiple versions of inference backends concurrently to meet the diverse runtime requirements of different models.
- **Distributed Inference:** Supports single-node and multi-node multi-GPU inference, including heterogeneous GPUs across vendors and runtime environments.
- **Scalable GPU Architecture:** Easily scale up by adding more GPUs or nodes to your infrastructure.
@ -154,6 +155,7 @@ curl http://your_gpustack_server_url/v1-openai/chat/completions \
- [x] Ascend CANN
- [x] Hygon DTK
- [x] Moore Threads MUSA
- [x] Iluvatar Corex
We plan to support the following accelerators in future releases.

@ -21,7 +21,8 @@
<p align="center">
<a href="./README.md">English</a> |
<a href="./README_CN.md">简体中文</a>
<a href="./README_CN.md">简体中文</a> |
<a href="./README_JP.md">日本語</a>
</p>
<br>
@ -34,7 +35,7 @@ GPUStack 是一个用于运行 AI 模型的开源 GPU 集群管理器。
- **广泛的 GPU 兼容性**:无缝支持 Apple Mac、Windows PC 和 Linux 服务器上各种供应商的 GPU。
- **广泛的模型支持**:支持各种模型,包括 LLM、多模态 VLM、图像模型、语音模型、文本嵌入模型和重排序模型。
- **灵活的推理后端**:与 llama-box(llama.cpp 和 stable-diffusion.cpp)、vox-box、vLLM 和 Ascend MindIE 集成。
- **灵活的推理后端**:支持与 llama-box(llama.cpp 和 stable-diffusion.cpp)、vox-box、vLLM 和 Ascend MindIE 等多种推理后端的灵活集成。
- **多版本后端支持**:同时运行推理后端的多个版本,以满足不同模型的不同运行依赖。
- **分布式推理**:支持单机和多机多卡并行推理,包括跨供应商和运行环境的异构 GPU。
- **可扩展的 GPU 架构**:通过向基础设施添加更多 GPU 或节点轻松进行扩展。
@ -155,6 +156,7 @@ curl http://your_gpustack_server_url/v1-openai/chat/completions \
- [x] 昇腾 CANN
- [x] 海光 DTK
- [x] 摩尔线程 MUSA
- [x] 天数智芯 Corex
我们计划在未来的版本中支持以下加速框架:

@ -0,0 +1,251 @@
<br>
<p align="center">
<img alt="GPUStack" src="https://raw.githubusercontent.com/gpustack/gpustack/main/docs/assets/gpustack-logo.png" width="300px"/>
</p>
<br>
<p align="center">
<a href="https://docs.gpustack.ai" target="_blank">
<img alt="Documentation" src="https://img.shields.io/badge/ドキュメント-GPUStack-blue?logo=readthedocs&logoColor=white"></a>
<a href="./LICENSE" target="_blank">
<img alt="License" src="https://img.shields.io/github/license/gpustack/gpustack?logo=github&logoColor=white&label=License&color=blue"></a>
<a href="./docs/assets/wechat-assistant.png" target="_blank">
<img alt="WeChat" src="https://img.shields.io/badge/微信群-GPUStack-blue?logo=wechat&logoColor=white"></a>
<a href="https://discord.gg/VXYJzuaqwD" target="_blank">
<img alt="Discord" src="https://img.shields.io/badge/Discord-GPUStack-blue?logo=discord&logoColor=white"></a>
<a href="https://twitter.com/intent/follow?screen_name=gpustack_ai" target="_blank">
<img alt="Follow on X(Twitter)" src="https://img.shields.io/twitter/follow/gpustack_ai?logo=X"></a>
</p>
<br>
<p align="center">
<a href="./README.md">English</a> |
<a href="./README_CN.md">简体中文</a> |
<a href="./README_JP.md">日本語</a>
</p>
<br>
![demo](https://raw.githubusercontent.com/gpustack/gpustack/main/docs/assets/gpustack-demo.gif)
GPUStack は、AI モデルを実行するためのオープンソース GPU クラスタマネージャーです。
### 主な機能
- **幅広い GPU 互換性:** Apple Mac、Windows PC、Linux サーバー上のさまざまなベンダーの GPU をシームレスにサポート。
- **豊富なモデルサポート:** LLM、VLM、画像モデル、音声モデル、埋め込みモデル、リランクモデルを含む幅広いモデルをサポート。
- **柔軟な推論バックエンド:** llama-box(llama.cpp と stable-diffusion.cpp)、vox-box、vLLM、Ascend MindIE と統合。
- **マルチバージョンバックエンドサポート:** 異なるモデルの多様なランタイム要件を満たすために、推論バックエンドの複数バージョンを同時実行。
- **分散推論:** ベンダーやランタイム環境をまたぐ異種 GPU を含む、シングルノードおよびマルチノードのマルチ GPU 推論をサポート。
- **スケーラブルな GPU アーキテクチャ:** インフラストラクチャに GPU やノードを追加することで簡単にスケールアップ。
- **堅牢なモデル安定性:** 自動障害回復、マルチインスタンス冗長性、推論リクエストのロードバランシングで高可用性を確保。
- **インテリジェントなデプロイ評価:** モデルリソース要件、バックエンドとアーキテクチャの互換性、OS の互換性、その他のデプロイ関連要因を自動的に評価。
- **自動スケジューリング:** 利用可能なリソースに基づいてモデルを動的に割り当て。
- **軽量な Python パッケージ:** 最小限の依存関係と低い運用オーバーヘッド。
- **OpenAI 互換 API:** OpenAI の API 仕様と完全に互換性があり、シームレスな統合を実現。
- **ユーザーと API キー管理:** ユーザーと API キーの管理を簡素化。
- **リアルタイム GPU 監視:** GPU 性能と使用率をリアルタイムで追跡。
- **トークンとレートメトリクス:** トークン使用量と API リクエストレートを監視。
## インストール
### Linux または macOS
GPUStack は、systemd または launchd ベースのシステムでサービスとしてインストールするスクリプトを提供しており、デフォルトポートは 80 です。この方法で GPUStack をインストールするには、以下を実行します:
```bash
curl -sfL https://get.gpustack.ai | sh -s -
```
### Windows
管理者として PowerShell を実行し(PowerShell ISE の使用は**避けてください**)、以下のコマンドを実行して GPUStack をインストールします:
```powershell
Invoke-Expression (Invoke-WebRequest -Uri "https://get.gpustack.ai" -UseBasicParsing).Content
```
### その他のインストール方法
手動インストール、Docker インストール、または詳細な構成オプションについては、[インストールドキュメント](https://docs.gpustack.ai/latest/installation/installation-script/)を参照してください。
## はじめに
1. **llama3.2**モデルを実行してチャットする:
```bash
gpustack chat llama3.2 "tell me a joke."
```
2. **stable-diffusion-v3-5-large-turbo**モデルで画像を生成する:
> ### 💡 ヒント
>
> このコマンドは Hugging Face からモデル(約 12GB)をダウンロードします。ダウンロード時間はネットワーク速度に依存します。モデルを実行するために十分なディスクスペースと VRAM(12GB)があることを確認してください。問題が発生した場合は、このステップをスキップして次に進むことができます。
```bash
gpustack draw hf.co/gpustack/stable-diffusion-v3-5-large-turbo-GGUF:stable-diffusion-v3-5-large-turbo-Q4_0.gguf \
"A minion holding a sign that says 'GPUStack'. The background is filled with futuristic elements like neon lights, circuit boards, and holographic displays. The minion is wearing a tech-themed outfit, possibly with LED lights or digital patterns. The sign itself has a sleek, modern design with glowing edges. The overall atmosphere is high-tech and vibrant, with a mix of dark and neon colors." \
--sample-steps 5 --show
```
コマンドが完了すると、生成された画像がデフォルトビューアに表示されます。プロンプトと CLI オプションを実験して出力をカスタマイズできます。
![Generated Image](https://raw.githubusercontent.com/gpustack/gpustack/main/docs/assets/quickstart-minion.png)
3. ブラウザで`http://your_host_ip`を開いて GPUStack UI にアクセスします。ユーザー名`admin`とデフォルトパスワードで GPUStack にログインします。デフォルト設定のパスワードを取得するには、以下のコマンドを実行します:
**Linux または macOS**
```bash
cat /var/lib/gpustack/initial_admin_password
```
**Windows**
```powershell
Get-Content -Path "$env:APPDATA\gpustack\initial_admin_password" -Raw
```
4. ナビゲーションメニューで`Playground - Chat`をクリックします。これで UI プレイグラウンドで LLM とチャットできます。
![Playground Screenshot](https://raw.githubusercontent.com/gpustack/gpustack/main/docs/assets/playground-screenshot.png)
5. ナビゲーションメニューで`API Keys`をクリックし、`New API Key`ボタンをクリックします。
6. `Name`を入力し、`Save`ボタンをクリックします。
7. 生成された API キーをコピーして安全な場所に保存します。作成時にのみ一度だけ表示されることに注意してください。
8. これで API キーを使用して OpenAI 互換 API にアクセスできます。例えば、curl を使用する場合:
```bash
export GPUSTACK_API_KEY=your_api_key
curl http://your_gpustack_server_url/v1-openai/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer $GPUSTACK_API_KEY" \
-d '{
"model": "llama3.2",
"messages": [
{
"role": "system",
"content": "You are a helpful assistant."
},
{
"role": "user",
"content": "Hello!"
}
],
"stream": true
}'
```
## サポートされているプラットフォーム
- [x] macOS
- [x] Linux
- [x] Windows
## サポートされているアクセラレータ
- [x] NVIDIA CUDA([Compute Capability](https://developer.nvidia.com/cuda-gpus) 6.0 以上)
- [x] Apple Metal(M 系チップ)
- [x] AMD ROCm
- [x] Ascend CANN
- [x] Hygon DTK
- [x] Moore Threads MUSA
- [x] Iluvatar Corex
以下のアクセラレータは将来のリリースでサポートする予定です。
- [ ] Intel oneAPI
- [ ] Qualcomm AI Engine
## サポートされているモデル
GPUStack は[llama-box](https://github.com/gpustack/llama-box)(バンドルされた[llama.cpp](https://github.com/ggml-org/llama.cpp)と[stable-diffusion.cpp](https://github.com/leejet/stable-diffusion.cpp)サーバー)、[vLLM](https://github.com/vllm-project/vllm)、[Ascend MindIE](https://www.hiascend.com/en/software/mindie)、[vox-box](https://github.com/gpustack/vox-box)をバックエンドとして使用し、幅広いモデルをサポートしています。以下のソースからのモデルがサポートされています:
1. [Hugging Face](https://huggingface.co/)
2. [ModelScope](https://modelscope.cn/)
3. ローカルファイルパス
### モデル例:
| **カテゴリ** | **モデル** |
| ----------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| **大規模言語モデルLLM** | [Qwen](https://huggingface.co/models?search=Qwen/Qwen), [LLaMA](https://huggingface.co/meta-llama), [Mistral](https://huggingface.co/mistralai), [DeepSeek](https://huggingface.co/models?search=deepseek-ai/deepseek), [Phi](https://huggingface.co/models?search=microsoft/phi), [Gemma](https://huggingface.co/models?search=Google/gemma) |
| **ビジョン言語モデルVLM** | [Llama3.2-Vision](https://huggingface.co/models?pipeline_tag=image-text-to-text&search=llama3.2), [Pixtral](https://huggingface.co/models?search=pixtral) , [Qwen2.5-VL](https://huggingface.co/models?search=Qwen/Qwen2.5-VL), [LLaVA](https://huggingface.co/models?search=llava), [InternVL2.5](https://huggingface.co/models?search=internvl2_5) |
| **拡散モデル** | [Stable Diffusion](https://huggingface.co/models?search=gpustack/stable-diffusion), [FLUX](https://huggingface.co/models?search=gpustack/flux) |
| **埋め込みモデル** | [BGE](https://huggingface.co/gpustack/bge-m3-GGUF), [BCE](https://huggingface.co/gpustack/bce-embedding-base_v1-GGUF), [Jina](https://huggingface.co/models?search=gpustack/jina-embeddings) |
| **リランカーモデル** | [BGE](https://huggingface.co/gpustack/bge-reranker-v2-m3-GGUF), [BCE](https://huggingface.co/gpustack/bce-reranker-base_v1-GGUF), [Jina](https://huggingface.co/models?search=gpustack/jina-reranker) |
| **音声モデル** | [Whisper](https://huggingface.co/models?search=Systran/faster)(音声認識)、[CosyVoice](https://huggingface.co/models?search=FunAudioLLM/CosyVoice)(音声合成) |
サポートされているモデルの完全なリストについては、[推論バックエンド](https://docs.gpustack.ai/latest/user-guide/inference-backends/)ドキュメントのサポートされているモデルセクションを参照してください。
## OpenAI 互換 API
GPUStack は`/v1-openai`パスの下で以下の OpenAI 互換 API を提供します:
- [x] [List Models](https://platform.openai.com/docs/api-reference/models/list)
- [x] [Create Completion](https://platform.openai.com/docs/api-reference/completions/create)
- [x] [Create Chat Completion](https://platform.openai.com/docs/api-reference/chat/create)
- [x] [Create Embeddings](https://platform.openai.com/docs/api-reference/embeddings/create)
- [x] [Create Image](https://platform.openai.com/docs/api-reference/images/create)
- [x] [Create Image Edit](https://platform.openai.com/docs/api-reference/images/createEdit)
- [x] [Create Speech](https://platform.openai.com/docs/api-reference/audio/createSpeech)
- [x] [Create Transcription](https://platform.openai.com/docs/api-reference/audio/createTranscription)
例えば、公式の[OpenAI Python API ライブラリ](https://github.com/openai/openai-python)を使用して API を利用できます:
```python
from openai import OpenAI
client = OpenAI(base_url="http://your_gpustack_server_url/v1-openai", api_key="your_api_key")
completion = client.chat.completions.create(
model="llama3.2",
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Hello!"}
]
)
print(completion.choices[0].message)
```
GPUStack ユーザーは UI で独自の API キーを生成できます。
## ドキュメント
完全なドキュメントについては、[公式ドキュメントサイト](https://docs.gpustack.ai)を参照してください。
## ビルド
1. Python(バージョン 3.10 から 3.12)をインストールします。
2. `make build`を実行します。
ビルドされた wheel パッケージは`dist`ディレクトリにあります。
## コントリビューション
GPUStack への貢献に興味がある場合は、[コントリビューションガイド](./docs/contributing.md)をお読みください。
## コミュニティに参加
問題がある場合や提案がある場合は、サポートのために[コミュニティ](https://discord.gg/VXYJzuaqwD)に参加してください。
## ライセンス
Copyright (c) 2024 The GPUStack authors
Apache License, Version 2.0(以下「ライセンス」)に基づいてライセンスされています。
このライセンスの詳細については、[LICENSE](./LICENSE)ファイルを参照してください。
適用法で要求されるか、書面で合意されない限り、
ライセンスに基づいて配布されるソフトウェアは「現状のまま」で配布され、
明示または黙示を問わず、いかなる種類の保証や条件もありません。
ライセンスに基づく許可と制限を規定する特定の言語については、
ライセンスを参照してください。

@ -4,7 +4,7 @@
### Hybrid Cluster Support
It supports a mix of Linux, Windows, and macOS nodes, as well as x86_64 and arm64 architectures. Additionally, It also supports various GPUs, including NVIDIA, Apple Metal, AMD, Ascend, Hygon and Moore Threads.
GPUStack supports a mix of Linux, Windows, and macOS nodes, as well as x86_64 and arm64 architectures. It also supports various GPUs, including NVIDIA, Apple Metal, AMD, Ascend, Hygon and Moore Threads.
### Distributed Inference Support

@ -36,6 +36,7 @@ npu-smi info
- [x] llama-box (Only supports FP16 precision)
- [x] MindIE
- [x] vLLM (Only supports Ascend 910B series)
### Prerequisites

@ -0,0 +1,47 @@
# Air-Gapped Installation
You can install GPUStack in an air-gapped environment. An air-gapped environment refers to a setup where GPUStack will be installed offline.
The following methods are available for installing GPUStack in an air-gapped environment:
| OS | Arch | Supported methods |
| ----- | ----- | ------------------------------------------- |
| Linux | AMD64 | [Docker Installation](#docker-installation) |
## Supported backends
- [x] vLLM
## Docker Installation
### Prerequisites
- [Driver for MR-V100, MR-V50, BI-V100, BI-V150](https://support.iluvatar.com/#/ProductLine?id=2)
Check if the driver is installed:
```bash
ixsmi
```
- [Docker](https://support.iluvatar.com/#/ProductLine?id=2)
- [Corex Container Toolkits](https://support.iluvatar.com/#/ProductLine?id=2)
### Run GPUStack
When running GPUStack with Docker, it works out of the box in an air-gapped environment as long as the Docker images are available. To do this, follow these steps:
1. Pull the GPUStack Docker image in an online environment:
```bash
docker pull gpustack/gpustack:latest-corex
```
If your online environment differs from the air-gapped environment in terms of OS or arch, specify the OS and arch of the air-gapped environment when pulling the image:
```bash
docker pull --platform linux/amd64 gpustack/gpustack:latest-corex
```
2. Publish the Docker image to a private registry, or load it directly in the air-gapped environment.
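A minimal sketch of transferring the image without a registry (assuming the tag pulled above; the archive filename is illustrative):

```bash
# In the online environment: save the pulled image to a tarball
docker save -o gpustack-corex.tar gpustack/gpustack:latest-corex

# Copy the tarball to the air-gapped host, then load it there
docker load -i gpustack-corex.tar
```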
3. Refer to the [Docker Installation](./online-installation.md#docker-installation) guide to run GPUStack using Docker.

@ -0,0 +1,113 @@
# Online Installation
## Supported Devices
- [x] Iluvatar GPUs (MR-V100, MR-V50, BI-V100, BI-V150)
## Supported Platforms
| OS | Arch | Supported methods |
| ----- | ----- | -------------------------------------------------------------------------------------------------------- |
| Linux | AMD64 | [Docker Installation](#docker-installation) (Recommended)<br>[Installation Script](#installation-script) |
## Supported backends
- [x] vLLM
## Prerequisites
- [Driver for MR-V100, MR-V50, BI-V100, BI-V150](https://support.iluvatar.com/#/ProductLine?id=2)
Check if the driver is installed:
```bash
ixsmi
```
## Docker Installation
- [Docker](https://support.iluvatar.com/#/ProductLine?id=2)
- [Corex Container Toolkits](https://support.iluvatar.com/#/ProductLine?id=2)
### Run GPUStack
Run the following command to start the GPUStack server **and built-in worker** (host network mode is recommended):
```bash
docker run -d --name gpustack \
-v /lib/modules:/lib/modules \
-v /dev:/dev \
--privileged \
--cap-add=ALL \
--pid=host \
--restart=unless-stopped \
--network=host \
--ipc=host \
-v gpustack-data:/var/lib/gpustack \
gpustack/gpustack:latest-corex
```
If you need to change the default server port 80, please use the `--port` parameter:
```bash
docker run -d --name gpustack \
-v /lib/modules:/lib/modules \
-v /dev:/dev \
--privileged \
--cap-add=ALL \
--pid=host \
--restart=unless-stopped \
--network=host \
--ipc=host \
-v gpustack-data:/var/lib/gpustack \
gpustack/gpustack:latest-corex \
--port 9090
```
If other ports are in conflict, or if you want to customize startup options, refer to the [CLI Reference](../../cli-reference/start.md) for available flags and configuration instructions.
Check if the startup logs are normal:
```bash
docker logs -f gpustack
```
If the logs are normal, open `http://your_host_ip` in the browser to access the GPUStack UI. Log in to GPUStack with username `admin` and the default password. You can run the following command to get the password for the default setup:
```bash
docker exec -it gpustack cat /var/lib/gpustack/initial_admin_password
```
### (Optional) Add Worker
You can add more GPU nodes to GPUStack to form a GPU cluster. To do so, add workers on the other GPU nodes, specifying the `--server-url` and `--token` parameters so that they join GPUStack.
To get the token used for adding workers, run the following command on the GPUStack **server node**:
```bash
docker exec -it gpustack cat /var/lib/gpustack/token
```
To start GPUStack as a worker, and **register it with the GPUStack server**, run the following command on the **worker node**. Be sure to replace the URL and token with your specific values:
```bash
docker run -d --name gpustack \
-v /lib/modules:/lib/modules \
-v /dev:/dev \
--privileged \
--cap-add=ALL \
--pid=host \
--restart=unless-stopped \
--network=host \
--ipc=host \
-v gpustack-data:/var/lib/gpustack \
gpustack/gpustack:latest-corex \
--server-url http://your_gpustack_url --token your_gpustack_token
```
!!! note
1. **Heterogeneous clusters are supported.** No matter what type of device a node has, you can add it to the current GPUStack as a worker by specifying the `--server-url` and `--token` parameters.
2. You can set additional flags for the `gpustack start` command by appending them to the docker run command.
For configuration details, please refer to the [CLI Reference](../../cli-reference/start.md).

@ -55,6 +55,7 @@ GPUStack supports the following accelerators:
- [x] Ascend CANN
- [x] Hygon DTK
- [x] Moore Threads MUSA
- [x] Iluvatar Corex
Ensure all necessary drivers and libraries are installed on the system prior to installing GPUStack.
@ -94,6 +95,13 @@ To use Moore Threads MUSA as an accelerator, ensure the following components are
- [MUSA SDK](https://developer.mthreads.com/sdk/download/musa)
- [MT Container Toolkits](https://developer.mthreads.com/sdk/download/CloudNative) (Optional, required for docker installation)
### Iluvatar Corex
To use Iluvatar Corex as an accelerator, ensure the following components are installed:
- [Corex driver](https://support.iluvatar.com/#/ProductLine?id=2)
- [Corex Toolkits](https://support.iluvatar.com/#/ProductLine?id=2)
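After installing the driver, you can verify that it is detected by running the same check used in the Iluvatar installation guides:

```bash
ixsmi
```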
## Networking Requirements
### Network Architecture

@ -8,7 +8,7 @@ GPUStack is an open-source GPU cluster manager for running AI models.
- **Broad GPU Compatibility:** Seamlessly supports GPUs from various vendors across Apple Macs, Windows PCs, and Linux servers.
- **Extensive Model Support:** Supports a wide range of models including LLMs, VLMs, image models, audio models, embedding models, and rerank models.
- **Flexible Inference Backends:** Integrates with llama-box (llama.cpp & stable-diffusion.cpp), vox-box, vLLM, and Ascend MindIE.
- **Flexible Inference Backends:** Flexibly integrates with multiple inference backends including llama-box (llama.cpp & stable-diffusion.cpp), vox-box, vLLM and Ascend MindIE.
- **Multi-Version Backend Support:** Run multiple versions of inference backends concurrently to meet the diverse runtime requirements of different models.
- **Distributed Inference:** Supports single-node and multi-node multi-GPU inference, including heterogeneous GPUs across vendors and runtime environments.
- **Scalable GPU Architecture:** Easily scale up by adding more GPUs or nodes to your infrastructure.
@ -35,6 +35,7 @@ GPUStack is an open-source GPU cluster manager for running AI models.
- [x] Ascend CANN
- [x] Hygon DTK
- [x] Moore Threads MUSA
- [x] Iluvatar Corex
We plan to support the following accelerators in future releases.

@ -75,12 +75,12 @@ For more details, please refer to [vLLM documentation](https://docs.vllm.ai/en/s
### Supported Platforms
The vLLM backend works on AMD64 Linux.
The vLLM backend works on Linux.
!!! Note
1. When users install GPUStack on amd64 Linux using the installation script, vLLM is automatically installed.
2. When users deploy a model using the vLLM backend, GPUStack sets worker label selectors to `{"os": "linux", "arch": "amd64"}` by default to ensure the model instance is scheduled to proper workers. You can customize the worker label selectors in the model configuration.
2. When users deploy a model using the vLLM backend, GPUStack sets worker label selectors to `{"os": "linux"}` by default to ensure the model instance is scheduled to proper workers. You can customize the worker label selectors in the model configuration.
### Supported Models

@ -250,8 +250,6 @@
replicas: 1
backend: vllm
backend_parameters: &qwen3_thinking_vllm_parameters
- --enable-auto-tool-choice
- --tool-call-parser=hermes
- --enable-reasoning
- --reasoning-parser=deepseek_r1
- quantizations: ["FP8"]
@ -990,6 +988,102 @@
backend: vllm
backend_parameters:
- --trust-remote-code
- name: Deepseek R1 0528 Qwen3 8B
description: DeepSeek-R1-0528-Qwen3-8B is a post-trained model derived by distilling the chain-of-thought reasoning patterns from DeepSeek-R1-0528 into the Qwen3 8B Base model. As a result, it achieves state-of-the-art (SOTA) performance among open-source models on the AIME 2024 benchmark, outperforming the original Qwen3 8B by 10.0% and reaching the level of Qwen3-235B-thinking.
home: https://www.deepseek.com
icon: /static/catalog_icons/deepseek.png
categories:
- llm
capabilities:
- context/128K
sizes:
- 8
licenses:
- mit
release_date: "2025-05-28"
templates:
- quantizations:
- UD-IQ1_M
- UD-IQ1_S
- Q2_K_L
- Q3_K_M
- Q4_K_M
- Q5_K_M
- Q6_K
- Q8_0
- BF16
source: model_scope
model_scope_model_id: unsloth/DeepSeek-R1-0528-Qwen3-8B-GGUF
model_scope_file_path: "*-{quantization}*.gguf"
replicas: 1
backend: llama-box
backend_parameters:
# give R1 more default context to think
- --ctx-size=32768
# recommended temperature and top_p for R1
- --temp=0.6
- --top-p=0.95
cpu_offloading: true
distributed_inference_across_workers: true
- quantizations: ["BF16"]
source: model_scope
model_scope_model_id: deepseek-ai/DeepSeek-R1-0528-Qwen3-8B
replicas: 1
backend: vllm
backend_parameters:
- --trust-remote-code
- --max-model-len=32768
- name: Deepseek R1 0528
description: DeepSeek-R1-0528 is a minor version of the DeepSeek R1 model that features enhanced reasoning depth and inference capabilities. These improvements are achieved through increased computational resources and algorithmic optimizations applied during post-training. The model delivers strong performance across a range of benchmark evaluations, including mathematics, programming, and general logic, with overall capabilities approaching those of leading models such as O3 and Gemini 2.5 Pro.
home: https://www.deepseek.com
icon: /static/catalog_icons/deepseek.png
categories:
- llm
capabilities:
- context/128K
sizes:
- 671
licenses:
- mit
release_date: "2025-05-28"
templates:
- quantizations:
- UD-IQ1_M
- UD-IQ1_S
- UD-Q2_K_XL
- UD-Q3_K_XL
- Q4_K_M
- Q8_0
- BF16
source: model_scope
model_scope_model_id: unsloth/DeepSeek-R1-0528-GGUF
model_scope_file_path: "*-{quantization}*.gguf"
replicas: 1
backend: llama-box
backend_parameters:
# give R1 more default context to think
- --ctx-size=32768
# recommended temperature and top_p for R1
- --temp=0.6
- --top-p=0.95
cpu_offloading: true
distributed_inference_across_workers: true
- quantizations: ["FP8"]
source: model_scope
model_scope_model_id: deepseek-ai/DeepSeek-R1-0528
replicas: 1
backend: vllm
backend_parameters:
- --trust-remote-code
- --max-model-len=32768
- quantizations: ["BF16"]
source: model_scope
model_scope_model_id: unsloth/DeepSeek-R1-0528-BF16
replicas: 1
backend: vllm
backend_parameters:
- --trust-remote-code
- --max-model-len=32768
- name: Deepseek R1
description: DeepSeek's first-generation reasoning model that delivers superior performance in math, code, and reasoning tasks. It effectively overcomes reasoning challenges and achieves performance comparable to OpenAI-o1 across various benchmarks. This includes six dense models distilled from DeepSeek-R1 based on Llama and Qwen.
home: https://www.deepseek.com
@ -1009,7 +1103,6 @@
licenses:
- deepseek
release_date: "2025-01-20"
order: 2
templates:
- quantizations: ["FP8"]
sizes:
@ -1609,6 +1702,8 @@
model_scope_file_path: "*-{quantization}*.gguf"
replicas: 1
backend: llama-box
backend_parameters:
- --visual-max-image-size=1344
cpu_offloading: true
distributed_inference_across_workers: true
- quantizations: ["BF16"]

@ -262,8 +262,6 @@
replicas: 1
backend: vllm
backend_parameters: &qwen3_thinking_vllm_parameters
- --enable-auto-tool-choice
- --tool-call-parser=hermes
- --enable-reasoning
- --reasoning-parser=deepseek_r1
- quantizations: ["FP8"]
@ -957,6 +955,102 @@
backend: vllm
backend_parameters:
- --trust-remote-code
- name: Deepseek R1 0528 Qwen3 8B
description: DeepSeek-R1-0528-Qwen3-8B is a post-trained model derived by distilling the chain-of-thought reasoning patterns from DeepSeek-R1-0528 into the Qwen3 8B Base model. As a result, it achieves state-of-the-art (SOTA) performance among open-source models on the AIME 2024 benchmark, outperforming the original Qwen3 8B by 10.0% and reaching the level of Qwen3-235B-thinking.
home: https://www.deepseek.com
icon: /static/catalog_icons/deepseek.png
categories:
- llm
capabilities:
- context/128K
sizes:
- 8
licenses:
- mit
release_date: "2025-05-28"
templates:
- quantizations:
- UD-IQ1_M
- UD-IQ1_S
- Q2_K_L
- Q3_K_M
- Q4_K_M
- Q5_K_M
- Q6_K
- Q8_0
- BF16
source: huggingface
huggingface_repo_id: unsloth/DeepSeek-R1-0528-Qwen3-8B-GGUF
huggingface_filename: "*-{quantization}*.gguf"
replicas: 1
backend: llama-box
backend_parameters:
# give R1 more default context to think
- --ctx-size=32768
# recommended temperature and top_p for R1
- --temp=0.6
- --top-p=0.95
cpu_offloading: true
distributed_inference_across_workers: true
- quantizations: ["BF16"]
source: huggingface
huggingface_repo_id: deepseek-ai/DeepSeek-R1-0528-Qwen3-8B
replicas: 1
backend: vllm
backend_parameters:
- --trust-remote-code
- --max-model-len=32768
- name: Deepseek R1 0528
description: DeepSeek-R1-0528 is a minor version of the DeepSeek R1 model that features enhanced reasoning depth and inference capabilities. These improvements are achieved through increased computational resources and algorithmic optimizations applied during post-training. The model delivers strong performance across a range of benchmark evaluations, including mathematics, programming, and general logic, with overall capabilities approaching those of leading models such as O3 and Gemini 2.5 Pro.
home: https://www.deepseek.com
icon: /static/catalog_icons/deepseek.png
categories:
- llm
capabilities:
- context/128K
sizes:
- 671
licenses:
- mit
release_date: "2025-05-28"
templates:
- quantizations:
- UD-IQ1_M
- UD-IQ1_S
- UD-Q2_K_XL
- UD-Q3_K_XL
- Q4_K_M
- Q8_0
- BF16
source: huggingface
huggingface_repo_id: unsloth/DeepSeek-R1-0528-GGUF
huggingface_filename: "*-{quantization}*.gguf"
replicas: 1
backend: llama-box
backend_parameters:
# give R1 more default context to think
- --ctx-size=32768
# recommended temperature and top_p for R1
- --temp=0.6
- --top-p=0.95
cpu_offloading: true
distributed_inference_across_workers: true
- quantizations: ["FP8"]
source: huggingface
huggingface_repo_id: deepseek-ai/DeepSeek-R1-0528
replicas: 1
backend: vllm
backend_parameters:
- --trust-remote-code
- --max-model-len=32768
- quantizations: ["BF16"]
source: huggingface
huggingface_repo_id: unsloth/DeepSeek-R1-0528-BF16
replicas: 1
backend: vllm
backend_parameters:
- --trust-remote-code
- --max-model-len=32768
- name: Deepseek R1
description: DeepSeek's first-generation reasoning model that delivers superior performance in math, code, and reasoning tasks. It effectively overcomes reasoning challenges and achieves performance comparable to OpenAI-o1 across various benchmarks. This includes six dense models distilled from DeepSeek-R1 based on Llama and Qwen.
home: https://www.deepseek.com
@ -976,7 +1070,6 @@
licenses:
- deepseek
release_date: "2025-01-20"
order: 2
templates:
- quantizations:
- Q2_K_L
@ -1584,6 +1677,8 @@
huggingface_filename: "*-{quantization}*.gguf"
replicas: 1
backend: llama-box
backend_parameters:
- --visual-max-image-size=1344
cpu_offloading: true
distributed_inference_across_workers: true
- quantizations: ["BF16"]

@ -383,7 +383,7 @@ class Config(BaseSettings):
if vendor not in VendorEnum.__members__.values():
raise Exception(
"Unsupported GPU device vendor, supported vendors are: Apple, NVIDIA, 'Moore Threads', Huawei, AMD, Hygon"
"Unsupported GPU device vendor, supported vendors are: Apple, NVIDIA, 'Moore Threads', Huawei, AMD, Hygon, Iluvatar"
)
if not memory:
@ -391,7 +391,7 @@ class Config(BaseSettings):
if type not in DeviceTypeEnum.__members__.values():
raise Exception(
"Unsupported GPU type, supported type are: cuda, musa, npu, mps, rocm, dcu"
"Unsupported GPU type, supported type are: cuda, musa, npu, mps, rocm, dcu, corex"
)
memory_total = memory.get("total")

@ -10,6 +10,7 @@ from gpustack.detectors.fastfetch.fastfetch import Fastfetch
from gpustack.detectors.npu_smi.npu_smi import NPUSMI
from gpustack.detectors.rocm_smi.rocm_smi import RocmSMI
from gpustack.detectors.regredit.regredit import Regredit
from gpustack.detectors.ixsmi.ixsmi import IXSMI
from gpustack.utils import platform
@ -40,6 +41,7 @@ class DetectorFactory:
platform.DeviceTypeEnum.MUSA.value: [fastfetch],
platform.DeviceTypeEnum.ROCM.value: [RocmSMI(), Regredit()],
platform.DeviceTypeEnum.DCU.value: [RocmSMI()],
platform.DeviceTypeEnum.COREX.value: [IXSMI()],
}
def _validate_detectors(self):

@ -0,0 +1,111 @@
import csv
import subprocess
from gpustack.detectors.base import GPUDetector
from gpustack.schemas.workers import (
GPUCoreInfo,
GPUDeviceInfo,
GPUDevicesInfo,
MemoryInfo,
VendorEnum,
)
from gpustack.utils import platform
from gpustack.utils.command import is_command_available
from gpustack.utils.convert import safe_float, safe_int
class IXSMI(GPUDetector):
def is_available(self) -> bool:
return is_command_available("ixsmi")
def gather_gpu_info(self) -> GPUDevicesInfo:
command = self._command_gather_gpu()
results = self._run_command(command)
if results is None:
return []
return self.decode_gpu_devices(results)
def decode_gpu_devices(self, result) -> GPUDevicesInfo: # noqa: C901
"""
results example:
$ixsmi --format=csv,noheader --query-gpu=index,name,memory.total,memory.used,utilization.gpu,temperature.gpu
0, Iluvatar MR-V50, 16384 MiB, 116 MiB, 0 %, 30 C
1, Iluvatar MR-V100, 32768 MiB, 27996 MiB, 0 %, 36 C
"""
devices = []
reader = csv.reader(result.splitlines())
for row in reader:
if len(row) < 6:
continue
index, name, memory_total, memory_used, utilization_gpu, temperature_gpu = (
row
)
index = safe_int(index)
name = name.strip()
# Convert MiB to bytes
memory_total = safe_int(memory_total.split()[0]) * 1024 * 1024
# Convert MiB to bytes
memory_used = safe_int(memory_used.split()[0]) * 1024 * 1024
utilization_gpu = safe_float(
utilization_gpu.split()[0]
) # Remove the '%' sign
temperature_gpu = safe_float(temperature_gpu)
device = GPUDeviceInfo(
index=index,
name=name,
vendor=VendorEnum.Iluvatar.value,
memory=MemoryInfo(
is_unified_memory=False,
used=memory_used,
total=memory_total,
utilization_rate=(
(memory_used / memory_total) * 100 if memory_total > 0 else 0
),
),
core=GPUCoreInfo(
utilization_rate=utilization_gpu,
total=0, # Total cores information is not provided by ixsmi
),
temperature=temperature_gpu,
type=platform.DeviceTypeEnum.CUDA.value,
)
devices.append(device)
return devices
def _run_command(self, command):
result = None
try:
result = subprocess.run(
command, capture_output=True, text=True, encoding="utf-8"
)
if result is None or result.stdout is None:
return None
output = result.stdout
if "no devices" in output.lower():
return None
if result.returncode != 0:
raise Exception(f"Unexpected return code: {result.returncode}")
if output == "" or output is None:
raise Exception(f"Output is empty, return code: {result.returncode}")
return output
except Exception as e:
error_message = f"Failed to execute {command}: {e}"
if result:
error_message += f", stdout: {result.stdout}, stderr: {result.stderr}"
raise Exception(error_message)
def _command_gather_gpu(self):
executable_command = [
"ixsmi",
"--format=csv,noheader",
"--query-gpu=index,name,memory.total,memory.used,utilization.gpu,temperature.gpu",
]
return executable_command

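For reference, the detector above depends entirely on the CSV layout shown in its docstring. A minimal, self-contained sketch of that parsing step, using the two sample lines from the docstring (no gpustack imports; the MiB-to-bytes conversion mirrors decode_gpu_devices, everything else is illustrative):

import csv

SAMPLE = (
    "0, Iluvatar MR-V50, 16384 MiB, 116 MiB, 0 %, 30 C\n"
    "1, Iluvatar MR-V100, 32768 MiB, 27996 MiB, 0 %, 36 C"
)

for row in csv.reader(SAMPLE.splitlines()):
    index, name, mem_total, mem_used, util, temp = (col.strip() for col in row)
    # Convert MiB to bytes, as decode_gpu_devices does above.
    total_bytes = int(mem_total.split()[0]) * 1024 * 1024
    used_bytes = int(mem_used.split()[0]) * 1024 * 1024
    print(index, name, total_bytes, used_bytes, util, temp)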
@ -1,4 +1,5 @@
import argparse
from multiprocessing import freeze_support
from gpustack.cmd import setup_start_cmd
from gpustack.cmd.chat import setup_chat_cmd
@ -36,4 +37,8 @@ def main():
if __name__ == "__main__":
# When using multiprocessing with 'spawn' mode, freeze_support() must be called in the main module
# to ensure the main process environment is correctly initialized when child processes are spawned.
# See: https://docs.python.org/3/library/multiprocessing.html#the-spawn-and-forkserver-start-methods
freeze_support()
main()

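The freeze_support() call above only matters for the packaged binary: with the spawn start method, a frozen executable re-runs its own entrypoint to bootstrap each child process, and freeze_support() is what lets that bootstrap short-circuit. A minimal sketch of the same guard in isolation (worker() is just an illustrative placeholder):

from multiprocessing import Process, freeze_support, set_start_method


def worker():
    print("child started")


if __name__ == "__main__":
    # Must be called before any Process is created when the script is frozen,
    # otherwise spawned children re-execute the whole entrypoint.
    freeze_support()
    set_start_method("spawn", force=True)
    p = Process(target=worker)
    p.start()
    p.join()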
@ -105,10 +105,10 @@ async def get_serving_logs(
try:
async with client.get(model_instance_log_url, timeout=timeout) as resp:
if resp.status != 200:
raise HTTPException(
status_code=resp.status,
detail=f"Error fetching serving logs: {resp.reason}",
)
body = await resp.read()
yield body, resp.headers, resp.status
return
async for chunk in resp.content.iter_any():
yield chunk, resp.headers, resp.status
except Exception as e:

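The change above stops the route from raising on a non-200 upstream response and instead forwards the body, headers, and status untouched, so clients still see the backend's own error text. A condensed sketch of that proxy pattern with aiohttp (the URL and timeout here are placeholders, not the real route wiring):

import aiohttp


async def proxy_serving_logs(url: str):
    timeout = aiohttp.ClientTimeout(total=30)
    async with aiohttp.ClientSession() as client:
        async with client.get(url, timeout=timeout) as resp:
            if resp.status != 200:
                # Forward the upstream error as-is instead of raising.
                yield await resp.read(), resp.headers, resp.status
                return
            async for chunk in resp.content.iter_any():
                yield chunk, resp.headers, resp.status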
@ -276,6 +276,7 @@ def validate_gpu(
VendorEnum.NVIDIA.value,
VendorEnum.AMD.value,
VendorEnum.Hygon.value,
VendorEnum.Huawei.value,
]:
raise BadRequestException(
f"vLLM backend is not supported on {gpu_device.vendor} GPUs."

@ -1,8 +1,9 @@
from fastapi import APIRouter, Request
from fastapi import HTTPException
from fastapi.responses import StreamingResponse
from pathlib import Path
from tenacity import RetryError
from gpustack.api.exceptions import NotFoundException
from gpustack.worker.logs import LogOptionsDep
from gpustack.worker.logs import log_generator
from gpustack.utils import file
@ -17,7 +18,7 @@ async def get_serve_logs(request: Request, id: int, log_options: LogOptionsDep):
try:
file.check_file_with_retries(path)
except FileNotFoundError:
raise HTTPException(status_code=404, detail="Log file not found")
except (FileNotFoundError, RetryError):
raise NotFoundException(message="Log file not found")
return StreamingResponse(log_generator(path, log_options), media_type="text/plain")

@ -132,6 +132,8 @@ class GGUFParserCommandMutableParameters:
# NB(thxCode): Partial options are not applied to backend, but to the parser.
# We can receive these options from the backend advanced config.
backend_version: Optional[str] = None
# Estimate
flash_attention: Optional[bool] = None
main_gpu: Optional[int] = None
@ -148,6 +150,8 @@ class GGUFParserCommandMutableParameters:
split_mode: Optional[str] = None
ubatch_size: Optional[int] = None
visual_max_image_size: Optional[int] = None
max_projected_cache: Optional[int] = None
swa_full: bool = False
# Estimate/StableDiffusionCpp
image_autoencoder_tiling: bool = True
image_batch_count: Optional[int] = None
@ -166,7 +170,7 @@ class GGUFParserCommandMutableParameters:
skip_tls_verify: Optional[bool] = None
def from_args(self, args: List[str]):
parser = argparse.ArgumentParser(exit_on_error=False)
parser = argparse.ArgumentParser(exit_on_error=False, allow_abbrev=False)
# Estimate
parser.add_argument(
@ -257,6 +261,17 @@ class GGUFParserCommandMutableParameters:
type=int,
required=False,
)
parser.add_argument(
"--max-projected-cache",
"--visual-max-image-cache",
type=int,
required=False,
)
parser.add_argument(
"--swa-full",
action='store_true',
required=False,
)
# Estimate/StableDiffusionCpp
parser.add_argument(
"--image-autoencoder-tiling",
@ -357,18 +372,32 @@ class GGUFParserCommandMutableParameters:
try:
args_parsed = parser.parse_known_args(args=args)
for attr_name in [attr.name for attr in dataclasses.fields(self.__class__)]:
if attr_value := getattr(args_parsed[0], attr_name):
try:
setattr(self, attr_name, attr_value)
except ValueError as e:
slogger.warning(
f"Failed to receive mutable parameter {attr_name}: {e}"
)
try:
attr_value = getattr(args_parsed[0], attr_name)
if attr_value is not None:
try:
setattr(self, attr_name, attr_value)
except ValueError as e:
slogger.warning(
f"Failed to receive mutable parameter {attr_name}: {e}"
)
except AttributeError:
# If we reach here, the field is an internal property,
# which is not registered in the argument parser.
pass
except (argparse.ArgumentError, argparse.ArgumentTypeError) as e:
slogger.warning(f"Failed to parse mutable parameters: {e}")
def extend_command(self, command: List[str]):
internal_properties = [
"backend_version",
]
for attr_name in [attr.name for attr in dataclasses.fields(self.__class__)]:
if attr_name in internal_properties:
# Skip internal properties.
continue
attr_value = getattr(self, attr_name)
if attr_value is not None:
if isinstance(attr_value, bool):
@ -382,6 +411,13 @@ class GGUFParserCommandMutableParameters:
[f"--{attr_name.replace('_', '-')}", str(attr_value)]
)
if self.backend_version:
# Parser v0.18.0+ supports estimating Sliding Window Attention (SWA) usage,
# however, llama-box treats `--batch-size` as the same as `--ctx-size` within [v0.0.140, v0.0.148],
# so we need to set `--batch-size` to `--ctx-size` to avoid wrong RAM/VRAM estimation.
if "v0.0.139" < self.backend_version < "v0.0.149":
command.append(f"--batch-size={self.ctx_size}")
async def _gguf_parser_command( # noqa: C901
model: Model, offload: GPUOffloadEnum = GPUOffloadEnum.Full, **kwargs
@ -400,7 +436,7 @@ async def _gguf_parser_command( # noqa: C901
]
# Extend the command with mutable arguments.
params = GGUFParserCommandMutableParameters()
params = GGUFParserCommandMutableParameters(backend_version=model.backend_version)
params.from_args(model.backend_parameters)
params.extend_command(command)

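The allow_abbrev=False switch above is what "disallow abbrev parsing backend parameters" refers to: by default argparse silently expands any unambiguous prefix (e.g. --ctx) into a registered option (--ctx-size), which can swallow backend flags never meant for the parser. A small sketch of the difference, reusing --ctx-size from the parameters above:

import argparse

loose = argparse.ArgumentParser(exit_on_error=False)
loose.add_argument("--ctx-size", type=int)
# "--ctx" is accepted as an abbreviation: (Namespace(ctx_size=4096), [])
print(loose.parse_known_args(["--ctx", "4096"]))

strict = argparse.ArgumentParser(exit_on_error=False, allow_abbrev=False)
strict.add_argument("--ctx-size", type=int)
# "--ctx" stays in the unknown list: (Namespace(ctx_size=None), ['--ctx', '4096'])
print(strict.parse_known_args(["--ctx", "4096"]))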
@ -261,11 +261,6 @@ async def evaluate_environment(
"The Ascend MindIE backend requires Ascend NPUs but none are available."
]
if backend == BackendEnum.VLLM and only_ascend_npu(workers):
return False, [
"The vLLM backend is not supported on Ascend NPUs at the moment. Use the Ascend MindIE or llama-box backend instead."
]
return True, []

@ -58,6 +58,7 @@ class VendorEnum(str, Enum):
Huawei = "Huawei"
AMD = "AMD"
Hygon = "Hygon"
Iluvatar = "Iluvatar"
class MountPoint(BaseModel):

@ -32,7 +32,7 @@ def copy_owner_recursively(src, dst):
os.chown(os.path.join(dirpath, filename), st.st_uid, st.st_gid)
@retry(stop=stop_after_attempt(5), wait=wait_fixed(0.5))
@retry(stop=stop_after_attempt(10), wait=wait_fixed(1))
def check_file_with_retries(path: Path):
if not os.path.exists(path):
raise FileNotFoundError(f"Log file not found: {path}")

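The retry budget above (now 10 attempts, 1s apart) interacts with the route change further up that catches RetryError: once tenacity exhausts its attempts it raises RetryError with the last exception wrapped inside, rather than re-raising FileNotFoundError itself. A minimal sketch (shorter budget than the real decorator, purely for illustration):

from tenacity import RetryError, retry, stop_after_attempt, wait_fixed


@retry(stop=stop_after_attempt(3), wait=wait_fixed(0.1))
def check_file(path: str):
    raise FileNotFoundError(f"Log file not found: {path}")


try:
    check_file("/var/log/serve.log")
except RetryError as e:
    # The original FileNotFoundError is preserved on the last attempt.
    print(type(e.last_attempt.exception()).__name__)  # FileNotFoundError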
@ -85,6 +85,7 @@ class DeviceTypeEnum(str, Enum):
ROCM = "rocm"
MUSA = "musa"
DCU = "dcu"
COREX = "corex"
def device() -> str:
@ -96,6 +97,7 @@ def device() -> str:
- mps
- rocm
- dcu
- iluvatar
- etc.
"""
if (
@ -126,6 +128,9 @@ def device() -> str:
):
return DeviceTypeEnum.ROCM.value
if is_command_available("ixsmi"):
return "corex"
return ""
@ -137,6 +142,7 @@ def device_type_from_vendor(vendor: VendorEnum) -> str:
VendorEnum.AMD.value: DeviceTypeEnum.ROCM.value,
VendorEnum.Hygon.value: DeviceTypeEnum.DCU.value,
VendorEnum.MTHREADS.value: DeviceTypeEnum.MUSA.value,
VendorEnum.Iluvatar.value: DeviceTypeEnum.COREX.value,
}
return mapping.get(vendor, "")
@ -166,3 +172,43 @@ def get_cuda_version() -> str:
except Exception as e:
logger.error(f"Error running nvcc: {e}")
return ""
def get_cann_version() -> str:
"""
Returns the CANN version installed on the system.
"""
env_cann_version = os.getenv("CANN_VERSION", "")
if env_cann_version:
return env_cann_version
try:
# Borrowed from https://gitee.com/ascend/pytorch/blob/master/test/npu/test_cann_version.py.
import torch # noqa: F401
import torch_npu # noqa: F401
from torch_npu.utils.collect_env import (
get_cann_version as get_cann_version_from_env,
)
from torch_npu.npu.utils import get_cann_version
cann_version = get_cann_version_from_env()
if cann_version:
return cann_version.lower()
cann_version = get_cann_version()
if cann_version:
return cann_version.lower()
except ImportError:
pass
return ""
def get_cann_chip() -> str:
"""
Returns the CANN chip version installed on the system.
"""
# TODO(thxCode): figure out a way to discover the CANN chip version
return os.getenv("CANN_CHIP", "")

@ -54,7 +54,7 @@ class AscendMindIEParameters:
override_generation_config_parsed: Optional[any] = None # store JSON parsed result
def from_args(self, args: List[str]):
parser = argparse.ArgumentParser(exit_on_error=False)
parser = argparse.ArgumentParser(exit_on_error=False, allow_abbrev=False)
#
# Log config
#

@ -31,6 +31,7 @@ ACCELERATOR_VENDOR_TO_ENV_NAME = {
VendorEnum.Huawei: "ASCEND_RT_VISIBLE_DEVICES",
VendorEnum.AMD: "ROCR_VISIBLE_DEVICES",
VendorEnum.Hygon: "HIP_VISIBLE_DEVICES",
VendorEnum.Iluvatar: "CUDA_VISIBLE_DEVICES",
}

@ -83,6 +83,10 @@ class LlamaBoxServer(InferenceServer):
default_mmproj = get_mmproj_file(self._model_path)
if mmproj is None and default_mmproj:
arguments.extend(["--mmproj", default_mmproj])
# Enable `--max-projected-cache` to optimize the chatting experience.
# Since llama-box ignores unknown parameters, we can safely add this
# parameter without breaking previous versions.
arguments.extend(["--max-projected-cache", "10"])
if rpc_servers:
rpc_servers_argument = ",".join(rpc_servers)

@ -4,7 +4,7 @@ import os
import subprocess
import sys
import sysconfig
from typing import TYPE_CHECKING, Dict, List, Optional
from typing import Dict, List, Optional
from gpustack.schemas.models import ModelInstance, ModelInstanceStateEnum
from gpustack.utils.command import find_parameter, get_versioned_command
from gpustack.utils.hub import (
@ -96,14 +96,6 @@ class VLLMServer(InferenceServer):
return
device_str = "GPU"
if not TYPE_CHECKING:
from vllm.platforms import current_platform
device_str = current_platform.ray_device_key
if not device_str:
raise RuntimeError(
f"current platform {current_platform.device_name} does not support ray."
)
ray_placement_group_bundles: List[Dict[str, float]] = []
bundle_indexes = []

@ -17,13 +17,12 @@ from gpustack.schemas.models import BackendEnum
from gpustack.utils.command import get_versioned_command
from gpustack.utils.compat_importlib import pkg_resources
from gpustack.utils import platform, envs
from gpustack.config.config import get_global_config
logger = logging.getLogger(__name__)
BUILTIN_LLAMA_BOX_VERSION = "v0.0.144"
BUILTIN_GGUF_PARSER_VERSION = "v0.17.5"
BUILTIN_LLAMA_BOX_VERSION = "v0.0.154"
BUILTIN_GGUF_PARSER_VERSION = "v0.19.0"
BUILTIN_RAY_VERSION = "2.43.0"
@ -60,8 +59,6 @@ class ToolsManager:
self._os = system if system else platform.system()
self._arch = arch if arch else platform.arch()
self._device = device if device else platform.device()
if self._device == platform.DeviceTypeEnum.CUDA.value:
self._llama_box_cuda_version = self._get_llama_box_cuda_version()
self._download_base_url = tools_download_base_url
self._bin_dir = bin_dir
self._pipx_path = pipx_path
@ -373,6 +370,25 @@ class ToolsManager:
f"Failed to execute 'pipx environment --value PIPX_BIN_DIR': {e}"
)
def _get_pipx_local_venvs(self, pipx_path: str) -> Path:
"""
Use `pipx environment --value PIPX_LOCAL_VENVS` to get the directory where pipx installs local virtual environments.
"""
try:
result = subprocess.run(
[pipx_path, "environment", "--value", "PIPX_LOCAL_VENVS"],
capture_output=True,
text=True,
check=True,
)
pipx_local_venv_dir = result.stdout.strip()
if pipx_local_venv_dir:
return Path(pipx_local_venv_dir)
except subprocess.CalledProcessError as e:
raise Exception(
f"Failed to execute 'pipx environment --value PIPX_LOCAL_VENVS': {e}"
)
def _download_acsend_mindie(self, version: str, target_dir: Path):
# Check if the system is supported
if self._os != "linux" or self._arch not in ["amd64", "arm64"]:
@ -426,7 +442,7 @@ class ToolsManager:
shutil.rmtree(llama_box_tmp_dir)
os.makedirs(llama_box_tmp_dir, exist_ok=True)
platform_name = self._get_llama_box_platform_name()
platform_name = self._get_llama_box_platform_name(version)
tmp_file = llama_box_tmp_dir / f"llama-box-{version}-{platform_name}.zip"
url_path = f"gpustack/llama-box/releases/download/{version}/llama-box-{platform_name}.zip"
@ -450,117 +466,111 @@ class ToolsManager:
def _link_llama_box_rpc_server(self):
"""
Create a symlink for llama-box-rpc-server in the bin directory.
Create a directory-relative symlink for llama-box-rpc-server in the bin directory.
This is used to help differentiate between the llama-box and llama-box-rpc-server processes.
"""
target_dir = self.third_party_bin_path / "llama-box"
file_name = "llama-box.exe" if self._os == "windows" else "llama-box"
llama_box_file = target_dir / file_name
src_file_name = "llama-box"
dst_file_name = "llama-box-rpc-server"
if self._os == "windows":
target_rpc_server_file = target_dir / "llama-box-rpc-server.exe"
else:
target_rpc_server_file = target_dir / "llama-box-rpc-server"
src_file_name += ".exe"
dst_file_name += ".exe"
target_dir = self.third_party_bin_path / "llama-box"
src_file = target_dir / src_file_name
dst_file = target_dir / dst_file_name
if os.path.lexists(target_rpc_server_file):
os.remove(target_rpc_server_file)
if os.path.lexists(dst_file):
os.remove(dst_file)
if self._os == "windows":
os.link(llama_box_file, target_rpc_server_file)
os.link(src_file, dst_file)
else:
os.symlink(llama_box_file, target_rpc_server_file)
target_dir_fd = os.open(target_dir, os.O_RDONLY)
os.symlink(src_file_name, dst_file_name, dir_fd=target_dir_fd)
logger.debug(f"Linked llama-box-rpc-server to {target_rpc_server_file}")
logger.debug(f"Linked llama-box-rpc-server to {dst_file}")
def _get_llama_box_cuda_version(self) -> str:
def _get_llama_box_platform_name(self, version: str) -> str: # noqa C901
"""
Gets the appropriate CUDA version of the llama-box based on the system's CUDA version.
Get the platform name for llama-box based on the OS, architecture, and device type.
"""
default_version = "12.4"
cuda_version = platform.get_cuda_version()
match = re.match(r"(\d+)\.(\d+)", cuda_version)
if not match:
return default_version
major, minor = map(int, match.groups())
if major == 11:
return "11.8"
elif major == 12 and minor >= 8:
return "12.8"
return default_version
def _get_llama_box_platform_name(self) -> str: # noqa C901
platform_name = ""
if (
self._os == "darwin"
and self._arch == "arm64"
and self._device == platform.DeviceTypeEnum.MPS.value
):
platform_name = "darwin-arm64-metal"
elif self._os == "darwin":
platform_name = "darwin-amd64-avx2"
elif (
self._os in ["linux", "windows"]
and self._arch in ["amd64", "arm64"]
and self._device == platform.DeviceTypeEnum.CUDA.value
):
# Only amd64 for windows
normalized_arch = "amd64" if self._os == "windows" else self._arch
platform_name = (
f"{self._os}-{normalized_arch}-cuda-{self._llama_box_cuda_version}"
)
elif (
self._os == "linux"
and self._arch == "amd64"
and self._device == platform.DeviceTypeEnum.MUSA.value
):
platform_name = "linux-amd64-musa-rc3.1"
elif self._os == "linux" and self._device == platform.DeviceTypeEnum.NPU.value:
# Available version: 8.0.0(.beta1) [default] / 8.0.rc2(.beta1) / 8.0.rc3(.beta1)
version = "8.0"
if ".rc2" in os.getenv("CANN_VERSION", ""):
version = "8.0.rc2"
elif ".rc3" in os.getenv("CANN_VERSION", ""):
version = "8.0.rc3"
# Available variant: 910b [default] / 310p
variant = ""
if os.getenv("CANN_CHIP", "") == "310p":
variant = "-310p"
platform_name = f"linux-{self._arch}-cann-{version}{variant}"
elif (
self._os == "linux"
and self._arch == "amd64"
and self._device == platform.DeviceTypeEnum.ROCM.value
):
platform_name = "linux-amd64-hip-6.2"
elif (
self._os == "linux"
and self._arch == "amd64"
and self._device == platform.DeviceTypeEnum.DCU.value
):
platform_name = "linux-amd64-dtk-24.04"
elif self._os == "linux" and self._arch == "amd64":
platform_name = "linux-amd64-avx2"
elif self._os == "linux" and self._arch == "arm64":
platform_name = "linux-arm64-neon"
elif (
self._os == "windows"
and self._arch == "amd64"
and self._device == platform.DeviceTypeEnum.ROCM.value
):
platform_name = "windows-amd64-hip-6.2"
elif self._os == "windows" and self._arch == "amd64":
platform_name = "windows-amd64-avx2"
elif self._os == "windows" and self._arch == "arm64":
platform_name = "windows-arm64-neon"
# Get the toolkit based on the device type.
device_toolkit_mapper = {
platform.DeviceTypeEnum.CUDA.value: "cuda",
platform.DeviceTypeEnum.NPU.value: "cann",
platform.DeviceTypeEnum.MPS.value: "metal",
platform.DeviceTypeEnum.ROCM.value: "hip",
platform.DeviceTypeEnum.MUSA.value: "musa",
platform.DeviceTypeEnum.DCU.value: "dtk",
}
if self._device in device_toolkit_mapper:
toolkit = device_toolkit_mapper[self._device]
elif self._arch == "amd64":
toolkit = "avx2"
elif self._arch == "arm64":
toolkit = "neon"
else:
raise Exception(
f"unsupported platform, os: {self._os}, arch: {self._arch}, device: {self._device}"
)
return platform_name
# Get the toolkit version based on the toolkit,
# support fetching from environment variable or using default values.
toolkit_version = ""
if toolkit == "cuda":
# Since v0.0.145, llama-box no longer supports CUDA 11.8.
toolkit_version = "12.4"
cuda_version = platform.get_cuda_version()
match = re.match(r"(\d+)\.(\d+)", cuda_version)
if match:
major, minor = map(int, match.groups())
if major == 11 and version <= "v0.0.144":
toolkit_version = "11.8"
elif major == 12 and minor >= 8:
toolkit_version = "12.8"
elif toolkit == "cann":
# Since v0.0.145, llama-box supports CANN 8.1 by default,
# and supports CANN 8.0 only for backward compatibility.
toolkit_version = "8.0"
cann_version = platform.get_cann_version()
match = re.match(r"(\d+)\.(\d+)", cann_version)
if match:
major, minor = map(int, match.groups())
if major == 8 and minor >= 1 and version > "v0.0.144":
toolkit_version = "8.1"
# Currently, llama-box only supports release candidate version of CANN 8.1.
if toolkit_version == "8.1":
match = re.search(r"\.rc\d+", cann_version)
if match:
rc = match.group(0)
if rc:
toolkit_version += rc
cann_chip = platform.get_cann_chip()
if cann_chip and "310p" == cann_chip:
toolkit_version += "-310p"
elif toolkit == "hip":
toolkit_version = "6.2"
elif toolkit == "musa":
# Since v0.0.150, llama-box supports MUSA rc4.0,
# and no longer supports MUSA rc3.1.
toolkit_version = "rc3.1"
if version > "v0.0.149":
toolkit_version = "rc4.0"
elif toolkit == "dtk":
toolkit_version = "24.04"
# The naming convention of llama-box is `${os}-${arch}-${toolkit}[-${toolkit_version}]`,
# for example: linux-amd64-cuda-12.4, linux-arm64-cann-8.0.rc2-310p.
segments = [
self._os,
self._arch,
toolkit,
]
if toolkit_version:
segments.append(toolkit_version)
return "-".join(segments)
def download_gguf_parser(self):
version = BUILTIN_GGUF_PARSER_VERSION
@ -750,20 +760,33 @@ class ToolsManager:
):
"""Install Ascend MindIE run package to the target directory."""
pipx_path = shutil.which("pipx")
if self._pipx_path:
pipx_path = self._pipx_path
if not pipx_path:
raise Exception(
"pipx is required to install versioned Ascend MindIE but not found in system PATH. "
"Please install pipx first or provide the path to pipx using the server option `--pipx-path`. "
"Alternatively, you can install Ascend MindIE manually."
)
pipx_local_venvs = self._get_pipx_local_venvs(pipx_path)
if not pipx_local_venvs:
raise Exception(
"Failed to determine pipx local venvs. Ensure pipx is correctly installed."
)
# Create a virtual environment to collect the new Python packages.
cfg = get_global_config()
venv_parent_dir = Path(cfg.data_dir).joinpath("venvs", "mindie")
venv_parent_dir.mkdir(parents=True, exist_ok=True)
venv_dir = Path(pipx_local_venvs).joinpath(f"mindie_{version}")
try:
subprocess.check_call(
[sys.executable, "-m", "venv", "--system-site-packages", version],
cwd=venv_parent_dir,
[sys.executable, "-m", "venv", "--system-site-packages", venv_dir],
)
except subprocess.CalledProcessError as e:
raise Exception(
f"Failed to create a virtual environment for Ascend MindIE installation: {e}"
)
venv_dir = venv_parent_dir.joinpath(version)
venv_path = venv_dir.joinpath("bin", "activate")
logger.info(
f"Created virtual environment for Ascend MindIE installation: {venv_dir}"
@ -796,5 +819,26 @@ class ToolsManager:
env=env,
cwd=target_dir,
)
logger.info(f"Installed Ascend MindIE '{version}' to {target_dir}")
# Post process, inject the virtual environment activation script into set_env.sh.
logger.info(
"Injecting virtual environment activation into Ascend MindIE launch"
)
set_env_script = target_dir.joinpath(
"mindie", version, "mindie-service", "set_env.sh"
)
# - Enable set_env.sh writable permission
st = os.stat(set_env_script)
old_mode = st.st_mode
new_mode = old_mode | stat.S_IWUSR
os.chmod(set_env_script, new_mode)
with open(set_env_script, 'a', encoding='utf-8') as f:
f.write(f"\nsource {venv_path} || true\n")
# - Disable set_env.sh writable permission
os.chmod(set_env_script, old_mode)
logger.info(
f"Injected virtual environment activation into Ascend MindIE launch: {set_env_script}"
)
except subprocess.CalledProcessError as e:
raise Exception(f"Failed to install Ascend MindIE {command}: {e}")

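The platform name refactor above boils down to the join described in its comment: `${os}-${arch}-${toolkit}[-${toolkit_version}]`. A toy sketch of that composition (the sample values are the ones quoted in the code and comments above, not an exhaustive support matrix):

def llama_box_platform_name(os_name, arch, toolkit, toolkit_version=""):
    # ${os}-${arch}-${toolkit}[-${toolkit_version}]
    segments = [os_name, arch, toolkit]
    if toolkit_version:
        segments.append(toolkit_version)
    return "-".join(segments)


print(llama_box_platform_name("linux", "amd64", "cuda", "12.4"))          # linux-amd64-cuda-12.4
print(llama_box_platform_name("linux", "arm64", "cann", "8.0.rc2-310p"))  # linux-arm64-cann-8.0.rc2-310p
print(llama_box_platform_name("windows", "amd64", "avx2"))                # windows-amd64-avx2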
@ -12,7 +12,7 @@ function build() {
}
function prepare_dependencies() {
bash "${ROOT_DIR}/hack/install.sh"
POETRY_ONLY=true bash "${ROOT_DIR}/hack/install.sh"
}
function set_version() {

@ -12,9 +12,12 @@ ROOT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd -P)"
source "${ROOT_DIR}/hack/lib/init.sh"
function download_deps() {
pip install poetry==1.8.3 pre-commit==3.7.1
pip install poetry==1.8.3
poetry install
pre-commit install
if [[ "${POETRY_ONLY:-false}" == "false" ]]; then
pip install pre-commit==3.7.1
pre-commit install
fi
}
function download_ui() {
@ -22,12 +25,15 @@ function download_ui() {
local ui_path="${ROOT_DIR}/gpustack/ui"
local tmp_ui_path="${ui_path}/tmp"
local tag="latest"
# local tag="${1}"
if [[ "${GIT_VERSION}" != "v0.0.0" ]]; then
tag="${GIT_VERSION}"
fi
rm -rf "${ui_path}"
mkdir -p "${tmp_ui_path}/ui"
gpustack::log::info "downloading ui assets"
gpustack::log::info "downloading '${tag}' UI assets"
if ! curl --retry 3 --retry-connrefused --retry-delay 3 -sSfL "https://gpustack-ui-1303613262.cos.accelerate.myqcloud.com/releases/${tag}.tar.gz" 2>/dev/null |
tar -xzf - --directory "${tmp_ui_path}/ui" 2>/dev/null; then

@ -31,10 +31,14 @@ function Get-UI {
$tmpUIPath = Join-Path -Path $tmpPath -ChildPath "ui"
$tag = "latest"
if ($GIT_VERSION -ne "v0.0.0") {
$tag = $GIT_VERSION
}
$null = Remove-Item -Recurse -Force $uiPath -ErrorAction Ignore
$null = New-Item -ItemType Directory -Path $tmpUIPath
GPUStack.Log.Info "downloading UI assets"
GPUStack.Log.Info "downloading '$tag' UI assets"
try {
$tmpFile = "$tmpPath/ui.tar.gz"

@ -89,6 +89,9 @@ nav:
- Moore Threads MUSA:
- Online Installation: installation/moorethreads-musa/online-installation.md
- Air-Gapped Installation: installation/moorethreads-musa/air-gapped-installation.md
- Iluvatar Corex:
- Online Installation: installation/iluvatar-corex/online-installation.md
- Air-Gapped Installation: installation/iluvatar-corex/air-gapped-installation.md
- CPU:
- Online Installation: installation/cpu/online-installation.md
- Air-Gapped Installation: installation/cpu/air-gapped-installation.md

poetry.lock generated

@ -3963,8 +3963,11 @@ files = [
{file = "lxml-5.4.0-cp36-cp36m-win_amd64.whl", hash = "sha256:7ce1a171ec325192c6a636b64c94418e71a1964f56d002cc28122fceff0b6121"},
{file = "lxml-5.4.0-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:795f61bcaf8770e1b37eec24edf9771b307df3af74d1d6f27d812e15a9ff3872"},
{file = "lxml-5.4.0-cp37-cp37m-manylinux_2_12_i686.manylinux2010_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:29f451a4b614a7b5b6c2e043d7b64a15bd8304d7e767055e8ab68387a8cacf4e"},
{file = "lxml-5.4.0-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:891f7f991a68d20c75cb13c5c9142b2a3f9eb161f1f12a9489c82172d1f133c0"},
{file = "lxml-5.4.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4aa412a82e460571fad592d0f93ce9935a20090029ba08eca05c614f99b0cc92"},
{file = "lxml-5.4.0-cp37-cp37m-manylinux_2_28_aarch64.whl", hash = "sha256:ac7ba71f9561cd7d7b55e1ea5511543c0282e2b6450f122672a2694621d63b7e"},
{file = "lxml-5.4.0-cp37-cp37m-manylinux_2_28_x86_64.whl", hash = "sha256:c5d32f5284012deaccd37da1e2cd42f081feaa76981f0eaa474351b68df813c5"},
{file = "lxml-5.4.0-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:ce31158630a6ac85bddd6b830cffd46085ff90498b397bd0a259f59d27a12188"},
{file = "lxml-5.4.0-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:31e63621e073e04697c1b2d23fcb89991790eef370ec37ce4d5d469f40924ed6"},
{file = "lxml-5.4.0-cp37-cp37m-win32.whl", hash = "sha256:be2ba4c3c5b7900246a8f866580700ef0d538f2ca32535e991027bdaba944063"},
{file = "lxml-5.4.0-cp37-cp37m-win_amd64.whl", hash = "sha256:09846782b1ef650b321484ad429217f5154da4d6e786636c38e434fa32e94e49"},
@ -5121,6 +5124,7 @@ description = "Nvidia JIT LTO Library"
optional = true
python-versions = ">=3"
files = [
{file = "nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_aarch64.whl", hash = "sha256:4abe7fef64914ccfa909bc2ba39739670ecc9e820c83ccc7a6ed414122599b83"},
{file = "nvidia_nvjitlink_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl", hash = "sha256:06b3b9b25bf3f8af351d664978ca26a16d2c5127dbd53c0497e28d1fb9611d57"},
{file = "nvidia_nvjitlink_cu12-12.4.127-py3-none-win_amd64.whl", hash = "sha256:fd9020c501d27d135f983c6d3e244b197a7ccad769e34df53a42e276b0e25fa1"},
]
@ -6239,6 +6243,7 @@ files = [
{file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:bb89f0a835bcfc1d42ccd5f41f04870c1b936d8507c6df12b7737febc40f0909"},
{file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:f0c2d907a1e102526dd2986df638343388b94c33860ff3bbe1384130828714b1"},
{file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f8157bed2f51db683f31306aa497311b560f2265998122abe1dce6428bd86567"},
{file = "psycopg2_binary-2.9.10-cp313-cp313-win_amd64.whl", hash = "sha256:27422aa5f11fbcd9b18da48373eb67081243662f9b46e6fd07c3eb46e4535142"},
{file = "psycopg2_binary-2.9.10-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:eb09aa7f9cecb45027683bb55aebaaf45a0df8bf6de68801a6afdc7947bb09d4"},
{file = "psycopg2_binary-2.9.10-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b73d6d7f0ccdad7bc43e6d34273f70d587ef62f824d7261c4ae9b8b1b6af90e8"},
{file = "psycopg2_binary-2.9.10-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ce5ab4bf46a211a8e924d307c1b1fcda82368586a19d0a24f8ae166f5c784864"},
@ -7671,6 +7676,7 @@ files = [
{file = "ruamel.yaml.clib-0.2.12-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f66efbc1caa63c088dead1c4170d148eabc9b80d95fb75b6c92ac0aad2437d76"},
{file = "ruamel.yaml.clib-0.2.12-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:22353049ba4181685023b25b5b51a574bce33e7f51c759371a7422dcae5402a6"},
{file = "ruamel.yaml.clib-0.2.12-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:932205970b9f9991b34f55136be327501903f7c66830e9760a8ffb15b07f05cd"},
{file = "ruamel.yaml.clib-0.2.12-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:a52d48f4e7bf9005e8f0a89209bf9a73f7190ddf0489eee5eb51377385f59f2a"},
{file = "ruamel.yaml.clib-0.2.12-cp310-cp310-win32.whl", hash = "sha256:3eac5a91891ceb88138c113f9db04f3cebdae277f5d44eaa3651a4f573e6a5da"},
{file = "ruamel.yaml.clib-0.2.12-cp310-cp310-win_amd64.whl", hash = "sha256:ab007f2f5a87bd08ab1499bdf96f3d5c6ad4dcfa364884cb4549aa0154b13a28"},
{file = "ruamel.yaml.clib-0.2.12-cp311-cp311-macosx_13_0_arm64.whl", hash = "sha256:4a6679521a58256a90b0d89e03992c15144c5f3858f40d7c18886023d7943db6"},
@ -7679,6 +7685,7 @@ files = [
{file = "ruamel.yaml.clib-0.2.12-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:811ea1594b8a0fb466172c384267a4e5e367298af6b228931f273b111f17ef52"},
{file = "ruamel.yaml.clib-0.2.12-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:cf12567a7b565cbf65d438dec6cfbe2917d3c1bdddfce84a9930b7d35ea59642"},
{file = "ruamel.yaml.clib-0.2.12-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:7dd5adc8b930b12c8fc5b99e2d535a09889941aa0d0bd06f4749e9a9397c71d2"},
{file = "ruamel.yaml.clib-0.2.12-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:1492a6051dab8d912fc2adeef0e8c72216b24d57bd896ea607cb90bb0c4981d3"},
{file = "ruamel.yaml.clib-0.2.12-cp311-cp311-win32.whl", hash = "sha256:bd0a08f0bab19093c54e18a14a10b4322e1eacc5217056f3c063bd2f59853ce4"},
{file = "ruamel.yaml.clib-0.2.12-cp311-cp311-win_amd64.whl", hash = "sha256:a274fb2cb086c7a3dea4322ec27f4cb5cc4b6298adb583ab0e211a4682f241eb"},
{file = "ruamel.yaml.clib-0.2.12-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:20b0f8dc160ba83b6dcc0e256846e1a02d044e13f7ea74a3d1d56ede4e48c632"},
@ -7687,6 +7694,7 @@ files = [
{file = "ruamel.yaml.clib-0.2.12-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:749c16fcc4a2b09f28843cda5a193e0283e47454b63ec4b81eaa2242f50e4ccd"},
{file = "ruamel.yaml.clib-0.2.12-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:bf165fef1f223beae7333275156ab2022cffe255dcc51c27f066b4370da81e31"},
{file = "ruamel.yaml.clib-0.2.12-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:32621c177bbf782ca5a18ba4d7af0f1082a3f6e517ac2a18b3974d4edf349680"},
{file = "ruamel.yaml.clib-0.2.12-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:b82a7c94a498853aa0b272fd5bc67f29008da798d4f93a2f9f289feb8426a58d"},
{file = "ruamel.yaml.clib-0.2.12-cp312-cp312-win32.whl", hash = "sha256:e8c4ebfcfd57177b572e2040777b8abc537cdef58a2120e830124946aa9b42c5"},
{file = "ruamel.yaml.clib-0.2.12-cp312-cp312-win_amd64.whl", hash = "sha256:0467c5965282c62203273b838ae77c0d29d7638c8a4e3a1c8bdd3602c10904e4"},
{file = "ruamel.yaml.clib-0.2.12-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:4c8c5d82f50bb53986a5e02d1b3092b03622c02c2eb78e29bec33fd9593bae1a"},
@ -7695,6 +7703,7 @@ files = [
{file = "ruamel.yaml.clib-0.2.12-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:96777d473c05ee3e5e3c3e999f5d23c6f4ec5b0c38c098b3a5229085f74236c6"},
{file = "ruamel.yaml.clib-0.2.12-cp313-cp313-musllinux_1_1_i686.whl", hash = "sha256:3bc2a80e6420ca8b7d3590791e2dfc709c88ab9152c00eeb511c9875ce5778bf"},
{file = "ruamel.yaml.clib-0.2.12-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:e188d2699864c11c36cdfdada94d781fd5d6b0071cd9c427bceb08ad3d7c70e1"},
{file = "ruamel.yaml.clib-0.2.12-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:4f6f3eac23941b32afccc23081e1f50612bdbe4e982012ef4f5797986828cd01"},
{file = "ruamel.yaml.clib-0.2.12-cp313-cp313-win32.whl", hash = "sha256:6442cb36270b3afb1b4951f060eccca1ce49f3d087ca1ca4563a6eb479cb3de6"},
{file = "ruamel.yaml.clib-0.2.12-cp313-cp313-win_amd64.whl", hash = "sha256:e5b8daf27af0b90da7bb903a876477a9e6d7270be6146906b276605997c7e9a3"},
{file = "ruamel.yaml.clib-0.2.12-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:fc4b630cd3fa2cf7fce38afa91d7cfe844a9f75d7f0f36393fa98815e911d987"},
@ -7703,6 +7712,7 @@ files = [
{file = "ruamel.yaml.clib-0.2.12-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e2f1c3765db32be59d18ab3953f43ab62a761327aafc1594a2a1fbe038b8b8a7"},
{file = "ruamel.yaml.clib-0.2.12-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:d85252669dc32f98ebcd5d36768f5d4faeaeaa2d655ac0473be490ecdae3c285"},
{file = "ruamel.yaml.clib-0.2.12-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:e143ada795c341b56de9418c58d028989093ee611aa27ffb9b7f609c00d813ed"},
{file = "ruamel.yaml.clib-0.2.12-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:2c59aa6170b990d8d2719323e628aaf36f3bfbc1c26279c0eeeb24d05d2d11c7"},
{file = "ruamel.yaml.clib-0.2.12-cp39-cp39-win32.whl", hash = "sha256:beffaed67936fbbeffd10966a4eb53c402fafd3d6833770516bf7314bc6ffa12"},
{file = "ruamel.yaml.clib-0.2.12-cp39-cp39-win_amd64.whl", hash = "sha256:040ae85536960525ea62868b642bdb0c2cc6021c9f9d507810c0c604e66f5a7b"},
{file = "ruamel.yaml.clib-0.2.12.tar.gz", hash = "sha256:6c8fbb13ec503f99a91901ab46e0b07ae7941cd527393187039aec586fdfd36f"},
@ -9338,14 +9348,14 @@ tensorizer = ["tensorizer (>=2.9.0)"]
[[package]]
name = "vox-box"
version = "0.0.14"
version = "0.0.15"
description = "Vox box"
optional = true
python-versions = "<4.0,>=3.10"
files = [
{file = "vox_box-0.0.14-py3-none-manylinux2014_aarch64.whl", hash = "sha256:3a3b4b392a26b83a683f9dae8328c9ce69b7513fceb751d24bc7018d69e2c54a"},
{file = "vox_box-0.0.14-py3-none-manylinux2014_x86_64.whl", hash = "sha256:8f40a3baaeaf4b89a4ff3a3a49543891f8b3b16e341ffcba270ee0906a20fa87"},
{file = "vox_box-0.0.14.tar.gz", hash = "sha256:f15c9ea6281f68adac83d7f0bf5121ef565b299a8cf87e488e5a655ef2e18642"},
{file = "vox_box-0.0.15-py3-none-manylinux2014_aarch64.whl", hash = "sha256:ec8272fa350bc5106635f79d32a98e2c4bb93f0bfa9f7948ab78335b349c4dcc"},
{file = "vox_box-0.0.15-py3-none-manylinux2014_x86_64.whl", hash = "sha256:1e046ddc2d139ec54a8555119c551b5172f6155f2b4b8c2581319296b109e22f"},
{file = "vox_box-0.0.15.tar.gz", hash = "sha256:db64d72b6324ef4bfd8b073ce0eaa90c0bc6f401f0994696754bd6c813f4b61d"},
]
[package.dependencies]
@ -9988,4 +9998,4 @@ vllm = ["bitsandbytes", "mistral_common", "timm", "vllm"]
[metadata]
lock-version = "2.0"
python-versions = ">=3.10,<3.13"
content-hash = "acf2fccb87b9c374ab046669e24b25f1c0584ee81e51061412aa2b63cb8f2734"
content-hash = "807f91a97e80f065aef18907753da78334606453a85a641cb23e093d423fba27"

@ -48,7 +48,7 @@ wmi = { version="^1.5.1", markers = "platform_system == 'Windows'" }
pywin32 = { version="^308", markers = "platform_system == 'Windows'" }
packaging = "^24.1"
psycopg2-binary = "^2.9.10"
vox-box = {version = "0.0.14", optional = true}
vox-box = {version = "0.0.15", optional = true}
tenacity = "^9.0.0"
aiocache = "^0.12.3"
