|
|
|
@ -31,47 +31,58 @@ class EnvChecker(metaclass=ABCMeta):
|
|
|
|
|
|
|
|
|
|
@abstractmethod
|
|
|
|
|
def check_env(self, e):
|
|
|
|
|
pass
|
|
|
|
|
"""检查环境是否符合要求"""
|
|
|
|
|
|
|
|
|
|
@abstractmethod
|
|
|
|
|
def set_env(self):
|
|
|
|
|
pass
|
|
|
|
|
"""设置环境"""
|
|
|
|
|
|
|
|
|
|
@abstractmethod
|
|
|
|
|
def check_version(self):
|
|
|
|
|
pass
|
|
|
|
|
"""检查版本是否符合要求"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class GPUEnvChecker(EnvChecker):
|
|
|
|
|
"""GPU environment check."""
|
|
|
|
|
|
|
|
|
|
def __init__(self):
|
|
|
|
|
# 初始化版本列表
|
|
|
|
|
self.version = ["10.1", "11.1"]
|
|
|
|
|
# 初始化库键到库名的映射字典
|
|
|
|
|
self.lib_key_to_lib_name = {'libcu': 'libcuda.so'}
|
|
|
|
|
# env
|
|
|
|
|
# 获取系统环境变量 PATH 的值
|
|
|
|
|
self.path = os.getenv("PATH")
|
|
|
|
|
# 获取系统环境变量 LD_LIBRARY_PATH 的值
|
|
|
|
|
self.ld_lib_path = os.getenv("LD_LIBRARY_PATH")
|
|
|
|
|
|
|
|
|
|
# check
|
|
|
|
|
# 初始化版本号为 "0"
|
|
|
|
|
self.v = "0"
|
|
|
|
|
# 获取 CUDA 库的路径
|
|
|
|
|
self.cuda_lib_path = self._get_lib_path("libcu")
|
|
|
|
|
# 获取 CUDA 可执行文件的路径
|
|
|
|
|
self.cuda_bin_path = self._get_bin_path("cuda")
|
|
|
|
|
# 获取 cuDNN 库的路径
|
|
|
|
|
self.cudnn_lib_path = self._get_lib_path("libcudnn")
|
|
|
|
|
|
|
|
|
|
def check_env(self, e):
|
|
|
|
|
# 抛出传入的异常 e
|
|
|
|
|
raise e
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def set_env(self):
|
|
|
|
|
# 设置环境变量,当前实现为空
|
|
|
|
|
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _get_bin_path(self, bin_name):
|
|
|
|
|
"""Get bin path by bin name."""
|
|
|
|
|
# 如果二进制名称为 "cuda",则调用获取 CUDA 二进制路径的方法
|
|
|
|
|
if bin_name == "cuda":
|
|
|
|
|
return self._get_cuda_bin_path()
|
|
|
|
|
# 否则返回空列表
|
|
|
|
|
return []
|
|
|
|
|
|
|
|
|
|
def _get_cuda_bin_path(self):
|
|
|
|
|
"""Get cuda bin path by lib path."""
|
|
|
|
|
# Get cuda bin path by lib path.
|
|
|
|
|
path_list = []
|
|
|
|
|
for path in self.cuda_lib_path:
|
|
|
|
|
path = os.path.abspath(path.strip()+"/bin/")
|
|
|
|
@ -81,56 +92,87 @@ class GPUEnvChecker(EnvChecker):
|
|
|
|
|
|
|
|
|
|
def _get_nvcc_version(self, is_set_env):
|
|
|
|
|
"""Get cuda version by nvcc command."""
|
|
|
|
|
# 运行 nvcc 命令获取 CUDA 版本信息
|
|
|
|
|
nvcc_result = subprocess.run(["nvcc", "--version | grep release"],
|
|
|
|
|
timeout=3, text=True, capture_output=True, check=False)
|
|
|
|
|
# 如果命令返回非零值,表示命令执行失败
|
|
|
|
|
if nvcc_result.returncode:
|
|
|
|
|
# 如果尚未设置环境变量
|
|
|
|
|
if not is_set_env:
|
|
|
|
|
# 遍历预设的 CUDA 二进制路径
|
|
|
|
|
for path in self.cuda_bin_path:
|
|
|
|
|
# 检查路径中是否存在 nvcc 文件
|
|
|
|
|
if Path(path + "/nvcc").is_file():
|
|
|
|
|
# 将路径添加到环境变量 PATH 中
|
|
|
|
|
os.environ['PATH'] = path + ":" + os.environ['PATH']
|
|
|
|
|
# 递归调用以重新尝试获取版本信息
|
|
|
|
|
return self._get_nvcc_version(True)
|
|
|
|
|
# 如果命令执行失败且未找到 nvcc 文件,返回空字符串
|
|
|
|
|
return ""
|
|
|
|
|
# 获取命令输出结果
|
|
|
|
|
result = nvcc_result.stdout
|
|
|
|
|
# 遍历输出结果的每一行
|
|
|
|
|
for line in result.split('\n'):
|
|
|
|
|
if line:
|
|
|
|
|
# 提取并返回 CUDA 版本号
|
|
|
|
|
return line.strip().split("release")[1].split(",")[0].strip()
|
|
|
|
|
# 如果未找到版本信息,返回空字符串
|
|
|
|
|
return ""
|
|
|
|
|
|
|
|
|
|
def _get_cudnn_version(self):
|
|
|
|
|
"""Get cudnn version by libcudnn.so."""
|
|
|
|
|
# 初始化cudnn版本列表为空
|
|
|
|
|
cudnn_version = []
|
|
|
|
|
# 遍历cudnn库路径
|
|
|
|
|
for path in self.cudnn_lib_path:
|
|
|
|
|
# 查找路径下所有的libcudnn.so文件
|
|
|
|
|
real_path = glob.glob(path + "/lib*/libcudnn.so.*.*")
|
|
|
|
|
# 如果没有找到对应的文件,继续下一个路径
|
|
|
|
|
if real_path == []:
|
|
|
|
|
continue
|
|
|
|
|
# 使用ls命令获取文件信息
|
|
|
|
|
ls_cudnn = subprocess.run(["ls", real_path[0]], timeout=10, text=True,
|
|
|
|
|
capture_output=True, check=False)
|
|
|
|
|
# 如果ls命令执行成功,解析输出以获取版本号
|
|
|
|
|
if ls_cudnn.returncode == 0:
|
|
|
|
|
cudnn_version = ls_cudnn.stdout.split('/')[-1].strip('libcudnn.so.').strip().split('.')
|
|
|
|
|
# 如果版本号只有两个部分,添加一个'.0'作为第三部分
|
|
|
|
|
if len(cudnn_version) == 2:
|
|
|
|
|
cudnn_version.append('0')
|
|
|
|
|
# 找到版本号后跳出循环
|
|
|
|
|
break
|
|
|
|
|
# 将版本号列表转换为字符串
|
|
|
|
|
version_str = ''.join([n for n in cudnn_version])
|
|
|
|
|
# 返回版本号的前三位
|
|
|
|
|
return version_str[0:3]
|
|
|
|
|
|
|
|
|
|
def _get_cudart_version(self):
|
|
|
|
|
"""Get cuda runtime version by libcudart.so."""
|
|
|
|
|
# 遍历可能的 CUDA 库路径
|
|
|
|
|
for path in self.cuda_lib_path:
|
|
|
|
|
# 查找路径下所有可能的 libcudart.so 文件
|
|
|
|
|
real_path = glob.glob(path + "/lib*/libcudart.so.*.*.*")
|
|
|
|
|
# 如果没有找到任何文件,则跳过当前路径
|
|
|
|
|
if real_path == []:
|
|
|
|
|
continue
|
|
|
|
|
# 获取文件名信息以确定 CUDA 版本
|
|
|
|
|
ls_cudart = subprocess.run(["ls", real_path[0]], timeout=10, text=True,
|
|
|
|
|
capture_output=True, check=False)
|
|
|
|
|
# 如果命令成功执行,则解析输出以提取版本号
|
|
|
|
|
if ls_cudart.returncode == 0:
|
|
|
|
|
self.v = ls_cudart.stdout.split('/')[-1].strip('libcudart.so.').strip()
|
|
|
|
|
# 找到版本号后跳出循环
|
|
|
|
|
break
|
|
|
|
|
# 返回找到的 CUDA 版本号
|
|
|
|
|
return self.v
|
|
|
|
|
|
|
|
|
|
def check_version(self):
|
|
|
|
|
"""Check cuda version."""
|
|
|
|
|
version_match = False
|
|
|
|
|
# 调用私有方法检查版本是否匹配,并根据结果设置version_match标志
|
|
|
|
|
if self._check_version():
|
|
|
|
|
version_match = True
|
|
|
|
|
# 如果版本不匹配,根据CUDA版本号输出不同的警告信息
|
|
|
|
|
if not version_match:
|
|
|
|
|
if self.v == "0":
|
|
|
|
|
logger.warning("Can not found cuda libs, please confirm that the correct "
|
|
|
|
@ -140,17 +182,20 @@ class GPUEnvChecker(EnvChecker):
|
|
|
|
|
logger.warning(f"MindSpore version {__version__} and cuda version {self.v} does not match, "
|
|
|
|
|
"please refer to the installation guide for version matching "
|
|
|
|
|
"information: https://www.mindspore.cn/install")
|
|
|
|
|
# 获取nvcc版本号,并检查是否与MindSpore支持的版本匹配
|
|
|
|
|
nvcc_version = self._get_nvcc_version(False)
|
|
|
|
|
if nvcc_version and (nvcc_version not in self.version):
|
|
|
|
|
logger.warning(f"MindSpore version {__version__} and nvcc(cuda bin) version {nvcc_version} "
|
|
|
|
|
"does not match, please refer to the installation guide for version matching "
|
|
|
|
|
"information: https://www.mindspore.cn/install")
|
|
|
|
|
# 获取cudnn版本号,并检查是否符合最低要求
|
|
|
|
|
cudnn_version = self._get_cudnn_version()
|
|
|
|
|
if cudnn_version and int(cudnn_version) < 760:
|
|
|
|
|
logger.warning(f"MindSpore version {__version__} and cudDNN version {cudnn_version} "
|
|
|
|
|
"does not match, please refer to the installation guide for version matching "
|
|
|
|
|
"information: https://www.mindspore.cn/install. The recommended version is "
|
|
|
|
|
"CUDA10.1 with cuDNN7.6.x and CUDA11.1 with cuDNN8.0.x")
|
|
|
|
|
# 检查cudnn版本号与CUDA版本号的兼容性,对于CUDA 11.0以上版本,cudnn版本需要至少为8.0
|
|
|
|
|
if cudnn_version and int(cudnn_version) < 800 and int(str(self.v).split('.')[0]) > 10:
|
|
|
|
|
logger.warning(f"CUDA version {self.v} and cuDNN version {cudnn_version} "
|
|
|
|
|
"does not match, please refer to the installation guide for version matching "
|
|
|
|
@ -159,45 +204,58 @@ class GPUEnvChecker(EnvChecker):
|
|
|
|
|
|
|
|
|
|
def _check_version(self):
    """Check cuda version.

    Returns:
        bool, True when "<major>.<minor>" of the cuda runtime version is one of
        the entries in ``self.version``.
    """
    parsed = version.parse(self._get_cudart_version())
    # Only major and minor take part in the comparison.
    major_minor = "{}.{}".format(parsed.major, parsed.minor)
    return major_minor in self.version
|
|
|
|
|
|
|
|
|
|
def _get_lib_path(self, lib_name):
    """Get gpu lib path by ldd command.

    Runs ``ldd`` on the ``_c_expression*.so*`` file found one directory above this
    file and collects the parent directories of the resolved libraries whose ldd
    line contains ``lib_name``.

    Args:
        lib_name (str): Substring that identifies the library in the ldd output,
            e.g. "libcu".

    Returns:
        An empty list when ``_c_expression`` is missing or ldd times out, otherwise
        the result of ``np.unique`` over the collected directories.
    """
    path_list = []
    current_path = os.path.split(os.path.realpath(__file__))[0]
    mindspore_path = os.path.join(current_path, "../")
    try:
        real_path = glob.glob(mindspore_path + "/_c_expression*.so*")
        if not real_path:
            # Fall back to the key itself so that an unmapped key cannot raise
            # KeyError while the problem is being reported.
            missing_lib = self.lib_key_to_lib_name.get(lib_name, lib_name)
            logger.error(f"{missing_lib} (need by mindspore-gpu) is not found, please "
                         f"confirm that _c_expression.so is in directory:{mindspore_path} and the correct cuda "
                         "version has been installed, you can refer to the installation "
                         "guidelines: https://www.mindspore.cn/install")
            return path_list
        # A single ldd call with a timeout replaces the ldd | grep pipeline, whose
        # processes were never waited on and could never raise TimeoutExpired.
        ldd_r = subprocess.run(['ldd', real_path[0]], stdout=subprocess.PIPE, timeout=10, check=False)
        for line in ldd_r.stdout.decode().split('\n'):
            if lib_name not in line:
                continue
            # ldd prints "<name> => <resolved path> (<address>)".
            path = line.partition("=>")[2]
            if "not found" in path.lower():
                logger.warning(f"Cuda {self.version} version(need by mindspore-gpu) is not found, please confirm "
                               "that the path of cuda is set to the env LD_LIBRARY_PATH, please refer to the "
                               "installation guidelines: https://www.mindspore.cn/install")
                continue
            # Keep only the directory part that precedes the library file name.
            path = path.partition(lib_name)[0]
            if path:
                path_list.append(os.path.abspath(path.strip() + "../"))
        return np.unique(path_list)
    except subprocess.TimeoutExpired:
        logger.warning("Failed to check cuda version due to the ldd command timeout, please confirm that "
                       "the correct cuda version has been installed, you can refer to the "
                       "installation guidelines: https://www.mindspore.cn/install")
        return path_list
|
|
|
|
|
|
|
|
|
|
def _read_version(self, file_path):
|
|
|
|
|
"""Get gpu version info in version.txt."""
|
|
|
|
@ -211,70 +269,80 @@ class GPUEnvChecker(EnvChecker):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class AscendEnvChecker(EnvChecker):
|
|
|
|
|
"""ascend environment check"""
|
|
|
|
|
"""Ascend 环境检查类"""
|
|
|
|
|
|
|
|
|
|
def __init__(self):
|
|
|
|
|
# 初始化 Ascend 环境检查器的版本列表
|
|
|
|
|
self.version = ["1.81"]
|
|
|
|
|
|
|
|
|
|
# 定义不同路径下的 version.info 文件位置
|
|
|
|
|
atlas_nnae_version = "/usr/local/Ascend/nnae/latest/fwkacllib/version.info"
|
|
|
|
|
atlas_toolkit_version = "/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/version.info"
|
|
|
|
|
hisi_fwk_version = "/usr/local/Ascend/latest/fwkacllib/version.info"
|
|
|
|
|
|
|
|
|
|
# 检查 Atlas NNAE 环境是否存在
|
|
|
|
|
if os.path.exists(atlas_nnae_version):
|
|
|
|
|
# atlas default path
|
|
|
|
|
self.fwk_path = "/usr/local/Ascend/nnae/latest/fwkacllib"
|
|
|
|
|
self.op_impl_path = "/usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe"
|
|
|
|
|
self.tbe_path = self.fwk_path + "/lib64"
|
|
|
|
|
self.cce_path = self.fwk_path + "/ccec_compiler/bin"
|
|
|
|
|
self.fwk_version = atlas_nnae_version
|
|
|
|
|
self.op_path = "/usr/local/Ascend/nnae/latest/opp"
|
|
|
|
|
self.aicpu_path = "/usr/local/Ascend/nnae/latest"
|
|
|
|
|
# 如果存在,设置默认路径
|
|
|
|
|
self.fwk_path = "/usr/local/Ascend/nnae/latest/fwkacllib" # Framework 路径
|
|
|
|
|
self.op_impl_path = "/usr/local/Ascend/nnae/latest/opp/op_impl/built-in/ai_core/tbe" # Operator 实现路径
|
|
|
|
|
self.tbe_path = self.fwk_path + "/lib64" # TBE 库路径
|
|
|
|
|
self.cce_path = self.fwk_path + "/ccec_compiler/bin" # CCE 编译器路径
|
|
|
|
|
self.fwk_version = atlas_nnae_version # Framework 版本文件路径
|
|
|
|
|
self.op_path = "/usr/local/Ascend/nnae/latest/opp" # Operator 路径
|
|
|
|
|
self.aicpu_path = "/usr/local/Ascend/nnae/latest" # AI CPU 路径
|
|
|
|
|
|
|
|
|
|
# 检查 Atlas Toolkit 环境是否存在
|
|
|
|
|
elif os.path.exists(atlas_toolkit_version):
|
|
|
|
|
# atlas default path
|
|
|
|
|
self.fwk_path = "/usr/local/Ascend/ascend-toolkit/latest/fwkacllib"
|
|
|
|
|
self.op_impl_path = "/usr/local/Ascend/ascend-toolkit/latest/opp/op_impl/built-in/ai_core/tbe"
|
|
|
|
|
self.tbe_path = self.fwk_path + "/lib64"
|
|
|
|
|
self.cce_path = self.fwk_path + "/ccec_compiler/bin"
|
|
|
|
|
self.fwk_version = atlas_toolkit_version
|
|
|
|
|
self.op_path = "/usr/local/Ascend/ascend-toolkit/latest/opp"
|
|
|
|
|
self.aicpu_path = "/usr/local/Ascend/ascend-toolkit/latest"
|
|
|
|
|
# 如果存在,设置默认路径
|
|
|
|
|
self.fwk_path = "/usr/local/Ascend/ascend-toolkit/latest/fwkacllib" # Framework 路径
|
|
|
|
|
self.op_impl_path = "/usr/local/Ascend/ascend-toolkit/latest/opp/op_impl/built-in/ai_core/tbe" # Operator 实现路径
|
|
|
|
|
self.tbe_path = self.fwk_path + "/lib64" # TBE 库路径
|
|
|
|
|
self.cce_path = self.fwk_path + "/ccec_compiler/bin" # CCE 编译器路径
|
|
|
|
|
self.fwk_version = atlas_toolkit_version # Framework 版本文件路径
|
|
|
|
|
self.op_path = "/usr/local/Ascend/ascend-toolkit/latest/opp" # Operator 路径
|
|
|
|
|
self.aicpu_path = "/usr/local/Ascend/ascend-toolkit/latest" # AI CPU 路径
|
|
|
|
|
|
|
|
|
|
# 检查 Hisi 环境是否存在
|
|
|
|
|
elif os.path.exists(hisi_fwk_version):
|
|
|
|
|
# hisi default path
|
|
|
|
|
self.fwk_path = "/usr/local/Ascend/latest/fwkacllib"
|
|
|
|
|
self.op_impl_path = "/usr/local/Ascend/latest/opp/op_impl/built-in/ai_core/tbe"
|
|
|
|
|
self.tbe_path = self.fwk_path + "/lib64"
|
|
|
|
|
self.cce_path = self.fwk_path + "/ccec_compiler/bin"
|
|
|
|
|
self.fwk_version = hisi_fwk_version
|
|
|
|
|
self.op_path = "/usr/local/Ascend/latest/opp"
|
|
|
|
|
self.aicpu_path = "/usr/local/Ascend/latest"
|
|
|
|
|
# 如果存在,设置默认路径
|
|
|
|
|
self.fwk_path = "/usr/local/Ascend/latest/fwkacllib" # Framework 路径
|
|
|
|
|
self.op_impl_path = "/usr/local/Ascend/latest/opp/op_impl/built-in/ai_core/tbe" # Operator 实现路径
|
|
|
|
|
self.tbe_path = self.fwk_path + "/lib64" # TBE 库路径
|
|
|
|
|
self.cce_path = self.fwk_path + "/ccec_compiler/bin" # CCE 编译器路径
|
|
|
|
|
self.fwk_version = hisi_fwk_version # Framework 版本文件路径
|
|
|
|
|
self.op_path = "/usr/local/Ascend/latest/opp" # Operator 路径
|
|
|
|
|
self.aicpu_path = "/usr/local/Ascend/latest" # AI CPU 路径
|
|
|
|
|
|
|
|
|
|
else:
|
|
|
|
|
# custom or unknown environment
|
|
|
|
|
self.fwk_path = ""
|
|
|
|
|
self.op_impl_path = ""
|
|
|
|
|
self.tbe_path = ""
|
|
|
|
|
self.cce_path = ""
|
|
|
|
|
self.fwk_version = ""
|
|
|
|
|
self.op_path = ""
|
|
|
|
|
self.aicpu_path = ""
|
|
|
|
|
|
|
|
|
|
# env
|
|
|
|
|
# 如果以上环境都不存在,设置为空路径
|
|
|
|
|
self.fwk_path = "" # Framework 路径
|
|
|
|
|
self.op_impl_path = "" # Operator 实现路径
|
|
|
|
|
self.tbe_path = "" # TBE 库路径
|
|
|
|
|
self.cce_path = "" # CCE 编译器路径
|
|
|
|
|
self.fwk_version = "" # Framework 版本文件路径
|
|
|
|
|
self.op_path = "" # Operator 路径
|
|
|
|
|
self.aicpu_path = "" # AI CPU 路径
|
|
|
|
|
|
|
|
|
|
# 初始化环境变量
|
|
|
|
|
self.path = os.getenv("PATH")
|
|
|
|
|
self.python_path = os.getenv("PYTHONPATH")
|
|
|
|
|
self.ld_lib_path = os.getenv("LD_LIBRARY_PATH")
|
|
|
|
|
self.ascend_opp_path = os.getenv("ASCEND_OPP_PATH")
|
|
|
|
|
self.ascend_aicpu_path = os.getenv("ASCEND_AICPU_PATH")
|
|
|
|
|
|
|
|
|
|
# check content
|
|
|
|
|
# 设置需要检查的路径内容
|
|
|
|
|
self.path_check = "/fwkacllib/ccec_compiler/bin"
|
|
|
|
|
self.python_path_check = "opp/op_impl/built-in/ai_core/tbe"
|
|
|
|
|
self.ld_lib_path_check_fwk = "/fwkacllib/lib64"
|
|
|
|
|
self.ld_lib_path_check_addons = "/add-ons"
|
|
|
|
|
self.ascend_opp_path_check = "/op"
|
|
|
|
|
self.v = ""
|
|
|
|
|
|
|
|
|
|
def check_env(self, e):
|
|
|
|
|
self._check_env()
|
|
|
|
|
raise e
|
|
|
|
|
|
|
|
|
|
def check_version(self):
|
|
|
|
|
# 检查指定路径的版本文件是否存在,如果不存在则跳过版本检查
|
|
|
|
|
if not Path(self.fwk_version).is_file():
|
|
|
|
|
logger.warning("Using custom Ascend AI software package (Ascend Data Center Solution) path, package "
|
|
|
|
|
"version checking is skipped, please make sure Ascend AI software package (Ascend Data "
|
|
|
|
@ -282,40 +350,47 @@ class AscendEnvChecker(EnvChecker):
|
|
|
|
|
"https://www.mindspore.cn/install")
|
|
|
|
|
return
|
|
|
|
|
|
|
|
|
|
# 读取版本文件中的版本信息
|
|
|
|
|
v = self._read_version(self.fwk_version)
|
|
|
|
|
# 如果读取的版本不在支持的版本列表中,则记录警告信息
|
|
|
|
|
if v not in self.version:
|
|
|
|
|
v_list = str([x for x in self.version])
|
|
|
|
|
logger.warning(f"MindSpore version {__version__} and Ascend AI software package (Ascend Data Center "
|
|
|
|
|
f"Solution)version {v} does not match, the version of software package expect one of "
|
|
|
|
|
f"{v_list}, please reference to the match info on: https://www.mindspore.cn/install")
|
|
|
|
|
|
|
|
|
|
def check_deps_version(self):
    """
    te, topi, hccl wheel package version check
    in order to update the change of 'LD_LIBRARY_PATH' env, run a sub process

    Anything the helper script prints is logged as a warning, followed by a
    three-step countdown with one second between steps. A timeout of the helper
    is logged at info level and otherwise ignored.
    """
    script_dir = os.path.split(os.path.realpath(__file__))[0]
    deps_version_checker = os.path.join(script_dir, "_check_deps_version.py")
    supported_args = ["--supported_version=" + v for v in self.version]
    call_cmd = [sys.executable, deps_version_checker, "--mindspore_version=" + __version__] + supported_args

    try:
        process = subprocess.run(call_cmd, timeout=3, text=True, capture_output=True, check=False)
    except subprocess.TimeoutExpired:
        logger.info("Package te, topi, hccl version check timed out, skip.")
        return

    report = process.stdout.strip()
    if report == "":
        return
    logger.warning(report)
    warning_countdown = 3
    for i in range(warning_countdown, 0, -1):
        logger.warning(f"Please pay attention to the above warning, countdown: {i}")
        time.sleep(1)
|
|
|
|
|
|
|
|
|
|
def set_env(self):
|
|
|
|
|
# 设置Ascend环境变量
|
|
|
|
|
if not self.tbe_path:
|
|
|
|
|
self._check_env()
|
|
|
|
|
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
try:
|
|
|
|
|
import te # pylint: disable=unused-import
|
|
|
|
|
# pylint: disable=broad-except
|
|
|
|
@ -329,32 +404,35 @@ class AscendEnvChecker(EnvChecker):
|
|
|
|
|
raise EnvironmentError(
|
|
|
|
|
f"No such directory: {self.tbe_path}, Please check if Ascend AI software package (Ascend Data "
|
|
|
|
|
"Center Solution) is installed correctly.")
|
|
|
|
|
|
|
|
|
|
# check te version after set te env
|
|
|
|
|
|
|
|
|
|
# 检查te版本
|
|
|
|
|
self.check_deps_version()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# 设置op实现路径环境变量
|
|
|
|
|
if Path(self.op_impl_path).is_dir():
|
|
|
|
|
# python path for sub process
|
|
|
|
|
# python路径用于子进程
|
|
|
|
|
if os.getenv('PYTHONPATH'):
|
|
|
|
|
os.environ['PYTHONPATH'] = self.op_impl_path + ":" + os.environ['PYTHONPATH']
|
|
|
|
|
else:
|
|
|
|
|
os.environ['PYTHONPATH'] = self.op_impl_path
|
|
|
|
|
# sys path for this process
|
|
|
|
|
# sys路径用于当前进程
|
|
|
|
|
sys.path.append(self.op_impl_path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
os.environ['TBE_IMPL_PATH'] = self.op_impl_path
|
|
|
|
|
else:
|
|
|
|
|
raise EnvironmentError(
|
|
|
|
|
f"No such directory: {self.op_impl_path}, Please check if Ascend AI software package (Ascend Data "
|
|
|
|
|
"Center Solution) is installed correctly.")
|
|
|
|
|
|
|
|
|
|
f"No such directory: {self.op_impl_path}, Please check if Ascend AI software package (Ascend Data Center "
|
|
|
|
|
"Solution) is installed correctly.")
|
|
|
|
|
|
|
|
|
|
# 设置CCE路径环境变量
|
|
|
|
|
if Path(self.cce_path).is_dir():
|
|
|
|
|
os.environ['PATH'] = self.cce_path + ":" + os.environ['PATH']
|
|
|
|
|
else:
|
|
|
|
|
raise EnvironmentError(
|
|
|
|
|
f"No such directory: {self.cce_path}, Please check if Ascend AI software package (Ascend Data Center "
|
|
|
|
|
"Solution) is installed correctly.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# 设置OP路径环境变量
|
|
|
|
|
if self.op_path is None:
|
|
|
|
|
pass
|
|
|
|
|
elif Path(self.op_path).is_dir():
|
|
|
|
@ -363,7 +441,8 @@ class AscendEnvChecker(EnvChecker):
|
|
|
|
|
raise EnvironmentError(
|
|
|
|
|
f"No such directory: {self.op_path}, Please check if Ascend AI software package (Ascend Data Center "
|
|
|
|
|
"Solution) is installed correctly.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# 设置AICPU路径环境变量
|
|
|
|
|
if self.aicpu_path is None:
|
|
|
|
|
pass
|
|
|
|
|
elif Path(self.aicpu_path).is_dir():
|
|
|
|
@ -372,44 +451,54 @@ class AscendEnvChecker(EnvChecker):
|
|
|
|
|
raise EnvironmentError(
|
|
|
|
|
f"No such directory: {self.aicpu_path}, Please check if Ascend AI software package (Ascend Data Center"
|
|
|
|
|
" Solution) is installed correctly.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _check_env(self):
|
|
|
|
|
"""ascend dependence path check"""
|
|
|
|
|
# 检查是否设置正确的PATH环境变量
|
|
|
|
|
if self.path is None or self.path_check not in self.path:
|
|
|
|
|
logger.warning("Can not find ccec_compiler(need by mindspore-ascend), please check if you have set env "
|
|
|
|
|
"PATH, you can reference to the installation guidelines https://www.mindspore.cn/install")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# 检查是否设置正确的PYTHONPATH环境变量
|
|
|
|
|
if self.python_path is None or self.python_path_check not in self.python_path:
|
|
|
|
|
logger.warning(
|
|
|
|
|
"Can not find tbe op implement(need by mindspore-ascend), please check if you have set env "
|
|
|
|
|
"PYTHONPATH, you can reference to the installation guidelines "
|
|
|
|
|
"https://www.mindspore.cn/install")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# 检查是否设置正确的LD_LIBRARY_PATH环境变量
|
|
|
|
|
if self.ld_lib_path is None or not (self.ld_lib_path_check_fwk in self.ld_lib_path and
|
|
|
|
|
self.ld_lib_path_check_addons in self.ld_lib_path):
|
|
|
|
|
logger.warning("Can not find driver so(need by mindspore-ascend), please check if you have set env "
|
|
|
|
|
"LD_LIBRARY_PATH, you can reference to the installation guidelines "
|
|
|
|
|
"https://www.mindspore.cn/install")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# 检查是否设置正确的ASCEND_OPP_PATH环境变量
|
|
|
|
|
if self.ascend_opp_path is None or self.ascend_opp_path_check not in self.ascend_opp_path:
|
|
|
|
|
logger.warning(
|
|
|
|
|
"Can not find opp path (need by mindspore-ascend), please check if you have set env ASCEND_OPP_PATH, "
|
|
|
|
|
"you can reference to the installation guidelines https://www.mindspore.cn/install")
|
|
|
|
|
|
|
|
|
|
def _read_version(self, file_path):
|
|
|
|
|
"""get ascend version info"""
|
|
|
|
|
with open(file_path, 'r') as f:
|
|
|
|
|
all_info = f.readlines()
|
|
|
|
|
# 遍历文件中的每一行
|
|
|
|
|
for line in all_info:
|
|
|
|
|
# 检查行是否以 "Version=" 开头
|
|
|
|
|
if line.startswith("Version="):
|
|
|
|
|
# 去除行末的换行符并按 "=" 分割, 获取版本号
|
|
|
|
|
full_version = line.strip().split("=")[1]
|
|
|
|
|
# 提取主版本号和次版本号, 并用 "." 连接
|
|
|
|
|
self.v = '.'.join(full_version.split('.')[0:2])
|
|
|
|
|
# 返回版本号
|
|
|
|
|
return self.v
|
|
|
|
|
# 如果未找到版本信息, 返回 None 或默认值
|
|
|
|
|
return self.v
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def check_version_and_env_config():
|
|
|
|
|
"""check version and env config"""
|
|
|
|
|
"""检查版本和环境配置"""
|
|
|
|
|
# 检查包名以确定使用哪种环境检查器
|
|
|
|
|
if __package_name__.lower() == "mindspore-ascend":
|
|
|
|
|
env_checker = AscendEnvChecker()
|
|
|
|
|
# Note: pre-load libgomp.so to solve error like "cannot allocate memory in statis TLS block"
|
|
|
|
@ -425,19 +514,21 @@ def check_version_and_env_config():
|
|
|
|
|
else:
|
|
|
|
|
logger.info(f"Package version {__package_name__} does not need to check any environment variable, skipping.")
|
|
|
|
|
return
|
|
|
|
|
# 检查是否关闭版本检查,如果已关闭则直接返回
|
|
|
|
|
if os.getenv("MS_DEV_CLOSE_VERSION_CHECK") == "ON":
|
|
|
|
|
return
|
|
|
|
|
# 设置环境变量以关闭版本检查
|
|
|
|
|
os.environ["MS_DEV_CLOSE_VERSION_CHECK"] = "ON"
|
|
|
|
|
|
|
|
|
|
try:
|
|
|
|
|
# check version of ascend site or cuda
|
|
|
|
|
# 检查 ascend site 或 cuda 的版本
|
|
|
|
|
env_checker.check_version()
|
|
|
|
|
from .. import _c_expression # pylint: disable=unused-import
|
|
|
|
|
# 设置环境
|
|
|
|
|
env_checker.set_env()
|
|
|
|
|
except ImportError as e:
|
|
|
|
|
# 处理导入错误,检查环境
|
|
|
|
|
env_checker.check_env(e)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _set_pb_env():
|
|
|
|
|
"""Set env variable `PROTOCOL_BUFFERS` to prevent memory overflow."""
|
|
|
|
|
if os.getenv("PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION") == "cpp":
|
|
|
|
@ -449,7 +540,9 @@ def _set_pb_env():
|
|
|
|
|
logger.info("Setting the env `PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python` to prevent memory overflow "
|
|
|
|
|
"during save or load checkpoint file.")
|
|
|
|
|
os.environ["PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION"] = "python"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Check the version and the environment configuration.
|
|
|
|
|
check_version_and_env_config()
|
|
|
|
|
|
|
|
|
|
# Set the protocol buffers environment variable to prevent memory overflow.
|
|
|
|
|
_set_pb_env()
|
|
|
|
|