feat: support Ascend devices and NVIDIA usage modes (hami-core, MIG, MPS)

Signed-off-by: Nimbus318 <136771156+Nimbus318@users.noreply.github.com>
main
Nimbus318 7 months ago
parent 73d6a538e0
commit a9c25de696

@ -173,16 +173,32 @@ ul {
.nodeCard { .nodeCard {
height: 100%; height: 100%;
.pie { .pie {
width: 200px; width: 200px;
height: 200px; height: 200px;
margin: 0 auto; margin: 0 auto;
} }
.nodeCard-legend { .nodeCard-legend {
width: 100%; width: 100%;
display: flex; display: flex;
flex-direction: column; flex-direction: column;
gap: 15px; gap: 15px;
max-height: calc(3 * (12px + 15px));
overflow-y: auto;
padding-right: 10px;
/* 自定义滚动条样式(可选) */
&::-webkit-scrollbar {
width: 6px;
}
&::-webkit-scrollbar-thumb {
background-color: rgba(0, 0, 0, 0.2);
border-radius: 3px;
}
li { li {
display: flex; display: flex;
justify-content: space-between; justify-content: space-between;
@ -194,8 +210,8 @@ ul {
gap: 5px; gap: 5px;
} }
.color-box { .color-box {
width: 4px; width: 10px;
height: 4px; height: 10px;
display: inline-block; display: inline-block;
} }
} }

@ -196,6 +196,15 @@ const columns = [
label: '驱动版本', label: '驱动版本',
value: 'driver_version', value: 'driver_version',
}, },
{
label: '使用模式',
value: 'mode',
render: ({ mode, type }) => (
<el-tag disable-transitions>
{type?.split('-')[0] === "NVIDIA" ? mode : 'default'}
</el-tag>
)
}
]; ];
const cp = useInstantVector( const cp = useInstantVector(

@ -63,6 +63,15 @@ const columns = [
</el-tag> </el-tag>
) )
}, },
{
title: '使用模式',
dataIndex: 'mode',
render: ({ mode, type }) => (
<el-tag disable-transitions>
{type?.split('-')[0] === "NVIDIA" ? mode : 'default'}
</el-tag>
)
},
{ {
title: '所属节点', title: '所属节点',
dataIndex: 'nodeName', dataIndex: 'nodeName',

@ -42,11 +42,16 @@
</block-box> </block-box>
<block-box v-for="{ title, data } in lineConfig" :key="title" :title="title"> <block-box v-for="{ title, data } in lineConfig" :key="title" :title="title">
<template #extra> <template #extra v-if="detail.type && detail.type.startsWith('NVIDIA')">
<time-picker v-model="times" type="datetimerange" size="small" /> <time-picker v-model="times" type="datetimerange" size="small" />
</template> </template>
<div style="height: 200px"> <div style="height: 200px">
<echarts-plus :options="getLineOptions({ data })" /> <template v-if="detail.type && !detail.type.startsWith('NVIDIA')">
<el-empty description="该设备厂商暂不支持任务维度监控" :image-size="60" />
</template>
<template v-else>
<echarts-plus :options="getLineOptions({ data })" />
</template>
</div> </div>
</block-box> </block-box>
</template> </template>

@ -64,6 +64,7 @@ message GPUReply {
int32 memory_total = 9; int32 memory_total = 9;
string node_uid = 10; string node_uid = 10;
bool health = 11; bool health = 11;
string mode = 12;
} }
message GPUsReply { message GPUsReply {

@ -32,6 +32,7 @@ type DeviceInfo struct {
Devcore int32 Devcore int32
Type string Type string
Numa int Numa int
Mode string
Health bool Health bool
NodeName string NodeName string
NodeUid string NodeUid string

@ -86,6 +86,7 @@ func (r *nodeRepo) updateLocalNodes() {
Devcore: device.Devcore, Devcore: device.Devcore,
Type: device.Type, Type: device.Type,
Numa: device.Numa, Numa: device.Numa,
Mode: device.Mode,
Health: device.Health, Health: device.Health,
NodeName: node.Name, NodeName: node.Name,
NodeUid: string(node.UID), NodeUid: string(node.UID),

@ -438,6 +438,8 @@ func (s *MetricsGenerator) queryDeviceAdditional(ctx context.Context, provider,
switch provider { switch provider {
case biz.NvidiaGPUDevice: case biz.NvidiaGPUDevice:
query = fmt.Sprintf("DCGM_FI_DEV_POWER_USAGE{UUID=\"%s\"}", deviceUUID) query = fmt.Sprintf("DCGM_FI_DEV_POWER_USAGE{UUID=\"%s\"}", deviceUUID)
case biz.AscendGPUDevice:
query = fmt.Sprintf("npu_chip_info_power{vdie_id=\"%s\"}", deviceUUID)
case biz.CambriconGPUDevice: case biz.CambriconGPUDevice:
query = fmt.Sprintf("mlu_power_usage{uuid=\"%s\"}", deviceUUID) query = fmt.Sprintf("mlu_power_usage{uuid=\"%s\"}", deviceUUID)
case biz.HygonGPUDevice: case biz.HygonGPUDevice:
@ -462,6 +464,9 @@ func (s *MetricsGenerator) queryDeviceAdditional(ctx context.Context, provider,
case biz.CambriconGPUDevice: case biz.CambriconGPUDevice:
info.DriverVersion = metric["driver"] info.DriverVersion = metric["driver"]
info.DeviceNo = metric["sn"] info.DeviceNo = metric["sn"]
case biz.AscendGPUDevice:
info.DriverVersion = "暂无"
info.DeviceNo = "ascend-" + metric["id"]
case biz.HygonGPUDevice: case biz.HygonGPUDevice:
info.DriverVersion = "暂无" info.DriverVersion = "暂无"
info.DeviceNo = "dcu-" + metric["minor_number"] info.DeviceNo = "dcu-" + metric["minor_number"]

@ -8,17 +8,22 @@ const (
// AscendDeviceUseUUID lets the user specify which Ascend devices to use by UUID. // AscendDeviceUseUUID lets the user specify which Ascend devices to use by UUID.
AscendDeviceUseUUID = "huawei.com/use-ascenduuid" AscendDeviceUseUUID = "huawei.com/use-ascenduuid"
// AscendNoUseUUID lets the user specify which Ascend devices must not be used, by UUID. // AscendNoUseUUID lets the user specify which Ascend devices must not be used, by UUID.
AscendNoUseUUID = "huawei.com/nouse-ascenduuid" AscendNoUseUUID = "huawei.com/nouse-ascenduuid"
AscendResourceCoreCount = "huawei.com/Ascend910" Ascend910BNodeRegisterAnno = "hami.io/node-register-Ascend910B"
Ascend310PNodeRegisterAnno = "hami.io/node-register-Ascend310P"
) )
var ( var (
AscendResourceCount string AscendResourceCount string
AscendResourceMemory string AscendResourceMemory string
AscendResourceCores string AscendResourceCores string
AscendNodeRegisterAnnos []string
) )
func init() { func init() {
util.InRequestDevices[AscendDevice] = "hami.io/ascend-devices-to-allocate" AscendNodeRegisterAnnos = []string{Ascend910BNodeRegisterAnno, Ascend310PNodeRegisterAnno}
util.SupportDevices[AscendDevice] = "hami.io/ascend-devices-allocated" util.InRequestDevices[AscendDevice] = "hami.io/Ascend910B-devices-to-allocate"
util.SupportDevices[AscendDevice] = "hami.io/Ascend910B-devices-allocated"
util.InRequestDevices["Ascend310P"] = "hami.io/Ascend310P-devices-to-allocate"
util.SupportDevices["Ascend310P"] = "hami.io/Ascend310P-devices-allocated"
} }

@ -6,9 +6,8 @@ import (
"github.com/go-kratos/kratos/v2/log" "github.com/go-kratos/kratos/v2/log"
"github.com/prometheus/common/model" "github.com/prometheus/common/model"
corev1 "k8s.io/api/core/v1" corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
"k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/labels"
"strings" "strconv"
"vgpu/internal/data/prom" "vgpu/internal/data/prom"
"vgpu/internal/provider/util" "vgpu/internal/provider/util"
) )
@ -72,31 +71,26 @@ func (a *Ascend) GetDevicesFromPrometheus(node *corev1.Node) map[string]*util.De
} }
func (a *Ascend) FetchDevices(node *corev1.Node) ([]*util.DeviceInfo, error) { func (a *Ascend) FetchDevices(node *corev1.Node) ([]*util.DeviceInfo, error) {
for _, anno := range AscendNodeRegisterAnnos {
nodedevices := []*util.DeviceInfo{} tmpDevice := a.GetDevicesFromPrometheus(node)
i := 0 anno, ok := node.Annotations[anno]
cards, _ := node.Status.Capacity.Name(corev1.ResourceName(AscendResourceCoreCount), resource.DecimalSI).AsInt64() if !ok {
tmpDevice := a.GetDevicesFromPrometheus(node) log.Infof("anno %s not found", anno)
for int64(i)*10 < cards {
index := fmt.Sprintf("%d", i)
if _, ok := tmpDevice[index]; !ok {
i++
continue continue
} }
mode := strings.Split(tmpDevice[index].Type, "-") nodeDevices, err := util.UnMarshalNodeDevices(anno)
nodedevices = append(nodedevices, &util.DeviceInfo{ if err != nil {
Index: uint(i), return []*util.DeviceInfo{}, err
ID: tmpDevice[index].ID, }
AliasId: node.Name + "-Ascend910-" + fmt.Sprint(i), for i, nodedevice := range nodeDevices {
Count: 10, nodeDevices[i].AliasId = nodedevice.ID
Devmem: int32(65536), if device, exists := tmpDevice[strconv.Itoa(i)]; exists {
Devcore: 100, nodeDevices[i].ID = device.ID
Type: fmt.Sprintf("%s-%s", mode[1], mode[0]), } else {
Numa: 0, log.Infof("Key %d not found in tmpDevice", i)
Health: true, }
Driver: "xxx", }
}) return nodeDevices, nil
i++
} }
return nodedevices, nil return []*util.DeviceInfo{}, fmt.Errorf("")
} }

@ -97,6 +97,7 @@ type DeviceInfo struct {
Devcore int32 Devcore int32
Type string Type string
Numa int Numa int
Mode string
Health bool Health bool
Driver string Driver string
} }

@ -1,6 +1,7 @@
package util package util
import ( import (
"encoding/json"
"errors" "errors"
"fmt" "fmt"
"github.com/go-kratos/kratos/v2/log" "github.com/go-kratos/kratos/v2/log"
@ -60,12 +61,18 @@ func DecodeNodeDevices(str string, log *log.Helper) ([]*DeviceInfo, error) {
for _, val := range tmp { for _, val := range tmp {
if strings.Contains(val, ",") { if strings.Contains(val, ",") {
items := strings.Split(val, ",") items := strings.Split(val, ",")
if len(items) == 7 { if len(items) >= 7 || len(items) == 9 {
count, _ := strconv.Atoi(items[1]) count, _ := strconv.ParseInt(items[1], 10, 32)
devmem, _ := strconv.Atoi(items[2]) devmem, _ := strconv.ParseInt(items[2], 10, 32)
devcore, _ := strconv.Atoi(items[3]) devcore, _ := strconv.ParseInt(items[3], 10, 32)
health, _ := strconv.ParseBool(items[6]) health, _ := strconv.ParseBool(items[6])
numa, _ := strconv.Atoi(items[5]) numa, _ := strconv.Atoi(items[5])
mode := "hami-core"
index := 0
if len(items) == 9 {
index, _ = strconv.Atoi(items[7])
mode = items[8]
}
i := DeviceInfo{ i := DeviceInfo{
ID: items[0], ID: items[0],
AliasId: items[0], AliasId: items[0],
@ -75,6 +82,8 @@ func DecodeNodeDevices(str string, log *log.Helper) ([]*DeviceInfo, error) {
Type: items[4], Type: items[4],
Numa: numa, Numa: numa,
Health: health, Health: health,
Mode: mode,
Index: uint(index),
} }
retval = append(retval, &i) retval = append(retval, &i)
} else { } else {
@ -307,3 +316,9 @@ func DecodePodDevices(pod *corev1.Pod, log *log.Helper) (PodDevices, error) {
log.Infof("Decoded pod annos: poddevices %v", pd) log.Infof("Decoded pod annos: poddevices %v", pd)
return pd, nil return pd, nil
} }
// UnMarshalNodeDevices decodes a JSON array of device objects into a slice
// of DeviceInfo pointers.
//
// str is the raw JSON text to decode. On success the decoded slice is
// returned with a nil error; the JSON literal "null" decodes to a nil slice
// and a nil error. On failure the returned slice is always nil, so callers
// never observe a partially decoded list, and the returned error wraps the
// underlying encoding/json error so it can still be inspected with
// errors.Is / errors.As.
func UnMarshalNodeDevices(str string) ([]*DeviceInfo, error) {
	var dlist []*DeviceInfo
	if err := json.Unmarshal([]byte(str), &dlist); err != nil {
		// json.Unmarshal may have filled in some elements before hitting a
		// type error; discard them rather than hand back half a result.
		return nil, fmt.Errorf("unmarshal node devices: %w", err)
	}
	return dlist, nil
}

@ -50,6 +50,7 @@ func (s *CardService) GetAllGPUs(ctx context.Context, req *pb.GetAllGpusReq) (*p
gpu.MemoryTotal = device.Devmem gpu.MemoryTotal = device.Devmem
gpu.NodeUid = device.NodeUid gpu.NodeUid = device.NodeUid
gpu.Health = device.Health gpu.Health = device.Health
gpu.Mode = device.Mode
vGPU, core, memory, err := s.pod.StatisticsByDeviceId(ctx, device.AliasId) vGPU, core, memory, err := s.pod.StatisticsByDeviceId(ctx, device.AliasId)
if err == nil { if err == nil {
@ -120,6 +121,7 @@ func (s *CardService) GetGPU(ctx context.Context, req *pb.GetGpuReq) (*pb.GPURep
gpu.MemoryTotal = device.Devmem gpu.MemoryTotal = device.Devmem
gpu.NodeUid = device.NodeUid gpu.NodeUid = device.NodeUid
gpu.Health = device.Health gpu.Health = device.Health
gpu.Mode = device.Mode
vGPU, core, memory, err := s.pod.StatisticsByDeviceId(ctx, device.AliasId) vGPU, core, memory, err := s.pod.StatisticsByDeviceId(ctx, device.AliasId)
if err == nil { if err == nil {

@ -69,7 +69,7 @@ func (s *ContainerService) GetAllContainers(ctx context.Context, req *pb.GetAllC
deviceID = device.Id deviceID = device.Id
} }
if filters.DeviceId != "" && filters.DeviceId != deviceID { if filters.DeviceId != "" && !strings.HasPrefix(deviceID, filters.DeviceId) {
continue continue
} }

Loading…
Cancel
Save