You cannot select more than 25 topics
			Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
		
		
		
		
		
			
		
			
				
					
					
						
							101 lines
						
					
					
						
							2.5 KiB
						
					
					
				
			
		
		
	
	
							101 lines
						
					
					
						
							2.5 KiB
						
					
					
				| package mlu
 | |
| 
 | |
| import (
 | |
| 	"context"
 | |
| 	"fmt"
 | |
| 	"github.com/go-kratos/kratos/v2/log"
 | |
| 	"github.com/prometheus/common/model"
 | |
| 	corev1 "k8s.io/api/core/v1"
 | |
| 	"k8s.io/apimachinery/pkg/api/resource"
 | |
| 	"k8s.io/apimachinery/pkg/labels"
 | |
| 	"vgpu/internal/data/prom"
 | |
| 	"vgpu/internal/provider/util"
 | |
| )
 | |
| 
 | |
| type Cambricon struct {
 | |
| 	prom *prom.Client
 | |
| 	log  *log.Helper
 | |
| 
 | |
| 	labelsSelector string
 | |
| }
 | |
| 
 | |
| func NewCambricon(prom *prom.Client, log *log.Helper, labelSelector string) *Cambricon {
 | |
| 	return &Cambricon{
 | |
| 		prom:           prom,
 | |
| 		log:            log,
 | |
| 		labelsSelector: labelSelector,
 | |
| 	}
 | |
| }
 | |
| 
 | |
| func (c *Cambricon) GetNodeDevicePluginLabels() (labels.Selector, error) {
 | |
| 	return labels.Parse(c.labelsSelector)
 | |
| }
 | |
| 
 | |
| func (c *Cambricon) GetProvider() string {
 | |
| 	return CambriconMLUDevice
 | |
| }
 | |
| 
 | |
// DeviceMeta groups three string attributes of a device: its UUID, its type
// and its driver.
// NOTE(review): no code in this part of the file uses DeviceMeta; confirm
// whether it is still needed.
type DeviceMeta struct {
	UUID, Type, Driver string
}
 | |
| 
 | |
| func (c *Cambricon) GetDevicesFromPrometheus(node *corev1.Node) map[string]*util.DeviceInfo {
 | |
| 	device := make(map[string]*util.DeviceInfo)
 | |
| 	queryString := fmt.Sprintf("mlu_health{node=\"%s\"}", node.Name)
 | |
| 	vs, err := c.prom.Query(context.Background(), queryString)
 | |
| 	if err != nil {
 | |
| 		c.log.Warnf("query %s failed", queryString)
 | |
| 	} else {
 | |
| 		ds, ok := vs.(model.Vector)
 | |
| 		if !ok {
 | |
| 			c.log.Warnf("vectorValue: %v, failed", vs)
 | |
| 		} else {
 | |
| 			for _, d := range ds {
 | |
| 				id := d.Metric["mlu"]
 | |
| 				health := false
 | |
| 				if d.Value.Equal(1) {
 | |
| 					health = true
 | |
| 				}
 | |
| 				device[string(id)] = &util.DeviceInfo{
 | |
| 					ID:     string(d.Metric["uuid"]),
 | |
| 					Type:   string(d.Metric["model"]),
 | |
| 					Driver: string(d.Metric["driver"]),
 | |
| 					Health: health,
 | |
| 				}
 | |
| 			}
 | |
| 		}
 | |
| 	}
 | |
| 	return device
 | |
| }
 | |
| 
 | |
| func (c *Cambricon) FetchDevices(node *corev1.Node) ([]*util.DeviceInfo, error) {
 | |
| 	nodedevices := []*util.DeviceInfo{}
 | |
| 	i := 0
 | |
| 	cards, _ := node.Status.Capacity.Name(corev1.ResourceName(CambriconDeviceCoreAnnos), resource.DecimalSI).AsInt64()
 | |
| 	memoryTotal, _ := node.Status.Capacity.Name(corev1.ResourceName(CambriconDeviceMemAnnos), resource.DecimalSI).AsInt64()
 | |
| 	tmpDevice := c.GetDevicesFromPrometheus(node)
 | |
| 	for int64(i)*100 < cards {
 | |
| 		index := fmt.Sprintf("%d", i)
 | |
| 		if _, ok := tmpDevice[index]; !ok {
 | |
| 			i++
 | |
| 			continue
 | |
| 		}
 | |
| 		nodedevices = append(nodedevices, &util.DeviceInfo{
 | |
| 			Index:   uint(i),
 | |
| 			ID:      tmpDevice[index].ID,
 | |
| 			AliasId: node.Name + "-cambricon-mlu-" + fmt.Sprint(i),
 | |
| 			Count:   10,
 | |
| 			Devmem:  int32(memoryTotal * CambriconMemUnit * 100 / cards),
 | |
| 			Devcore: 100,
 | |
| 			Type:    tmpDevice[index].Type,
 | |
| 			Numa:    0,
 | |
| 			Health:  tmpDevice[index].Health,
 | |
| 			Driver:  tmpDevice[index].Driver,
 | |
| 		})
 | |
| 		i++
 | |
| 	}
 | |
| 	return nodedevices, nil
 | |
| }
 |