package exporter import ( "github.com/prometheus/client_golang/prometheus" ) func init() { // 卡维度指标 prometheus.MustRegister(HamiVCoreScaling) // 算力超分倍数 prometheus.MustRegister(HamiVMemoryScaling) // 显存超分倍数 prometheus.MustRegister(HamiVgpuCount) // 虚拟vgpu设备数 prometheus.MustRegister(HamiVmemorySize) // 虚拟显存大小(MB) prometheus.MustRegister(HamiVcoreSize) // 虚拟算力大小 prometheus.MustRegister(HamiMemorySize) // 真实显存总量(MB) prometheus.MustRegister(HamiMemoryUsed) // 真实显存已使用(MB) prometheus.MustRegister(HamiMemoryUtil) // 真实显存利用率 prometheus.MustRegister(HamiCoreSize) // 真实算力总量 prometheus.MustRegister(HamiCoreUsed) // 真实算力已使用 prometheus.MustRegister(HamiCoreUtil) // 真实算力利用率 prometheus.MustRegister(HamiCoreUsedAvg) // 真实算力已使用周期平均 prometheus.MustRegister(HamiCoreUtilAvg) // 真实算力利用率周期平均 prometheus.MustRegister(HamiDeviceTemperature) // 显卡温度 prometheus.MustRegister(HamiDeviceMemoryTemperature) // 显存温度 prometheus.MustRegister(HamiDevicePower) // 显卡功耗 prometheus.MustRegister(HamiDeviceFanSpeedP) // 风扇转速(百分比) prometheus.MustRegister(HamiDeviceFanSpeedR) // 风扇转速(每分钟转速) prometheus.MustRegister(HamiDeviceHardwareHealth) // 显卡健康状态 // 任务维度指标 prometheus.MustRegister(HamiContainerVgpuAllocated) // 任务申请的vgpu设备数 prometheus.MustRegister(HamiContainerVmemoryAllocated) // 任务申请的vmemory prometheus.MustRegister(HamiContainerVcoreAllocated) // 任务申请的vcore prometheus.MustRegister(HamiContainerMemoryUsed) // 任务实际使用的显存大小(MB) prometheus.MustRegister(HamiContainerMemoryUtil) // 任务实际使用的显存占任务申请的比例 prometheus.MustRegister(HamiContainerCoreUsed) // 任务实际使用的算力占卡的百分比 prometheus.MustRegister(HamiContainerCoreUtil) // 任务实际使用的算力占任务申请的比例 // 资源池维度指标 prometheus.MustRegister(HamiPoolVcoreSize) // 资源池总算力大小 prometheus.MustRegister(HamiPoolVgpuCount) // 资源池总vgpu设备数 prometheus.MustRegister(HamiPoolVmemorySize) // 资源池总显存大小 prometheus.MustRegister(HamiSystemComponentHealth) // 系统组件健康状态 } func reset() { HamiVCoreScaling.Reset() HamiVMemoryScaling.Reset() HamiVgpuCount.Reset() HamiVmemorySize.Reset() HamiVcoreSize.Reset() HamiMemoryUsed.Reset() HamiMemorySize.Reset() HamiMemoryUtil.Reset() HamiCoreSize.Reset() HamiCoreUsed.Reset() HamiCoreUtil.Reset() HamiCoreUsedAvg.Reset() HamiCoreUtilAvg.Reset() HamiDeviceTemperature.Reset() HamiDeviceMemoryTemperature.Reset() HamiDevicePower.Reset() HamiDeviceFanSpeedP.Reset() HamiDeviceFanSpeedR.Reset() HamiContainerVgpuAllocated.Reset() HamiContainerVmemoryAllocated.Reset() HamiContainerVcoreAllocated.Reset() HamiContainerMemoryUsed.Reset() HamiContainerMemoryUtil.Reset() HamiContainerCoreUsed.Reset() HamiContainerCoreUtil.Reset() HamiPoolVgpuCount.Reset() HamiPoolVmemorySize.Reset() HamiPoolVcoreSize.Reset() } var ( HamiVCoreScaling = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "hami_vcore_scaling", Help: "GPU virtual core Scaling", }, []string{"node", "provider", "devicetype", "deviceuuid", "driver_version", "device_no"}) HamiVMemoryScaling = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "hami_vmemory_scaling", Help: "GPU virtual memory Scaling", }, []string{"node", "provider", "devicetype", "deviceuuid", "driver_version", "device_no"}) HamiVgpuCount = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "hami_vgpu_count", Help: "Total vGPU count", }, []string{"node", "provider", "devicetype", "deviceuuid", "driver_version", "device_no"}) HamiVmemorySize = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "hami_vmemory_size", Help: "Total vMemory size", }, []string{"node", "provider", "devicetype", "deviceuuid", "driver_version", "device_no"}) HamiVcoreSize = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "hami_vcore_size", Help: "Total vCore size", }, []string{"node", "provider", "devicetype", "deviceuuid", "driver_version", "device_no"}) HamiMemoryUsed = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "hami_memory_used", Help: "Actual memory usage, unit is 'MB' ", }, []string{"node", "provider", "devicetype", "deviceuuid", "driver_version", "device_no"}) HamiMemorySize = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "hami_memory_size", Help: "Actual memory size, unit is 'MB' ", }, []string{"node", "provider", "devicetype", "deviceuuid", "driver_version", "device_no"}) HamiMemoryUtil = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "hami_memory_util", Help: "Actual Memory Util percent 0-100", }, []string{"node", "provider", "devicetype", "deviceuuid", "driver_version", "device_no"}) HamiCoreSize = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "hami_core_size", Help: "Actual core size", }, []string{"node", "provider", "devicetype", "deviceuuid", "driver_version", "device_no"}) HamiCoreUsed = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "hami_core_used", Help: "Actual Core Used", }, []string{"node", "provider", "devicetype", "deviceuuid", "driver_version", "device_no"}) HamiCoreUtil = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "hami_core_util", Help: "Actual Core Util percent 0-100", }, []string{"node", "provider", "devicetype", "deviceuuid", "driver_version", "device_no"}) HamiCoreUsedAvg = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "hami_core_used_avg", Help: "Actual Core Used period avg", }, []string{"node", "provider", "devicetype", "deviceuuid", "driver_version", "device_no"}) HamiCoreUtilAvg = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "hami_core_util_avg", Help: "Actual Core Util percent 0-100 period avg", }, []string{"node", "provider", "devicetype", "deviceuuid", "driver_version", "device_no"}) HamiDeviceTemperature = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "hami_device_temperature", Help: "gpu temperature", }, []string{"node", "provider", "devicetype", "deviceuuid", "driver_version", "device_no"}) HamiDeviceMemoryTemperature = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "hami_device_memory_temperature", Help: "gpu memory temperature", }, []string{"node", "provider", "devicetype", "deviceuuid", "driver_version", "device_no"}) HamiDevicePower = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "hami_device_power", Help: "gpu power", }, []string{"node", "provider", "devicetype", "deviceuuid", "driver_version", "device_no"}) HamiDeviceHardwareHealth = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "hami_device_hardware_health", Help: "gpu hardware health", }, []string{"node", "provider", "devicetype", "deviceuuid", "driver_version", "device_no"}) HamiDeviceFanSpeedP = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "hami_device_fan_speed_p", Help: "gpu fan speed percent 0-100", }, []string{"node", "provider", "devicetype", "deviceuuid", "driver_version", "device_no"}) HamiDeviceFanSpeedR = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "hami_device_fan_speed_r", Help: "gpu fan speed rpm", }, []string{"node", "provider", "devicetype", "deviceuuid", "driver_version", "device_no"}) HamiContainerVgpuAllocated = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "hami_container_vgpu_allocated", Help: "task allocated vGPU count", }, []string{"node", "provider", "devicetype", "deviceuuid", "pod_name", "container_name", "namespace_name", "container_pod_uuid"}) HamiContainerVmemoryAllocated = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "hami_container_vmemory_allocated", Help: "task allocated vMemory size", }, []string{"node", "provider", "devicetype", "deviceuuid", "pod_name", "container_name", "namespace_name", "container_pod_uuid"}) HamiContainerVcoreAllocated = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "hami_container_vcore_allocated", Help: "task allocated vCore size", }, []string{"node", "provider", "devicetype", "deviceuuid", "pod_name", "container_name", "namespace_name", "container_pod_uuid"}) HamiContainerMemoryUsed = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "hami_container_memory_used", Help: "task used memory unit MB", }, []string{"node", "provider", "devicetype", "deviceuuid", "pod_name", "container_name", "namespace_name"}) HamiContainerMemoryUtil = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "hami_container_memory_util", Help: "task memory util percent 0-100", }, []string{"node", "provider", "devicetype", "deviceuuid", "pod_name", "container_name", "namespace_name"}) HamiContainerCoreUsed = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "hami_container_core_used", Help: "task used core ", }, []string{"node", "provider", "devicetype", "deviceuuid", "pod_name", "container_name", "namespace_name"}) HamiContainerCoreUtil = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "hami_container_core_util", Help: "task core util percent 0-100", }, []string{"node", "provider", "devicetype", "deviceuuid", "pod_name", "container_name", "namespace_name"}) HamiPoolVgpuCount = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "hami_pool_vgpu_count", Help: "Pool total vGPU count", }, []string{"pool"}) HamiPoolVmemorySize = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "hami_pool_vmemory_size", Help: "Pool total vMemory size", }, []string{"pool"}) HamiPoolVcoreSize = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "hami_pool_vcore_size", Help: "Pool total vCore size", }, []string{"pool"}) HamiSystemComponentHealth = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Name: "hami_system_component_health", Help: "system component health", }, []string{"component"}) Reset = true )