fix(node): 资源池移除节点接口,显卡管理,任务管理

main
youys 6 days ago
parent dbef1be45f
commit 02028e09e1

@ -65,6 +65,7 @@ message GPUReply {
string node_uid = 10; string node_uid = 10;
bool health = 11; bool health = 11;
string mode = 12; string mode = 12;
repeated string resource_pools = 13;
} }
message GPUsReply { message GPUsReply {

@ -61,11 +61,16 @@ message ContainerReply {
string end_time = 11; string end_time = 11;
string pod_uid = 12; string pod_uid = 12;
string node_uid = 13; string node_uid = 13;
string resource_pool = 14; repeated string resource_pool = 14;
string flavor = 15; string flavor = 15;
string priority = 16; string priority = 16;
string namespace = 17; string namespace = 17;
repeated string device_ids = 18; repeated string device_ids = 18;
string pod_name = 19;
string task_type = 20;
string shixun_name = 21;
string role = 22;
string username = 23;
} }
message ContainersReply { message ContainersReply {

@ -41,6 +41,16 @@ service ResourcePool {
}; };
} }
// RemoveNode removes a single node, identified by RemoveNodeRequest.node_id.
// Exposed over HTTP as POST /v1/resource/pool/removeNode with the whole
// request message taken from the JSON body.
rpc RemoveNode (RemoveNodeRequest) returns (BaseResponse) {
option (google.api.http) = {
post: "/v1/resource/pool/removeNode",
body: "*"
};
option (grpc.gateway.protoc_gen_openapiv2.options.openapiv2_operation) = {
summary: "移除节点";
};
}
rpc List (ResourcePoolListRequest) returns (ResourcePoolListResponse) { rpc List (ResourcePoolListRequest) returns (ResourcePoolListResponse) {
option (google.api.http) = { option (google.api.http) = {
get: "/v1/resource/pool/list" get: "/v1/resource/pool/list"
@ -52,7 +62,7 @@ service ResourcePool {
rpc GetDetail (ResourcePoolDetailRequest) returns (ResourcePoolDetailResponse) { rpc GetDetail (ResourcePoolDetailRequest) returns (ResourcePoolDetailResponse) {
option (google.api.http) = { option (google.api.http) = {
get: "/v1/resource/pool/detail" post: "/v1/resource/pool/detail"
}; };
option (grpc.gateway.protoc_gen_openapiv2.options.openapiv2_operation) = { option (grpc.gateway.protoc_gen_openapiv2.options.openapiv2_operation) = {
summary: "资源池详情"; summary: "资源池详情";
@ -67,6 +77,7 @@ service ResourcePool {
summary: "可用节点列表"; summary: "可用节点列表";
}; };
} }
} }
message BaseResponse { message BaseResponse {
@ -118,6 +129,7 @@ message PoolNodeReply {
string architecture = 20; string architecture = 20;
string creation_timestamp = 21; string creation_timestamp = 21;
int64 disk_size = 22; int64 disk_size = 22;
int64 node_id = 23;
} }
message ResourcePoolDetailRequest { message ResourcePoolDetailRequest {
@ -137,7 +149,7 @@ message ResourcePoolListData{
int64 available_memory = 6;//kb int64 available_memory = 6;//kb
int64 total_memory = 7; // kb int64 total_memory = 7; // kb
int64 disk_size = 8; int64 disk_size = 8;
repeated string node_list = 9; repeated Nodes node_list = 9;
} }
message ResourcePoolListRequest { message ResourcePoolListRequest {
@ -167,4 +179,8 @@ message AvailableNodesInfo{
} }
// RemoveNodeRequest is the request body of ResourcePool.RemoveNode.
message RemoveNodeRequest{
// Id of the node record to remove; the service passes it to
// database.DeleteNodeById, which deletes the matching row of the nodes table.
int64 node_id = 1;
}

@ -34,6 +34,7 @@ func main() {
flag.Parse() flag.Parse()
var ctx = context.Background() var ctx = context.Background()
database.InitConfigPath(flagconf)
if err := initDatabase(); err != nil { if err := initDatabase(); err != nil {
log.Errorf("数据库初始化失败: %v", err) log.Errorf("数据库初始化失败: %v", err)
os.Exit(1) os.Exit(1)
@ -71,13 +72,22 @@ func getNodeSelectors(c *conf.Bootstrap) map[string]string {
} }
func initDatabase() error { func initDatabase() error {
config, err := database.LoadConfig(flagconf) driver, err := database.Get("database.driver")
log.Infof("config: %+v", config)
if err != nil { if err != nil {
log.Errorf("Failed to load config: %v", err) log.Errorf("Failed to load config: %v", err)
return err return err
} }
database.InitDB(&config.Database)
log.Infof("初始化%s成功", config.Database.Driver) dataSourceName, err := database.Get("database.dataSourceName")
if err != nil {
log.Errorf("Failed to load config: %v", err)
return err
}
var config = &database.DatabaseConfig{}
config.Driver = driver.(string)
config.DataSourceName = dataSourceName.(string)
database.InitDB(config)
log.Infof("初始化%s成功", driver)
return nil return nil
} }

@ -16,3 +16,5 @@ node_selectors:
database: database:
driver: mysql driver: mysql
dataSourceName: testeducoder:TEST@123@tcp(testeducoder-public.mysql.polardb.rds.aliyuncs.com:3306)/hami?parseTime=true&loc=Local dataSourceName: testeducoder:TEST@123@tcp(testeducoder-public.mysql.polardb.rds.aliyuncs.com:3306)/hami?parseTime=true&loc=Local
web_domain: http://172.16.100.14
big_model_resource_pool_name: "大模型资源池"

@ -21,6 +21,7 @@ type Container struct {
Priority string Priority string
NodeUID string NodeUID string
Namespace string Namespace string
TpiID string
} }
type PodInfo struct { type PodInfo struct {
@ -31,6 +32,7 @@ type PodInfo struct {
Devices PodDevices Devices PodDevices
CtrIDs []string CtrIDs []string
Ctrs []*Container Ctrs []*Container
Labels map[string]string
} }
type PodRepo interface { type PodRepo interface {

@ -22,6 +22,7 @@ type podRepo struct {
data *Data data *Data
podLister listerscorev1.PodLister podLister listerscorev1.PodLister
pods map[k8stypes.UID]*biz.PodInfo pods map[k8stypes.UID]*biz.PodInfo
allPods []*biz.PodInfo
mutex sync.RWMutex mutex sync.RWMutex
log *log.Helper log *log.Helper
} }
@ -30,6 +31,7 @@ func NewPodRepo(data *Data, logger log.Logger) biz.PodRepo {
repo := &podRepo{ repo := &podRepo{
data: data, data: data,
pods: make(map[k8stypes.UID]*biz.PodInfo), pods: make(map[k8stypes.UID]*biz.PodInfo),
allPods: []*biz.PodInfo{},
log: log.NewHelper(logger), log: log.NewHelper(logger),
} }
repo.init() repo.init()
@ -91,8 +93,9 @@ func (r *podRepo) addPod(pod *corev1.Pod, nodeID string, devices biz.PodDevices)
r.mutex.Lock() r.mutex.Lock()
defer r.mutex.Unlock() defer r.mutex.Unlock()
ctrs := r.fetchContainerInfo(pod) ctrs := r.fetchContainerInfo(pod)
pi := &biz.PodInfo{Name: pod.Name, UID: pod.UID, Namespace: pod.Namespace, NodeID: nodeID, Devices: devices, Ctrs: ctrs} pi := &biz.PodInfo{Name: pod.Name, UID: pod.UID, Namespace: pod.Namespace, NodeID: nodeID, Devices: devices, Ctrs: ctrs, Labels: pod.Labels}
r.pods[pod.UID] = pi r.pods[pod.UID] = pi
r.allPods = append(r.allPods, pi)
r.log.Infof("Pod added: Name: %s, UID: %s, Namespace: %s, NodeID: %s", pod.Name, pod.UID, pod.Namespace, nodeID) r.log.Infof("Pod added: Name: %s, UID: %s, Namespace: %s, NodeID: %s", pod.Name, pod.UID, pod.Namespace, nodeID)
} }
@ -185,7 +188,11 @@ func (r *podRepo) GetStartTime(pod *corev1.Pod) time.Time {
func (r *podRepo) ListAll(context.Context) ([]*biz.Container, error) { func (r *podRepo) ListAll(context.Context) ([]*biz.Container, error) {
var containerList []*biz.Container var containerList []*biz.Container
for _, pod := range r.pods { for _, pod := range r.pods {
containerList = append(containerList, pod.Ctrs...) TpiID := pod.Labels["tpi-id"]
for _, container := range pod.Ctrs {
container.TpiID = TpiID
containerList = append(containerList, container)
}
} }
return containerList, nil return containerList, nil
} }

@ -1,31 +1,97 @@
package database package database
import ( import (
"encoding/json"
"fmt" "fmt"
"github.com/go-kratos/kratos/v2/log"
"gopkg.in/yaml.v3" "gopkg.in/yaml.v3"
"os" "os"
"strings"
"sync"
) )
type DatabaseConfig struct { var (
Driver string `yaml:"driver"` configData map[string]interface{}
DataSourceName string `yaml:"dataSourceName"` loadOnce sync.Once
} loadErr error
configPath string
)
type Config struct { // InitConfigPath 设置配置路径(可选)
Database DatabaseConfig `yaml:"database"` func InitConfigPath(path string) {
configPath = path
} }
func LoadConfig(filePath string) (*Config, error) { // loadYAML 加载 YAML 到 map
func loadYAML(filePath string) (map[string]interface{}, error) {
yamlFile, err := os.ReadFile(filePath) yamlFile, err := os.ReadFile(filePath)
if err != nil { if err != nil {
return nil, fmt.Errorf("failed to read config file: %v", err) return nil, fmt.Errorf("failed to read config file: %v", err)
} }
var config Config var raw map[string]interface{}
err = yaml.Unmarshal(yamlFile, &config) err = yaml.Unmarshal(yamlFile, &raw)
if err != nil { if err != nil {
return nil, fmt.Errorf("failed to unmarshal config file: %v", err) return nil, fmt.Errorf("failed to unmarshal yaml: %v", err)
}
b, _ := json.MarshalIndent(raw, "", " ")
log.Info("loadYAML: ", string(b))
converted, ok := toStringKeyMap(raw).(map[string]interface{})
if !ok {
return nil, fmt.Errorf("failed to convert config to map[string]interface{}")
} }
return &config, nil return converted, nil
}
// GetConfig returns the process-wide configuration map. The YAML file is read
// at most once; every later call returns the same map and the same error that
// the first load produced. When no path was set through InitConfigPath the
// loader falls back to "config.yaml".
func GetConfig() (map[string]interface{}, error) {
	loadOnce.Do(func() {
		if len(configPath) == 0 {
			configPath = "config.yaml"
		}
		data, err := loadYAML(configPath)
		configData, loadErr = data, err
	})
	return configData, loadErr
}
// Get looks up a nested configuration value by a dot-separated path, for
// example Get("database.driver").
//
// It returns an "invalid path" error when an intermediate element of the path
// is not a map, and a "key not found" error when a segment is missing.
func Get(key string) (interface{}, error) {
	root, err := GetConfig()
	if err != nil {
		return nil, err
	}

	var current interface{} = root
	for _, segment := range strings.Split(key, ".") {
		node, isMap := current.(map[string]interface{})
		if !isMap {
			return nil, fmt.Errorf("invalid path: %s", key)
		}
		next, found := node[segment]
		if !found {
			return nil, fmt.Errorf("key not found: %s", key)
		}
		current = next
	}
	return current, nil
}
// toStringKeyMap walks a decoded YAML value and returns it with every map
// converted to map[string]interface{}, so that Get can descend through it.
//
//   - map[interface{}]interface{} is copied into a new map whose keys are the
//     "%v" rendering of the original keys.
//   - map[string]interface{} and []interface{} are updated in place and
//     returned; their elements are converted recursively.
//   - any other value is returned unchanged.
func toStringKeyMap(i interface{}) interface{} {
	switch x := i.(type) {
	case map[interface{}]interface{}:
		m2 := make(map[string]interface{}, len(x))
		for k, v := range x {
			m2[fmt.Sprintf("%v", k)] = toStringKeyMap(v)
		}
		return m2
	case map[string]interface{}:
		// A string-keyed map can still hold non-string-keyed maps further
		// down; without this case they were never converted and Get reported
		// "invalid path" for anything below them.
		for k, v := range x {
			x[k] = toStringKeyMap(v)
		}
		return x
	case []interface{}:
		for idx, v := range x {
			x[idx] = toStringKeyMap(v)
		}
		return x
	}
	return i
}

@ -8,6 +8,15 @@ import (
var db *sql.DB var db *sql.DB
// DatabaseConfig holds the connection settings that are handed to InitDB.
type DatabaseConfig struct {
// Driver is the value InitDB switches on to choose the database backend.
Driver string `yaml:"driver"`
// DataSourceName is the connection string, read from the "dataSourceName" key.
DataSourceName string `yaml:"dataSourceName"`
}
// Config mirrors the part of the YAML configuration file that lives under the
// top-level "database" key.
type Config struct {
Database DatabaseConfig `yaml:"database"`
}
func InitDB(config *DatabaseConfig) { func InitDB(config *DatabaseConfig) {
var err error var err error
switch config.Driver { switch config.Driver {

@ -180,6 +180,37 @@ func QueryResourceNamesByIp(nodeIp string) ([]string, error) {
return resourcePoolNames, nil return resourcePoolNames, nil
} }
// QueryResourceNamesByNodeName returns the names of all resource pools that
// contain a node with the given node name. The result is an empty, non-nil
// slice when the node belongs to no pool.
func QueryResourceNamesByNodeName(nodeName string) ([]string, error) {
	const query = "select pool_name from resource_pool where id in (select distinct pool_id from nodes where node_name=?)"

	rows, err := db.Query(query, nodeName)
	if err != nil {
		log.Infof("Query failed: %v", err)
		return nil, err
	}
	defer rows.Close()

	// Collect one pool name per returned row.
	names := make([]string, 0)
	for rows.Next() {
		var poolName string
		if scanErr := rows.Scan(&poolName); scanErr != nil {
			log.Infof("Scan failed: %v", scanErr)
			return nil, scanErr
		}
		names = append(names, poolName)
	}

	// Surface any error that ended the iteration early.
	if iterErr := rows.Err(); iterErr != nil {
		return nil, iterErr
	}
	return names, nil
}
func InsertResourcePool(poolName string) (int64, error) { func InsertResourcePool(poolName string) (int64, error) {
querySql := "INSERT INTO resource_pool(pool_name) VALUES (?)" querySql := "INSERT INTO resource_pool(pool_name) VALUES (?)"
@ -272,3 +303,18 @@ func DeleteNodesByPoolId(poolId int64) (int64, error) {
return rowsAffected, nil return rowsAffected, nil
} }
// DeleteNodeById deletes the row of the nodes table whose id equals nodeId.
// It returns the number of rows affected; 0 means no row matched.
func DeleteNodeById(nodeId int64) (int64, error) {
	res, execErr := db.Exec("DELETE FROM nodes WHERE id = ?", nodeId)
	if execErr != nil {
		return 0, fmt.Errorf("delete failed: %w", execErr)
	}

	affected, countErr := res.RowsAffected()
	if countErr != nil {
		return 0, fmt.Errorf("get rows affected failed: %w", countErr)
	}
	return affected, nil
}

@ -7,6 +7,7 @@ import (
"strings" "strings"
pb "vgpu/api/v1" pb "vgpu/api/v1"
"vgpu/internal/biz" "vgpu/internal/biz"
"vgpu/internal/database"
) )
type CardService struct { type CardService struct {
@ -51,6 +52,11 @@ func (s *CardService) GetAllGPUs(ctx context.Context, req *pb.GetAllGpusReq) (*p
gpu.NodeUid = device.NodeUid gpu.NodeUid = device.NodeUid
gpu.Health = device.Health gpu.Health = device.Health
gpu.Mode = device.Mode gpu.Mode = device.Mode
resourcePoolNames, err := database.QueryResourceNamesByNodeName(device.NodeName)
if err != nil {
return nil, err
}
gpu.ResourcePools = resourcePoolNames
vGPU, core, memory, err := s.pod.StatisticsByDeviceId(ctx, device.AliasId) vGPU, core, memory, err := s.pod.StatisticsByDeviceId(ctx, device.AliasId)
if err == nil { if err == nil {

@ -2,11 +2,16 @@ package service
import ( import (
"context" "context"
"encoding/json"
"github.com/go-kratos/kratos/v2/log"
"slices"
"sort" "sort"
"strings" "strings"
"time" "time"
pb "vgpu/api/v1" pb "vgpu/api/v1"
"vgpu/internal/biz" "vgpu/internal/biz"
"vgpu/internal/database"
"vgpu/internal/utils"
) )
var statusOrder = map[string]int{ var statusOrder = map[string]int{
@ -86,6 +91,32 @@ func (s *ContainerService) GetAllContainers(ctx context.Context, req *pb.GetAllC
if containerReply.DeviceIds == nil { if containerReply.DeviceIds == nil {
continue continue
} }
resourcePoolNames, err := database.QueryResourceNamesByNodeName(container.NodeName)
if err != nil {
return nil, err
}
containerReply.ResourcePool = resourcePoolNames
resourcePoolName, err := database.Get("big_model_resource_pool_name")
if err != nil {
return nil, err
}
if slices.Contains(resourcePoolNames, resourcePoolName.(string)) {
containerReply.TaskType = "big_model"
} else {
containerReply.TaskType = "shixun"
}
if len(container.TpiID) > 0 {
err := s.setShixunData(ctx, containerReply, container.TpiID)
if err != nil {
return nil, err
}
}
containerReply.PodName = container.PodName
containerReply.CreateTime = container.CreateTime.Format(time.RFC3339) containerReply.CreateTime = container.CreateTime.Format(time.RFC3339)
res.Items = append(res.Items, containerReply) res.Items = append(res.Items, containerReply)
} }
@ -127,3 +158,34 @@ func (s *ContainerService) GetContainer(ctx context.Context, req *pb.GetContaine
ctrReply.CreateTime = container.CreateTime.Format(time.RFC3339) ctrReply.CreateTime = container.CreateTime.Format(time.RFC3339)
return ctrReply, nil return ctrReply, nil
} }
// setShixunData fills ShixunName, Role and Username on containerReply from the
// shixun info endpoint of the configured web domain.
//
// It POSTs {"tpiID": tpiId} to <web_domain>/api/myshixuns/get_shixun_info.json
// and copies the fields only when the response carries a numeric "status" of 0
// and an object "data". Any other well-formed response leaves containerReply
// unchanged and returns nil.
//
// An error is returned when the web_domain key cannot be read, when the HTTP
// call fails, or when the response body is not valid JSON.
func (s *ContainerService) setShixunData(ctx context.Context, containerReply *pb.ContainerReply, tpiId string) error {
	webDomain, err := database.Get("web_domain")
	if err != nil {
		return err
	}
	baseURL, ok := webDomain.(string)
	if !ok {
		// The unchecked assertion used to panic on a non-string value; the
		// lookup is skipped instead so the container is still returned.
		log.Errorf("web_domain must be a string, got %T", webDomain)
		return nil
	}

	client := utils.GetDefaultClient()
	endpoint := baseURL + "/api/myshixuns/get_shixun_info.json"
	log.Info("Get shixun info url: ", endpoint, " tpiId: ", tpiId)

	payload := map[string]interface{}{
		"tpiID": tpiId,
	}
	body, status, err := client.PostJSON(ctx, endpoint, payload, nil)
	if err != nil {
		return err
	}
	log.Infof("Get shixun info: %s, status: %d", string(body), status)

	var respMap map[string]interface{}
	// The decode error was previously ignored, so a non-JSON body led to a
	// panic on the type assertion that followed.
	if err := json.Unmarshal(body, &respMap); err != nil {
		return err
	}
	log.Info("Get shixun info: ", respMap, "----", respMap["status"])

	// Checked assertions: a missing or differently typed field means
	// "no shixun data" rather than a panic.
	code, ok := respMap["status"].(float64)
	if !ok || code != 0 {
		return nil
	}
	data, ok := respMap["data"].(map[string]interface{})
	if !ok {
		return nil
	}
	if name, ok := data["shixun_name"].(string); ok {
		containerReply.ShixunName = name
	}
	if role, ok := data["user_identity"].(string); ok {
		containerReply.Role = role
	}
	if user, ok := data["user_name"].(string); ok {
		containerReply.Username = user
	}
	return nil
}

@ -109,6 +109,17 @@ func (s *ResourcePoolService) Delete(ctx context.Context, req *pb.ResourcePoolDe
return &pb.BaseResponse{Code: 200, Message: "成功"}, nil return &pb.BaseResponse{Code: 200, Message: "成功"}, nil
} }
// RemoveNode deletes the node record identified by req.NodeId.
//
// Failures are reported in the response body (Code 500) rather than as an RPC
// error, so the returned error is always nil. A request whose id matches no
// row still yields Code 200; the number of deleted rows is only logged.
func (s *ResourcePoolService) RemoveNode(ctx context.Context, req *pb.RemoveNodeRequest) (*pb.BaseResponse, error) {
	log.Info("RemoveNode called", req)
	nodeId := req.NodeId
	num, err := database.DeleteNodeById(nodeId)
	if err != nil {
		// The cause is not exposed to the caller, so record it here;
		// previously it was dropped entirely.
		log.Errorf("RemoveNode failed nodeId: %d, err: %v", nodeId, err)
		return &pb.BaseResponse{Code: 500, Message: "移除节点失败"}, nil
	}
	// The logged value is the node id; the old message labelled it poolId.
	log.Infof("RemoveNode success nodeId: %d, 影响行数: %d", nodeId, num)
	return &pb.BaseResponse{Code: 200, Message: "成功"}, nil
}
func (s *ResourcePoolService) List(ctx context.Context, req *pb.ResourcePoolListRequest) (*pb.ResourcePoolListResponse, error) { func (s *ResourcePoolService) List(ctx context.Context, req *pb.ResourcePoolListRequest) (*pb.ResourcePoolListResponse, error) {
log.Info("GetResourcePoolList", req) log.Info("GetResourcePoolList", req)
@ -137,7 +148,10 @@ func (s *ResourcePoolService) List(ctx context.Context, req *pb.ResourcePoolList
poolData.TotalMemory = poolData.TotalMemory + node.TotalMemory poolData.TotalMemory = poolData.TotalMemory + node.TotalMemory
poolData.AvailableMemory = poolData.AvailableMemory + node.AvailableMemory poolData.AvailableMemory = poolData.AvailableMemory + node.AvailableMemory
poolData.DiskSize = poolData.DiskSize + node.DiskTotal poolData.DiskSize = poolData.DiskSize + node.DiskTotal
poolData.NodeList = append(poolData.NodeList, n.NodeIp) poolData.NodeList = append(poolData.NodeList, &pb.Nodes{
NodeIp: n.NodeIp,
NodeName: n.NodeName,
})
} }
data = append(data, &poolData) data = append(data, &poolData)
} }
@ -165,6 +179,7 @@ func (s *ResourcePoolService) GetDetail(ctx context.Context, req *pb.ResourcePoo
continue continue
} }
nodeReply, err := s.buildNodeReply(ctx, node) nodeReply, err := s.buildNodeReply(ctx, node)
nodeReply.NodeId = poolNode.Id
if err != nil { if err != nil {
return nil, err return nil, err
} }

@ -0,0 +1,110 @@
package utils
import (
"bytes"
"context"
"encoding/json"
"io"
"net/http"
"net/url"
"strings"
"sync"
"time"
)
// HttpClient is a small convenience wrapper around *http.Client. Each request
// method returns the full response body, the HTTP status code and an error.
// A non-2xx status is not turned into an error; callers inspect the code.
type HttpClient struct {
	client *http.Client
}

// NewHttpClient creates an HttpClient whose underlying http.Client uses the
// given timeout. A zero timeout means no timeout, as defined by http.Client.
func NewHttpClient(timeout time.Duration) *HttpClient {
	return &HttpClient{
		client: &http.Client{
			Timeout: timeout,
		},
	}
}

// do applies headers to req, sends it and reads the whole response body.
// Headers are applied with Set, so they override values already on req.
// The status code is 0 when the request could not be sent.
func (hc *HttpClient) do(req *http.Request, headers map[string]string) ([]byte, int, error) {
	for k, v := range headers {
		req.Header.Set(k, v)
	}

	resp, err := hc.client.Do(req)
	if err != nil {
		return nil, 0, err
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	return body, resp.StatusCode, err
}

// Get sends a GET request to rawUrl with the optional headers.
func (hc *HttpClient) Get(ctx context.Context, rawUrl string, headers map[string]string) ([]byte, int, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, rawUrl, nil)
	if err != nil {
		return nil, 0, err
	}
	return hc.do(req, headers)
}

// PostJSON sends a POST request whose body is the JSON encoding of data.
// Content-Type defaults to application/json; an entry in headers overrides it.
func (hc *HttpClient) PostJSON(ctx context.Context, rawUrl string, data interface{}, headers map[string]string) ([]byte, int, error) {
	bodyBytes, err := json.Marshal(data)
	if err != nil {
		return nil, 0, err
	}

	req, err := http.NewRequestWithContext(ctx, http.MethodPost, rawUrl, bytes.NewReader(bodyBytes))
	if err != nil {
		return nil, 0, err
	}
	req.Header.Set("Content-Type", "application/json")
	return hc.do(req, headers)
}

// PostForm sends a POST request whose body is formData encoded as
// application/x-www-form-urlencoded. An entry in headers overrides the
// default Content-Type.
func (hc *HttpClient) PostForm(ctx context.Context, rawUrl string, formData map[string]string, headers map[string]string) ([]byte, int, error) {
	data := url.Values{}
	for k, v := range formData {
		data.Set(k, v)
	}

	req, err := http.NewRequestWithContext(ctx, http.MethodPost, rawUrl, strings.NewReader(data.Encode()))
	if err != nil {
		return nil, 0, err
	}
	req.Header.Set("Content-Type", "application/x-www-form-urlencoded")
	return hc.do(req, headers)
}

var (
	defaultClient *HttpClient
	once          sync.Once
)

// GetDefaultClient returns the shared HttpClient, created on first use with a
// 10 second timeout.
func GetDefaultClient() *HttpClient {
	once.Do(func() {
		defaultClient = NewHttpClient(10 * time.Second)
	})
	return defaultClient
}

@ -3,42 +3,37 @@
openapi: 3.0.3 openapi: 3.0.3
info: info:
title: Node API title: ResourcePool API
version: 0.0.1 version: 0.0.1
paths: paths:
/v1/node: /v1/available/nodes:
get: get:
tags: tags:
- Node - ResourcePool
operationId: Node_GetNode operationId: ResourcePool_GetAvailableNodes
parameters:
- name: uid
in: query
schema:
type: string
responses: responses:
"200": "200":
description: OK description: OK
content: content:
application/json: application/json:
schema: schema:
$ref: '#/components/schemas/NodeReply' $ref: '#/components/schemas/AvailableNodesResponse'
default: default:
description: Default error response description: Default error response
content: content:
application/json: application/json:
schema: schema:
$ref: '#/components/schemas/Status' $ref: '#/components/schemas/Status'
/v1/node/discovered: /v1/resource/pool/create:
post: post:
tags: tags:
- Node - ResourcePool
operationId: Node_DiscoveredNode operationId: ResourcePool_Create
requestBody: requestBody:
content: content:
application/json: application/json:
schema: schema:
$ref: '#/components/schemas/DiscoveredNodeRequest' $ref: '#/components/schemas/ResourcePoolCreateRequest'
required: true required: true
responses: responses:
"200": "200":
@ -46,23 +41,23 @@ paths:
content: content:
application/json: application/json:
schema: schema:
$ref: '#/components/schemas/DiscoveredNodeResponse' $ref: '#/components/schemas/BaseResponse'
default: default:
description: Default error response description: Default error response
content: content:
application/json: application/json:
schema: schema:
$ref: '#/components/schemas/Status' $ref: '#/components/schemas/Status'
/v1/node/join: /v1/resource/pool/delete:
post: post:
tags: tags:
- Node - ResourcePool
operationId: Node_JoinNode operationId: ResourcePool_Delete
requestBody: requestBody:
content: content:
application/json: application/json:
schema: schema:
$ref: '#/components/schemas/JoinNodeRequest' $ref: '#/components/schemas/ResourcePoolDeleteRequest'
required: true required: true
responses: responses:
"200": "200":
@ -70,47 +65,64 @@ paths:
content: content:
application/json: application/json:
schema: schema:
$ref: '#/components/schemas/JoinNodeResponse' $ref: '#/components/schemas/BaseResponse'
default: default:
description: Default error response description: Default error response
content: content:
application/json: application/json:
schema: schema:
$ref: '#/components/schemas/Status' $ref: '#/components/schemas/Status'
/v1/node/status/update: /v1/resource/pool/detail:
post: post:
tags: tags:
- Node - ResourcePool
operationId: Node_UpdateNodeStatus operationId: ResourcePool_GetDetail
requestBody: parameters:
- name: poolId
in: query
schema:
type: string
responses:
"200":
description: OK
content: content:
application/json: application/json:
schema: schema:
$ref: '#/components/schemas/UpdateNodeStatusRequest' $ref: '#/components/schemas/ResourcePoolDetailResponse'
required: true default:
description: Default error response
content:
application/json:
schema:
$ref: '#/components/schemas/Status'
/v1/resource/pool/list:
get:
tags:
- ResourcePool
operationId: ResourcePool_List
responses: responses:
"200": "200":
description: OK description: OK
content: content:
application/json: application/json:
schema: schema:
$ref: '#/components/schemas/UpdateNodeStatusResponse' $ref: '#/components/schemas/ResourcePoolListResponse'
default: default:
description: Default error response description: Default error response
content: content:
application/json: application/json:
schema: schema:
$ref: '#/components/schemas/Status' $ref: '#/components/schemas/Status'
/v1/nodes: /v1/resource/pool/removeNode:
post: post:
tags: tags:
- Node - ResourcePool
operationId: Node_GetAllNodes operationId: ResourcePool_RemoveNode
requestBody: requestBody:
content: content:
application/json: application/json:
schema: schema:
$ref: '#/components/schemas/GetAllNodesReq' $ref: '#/components/schemas/RemoveNodeRequest'
required: true required: true
responses: responses:
"200": "200":
@ -118,23 +130,23 @@ paths:
content: content:
application/json: application/json:
schema: schema:
$ref: '#/components/schemas/NodesReply' $ref: '#/components/schemas/BaseResponse'
default: default:
description: Default error response description: Default error response
content: content:
application/json: application/json:
schema: schema:
$ref: '#/components/schemas/Status' $ref: '#/components/schemas/Status'
/v1/summary: /v1/resource/pool/update:
post: post:
tags: tags:
- Node - ResourcePool
operationId: Node_GetSummary operationId: ResourcePool_Update
requestBody: requestBody:
content: content:
application/json: application/json:
schema: schema:
$ref: '#/components/schemas/GetSummaryReq' $ref: '#/components/schemas/ResourcePoolUpdateRequest'
required: true required: true
responses: responses:
"200": "200":
@ -142,7 +154,7 @@ paths:
content: content:
application/json: application/json:
schema: schema:
$ref: '#/components/schemas/DeviceSummaryReply' $ref: '#/components/schemas/BaseResponse'
default: default:
description: Default error response description: Default error response
content: content:
@ -151,78 +163,40 @@ paths:
$ref: '#/components/schemas/Status' $ref: '#/components/schemas/Status'
components: components:
schemas: schemas:
DeviceSummaryReply: AvailableNodesInfo:
type: object
properties:
vgpuUsed:
type: integer
format: int32
vgpuTotal:
type: integer
format: int32
coreUsed:
type: integer
format: int32
coreTotal:
type: integer
format: int32
memoryUsed:
type: integer
format: int32
memoryTotal:
type: integer
format: int32
gpuCount:
type: integer
format: int32
nodeCount:
type: integer
format: int32
DiscoveredNodeInfo:
type: object type: object
properties: properties:
nodeIp:
type: string
nodeName: nodeName:
type: string type: string
DiscoveredNodeRequest: cpuCores:
type: object
properties: {}
DiscoveredNodeResponse:
type: object
properties:
list:
type: array
items:
$ref: '#/components/schemas/DiscoveredNodeInfo'
GetAllNodesReq:
type: object
properties:
filters:
$ref: '#/components/schemas/GetAllNodesReq_Filters'
GetAllNodesReq_Filters:
type: object
properties:
ip:
type: string type: string
type: gpuNum:
type: string type: string
isSchedulable: gpuMemory:
type: string
totalMemory:
type: string
diskSize:
type: string
nodeIp:
type: string type: string
GetSummaryReq: AvailableNodesResponse:
type: object type: object
properties: properties:
filters: data:
$ref: '#/components/schemas/GetSummaryReq_Filters' type: array
GetSummaryReq_Filters: items:
$ref: '#/components/schemas/AvailableNodesInfo'
BaseResponse:
type: object type: object
properties: properties:
type: code:
type: string type: integer
nodeUid: format: int32
type: string message:
deviceId:
type: string type: string
data:
type: object
GoogleProtobufAny: GoogleProtobufAny:
type: object type: object
properties: properties:
@ -231,22 +205,14 @@ components:
description: The type of the serialized message. description: The type of the serialized message.
additionalProperties: true additionalProperties: true
description: Contains an arbitrary serialized message along with a @type that describes the type of the serialized message. description: Contains an arbitrary serialized message along with a @type that describes the type of the serialized message.
JoinNodeRequest: Nodes:
type: object type: object
properties: properties:
nodeNames: nodeIp:
type: array
items:
type: string type: string
JoinNodeResponse: nodeName:
type: object
properties:
code:
type: integer
format: int32
message:
type: string type: string
NodeReply: PoolNodeReply:
type: object type: object
properties: properties:
ip: ip:
@ -271,8 +237,7 @@ components:
coreTotal: coreTotal:
type: string type: string
memoryUsed: memoryUsed:
type: integer type: string
format: int32
memoryTotal: memoryTotal:
type: string type: string
uid: uid:
@ -300,47 +265,90 @@ components:
type: string type: string
diskSize: diskSize:
type: string type: string
resourcePools: nodeId:
type: string
RemoveNodeRequest:
type: object
properties:
nodeId:
type: string
ResourcePoolCreateRequest:
type: object
properties:
poolName:
type: string
nodes:
type: array type: array
items: items:
$ref: '#/components/schemas/Nodes'
ResourcePoolDeleteRequest:
type: object
properties:
poolId:
type: string type: string
NodesReply: ResourcePoolDetailResponse:
type: object type: object
properties: properties:
list: list:
type: array type: array
items: items:
$ref: '#/components/schemas/NodeReply' $ref: '#/components/schemas/PoolNodeReply'
Status: ResourcePoolListData:
type: object type: object
properties: properties:
code: poolId:
type: integer
description: The status code, which should be an enum value of [google.rpc.Code][google.rpc.Code].
format: int32
message:
type: string type: string
description: A developer-facing error message, which should be in English. Any user-facing error message should be localized and sent in the [google.rpc.Status.details][google.rpc.Status.details] field, or localized by the client. poolName:
details: type: string
cpuCores:
type: string
nodeNum:
type: string
gpuNum:
type: string
availableMemory:
type: string
totalMemory:
type: string
diskSize:
type: string
nodeList:
type: array type: array
items: items:
$ref: '#/components/schemas/GoogleProtobufAny' $ref: '#/components/schemas/Nodes'
description: A list of messages that carry the error details. There is a common set of message types for APIs to use. ResourcePoolListResponse:
description: 'The `Status` type defines a logical error model that is suitable for different programming environments, including REST APIs and RPC APIs. It is used by [gRPC](https://github.com/grpc). Each `Status` message contains three pieces of data: error code, error message, and error details. You can find out more about this error model and how to work with it in the [API Design Guide](https://cloud.google.com/apis/design/errors).'
UpdateNodeStatusRequest:
type: object type: object
properties: properties:
nodeName: data:
type: array
items:
$ref: '#/components/schemas/ResourcePoolListData'
ResourcePoolUpdateRequest:
type: object
properties:
poolId:
type: string type: string
status: poolName:
type: string type: string
UpdateNodeStatusResponse: nodes:
type: array
items:
$ref: '#/components/schemas/Nodes'
Status:
type: object type: object
properties: properties:
code: code:
type: integer type: integer
description: The status code, which should be an enum value of [google.rpc.Code][google.rpc.Code].
format: int32 format: int32
message: message:
type: string type: string
description: A developer-facing error message, which should be in English. Any user-facing error message should be localized and sent in the [google.rpc.Status.details][google.rpc.Status.details] field, or localized by the client.
details:
type: array
items:
$ref: '#/components/schemas/GoogleProtobufAny'
description: A list of messages that carry the error details. There is a common set of message types for APIs to use.
description: 'The `Status` type defines a logical error model that is suitable for different programming environments, including REST APIs and RPC APIs. It is used by [gRPC](https://github.com/grpc). Each `Status` message contains three pieces of data: error code, error message, and error details. You can find out more about this error model and how to work with it in the [API Design Guide](https://cloud.google.com/apis/design/errors).'
tags: tags:
- name: Node - name: ResourcePool

Loading…
Cancel
Save