From 02028e09e11637333c14a4cca5083fa3d611e254 Mon Sep 17 00:00:00 2001 From: youys <1272586223@qq.com> Date: Wed, 6 Aug 2025 14:54:41 +0800 Subject: [PATCH] =?UTF-8?q?fix(node):=20=E8=B5=84=E6=BA=90=E6=B1=A0?= =?UTF-8?q?=E7=A7=BB=E9=99=A4=E8=8A=82=E7=82=B9=E6=8E=A5=E5=8F=A3=EF=BC=8C?= =?UTF-8?q?=E6=98=BE=E5=8D=A1=E7=AE=A1=E7=90=86=EF=BC=8C=E4=BB=BB=E5=8A=A1?= =?UTF-8?q?=E7=AE=A1=E7=90=86?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- server/api/v1/card.proto | 1 + server/api/v1/container.proto | 7 +- server/api/v1/resource_pool.proto | 20 +- server/cmd/server/main.go | 18 +- server/config/config.yaml | 4 +- server/internal/biz/pod.go | 2 + server/internal/data/pod.go | 17 +- server/internal/database/config.go | 88 +++++- server/internal/database/init_db.go | 9 + server/internal/database/resource_pool_db.go | 46 +++ server/internal/service/card.go | 6 + server/internal/service/container.go | 62 +++++ server/internal/service/resource_pool.go | 17 +- server/internal/utils/httpclient.go | 110 ++++++++ server/openapi.yaml | 278 ++++++++++--------- 15 files changed, 525 insertions(+), 160 deletions(-) create mode 100644 server/internal/utils/httpclient.go diff --git a/server/api/v1/card.proto b/server/api/v1/card.proto index 5087dc1..cc4ca1e 100644 --- a/server/api/v1/card.proto +++ b/server/api/v1/card.proto @@ -65,6 +65,7 @@ message GPUReply { string node_uid = 10; bool health = 11; string mode = 12; + repeated string resource_pools = 13; } message GPUsReply { diff --git a/server/api/v1/container.proto b/server/api/v1/container.proto index 8445f18..9a52511 100644 --- a/server/api/v1/container.proto +++ b/server/api/v1/container.proto @@ -61,11 +61,16 @@ message ContainerReply { string end_time = 11; string pod_uid = 12; string node_uid = 13; - string resource_pool = 14; + repeated string resource_pool = 14; string flavor = 15; string priority = 16; string namespace = 17; repeated string device_ids = 18; + string pod_name = 19; + string task_type = 20; + string shixun_name = 21; + string role = 22; + string username = 23; } message ContainersReply { diff --git a/server/api/v1/resource_pool.proto b/server/api/v1/resource_pool.proto index b3dedae..858d4b9 100644 --- a/server/api/v1/resource_pool.proto +++ b/server/api/v1/resource_pool.proto @@ -41,6 +41,16 @@ service ResourcePool { }; } + rpc RemoveNode (RemoveNodeRequest) returns (BaseResponse) { + option (google.api.http) = { + post: "/v1/resource/pool/removeNode", + body: "*" + }; + option (grpc.gateway.protoc_gen_openapiv2.options.openapiv2_operation) = { + summary: "移除节点"; + }; + } + rpc List (ResourcePoolListRequest) returns (ResourcePoolListResponse) { option (google.api.http) = { get: "/v1/resource/pool/list" @@ -52,7 +62,7 @@ service ResourcePool { rpc GetDetail (ResourcePoolDetailRequest) returns (ResourcePoolDetailResponse) { option (google.api.http) = { - get: "/v1/resource/pool/detail" + post: "/v1/resource/pool/detail" }; option (grpc.gateway.protoc_gen_openapiv2.options.openapiv2_operation) = { summary: "资源池详情"; @@ -67,6 +77,7 @@ service ResourcePool { summary: "可用节点列表"; }; } + } message BaseResponse { @@ -118,6 +129,7 @@ message PoolNodeReply { string architecture = 20; string creation_timestamp = 21; int64 disk_size = 22; + int64 node_id = 23; } message ResourcePoolDetailRequest { @@ -137,7 +149,7 @@ message ResourcePoolListData{ int64 available_memory = 6;//kb int64 total_memory = 7; // kb int64 disk_size = 8; - repeated string node_list = 9; + repeated Nodes node_list = 9; } message ResourcePoolListRequest { @@ -167,4 +179,8 @@ message AvailableNodesInfo{ } +message RemoveNodeRequest{ + int64 node_id = 1; +} + diff --git a/server/cmd/server/main.go b/server/cmd/server/main.go index dc61dee..dbbeeee 100644 --- a/server/cmd/server/main.go +++ b/server/cmd/server/main.go @@ -34,6 +34,7 @@ func main() { flag.Parse() var ctx = context.Background() + database.InitConfigPath(flagconf) if err := initDatabase(); err != nil { log.Errorf("数据库初始化失败: %v", err) os.Exit(1) @@ -71,13 +72,22 @@ func getNodeSelectors(c *conf.Bootstrap) map[string]string { } func initDatabase() error { - config, err := database.LoadConfig(flagconf) - log.Infof("config: %+v", config) + driver, err := database.Get("database.driver") if err != nil { log.Errorf("Failed to load config: %v", err) return err } - database.InitDB(&config.Database) - log.Infof("初始化%s成功", config.Database.Driver) + + dataSourceName, err := database.Get("database.dataSourceName") + if err != nil { + log.Errorf("Failed to load config: %v", err) + return err + } + + var config = &database.DatabaseConfig{} + config.Driver = driver.(string) + config.DataSourceName = dataSourceName.(string) + database.InitDB(config) + log.Infof("初始化%s成功", driver) return nil } diff --git a/server/config/config.yaml b/server/config/config.yaml index e2065f7..3d57573 100644 --- a/server/config/config.yaml +++ b/server/config/config.yaml @@ -15,4 +15,6 @@ node_selectors: MLU: mlu=on database: driver: mysql - dataSourceName: testeducoder:TEST@123@tcp(testeducoder-public.mysql.polardb.rds.aliyuncs.com:3306)/hami?parseTime=true&loc=Local \ No newline at end of file + dataSourceName: testeducoder:TEST@123@tcp(testeducoder-public.mysql.polardb.rds.aliyuncs.com:3306)/hami?parseTime=true&loc=Local +web_domain: http://172.16.100.14 +big_model_resource_pool_name: "大模型资源池" \ No newline at end of file diff --git a/server/internal/biz/pod.go b/server/internal/biz/pod.go index 15aa6c5..e5bbb67 100644 --- a/server/internal/biz/pod.go +++ b/server/internal/biz/pod.go @@ -21,6 +21,7 @@ type Container struct { Priority string NodeUID string Namespace string + TpiID string } type PodInfo struct { @@ -31,6 +32,7 @@ type PodInfo struct { Devices PodDevices CtrIDs []string Ctrs []*Container + Labels map[string]string } type PodRepo interface { diff --git a/server/internal/data/pod.go b/server/internal/data/pod.go index bfc71e8..8c05c96 100644 --- a/server/internal/data/pod.go +++ b/server/internal/data/pod.go @@ -22,15 +22,17 @@ type podRepo struct { data *Data podLister listerscorev1.PodLister pods map[k8stypes.UID]*biz.PodInfo + allPods []*biz.PodInfo mutex sync.RWMutex log *log.Helper } func NewPodRepo(data *Data, logger log.Logger) biz.PodRepo { repo := &podRepo{ - data: data, - pods: make(map[k8stypes.UID]*biz.PodInfo), - log: log.NewHelper(logger), + data: data, + pods: make(map[k8stypes.UID]*biz.PodInfo), + allPods: []*biz.PodInfo{}, + log: log.NewHelper(logger), } repo.init() return repo @@ -91,8 +93,9 @@ func (r *podRepo) addPod(pod *corev1.Pod, nodeID string, devices biz.PodDevices) r.mutex.Lock() defer r.mutex.Unlock() ctrs := r.fetchContainerInfo(pod) - pi := &biz.PodInfo{Name: pod.Name, UID: pod.UID, Namespace: pod.Namespace, NodeID: nodeID, Devices: devices, Ctrs: ctrs} + pi := &biz.PodInfo{Name: pod.Name, UID: pod.UID, Namespace: pod.Namespace, NodeID: nodeID, Devices: devices, Ctrs: ctrs, Labels: pod.Labels} r.pods[pod.UID] = pi + r.allPods = append(r.allPods, pi) r.log.Infof("Pod added: Name: %s, UID: %s, Namespace: %s, NodeID: %s", pod.Name, pod.UID, pod.Namespace, nodeID) } @@ -185,7 +188,11 @@ func (r *podRepo) GetStartTime(pod *corev1.Pod) time.Time { func (r *podRepo) ListAll(context.Context) ([]*biz.Container, error) { var containerList []*biz.Container for _, pod := range r.pods { - containerList = append(containerList, pod.Ctrs...) + TpiID := pod.Labels["tpi-id"] + for _, container := range pod.Ctrs { + container.TpiID = TpiID + containerList = append(containerList, container) + } } return containerList, nil } diff --git a/server/internal/database/config.go b/server/internal/database/config.go index d224f25..ff9b2a3 100644 --- a/server/internal/database/config.go +++ b/server/internal/database/config.go @@ -1,31 +1,97 @@ package database import ( + "encoding/json" "fmt" + "github.com/go-kratos/kratos/v2/log" "gopkg.in/yaml.v3" "os" + "strings" + "sync" ) -type DatabaseConfig struct { - Driver string `yaml:"driver"` - DataSourceName string `yaml:"dataSourceName"` -} +var ( + configData map[string]interface{} + loadOnce sync.Once + loadErr error + configPath string +) -type Config struct { - Database DatabaseConfig `yaml:"database"` +// InitConfigPath 设置配置路径(可选) +func InitConfigPath(path string) { + configPath = path } -func LoadConfig(filePath string) (*Config, error) { +// loadYAML 加载 YAML 到 map +func loadYAML(filePath string) (map[string]interface{}, error) { yamlFile, err := os.ReadFile(filePath) if err != nil { return nil, fmt.Errorf("failed to read config file: %v", err) } - var config Config - err = yaml.Unmarshal(yamlFile, &config) + var raw map[string]interface{} + err = yaml.Unmarshal(yamlFile, &raw) + if err != nil { + return nil, fmt.Errorf("failed to unmarshal yaml: %v", err) + } + + b, _ := json.MarshalIndent(raw, "", " ") + log.Info("loadYAML: ", string(b)) + converted, ok := toStringKeyMap(raw).(map[string]interface{}) + if !ok { + return nil, fmt.Errorf("failed to convert config to map[string]interface{}") + } + + return converted, nil +} + +// GetConfig 获取全局配置 map,只加载一次 +func GetConfig() (map[string]interface{}, error) { + loadOnce.Do(func() { + if configPath == "" { + configPath = "config.yaml" + } + configData, loadErr = loadYAML(configPath) + }) + return configData, loadErr +} + +// Get 获取嵌套配置,例如 Get("database.driver") +func Get(key string) (interface{}, error) { + cfg, err := GetConfig() if err != nil { - return nil, fmt.Errorf("failed to unmarshal config file: %v", err) + return nil, err + } + + keys := strings.Split(key, ".") + var val interface{} = cfg + + for _, k := range keys { + m, ok := val.(map[string]interface{}) + if !ok { + return nil, fmt.Errorf("invalid path: %s", key) + } + val, ok = m[k] + if !ok { + return nil, fmt.Errorf("key not found: %s", key) + } } + return val, nil +} - return &config, nil +func toStringKeyMap(i interface{}) interface{} { + switch x := i.(type) { + case map[interface{}]interface{}: + m2 := map[string]interface{}{} + for k, v := range x { + keyStr := fmt.Sprintf("%v", k) + m2[keyStr] = toStringKeyMap(v) + } + return m2 + case []interface{}: + for i, v := range x { + x[i] = toStringKeyMap(v) + } + } + return i } diff --git a/server/internal/database/init_db.go b/server/internal/database/init_db.go index a2b6149..7768ed7 100644 --- a/server/internal/database/init_db.go +++ b/server/internal/database/init_db.go @@ -8,6 +8,15 @@ import ( var db *sql.DB +type DatabaseConfig struct { + Driver string `yaml:"driver"` + DataSourceName string `yaml:"dataSourceName"` +} + +type Config struct { + Database DatabaseConfig `yaml:"database"` +} + func InitDB(config *DatabaseConfig) { var err error switch config.Driver { diff --git a/server/internal/database/resource_pool_db.go b/server/internal/database/resource_pool_db.go index ad954de..6adebda 100644 --- a/server/internal/database/resource_pool_db.go +++ b/server/internal/database/resource_pool_db.go @@ -180,6 +180,37 @@ func QueryResourceNamesByIp(nodeIp string) ([]string, error) { return resourcePoolNames, nil } +func QueryResourceNamesByNodeName(nodeName string) ([]string, error) { + // 执行查询 + rows, err := db.Query("select pool_name from resource_pool where id in (select distinct pool_id from nodes where node_name=?)", nodeName) + if err != nil { + log.Infof("Query failed: %v", err) + return nil, err + } + defer rows.Close() + + // 存放结果的切片 + resourcePoolNames := make([]string, 0) + + // 遍历每一行 + for rows.Next() { + var name string + err := rows.Scan(&name) + if err != nil { + log.Infof("Scan failed: %v", err) + return nil, err + } + resourcePoolNames = append(resourcePoolNames, name) + } + + // 检查 rows 是否遍历中出错 + if err := rows.Err(); err != nil { + return nil, err + } + + return resourcePoolNames, nil +} + func InsertResourcePool(poolName string) (int64, error) { querySql := "INSERT INTO resource_pool(pool_name) VALUES (?)" @@ -272,3 +303,18 @@ func DeleteNodesByPoolId(poolId int64) (int64, error) { return rowsAffected, nil } + +func DeleteNodeById(nodeId int64) (int64, error) { + result, err := db.Exec("DELETE FROM nodes WHERE id = ?", nodeId) + if err != nil { + return 0, fmt.Errorf("delete failed: %w", err) + } + + // 返回影响的行数(0 表示未删除任何数据) + rowsAffected, err := result.RowsAffected() + if err != nil { + return 0, fmt.Errorf("get rows affected failed: %w", err) + } + + return rowsAffected, nil +} diff --git a/server/internal/service/card.go b/server/internal/service/card.go index 6569c1f..da96b05 100644 --- a/server/internal/service/card.go +++ b/server/internal/service/card.go @@ -7,6 +7,7 @@ import ( "strings" pb "vgpu/api/v1" "vgpu/internal/biz" + "vgpu/internal/database" ) type CardService struct { @@ -51,6 +52,11 @@ func (s *CardService) GetAllGPUs(ctx context.Context, req *pb.GetAllGpusReq) (*p gpu.NodeUid = device.NodeUid gpu.Health = device.Health gpu.Mode = device.Mode + resourcePoolNames, err := database.QueryResourceNamesByNodeName(device.NodeName) + if err != nil { + return nil, err + } + gpu.ResourcePools = resourcePoolNames vGPU, core, memory, err := s.pod.StatisticsByDeviceId(ctx, device.AliasId) if err == nil { diff --git a/server/internal/service/container.go b/server/internal/service/container.go index 4bde7f2..9a17f03 100644 --- a/server/internal/service/container.go +++ b/server/internal/service/container.go @@ -2,11 +2,16 @@ package service import ( "context" + "encoding/json" + "github.com/go-kratos/kratos/v2/log" + "slices" "sort" "strings" "time" pb "vgpu/api/v1" "vgpu/internal/biz" + "vgpu/internal/database" + "vgpu/internal/utils" ) var statusOrder = map[string]int{ @@ -86,6 +91,32 @@ func (s *ContainerService) GetAllContainers(ctx context.Context, req *pb.GetAllC if containerReply.DeviceIds == nil { continue } + + resourcePoolNames, err := database.QueryResourceNamesByNodeName(container.NodeName) + if err != nil { + return nil, err + } + + containerReply.ResourcePool = resourcePoolNames + resourcePoolName, err := database.Get("big_model_resource_pool_name") + if err != nil { + return nil, err + } + + if slices.Contains(resourcePoolNames, resourcePoolName.(string)) { + containerReply.TaskType = "big_model" + } else { + containerReply.TaskType = "shixun" + } + + if len(container.TpiID) > 0 { + err := s.setShixunData(ctx, containerReply, container.TpiID) + if err != nil { + return nil, err + } + } + + containerReply.PodName = container.PodName containerReply.CreateTime = container.CreateTime.Format(time.RFC3339) res.Items = append(res.Items, containerReply) } @@ -127,3 +158,34 @@ func (s *ContainerService) GetContainer(ctx context.Context, req *pb.GetContaine ctrReply.CreateTime = container.CreateTime.Format(time.RFC3339) return ctrReply, nil } + +func (s *ContainerService) setShixunData(ctx context.Context, containerReply *pb.ContainerReply, tpiId string) error { + webDomain, err := database.Get("web_domain") + if err != nil { + return err + } + + client := utils.GetDefaultClient() + url := webDomain.(string) + "/api/myshixuns/get_shixun_info.json" + log.Info("Get shixun info url: ", url, " tpiId: ", tpiId) + jsonData := map[string]interface{}{ + "tpiID": tpiId, + } + body, status, err := client.PostJSON(ctx, url, jsonData, nil) + if err != nil { + return err + } + log.Infof("Get shixun info: %s, status: %d", string(body), status) + + var respMap map[string]interface{} + err = json.Unmarshal(body, &respMap) + log.Info("Get shixun info: ", respMap, "----", respMap["status"]) + if respMap["status"].(float64) == 0 { + data := respMap["data"].(map[string]interface{}) + containerReply.ShixunName = data["shixun_name"].(string) + containerReply.Role = data["user_identity"].(string) + containerReply.Username = data["user_name"].(string) + } + + return nil +} diff --git a/server/internal/service/resource_pool.go b/server/internal/service/resource_pool.go index cc09ea1..027c9fc 100644 --- a/server/internal/service/resource_pool.go +++ b/server/internal/service/resource_pool.go @@ -109,6 +109,17 @@ func (s *ResourcePoolService) Delete(ctx context.Context, req *pb.ResourcePoolDe return &pb.BaseResponse{Code: 200, Message: "成功"}, nil } +func (s *ResourcePoolService) RemoveNode(ctx context.Context, req *pb.RemoveNodeRequest) (*pb.BaseResponse, error) { + log.Info("RemoveNode called", req) + nodeId := req.NodeId + num, err := database.DeleteNodeById(nodeId) + if err != nil { + return &pb.BaseResponse{Code: 500, Message: "移除节点失败"}, nil + } + log.Infof("RemoveNode success poolId: %d, 影响行数: %d", nodeId, num) + return &pb.BaseResponse{Code: 200, Message: "成功"}, nil +} + func (s *ResourcePoolService) List(ctx context.Context, req *pb.ResourcePoolListRequest) (*pb.ResourcePoolListResponse, error) { log.Info("GetResourcePoolList", req) @@ -137,7 +148,10 @@ func (s *ResourcePoolService) List(ctx context.Context, req *pb.ResourcePoolList poolData.TotalMemory = poolData.TotalMemory + node.TotalMemory poolData.AvailableMemory = poolData.AvailableMemory + node.AvailableMemory poolData.DiskSize = poolData.DiskSize + node.DiskTotal - poolData.NodeList = append(poolData.NodeList, n.NodeIp) + poolData.NodeList = append(poolData.NodeList, &pb.Nodes{ + NodeIp: n.NodeIp, + NodeName: n.NodeName, + }) } data = append(data, &poolData) } @@ -165,6 +179,7 @@ func (s *ResourcePoolService) GetDetail(ctx context.Context, req *pb.ResourcePoo continue } nodeReply, err := s.buildNodeReply(ctx, node) + nodeReply.NodeId = poolNode.Id if err != nil { return nil, err } diff --git a/server/internal/utils/httpclient.go b/server/internal/utils/httpclient.go new file mode 100644 index 0000000..ec541de --- /dev/null +++ b/server/internal/utils/httpclient.go @@ -0,0 +1,110 @@ +package utils + +import ( + "bytes" + "context" + "encoding/json" + "io" + "net/http" + "net/url" + "strings" + "sync" + "time" +) + +type HttpClient struct { + client *http.Client +} + +// NewHttpClient 创建带超时的 HTTP 客户端 +func NewHttpClient(timeout time.Duration) *HttpClient { + return &HttpClient{ + client: &http.Client{ + Timeout: timeout, + }, + } +} + +// Get 发送 GET 请求 +func (hc *HttpClient) Get(ctx context.Context, rawUrl string, headers map[string]string) ([]byte, int, error) { + req, err := http.NewRequestWithContext(ctx, http.MethodGet, rawUrl, nil) + if err != nil { + return nil, 0, err + } + for k, v := range headers { + req.Header.Set(k, v) + } + + resp, err := hc.client.Do(req) + if err != nil { + return nil, 0, err + } + defer resp.Body.Close() + + body, err := io.ReadAll(resp.Body) + return body, resp.StatusCode, err +} + +// PostJSON 发送 POST 请求,Body 是 JSON +func (hc *HttpClient) PostJSON(ctx context.Context, rawUrl string, data interface{}, headers map[string]string) ([]byte, int, error) { + bodyBytes, err := json.Marshal(data) + if err != nil { + return nil, 0, err + } + + req, err := http.NewRequestWithContext(ctx, http.MethodPost, rawUrl, bytes.NewReader(bodyBytes)) + if err != nil { + return nil, 0, err + } + req.Header.Set("Content-Type", "application/json") + for k, v := range headers { + req.Header.Set(k, v) + } + + resp, err := hc.client.Do(req) + if err != nil { + return nil, 0, err + } + defer resp.Body.Close() + + body, err := io.ReadAll(resp.Body) + return body, resp.StatusCode, err +} + +// PostForm 发送 POST 表单请求 +func (hc *HttpClient) PostForm(ctx context.Context, rawUrl string, formData map[string]string, headers map[string]string) ([]byte, int, error) { + data := url.Values{} + for k, v := range formData { + data.Set(k, v) + } + + req, err := http.NewRequestWithContext(ctx, http.MethodPost, rawUrl, strings.NewReader(data.Encode())) + if err != nil { + return nil, 0, err + } + req.Header.Set("Content-Type", "application/x-www-form-urlencoded") + for k, v := range headers { + req.Header.Set(k, v) + } + + resp, err := hc.client.Do(req) + if err != nil { + return nil, 0, err + } + defer resp.Body.Close() + + body, err := io.ReadAll(resp.Body) + return body, resp.StatusCode, err +} + +var ( + defaultClient *HttpClient + once sync.Once +) + +func GetDefaultClient() *HttpClient { + once.Do(func() { + defaultClient = NewHttpClient(10 * time.Second) + }) + return defaultClient +} diff --git a/server/openapi.yaml b/server/openapi.yaml index 5647a34..bf312da 100644 --- a/server/openapi.yaml +++ b/server/openapi.yaml @@ -3,42 +3,37 @@ openapi: 3.0.3 info: - title: Node API + title: ResourcePool API version: 0.0.1 paths: - /v1/node: + /v1/available/nodes: get: tags: - - Node - operationId: Node_GetNode - parameters: - - name: uid - in: query - schema: - type: string + - ResourcePool + operationId: ResourcePool_GetAvailableNodes responses: "200": description: OK content: application/json: schema: - $ref: '#/components/schemas/NodeReply' + $ref: '#/components/schemas/AvailableNodesResponse' default: description: Default error response content: application/json: schema: $ref: '#/components/schemas/Status' - /v1/node/discovered: + /v1/resource/pool/create: post: tags: - - Node - operationId: Node_DiscoveredNode + - ResourcePool + operationId: ResourcePool_Create requestBody: content: application/json: schema: - $ref: '#/components/schemas/DiscoveredNodeRequest' + $ref: '#/components/schemas/ResourcePoolCreateRequest' required: true responses: "200": @@ -46,23 +41,23 @@ paths: content: application/json: schema: - $ref: '#/components/schemas/DiscoveredNodeResponse' + $ref: '#/components/schemas/BaseResponse' default: description: Default error response content: application/json: schema: $ref: '#/components/schemas/Status' - /v1/node/join: + /v1/resource/pool/delete: post: tags: - - Node - operationId: Node_JoinNode + - ResourcePool + operationId: ResourcePool_Delete requestBody: content: application/json: schema: - $ref: '#/components/schemas/JoinNodeRequest' + $ref: '#/components/schemas/ResourcePoolDeleteRequest' required: true responses: "200": @@ -70,47 +65,64 @@ paths: content: application/json: schema: - $ref: '#/components/schemas/JoinNodeResponse' + $ref: '#/components/schemas/BaseResponse' default: description: Default error response content: application/json: schema: $ref: '#/components/schemas/Status' - /v1/node/status/update: + /v1/resource/pool/detail: post: tags: - - Node - operationId: Node_UpdateNodeStatus - requestBody: - content: - application/json: - schema: - $ref: '#/components/schemas/UpdateNodeStatusRequest' - required: true + - ResourcePool + operationId: ResourcePool_GetDetail + parameters: + - name: poolId + in: query + schema: + type: string + responses: + "200": + description: OK + content: + application/json: + schema: + $ref: '#/components/schemas/ResourcePoolDetailResponse' + default: + description: Default error response + content: + application/json: + schema: + $ref: '#/components/schemas/Status' + /v1/resource/pool/list: + get: + tags: + - ResourcePool + operationId: ResourcePool_List responses: "200": description: OK content: application/json: schema: - $ref: '#/components/schemas/UpdateNodeStatusResponse' + $ref: '#/components/schemas/ResourcePoolListResponse' default: description: Default error response content: application/json: schema: $ref: '#/components/schemas/Status' - /v1/nodes: + /v1/resource/pool/removeNode: post: tags: - - Node - operationId: Node_GetAllNodes + - ResourcePool + operationId: ResourcePool_RemoveNode requestBody: content: application/json: schema: - $ref: '#/components/schemas/GetAllNodesReq' + $ref: '#/components/schemas/RemoveNodeRequest' required: true responses: "200": @@ -118,23 +130,23 @@ paths: content: application/json: schema: - $ref: '#/components/schemas/NodesReply' + $ref: '#/components/schemas/BaseResponse' default: description: Default error response content: application/json: schema: $ref: '#/components/schemas/Status' - /v1/summary: + /v1/resource/pool/update: post: tags: - - Node - operationId: Node_GetSummary + - ResourcePool + operationId: ResourcePool_Update requestBody: content: application/json: schema: - $ref: '#/components/schemas/GetSummaryReq' + $ref: '#/components/schemas/ResourcePoolUpdateRequest' required: true responses: "200": @@ -142,7 +154,7 @@ paths: content: application/json: schema: - $ref: '#/components/schemas/DeviceSummaryReply' + $ref: '#/components/schemas/BaseResponse' default: description: Default error response content: @@ -151,78 +163,40 @@ paths: $ref: '#/components/schemas/Status' components: schemas: - DeviceSummaryReply: + AvailableNodesInfo: type: object properties: - vgpuUsed: - type: integer - format: int32 - vgpuTotal: - type: integer - format: int32 - coreUsed: - type: integer - format: int32 - coreTotal: - type: integer - format: int32 - memoryUsed: - type: integer - format: int32 - memoryTotal: - type: integer - format: int32 - gpuCount: - type: integer - format: int32 - nodeCount: - type: integer - format: int32 - DiscoveredNodeInfo: - type: object - properties: - nodeIp: - type: string nodeName: type: string - DiscoveredNodeRequest: - type: object - properties: {} - DiscoveredNodeResponse: - type: object - properties: - list: - type: array - items: - $ref: '#/components/schemas/DiscoveredNodeInfo' - GetAllNodesReq: - type: object - properties: - filters: - $ref: '#/components/schemas/GetAllNodesReq_Filters' - GetAllNodesReq_Filters: - type: object - properties: - ip: + cpuCores: type: string - type: + gpuNum: type: string - isSchedulable: + gpuMemory: + type: string + totalMemory: + type: string + diskSize: + type: string + nodeIp: type: string - GetSummaryReq: + AvailableNodesResponse: type: object properties: - filters: - $ref: '#/components/schemas/GetSummaryReq_Filters' - GetSummaryReq_Filters: + data: + type: array + items: + $ref: '#/components/schemas/AvailableNodesInfo' + BaseResponse: type: object properties: - type: - type: string - nodeUid: - type: string - deviceId: + code: + type: integer + format: int32 + message: type: string + data: + type: object GoogleProtobufAny: type: object properties: @@ -231,22 +205,14 @@ components: description: The type of the serialized message. additionalProperties: true description: Contains an arbitrary serialized message along with a @type that describes the type of the serialized message. - JoinNodeRequest: + Nodes: type: object properties: - nodeNames: - type: array - items: - type: string - JoinNodeResponse: - type: object - properties: - code: - type: integer - format: int32 - message: + nodeIp: + type: string + nodeName: type: string - NodeReply: + PoolNodeReply: type: object properties: ip: @@ -271,8 +237,7 @@ components: coreTotal: type: string memoryUsed: - type: integer - format: int32 + type: string memoryTotal: type: string uid: @@ -300,47 +265,90 @@ components: type: string diskSize: type: string - resourcePools: + nodeId: + type: string + RemoveNodeRequest: + type: object + properties: + nodeId: + type: string + ResourcePoolCreateRequest: + type: object + properties: + poolName: + type: string + nodes: type: array items: - type: string - NodesReply: + $ref: '#/components/schemas/Nodes' + ResourcePoolDeleteRequest: + type: object + properties: + poolId: + type: string + ResourcePoolDetailResponse: type: object properties: list: type: array items: - $ref: '#/components/schemas/NodeReply' - Status: + $ref: '#/components/schemas/PoolNodeReply' + ResourcePoolListData: type: object properties: - code: - type: integer - description: The status code, which should be an enum value of [google.rpc.Code][google.rpc.Code]. - format: int32 - message: + poolId: type: string - description: A developer-facing error message, which should be in English. Any user-facing error message should be localized and sent in the [google.rpc.Status.details][google.rpc.Status.details] field, or localized by the client. - details: + poolName: + type: string + cpuCores: + type: string + nodeNum: + type: string + gpuNum: + type: string + availableMemory: + type: string + totalMemory: + type: string + diskSize: + type: string + nodeList: type: array items: - $ref: '#/components/schemas/GoogleProtobufAny' - description: A list of messages that carry the error details. There is a common set of message types for APIs to use. - description: 'The `Status` type defines a logical error model that is suitable for different programming environments, including REST APIs and RPC APIs. It is used by [gRPC](https://github.com/grpc). Each `Status` message contains three pieces of data: error code, error message, and error details. You can find out more about this error model and how to work with it in the [API Design Guide](https://cloud.google.com/apis/design/errors).' - UpdateNodeStatusRequest: + $ref: '#/components/schemas/Nodes' + ResourcePoolListResponse: type: object properties: - nodeName: + data: + type: array + items: + $ref: '#/components/schemas/ResourcePoolListData' + ResourcePoolUpdateRequest: + type: object + properties: + poolId: type: string - status: + poolName: type: string - UpdateNodeStatusResponse: + nodes: + type: array + items: + $ref: '#/components/schemas/Nodes' + Status: type: object properties: code: type: integer + description: The status code, which should be an enum value of [google.rpc.Code][google.rpc.Code]. format: int32 message: type: string + description: A developer-facing error message, which should be in English. Any user-facing error message should be localized and sent in the [google.rpc.Status.details][google.rpc.Status.details] field, or localized by the client. + details: + type: array + items: + $ref: '#/components/schemas/GoogleProtobufAny' + description: A list of messages that carry the error details. There is a common set of message types for APIs to use. + description: 'The `Status` type defines a logical error model that is suitable for different programming environments, including REST APIs and RPC APIs. It is used by [gRPC](https://github.com/grpc). Each `Status` message contains three pieces of data: error code, error message, and error details. You can find out more about this error model and how to work with it in the [API Design Guide](https://cloud.google.com/apis/design/errors).' tags: - - name: Node + - name: ResourcePool