Compare commits
1 Commits
| Author | SHA1 | Date |
|---|---|---|
|
|
327d4f37a4 | 4 days ago |
|
After Width: | Height: | Size: 46 KiB |
@ -1,29 +1,51 @@
|
|||||||
@startuml
|
@startuml
|
||||||
title 日志诊断与自动修复流程
|
title 核心业务流程时序图 (Updated)
|
||||||
|
|
||||||
actor User
|
actor User
|
||||||
|
actor Admin
|
||||||
participant Frontend as FE
|
participant Frontend as FE
|
||||||
participant FastAPI as API
|
participant "FastAPI Auth" as Auth
|
||||||
participant Flume
|
participant "FastAPI Diagnosis" as Diag
|
||||||
database MySQL as DB
|
participant "Agents (DA/PA/RA)" as Agents
|
||||||
queue Redis
|
database PostgreSQL as DB
|
||||||
participant LLM
|
participant LLM
|
||||||
|
participant "Hadoop Cluster" as Cluster
|
||||||
|
|
||||||
Flume -> API : 推送结构化日志
|
== 用户注册与审批 ==
|
||||||
API -> DB : 写入 fault_record
|
User -> FE : 提交注册信息
|
||||||
FE -> API : 查询 /api/logs/query
|
FE -> Auth : POST /api/auth/register
|
||||||
API -> FE : 返回日志列表
|
Auth -> DB : 写入用户 (status=pending)
|
||||||
|
Admin -> FE : 查看审批队列
|
||||||
|
FE -> Auth : GET /api/auth/pending_users
|
||||||
|
Auth -> DB : 查询
|
||||||
|
Admin -> FE : 批准注册
|
||||||
|
FE -> Auth : POST /api/auth/approve/{uid}
|
||||||
|
Auth -> DB : 更新用户 (status=active)
|
||||||
|
User -> FE : 登录 (正确/错误凭据)
|
||||||
|
FE -> Auth : POST /api/auth/login
|
||||||
|
Auth -> DB : 校验
|
||||||
|
Auth -> FE : 返回 JWT / 错误提示
|
||||||
|
|
||||||
API -> LLM : call_llm_diagnose(logs)
|
== 故障诊断与自动修复 ==
|
||||||
LLM --> API : 返回 FixCommand(JSON)
|
Cluster -> Diag : 推送日志 (Flume/SSH)
|
||||||
API -> DB : 写入 exec_log
|
Diag -> DB : 记录 Fault (status=detected)
|
||||||
API -> Redis : 缓存/发布修复任务
|
User -> FE : 点击 "AI 诊断"
|
||||||
API -> FE : WebSocket 推送诊断结果
|
FE -> Diag : POST /api/diagnosis/trigger
|
||||||
|
Diag -> Agents : 调用 DiagnosisAgent
|
||||||
|
Agents -> LLM : 分析日志上下文
|
||||||
|
LLM -> Agents : 返回根因与建议 (FixCommand)
|
||||||
|
Agents -> Diag : 诊断结果 (Risk Level)
|
||||||
|
Diag -> DB : 更新 Fault (status=analyzing)
|
||||||
|
Diag -> FE : WebSocket 推送报告
|
||||||
|
|
||||||
FE -> API : /api/repair/execute
|
User -> FE : 确认执行修复 (High risk needs approval)
|
||||||
API -> "修复脚本" : 执行Shell/Hadoop命令
|
FE -> Diag : POST /api/repair/execute
|
||||||
"修复脚本" -> API : stdout/stderr
|
Diag -> Agents : 调用 PolicyAgent (评估风险)
|
||||||
API -> DB : 更新 exec_log
|
Agents -> Agents : 调用 RepairAgent
|
||||||
API -> FE : 返回执行结果
|
Agents -> Cluster : SSH 执行修复脚本
|
||||||
|
Cluster -> Agents : 返回 stdout/stderr
|
||||||
|
Agents -> Diag : 修复完成
|
||||||
|
Diag -> DB : 记录 ExecLog & 更新 Fault (status=resolved/failed)
|
||||||
|
Diag -> FE : 推送最终结果
|
||||||
|
|
||||||
@enduml
|
@enduml
|
||||||
|
After Width: | Height: | Size: 36 KiB |
@ -1,36 +1,52 @@
|
|||||||
@startuml
|
@startuml
|
||||||
title 故障检测系统总体架构
|
title 故障检测系统总体架构 (Aligned with Backend)
|
||||||
|
|
||||||
node "Hadoop Cluster" {
|
node "Hadoop Cluster" {
|
||||||
[NameNode]
|
[NameNode]
|
||||||
[DataNode] as DN1
|
[ResourceManager]
|
||||||
[DataNode] as DN2
|
[DataNode / NodeManager] as Node
|
||||||
}
|
}
|
||||||
|
|
||||||
cloud "Flume Agents" as Flume
|
cloud "Log & Metrics Collection" {
|
||||||
Flume --> DN1 : 采集HDFS/YARN日志
|
[Flume Agent] as Flume
|
||||||
Flume --> DN2 : 采集HDFS/YARN日志
|
[SSH Probe Service] as Probe
|
||||||
|
Flume --> Node : 采集 Hadoop 日志
|
||||||
component "FastAPI Service" as API
|
Probe --> Node : 采集系统指标 (CPU/Mem/Disk)
|
||||||
database "PostgreSQL" as DB
|
}
|
||||||
queue "Redis" as Cache
|
|
||||||
API --> DB : 写入/查询故障记录
|
package "Backend Service (FastAPI)" {
|
||||||
API --> Cache : 状态缓存/队列
|
component "Auth Router" as Auth
|
||||||
API --> "LLM Diagnose" : 调用大模型\n返回FixCommand
|
component "Cluster/Node Router" as ClusterSvc
|
||||||
|
component "Fault/Log Router" as FaultSvc
|
||||||
component "Agent Orchestrator" as Orchestrator
|
component "AI/Chat Router" as ChatSvc
|
||||||
component "Diagnosis Agent" as DA
|
|
||||||
component "Repair Agent" as RA
|
component "Orchestrator" as Orchestrator
|
||||||
component "Policy Agent" as PA
|
component "DiagnosisAgent" as DA
|
||||||
API --> Orchestrator : 触发诊断/修复流程
|
component "PolicyAgent" as PA
|
||||||
Orchestrator --> DA : 传递结构化日志
|
component "RepairAgent" as RA
|
||||||
Orchestrator --> PA : 风险评估与审批策略
|
|
||||||
Orchestrator --> RA : 下发修复命令
|
Auth --> [PostgreSQL] : users
|
||||||
DA --> "LLM Diagnose" : 调用LLM分析
|
ClusterSvc --> [PostgreSQL] : clusters, nodes
|
||||||
RA --> Cluster : SSH/命令执行
|
FaultSvc --> [PostgreSQL] : fault_records, hadoop_exec_logs
|
||||||
|
ChatSvc --> [PostgreSQL] : chat_sessions, chat_messages
|
||||||
component "Frontend Web (Vue/React + ECharts)" as FE
|
|
||||||
FE --> API : /api/cluster/status\n/api/logs/query\n/api/diagnosis/result\n/api/repair/execute
|
Orchestrator --> DA
|
||||||
API --> FE : WebSocket推送状态/诊断结果
|
Orchestrator --> PA
|
||||||
|
Orchestrator --> RA
|
||||||
|
|
||||||
|
DA --> [LLM Service] : LangChain / OpenAI
|
||||||
|
RA --> [Hadoop Cluster] : SSH (Paramiko)
|
||||||
|
}
|
||||||
|
|
||||||
|
package "Frontend (Vue 3)" {
|
||||||
|
component "Dashboard" as Dash
|
||||||
|
component "Diagnosis Chat" as DiagUI
|
||||||
|
component "Admin UI" as Admin
|
||||||
|
}
|
||||||
|
|
||||||
|
Dash --> ClusterSvc
|
||||||
|
DiagUI --> ChatSvc
|
||||||
|
Admin --> Auth
|
||||||
|
|
||||||
|
Orchestrator ..> [Redis] : 任务状态与实时推送
|
||||||
@enduml
|
@enduml
|
||||||
|
After Width: | Height: | Size: 10 KiB |
@ -1,45 +1,36 @@
|
|||||||
@startuml
|
@startuml
|
||||||
title 日志诊断与自动修复 - 活动图
|
title 故障生命周期状态机与修复流程 (Updated)
|
||||||
|
|
||||||
skinparam defaultFontName Microsoft YaHei
|
skinparam defaultFontName Microsoft YaHei
|
||||||
|
|
||||||
start
|
(*) --> "Detected (故障发现)" : 系统采集到异常日志/指标
|
||||||
:Flume采集日志;
|
|
||||||
:FastAPI接收并解析日志;
|
partition "AI 诊断阶段" {
|
||||||
:保存 FaultRecord 到 MySQL;
|
"Detected (故障发现)" --> "Analyzing (正在分析)" : 触发 AI 诊断 (DiagnosisAgent)
|
||||||
|
"Analyzing (正在分析)" --> "Diagnosed (已生成建议)" : LLM 分析完成并生成 FixCommand
|
||||||
partition "用户/系统触发" {
|
|
||||||
if (是否需要诊断?) then (是)
|
|
||||||
:聚合相关日志;
|
|
||||||
:构造 Prompt;
|
|
||||||
:调用 LLM 诊断;
|
|
||||||
:生成 FixCommand(JSON);
|
|
||||||
:安全校验(禁止高危命令);
|
|
||||||
else (否)
|
|
||||||
:等待新日志/用户请求;
|
|
||||||
stop
|
|
||||||
endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (风险等级 == high?) then (是)
|
partition "策略评估与修复阶段" {
|
||||||
:前端弹窗请求人工确认;
|
"Diagnosed (已生成建议)" --> "Risk Assessment (PolicyAgent)"
|
||||||
if (用户确认执行?) then (是)
|
|
||||||
:继续执行修复;
|
if "风险等级" then
|
||||||
else (否)
|
-->[High] "Pending Approval (待审批)"
|
||||||
:记录并通知未执行;
|
-->[Approved] "Repairing (修复中)"
|
||||||
stop
|
else
|
||||||
|
-->[Low/Medium] "Repairing (修复中)"
|
||||||
endif
|
endif
|
||||||
endif
|
|
||||||
|
|
||||||
:修复前预检查(配置/路径/权限);
|
"Repairing (修复中)" --> "Executing (RepairAgent)"
|
||||||
if (预检查通过?) then (是)
|
"Executing (RepairAgent)" --> "Post-Check (修复后校验)"
|
||||||
:执行修复脚本;
|
}
|
||||||
:采集stdout/stderr;
|
|
||||||
:保存 ExecLog 到 MySQL;
|
"Post-Check (修复后校验)" --> if "是否修复成功?" then
|
||||||
:更新状态到 Redis 并推送 WebSocket;
|
-->[Yes] "Resolved (已解决)"
|
||||||
else (否)
|
--> (*)
|
||||||
:记录失败原因;
|
else
|
||||||
|
-->[No] "Failed (修复失败)"
|
||||||
|
--> "Manual Intervention (需人工介入)"
|
||||||
|
--> (*)
|
||||||
endif
|
endif
|
||||||
|
|
||||||
:返回结果给前端;
|
|
||||||
stop
|
|
||||||
@enduml
|
@enduml
|
||||||
|
After Width: | Height: | Size: 22 KiB |
@ -1,38 +1,42 @@
|
|||||||
@startuml
|
@startuml
|
||||||
title 故障检测系统 - 用例图
|
title 故障检测系统用例图 (Updated)
|
||||||
|
|
||||||
skinparam defaultFontName Microsoft YaHei
|
skinparam defaultFontName Microsoft YaHei
|
||||||
|
|
||||||
actor 运维工程师 as Ops
|
actor "访客" as Guest
|
||||||
actor 前端用户 as User
|
actor "管理员" as Admin
|
||||||
actor 测试工程师 as QA
|
actor "运维人员/操作员" as Operator
|
||||||
|
actor "观察员" as Observer
|
||||||
rectangle "故障检测系统" {
|
|
||||||
usecase "查看集群状态" as UC_Status
|
rectangle "故障检测与诊断系统" {
|
||||||
usecase "查询日志" as UC_QueryLogs
|
(注册与登录) as UC_Auth
|
||||||
usecase "发起故障诊断" as UC_Diagnose
|
(注册审批) as UC_Approve
|
||||||
usecase "执行自动修复" as UC_Repair
|
(集群管理) as UC_Cluster
|
||||||
usecase "查看执行日志" as UC_ExecLogs
|
(实时监控) as UC_Monitor
|
||||||
usecase "配置Flume收集" as UC_ConfigFlume
|
(日志检索) as UC_Logs
|
||||||
usecase "配置告警阈值" as UC_ConfigAlert
|
(AI 故障诊断) as UC_Diag
|
||||||
usecase "导出故障与诊断报告" as UC_Export
|
(自动/手动修复) as UC_Repair
|
||||||
usecase "生成FixCommand" as UC_FixCmd
|
(操作审计) as UC_Audit
|
||||||
usecase "命令安全校验" as UC_SafeCheck
|
(用户权限管理) as UC_UserMgmt
|
||||||
|
|
||||||
User --> UC_Status
|
Guest --> UC_Auth
|
||||||
User --> UC_QueryLogs
|
|
||||||
User --> UC_Diagnose
|
Admin --> UC_Approve
|
||||||
User --> UC_Repair
|
Admin --> UC_UserMgmt
|
||||||
User --> UC_ExecLogs
|
Admin --> UC_Audit
|
||||||
|
Admin --> UC_Cluster
|
||||||
Ops --> UC_ConfigFlume
|
|
||||||
Ops --> UC_ConfigAlert
|
Operator --> UC_Monitor
|
||||||
Ops --> UC_Repair
|
Operator --> UC_Logs
|
||||||
Ops --> UC_Status
|
Operator --> UC_Diag
|
||||||
|
Operator --> UC_Repair
|
||||||
QA --> UC_QueryLogs
|
Operator --> UC_Cluster
|
||||||
QA --> UC_Export
|
|
||||||
|
Observer --> UC_Monitor
|
||||||
UC_Diagnose --> UC_FixCmd : <<include>>
|
Observer --> UC_Logs
|
||||||
UC_Repair --> UC_SafeCheck : <<include>>
|
|
||||||
|
UC_Diag ..> (LLM 根因分析) : <<include>>
|
||||||
|
UC_Repair ..> (风险评估审批) : <<include>>
|
||||||
}
|
}
|
||||||
|
|
||||||
@enduml
|
@enduml
|
||||||
|
After Width: | Height: | Size: 54 KiB |
@ -1,130 +1,112 @@
|
|||||||
@startuml
|
@startuml
|
||||||
title 故障检测与自动修复 - 类图
|
title 故障检测与自动修复 - 领域模型类图 (Updated from Code)
|
||||||
skinparam backgroundColor #FFFFFF
|
|
||||||
skinparam defaultFontName Microsoft YaHei
|
|
||||||
skinparam classAttributeIconSize 0
|
|
||||||
|
|
||||||
class FlumeAgent {
|
|
||||||
+config : Map
|
|
||||||
+start()
|
|
||||||
+stop()
|
|
||||||
}
|
|
||||||
|
|
||||||
class LogEvent {
|
|
||||||
+timestamp : datetime
|
|
||||||
+host : string
|
|
||||||
+source : string
|
|
||||||
+level : string
|
|
||||||
+message : string
|
|
||||||
+raw : text
|
|
||||||
}
|
|
||||||
|
|
||||||
class FastAPIService {
|
|
||||||
+ingestLog(e: LogEvent)
|
|
||||||
+getClusterStatus()
|
|
||||||
+queryLogs(filter)
|
|
||||||
+diagnose(logs)
|
|
||||||
+executeRepair(cmd: FixCommand)
|
|
||||||
}
|
|
||||||
|
|
||||||
class DiagnosisService {
|
|
||||||
+callLLM(logs) : FixCommand
|
|
||||||
+validateCommand(cmd: FixCommand) : bool
|
|
||||||
}
|
|
||||||
|
|
||||||
class LLMClient {
|
|
||||||
+apiKey : string
|
|
||||||
+endpoint : string
|
|
||||||
+invoke(prompt) : string
|
|
||||||
}
|
|
||||||
|
|
||||||
class FixCommand {
|
|
||||||
+fault_type : string
|
|
||||||
+reason : string
|
|
||||||
+fix_script : string
|
|
||||||
+risk_level : RiskLevel
|
|
||||||
}
|
|
||||||
|
|
||||||
enum RiskLevel {
|
|
||||||
low
|
|
||||||
medium
|
|
||||||
high
|
|
||||||
}
|
|
||||||
|
|
||||||
class RepairExecutor {
|
|
||||||
+run(script) : ExecResult
|
|
||||||
+precheck() : bool
|
|
||||||
}
|
|
||||||
|
|
||||||
class ExecResult {
|
|
||||||
+stdout : text
|
|
||||||
+stderr : text
|
|
||||||
+exitCode : int
|
|
||||||
}
|
|
||||||
|
|
||||||
class FaultRecord {
|
|
||||||
+id : int
|
|
||||||
+fault_type : string
|
|
||||||
+reason : string
|
|
||||||
+timestamp : datetime
|
|
||||||
+node : string
|
|
||||||
}
|
|
||||||
|
|
||||||
class ExecLog {
|
|
||||||
+id : int
|
|
||||||
+record_id : int
|
|
||||||
+stdout : text
|
|
||||||
+stderr : text
|
|
||||||
+timestamp : datetime
|
|
||||||
}
|
|
||||||
|
|
||||||
class MySQLClient {
|
|
||||||
+saveFault(record: FaultRecord)
|
|
||||||
+saveExecLog(log: ExecLog)
|
|
||||||
+queryLogs(filter)
|
|
||||||
}
|
|
||||||
|
|
||||||
class RedisCache {
|
|
||||||
+set(key, value)
|
|
||||||
+publish(channel, msg)
|
|
||||||
+get(key)
|
|
||||||
}
|
|
||||||
|
|
||||||
class ClusterStatus {
|
|
||||||
+nodesUp : int
|
|
||||||
+nodesDown : int
|
|
||||||
+hdfsUsage : float
|
|
||||||
+yarnActiveApps : int
|
|
||||||
}
|
|
||||||
|
|
||||||
class FrontendWeb {
|
skinparam classAttributeIconSize 0
|
||||||
+viewStatus()
|
|
||||||
+queryLogs()
|
|
||||||
+requestDiagnosis()
|
|
||||||
+executeRepair()
|
|
||||||
}
|
|
||||||
|
|
||||||
FlumeAgent --> FastAPIService : push(LogEvent)
|
package "Models (SQLAlchemy)" {
|
||||||
FastAPIService --> DiagnosisService : diagnose(logs)
|
class User {
|
||||||
DiagnosisService --> LLMClient : call_llm_diagnose
|
+id : int <<PK>>
|
||||||
DiagnosisService --> FixCommand : returns
|
+username : string
|
||||||
FastAPIService --> RepairExecutor : execute(FixCommand)
|
+email : string
|
||||||
RepairExecutor --> ExecResult : returns
|
+password_hash : string
|
||||||
FastAPIService --> MySQLClient : save FaultRecord/ExecLog
|
+full_name : string
|
||||||
FastAPIService --> RedisCache : cache/publish status
|
+is_active : bool
|
||||||
FrontendWeb --> FastAPIService : REST/WebSocket
|
+last_login : TIMESTAMP
|
||||||
FastAPIService --> ClusterStatus : compose
|
+created_at : TIMESTAMP
|
||||||
MySQLClient --> FaultRecord
|
}
|
||||||
MySQLClient --> ExecLog
|
|
||||||
FixCommand --> RiskLevel
|
class Cluster {
|
||||||
|
+id : int <<PK>>
|
||||||
|
+uuid : string <<Unique>>
|
||||||
|
+name : string
|
||||||
|
+type : string
|
||||||
|
+node_count : int
|
||||||
|
+health_status : string
|
||||||
|
+cpu_avg : float
|
||||||
|
+memory_avg : float
|
||||||
|
+namenode_ip : INET
|
||||||
|
+rm_ip : INET
|
||||||
|
+config_info : JSONB
|
||||||
|
+to_dict() : dict
|
||||||
|
}
|
||||||
|
|
||||||
|
class Node {
|
||||||
|
+id : int <<PK>>
|
||||||
|
+uuid : string <<Unique>>
|
||||||
|
+cluster_id : int <<FK>>
|
||||||
|
+hostname : string
|
||||||
|
+ip_address : INET
|
||||||
|
+ssh_user : string
|
||||||
|
+ssh_password : string
|
||||||
|
+status : string
|
||||||
|
+cpu_usage : float
|
||||||
|
+memory_usage : float
|
||||||
|
+disk_usage : float
|
||||||
|
+last_heartbeat : TIMESTAMP
|
||||||
|
}
|
||||||
|
|
||||||
|
class FaultRecord {
|
||||||
|
+id : int <<PK>>
|
||||||
|
+fault_id : string <<Unique>>
|
||||||
|
+cluster_id : int <<FK>>
|
||||||
|
+fault_type : string
|
||||||
|
+fault_level : string
|
||||||
|
+title : string
|
||||||
|
+description : string
|
||||||
|
+affected_nodes : JSONB
|
||||||
|
+affected_clusters : JSONB
|
||||||
|
+root_cause : string
|
||||||
|
+repair_suggestion : string
|
||||||
|
+status : string
|
||||||
|
+reporter : string
|
||||||
|
+to_dict() : dict
|
||||||
|
}
|
||||||
|
|
||||||
|
class HadoopExecLog {
|
||||||
|
+id : int <<PK>>
|
||||||
|
+from_user_id : int <<FK>>
|
||||||
|
+cluster_name : string
|
||||||
|
+description : text
|
||||||
|
+start_time : TIMESTAMP
|
||||||
|
+end_time : TIMESTAMP
|
||||||
|
+to_dict() : dict
|
||||||
|
}
|
||||||
|
|
||||||
|
class ChatSession {
|
||||||
|
+id : string <<PK>> (UUID)
|
||||||
|
+user_id : int <<FK>>
|
||||||
|
+title : string
|
||||||
|
+created_at : DateTime
|
||||||
|
+messages : List<ChatMessage>
|
||||||
|
}
|
||||||
|
|
||||||
|
class ChatMessage {
|
||||||
|
+id : int <<PK>>
|
||||||
|
+session_id : string <<FK>>
|
||||||
|
+role : string (system/user/assistant/tool)
|
||||||
|
+content : text
|
||||||
|
+created_at : DateTime
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
package "Agents (Logic)" {
|
||||||
|
class DiagnosisAgent {
|
||||||
|
+analyze(logs) : FixCommand
|
||||||
|
}
|
||||||
|
|
||||||
|
class PolicyAgent {
|
||||||
|
+evaluate(cmd) : RiskLevel
|
||||||
|
}
|
||||||
|
|
||||||
|
class RepairAgent {
|
||||||
|
+execute(cmd) : ExecResult
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
User "1" -- "0..*" HadoopExecLog : executes
|
||||||
|
User "1" -- "0..*" ChatSession : owns
|
||||||
|
Cluster "1" -- "0..*" Node : contains
|
||||||
|
Cluster "1" -- "0..*" FaultRecord : has
|
||||||
|
ChatSession "1" -- "0..*" ChatMessage : has_many
|
||||||
|
|
||||||
note right of FixCommand
|
|
||||||
JSON 示例:
|
|
||||||
{
|
|
||||||
fault_type: "DataNode故障",
|
|
||||||
reason: "磁盘占满",
|
|
||||||
fix_script: "ssh dn 'clean_temp.sh'",
|
|
||||||
risk_level: "medium"
|
|
||||||
}
|
|
||||||
end note
|
|
||||||
@enduml
|
@enduml
|
||||||
|
After Width: | Height: | Size: 24 KiB |
@ -1,25 +1,35 @@
|
|||||||
@startuml
|
@startuml
|
||||||
title 部署拓扑
|
title 故障检测系统部署拓扑 (Updated)
|
||||||
|
|
||||||
node "On-Prem / Cloud" {
|
node "Hadoop Cluster Node" {
|
||||||
node "Hadoop Cluster" {
|
component "Hadoop Components" as HC
|
||||||
[NameNode]
|
component "Flume Agent" as Flume
|
||||||
[DataNodes...]
|
HC - [SSH]
|
||||||
}
|
}
|
||||||
|
|
||||||
node "Logging Layer" {
|
node "Management Server" {
|
||||||
[Flume Agents]
|
package "Backend (Docker Container)" {
|
||||||
|
[FastAPI Service] as API
|
||||||
|
[Celery Workers] as Workers
|
||||||
}
|
}
|
||||||
|
|
||||||
node "Application Layer" {
|
package "Frontend (Docker Container)" {
|
||||||
[FastAPI]
|
[Nginx / Vue App] as Web
|
||||||
[LLM Connector]
|
|
||||||
[Nginx for Frontend]
|
|
||||||
}
|
}
|
||||||
|
|
||||||
node "Storage/Caching" {
|
database "PostgreSQL" as DB
|
||||||
[MySQL]
|
queue "Redis" as Redis
|
||||||
[Redis]
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
cloud "AI Platform" {
|
||||||
|
[OpenAI API / LangChain] as LLM
|
||||||
|
}
|
||||||
|
|
||||||
|
Web --> API : HTTP/WebSocket
|
||||||
|
API --> DB : Persistence
|
||||||
|
API --> Redis : Task Queue
|
||||||
|
API --> LLM : AI Analysis
|
||||||
|
API --> HC : SSH Execution
|
||||||
|
Flume --> API : Log Streaming
|
||||||
|
|
||||||
@enduml
|
@enduml
|
||||||
Binary file not shown.
@ -0,0 +1,65 @@
|
|||||||
|
# 基于 Hadoop 的故障检测与智能诊断项目 - 测试报告
|
||||||
|
|
||||||
|
## 修订记录
|
||||||
|
|
||||||
|
| 版本号 | 修订日期 | 修订内容 | 修订人 |
|
||||||
|
| :--- | :--- | :--- | :--- |
|
||||||
|
| v1.0.0 | 2026-01-10 | 初始测试报告框架创建 | AI Assistant |
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 1. 测试概述
|
||||||
|
|
||||||
|
### 1.1 测试目的
|
||||||
|
验证系统在 Hadoop 集群管理、指标采集、日志检索及 AI 诊断功能上的正确性、稳定性和响应速度,确保满足《需求规格说明书》中定义的各项要求。
|
||||||
|
|
||||||
|
### 1.2 测试范围
|
||||||
|
- **功能测试**: 集群注册、SSH 校验、日志同步、AI SSE 流式对话。
|
||||||
|
- **性能测试**: 大规模日志检索响应时间、高并发指标采集压力。
|
||||||
|
|
||||||
|
## 2. 测试环境
|
||||||
|
|
||||||
|
| 类别 | 配置要求 |
|
||||||
|
| :--- | :--- |
|
||||||
|
| **硬件** | 8 vCPU, 16GB RAM (测试服务器) |
|
||||||
|
| **软件** | Docker, PostgreSQL 14, Python 3.10 |
|
||||||
|
| **集群环境** | Hadoop 3.1.3 (1 NameNode, 5 DataNodes) |
|
||||||
|
|
||||||
|
## 3. 测试用例
|
||||||
|
|
||||||
|
| 用例编号 | 功能模块 | 测试点 | 预期结果 | 状态 |
|
||||||
|
| :--- | :--- | :--- | :--- | :--- |
|
||||||
|
| TC-01 | 集群管理 | 输入合法的 SSH 信息注册集群 | 注册成功并自动发现所有节点 | [待测试] |
|
||||||
|
| TC-02 | 日志采集 | 模拟节点产生 ERROR 日志 | 数据库 5 秒内出现对应增量日志 | [待测试] |
|
||||||
|
| TC-03 | AI 诊断 | 询问集群负载情况 | AI 正确调用指标工具并给出分析建议 | [待测试] |
|
||||||
|
|
||||||
|
## 4. 测试结果
|
||||||
|
|
||||||
|
### 4.1 通过/失败统计
|
||||||
|
- **总用例数**: 3(TC-01 ~ TC-03,均待执行)
|
||||||
|
- **通过数**: 0
|
||||||
|
- **失败数**: 0
|
||||||
|
- **跳过数**: 0
|
||||||
|
- **通过率**: 0%
|
||||||
|
|
||||||
|
## 5. 缺陷分析
|
||||||
|
|
||||||
|
### 5.1 严重等级分布
|
||||||
|
- **致命 (Blocker)**: 0
|
||||||
|
- **严重 (Critical)**: 0
|
||||||
|
- **一般 (Major)**: 0
|
||||||
|
- **次要 (Minor)**: 0
|
||||||
|
|
||||||
|
## 6. 测试结论
|
||||||
|
|
||||||
|
### 6.1 质量评估
|
||||||
|
[在此填写本次测试阶段的总体质量评价,例如:系统核心流程已打通,但 AI 诊断在极端日志量下存在响应延迟。]
|
||||||
|
|
||||||
|
## 7. 附录
|
||||||
|
|
||||||
|
### 7.1 测试日志
|
||||||
|
- 后端服务日志: `backend/logs/test_run.log`
|
||||||
|
- 采集器性能统计: `docs/metrics_report.csv`
|
||||||
|
|
||||||
|
### 7.2 截图
|
||||||
|
[占位符:插入关键功能运行截图]
|
||||||
Binary file not shown.
Loading…
Reference in new issue