@ -1,49 +0,0 @@
|
||||
name: frontend-vue-ci
|
||||
|
||||
on:
|
||||
push:
|
||||
paths:
|
||||
- "frontend-vue/**"
|
||||
- ".github/workflows/frontend-vue-ci.yml"
|
||||
pull_request:
|
||||
paths:
|
||||
- "frontend-vue/**"
|
||||
- ".github/workflows/frontend-vue-ci.yml"
|
||||
|
||||
jobs:
|
||||
test:
|
||||
runs-on: ubuntu-latest
|
||||
defaults:
|
||||
run:
|
||||
working-directory: frontend-vue
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/setup-node@v4
|
||||
with:
|
||||
node-version: 20
|
||||
cache: "pnpm"
|
||||
cache-dependency-path: frontend-vue/pnpm-lock.yaml
|
||||
- name: Enable corepack
|
||||
run: corepack enable
|
||||
- name: Install
|
||||
run: pnpm install --frozen-lockfile
|
||||
- name: Lint
|
||||
run: pnpm run lint
|
||||
- name: Typecheck
|
||||
run: pnpm run typecheck
|
||||
- name: Test
|
||||
run: pnpm run test
|
||||
- name: Install Playwright Browsers
|
||||
run: pnpm exec playwright install --with-deps chromium
|
||||
- name: E2E
|
||||
run: pnpm run e2e:ui
|
||||
- name: Upload Playwright Report
|
||||
if: failure()
|
||||
uses: actions/upload-artifact@v4
|
||||
with:
|
||||
name: frontend-vue-playwright-report
|
||||
path: |
|
||||
frontend-vue/playwright-report
|
||||
frontend-vue/test-results
|
||||
- name: Build
|
||||
run: pnpm run build
|
||||
@ -1,10 +0,0 @@
|
||||
# Ignore temporary Vue3 playground directory
|
||||
src/fronted/vue3/
|
||||
frontend-vue/node_modules/
|
||||
frontend-vue/dist/
|
||||
frontend-vue/.vite/
|
||||
frontend-vue/test-results/
|
||||
frontend-vue/playwright-report/
|
||||
.venv/
|
||||
__pycache__/
|
||||
*.pyc
|
||||
@ -1,11 +0,0 @@
|
||||
DATABASE_URL=postgresql+asyncpg://postgres:n7mdf4c5@dbconn.sealoshzh.site:38596/hadoop_fault_db
|
||||
DB_HOST=dbconn.sealoshzh.site
|
||||
DB_PORT=38596
|
||||
DB_NAME=hadoop_fault_db
|
||||
DB_USER=postgres
|
||||
DB_PASSWORD=n7mdf4c5
|
||||
LLM_PROVIDER=siliconflow
|
||||
LLM_API_KEY=sk-nmycwvibqotsoykzyxudcexkxwkechzdglksiynrkwfgwyqx
|
||||
LLM_ENDPOINT=https://api.siliconflow.cn/v1
|
||||
LLM_MODEL=deepseek-ai/DeepSeek-V3
|
||||
LLM_TIMEOUT=300
|
||||
@ -1 +0,0 @@
|
||||
|
||||
@ -1,45 +0,0 @@
|
||||
import os
|
||||
import json
|
||||
from dotenv import load_dotenv
|
||||
from typing import Dict, Tuple
|
||||
from datetime import datetime
|
||||
from zoneinfo import ZoneInfo
|
||||
|
||||
load_dotenv()
|
||||
|
||||
# Timezone Configuration
|
||||
APP_TIMEZONE = os.getenv("APP_TIMEZONE", "Asia/Shanghai")
|
||||
BJ_TZ = ZoneInfo(APP_TIMEZONE)
|
||||
|
||||
def now_bj() -> datetime:
|
||||
return datetime.now(BJ_TZ)
|
||||
|
||||
# Database Configuration
|
||||
_db_url = os.getenv("DATABASE_URL")
|
||||
if not _db_url:
|
||||
_host = os.getenv("DB_HOST")
|
||||
_port = os.getenv("DB_PORT")
|
||||
_name = os.getenv("DB_NAME")
|
||||
_user = os.getenv("DB_USER")
|
||||
_password = os.getenv("DB_PASSWORD")
|
||||
if all([_host, _port, _name, _user, _password]):
|
||||
_db_url = f"postgresql+asyncpg://{_user}:{_password}@{_host}:{_port}/{_name}"
|
||||
else:
|
||||
_db_url = "postgresql+asyncpg://postgres:password@localhost:5432/hadoop_fault_db"
|
||||
|
||||
DATABASE_URL = _db_url
|
||||
SYNC_DATABASE_URL = _db_url.replace("postgresql+asyncpg://", "postgresql://")
|
||||
|
||||
# JWT Configuration
|
||||
JWT_SECRET = os.getenv("JWT_SECRET", "dev-secret")
|
||||
JWT_EXPIRE_MINUTES = int(os.getenv("JWT_EXPIRE_MINUTES", "60"))
|
||||
|
||||
|
||||
# SSH Configuration
|
||||
SSH_PORT = int(os.getenv("SSH_PORT", "22"))
|
||||
SSH_TIMEOUT = int(os.getenv("SSH_TIMEOUT", "10"))
|
||||
|
||||
ssh_port = SSH_PORT
|
||||
ssh_timeout = SSH_TIMEOUT
|
||||
|
||||
LOG_DIR = os.getenv("HADOOP_LOG_DIR", "/usr/local/hadoop/logs")
|
||||
@ -1,15 +0,0 @@
|
||||
from sqlalchemy.ext.asyncio import create_async_engine, async_sessionmaker, AsyncSession
|
||||
from .config import DATABASE_URL, APP_TIMEZONE
|
||||
|
||||
engine = create_async_engine(
|
||||
DATABASE_URL,
|
||||
echo=False,
|
||||
pool_pre_ping=True,
|
||||
connect_args={"server_settings": {"timezone": APP_TIMEZONE}},
|
||||
)
|
||||
SessionLocal = async_sessionmaker(engine, expire_on_commit=False, class_=AsyncSession)
|
||||
|
||||
async def get_db() -> AsyncSession:
|
||||
"""获取一个异步数据库会话,用于依赖注入。"""
|
||||
async with SessionLocal() as session:
|
||||
yield session
|
||||
Binary file not shown.
@ -1,336 +0,0 @@
|
||||
import threading
|
||||
import time
|
||||
import uuid
|
||||
import datetime
|
||||
from typing import Dict, List, Optional
|
||||
from sqlalchemy.ext.asyncio import AsyncSession, create_async_engine, async_sessionmaker, AsyncEngine
|
||||
from .log_reader import log_reader
|
||||
from .ssh_utils import ssh_manager
|
||||
from .db import SessionLocal
|
||||
from .models.hadoop_logs import HadoopLog
|
||||
from sqlalchemy import text
|
||||
import asyncio
|
||||
from .config import BJ_TZ, DATABASE_URL, APP_TIMEZONE
|
||||
|
||||
class LogCollector:
|
||||
"""Real-time log collector for Hadoop cluster"""
|
||||
|
||||
def __init__(self):
|
||||
self.collectors: Dict[str, threading.Thread] = {}
|
||||
self.is_running: bool = False
|
||||
self.collection_interval: int = 5 # 默认采集间隔,单位:秒
|
||||
self._loops: Dict[str, asyncio.AbstractEventLoop] = {}
|
||||
self._engines: Dict[str, AsyncEngine] = {}
|
||||
self._session_locals: Dict[str, async_sessionmaker[AsyncSession]] = {}
|
||||
self._intervals: Dict[str, int] = {}
|
||||
self._cluster_name_cache: Dict[str, str] = {}
|
||||
self._targets: Dict[str, str] = {}
|
||||
self._line_counts: Dict[str, int] = {}
|
||||
self.max_bytes_per_pull: int = 256 * 1024
|
||||
|
||||
def start_collection(self, node_name: str, log_type: str, ip: Optional[str] = None, interval: Optional[int] = None) -> bool:
|
||||
"""Start real-time log collection for a specific node and log type"""
|
||||
collector_id = f"{node_name}_{log_type}"
|
||||
if interval is not None:
|
||||
self._intervals[collector_id] = max(1, int(interval))
|
||||
|
||||
if collector_id in self.collectors and self.collectors[collector_id].is_alive():
|
||||
print(f"Collector {collector_id} is already running")
|
||||
return False
|
||||
|
||||
# Start even if log file not yet exists; collector will self-check in loop
|
||||
|
||||
# Create a new collector thread
|
||||
collector_thread = threading.Thread(
|
||||
target=self._collect_logs,
|
||||
args=(node_name, log_type, ip),
|
||||
name=collector_id,
|
||||
daemon=True
|
||||
)
|
||||
|
||||
self.collectors[collector_id] = collector_thread
|
||||
collector_thread.start()
|
||||
print(f"Started collector {collector_id}")
|
||||
return True
|
||||
|
||||
def stop_collection(self, node_name: str, log_type: str):
|
||||
"""Stop log collection for a specific node and log type"""
|
||||
collector_id = f"{node_name}_{log_type}"
|
||||
|
||||
if collector_id in self.collectors:
|
||||
# Threads are daemon, so they will exit when main process exits
|
||||
# We just remove it from our tracking
|
||||
del self.collectors[collector_id]
|
||||
self._intervals.pop(collector_id, None)
|
||||
print(f"Stopped collector {collector_id}")
|
||||
else:
|
||||
print(f"Collector {collector_id} is not running")
|
||||
|
||||
def stop_all_collections(self):
|
||||
"""Stop all log collections"""
|
||||
for collector_id in list(self.collectors.keys()):
|
||||
self.stop_collection(*collector_id.split("_"))
|
||||
|
||||
def _parse_log_line(self, line: str, node_name: str, log_type: str):
|
||||
"""Parse a single log line and return a dictionary of log fields"""
|
||||
# Extract timestamp from the log line (format: [2023-12-17 10:00:00,123])
|
||||
timestamp = None
|
||||
log_level = "INFO" # Default log level
|
||||
message = line
|
||||
exception = None
|
||||
|
||||
# Simple log parsing logic
|
||||
if line.startswith('['):
|
||||
# Extract timestamp
|
||||
timestamp_end = line.find(']', 1)
|
||||
if timestamp_end > 0:
|
||||
timestamp_str = line[1:timestamp_end]
|
||||
try:
|
||||
timestamp = datetime.datetime.strptime(timestamp_str, "%Y-%m-%d %H:%M:%S,%f").replace(tzinfo=BJ_TZ)
|
||||
except ValueError:
|
||||
# If parsing fails, use current time
|
||||
timestamp = datetime.datetime.now(BJ_TZ)
|
||||
|
||||
# Extract log level
|
||||
log_levels = ["ERROR", "WARN", "INFO", "DEBUG", "TRACE"]
|
||||
for level in log_levels:
|
||||
if f" {level} " in line:
|
||||
log_level = level
|
||||
break
|
||||
|
||||
return {
|
||||
"timestamp": timestamp or datetime.datetime.now(BJ_TZ),
|
||||
"log_level": log_level,
|
||||
"message": message,
|
||||
"host": node_name,
|
||||
"service": log_type,
|
||||
"raw_log": line
|
||||
}
|
||||
|
||||
async def _save_log_to_db(self, log_data: Dict, collector_id: str | None = None):
|
||||
"""Save log data to database"""
|
||||
try:
|
||||
session_local = self._session_locals.get(collector_id) if collector_id else None
|
||||
async with (session_local() if session_local else SessionLocal()) as session:
|
||||
# 获取集群名称
|
||||
host = log_data["host"]
|
||||
cluster_name = self._cluster_name_cache.get(host)
|
||||
if not cluster_name:
|
||||
cluster_res = await session.execute(text("""
|
||||
SELECT c.name
|
||||
FROM clusters c
|
||||
JOIN nodes n ON c.id = n.cluster_id
|
||||
WHERE n.hostname = :hn LIMIT 1
|
||||
"""), {"hn": host})
|
||||
cluster_row = cluster_res.first()
|
||||
cluster_name = cluster_row[0] if cluster_row else "default_cluster"
|
||||
self._cluster_name_cache[host] = cluster_name
|
||||
|
||||
# Create HadoopLog instance
|
||||
hadoop_log = HadoopLog(
|
||||
log_time=log_data["timestamp"],
|
||||
node_host=log_data["host"],
|
||||
title=log_data["service"],
|
||||
info=log_data["message"],
|
||||
cluster_name=cluster_name
|
||||
)
|
||||
|
||||
# Add to session and commit
|
||||
session.add(hadoop_log)
|
||||
await session.commit()
|
||||
except Exception as e:
|
||||
print(f"Error saving log to database: {e}")
|
||||
|
||||
async def _save_logs_to_db_batch(self, logs: List[Dict], collector_id: str | None = None):
|
||||
"""Save a batch of logs to database in one transaction"""
|
||||
try:
|
||||
session_local = self._session_locals.get(collector_id) if collector_id else None
|
||||
async with (session_local() if session_local else SessionLocal()) as session:
|
||||
host = logs[0]["host"] if logs else None
|
||||
cluster_name = self._cluster_name_cache.get(host) if host else None
|
||||
if host and not cluster_name:
|
||||
cluster_res = await session.execute(text("""
|
||||
SELECT c.name
|
||||
FROM clusters c
|
||||
JOIN nodes n ON c.id = n.cluster_id
|
||||
WHERE n.hostname = :hn LIMIT 1
|
||||
"""), {"hn": host})
|
||||
cluster_row = cluster_res.first()
|
||||
cluster_name = cluster_row[0] if cluster_row else "default_cluster"
|
||||
self._cluster_name_cache[host] = cluster_name
|
||||
|
||||
objs: list[HadoopLog] = []
|
||||
for log_data in logs:
|
||||
objs.append(HadoopLog(
|
||||
log_time=log_data["timestamp"],
|
||||
node_host=log_data["host"],
|
||||
title=log_data["service"],
|
||||
info=log_data["message"],
|
||||
cluster_name=cluster_name or "default_cluster",
|
||||
))
|
||||
session.add_all(objs)
|
||||
await session.commit()
|
||||
except Exception as e:
|
||||
print(f"Error batch saving logs: {e}")
|
||||
|
||||
def _collect_logs(self, node_name: str, log_type: str, ip: str):
|
||||
"""Internal method to collect logs continuously"""
|
||||
print(f"Starting log collection for {node_name}_{log_type}")
|
||||
|
||||
collector_id = f"{node_name}_{log_type}"
|
||||
loop = asyncio.new_event_loop()
|
||||
asyncio.set_event_loop(loop)
|
||||
self._loops[collector_id] = loop
|
||||
engine = create_async_engine(
|
||||
DATABASE_URL,
|
||||
echo=False,
|
||||
pool_pre_ping=True,
|
||||
connect_args={"server_settings": {"timezone": APP_TIMEZONE}},
|
||||
pool_size=1,
|
||||
max_overflow=0,
|
||||
)
|
||||
self._engines[collector_id] = engine
|
||||
self._session_locals[collector_id] = async_sessionmaker(engine, expire_on_commit=False, class_=AsyncSession)
|
||||
|
||||
last_remote_size = 0
|
||||
retry_count = 0
|
||||
max_retries = 3
|
||||
|
||||
while collector_id in self.collectors:
|
||||
try:
|
||||
# Wait for next collection interval
|
||||
interval = self._intervals.get(collector_id, self.collection_interval)
|
||||
time.sleep(interval)
|
||||
|
||||
# Resolve target file once and reuse
|
||||
target = self._targets.get(collector_id)
|
||||
if not target:
|
||||
try:
|
||||
ssh_client = ssh_manager.get_connection(node_name, ip=ip)
|
||||
dirs = [
|
||||
"/opt/module/hadoop-3.1.3/logs",
|
||||
"/usr/local/hadoop/logs",
|
||||
"/usr/local/hadoop-3.3.6/logs",
|
||||
"/usr/local/hadoop-3.3.5/logs",
|
||||
"/usr/local/hadoop-3.1.3/logs",
|
||||
"/opt/hadoop/logs",
|
||||
"/var/log/hadoop",
|
||||
]
|
||||
for d in dirs:
|
||||
out, err = ssh_client.execute_command(f"ls -1 {d} 2>/dev/null")
|
||||
if not err and out.strip():
|
||||
for fn in out.splitlines():
|
||||
f = fn.lower()
|
||||
if log_type in f and node_name in f:
|
||||
target = f"{d}/{fn}"
|
||||
break
|
||||
if target:
|
||||
break
|
||||
if target:
|
||||
self._targets[collector_id] = target
|
||||
except Exception:
|
||||
target = None
|
||||
if not target:
|
||||
print(f"Log file {node_name}_{log_type} not found, will retry")
|
||||
retry_count += 1
|
||||
continue
|
||||
|
||||
ssh_client = ssh_manager.get_connection(node_name, ip=ip)
|
||||
|
||||
size_out, size_err = ssh_client.execute_command(f"stat -c %s {target} 2>/dev/null")
|
||||
if size_err:
|
||||
retry_count += 1
|
||||
continue
|
||||
try:
|
||||
remote_size = int((size_out or "").strip())
|
||||
except Exception:
|
||||
retry_count += 1
|
||||
continue
|
||||
|
||||
if remote_size < last_remote_size:
|
||||
last_remote_size = 0
|
||||
|
||||
if remote_size > last_remote_size:
|
||||
delta = remote_size - last_remote_size
|
||||
if delta > self.max_bytes_per_pull:
|
||||
start_pos = remote_size - self.max_bytes_per_pull + 1
|
||||
last_remote_size = remote_size - self.max_bytes_per_pull
|
||||
else:
|
||||
start_pos = last_remote_size + 1
|
||||
|
||||
out2, err2 = ssh_client.execute_command(f"tail -c +{start_pos} {target} 2>/dev/null")
|
||||
if err2:
|
||||
out2, err2 = ssh_client.execute_command(f"dd if={target} bs=1 skip={max(0, start_pos - 1)} 2>/dev/null")
|
||||
if not err2 and out2 and out2.strip():
|
||||
self._save_log_chunk(node_name, log_type, out2)
|
||||
print(f"Collected new logs from {node_name}_{log_type} bytes={len(out2)}")
|
||||
|
||||
last_remote_size = remote_size
|
||||
|
||||
# Reset retry count on successful collection
|
||||
retry_count = 0
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error collecting logs from {node_name}_{log_type}: {e}")
|
||||
retry_count += 1
|
||||
|
||||
if retry_count > max_retries:
|
||||
print(f"Max retries reached for {node_name}_{log_type}, stopping collection")
|
||||
self.stop_collection(node_name, log_type)
|
||||
break
|
||||
|
||||
print(f"Retrying in {self.collection_interval * 2} seconds... ({retry_count}/{max_retries})")
|
||||
|
||||
try:
|
||||
loop = self._loops.pop(collector_id, None)
|
||||
engine = self._engines.pop(collector_id, None)
|
||||
self._session_locals.pop(collector_id, None)
|
||||
if engine and loop:
|
||||
loop.run_until_complete(engine.dispose())
|
||||
if loop and loop.is_running():
|
||||
loop.stop()
|
||||
if loop:
|
||||
loop.close()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
def _save_log_chunk(self, node_name: str, log_type: str, content: str):
|
||||
"""Save a chunk of log content to database"""
|
||||
# Split content into lines
|
||||
lines = content.splitlines()
|
||||
|
||||
# Parse each line and save to database
|
||||
log_batch: List[Dict] = []
|
||||
for line in lines:
|
||||
if line.strip():
|
||||
log_data = self._parse_log_line(line, node_name, log_type)
|
||||
log_batch.append(log_data)
|
||||
if not log_batch:
|
||||
return
|
||||
collector_id = f"{node_name}_{log_type}"
|
||||
loop = self._loops.get(collector_id)
|
||||
if loop:
|
||||
loop.run_until_complete(self._save_logs_to_db_batch(log_batch, collector_id=collector_id))
|
||||
else:
|
||||
asyncio.run(self._save_logs_to_db_batch(log_batch))
|
||||
|
||||
def get_collectors_status(self) -> Dict[str, bool]:
|
||||
"""Get the status of all collectors"""
|
||||
status = {}
|
||||
for collector_id, thread in self.collectors.items():
|
||||
status[collector_id] = thread.is_alive()
|
||||
return status
|
||||
|
||||
def set_collection_interval(self, interval: int):
|
||||
"""Set the collection interval"""
|
||||
self.collection_interval = max(1, interval) # Ensure interval is at least 1 second
|
||||
for k in list(self._intervals.keys()):
|
||||
self._intervals[k] = self.collection_interval
|
||||
print(f"Set collection interval to {self.collection_interval} seconds")
|
||||
|
||||
def set_log_dir(self, log_dir: str):
|
||||
"""Set the log directory (deprecated, logs are now stored in database)"""
|
||||
print(f"Warning: set_log_dir is deprecated. Logs are now stored in the database, not in local directory: {log_dir}")
|
||||
|
||||
# Create a global log collector instance
|
||||
log_collector = LogCollector()
|
||||
@ -1,202 +0,0 @@
|
||||
from typing import List, Dict, Optional
|
||||
from .config import LOG_DIR
|
||||
from .ssh_utils import ssh_manager
|
||||
|
||||
class LogReader:
|
||||
"""Log Reader for Hadoop cluster nodes"""
|
||||
|
||||
def __init__(self):
|
||||
self.log_dir = LOG_DIR
|
||||
self._node_log_dir: Dict[str, str] = {}
|
||||
self._candidates = [
|
||||
"/usr/local/hadoop/logs",
|
||||
"/opt/hadoop/logs",
|
||||
"/usr/local/hadoop-3.3.6/logs",
|
||||
"/usr/local/hadoop-3.3.5/logs",
|
||||
"/usr/local/hadoop-3.1.3/logs",
|
||||
"/opt/module/hadoop-3.1.3/logs",
|
||||
"/var/log/hadoop",
|
||||
]
|
||||
|
||||
def get_log_file_path(self, node_name: str, log_type: str) -> str:
|
||||
"""Generate log file path based on node name and log type"""
|
||||
# Map log type to actual log file name
|
||||
log_file_map = {
|
||||
"namenode": "hadoop-hadoop-namenode",
|
||||
"datanode": "hadoop-hadoop-datanode",
|
||||
"resourcemanager": "hadoop-hadoop-resourcemanager",
|
||||
"nodemanager": "hadoop-hadoop-nodemanager",
|
||||
"historyserver": "hadoop-hadoop-historyserver"
|
||||
}
|
||||
|
||||
# Get the base log file name
|
||||
base_name = log_file_map.get(log_type.lower(), log_type.lower())
|
||||
# Generate full log file path
|
||||
return f"{self.log_dir}/{base_name}-{node_name.replace('_', '')}.log"
|
||||
|
||||
def read_log(self, node_name: str, log_type: str, ip: str) -> str:
|
||||
"""Read log from a specific node"""
|
||||
# Ensure working log dir
|
||||
self.find_working_log_dir(node_name, ip)
|
||||
paths = self.get_log_file_paths(node_name, log_type)
|
||||
|
||||
# Get SSH connection
|
||||
ssh_client = ssh_manager.get_connection(node_name, ip=ip)
|
||||
|
||||
# Read log file content
|
||||
# try direct candidates
|
||||
for p in paths:
|
||||
out, err = ssh_client.execute_command(f"ls -la {p} 2>/dev/null")
|
||||
if not err and out.strip():
|
||||
out, err = ssh_client.execute_command(f"cat {p} 2>/dev/null")
|
||||
if not err:
|
||||
return out
|
||||
# resolve by directory listing
|
||||
base_dir = self._node_log_dir.get(node_name, self.log_dir)
|
||||
out, err = ssh_client.execute_command(f"ls -la {base_dir} 2>/dev/null")
|
||||
if not err and out.strip():
|
||||
for line in out.splitlines():
|
||||
parts = line.split()
|
||||
if parts:
|
||||
fn = parts[-1]
|
||||
lf = fn.lower()
|
||||
if log_type in lf and node_name in lf and (lf.endswith(".log") or lf.endswith(".out") or lf.endswith(".out.1")):
|
||||
out2, err2 = ssh_client.execute_command(f"cat {base_dir}/{fn} 2>/dev/null")
|
||||
if not err2:
|
||||
return out2
|
||||
raise FileNotFoundError("No such file")
|
||||
|
||||
def read_all_nodes_log(self, nodes: List[Dict[str, str]], log_type: str) -> Dict[str, str]:
|
||||
"""Read log from all nodes"""
|
||||
logs = {}
|
||||
|
||||
for node in nodes:
|
||||
node_name = node['name']
|
||||
ip = node.get('ip')
|
||||
if not ip:
|
||||
logs[node_name] = "Error: IP address not found"
|
||||
continue
|
||||
|
||||
try:
|
||||
logs[node_name] = self.read_log(node_name, log_type, ip)
|
||||
except Exception as e:
|
||||
logs[node_name] = f"Error reading log: {str(e)}"
|
||||
|
||||
return logs
|
||||
|
||||
def filter_log_by_date(self, log_content: str, start_date: str, end_date: str) -> str:
|
||||
"""Filter log content by date range"""
|
||||
filtered_lines = []
|
||||
for line in log_content.splitlines():
|
||||
# Check if line contains date in the format [YYYY-MM-DD HH:MM:SS,mmm]
|
||||
if line.startswith('['):
|
||||
# Extract date part
|
||||
date_str = line[1:11] # Get YYYY-MM-DD part
|
||||
if start_date <= date_str <= end_date:
|
||||
filtered_lines.append(line)
|
||||
return '\n'.join(filtered_lines)
|
||||
|
||||
def get_log_files_list(self, node_name: str, ip: Optional[str] = None) -> List[str]:
|
||||
"""Get list of log files on a specific node"""
|
||||
# Ensure working log dir
|
||||
if ip:
|
||||
self.find_working_log_dir(node_name, ip)
|
||||
ssh_client = ssh_manager.get_connection(node_name, ip=ip)
|
||||
|
||||
# Execute command to list log files from available directories
|
||||
dirs = [self._node_log_dir.get(node_name, self.log_dir)] + self._candidates
|
||||
stdout = ""
|
||||
for d in dirs:
|
||||
out, err = ssh_client.execute_command(f"ls -1 {d} 2>/dev/null")
|
||||
if not err and out.strip():
|
||||
stdout = out
|
||||
self._node_log_dir[node_name] = d
|
||||
break
|
||||
stderr = ""
|
||||
|
||||
# Parse log files from output
|
||||
log_files = []
|
||||
if not stderr and stdout.strip():
|
||||
for line in stdout.splitlines():
|
||||
name = line.strip()
|
||||
if name.endswith(".log") or name.endswith(".out") or name.endswith(".out.1"):
|
||||
log_files.append(name)
|
||||
|
||||
return log_files
|
||||
|
||||
def check_log_file_exists(self, node_name: str, log_type: str, ip: Optional[str] = None) -> bool:
|
||||
"""Check if log file exists on a specific node"""
|
||||
# Ensure working log dir
|
||||
if ip:
|
||||
self.find_working_log_dir(node_name, ip)
|
||||
paths = self.get_log_file_paths(node_name, log_type)
|
||||
|
||||
# Get SSH connection
|
||||
ssh_client = ssh_manager.get_connection(node_name, ip=ip)
|
||||
|
||||
try:
|
||||
# Execute command to check if file exists
|
||||
for p in paths:
|
||||
stdout, stderr = ssh_client.execute_command(f"ls -la {p} 2>/dev/null")
|
||||
if not stderr and stdout.strip():
|
||||
return True
|
||||
base_dir = self._node_log_dir.get(node_name, self.log_dir)
|
||||
stdout, stderr = ssh_client.execute_command(f"ls -la {base_dir} 2>/dev/null")
|
||||
if not stderr and stdout.strip():
|
||||
for line in stdout.splitlines():
|
||||
parts = line.split()
|
||||
if parts:
|
||||
fn = parts[-1].lower()
|
||||
if log_type in fn and node_name in fn and (fn.endswith(".log") or fn.endswith(".out") or fn.endswith(".out.1")):
|
||||
return True
|
||||
return False
|
||||
except Exception as e:
|
||||
print(f"Error checking log file existence: {e}")
|
||||
return False
|
||||
|
||||
def get_node_services(self, node_name: str) -> List[str]:
|
||||
"""Get list of running services on a node based on log files"""
|
||||
# Get all log files
|
||||
log_files = self.get_log_files_list(node_name)
|
||||
|
||||
# Extract service types from log file names
|
||||
services = []
|
||||
for log_file in log_files:
|
||||
if "namenode" in log_file:
|
||||
services.append("namenode")
|
||||
elif "datanode" in log_file:
|
||||
services.append("datanode")
|
||||
elif "resourcemanager" in log_file:
|
||||
services.append("resourcemanager")
|
||||
elif "nodemanager" in log_file:
|
||||
services.append("nodemanager")
|
||||
elif "secondarynamenode" in log_file:
|
||||
services.append("secondarynamenode")
|
||||
|
||||
# Remove duplicates
|
||||
return list(set(services))
|
||||
|
||||
def find_working_log_dir(self, node_name: str, ip: str) -> str:
|
||||
"""Detect a working log directory on remote node and set it"""
|
||||
ssh_client = ssh_manager.get_connection(node_name, ip=ip)
|
||||
# try current
|
||||
current = self._node_log_dir.get(node_name, self.log_dir)
|
||||
stdout, stderr = ssh_client.execute_command(f"ls -la {current}")
|
||||
if not stderr and stdout.strip():
|
||||
self._node_log_dir[node_name] = current
|
||||
return current
|
||||
for d in [current] + self._candidates:
|
||||
stdout, stderr = ssh_client.execute_command(f"ls -la {d} 2>/dev/null")
|
||||
if not stderr and stdout.strip():
|
||||
self._node_log_dir[node_name] = d
|
||||
return d
|
||||
self._node_log_dir[node_name] = self.log_dir
|
||||
return self._node_log_dir[node_name]
|
||||
|
||||
def get_log_file_paths(self, node_name: str, log_type: str) -> List[str]:
|
||||
base_dir = self._node_log_dir.get(node_name, self.log_dir)
|
||||
base = f"{base_dir}/hadoop-hadoop-{log_type}-{node_name}"
|
||||
return [f"{base}.log", f"{base}.out", f"{base}.out.1"]
|
||||
|
||||
# Create a global LogReader instance
|
||||
log_reader = LogReader()
|
||||
@ -1,51 +0,0 @@
|
||||
from fastapi import FastAPI, Request, status
|
||||
from fastapi.responses import JSONResponse
|
||||
from fastapi.exceptions import RequestValidationError
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from .routers import auth, health, secure, users, clusters, nodes, metrics, faults, ops, ai, hadoop_logs, sys_exec_logs, hadoop_exec_logs
|
||||
import os
|
||||
|
||||
app = FastAPI(title="Hadoop Fault Detecting API", version="v1")
|
||||
|
||||
@app.exception_handler(RequestValidationError)
|
||||
async def validation_exception_handler(request: Request, exc: RequestValidationError):
|
||||
"""
|
||||
将 Pydantic 校验错误转换为前端更易解析的格式
|
||||
"""
|
||||
errors = []
|
||||
for error in exc.errors():
|
||||
field = error.get("loc")[-1] if error.get("loc") else "unknown"
|
||||
msg = error.get("msg")
|
||||
errors.append({
|
||||
"field": field,
|
||||
"message": f"{field}: {msg}",
|
||||
"code": error.get("type")
|
||||
})
|
||||
|
||||
return JSONResponse(
|
||||
status_code=status.HTTP_400_BAD_REQUEST,
|
||||
content={"detail": {"errors": errors, "message": "请求参数校验失败"}}
|
||||
)
|
||||
|
||||
app.add_middleware(
|
||||
CORSMiddleware,
|
||||
allow_origins=["*"],
|
||||
allow_credentials=False,
|
||||
allow_methods=["*"],
|
||||
allow_headers=["*"],
|
||||
)
|
||||
|
||||
app.include_router(health.router, prefix="/api/v1")
|
||||
app.include_router(auth.router, prefix="/api/v1")
|
||||
app.include_router(secure.router, prefix="/api/v1")
|
||||
app.include_router(clusters.router, prefix="/api/v1")
|
||||
app.include_router(nodes.router, prefix="/api/v1")
|
||||
app.include_router(metrics.router, prefix="/api/v1")
|
||||
app.include_router(users.router, prefix="/api/v1")
|
||||
app.include_router(hadoop_logs.router, prefix="/api/v1")
|
||||
app.include_router(faults.router, prefix="/api/v1")
|
||||
app.include_router(hadoop_exec_logs.router, prefix="/api/v1")
|
||||
app.include_router(ops.router, prefix="/api/v1")
|
||||
app.include_router(ai.router, prefix="/api/v1")
|
||||
app.include_router(sys_exec_logs.router, prefix="/api/v1")
|
||||
|
||||
@ -1,121 +0,0 @@
|
||||
import threading
|
||||
import time
|
||||
import datetime
|
||||
import time as _time
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
from sqlalchemy import text
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
from .ssh_utils import ssh_manager
|
||||
from .db import SessionLocal
|
||||
from .models.nodes import Node
|
||||
import asyncio
|
||||
from .config import BJ_TZ
|
||||
|
||||
class MetricsCollector:
|
||||
def __init__(self):
|
||||
self.collectors: Dict[str, threading.Thread] = {}
|
||||
self.collection_interval: int = 5
|
||||
self.last_errors: Dict[str, str] = {}
|
||||
self._columns_cache: Dict[str, set] = {}
|
||||
self._cluster_avg_inited: bool = False
|
||||
|
||||
def set_collection_interval(self, interval: int):
|
||||
self.collection_interval = max(1, interval)
|
||||
|
||||
def get_collectors_status(self) -> Dict[str, bool]:
|
||||
status = {}
|
||||
for cid, t in self.collectors.items():
|
||||
status[cid] = t.is_alive()
|
||||
return status
|
||||
|
||||
def get_errors(self) -> Dict[str, str]:
|
||||
return dict(self.last_errors)
|
||||
|
||||
def stop_all(self):
|
||||
for cid in list(self.collectors.keys()):
|
||||
self.stop(cid)
|
||||
|
||||
def stop(self, collector_id: str):
|
||||
if collector_id in self.collectors:
|
||||
del self.collectors[collector_id]
|
||||
if collector_id in self.last_errors:
|
||||
del self.last_errors[collector_id]
|
||||
|
||||
def start_for_nodes(self, nodes: List[Tuple[int, str, str, int]], interval: Optional[int] = None) -> Tuple[int, List[str]]:
|
||||
if interval:
|
||||
self.set_collection_interval(interval)
|
||||
started: List[str] = []
|
||||
for nid, hn, ip, cid in nodes:
|
||||
cid_str = f"{hn}"
|
||||
if cid_str in self.collectors and self.collectors[cid_str].is_alive():
|
||||
continue
|
||||
t = threading.Thread(target=self._collect_node_metrics, args=(nid, hn, ip, cid), name=f"metrics_{hn}", daemon=True)
|
||||
self.collectors[cid_str] = t
|
||||
t.start()
|
||||
started.append(hn)
|
||||
return len(started), started
|
||||
|
||||
def _read_cpu_mem(self, node_name: str, ip: str) -> Tuple[float, float]:
|
||||
ssh_client = ssh_manager.get_connection(node_name, ip=ip)
|
||||
out1, err1 = ssh_client.execute_command("cat /proc/stat | head -n 1")
|
||||
_time.sleep(0.5)
|
||||
out2, err2 = ssh_client.execute_command("cat /proc/stat | head -n 1")
|
||||
cpu_pct = 0.0
|
||||
if not err1 and not err2 and out1.strip() and out2.strip():
|
||||
p1 = out1.strip().split()
|
||||
p2 = out2.strip().split()
|
||||
v1 = [int(x) for x in p1[1:]]
|
||||
v2 = [int(x) for x in p2[1:]]
|
||||
get1 = lambda i: (v1[i] if i < len(v1) else 0)
|
||||
get2 = lambda i: (v2[i] if i < len(v2) else 0)
|
||||
idle = (get2(3) + get2(4)) - (get1(3) + get1(4))
|
||||
total = (get2(0) - get1(0)) + (get2(1) - get1(1)) + (get2(2) - get1(2)) + idle + (get2(5) - get1(5)) + (get2(6) - get1(6)) + (get2(7) - get1(7))
|
||||
if total > 0:
|
||||
cpu_pct = round((1.0 - idle / total) * 100.0, 2)
|
||||
outm, errm = ssh_client.execute_command("cat /proc/meminfo")
|
||||
mem_pct = 0.0
|
||||
if not errm and outm.strip():
|
||||
mt = 0
|
||||
ma = 0
|
||||
for line in outm.splitlines():
|
||||
if line.startswith("MemTotal:"):
|
||||
mt = int(line.split()[1])
|
||||
elif line.startswith("MemAvailable:"):
|
||||
ma = int(line.split()[1])
|
||||
if mt > 0:
|
||||
mem_pct = round((1.0 - (ma / mt)) * 100.0, 2)
|
||||
return cpu_pct, mem_pct
|
||||
|
||||
async def _save_metrics(self, node_id: int, hostname: str, cluster_id: int, cpu: float, mem: float):
|
||||
# 这里的 SessionLocal 绑定的 engine 可能在主线程 loop 中初始化
|
||||
# 在 asyncio.run() 开启的新 loop 中使用它会报 Loop 冲突
|
||||
from .db import engine
|
||||
async with AsyncSession(engine) as session:
|
||||
now = datetime.datetime.now(BJ_TZ)
|
||||
await session.execute(text("UPDATE nodes SET cpu_usage=:cpu, memory_usage=:mem, last_heartbeat=:hb WHERE id=:nid"), {"cpu": cpu, "mem": mem, "hb": now, "nid": node_id})
|
||||
await session.commit()
|
||||
|
||||
def _collect_node_metrics(self, node_id: int, hostname: str, ip: str, cluster_id: int):
|
||||
cid = hostname
|
||||
while cid in self.collectors:
|
||||
try:
|
||||
cpu, mem = self._read_cpu_mem(hostname, ip)
|
||||
asyncio.run(self._save_metrics(node_id, hostname, cluster_id, cpu, mem))
|
||||
except Exception as e:
|
||||
self.last_errors[cid] = str(e)
|
||||
time.sleep(self.collection_interval)
|
||||
|
||||
async def _get_table_columns(self, session: AsyncSession, table_name: str) -> set:
|
||||
if table_name in self._columns_cache:
|
||||
return self._columns_cache[table_name]
|
||||
res = await session.execute(text("""
|
||||
SELECT column_name
|
||||
FROM information_schema.columns
|
||||
WHERE table_name = :t
|
||||
"""), {"t": table_name})
|
||||
cols = set(r[0] for r in res.all())
|
||||
self._columns_cache[table_name] = cols
|
||||
return cols
|
||||
|
||||
|
||||
metrics_collector = MetricsCollector()
|
||||
@ -1,4 +0,0 @@
|
||||
from sqlalchemy.orm import DeclarativeBase
|
||||
|
||||
class Base(DeclarativeBase):
|
||||
pass
|
||||
@ -1,30 +0,0 @@
|
||||
from sqlalchemy import Column, Integer, String, Text, DateTime, ForeignKey, Boolean
|
||||
from sqlalchemy.orm import relationship
|
||||
from datetime import datetime
|
||||
from ..config import BJ_TZ
|
||||
from . import Base
|
||||
|
||||
class ChatSession(Base):
|
||||
__tablename__ = "chat_sessions"
|
||||
|
||||
id = Column(String, primary_key=True, index=True) # UUID
|
||||
user_id = Column(Integer, nullable=True, index=True) # Can be linked to a user
|
||||
title = Column(String, nullable=True)
|
||||
created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(BJ_TZ))
|
||||
updated_at = Column(DateTime(timezone=True), default=lambda: datetime.now(BJ_TZ), onupdate=lambda: datetime.now(BJ_TZ))
|
||||
|
||||
messages = relationship("ChatMessage", back_populates="session", cascade="all, delete-orphan", lazy="selectin")
|
||||
|
||||
class ChatMessage(Base):
|
||||
__tablename__ = "chat_messages"
|
||||
|
||||
id = Column(Integer, primary_key=True, index=True)
|
||||
session_id = Column(String, ForeignKey("chat_sessions.id"), nullable=False)
|
||||
role = Column(String, nullable=False) # system, user, assistant, tool
|
||||
content = Column(Text, nullable=False)
|
||||
created_at = Column(DateTime(timezone=True), default=lambda: datetime.now(BJ_TZ))
|
||||
|
||||
# Optional: store tool calls or extra metadata if needed
|
||||
# For now, we store JSON in content if it's complex, or just text.
|
||||
|
||||
session = relationship("ChatSession", back_populates="messages")
|
||||
@ -1,13 +0,0 @@
|
||||
from sqlalchemy.orm import Mapped, mapped_column
|
||||
from sqlalchemy import String, Integer, Float, TIMESTAMP
|
||||
from . import Base
|
||||
|
||||
class ClusterMetric(Base):
|
||||
__tablename__ = "cluster_metrics"
|
||||
|
||||
id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True)
|
||||
cluster_id: Mapped[int] = mapped_column()
|
||||
cluster_name: Mapped[str] = mapped_column(String(100))
|
||||
cpu_avg: Mapped[float] = mapped_column(Float)
|
||||
memory_avg: Mapped[float] = mapped_column(Float)
|
||||
created_at: Mapped[str] = mapped_column(TIMESTAMP(timezone=True))
|
||||
@ -1,45 +0,0 @@
|
||||
from sqlalchemy.orm import Mapped, mapped_column
|
||||
from sqlalchemy import String, Integer, Float, TIMESTAMP
|
||||
from sqlalchemy.dialects.postgresql import UUID, JSONB, INET
|
||||
from . import Base
|
||||
|
||||
class Cluster(Base):
|
||||
__tablename__ = "clusters"
|
||||
|
||||
id: Mapped[int] = mapped_column(primary_key=True)
|
||||
uuid: Mapped[str] = mapped_column(UUID(as_uuid=False), unique=True)
|
||||
name: Mapped[str] = mapped_column(String(100), unique=True)
|
||||
type: Mapped[str] = mapped_column(String(50))
|
||||
node_count: Mapped[int] = mapped_column(Integer, default=0)
|
||||
health_status: Mapped[str] = mapped_column(String(20), default="unknown")
|
||||
cpu_avg: Mapped[float | None] = mapped_column(Float, nullable=True)
|
||||
memory_avg: Mapped[float | None] = mapped_column(Float, nullable=True)
|
||||
namenode_ip: Mapped[str | None] = mapped_column(INET, nullable=True)
|
||||
namenode_psw: Mapped[str | None] = mapped_column(String(255), nullable=True)
|
||||
rm_ip: Mapped[str | None] = mapped_column(INET, nullable=True)
|
||||
rm_psw: Mapped[str | None] = mapped_column(String(255), nullable=True)
|
||||
description: Mapped[str | None] = mapped_column(String, nullable=True)
|
||||
config_info: Mapped[dict | None] = mapped_column(JSONB, nullable=True)
|
||||
created_at: Mapped[str] = mapped_column(TIMESTAMP(timezone=True))
|
||||
updated_at: Mapped[str] = mapped_column(TIMESTAMP(timezone=True))
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
"""将集群对象转换为可序列化字典。"""
|
||||
return {
|
||||
"id": self.id,
|
||||
"uuid": self.uuid,
|
||||
"name": self.name,
|
||||
"type": self.type,
|
||||
"node_count": self.node_count,
|
||||
"health_status": self.health_status,
|
||||
"cpu_avg": self.cpu_avg,
|
||||
"memory_avg": self.memory_avg,
|
||||
"namenode_ip": (str(self.namenode_ip) if self.namenode_ip else None),
|
||||
"namenode_psw": self.namenode_psw,
|
||||
"rm_ip": (str(self.rm_ip) if self.rm_ip else None),
|
||||
"rm_psw": self.rm_psw,
|
||||
"description": self.description,
|
||||
"config_info": self.config_info,
|
||||
"created_at": self.created_at.isoformat() if self.created_at else None,
|
||||
"updated_at": self.updated_at.isoformat() if self.updated_at else None,
|
||||
}
|
||||
@ -1,38 +0,0 @@
|
||||
from sqlalchemy.orm import Mapped, mapped_column
|
||||
from sqlalchemy import String
|
||||
from sqlalchemy.dialects.postgresql import JSONB
|
||||
from sqlalchemy import TIMESTAMP
|
||||
from . import Base
|
||||
|
||||
class FaultRecord(Base):
|
||||
__tablename__ = "fault_records"
|
||||
|
||||
id: Mapped[int] = mapped_column(primary_key=True)
|
||||
fault_id: Mapped[str] = mapped_column(String(32), unique=True)
|
||||
cluster_id: Mapped[int | None] = mapped_column(nullable=True)
|
||||
fault_type: Mapped[str] = mapped_column(String(50))
|
||||
fault_level: Mapped[str] = mapped_column(String(20), default="medium")
|
||||
title: Mapped[str] = mapped_column(String(200))
|
||||
description: Mapped[str | None] = mapped_column(String, nullable=True)
|
||||
affected_nodes: Mapped[dict | None] = mapped_column(JSONB, nullable=True)
|
||||
affected_clusters: Mapped[dict | None] = mapped_column(JSONB, nullable=True)
|
||||
root_cause: Mapped[str | None] = mapped_column(String, nullable=True)
|
||||
repair_suggestion: Mapped[str | None] = mapped_column(String, nullable=True)
|
||||
status: Mapped[str] = mapped_column(String(20), default="detected")
|
||||
assignee: Mapped[str | None] = mapped_column(String(50), nullable=True)
|
||||
reporter: Mapped[str] = mapped_column(String(50), default="system")
|
||||
created_at: Mapped[str] = mapped_column(TIMESTAMP(timezone=True))
|
||||
updated_at: Mapped[str] = mapped_column(TIMESTAMP(timezone=True))
|
||||
resolved_at: Mapped[str | None] = mapped_column(TIMESTAMP(timezone=True), nullable=True)
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
"""将故障记录转换为可序列化字典。"""
|
||||
return {
|
||||
"fault_id": self.fault_id,
|
||||
"cluster_id": self.cluster_id,
|
||||
"fault_type": self.fault_type,
|
||||
"fault_level": self.fault_level,
|
||||
"title": self.title,
|
||||
"status": self.status,
|
||||
"created_at": self.created_at.isoformat() if self.created_at else None,
|
||||
}
|
||||
@ -1,23 +0,0 @@
|
||||
from sqlalchemy.orm import Mapped, mapped_column
|
||||
from sqlalchemy import String, Integer, Text, TIMESTAMP, ForeignKey
|
||||
from . import Base
|
||||
|
||||
class HadoopExecLog(Base):
|
||||
__tablename__ = "hadoop_exec_logs"
|
||||
|
||||
id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True)
|
||||
from_user_id: Mapped[int] = mapped_column(Integer, nullable=False)
|
||||
cluster_name: Mapped[str] = mapped_column(String(255), nullable=False)
|
||||
description: Mapped[str | None] = mapped_column(Text, nullable=True)
|
||||
start_time: Mapped[str | None] = mapped_column(TIMESTAMP(timezone=True), nullable=True)
|
||||
end_time: Mapped[str | None] = mapped_column(TIMESTAMP(timezone=True), nullable=True)
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
return {
|
||||
"id": self.id,
|
||||
"from_user_id": self.from_user_id,
|
||||
"cluster_name": self.cluster_name,
|
||||
"description": self.description,
|
||||
"start_time": self.start_time.isoformat() if self.start_time else None,
|
||||
"end_time": self.end_time.isoformat() if self.end_time else None,
|
||||
}
|
||||
@ -1,23 +0,0 @@
|
||||
from sqlalchemy.orm import Mapped, mapped_column
|
||||
from sqlalchemy import String, Integer, Text, TIMESTAMP
|
||||
from . import Base
|
||||
|
||||
class HadoopLog(Base):
|
||||
__tablename__ = "hadoop_logs"
|
||||
|
||||
log_id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True)
|
||||
cluster_name: Mapped[str] = mapped_column(String(255), nullable=False)
|
||||
node_host: Mapped[str] = mapped_column(String(100), nullable=False)
|
||||
title: Mapped[str | None] = mapped_column(String(255), nullable=True)
|
||||
info: Mapped[str | None] = mapped_column(Text, nullable=True)
|
||||
log_time: Mapped[str] = mapped_column(TIMESTAMP(timezone=True), nullable=False)
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
return {
|
||||
"log_id": self.log_id,
|
||||
"cluster_name": self.cluster_name,
|
||||
"node_host": self.node_host,
|
||||
"title": self.title,
|
||||
"info": self.info,
|
||||
"log_time": self.log_time.isoformat() if self.log_time else None,
|
||||
}
|
||||
@ -1,14 +0,0 @@
|
||||
from sqlalchemy.orm import Mapped, mapped_column
|
||||
from sqlalchemy import String, Integer, Float, TIMESTAMP
|
||||
from . import Base
|
||||
|
||||
class NodeMetric(Base):
|
||||
__tablename__ = "node_metrics"
|
||||
|
||||
id: Mapped[int] = mapped_column(primary_key=True, autoincrement=True)
|
||||
cluster_id: Mapped[int] = mapped_column()
|
||||
node_id: Mapped[int] = mapped_column()
|
||||
hostname: Mapped[str] = mapped_column(String(100))
|
||||
cpu_usage: Mapped[float] = mapped_column(Float)
|
||||
memory_usage: Mapped[float] = mapped_column(Float)
|
||||
created_at: Mapped[str] = mapped_column(TIMESTAMP(timezone=True))
|
||||
@ -1,24 +0,0 @@
|
||||
from sqlalchemy.orm import Mapped, mapped_column
|
||||
from sqlalchemy import String
|
||||
from sqlalchemy.dialects.postgresql import UUID, INET
|
||||
from sqlalchemy import TIMESTAMP, Float
|
||||
from . import Base
|
||||
|
||||
class Node(Base):
|
||||
__tablename__ = "nodes"
|
||||
|
||||
id: Mapped[int] = mapped_column(primary_key=True)
|
||||
uuid: Mapped[str] = mapped_column(UUID(as_uuid=False), unique=True)
|
||||
cluster_id: Mapped[int] = mapped_column()
|
||||
hostname: Mapped[str] = mapped_column(String(100))
|
||||
ip_address: Mapped[str] = mapped_column(INET)
|
||||
ssh_user: Mapped[str | None] = mapped_column(String(50), nullable=True)
|
||||
ssh_password: Mapped[str | None] = mapped_column(String(255), nullable=True)
|
||||
# description: Mapped[str | None] = mapped_column(String, nullable=True)
|
||||
status: Mapped[str] = mapped_column(String(20), default="unknown")
|
||||
cpu_usage: Mapped[float | None] = mapped_column(Float, nullable=True)
|
||||
memory_usage: Mapped[float | None] = mapped_column(Float, nullable=True)
|
||||
disk_usage: Mapped[float | None] = mapped_column(Float, nullable=True)
|
||||
last_heartbeat: Mapped[str | None] = mapped_column(TIMESTAMP(timezone=True), nullable=True)
|
||||
created_at: Mapped[str] = mapped_column(TIMESTAMP(timezone=True))
|
||||
updated_at: Mapped[str] = mapped_column(TIMESTAMP(timezone=True))
|
||||
@ -1,20 +0,0 @@
|
||||
from sqlalchemy.orm import Mapped, mapped_column
|
||||
from sqlalchemy import Integer, Text, TIMESTAMP, ForeignKey, text
|
||||
from sqlalchemy.dialects.postgresql import UUID
|
||||
from . import Base
|
||||
|
||||
class SysExecLog(Base):
|
||||
__tablename__ = "sys_exec_logs"
|
||||
|
||||
operation_id: Mapped[str] = mapped_column(UUID(as_uuid=True), primary_key=True, server_default=text("uuid_generate_v4()"))
|
||||
user_id: Mapped[int] = mapped_column(Integer, ForeignKey("users.id"), nullable=False)
|
||||
description: Mapped[str] = mapped_column(Text, nullable=False)
|
||||
operation_time: Mapped[str] = mapped_column(TIMESTAMP(timezone=True), nullable=False, server_default=text("now()"))
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
return {
|
||||
"operation_id": str(self.operation_id),
|
||||
"user_id": self.user_id,
|
||||
"description": self.description,
|
||||
"operation_time": self.operation_time.isoformat() if self.operation_time else None,
|
||||
}
|
||||
@ -1,18 +0,0 @@
|
||||
from sqlalchemy.orm import Mapped, mapped_column
|
||||
from sqlalchemy import String, Boolean
|
||||
from sqlalchemy import TIMESTAMP
|
||||
from . import Base
|
||||
|
||||
class User(Base):
|
||||
__tablename__ = "users"
|
||||
|
||||
id: Mapped[int] = mapped_column(primary_key=True)
|
||||
username: Mapped[str] = mapped_column(String(50), unique=True)
|
||||
email: Mapped[str] = mapped_column(String(100), unique=True)
|
||||
password_hash: Mapped[str] = mapped_column(String(255))
|
||||
full_name: Mapped[str] = mapped_column(String(100))
|
||||
is_active: Mapped[bool] = mapped_column(Boolean, default=True)
|
||||
sort: Mapped[int] = mapped_column(default=0)
|
||||
last_login: Mapped[str | None] = mapped_column(TIMESTAMP(timezone=True), nullable=True)
|
||||
created_at: Mapped[str] = mapped_column(TIMESTAMP(timezone=True))
|
||||
updated_at: Mapped[str] = mapped_column(TIMESTAMP(timezone=True))
|
||||
@ -1 +0,0 @@
|
||||
|
||||
@ -1,206 +0,0 @@
|
||||
from fastapi import APIRouter, Depends, Query, HTTPException
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
from sqlalchemy import select, func, delete, update
|
||||
from ..db import get_db
|
||||
from ..models.hadoop_logs import HadoopLog
|
||||
from ..models.clusters import Cluster
|
||||
from ..deps.auth import get_current_user
|
||||
from pydantic import BaseModel
|
||||
from datetime import datetime
|
||||
import json
|
||||
from ..config import now_bj
|
||||
from ..config import BJ_TZ
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
def _get_username(u) -> str:
|
||||
return getattr(u, "username", None) or (u.get("username") if isinstance(u, dict) else None)
|
||||
|
||||
|
||||
def _now():
|
||||
return now_bj()
|
||||
|
||||
|
||||
def _map_level(level: str) -> str:
|
||||
lv = (level or "").lower()
|
||||
if lv in ("critical", "fatal"):
|
||||
return "FATAL"
|
||||
if lv == "high":
|
||||
return "ERROR"
|
||||
if lv == "medium":
|
||||
return "WARN"
|
||||
return "INFO"
|
||||
|
||||
|
||||
class FaultCreate(BaseModel):
|
||||
id: str | None = None
|
||||
type: str
|
||||
level: str
|
||||
status: str
|
||||
title: str
|
||||
cluster: str | None = None
|
||||
node: str | None = None
|
||||
created: str | None = None
|
||||
|
||||
|
||||
class FaultUpdate(BaseModel):
|
||||
status: str | None = None
|
||||
title: str | None = None
|
||||
|
||||
|
||||
@router.get("/faults")
|
||||
async def list_faults(
|
||||
user=Depends(get_current_user),
|
||||
db: AsyncSession = Depends(get_db),
|
||||
cluster: str | None = Query(None),
|
||||
node: str | None = Query(None),
|
||||
time_from: str | None = Query(None),
|
||||
page: int = Query(1, ge=1),
|
||||
size: int = Query(10, ge=1, le=100),
|
||||
):
|
||||
try:
|
||||
stmt = select(HadoopLog).where(HadoopLog.title == "fault")
|
||||
count_stmt = select(func.count(HadoopLog.log_id)).where(HadoopLog.title == "fault")
|
||||
|
||||
if cluster:
|
||||
stmt = stmt.where(HadoopLog.cluster_name == cluster)
|
||||
count_stmt = count_stmt.where(HadoopLog.cluster_name == cluster)
|
||||
if node:
|
||||
stmt = stmt.where(HadoopLog.node_host == node)
|
||||
count_stmt = count_stmt.where(HadoopLog.node_host == node)
|
||||
if time_from:
|
||||
try:
|
||||
tf = datetime.fromisoformat(time_from.replace("Z", "+00:00"))
|
||||
if tf.tzinfo is None:
|
||||
tf = tf.replace(tzinfo=BJ_TZ)
|
||||
else:
|
||||
tf = tf.astimezone(BJ_TZ)
|
||||
stmt = stmt.where(HadoopLog.log_time >= tf)
|
||||
count_stmt = count_stmt.where(HadoopLog.log_time >= tf)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
stmt = stmt.order_by(HadoopLog.log_time.desc()).offset((page - 1) * size).limit(size)
|
||||
rows = (await db.execute(stmt)).scalars().all()
|
||||
total = (await db.execute(count_stmt)).scalar() or 0
|
||||
|
||||
items = []
|
||||
for r in rows:
|
||||
meta = {}
|
||||
try:
|
||||
if r.info:
|
||||
meta = json.loads(r.info)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
items.append({
|
||||
"id": str(r.log_id),
|
||||
"type": meta.get("type", "unknown"),
|
||||
"level": r.title,
|
||||
"status": meta.get("status", "active"),
|
||||
"title": meta.get("title", r.title),
|
||||
"cluster": r.cluster_name,
|
||||
"node": r.node_host,
|
||||
"created": r.log_time.isoformat() if r.log_time else None
|
||||
})
|
||||
return {"items": items, "total": int(total)}
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
print(f"Error listing faults: {e}")
|
||||
raise HTTPException(status_code=500, detail="server_error")
|
||||
|
||||
|
||||
@router.post("/faults")
|
||||
async def create_fault(req: FaultCreate, user=Depends(get_current_user), db: AsyncSession = Depends(get_db)):
|
||||
try:
|
||||
uname = _get_username(user)
|
||||
if uname not in {"admin", "ops"}:
|
||||
raise HTTPException(status_code=403, detail="not_allowed")
|
||||
|
||||
# 确定集群名称
|
||||
cluster_name = req.cluster or "unknown"
|
||||
if req.cluster and "-" in req.cluster: # 可能是 UUID
|
||||
res = await db.execute(select(Cluster.name).where(Cluster.uuid == req.cluster).limit(1))
|
||||
name = res.scalars().first()
|
||||
if name:
|
||||
cluster_name = name
|
||||
|
||||
ts = _now()
|
||||
if req.created:
|
||||
try:
|
||||
dt = datetime.fromisoformat(req.created.replace("Z", "+00:00"))
|
||||
if dt.tzinfo is None:
|
||||
ts = dt.replace(tzinfo=BJ_TZ)
|
||||
else:
|
||||
ts = dt.astimezone(BJ_TZ)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
meta = {"type": req.type, "status": req.status, "title": req.title, "cluster": req.cluster, "node": req.node}
|
||||
log = HadoopLog(
|
||||
cluster_name=cluster_name,
|
||||
node_host=req.node or "unknown",
|
||||
title="fault",
|
||||
info=json.dumps(meta, ensure_ascii=False),
|
||||
log_time=ts
|
||||
)
|
||||
db.add(log)
|
||||
await db.commit()
|
||||
return {"ok": True, "id": log.log_id}
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
print(f"Error creating fault: {e}")
|
||||
raise HTTPException(status_code=500, detail="server_error")
|
||||
|
||||
|
||||
@router.put("/faults/{fid}")
|
||||
async def update_fault(fid: int, req: FaultUpdate, user=Depends(get_current_user), db: AsyncSession = Depends(get_db)):
|
||||
try:
|
||||
uname = _get_username(user)
|
||||
if uname not in {"admin", "ops"}:
|
||||
raise HTTPException(status_code=403, detail="not_allowed")
|
||||
|
||||
res = await db.execute(select(HadoopLog).where(HadoopLog.log_id == fid, HadoopLog.title == "fault").limit(1))
|
||||
row = res.scalars().first()
|
||||
if not row:
|
||||
raise HTTPException(status_code=404, detail="not_found")
|
||||
|
||||
meta = {}
|
||||
try:
|
||||
if row.info:
|
||||
meta = json.loads(row.info)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
if req.status is not None:
|
||||
meta["status"] = req.status
|
||||
if req.title is not None:
|
||||
meta["title"] = req.title
|
||||
|
||||
row.info = json.dumps(meta, ensure_ascii=False)
|
||||
await db.commit()
|
||||
return {"ok": True}
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
print(f"Error updating fault: {e}")
|
||||
raise HTTPException(status_code=500, detail="server_error")
|
||||
|
||||
|
||||
@router.delete("/faults/{fid}")
|
||||
async def delete_fault(fid: int, user=Depends(get_current_user), db: AsyncSession = Depends(get_db)):
|
||||
try:
|
||||
uname = _get_username(user)
|
||||
if uname not in {"admin", "ops"}:
|
||||
raise HTTPException(status_code=403, detail="not_allowed")
|
||||
await db.execute(delete(HadoopLog).where(HadoopLog.log_id == fid, HadoopLog.title == "fault"))
|
||||
await db.commit()
|
||||
return {"ok": True}
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
print(f"Error deleting fault: {e}")
|
||||
raise HTTPException(status_code=500, detail="server_error")
|
||||
@ -1,129 +0,0 @@
|
||||
from fastapi import APIRouter, Depends, HTTPException
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
from sqlalchemy import select, delete, update
|
||||
from ..db import get_db
|
||||
from ..models.hadoop_exec_logs import HadoopExecLog
|
||||
from ..models.users import User
|
||||
from ..deps.auth import get_current_user
|
||||
from pydantic import BaseModel
|
||||
from datetime import datetime, timezone
|
||||
from ..config import now_bj
|
||||
from ..config import BJ_TZ
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
class ExecLogCreate(BaseModel):
|
||||
from_user_id: int
|
||||
cluster_name: str
|
||||
description: str | None = None
|
||||
start_time: str | None = None
|
||||
end_time: str | None = None
|
||||
|
||||
|
||||
class ExecLogUpdate(BaseModel):
|
||||
description: str | None = None
|
||||
start_time: str | None = None
|
||||
end_time: str | None = None
|
||||
|
||||
|
||||
def _now() -> datetime:
|
||||
return now_bj()
|
||||
|
||||
|
||||
def _parse_time(s: str | None) -> datetime | None:
|
||||
if not s:
|
||||
return None
|
||||
try:
|
||||
dt = datetime.fromisoformat(s.replace("Z", "+00:00"))
|
||||
if dt.tzinfo is None:
|
||||
return dt.replace(tzinfo=BJ_TZ)
|
||||
return dt.astimezone(BJ_TZ)
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
@router.get("/exec-logs")
|
||||
async def list_exec_logs(user=Depends(get_current_user), db: AsyncSession = Depends(get_db)):
|
||||
try:
|
||||
stmt = (
|
||||
select(HadoopExecLog, User.username)
|
||||
.join(User, HadoopExecLog.from_user_id == User.id)
|
||||
.order_by(HadoopExecLog.start_time.desc())
|
||||
)
|
||||
result = await db.execute(stmt)
|
||||
rows = result.all()
|
||||
|
||||
items = []
|
||||
for log, username in rows:
|
||||
d = log.to_dict()
|
||||
d["username"] = username
|
||||
if "from_user_id" in d:
|
||||
del d["from_user_id"]
|
||||
items.append(d)
|
||||
|
||||
return {"items": items}
|
||||
except Exception as e:
|
||||
print(f"Error listing exec logs: {e}")
|
||||
raise HTTPException(status_code=500, detail="server_error")
|
||||
|
||||
|
||||
@router.post("/exec-logs")
|
||||
async def create_exec_log(req: ExecLogCreate, user=Depends(get_current_user), db: AsyncSession = Depends(get_db)):
|
||||
try:
|
||||
st = _parse_time(req.start_time)
|
||||
et = _parse_time(req.end_time)
|
||||
|
||||
row = HadoopExecLog(
|
||||
from_user_id=req.from_user_id,
|
||||
cluster_name=req.cluster_name,
|
||||
description=req.description,
|
||||
start_time=st,
|
||||
end_time=et
|
||||
)
|
||||
db.add(row)
|
||||
await db.flush()
|
||||
await db.commit()
|
||||
return {"ok": True, "id": row.id}
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
print(f"Error creating exec log: {e}")
|
||||
raise HTTPException(status_code=500, detail="server_error")
|
||||
|
||||
|
||||
@router.put("/exec-logs/{log_id}")
|
||||
async def update_exec_log(log_id: int, req: ExecLogUpdate, user=Depends(get_current_user), db: AsyncSession = Depends(get_db)):
|
||||
try:
|
||||
st = _parse_time(req.start_time)
|
||||
et = _parse_time(req.end_time)
|
||||
values: dict = {}
|
||||
if req.description is not None:
|
||||
values["description"] = req.description
|
||||
if st is not None:
|
||||
values["start_time"] = st
|
||||
if et is not None:
|
||||
values["end_time"] = et
|
||||
|
||||
if not values:
|
||||
return {"ok": True}
|
||||
|
||||
await db.execute(update(HadoopExecLog).where(HadoopExecLog.id == log_id).values(**values))
|
||||
await db.commit()
|
||||
return {"ok": True}
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception:
|
||||
raise HTTPException(status_code=500, detail="server_error")
|
||||
|
||||
|
||||
@router.delete("/exec-logs/{log_id}")
|
||||
async def delete_exec_log(log_id: int, user=Depends(get_current_user), db: AsyncSession = Depends(get_db)):
|
||||
try:
|
||||
await db.execute(delete(HadoopExecLog).where(HadoopExecLog.id == log_id))
|
||||
await db.commit()
|
||||
return {"ok": True}
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception:
|
||||
raise HTTPException(status_code=500, detail="server_error")
|
||||
@ -1,459 +0,0 @@
|
||||
from fastapi import APIRouter, Depends, HTTPException, Query
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
from sqlalchemy import select, func, or_, text
|
||||
from ..db import get_db
|
||||
from ..deps.auth import get_current_user
|
||||
from ..log_reader import log_reader
|
||||
from ..log_collector import log_collector
|
||||
from ..ssh_utils import ssh_manager
|
||||
from ..models.nodes import Node
|
||||
from ..models.clusters import Cluster
|
||||
from ..metrics_collector import metrics_collector
|
||||
from ..models.hadoop_logs import HadoopLog
|
||||
from datetime import datetime, timezone
|
||||
import time
|
||||
from ..models.node_metrics import NodeMetric
|
||||
from ..models.cluster_metrics import ClusterMetric
|
||||
from datetime import timedelta
|
||||
from ..config import now_bj
|
||||
from ..config import BJ_TZ
|
||||
from zoneinfo import ZoneInfo
|
||||
from ..schemas import (
|
||||
LogRequest,
|
||||
LogResponse,
|
||||
MultiLogResponse,
|
||||
NodeListResponse,
|
||||
LogFilesResponse
|
||||
)
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
async def _ensure_metrics_schema(db: AsyncSession):
|
||||
await db.execute(text("""
|
||||
CREATE TABLE IF NOT EXISTS node_metrics (
|
||||
id SERIAL PRIMARY KEY,
|
||||
cluster_id INTEGER,
|
||||
node_id INTEGER,
|
||||
hostname VARCHAR(100),
|
||||
cpu_usage DOUBLE PRECISION,
|
||||
memory_usage DOUBLE PRECISION,
|
||||
created_at TIMESTAMPTZ
|
||||
)
|
||||
"""))
|
||||
await db.execute(text("""
|
||||
CREATE TABLE IF NOT EXISTS cluster_metrics (
|
||||
id SERIAL PRIMARY KEY,
|
||||
cluster_id INTEGER,
|
||||
cluster_name VARCHAR(100),
|
||||
cpu_avg DOUBLE PRECISION,
|
||||
memory_avg DOUBLE PRECISION,
|
||||
created_at TIMESTAMPTZ
|
||||
)
|
||||
"""))
|
||||
await db.execute(text("ALTER TABLE node_metrics ADD COLUMN IF NOT EXISTS node_id INTEGER"))
|
||||
await db.execute(text("ALTER TABLE node_metrics ADD COLUMN IF NOT EXISTS hostname VARCHAR(100)"))
|
||||
await db.execute(text("ALTER TABLE node_metrics ADD COLUMN IF NOT EXISTS cpu_usage DOUBLE PRECISION"))
|
||||
await db.execute(text("ALTER TABLE node_metrics ADD COLUMN IF NOT EXISTS memory_usage DOUBLE PRECISION"))
|
||||
await db.execute(text("ALTER TABLE node_metrics ADD COLUMN IF NOT EXISTS created_at TIMESTAMPTZ"))
|
||||
await db.execute(text("ALTER TABLE node_metrics ADD COLUMN IF NOT EXISTS cluster_id INTEGER"))
|
||||
await db.execute(text("ALTER TABLE cluster_metrics ADD COLUMN IF NOT EXISTS cluster_name VARCHAR(100)"))
|
||||
await db.execute(text("ALTER TABLE cluster_metrics ADD COLUMN IF NOT EXISTS cpu_avg DOUBLE PRECISION"))
|
||||
await db.execute(text("ALTER TABLE cluster_metrics ADD COLUMN IF NOT EXISTS memory_avg DOUBLE PRECISION"))
|
||||
await db.execute(text("ALTER TABLE cluster_metrics ADD COLUMN IF NOT EXISTS created_at TIMESTAMPTZ"))
|
||||
await db.execute(text("ALTER TABLE cluster_metrics ADD COLUMN IF NOT EXISTS cluster_id INTEGER"))
|
||||
await db.commit()
|
||||
|
||||
def _parse_time(s: str | None) -> datetime | None:
|
||||
if not s:
|
||||
return None
|
||||
try:
|
||||
dt = datetime.fromisoformat(s.replace("Z", "+00:00"))
|
||||
if dt.tzinfo is None:
|
||||
return dt.replace(tzinfo=BJ_TZ)
|
||||
return dt.astimezone(BJ_TZ)
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
@router.get("/logs")
|
||||
async def list_logs(
|
||||
user=Depends(get_current_user),
|
||||
db: AsyncSession = Depends(get_db),
|
||||
cluster: str | None = Query(None),
|
||||
node: str | None = Query(None),
|
||||
source: str | None = Query(None),
|
||||
time_from: str | None = Query(None),
|
||||
page: int = Query(1, ge=1),
|
||||
size: int = Query(10, ge=1, le=100),
|
||||
):
|
||||
try:
|
||||
stmt = select(HadoopLog)
|
||||
count_stmt = select(func.count(HadoopLog.log_id))
|
||||
|
||||
filters = []
|
||||
if cluster:
|
||||
filters.append(HadoopLog.cluster_name == cluster)
|
||||
if node:
|
||||
filters.append(HadoopLog.node_host == node)
|
||||
if source:
|
||||
like = f"%{source}%"
|
||||
filters.append(or_(HadoopLog.title.ilike(like), HadoopLog.info.ilike(like), HadoopLog.node_host.ilike(like)))
|
||||
tf = _parse_time(time_from)
|
||||
if tf:
|
||||
filters.append(HadoopLog.log_time >= tf)
|
||||
|
||||
for f in filters:
|
||||
stmt = stmt.where(f)
|
||||
count_stmt = count_stmt.where(f)
|
||||
|
||||
stmt = stmt.order_by(HadoopLog.log_time.desc()).offset((page - 1) * size).limit(size)
|
||||
rows = (await db.execute(stmt)).scalars().all()
|
||||
total = (await db.execute(count_stmt)).scalar() or 0
|
||||
|
||||
items = [
|
||||
{
|
||||
"id": r.log_id,
|
||||
"time": r.log_time.isoformat() if r.log_time else None,
|
||||
"cluster": r.cluster_name,
|
||||
"node": r.node_host,
|
||||
"title": r.title,
|
||||
"info": r.info,
|
||||
}
|
||||
for r in rows
|
||||
]
|
||||
return {"items": items, "total": int(total)}
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
print(f"Error listing logs: {e}")
|
||||
raise HTTPException(status_code=500, detail="server_error")
|
||||
|
||||
async def get_node_ip(db: AsyncSession, node_name: str) -> str:
|
||||
result = await db.execute(select(Node.ip_address).where(Node.hostname == node_name))
|
||||
ip = result.scalar_one_or_none()
|
||||
if not ip:
|
||||
raise HTTPException(status_code=404, detail=f"Node {node_name} not found")
|
||||
return str(ip)
|
||||
|
||||
@router.get("/hadoop/nodes/")
|
||||
async def get_hadoop_nodes(user=Depends(get_current_user), db: AsyncSession = Depends(get_db)):
|
||||
"""Get list of all Hadoop nodes"""
|
||||
# Assuming all nodes in DB are relevant, or filter by Cluster type if needed
|
||||
stmt = select(Node.hostname).join(Cluster)
|
||||
# Optional: .where(Cluster.type.ilike('%hadoop%'))
|
||||
result = await db.execute(stmt)
|
||||
nodes = result.scalars().all()
|
||||
return NodeListResponse(nodes=nodes)
|
||||
|
||||
@router.get("/hadoop/logs/{node_name}/{log_type}/", response_model=LogResponse)
|
||||
async def get_hadoop_log(node_name: str, log_type: str, user=Depends(get_current_user), db: AsyncSession = Depends(get_db)):
|
||||
"""Get log from a specific Hadoop node"""
|
||||
ip = await get_node_ip(db, node_name)
|
||||
try:
|
||||
# Read log content
|
||||
log_content = log_reader.read_log(node_name, log_type, ip=ip)
|
||||
return LogResponse(
|
||||
node_name=node_name,
|
||||
log_type=log_type,
|
||||
log_content=log_content
|
||||
)
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
@router.get("/hadoop/logs/all/{log_type}/", response_model=MultiLogResponse)
|
||||
async def get_all_hadoop_nodes_log(log_type: str, user=Depends(get_current_user), db: AsyncSession = Depends(get_db)):
|
||||
"""Get logs from all Hadoop nodes"""
|
||||
stmt = select(Node.hostname, Node.ip_address).join(Cluster)
|
||||
result = await db.execute(stmt)
|
||||
nodes_data = result.all()
|
||||
|
||||
nodes_list = [{"name": n[0], "ip": str(n[1])} for n in nodes_data]
|
||||
|
||||
try:
|
||||
# Read logs from all nodes
|
||||
logs = log_reader.read_all_nodes_log(nodes_list, log_type)
|
||||
return MultiLogResponse(logs=logs)
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
@router.get("/hadoop/logs/files/{node_name}/", response_model=LogFilesResponse)
|
||||
async def get_hadoop_log_files(node_name: str, user=Depends(get_current_user), db: AsyncSession = Depends(get_db)):
|
||||
"""Get list of log files on a specific Hadoop node"""
|
||||
ip = await get_node_ip(db, node_name)
|
||||
try:
|
||||
# Get log files list
|
||||
log_files = log_reader.get_log_files_list(node_name, ip=ip)
|
||||
return LogFilesResponse(
|
||||
node_name=node_name,
|
||||
log_files=log_files
|
||||
)
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
# Log collection management endpoints
|
||||
@router.get("/hadoop/collectors/status/")
|
||||
async def get_hadoop_collectors_status(user=Depends(get_current_user)):
|
||||
"""Get status of all Hadoop log collectors"""
|
||||
status = log_collector.get_collectors_status()
|
||||
return {
|
||||
"collectors": status,
|
||||
"total_running": sum(status.values())
|
||||
}
|
||||
|
||||
@router.post("/hadoop/collectors/start/{node_name}/{log_type}/")
|
||||
async def start_hadoop_collector(node_name: str, log_type: str, interval: int = 5, user=Depends(get_current_user), db: AsyncSession = Depends(get_db)):
|
||||
"""Start log collection for a specific Hadoop node and log type"""
|
||||
ip = await get_node_ip(db, node_name)
|
||||
try:
|
||||
log_collector.start_collection(node_name, log_type, ip=ip, interval=interval)
|
||||
return {
|
||||
"message": f"Started log collection for {node_name}_{log_type}",
|
||||
"interval": interval
|
||||
}
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
@router.post("/hadoop/collectors/stop/{node_name}/{log_type}/")
|
||||
async def stop_hadoop_collector(node_name: str, log_type: str, user=Depends(get_current_user)):
|
||||
"""Stop log collection for a specific Hadoop node and log type"""
|
||||
# stop doesn't need IP as it just stops the thread by ID
|
||||
try:
|
||||
log_collector.stop_collection(node_name, log_type)
|
||||
return {
|
||||
"message": f"Stopped log collection for {node_name}_{log_type}"
|
||||
}
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
@router.post("/hadoop/collectors/stop/all/")
|
||||
async def stop_all_hadoop_collectors(user=Depends(get_current_user)):
|
||||
"""Stop all Hadoop log collectors"""
|
||||
try:
|
||||
log_collector.stop_all_collections()
|
||||
return {
|
||||
"message": "Stopped all log collectors"
|
||||
}
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
@router.post("/hadoop/collectors/set-interval/{interval}/")
|
||||
async def set_hadoop_collection_interval(interval: int, user=Depends(get_current_user)):
|
||||
"""Set collection interval for all Hadoop collectors"""
|
||||
try:
|
||||
log_collector.set_collection_interval(interval)
|
||||
return {
|
||||
"message": f"Set collection interval to {interval} seconds"
|
||||
}
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
@router.post("/hadoop/collectors/set-log-dir/{log_dir}/")
|
||||
async def set_hadoop_log_directory(log_dir: str, user=Depends(get_current_user)):
|
||||
"""Set log directory for all Hadoop collectors"""
|
||||
try:
|
||||
log_collector.set_log_dir(log_dir)
|
||||
return {
|
||||
"message": f"Set log directory to {log_dir}"
|
||||
}
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
@router.post("/hadoop/nodes/{node_name}/execute/")
|
||||
async def execute_hadoop_command(node_name: str, command: str, timeout: int = 30, user=Depends(get_current_user)):
|
||||
"""Execute a command on a specific Hadoop node"""
|
||||
try:
|
||||
from sqlalchemy import select
|
||||
from ..db import SessionLocal
|
||||
from ..models.nodes import Node
|
||||
async with SessionLocal() as db:
|
||||
res = await db.execute(select(Node.ip_address).where(Node.hostname == node_name).limit(1))
|
||||
ip = res.scalar_one_or_none()
|
||||
if not ip:
|
||||
raise HTTPException(status_code=404, detail=f"Node {node_name} not found")
|
||||
ssh_client = ssh_manager.get_connection(node_name, ip=str(ip))
|
||||
|
||||
# Execute command with timeout
|
||||
stdout, stderr = ssh_client.execute_command_with_timeout(command, timeout)
|
||||
|
||||
return {
|
||||
"node_name": node_name,
|
||||
"command": command,
|
||||
"stdout": stdout,
|
||||
"stderr": stderr,
|
||||
"status": "success" if not stderr else "error"
|
||||
}
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
@router.post("/hadoop/collectors/start-by-cluster/{cluster_uuid}/")
|
||||
async def start_collectors_by_cluster(cluster_uuid: str, interval: int = 5, user=Depends(get_current_user), db: AsyncSession = Depends(get_db)):
|
||||
"""Start log collection for all nodes of the cluster (by UUID), only for existing services"""
|
||||
try:
|
||||
cid_res = await db.execute(select(Cluster.id).where(Cluster.uuid == cluster_uuid).limit(1))
|
||||
cid = cid_res.scalar_one_or_none()
|
||||
if cid is None:
|
||||
raise HTTPException(status_code=404, detail="cluster_not_found")
|
||||
nodes_res = await db.execute(select(Node.hostname, Node.ip_address).where(Node.cluster_id == cid))
|
||||
rows = nodes_res.all()
|
||||
if not rows:
|
||||
return {"started": 0, "nodes": []}
|
||||
started = []
|
||||
for hn, ip in rows:
|
||||
ip_s = str(ip)
|
||||
files = []
|
||||
try:
|
||||
log_reader.find_working_log_dir(hn, ip_s)
|
||||
files = log_reader.get_log_files_list(hn, ip=ip_s)
|
||||
except Exception:
|
||||
files = []
|
||||
services = []
|
||||
for fn in files:
|
||||
f = fn.lower()
|
||||
if "namenode" in f:
|
||||
services.append("namenode")
|
||||
elif "secondarynamenode" in f:
|
||||
services.append("secondarynamenode")
|
||||
elif "datanode" in f:
|
||||
services.append("datanode")
|
||||
elif "resourcemanager" in f:
|
||||
services.append("resourcemanager")
|
||||
elif "nodemanager" in f:
|
||||
services.append("nodemanager")
|
||||
elif "historyserver" in f:
|
||||
services.append("historyserver")
|
||||
services = list(set(services))
|
||||
for t in services:
|
||||
ok = False
|
||||
try:
|
||||
ok = log_collector.start_collection(hn, t, ip=ip_s, interval=interval)
|
||||
except Exception:
|
||||
ok = False
|
||||
if ok:
|
||||
started.append(f"{hn}_{t}")
|
||||
return {"started": len(started), "nodes": started, "interval": interval}
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
@router.post("/hadoop/collectors/backfill-by-cluster/{cluster_uuid}/")
|
||||
async def backfill_logs_by_cluster(cluster_uuid: str, user=Depends(get_current_user), db: AsyncSession = Depends(get_db)):
|
||||
try:
|
||||
cid_res = await db.execute(select(Cluster.id).where(Cluster.uuid == cluster_uuid).limit(1))
|
||||
cid = cid_res.scalar_one_or_none()
|
||||
if cid is None:
|
||||
raise HTTPException(status_code=404, detail="cluster_not_found")
|
||||
nodes_res = await db.execute(select(Node.hostname, Node.ip_address).where(Node.cluster_id == cid))
|
||||
rows = nodes_res.all()
|
||||
if not rows:
|
||||
return {"backfilled": 0, "details": []}
|
||||
details = []
|
||||
for hn, ip in rows:
|
||||
ip_s = str(ip)
|
||||
ssh_client = ssh_manager.get_connection(hn, ip=ip_s)
|
||||
candidates = [
|
||||
"/opt/module/hadoop-3.1.3/logs",
|
||||
"/usr/local/hadoop/logs",
|
||||
"/usr/local/hadoop-3.3.6/logs",
|
||||
"/usr/local/hadoop-3.3.5/logs",
|
||||
"/usr/local/hadoop-3.1.3/logs",
|
||||
"/opt/hadoop/logs",
|
||||
"/var/log/hadoop",
|
||||
]
|
||||
base = None
|
||||
for d in candidates:
|
||||
out, err = ssh_client.execute_command(f"ls -1 {d} 2>/dev/null")
|
||||
if not err and out.strip():
|
||||
base = d
|
||||
break
|
||||
services = []
|
||||
count = 0
|
||||
if base:
|
||||
out, err = ssh_client.execute_command(f"ls -1 {base} 2>/dev/null")
|
||||
if not err and out.strip():
|
||||
for fn in out.splitlines():
|
||||
f = fn.lower()
|
||||
t = None
|
||||
if "namenode" in f:
|
||||
t = "namenode"
|
||||
elif "secondarynamenode" in f:
|
||||
t = "secondarynamenode"
|
||||
elif "datanode" in f:
|
||||
t = "datanode"
|
||||
elif "resourcemanager" in f:
|
||||
t = "resourcemanager"
|
||||
elif "nodemanager" in f:
|
||||
t = "nodemanager"
|
||||
elif "historyserver" in f:
|
||||
t = "historyserver"
|
||||
if t:
|
||||
services.append(t)
|
||||
out2, err2 = ssh_client.execute_command(f"cat {base}/{fn} 2>/dev/null")
|
||||
if not err2 and out2:
|
||||
log_collector._save_log_chunk(hn, t, out2)
|
||||
count += out2.count("\n")
|
||||
details.append({"node": hn, "services": list(set(services)), "lines": count})
|
||||
total_lines = sum(d["lines"] for d in details)
|
||||
return {"backfilled": total_lines, "details": details}
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
@router.post("/metrics/{cluster_uuid}/")
|
||||
async def sync_metrics(cluster_uuid: str, user=Depends(get_current_user), db: AsyncSession = Depends(get_db)):
|
||||
try:
|
||||
from sqlalchemy import select
|
||||
try:
|
||||
metrics_collector.stop_all()
|
||||
except Exception:
|
||||
pass
|
||||
cid_res = await db.execute(select(Cluster.id, Cluster.name).where(Cluster.uuid == cluster_uuid).limit(1))
|
||||
row = cid_res.first()
|
||||
if not row:
|
||||
raise HTTPException(status_code=404, detail="cluster_not_found")
|
||||
cid, cname = row
|
||||
nodes_res = await db.execute(select(Node.id, Node.hostname, Node.ip_address).where(Node.cluster_id == cid))
|
||||
rows = nodes_res.all()
|
||||
now = now_bj()
|
||||
details = []
|
||||
for nid, hn, ip in rows:
|
||||
ssh_client = ssh_manager.get_connection(hn, ip=str(ip))
|
||||
out1, err1 = ssh_client.execute_command("cat /proc/stat | head -n 1")
|
||||
time.sleep(0.5)
|
||||
out2, err2 = ssh_client.execute_command("cat /proc/stat | head -n 1")
|
||||
cpu_pct = 0.0
|
||||
if not err1 and not err2 and out1.strip() and out2.strip():
|
||||
p1 = out1.strip().split()
|
||||
p2 = out2.strip().split()
|
||||
v1 = [int(x) for x in p1[1:]]
|
||||
v2 = [int(x) for x in p2[1:]]
|
||||
get1 = lambda i: (v1[i] if i < len(v1) else 0)
|
||||
get2 = lambda i: (v2[i] if i < len(v2) else 0)
|
||||
idle = (get2(3) + get2(4)) - (get1(3) + get1(4))
|
||||
total = (get2(0) - get1(0)) + (get2(1) - get1(1)) + (get2(2) - get1(2)) + idle + (get2(5) - get1(5)) + (get2(6) - get1(6)) + (get2(7) - get1(7))
|
||||
if total > 0:
|
||||
cpu_pct = round((1.0 - idle / total) * 100.0, 2)
|
||||
outm, errm = ssh_client.execute_command("cat /proc/meminfo")
|
||||
mem_pct = 0.0
|
||||
if not errm and outm.strip():
|
||||
mt = 0
|
||||
ma = 0
|
||||
for line in outm.splitlines():
|
||||
if line.startswith("MemTotal:"):
|
||||
mt = int(line.split()[1])
|
||||
elif line.startswith("MemAvailable:"):
|
||||
ma = int(line.split()[1])
|
||||
if mt > 0:
|
||||
mem_pct = round((1.0 - (ma / mt)) * 100.0, 2)
|
||||
details.append({"node": hn, "cpu": cpu_pct, "memory": mem_pct})
|
||||
if details:
|
||||
ca = round(sum(d["cpu"] for d in details) / len(details), 3)
|
||||
ma = round(sum(d["memory"] for d in details) / len(details), 3)
|
||||
else:
|
||||
ca = 0.0
|
||||
ma = 0.0
|
||||
return {"cluster": {"cpu_avg": round(ca, 2), "memory_avg": round(ma, 2), "time": now.isoformat(), "cluster_name": cname}, "nodes": details}
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
@ -1,16 +0,0 @@
|
||||
from fastapi import APIRouter, Depends
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
from sqlalchemy import text
|
||||
from ..db import get_db
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
@router.get("/health")
|
||||
async def health_check(db: AsyncSession = Depends(get_db)):
|
||||
"""健康检查,包括数据库连接验证。"""
|
||||
try:
|
||||
# 尝试执行一个简单的查询来验证数据库连接
|
||||
await db.execute(text("SELECT 1"))
|
||||
return {"status": "ok", "database": "connected"}
|
||||
except Exception as e:
|
||||
return {"status": "ok", "database": f"disconnected: {str(e)}"}
|
||||
@ -1,120 +0,0 @@
|
||||
from fastapi import APIRouter, Depends, HTTPException, Query
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
from sqlalchemy import select, update, delete, func, text
|
||||
from ..db import get_db
|
||||
from ..deps.auth import get_current_user
|
||||
from ..models.nodes import Node
|
||||
from ..models.clusters import Cluster
|
||||
from pydantic import BaseModel
|
||||
from datetime import datetime, timezone
|
||||
from ..config import now_bj
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
|
||||
def _get_username(u) -> str:
|
||||
return getattr(u, "username", None) or (u.get("username") if isinstance(u, dict) else None)
|
||||
|
||||
|
||||
def _status_to_contract(s: str) -> str:
|
||||
if s == "healthy":
|
||||
return "running"
|
||||
if s == "unhealthy":
|
||||
return "stopped"
|
||||
return s or "unknown"
|
||||
|
||||
|
||||
def _fmt_percent(v: float | None) -> str:
|
||||
if v is None:
|
||||
return "-"
|
||||
return f"{int(round(v))}%"
|
||||
|
||||
|
||||
def _fmt_updated(ts: datetime | None) -> str:
    """Format a heartbeat timestamp as a human-readable relative age (Chinese)."""
    if not ts:
        return "-"
    # Elapsed whole seconds relative to Beijing-time "now".
    elapsed = int((now_bj() - ts).total_seconds())
    if elapsed < 60:
        return "刚刚"
    if elapsed < 3600:
        return f"{elapsed // 60}分钟前"
    return f"{elapsed // 3600}小时前"
|
||||
|
||||
|
||||
class NodeDetail(BaseModel):
    """Response schema for the node-detail endpoint."""
    # Node hostname.
    name: str
    # Formatted metric strings keyed by contract field names
    # (cpu/mem/disk/status/ip/lastHeartbeat — see node_detail below).
    metrics: dict
|
||||
|
||||
|
||||
@router.get("/nodes")
|
||||
async def list_nodes(cluster: str = Query(...), user=Depends(get_current_user), db: AsyncSession = Depends(get_db)):
|
||||
"""拉取指定集群的节点列表。"""
|
||||
try:
|
||||
name = _get_username(user)
|
||||
uid_res = await db.execute(text("SELECT id FROM users WHERE username=:un LIMIT 1"), {"un": name})
|
||||
uid_row = uid_res.first()
|
||||
if not uid_row:
|
||||
return {"nodes": []}
|
||||
cid_res = await db.execute(select(Cluster.id).where(Cluster.uuid == cluster).limit(1))
|
||||
cid = cid_res.scalars().first()
|
||||
if not cid:
|
||||
return {"nodes": []}
|
||||
auth_res = await db.execute(text("SELECT 1 FROM user_cluster_mapping WHERE user_id=:uid AND cluster_id=:cid LIMIT 1"), {"uid": uid_row[0], "cid": cid})
|
||||
if not auth_res.first():
|
||||
raise HTTPException(status_code=403, detail="not_allowed")
|
||||
result = await db.execute(select(Node).where(Node.cluster_id == cid).limit(500))
|
||||
rows = result.scalars().all()
|
||||
data = [
|
||||
{
|
||||
"name": n.hostname,
|
||||
"ip": str(getattr(n, "ip_address", "")) if getattr(n, "ip_address", None) else None,
|
||||
"status": _status_to_contract(n.status),
|
||||
"cpu": _fmt_percent(n.cpu_usage),
|
||||
"mem": _fmt_percent(n.memory_usage),
|
||||
"updated": _fmt_updated(n.last_heartbeat),
|
||||
}
|
||||
for n in rows
|
||||
]
|
||||
return {"nodes": data}
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception:
|
||||
raise HTTPException(status_code=500, detail="server_error")
|
||||
|
||||
|
||||
@router.get("/nodes/{name}")
|
||||
async def node_detail(name: str, user=Depends(get_current_user), db: AsyncSession = Depends(get_db)):
|
||||
"""查询节点详情。"""
|
||||
try:
|
||||
name_u = _get_username(user)
|
||||
uid_res = await db.execute(text("SELECT id FROM users WHERE username=:un LIMIT 1"), {"un": name_u})
|
||||
uid_row = uid_res.first()
|
||||
if not uid_row:
|
||||
raise HTTPException(status_code=404, detail="not_found")
|
||||
# 仅返回用户可访问集群中的该节点
|
||||
ids_res = await db.execute(text("SELECT cluster_id FROM user_cluster_mapping WHERE user_id=:uid"), {"uid": uid_row[0]})
|
||||
cluster_ids = [r[0] for r in ids_res.all()]
|
||||
if not cluster_ids:
|
||||
raise HTTPException(status_code=404, detail="not_found")
|
||||
res = await db.execute(select(Node).where(Node.hostname == name, Node.cluster_id.in_(cluster_ids)).limit(1))
|
||||
n = res.scalars().first()
|
||||
if not n:
|
||||
raise HTTPException(status_code=404, detail="not_found")
|
||||
return NodeDetail(
|
||||
name=n.hostname,
|
||||
metrics={
|
||||
"cpu": _fmt_percent(n.cpu_usage),
|
||||
"mem": _fmt_percent(n.memory_usage),
|
||||
"disk": _fmt_percent(n.disk_usage),
|
||||
"status": _status_to_contract(n.status),
|
||||
"ip": str(getattr(n, "ip_address", "")) if getattr(n, "ip_address", None) else None,
|
||||
"lastHeartbeat": getattr(n, "last_heartbeat", None).isoformat() if getattr(n, "last_heartbeat", None) else None,
|
||||
},
|
||||
).model_dump()
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception:
|
||||
raise HTTPException(status_code=500, detail="server_error")
|
||||
|
||||
|
||||
@ -1,10 +0,0 @@
|
||||
from fastapi import APIRouter, Depends
|
||||
from ..deps.auth import get_current_user
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
@router.get("/user/me")
|
||||
async def me(user = Depends(get_current_user)):
|
||||
if isinstance(user, dict):
|
||||
return {"username": user.get("username"), "fullName": user.get("full_name"), "isActive": user.get("is_active")}
|
||||
return {"username": user.username, "fullName": user.full_name, "isActive": user.is_active}
|
||||
@ -1,61 +0,0 @@
|
||||
from fastapi import APIRouter, Depends, HTTPException, Query
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
from sqlalchemy import select, delete, func
|
||||
from ..db import get_db
|
||||
from ..models.sys_exec_logs import SysExecLog
|
||||
from ..deps.auth import get_current_user
|
||||
from pydantic import BaseModel
|
||||
from datetime import datetime
|
||||
|
||||
router = APIRouter()
|
||||
|
||||
class SysExecLogCreate(BaseModel):
    """Request body for creating a system execution log entry."""
    # Id of the user the operation is attributed to.
    user_id: int
    # Free-text description of the operation performed.
    description: str
|
||||
|
||||
@router.get("/sys-exec-logs")
|
||||
async def list_sys_exec_logs(
|
||||
user=Depends(get_current_user),
|
||||
db: AsyncSession = Depends(get_db),
|
||||
page: int = Query(1, ge=1),
|
||||
size: int = Query(10, ge=1, le=100),
|
||||
):
|
||||
try:
|
||||
stmt = select(SysExecLog).order_by(SysExecLog.operation_time.desc()).offset((page - 1) * size).limit(size)
|
||||
count_stmt = select(func.count(SysExecLog.operation_id))
|
||||
|
||||
rows = (await db.execute(stmt)).scalars().all()
|
||||
total = (await db.execute(count_stmt)).scalar() or 0
|
||||
|
||||
return {
|
||||
"items": [r.to_dict() for r in rows],
|
||||
"total": int(total)
|
||||
}
|
||||
except Exception as e:
|
||||
print(f"Error listing sys exec logs: {e}")
|
||||
raise HTTPException(status_code=500, detail="server_error")
|
||||
|
||||
@router.post("/sys-exec-logs")
|
||||
async def create_sys_exec_log(req: SysExecLogCreate, user=Depends(get_current_user), db: AsyncSession = Depends(get_db)):
|
||||
try:
|
||||
row = SysExecLog(
|
||||
user_id=req.user_id,
|
||||
description=req.description
|
||||
)
|
||||
db.add(row)
|
||||
await db.commit()
|
||||
return {"ok": True, "operation_id": str(row.operation_id)}
|
||||
except Exception as e:
|
||||
print(f"Error creating sys exec log: {e}")
|
||||
raise HTTPException(status_code=500, detail="server_error")
|
||||
|
||||
@router.delete("/sys-exec-logs/{operation_id}")
|
||||
async def delete_sys_exec_log(operation_id: str, user=Depends(get_current_user), db: AsyncSession = Depends(get_db)):
|
||||
try:
|
||||
# Note: operation_id is UUID
|
||||
await db.execute(delete(SysExecLog).where(SysExecLog.operation_id == operation_id))
|
||||
await db.commit()
|
||||
return {"ok": True}
|
||||
except Exception as e:
|
||||
print(f"Error deleting sys exec log: {e}")
|
||||
raise HTTPException(status_code=500, detail="server_error")
|
||||
@ -1,39 +0,0 @@
|
||||
from pydantic import BaseModel
|
||||
from typing import List, Dict, Optional
|
||||
|
||||
class LogRequest(BaseModel):
    """Log request model"""
    # Target node hostname.
    node_name: str
    # Hadoop service whose log is requested (e.g. "namenode", "datanode").
    log_type: str
    # Optional date-range filter; string format is not enforced here —
    # presumably ISO dates, confirm against the log reader.
    start_date: Optional[str] = None
    end_date: Optional[str] = None
|
||||
|
||||
class SaveLogRequest(BaseModel):
    """Save log request model"""
    # Target node hostname.
    node_name: str
    # Hadoop service whose log is to be saved.
    log_type: str
    # Destination path on the server's local filesystem.
    local_file_path: str
|
||||
|
||||
class LogResponse(BaseModel):
    """Log response model"""
    # Node the log was read from.
    node_name: str
    # Hadoop service the log belongs to.
    log_type: str
    # Raw log text.
    log_content: str
|
||||
|
||||
class MultiLogResponse(BaseModel):
    """Multiple logs response model"""
    # Log text per node — presumably keyed by node name; confirm against
    # log_reader.read_all_nodes_log.
    logs: Dict[str, str]
|
||||
|
||||
class SaveLogResponse(BaseModel):
    """Save log response model"""
    # Human-readable result message.
    message: str
    # Path the log was written to on the server.
    local_file_path: str
|
||||
|
||||
class NodeListResponse(BaseModel):
    """Node list response model"""
    # Node hostnames.
    nodes: List[str]
|
||||
|
||||
class LogFilesResponse(BaseModel):
    """Log files list response model"""
    # Node the listing was taken from.
    node_name: str
    # File names found in the node's log directory.
    log_files: List[str]
|
||||
@ -1,33 +0,0 @@
|
||||
import asyncio
|
||||
import argparse
|
||||
from sqlalchemy import select
|
||||
from app.db import SessionLocal
|
||||
from app.models.nodes import Node
|
||||
from app.models.clusters import Cluster
|
||||
from app.metrics_collector import metrics_collector
|
||||
|
||||
async def run(uuid: str):
    """Collect one round of CPU/memory metrics for every node of a cluster.

    Prints NO_CLUSTER / NO_NODES and returns early when nothing matches;
    otherwise prints DONE with the node count.
    """
    async with SessionLocal() as session:
        cid_res = await session.execute(select(Cluster.id).where(Cluster.uuid == uuid).limit(1))
        cid = cid_res.scalars().first()
        if not cid:
            print("NO_CLUSTER")
            return
        res = await session.execute(select(Node.id, Node.hostname, Node.ip_address).where(Node.cluster_id == cid))
        rows = res.all()
        if not rows:
            print("NO_NODES")
            return
        for nid, hn, ip in rows:
            # NOTE(review): uses the collector's private _read_cpu_mem /
            # _save_metrics helpers; acceptable for an ad-hoc script, but a
            # public API would be safer.
            cpu, mem = metrics_collector._read_cpu_mem(hn, str(ip))
            await metrics_collector._save_metrics(nid, hn, cid, cpu, mem)
        print("DONE", len(rows))
|
||||
|
||||
def main():
    """CLI entry point: --cluster <uuid> selects the cluster to sample."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--cluster", required=True)
    args = parser.parse_args()
    asyncio.run(run(args.cluster))


if __name__ == "__main__":
    main()
|
||||
@ -1,27 +0,0 @@
|
||||
import os
|
||||
import asyncio
|
||||
from sqlalchemy import text
|
||||
from app.db import engine
|
||||
|
||||
async def main():
    """Migration helper: add avg-metric columns to clusters, then backfill.

    Scoped to one cluster when CLUSTER_UUID is set in the environment;
    otherwise backfills every clusters row with the global node averages.
    """
    uuid = os.environ.get("CLUSTER_UUID")
    # DDL pass — idempotent thanks to IF NOT EXISTS.
    async with engine.begin() as conn:
        await conn.execute(text("ALTER TABLE clusters ADD COLUMN IF NOT EXISTS cpu_avg double precision"))
        await conn.execute(text("ALTER TABLE clusters ADD COLUMN IF NOT EXISTS memory_avg double precision"))
        await conn.execute(text("ALTER TABLE clusters ADD COLUMN IF NOT EXISTS last_avg_at timestamptz"))
    # Backfill pass in a second transaction.
    async with engine.begin() as conn:
        if uuid:
            res = await conn.execute(text("SELECT id FROM clusters WHERE uuid=:u LIMIT 1"), {"u": uuid})
            row = res.first()
            if row:
                cid = row[0]
                avg = await conn.execute(text("SELECT AVG(cpu_usage), AVG(memory_usage) FROM nodes WHERE cluster_id=:cid"), {"cid": cid})
                ar = avg.first()
                # NULL averages (no nodes) default to 0.0.
                await conn.execute(text("UPDATE clusters SET cpu_avg=:ca, memory_avg=:ma, last_avg_at=NOW() WHERE id=:cid"), {"ca": float(ar[0] or 0.0), "ma": float(ar[1] or 0.0), "cid": cid})
        else:
            # No scope: apply the global node averages to ALL cluster rows.
            avg = await conn.execute(text("SELECT AVG(cpu_usage), AVG(memory_usage) FROM nodes"))
            ar = avg.first()
            await conn.execute(text("UPDATE clusters SET cpu_avg=:ca, memory_avg=:ma, last_avg_at=NOW()"), {"ca": float(ar[0] or 0.0), "ma": float(ar[1] or 0.0)})


if __name__ == "__main__":
    asyncio.run(main())
|
||||
@ -1,12 +0,0 @@
|
||||
import asyncio
|
||||
from sqlalchemy import text
|
||||
from app.db import engine
|
||||
|
||||
async def main():
    """Debug helper: print a small sample of node rows for a sanity check."""
    async with engine.begin() as conn:
        res = await conn.execute(text('SELECT id, hostname, cpu_usage, memory_usage, last_heartbeat FROM nodes ORDER BY id LIMIT 5'))
        for row in res.all():
            print('NODE', row)


if __name__ == '__main__':
    asyncio.run(main())
|
||||
@ -1,2 +0,0 @@
|
||||
# Debug helper: prints the resolved database URL.
# WARNING(review): DATABASE_URL embeds credentials — do not run this where
# output is captured in logs, and avoid committing real credentials at all.
from app.config import DATABASE_URL
print(DATABASE_URL)
|
||||
@ -1,14 +0,0 @@
|
||||
import asyncio
|
||||
from sqlalchemy import text
|
||||
from app.db import engine
|
||||
|
||||
async def main():
    """Debug helper: print the hadoop_logs row count and the 5 newest entries."""
    async with engine.begin() as conn:
        c = await conn.execute(text('SELECT COUNT(*) FROM hadoop_logs'))
        print('HADOOP_LOGS_COUNT', c.scalar() or 0)
        rows = await conn.execute(text('SELECT cluster_name,node_host,title,log_time FROM hadoop_logs ORDER BY log_id DESC LIMIT 5'))
        for r in rows.all():
            print('LOG', r)


if __name__ == "__main__":
    asyncio.run(main())
|
||||
@ -1,17 +0,0 @@
|
||||
import os
|
||||
import asyncio
|
||||
from sqlalchemy import text
|
||||
from app.db import engine
|
||||
|
||||
async def main():
    """Debug helper: print the stored cluster-average metrics.

    Scoped by the CLUSTER_UUID env var when set; otherwise reads an
    arbitrary clusters row.
    """
    uuid = os.environ.get("CLUSTER_UUID")
    async with engine.begin() as conn:
        if uuid:
            res = await conn.execute(text("SELECT cpu_avg, memory_avg FROM clusters WHERE uuid=:u LIMIT 1"), {"u": uuid})
        else:
            res = await conn.execute(text("SELECT cpu_avg, memory_avg FROM clusters LIMIT 1"))
        row = res.first()
        # Missing row or NULL columns print as 0.0.
        print("CLUSTER_AVG_STORED", (float(row[0]) if row and row[0] is not None else 0.0), (float(row[1]) if row and row[1] is not None else 0.0))


if __name__ == "__main__":
    asyncio.run(main())
|
||||
@ -1,12 +0,0 @@
|
||||
import asyncio
|
||||
from sqlalchemy import text
|
||||
from app.db import engine
|
||||
|
||||
async def main():
    """Rollback helper — DESTRUCTIVE: drops the avg-metric columns from clusters."""
    async with engine.begin() as conn:
        await conn.execute(text("ALTER TABLE clusters DROP COLUMN IF EXISTS cpu_avg"))
        await conn.execute(text("ALTER TABLE clusters DROP COLUMN IF EXISTS memory_avg"))
        await conn.execute(text("ALTER TABLE clusters DROP COLUMN IF EXISTS last_avg_at"))


if __name__ == "__main__":
    asyncio.run(main())
|
||||
@ -1,66 +0,0 @@
|
||||
import os
|
||||
import asyncio
|
||||
import time
|
||||
from sqlalchemy import select, text
|
||||
from app.db import SessionLocal, engine
|
||||
from app.models.clusters import Cluster
|
||||
from app.models.nodes import Node
|
||||
from app.log_reader import log_reader
|
||||
from app.log_collector import log_collector
|
||||
|
||||
async def run(cluster_uuid: str, interval: int = 3, duration: int = 10):
    """Smoke-test log collection for a cluster.

    Starts collectors for every service detected from each node's log file
    names, lets them run for ``duration`` seconds, stops them, and prints the
    change in the hadoop_logs row count plus the five newest entries.
    """
    async with engine.begin() as conn:
        res = await conn.execute(text("SELECT id FROM clusters WHERE uuid=:u LIMIT 1"), {"u": cluster_uuid})
        row = res.first()
        if not row:
            print("CLUSTER_NOT_FOUND")
            return
        cid = row[0]
        before = await conn.execute(text("SELECT COUNT(*) FROM hadoop_logs"))
        print("HADOOP_LOGS_BEFORE", before.scalar() or 0)
    async with SessionLocal() as session:
        nodes_res = await session.execute(select(Node.hostname, Node.ip_address).where(Node.cluster_id == cid))
        nodes = [(r[0], str(r[1])) for r in nodes_res.all()]
        started = []
        for hn, ip in nodes:
            try:
                # Best-effort probe: unreachable nodes contribute no services.
                log_reader.find_working_log_dir(hn, ip)
                files = log_reader.get_log_files_list(hn, ip=ip)
            except Exception:
                files = []
            services = set()
            for f in files:
                lf = f.lower()
                # Bug fix: test "secondarynamenode" before "namenode" — the
                # shorter substring also matches secondary-namenode file names
                # and previously misclassified them.
                if "secondarynamenode" in lf:
                    services.add("secondarynamenode")
                elif "namenode" in lf:
                    services.add("namenode")
                elif "datanode" in lf:
                    services.add("datanode")
                elif "resourcemanager" in lf:
                    services.add("resourcemanager")
                elif "nodemanager" in lf:
                    services.add("nodemanager")
                elif "historyserver" in lf:
                    services.add("historyserver")
            for t in services:
                ok = log_collector.start_collection(hn, t, ip=ip, interval=interval)
                if ok:
                    started.append(f"{hn}_{t}")
        # Bug fix: time.sleep() blocked the event loop for the whole window;
        # asyncio.sleep waits without blocking.
        await asyncio.sleep(duration)
        log_collector.stop_all_collections()
    async with engine.begin() as conn:
        after = await conn.execute(text("SELECT COUNT(*) FROM hadoop_logs"))
        print("HADOOP_LOGS_AFTER", after.scalar() or 0)
        last = await conn.execute(text("SELECT cluster_name, node_host, title, log_time FROM hadoop_logs ORDER BY log_id DESC LIMIT 5"))
        for row in last.all():
            print("LOG", row)
|
||||
|
||||
def main():
    """Entry point driven by env vars CLUSTER_UUID / LOG_INTERVAL / LOG_DURATION."""
    uuid = os.environ.get("CLUSTER_UUID")
    # NOTE(review): with CLUSTER_UUID unset, run(None) queries uuid=NULL and
    # prints CLUSTER_NOT_FOUND — consider failing fast with a clearer message.
    interval = int(os.environ.get("LOG_INTERVAL", "3"))
    duration = int(os.environ.get("LOG_DURATION", "10"))
    asyncio.run(run(uuid, interval=interval, duration=duration))


if __name__ == "__main__":
    main()
|
||||
@ -1,11 +0,0 @@
|
||||
import asyncio
|
||||
from sqlalchemy import text
|
||||
from app.db import SessionLocal
|
||||
|
||||
async def main():
    """Minimal connectivity probe: SELECT 1 through the async session factory."""
    async with SessionLocal() as session:
        res = await session.execute(text('SELECT 1'))
        print('OK', res.scalar())


if __name__ == '__main__':
    asyncio.run(main())
|
||||
@ -1,38 +0,0 @@
|
||||
import asyncio
|
||||
import os
|
||||
from sqlalchemy import text
|
||||
from app.db import engine
|
||||
|
||||
async def main():
    """Watch heartbeat/metric updates over a 10-second window.

    Counts nodes with a heartbeat before and after sleeping, then prints the
    five most recently updated nodes and the cluster-wide CPU/memory
    averages. Scoped to one cluster when CLUSTER_UUID is set.
    """
    uuid = os.environ.get("CLUSTER_UUID")
    async with engine.begin() as conn:
        cid = None
        if uuid:
            res = await conn.execute(text("SELECT id FROM clusters WHERE uuid=:u LIMIT 1"), {"u": uuid})
            row = res.first()
            cid = row[0] if row else None
        if cid:
            res1 = await conn.execute(text("SELECT COUNT(*) FROM nodes WHERE cluster_id=:cid AND last_heartbeat IS NOT NULL"), {"cid": cid})
        else:
            res1 = await conn.execute(text("SELECT COUNT(*) FROM nodes WHERE last_heartbeat IS NOT NULL"))
        c1 = res1.scalar() or 0
        print('NODES_WITH_HEARTBEAT_BEFORE', c1)
    # Wait long enough for a background update cycle to land new heartbeats.
    await asyncio.sleep(10)
    async with engine.begin() as conn:
        if cid:
            res2 = await conn.execute(text("SELECT COUNT(*) FROM nodes WHERE cluster_id=:cid AND last_heartbeat IS NOT NULL"), {"cid": cid})
            res3 = await conn.execute(text("SELECT hostname, cpu_usage, memory_usage, last_heartbeat FROM nodes WHERE cluster_id=:cid ORDER BY last_heartbeat DESC NULLS LAST LIMIT 5"), {"cid": cid})
            avg = await conn.execute(text("SELECT AVG(cpu_usage), AVG(memory_usage) FROM nodes WHERE cluster_id=:cid"), {"cid": cid})
        else:
            res2 = await conn.execute(text("SELECT COUNT(*) FROM nodes WHERE last_heartbeat IS NOT NULL"))
            res3 = await conn.execute(text("SELECT hostname, cpu_usage, memory_usage, last_heartbeat FROM nodes ORDER BY last_heartbeat DESC NULLS LAST LIMIT 5"))
            avg = await conn.execute(text("SELECT AVG(cpu_usage), AVG(memory_usage) FROM nodes"))
        c2 = res2.scalar() or 0
        print('NODES_WITH_HEARTBEAT_AFTER', c2)
        for row in res3.all():
            print('NODE', row)
        ar = avg.first()
        # NULL averages (no nodes) print as 0.0.
        print('CLUSTER_AVG', float(ar[0] or 0.0), float(ar[1] or 0.0))


if __name__ == '__main__':
    asyncio.run(main())
|
||||
@ -1 +0,0 @@
|
||||
|
||||
@ -1,51 +0,0 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from ..config import SSH_TIMEOUT
|
||||
from ..ssh_utils import SSHClient
|
||||
|
||||
|
||||
def collect_cluster_uuid(host: str, user: str, password: str, timeout: int | None = None) -> tuple[str | None, str | None, str | None]:
    """Read the HDFS clusterID from a NameNode host over SSH.

    Resolves dfs.namenode.name.dir via ``hdfs getconf``, then parses the
    clusterID line of ``<name_dir>/current/VERSION``. A leading "CID-"
    prefix is stripped from the returned id.

    Returns:
        (cluster_id, None, None) on success, or (None, stage, detail) on
        failure, where stage is one of "probe_name_dirs", "read_version",
        "parse_cluster_id", "connect_or_exec".
    """
    cli = None
    try:
        cli = SSHClient(str(host), user or "", password or "")
        out, err = cli.execute_command_with_timeout(
            "hdfs getconf -confKey dfs.namenode.name.dir",
            timeout or SSH_TIMEOUT,
        )
        if not out or not out.strip():
            return None, "probe_name_dirs", (err or "empty_output")

        # The property may list several dirs; the first suffices.
        name_dir = out.strip().split(",")[0]
        if name_dir.startswith("file://"):
            name_dir = name_dir[7:]
        version_path = f"{name_dir.rstrip('/')}/current/VERSION"

        version_out, version_err = cli.execute_command_with_timeout(
            f"cat {version_path}",
            timeout or SSH_TIMEOUT,
        )
        if not version_out or not version_out.strip():
            return None, "read_version", (version_err or "empty_output")

        # VERSION is a java-properties-style file; find "clusterID=...".
        cluster_id = None
        for line in version_out.splitlines():
            if "clusterID" in line:
                parts = line.strip().split("=", 1)
                if len(parts) == 2 and parts[0].strip() == "clusterID":
                    cluster_id = parts[1].strip()
                    break
        if not cluster_id:
            return None, "parse_cluster_id", version_out.strip()

        if cluster_id.startswith("CID-"):
            cluster_id = cluster_id[4:]
        return cluster_id, None, None
    except Exception as e:
        return None, "connect_or_exec", str(e)
    finally:
        # Always release the SSH connection; close errors are non-fatal.
        try:
            if cli:
                cli.close()
        except Exception:
            pass
|
||||
|
||||
@ -1,64 +0,0 @@
|
||||
import asyncio
|
||||
import os
|
||||
import shlex
|
||||
from typing import Optional, Tuple
|
||||
|
||||
|
||||
async def run_local_command(cmd: str, timeout: int = 30) -> Tuple[int, str, str]:
|
||||
"""运行本地命令,返回 (exit_code, stdout, stderr)。"""
|
||||
if os.name == "nt":
|
||||
prog = ["powershell", "-NoProfile", "-NonInteractive", "-Command", cmd]
|
||||
else:
|
||||
prog = ["bash", "-lc", cmd]
|
||||
proc = await asyncio.create_subprocess_exec(
|
||||
*prog,
|
||||
stdout=asyncio.subprocess.PIPE,
|
||||
stderr=asyncio.subprocess.PIPE,
|
||||
)
|
||||
try:
|
||||
out, err = await asyncio.wait_for(proc.communicate(), timeout=timeout)
|
||||
except asyncio.TimeoutError:
|
||||
try:
|
||||
proc.kill()
|
||||
except Exception:
|
||||
pass
|
||||
return (124, "", "timeout")
|
||||
return (proc.returncode or 0, out.decode(errors="ignore"), err.decode(errors="ignore"))
|
||||
|
||||
|
||||
def _build_ssh_prog(host: str, user: str, cmd: str, port: Optional[int] = None, identity_file: Optional[str] = None) -> list:
|
||||
"""构造 ssh 远程执行命令参数数组。"""
|
||||
prog = [
|
||||
"ssh",
|
||||
"-o",
|
||||
"BatchMode=yes",
|
||||
"-o",
|
||||
"StrictHostKeyChecking=no",
|
||||
]
|
||||
if port:
|
||||
prog += ["-p", str(port)]
|
||||
if identity_file:
|
||||
prog += ["-i", identity_file]
|
||||
target = f"{user}@{host}" if user else host
|
||||
prog += [target, "bash", "-lc", cmd]
|
||||
return prog
|
||||
|
||||
|
||||
async def run_remote_command(host: str, user: str, cmd: str, timeout: int = 30, port: Optional[int] = None, identity_file: Optional[str] = None) -> Tuple[int, str, str]:
|
||||
"""通过 ssh 在远端主机执行命令,返回 (exit_code, stdout, stderr)。"""
|
||||
prog = _build_ssh_prog(host, user, cmd, port=port, identity_file=identity_file)
|
||||
proc = await asyncio.create_subprocess_exec(
|
||||
*prog,
|
||||
stdout=asyncio.subprocess.PIPE,
|
||||
stderr=asyncio.subprocess.PIPE,
|
||||
)
|
||||
try:
|
||||
out, err = await asyncio.wait_for(proc.communicate(), timeout=timeout)
|
||||
except asyncio.TimeoutError:
|
||||
try:
|
||||
proc.kill()
|
||||
except Exception:
|
||||
pass
|
||||
return (124, "", "timeout")
|
||||
return (proc.returncode or 0, out.decode(errors="ignore"), err.decode(errors="ignore"))
|
||||
|
||||
@ -1,70 +0,0 @@
|
||||
from ..ssh_utils import SSHClient
|
||||
from ..config import SSH_TIMEOUT
|
||||
|
||||
def check_ssh_connectivity(host: str, user: str, password: str, timeout: int | None = None) -> tuple[bool, str | None]:
|
||||
try:
|
||||
cli = SSHClient(str(host), user or "", password or "")
|
||||
out, _ = cli.execute_command_with_timeout("echo ok", timeout or SSH_TIMEOUT)
|
||||
cli.close()
|
||||
if out is None:
|
||||
return (False, "no_output")
|
||||
if out.strip():
|
||||
return (True, None)
|
||||
return (False, "empty_output")
|
||||
except Exception as e:
|
||||
try:
|
||||
cli.close()
|
||||
except Exception:
|
||||
pass
|
||||
return (False, str(e))
|
||||
|
||||
def get_hdfs_cluster_id(host: str, user: str, password: str, timeout: int | None = None) -> tuple[str | None, str | None]:
|
||||
"""
|
||||
通过以下步骤获取 HDFS 集群 UUID:
|
||||
1. 执行 hdfs getconf -confKey dfs.namenode.name.dir 获取名称节点目录。
|
||||
2. 在该目录的 current 子目录下读取 VERSION 文件。
|
||||
3. 解析 VERSION 文件中的 clusterID 字段。
|
||||
4. 去掉 'CID-' 前缀并返回。
|
||||
"""
|
||||
try:
|
||||
cli = SSHClient(str(host), user or "", password or "")
|
||||
|
||||
# 1. 获取 dfs.namenode.name.dir
|
||||
dir_out, dir_err = cli.execute_command_with_timeout("hdfs getconf -confKey dfs.namenode.name.dir", timeout or SSH_TIMEOUT)
|
||||
if not dir_out or not dir_out.strip():
|
||||
cli.close()
|
||||
return None, f"Failed to get dfs.namenode.name.dir: {dir_err or 'Empty output'}"
|
||||
|
||||
# 处理可能存在的多个目录(取第一个)
|
||||
name_dir = dir_out.strip().split(',')[0]
|
||||
# 移除 file:// 前缀(如果存在)
|
||||
if name_dir.startswith("file://"):
|
||||
name_dir = name_dir[7:]
|
||||
|
||||
version_path = f"{name_dir.rstrip('/')}/current/VERSION"
|
||||
|
||||
# 2. 读取 VERSION 文件
|
||||
version_out, version_err = cli.execute_command_with_timeout(f"cat {version_path}", timeout or SSH_TIMEOUT)
|
||||
cli.close()
|
||||
|
||||
if not version_out or not version_out.strip():
|
||||
return None, f"Failed to read VERSION file at {version_path}: {version_err or 'Empty output'}"
|
||||
|
||||
# 3. 解析 clusterID
|
||||
cluster_id = None
|
||||
for line in version_out.splitlines():
|
||||
if line.startswith("clusterID="):
|
||||
cluster_id = line.split("=")[1].strip()
|
||||
break
|
||||
|
||||
if not cluster_id:
|
||||
return None, f"clusterID not found in {version_path}"
|
||||
|
||||
# 4. 去掉 'CID-' 前缀
|
||||
if cluster_id.startswith("CID-"):
|
||||
cluster_id = cluster_id[4:]
|
||||
|
||||
return cluster_id, None
|
||||
|
||||
except Exception as e:
|
||||
return None, str(e)
|
||||
@ -1,242 +0,0 @@
|
||||
import os
|
||||
import socket
|
||||
import paramiko
|
||||
from typing import Optional, TextIO, Dict, Tuple
|
||||
from .config import SSH_PORT, SSH_TIMEOUT
|
||||
|
||||
# Create a static node configuration dictionary that will be used for all requests
|
||||
# This avoids the issue of environment variables not being available in child processes
|
||||
STATIC_NODE_CONFIG = {
|
||||
"hadoop102": ("192.168.10.102", "hadoop", "limouren..."),
|
||||
"hadoop103": ("192.168.10.103", "hadoop", "limouren..."),
|
||||
"hadoop104": ("192.168.10.104", "hadoop", "limouren..."),
|
||||
"hadoop105": ("192.168.10.105", "hadoop", "limouren..."),
|
||||
"hadoop100": ("192.168.10.100", "hadoop", "limouren...")
|
||||
}
|
||||
|
||||
DEFAULT_SSH_USER = os.getenv("HADOOP_USER", "hadoop")
|
||||
DEFAULT_SSH_PASSWORD = os.getenv("HADOOP_PASSWORD", "limouren...")
|
||||
|
||||
class SSHClient:
|
||||
"""SSH Client for connecting to remote servers"""
|
||||
|
||||
def __init__(self, hostname: str, username: str, password: str, port: int = SSH_PORT):
|
||||
self.hostname = hostname
|
||||
self.username = username
|
||||
self.password = password
|
||||
self.port = port
|
||||
self.client: Optional[paramiko.SSHClient] = None
|
||||
|
||||
def _ensure_connected(self) -> None:
|
||||
if self.client is None:
|
||||
self.connect()
|
||||
return
|
||||
try:
|
||||
transport = self.client.get_transport()
|
||||
if transport is None or not transport.is_active():
|
||||
self.connect()
|
||||
except Exception:
|
||||
self.connect()
|
||||
|
||||
def connect(self) -> None:
|
||||
"""Establish SSH connection"""
|
||||
self.client = paramiko.SSHClient()
|
||||
# Automatically add host keys
|
||||
self.client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
|
||||
sock = None
|
||||
socks5 = os.getenv("TS_SOCKS5_SERVER") or os.getenv("TAILSCALE_SOCKS5_SERVER")
|
||||
if socks5:
|
||||
try:
|
||||
sock = _socks5_connect(socks5, self.hostname, self.port, SSH_TIMEOUT)
|
||||
except Exception:
|
||||
sock = None
|
||||
self.client.connect(
|
||||
hostname=self.hostname,
|
||||
username=self.username,
|
||||
password=self.password,
|
||||
port=self.port,
|
||||
timeout=SSH_TIMEOUT,
|
||||
sock=sock,
|
||||
)
|
||||
|
||||
def execute_command(self, command: str) -> tuple:
|
||||
"""Execute command on remote server"""
|
||||
self._ensure_connected()
|
||||
|
||||
stdin, stdout, stderr = self.client.exec_command(command)
|
||||
return stdout.read().decode(), stderr.read().decode()
|
||||
|
||||
def execute_command_with_status(self, command: str) -> tuple:
|
||||
self._ensure_connected()
|
||||
stdin, stdout, stderr = self.client.exec_command(command)
|
||||
exit_code = stdout.channel.recv_exit_status()
|
||||
return exit_code, stdout.read().decode(), stderr.read().decode()
|
||||
|
||||
def execute_command_with_timeout(self, command: str, timeout: int = 30) -> tuple:
|
||||
"""Execute command with timeout"""
|
||||
self._ensure_connected()
|
||||
|
||||
stdin, stdout, stderr = self.client.exec_command(command, timeout=timeout)
|
||||
return stdout.read().decode(), stderr.read().decode()
|
||||
|
||||
def execute_command_with_timeout_and_status(self, command: str, timeout: int = 30) -> tuple:
|
||||
self._ensure_connected()
|
||||
stdin, stdout, stderr = self.client.exec_command(command, timeout=timeout)
|
||||
exit_code = stdout.channel.recv_exit_status()
|
||||
return exit_code, stdout.read().decode(), stderr.read().decode()
|
||||
|
||||
def read_file(self, file_path: str) -> str:
|
||||
"""Read file content from remote server"""
|
||||
self._ensure_connected()
|
||||
|
||||
with self.client.open_sftp() as sftp:
|
||||
with sftp.open(file_path, 'r') as f:
|
||||
return f.read().decode()
|
||||
|
||||
def download_file(self, remote_path: str, local_path: str) -> None:
|
||||
"""Download file from remote server to local"""
|
||||
self._ensure_connected()
|
||||
|
||||
with self.client.open_sftp() as sftp:
|
||||
sftp.get(remote_path, local_path)
|
||||
|
||||
def close(self) -> None:
|
||||
"""Close SSH connection"""
|
||||
if self.client:
|
||||
self.client.close()
|
||||
self.client = None
|
||||
|
||||
def __enter__(self):
|
||||
"""Context manager entry"""
|
||||
self.connect()
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc_val, exc_tb):
|
||||
"""Context manager exit"""
|
||||
self.close()
|
||||
|
||||
class SSHConnectionManager:
|
||||
"""SSH Connection Manager for managing multiple SSH connections"""
|
||||
|
||||
def __init__(self):
|
||||
self.connections = {}
|
||||
|
||||
def get_connection(self, node_name: str, ip: str = None, username: str = None, password: str = None) -> SSHClient:
|
||||
"""Get or create SSH connection for a node"""
|
||||
if node_name in self.connections:
|
||||
client = self.connections[node_name]
|
||||
if ip and getattr(client, "hostname", None) != ip:
|
||||
try:
|
||||
client.close()
|
||||
except Exception:
|
||||
pass
|
||||
del self.connections[node_name]
|
||||
elif username and getattr(client, "username", None) != username:
|
||||
try:
|
||||
client.close()
|
||||
except Exception:
|
||||
pass
|
||||
del self.connections[node_name]
|
||||
elif password and getattr(client, "password", None) != password:
|
||||
try:
|
||||
client.close()
|
||||
except Exception:
|
||||
pass
|
||||
del self.connections[node_name]
|
||||
|
||||
if node_name not in self.connections:
|
||||
if not ip:
|
||||
raise ValueError(f"IP address required for new connection to {node_name}")
|
||||
|
||||
_user = username or DEFAULT_SSH_USER
|
||||
_pass = password or DEFAULT_SSH_PASSWORD
|
||||
|
||||
client = SSHClient(ip, _user, _pass)
|
||||
self.connections[node_name] = client
|
||||
|
||||
return self.connections[node_name]
|
||||
|
||||
def close_all(self) -> None:
|
||||
"""Close all SSH connections"""
|
||||
for conn in self.connections.values():
|
||||
conn.close()
|
||||
self.connections.clear()
|
||||
|
||||
def __enter__(self):
|
||||
"""Context manager entry"""
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc_val, exc_tb):
|
||||
"""Context manager exit"""
|
||||
self.close_all()
|
||||
|
||||
# Create a global SSH connection manager instance
|
||||
ssh_manager = SSHConnectionManager()
|
||||
|
||||
|
||||
def _parse_hostport(value: str, default_port: int) -> tuple[str, int]:
|
||||
s = (value or "").strip()
|
||||
if not s:
|
||||
return ("127.0.0.1", default_port)
|
||||
if s.startswith("http://"):
|
||||
s = s[7:]
|
||||
if s.startswith("socks5://"):
|
||||
s = s[9:]
|
||||
if "/" in s:
|
||||
s = s.split("/", 1)[0]
|
||||
if ":" in s:
|
||||
host, port_s = s.rsplit(":", 1)
|
||||
try:
|
||||
return (host.strip() or "127.0.0.1", int(port_s.strip()))
|
||||
except Exception:
|
||||
return (host.strip() or "127.0.0.1", default_port)
|
||||
return (s, default_port)
|
||||
|
||||
|
||||
def _socks5_connect(proxy: str, dest_host: str, dest_port: int, timeout: int) -> socket.socket:
|
||||
proxy_host, proxy_port = _parse_hostport(proxy, 1080)
|
||||
s = socket.create_connection((proxy_host, proxy_port), timeout=timeout)
|
||||
s.settimeout(timeout)
|
||||
s.sendall(b"\x05\x01\x00")
|
||||
resp = s.recv(2)
|
||||
if len(resp) != 2 or resp[0] != 0x05 or resp[1] != 0x00:
|
||||
s.close()
|
||||
raise RuntimeError("socks5_auth_failed")
|
||||
atyp = 0x03
|
||||
addr = dest_host.encode("utf-8")
|
||||
try:
|
||||
packed = socket.inet_pton(socket.AF_INET, dest_host)
|
||||
atyp = 0x01
|
||||
addr_field = packed
|
||||
except Exception:
|
||||
try:
|
||||
packed6 = socket.inet_pton(socket.AF_INET6, dest_host)
|
||||
atyp = 0x04
|
||||
addr_field = packed6
|
||||
except Exception:
|
||||
if len(addr) > 255:
|
||||
s.close()
|
||||
raise RuntimeError("socks5_domain_too_long")
|
||||
addr_field = bytes([len(addr)]) + addr
|
||||
port_field = int(dest_port).to_bytes(2, "big", signed=False)
|
||||
req = b"\x05\x01\x00" + bytes([atyp]) + addr_field + port_field
|
||||
s.sendall(req)
|
||||
head = s.recv(4)
|
||||
if len(head) != 4 or head[0] != 0x05:
|
||||
s.close()
|
||||
raise RuntimeError("socks5_bad_reply")
|
||||
rep = head[1]
|
||||
if rep != 0x00:
|
||||
s.close()
|
||||
raise RuntimeError(f"socks5_connect_failed:{rep}")
|
||||
bnd_atyp = head[3]
|
||||
if bnd_atyp == 0x01:
|
||||
s.recv(4)
|
||||
elif bnd_atyp == 0x04:
|
||||
s.recv(16)
|
||||
elif bnd_atyp == 0x03:
|
||||
ln = s.recv(1)
|
||||
if ln:
|
||||
s.recv(ln[0])
|
||||
s.recv(2)
|
||||
return s
|
||||
@ -1,38 +0,0 @@
|
||||
import asyncio
|
||||
import argparse
|
||||
from sqlalchemy import select
|
||||
from app.db import SessionLocal
|
||||
from app.models.nodes import Node
|
||||
from app.models.clusters import Cluster
|
||||
from app.metrics_collector import metrics_collector
|
||||
|
||||
async def collect_once(cluster_uuid: str):
|
||||
async with SessionLocal() as session:
|
||||
cid_res = await session.execute(select(Cluster.id).where(Cluster.uuid == cluster_uuid).limit(1))
|
||||
cid = cid_res.scalars().first()
|
||||
if not cid:
|
||||
return
|
||||
res = await session.execute(select(Node.id, Node.hostname, Node.ip_address).where(Node.cluster_id == cid))
|
||||
rows = res.all()
|
||||
for nid, hn, ip in rows:
|
||||
cpu, mem = metrics_collector._read_cpu_mem(hn, str(ip))
|
||||
await metrics_collector._save_metrics(nid, hn, cid, cpu, mem)
|
||||
|
||||
async def runner(cluster_uuid: str, interval: int):
|
||||
while True:
|
||||
try:
|
||||
await collect_once(cluster_uuid)
|
||||
except Exception:
|
||||
pass
|
||||
await asyncio.sleep(interval)
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--cluster", required=True, help="Cluster UUID to collect metrics for")
|
||||
parser.add_argument("--interval", type=int, default=3, help="Collect interval seconds")
|
||||
args = parser.parse_args()
|
||||
metrics_collector.set_collection_interval(args.interval)
|
||||
asyncio.run(runner(args.cluster, args.interval))
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
File diff suppressed because it is too large
Load Diff
Binary file not shown.
@ -1,15 +0,0 @@
|
||||
fastapi
|
||||
uvicorn[standard]
|
||||
SQLAlchemy
|
||||
asyncpg
|
||||
python-dotenv
|
||||
passlib[bcrypt]
|
||||
bcrypt==3.2.0
|
||||
PyJWT
|
||||
langchain
|
||||
langchain-openai
|
||||
httpx
|
||||
paramiko
|
||||
pydantic-settings
|
||||
requests
|
||||
beautifulsoup4
|
||||
@ -1,20 +0,0 @@
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
|
||||
# Add backend directory to sys.path
|
||||
sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
|
||||
|
||||
from backend.app.db import engine
|
||||
from backend.app.models.chat import Base as ChatBase
|
||||
|
||||
async def init_db():
|
||||
async with engine.begin() as conn:
|
||||
print("Dropping chat tables if exist...")
|
||||
await conn.run_sync(ChatBase.metadata.drop_all)
|
||||
print("Creating chat tables...")
|
||||
await conn.run_sync(ChatBase.metadata.create_all)
|
||||
print("Done.")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(init_db())
|
||||
@ -1,38 +0,0 @@
|
||||
import os
|
||||
import json
|
||||
import requests
|
||||
|
||||
def main():
|
||||
base = os.getenv("API_BASE", "http://localhost:8000")
|
||||
token = os.getenv("API_TOKEN", "")
|
||||
name = os.getenv("CLUSTER_NAME", "test-cluster")
|
||||
ctype = os.getenv("CLUSTER_TYPE", "hadoop")
|
||||
nodes_env = os.getenv("CLUSTER_NODES")
|
||||
if not nodes_env:
|
||||
print("请通过环境变量 CLUSTER_NODES 提供节点信息,示例:")
|
||||
print('[{"hostname":"nn","ip_address":"10.0.0.1","ssh_user":"u","ssh_password":"p"}]')
|
||||
return
|
||||
nodes = json.loads(nodes_env)
|
||||
payload = {
|
||||
"name": name,
|
||||
"type": ctype,
|
||||
"node_count": len(nodes),
|
||||
"health_status": "unknown",
|
||||
"nodes": nodes
|
||||
}
|
||||
if not token:
|
||||
try:
|
||||
r = requests.post(f"{base}/user/login", json={"username": "admin", "password": "admin123"}, timeout=15)
|
||||
if r.status_code == 200:
|
||||
token = r.json().get("token") or ""
|
||||
print("已自动登录获取 token")
|
||||
else:
|
||||
print("自动登录失败,请设置 API_TOKEN 环境变量")
|
||||
except Exception as e:
|
||||
print(f"自动登录异常:{e}")
|
||||
headers = {"Authorization": f"Bearer {token}"} if token else {}
|
||||
r = requests.post(f"{base}/clusters", json=payload, headers=headers, timeout=30)
|
||||
print(r.status_code, r.text)
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
@ -1,40 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
# 确保脚本在 backend 目录下运行
|
||||
cd "$(dirname "$0")"
|
||||
|
||||
echo "=== 正在检查 Tailscale 状态 ==="
|
||||
|
||||
# 1. 检查并启动 tailscaled (SOCKS5 代理模式)
|
||||
if ! pgrep -f "tailscaled.*--socks5-server=127.0.0.1:1080" > /dev/null; then
|
||||
echo "Tailscale SOCKS5 代理未运行,正在启动..."
|
||||
sudo pkill tailscaled 2>/dev/null || true
|
||||
sudo nohup /usr/sbin/tailscaled \
|
||||
--tun=userspace-networking \
|
||||
--socket=/var/run/tailscale/tailscaled.sock \
|
||||
--state=/var/lib/tailscale/tailscaled.state \
|
||||
--socks5-server=127.0.0.1:1080 \
|
||||
>/tmp/tailscaled.log 2>&1 &
|
||||
|
||||
# 等待启动完成
|
||||
sleep 2
|
||||
|
||||
# 确保加入网络
|
||||
sudo tailscale up --accept-dns=false --accept-routes=true
|
||||
else
|
||||
echo "Tailscale SOCKS5 代理已在 127.0.0.1:1080 运行。"
|
||||
fi
|
||||
|
||||
# 2. 验证代理端口是否可用
|
||||
if python3 -c "import socket; s=socket.create_connection(('127.0.0.1',1080),2); s.close()" 2>/dev/null; then
|
||||
echo "SOCKS5 代理验证成功。"
|
||||
else
|
||||
echo "错误: SOCKS5 代理端口 1080 无法访问,请检查 /tmp/tailscaled.log"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "=== 正在启动后端服务 ==="
|
||||
|
||||
# 3. 启动后端服务 (注入代理环境变量)
|
||||
export TS_SOCKS5_SERVER='127.0.0.1:1080'
|
||||
python3 -m uvicorn app.main:app --host 0.0.0.0 --port 8000 --reload
|
||||
@ -1,49 +0,0 @@
|
||||
#!/usr/bin/env bash
|
||||
set -euo pipefail
|
||||
|
||||
BASE_URL="${BASE_URL:-http://127.0.0.1:8000}"
|
||||
USERNAME="${USERNAME:-admin}"
|
||||
PASSWORD="${PASSWORD:-admin123}"
|
||||
SESSION_ID="${SESSION_ID:-curl-sse-$(date +%s)}"
|
||||
MESSAGE="${MESSAGE:-杀戮尖塔的观者怎么玩}"
|
||||
|
||||
TOKEN="$(
|
||||
curl -sS "${BASE_URL}/api/v1/user/login" \
|
||||
-H 'Content-Type: application/json' \
|
||||
-d "{\"username\":\"${USERNAME}\",\"password\":\"${PASSWORD}\"}" \
|
||||
| python3 -c 'import sys, json; print(json.load(sys.stdin)["token"])'
|
||||
)"
|
||||
|
||||
TMP_OUT="$(mktemp)"
|
||||
cleanup() { rm -f "${TMP_OUT}"; }
|
||||
trap cleanup EXIT
|
||||
|
||||
curl -N -sS "${BASE_URL}/api/v1/ai/chat" \
|
||||
-H "Authorization: Bearer ${TOKEN}" \
|
||||
-H "Content-Type: application/json" \
|
||||
-H "Accept: text/event-stream" \
|
||||
-d "$(python3 - <<'PY'
|
||||
import json, os
|
||||
payload = {
|
||||
"sessionId": os.environ["SESSION_ID"],
|
||||
"message": os.environ["MESSAGE"],
|
||||
"stream": True,
|
||||
"context": {
|
||||
"webSearch": True,
|
||||
"agent": "Hadoop助手. You MUST use the web_search tool before answering."
|
||||
}
|
||||
}
|
||||
print(json.dumps(payload, ensure_ascii=False))
|
||||
PY
|
||||
)" | tee "${TMP_OUT}"
|
||||
|
||||
python3 - <<'PY'
|
||||
import sys, re, pathlib
|
||||
p = pathlib.Path(sys.argv[1])
|
||||
s = p.read_text(encoding="utf-8", errors="ignore")
|
||||
n = len(re.findall(r"^data: ", s, flags=re.M))
|
||||
if n <= 0:
|
||||
raise SystemExit("未收到任何 SSE data 行,测试失败")
|
||||
print(f"OK: 收到 {n} 条 SSE data 行")
|
||||
PY "${TMP_OUT}"
|
||||
|
||||
@ -1,35 +0,0 @@
|
||||
import pytest
|
||||
from app.services.hadoop_cluster_uuid import collect_cluster_uuid
|
||||
|
||||
class _CliOK:
|
||||
def __init__(self, host, user, pwd):
|
||||
pass
|
||||
def execute_command_with_timeout(self, cmd, timeout):
|
||||
if "getconf" in cmd or "awk" in cmd:
|
||||
return ("/data/hdfs/namenode", "")
|
||||
if "VERSION" in cmd:
|
||||
return ("clusterID=12345-abc", "")
|
||||
return ("", "")
|
||||
def close(self):
|
||||
pass
|
||||
|
||||
class _CliNoDirs:
|
||||
def __init__(self, host, user, pwd):
|
||||
pass
|
||||
def execute_command_with_timeout(self, cmd, timeout):
|
||||
return ("", "")
|
||||
def close(self):
|
||||
pass
|
||||
|
||||
def test_collect_cluster_uuid_success(monkeypatch):
|
||||
monkeypatch.setattr("app.services.hadoop_cluster_uuid.SSHClient", lambda h,u,p: _CliOK(h,u,p))
|
||||
u, step, detail = collect_cluster_uuid("10.0.0.1", "u", "p")
|
||||
assert u is not None
|
||||
assert step is None
|
||||
assert detail is None
|
||||
|
||||
def test_collect_cluster_uuid_fail_no_dirs(monkeypatch):
|
||||
monkeypatch.setattr("app.services.hadoop_cluster_uuid.SSHClient", lambda h,u,p: _CliNoDirs(h,u,p))
|
||||
u, step, detail = collect_cluster_uuid("10.0.0.1", "u", "p")
|
||||
assert u is None
|
||||
assert step == "probe_name_dirs"
|
||||
@ -1,38 +0,0 @@
|
||||
import app.log_collector as lc
|
||||
import app.log_reader as lr
|
||||
|
||||
def test_parse_and_save_chunk_mock():
|
||||
sample_lines = [
|
||||
"[2024-12-17 10:00:00,123] INFO org.apache.hadoop.hdfs.server.datanode.DataNode: Started",
|
||||
"[2024-12-17 10:01:00,456] WARN org.apache.hadoop.hdfs.server.datanode.DataNode: Disk nearly full",
|
||||
"[2024-12-17 10:02:00,789] ERROR org.apache.hadoop.hdfs.server.datanode.DataNode: Write failed",
|
||||
"Plain line without timestamp INFO something",
|
||||
"",
|
||||
]
|
||||
content = "\n".join(sample_lines)
|
||||
|
||||
captured = []
|
||||
|
||||
async def _fake_save_logs_to_db_batch(items: list[dict]):
|
||||
captured.extend(items)
|
||||
|
||||
# monkeypatch batch save method
|
||||
lc.log_collector._save_logs_to_db_batch = _fake_save_logs_to_db_batch
|
||||
|
||||
# run save chunk
|
||||
lc.log_collector._save_log_chunk("hadoop102", "datanode", content)
|
||||
|
||||
# verify non-empty lines saved
|
||||
expected_saved = [ln for ln in sample_lines if ln.strip()]
|
||||
assert len(captured) == len(expected_saved)
|
||||
# check fields
|
||||
for item in captured:
|
||||
assert item["host"] == "hadoop102"
|
||||
assert item["service"] == "datanode"
|
||||
assert isinstance(item["message"], str) and item["message"]
|
||||
assert item["log_level"] in {"INFO", "WARN", "ERROR", "DEBUG", "TRACE"}
|
||||
assert getattr(item["timestamp"], "tzinfo", None) is not None
|
||||
|
||||
def test_log_file_path_namenode():
|
||||
p = lr.log_reader.get_log_file_path("hadoop102", "namenode")
|
||||
assert p.endswith("/hadoop-hadoop-namenode-hadoop102.log")
|
||||
@ -1,123 +0,0 @@
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
|
||||
# Add backend directory to sys.path to import app modules
|
||||
# Current file: backend/tests/test_llm.py
|
||||
# Parent: backend/tests
|
||||
# Grandparent: backend
|
||||
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
from app.services.llm import LLMClient
|
||||
from app.services.ops_tools import openai_tools_schema, tool_web_search, tool_start_cluster, tool_stop_cluster
|
||||
from app.db import SessionLocal
|
||||
from dotenv import load_dotenv
|
||||
import json
|
||||
|
||||
async def main():
|
||||
# Load .env from backend directory
|
||||
env_path = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), ".env")
|
||||
load_dotenv(env_path)
|
||||
|
||||
print("Testing LLMClient with REAL Tools...")
|
||||
try:
|
||||
llm = LLMClient()
|
||||
print(f"Provider: {llm.provider}")
|
||||
print(f"Endpoint: {llm.endpoint}")
|
||||
print(f"Model: {llm.model}")
|
||||
print(f"Timeout: {llm.timeout}")
|
||||
|
||||
messages = [{"role": "user", "content": "停止集群 5c43a9c7-e2a9-4756-b75d-6813ac55d3ba"}]
|
||||
|
||||
# 1. Get tools definition
|
||||
chat_tools = openai_tools_schema()
|
||||
|
||||
print(f"Tools loaded: {[t['function']['name'] for t in chat_tools]}")
|
||||
|
||||
print("Sending initial request...")
|
||||
resp = await llm.chat(messages, tools=chat_tools)
|
||||
|
||||
if "choices" in resp and resp["choices"]:
|
||||
msg = resp["choices"][0].get("message", {})
|
||||
tool_calls = msg.get("tool_calls")
|
||||
|
||||
if tool_calls:
|
||||
print(f"Tool calls triggered: {len(tool_calls)}")
|
||||
# Append assistant message with tool_calls
|
||||
messages.append(msg)
|
||||
|
||||
async with SessionLocal() as db:
|
||||
for tc in tool_calls:
|
||||
fn = tc.get("function", {})
|
||||
name = fn.get("name")
|
||||
args_str = fn.get("arguments", "{}")
|
||||
print(f"Executing REAL tool: {name} with args: {args_str}")
|
||||
|
||||
if name == "web_search":
|
||||
try:
|
||||
args = json.loads(args_str)
|
||||
tool_result = await tool_web_search(args.get("query"), args.get("max_results", 5))
|
||||
messages.append({
|
||||
"role": "tool",
|
||||
"tool_call_id": tc.get("id"),
|
||||
"name": name,
|
||||
"content": json.dumps(tool_result, ensure_ascii=False)
|
||||
})
|
||||
print("Tool execution completed.")
|
||||
except Exception as e:
|
||||
print(f"Tool execution failed: {e}")
|
||||
elif name == "start_cluster":
|
||||
try:
|
||||
args = json.loads(args_str)
|
||||
cluster_uuid = args.get("cluster_uuid")
|
||||
# Execute REAL tool
|
||||
tool_result = await tool_start_cluster(db, "admin", cluster_uuid)
|
||||
|
||||
messages.append({
|
||||
"role": "tool",
|
||||
"tool_call_id": tc.get("id"),
|
||||
"name": name,
|
||||
"content": json.dumps(tool_result, ensure_ascii=False)
|
||||
})
|
||||
print(f"REAL tool start_cluster execution completed: {tool_result.get('status')}")
|
||||
except Exception as e:
|
||||
print(f"REAL tool execution failed: {e}")
|
||||
elif name == "stop_cluster":
|
||||
try:
|
||||
args = json.loads(args_str)
|
||||
cluster_uuid = args.get("cluster_uuid")
|
||||
# Execute REAL tool
|
||||
tool_result = await tool_stop_cluster(db, "admin", cluster_uuid)
|
||||
|
||||
messages.append({
|
||||
"role": "tool",
|
||||
"tool_call_id": tc.get("id"),
|
||||
"name": name,
|
||||
"content": json.dumps(tool_result, ensure_ascii=False)
|
||||
})
|
||||
print(f"REAL tool stop_cluster execution completed: {tool_result.get('status')}")
|
||||
except Exception as e:
|
||||
print(f"REAL tool execution failed: {e}")
|
||||
|
||||
# 2. Send follow-up request with tool results
|
||||
print("Sending follow-up request...")
|
||||
resp = await llm.chat(messages, tools=chat_tools)
|
||||
if "choices" in resp and resp["choices"]:
|
||||
final_msg = resp["choices"][0].get("message", {})
|
||||
print("\nFinal Reply:")
|
||||
print(final_msg.get('content'))
|
||||
if "reasoning_content" in final_msg:
|
||||
print(f"\nReasoning:\n{final_msg.get('reasoning_content')}")
|
||||
else:
|
||||
print("No tool calls triggered.")
|
||||
print(f"Reply: {msg.get('content')}")
|
||||
else:
|
||||
print(resp)
|
||||
|
||||
except Exception as e:
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
print(f"Error: {repr(e)}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
@ -1,58 +0,0 @@
|
||||
import httpx
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
import pytest
|
||||
|
||||
async def _run_register_checks(base_url: str):
|
||||
url = f"{base_url.rstrip('/')}/api/v1/user/register"
|
||||
|
||||
# 1. 测试字段缺失 (422)
|
||||
print("\n1. Testing missing field...")
|
||||
payload = {
|
||||
"username": "testuser",
|
||||
"email": "test@example.com",
|
||||
"password": "password123"
|
||||
# fullName missing
|
||||
}
|
||||
async with httpx.AsyncClient() as client:
|
||||
r = await client.post(url, json=payload)
|
||||
print(f"Status: {r.status_code}")
|
||||
print(f"Response: {r.text}")
|
||||
|
||||
# 2. 测试校验错误 (400 with errors)
|
||||
print("\n2. Testing validation error (short username)...")
|
||||
payload = {
|
||||
"username": "t",
|
||||
"email": "invalid-email",
|
||||
"password": "123",
|
||||
"fullName": "Z"
|
||||
}
|
||||
async with httpx.AsyncClient() as client:
|
||||
r = await client.post(url, json=payload)
|
||||
print(f"Status: {r.status_code}")
|
||||
print(f"Response: {r.text}")
|
||||
|
||||
# 3. 测试用户名已存在 (400 with message)
|
||||
# 假设 admin 已存在
|
||||
print("\n3. Testing duplicate username...")
|
||||
payload = {
|
||||
"username": "admin",
|
||||
"email": "admin_new@example.com",
|
||||
"password": "Password123",
|
||||
"fullName": "Administrator"
|
||||
}
|
||||
async with httpx.AsyncClient() as client:
|
||||
r = await client.post(url, json=payload)
|
||||
print(f"Status: {r.status_code}")
|
||||
print(f"Response: {r.text}")
|
||||
|
||||
def test_register_fix_e2e():
|
||||
base_url = os.getenv("E2E_BASE_URL", "").strip()
|
||||
if not base_url:
|
||||
pytest.skip("需要设置 E2E_BASE_URL 并启动后端服务")
|
||||
asyncio.run(_run_register_checks(base_url))
|
||||
|
||||
if __name__ == "__main__":
|
||||
url = os.getenv("E2E_BASE_URL", "http://localhost:8000").strip()
|
||||
asyncio.run(_run_register_checks(url))
|
||||
@ -1,30 +0,0 @@
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
|
||||
# Add backend directory to sys.path to import app modules
|
||||
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
from app.services.ops_tools import tool_web_search
|
||||
|
||||
async def main():
|
||||
print("Testing Web Search...")
|
||||
query = "今天星期几"
|
||||
print(f"Query: {query}")
|
||||
try:
|
||||
res = await tool_web_search(query)
|
||||
if "error" in res:
|
||||
print(f"Error: {res['error']}")
|
||||
else:
|
||||
print(f"Current Time: {res.get('current_time')}")
|
||||
print(f"Results found: {len(res.get('results', []))}")
|
||||
for i, r in enumerate(res.get("results", [])[:2]):
|
||||
print(f"[{i+1}] {r.get('title')} - {r.get('href')}")
|
||||
if r.get('full_content'):
|
||||
print(f" Full content len: {len(r.get('full_content'))}")
|
||||
print(f" Sample: {r.get('full_content')[:100]}...")
|
||||
except Exception as e:
|
||||
print(f"Exception: {e}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
@ -1,27 +0,0 @@
|
||||
import pytest
|
||||
from app.services.ssh_probe import check_ssh_connectivity
|
||||
from app.ssh_utils import SSHClient
|
||||
|
||||
class _DummyCli:
|
||||
def __init__(self, host, user, pwd):
|
||||
self.closed = False
|
||||
def execute_command_with_timeout(self, cmd, timeout):
|
||||
return ("ok", "")
|
||||
def close(self):
|
||||
self.closed = True
|
||||
|
||||
def test_check_ssh_connectivity_success(monkeypatch):
|
||||
monkeypatch.setattr("app.services.ssh_probe.SSHClient", lambda h,u,p: _DummyCli(h,u,p))
|
||||
ok, err = check_ssh_connectivity("127.0.0.1", "u", "p", timeout=1)
|
||||
assert ok is True
|
||||
assert err is None
|
||||
|
||||
class _FailCli:
|
||||
def __init__(self, host, user, pwd):
|
||||
raise RuntimeError("connect_failed")
|
||||
|
||||
def test_check_ssh_connectivity_fail(monkeypatch):
|
||||
monkeypatch.setattr("app.services.ssh_probe.SSHClient", lambda h,u,p: _FailCli(h,u,p))
|
||||
ok, err = check_ssh_connectivity("127.0.0.1", "u", "p", timeout=1)
|
||||
assert ok is False
|
||||
assert "connect_failed" in str(err)
|
||||
@ -1,59 +0,0 @@
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
|
||||
# Add backend directory to sys.path to import app modules
|
||||
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
|
||||
from app.services.llm import LLMClient
|
||||
from dotenv import load_dotenv
|
||||
|
||||
async def main():
    """Smoke-test LLMClient streaming.

    Loads backend/.env, opens a streaming chat request, prints content and
    reasoning deltas as they arrive, then reports total lengths. Any failure
    is printed with a full traceback rather than raised.
    """
    # Load .env from the backend directory (parent of this script's dir).
    env_path = os.path.join(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))), ".env"
    )
    load_dotenv(env_path)

    print("Testing LLMClient Streaming...")
    try:
        llm = LLMClient()
        print(f"Provider: {llm.provider}")
        print(f"Endpoint: {llm.endpoint}")
        print(f"Model: {llm.model}")

        # NOTE(review): the user message content is intentionally empty —
        # confirm the provider accepts an empty prompt.
        messages = [{"role": "user", "content": ""}]

        print("Sending streaming request...")
        stream_gen = await llm.chat(messages, stream=True)

        full_content = ""
        full_reasoning = ""

        print("\nStreaming Response:")
        async for chunk in stream_gen:
            # Chunks follow the OpenAI-style shape: choices[0].delta.{content,reasoning_content}.
            choices = chunk.get("choices") or []
            if not choices:
                continue

            delta = choices[0].get("delta") or {}
            content = delta.get("content") or ""
            reasoning = delta.get("reasoning_content") or ""

            if reasoning:
                full_reasoning += reasoning
                print(f"[Reasoning] {reasoning}", end="", flush=True)
            if content:
                full_content += content
                print(content, end="", flush=True)

        print("\n\nStream Finished.")
        print(f"Full Content Length: {len(full_content)}")
        print(f"Full Reasoning Length: {len(full_reasoning)}")

    except Exception as e:
        # Best-effort diagnostics for a manual smoke test: show the traceback
        # and the repr, but never crash the script.
        import traceback
        traceback.print_exc()
        print(f"Error: {repr(e)}")
# Script entry point: run the streaming smoke test when executed directly.
if __name__ == "__main__":
    asyncio.run(main())
@ -1,27 +0,0 @@
|
||||
#!/bin/bash
# Log in to the backend API, then call the cluster STOP endpoint.

# Configuration
BASE_URL="http://localhost:8000/api/v1"
USERNAME="admin"
PASSWORD="admin123"
CLUSTER_UUID="5c43a9c7-e2a9-4756-b75d-6813ac55d3ba"

echo "正在登录获取 Token..."
LOGIN_RESPONSE=$(curl -s -X POST "$BASE_URL/user/login" \
  -H "Content-Type: application/json" \
  -d "{\"username\": \"$USERNAME\", \"password\": \"$PASSWORD\"}")

# Quote the variable so the JSON is not word-split/globbed before grep sees it;
# -oP extracts the value of "token" via a Perl look-behind.
TOKEN=$(echo "$LOGIN_RESPONSE" | grep -oP '(?<="token":")[^"]*')

if [ -z "$TOKEN" ]; then
    echo "登录失败,无法获取 Token"
    echo "响应内容: $LOGIN_RESPONSE"
    exit 1
fi

echo "登录成功,正在调用集群停止接口..."
curl -X POST "$BASE_URL/ops/clusters/$CLUSTER_UUID/stop" \
  -H "Authorization: Bearer $TOKEN" \
  -H "Content-Type: application/json"

echo -e "\n操作完成"
@ -1,27 +0,0 @@
|
||||
#!/bin/bash
# Log in to the backend API, then call the cluster START endpoint.

# Configuration
BASE_URL="http://localhost:8000/api/v1"
USERNAME="admin"
PASSWORD="admin123"
CLUSTER_UUID="5c43a9c7-e2a9-4756-b75d-6813ac55d3ba"

echo "正在登录获取 Token..."
LOGIN_RESPONSE=$(curl -s -X POST "$BASE_URL/user/login" \
  -H "Content-Type: application/json" \
  -d "{\"username\": \"$USERNAME\", \"password\": \"$PASSWORD\"}")

# Quote the variable so the JSON is not word-split/globbed before grep sees it;
# -oP extracts the value of "token" via a Perl look-behind.
TOKEN=$(echo "$LOGIN_RESPONSE" | grep -oP '(?<="token":")[^"]*')

if [ -z "$TOKEN" ]; then
    echo "登录失败,无法获取 Token"
    echo "响应内容: $LOGIN_RESPONSE"
    exit 1
fi

echo "登录成功,正在调用集群启动接口..."
curl -X POST "$BASE_URL/ops/clusters/$CLUSTER_UUID/start" \
  -H "Authorization: Bearer $TOKEN" \
  -H "Content-Type: application/json"

echo -e "\n操作完成"
[diff view truncated: additional changed files in this diff are not shown here]