import asyncio
import json
import shlex
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional, Tuple

import httpx
import requests
import urllib3
from bs4 import BeautifulSoup
from sqlalchemy import select, text
from sqlalchemy.ext.asyncio import AsyncSession

# Internal nodes commonly use self-signed certificates; silence the TLS warnings
# that the verify=False requests below would otherwise emit.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

from ..models.exec_logs import ExecLog
from ..models.nodes import Node
from .runner import run_remote_command


def _now() -> datetime:
    """Return the current UTC time."""
    return datetime.now(timezone.utc)


async def _find_accessible_node(db: AsyncSession, user_name: str, hostname: str) -> Optional[Node]:
    """Check that the user may access the node and return the Node object, or None."""
    # Resolve the user id from the username.
    uid_res = await db.execute(text("SELECT id FROM users WHERE username=:un LIMIT 1"), {"un": user_name})
    uid_row = uid_res.first()
    if not uid_row:
        return None
    # Collect the clusters this user is mapped to.
    ids_res = await db.execute(text("SELECT cluster_id FROM user_cluster_mapping WHERE user_id=:uid"), {"uid": uid_row[0]})
    cluster_ids = [r[0] for r in ids_res.all()]
    if not cluster_ids:
        return None
    # The node is accessible only if it belongs to one of those clusters.
    res = await db.execute(select(Node).where(Node.hostname == hostname, Node.cluster_id.in_(cluster_ids)).limit(1))
    return res.scalars().first()


async def _write_exec_log(
    db: AsyncSession,
    exec_id: str,
    command_type: str,
    status: str,
    start: datetime,
    end: Optional[datetime],
    exit_code: Optional[int],
    operator: str,
    stdout: Optional[str] = None,
    stderr: Optional[str] = None,
) -> None:
    """Write an execution audit-log record."""
    row = ExecLog(
        exec_id=exec_id,
        fault_id="-",
        command_type=command_type,
        script_path=None,
        command_content="[tool]",
        target_nodes=None,
        risk_level="medium",
        execution_status=status,
        start_time=start,
        end_time=end,
        duration=(int((end - start).total_seconds()) if (start and end) else None),
        stdout_log=stdout,
        stderr_log=stderr,
        exit_code=exit_code,
        operator=operator,
        created_at=_now(),
        updated_at=_now(),
    )
    db.add(row)
    await db.flush()
    await db.commit()


async def tool_read_log(
    db: AsyncSession,
    user_name: str,
    node: str,
    path: str,
    lines: int = 200,
    pattern: Optional[str] = None,
    ssh_user: Optional[str] = None,
    timeout: int = 20,
) -> Dict[str, Any]:
    """Tool: read a log file on a remote node, optionally filtered by a regex."""
    n = await _find_accessible_node(db, user_name, node)
    if not n:
        return {"error": "node_not_found"}
    # Quote user-supplied values so they cannot break out of the shell command,
    # e.g. the result looks like: tail -n 200 /var/log/messages | grep -E 'ERROR'
    path_q = shlex.quote(path)
    cmd = f"tail -n {int(lines)} {path_q}"
    if pattern:
        pat_q = shlex.quote(pattern)
        cmd = f"{cmd} | grep -E {pat_q}"
    start = _now()
    code, out, err = await run_remote_command(str(getattr(n, "ip_address", "")), ssh_user or "", cmd, timeout=timeout)
    end = _now()
    exec_id = f"tool_{start.timestamp():.0f}"
    await _write_exec_log(db, exec_id, "read_log", ("success" if code == 0 else "failed"), start, end, code, user_name, out, err)
    return {"execId": exec_id, "exitCode": code, "stdout": out, "stderr": err}


async def _fetch_page_text(client: httpx.AsyncClient, url: str) -> str:
    """Fetch a URL and extract its visible text content."""
    try:
        # Skip anything that is not an http(s) URL.
        if not url.startswith("http"):
            return ""

        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
        }
        resp = await client.get(url, headers=headers, follow_redirects=True)
        if resp.status_code == 200:
            soup = BeautifulSoup(resp.text, "html.parser")
            # Drop non-content elements before extracting text.
            for tag in soup(["script", "style", "nav", "footer", "header"]):
                tag.decompose()
            text_content = soup.get_text(separator="\n", strip=True)
            # Cap the length so a single page cannot flood the context.
            return text_content[:2000]
    except Exception:
        pass
    return ""


async def tool_web_search(query: str, max_results: int = 5) -> Dict[str, Any]:
    """Tool: search the web (Baidu) and read page content."""
    try:
        results = []
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
        }
        url = "https://www.baidu.com/s"
        params = {"wd": query}

        # The search page is fetched with sync requests (kept for stability);
        # run it in a worker thread so the event loop is not blocked.
        resp = await asyncio.to_thread(requests.get, url, params=params, headers=headers, timeout=10, verify=False)

        if resp.status_code == 200:
            soup = BeautifulSoup(resp.text, "html.parser")
            # Baidu results usually live in divs with class "c-container".
            for item in soup.select("div.c-container, div.result.c-container")[:max_results]:
                title_elem = item.select_one("h3")
                if not title_elem:
                    continue
                title = title_elem.get_text(strip=True)
                link_elem = item.select_one("a")
                href = link_elem.get("href") if link_elem else ""

                # Snippet/abstract: item text minus the title, truncated.
                snippet = item.get_text(strip=True).replace(title, "")[:200]

                results.append({
                    "title": title,
                    "href": href,
                    "body": snippet,
                    "full_content": "",  # Placeholder; filled in below for top results
                })

        # Fetch the full page content for the top 2 results only, to avoid long waits.
        if results:
            async with httpx.AsyncClient(timeout=10, verify=False) as client:
                tasks = [_fetch_page_text(client, r["href"]) for r in results[:2]]
                contents = await asyncio.gather(*tasks)

            for i, content in enumerate(contents):
                if content:
                    results[i]["full_content"] = content
                    # Note in the snippet that full content is available.
                    results[i]["body"] += "\n[Full content fetched]"

        # Include the current system time to help with "now"-style queries.
        current_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S %A")
        return {"query": query, "current_time": current_time, "results": results}
    except Exception as e:
        return {"error": str(e)}


def openai_tools_schema() -> List[Dict[str, Any]]:
    """Return OpenAI-compatible tool definitions (Function Calling)."""
    return [
        {
            "type": "function",
            "function": {
                "name": "read_log",
                "description": "Read a log file on the specified node, optionally filtered by a regular expression",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "node": {"type": "string"},
                        "path": {"type": "string"},
                        "lines": {"type": "integer", "default": 200},
                        "pattern": {"type": "string"},
                        "sshUser": {"type": "string"},
                    },
                    "required": ["node", "path"],
                },
            },
        },
        {
            "type": "function",
            "function": {
                "name": "web_search",
                "description": "Search public information on the internet; use for unknown error codes, unfamiliar technical terms, or when external references are needed",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "query": {"type": "string", "description": "Search keywords"},
                        "max_results": {"type": "integer", "default": 5},
                    },
                    "required": ["query"],
                },
            },
        },
    ]
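

# Usage sketch (illustrative, not part of the original module): one way to route a
# Function Calling request from an OpenAI-compatible response to the tools above.
# The `tool_call` dict is assumed to follow the chat.completions `tool_calls` shape;
# `db` and `user_name` are assumed to come from the caller's request context.
async def _example_dispatch_tool_call(db: AsyncSession, user_name: str, tool_call: Dict[str, Any]) -> Dict[str, Any]:
    name = tool_call["function"]["name"]
    args = json.loads(tool_call["function"].get("arguments") or "{}")
    if name == "read_log":
        return await tool_read_log(
            db,
            user_name,
            node=args["node"],
            path=args["path"],
            lines=int(args.get("lines", 200)),
            pattern=args.get("pattern"),
            ssh_user=args.get("sshUser"),
        )
    if name == "web_search":
        return await tool_web_search(args["query"], max_results=int(args.get("max_results", 5)))
    return {"error": f"unknown_tool:{name}"}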