From 1c0ec2b13dc19b28bc5aaa40d73a2ea0c62032cd Mon Sep 17 00:00:00 2001 From: "google-labs-jules[bot]" <161369871+google-labs-jules[bot]@users.noreply.github.com> Date: Wed, 17 Dec 2025 03:02:42 +0000 Subject: [PATCH] feat: enhance Gitea support and merge upstream v3.0.0 - Merge upstream v3.0.0 changes - Fix security vulnerabilities (SSRF, Path Traversal) by introducing `parse_repository_url` utility - Fix token leakage and handling in `scanner.py` and `projects.py` - Fix `NameError` in `scanner.py` - Fix `frontend/docker-entrypoint.sh` API URL escaping - Standardize Gitea token naming to `gitea_token` --- .github/workflows/docker-publish.yml | 129 ++++++ .github/workflows/release.yml | 4 +- CHANGELOG.md | 99 +---- README.md | 134 ++++-- backend/Dockerfile | 9 +- .../versions/008_add_files_with_findings.py | 35 ++ backend/app/api/v1/endpoints/agent_tasks.py | 49 ++- .../app/api/v1/endpoints/embedding_config.py | 47 ++- backend/app/api/v1/endpoints/projects.py | 10 +- backend/app/models/agent_task.py | 3 +- backend/app/services/agent/agents/analysis.py | 42 +- backend/app/services/agent/agents/base.py | 114 +++-- .../app/services/agent/agents/orchestrator.py | 52 ++- backend/app/services/agent/agents/recon.py | 143 ++++++- .../app/services/agent/agents/verification.py | 61 ++- backend/app/services/agent/event_manager.py | 12 +- .../app/services/agent/prompts/__init__.py | 10 - .../services/agent/prompts/system_prompts.py | 391 ++---------------- backend/app/services/rag/indexer.py | 6 + backend/app/services/rag/splitter.py | 46 ++- backend/app/services/scanner.py | 104 ++--- backend/app/utils/__init__.py | 0 backend/app/utils/repo_utils.py | 77 ++++ backend/docker-entrypoint.sh | 53 +++ backend/pyproject.toml | 4 +- docker-compose.prod.cn.yml | 111 +++++ docker-compose.prod.yml | 107 +++++ docker-compose.yml | 8 +- frontend/Dockerfile | 7 +- frontend/docker-entrypoint.sh | 3 +- frontend/package.json | 2 +- .../src/components/agent/EmbeddingConfig.tsx | 20 +- 
.../AgentAudit/components/StatsPanel.tsx | 11 +- frontend/src/shared/api/agentTasks.ts | 2 + 34 files changed, 1190 insertions(+), 715 deletions(-) create mode 100644 .github/workflows/docker-publish.yml create mode 100644 backend/alembic/versions/008_add_files_with_findings.py create mode 100644 backend/app/utils/__init__.py create mode 100644 backend/app/utils/repo_utils.py create mode 100644 backend/docker-entrypoint.sh create mode 100644 docker-compose.prod.cn.yml create mode 100644 docker-compose.prod.yml diff --git a/.github/workflows/docker-publish.yml b/.github/workflows/docker-publish.yml new file mode 100644 index 0000000..2656f76 --- /dev/null +++ b/.github/workflows/docker-publish.yml @@ -0,0 +1,129 @@ +name: Docker Publish + +# 只构建并推送 Docker 镜像,不创建 Release 或 Tag +on: + workflow_dispatch: + inputs: + tag: + description: '镜像标签 (例如: latest, dev, v3.0.0)' + required: true + default: 'latest' + type: string + build_frontend: + description: '构建前端镜像' + required: false + type: boolean + default: true + build_backend: + description: '构建后端镜像' + required: false + type: boolean + default: true + build_sandbox: + description: '构建沙箱镜像' + required: false + type: boolean + default: true + +jobs: + build-and-push: + name: 构建并推送镜像 + runs-on: ubuntu-latest + + permissions: + contents: read + packages: write + + steps: + - name: 检出代码 + uses: actions/checkout@v4 + + - name: 设置 Node.js + if: ${{ github.event.inputs.build_frontend == 'true' }} + uses: actions/setup-node@v4 + with: + node-version: '20' + + - name: 安装 pnpm + if: ${{ github.event.inputs.build_frontend == 'true' }} + uses: pnpm/action-setup@v4 + with: + version: 9 + + - name: 安装前端依赖 + if: ${{ github.event.inputs.build_frontend == 'true' }} + working-directory: ./frontend + run: pnpm install --frozen-lockfile + + - name: 构建前端项目 + if: ${{ github.event.inputs.build_frontend == 'true' }} + working-directory: ./frontend + run: pnpm build + env: + VITE_USE_LOCAL_DB: 'true' + + - name: 登录到 GitHub Container Registry + 
uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: 设置 QEMU + uses: docker/setup-qemu-action@v3 + + - name: 设置 Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: 构建并推送前端 Docker 镜像 + if: ${{ github.event.inputs.build_frontend == 'true' }} + uses: docker/build-push-action@v5 + with: + context: ./frontend + file: ./frontend/Dockerfile + push: true + platforms: linux/amd64,linux/arm64 + tags: | + ghcr.io/${{ github.repository_owner }}/deepaudit-frontend:${{ github.event.inputs.tag }} + cache-from: type=gha,scope=frontend + cache-to: type=gha,mode=max,scope=frontend + + - name: 构建并推送后端 Docker 镜像 + if: ${{ github.event.inputs.build_backend == 'true' }} + uses: docker/build-push-action@v5 + with: + context: ./backend + file: ./backend/Dockerfile + push: true + platforms: linux/amd64,linux/arm64 + tags: | + ghcr.io/${{ github.repository_owner }}/deepaudit-backend:${{ github.event.inputs.tag }} + cache-from: type=gha,scope=backend + cache-to: type=gha,mode=max,scope=backend + + - name: 构建并推送沙箱 Docker 镜像 + if: ${{ github.event.inputs.build_sandbox == 'true' }} + uses: docker/build-push-action@v5 + with: + context: ./docker/sandbox + file: ./docker/sandbox/Dockerfile + push: true + platforms: linux/amd64,linux/arm64 + tags: | + ghcr.io/${{ github.repository_owner }}/deepaudit-sandbox:${{ github.event.inputs.tag }} + cache-from: type=gha,scope=sandbox + cache-to: type=gha,mode=max,scope=sandbox + + - name: 输出镜像信息 + run: | + echo "## 镜像已推送到 GHCR" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + if [ "${{ github.event.inputs.build_frontend }}" == "true" ]; then + echo "- \`ghcr.io/${{ github.repository_owner }}/deepaudit-frontend:${{ github.event.inputs.tag }}\`" >> $GITHUB_STEP_SUMMARY + fi + if [ "${{ github.event.inputs.build_backend }}" == "true" ]; then + echo "- \`ghcr.io/${{ github.repository_owner }}/deepaudit-backend:${{ github.event.inputs.tag }}\`" >> 
$GITHUB_STEP_SUMMARY + fi + if [ "${{ github.event.inputs.build_sandbox }}" == "true" ]; then + echo "- \`ghcr.io/${{ github.repository_owner }}/deepaudit-sandbox:${{ github.event.inputs.tag }}\`" >> $GITHUB_STEP_SUMMARY + fi diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index a42179f..eafba48 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -72,8 +72,7 @@ jobs: - name: 构建前端项目 working-directory: ./frontend run: pnpm build - env: - VITE_USE_LOCAL_DB: 'true' + # 8. 设置 Python 环境(用于后端) - name: 设置 Python @@ -164,6 +163,7 @@ jobs: echo "- 🧠 **RAG 知识库增强**: 代码语义理解 + CWE/CVE 漏洞知识库" >> CHANGELOG.md echo "- 🔒 **沙箱漏洞验证**: Docker 安全容器自动执行 PoC" >> CHANGELOG.md echo "- 🛠️ **专业安全工具集成**: Semgrep, Bandit, Gitleaks, OSV-Scanner" >> CHANGELOG.md + echo "- 🐛 **稳定性增强**: 修复多智能体工具调用循环、UI 显示及 Docker 环境兼容性问题" >> CHANGELOG.md echo "" >> CHANGELOG.md echo "## 📦 下载说明" >> CHANGELOG.md echo "" >> CHANGELOG.md diff --git a/CHANGELOG.md b/CHANGELOG.md index 8c5384c..4bc7f4c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,98 +2,17 @@ All notable changes to this project will be documented in this file. -The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), -and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 
- -## [3.0.0] - 2024-12-15 - -### Highlights - -**DeepAudit v3.0.0** introduces a revolutionary **Multi-Agent Intelligent Audit System**: - -- Multi-Agent Architecture with Orchestrator-driven decision making -- RAG (Retrieval-Augmented Generation) knowledge base enhancement -- Docker sandbox for automated vulnerability verification -- Professional security tool integration - -### Added - -#### Multi-Agent Architecture -- **Orchestrator Agent**: Centralized orchestration for autonomous audit strategy decisions -- **Recon Agent**: Information gathering, technology stack identification, and entry point discovery -- **Analysis Agent**: Deep vulnerability analysis with Semgrep, RAG semantic search, and LLM analysis -- **Verification Agent**: Sandbox testing, PoC generation, false positive filtering - -#### RAG Knowledge Base -- Code semantic understanding with Tree-sitter AST-based chunking -- CWE/CVE vulnerability knowledge base integration -- ChromaDB vector database support -- Multi-language support: Python, JavaScript, TypeScript, Java, Go, PHP, Rust - -#### Security Sandbox -- Docker isolated container for PoC execution -- Resource limits: memory, CPU constraints -- Network isolation with configurable access -- seccomp security policies - -#### Security Tools Integration -- **Semgrep**: Multi-language static analysis -- **Bandit**: Python security scanning -- **Gitleaks**: Secret leak detection -- **TruffleHog**: Deep secret scanning -- **npm audit**: Node.js dependency vulnerabilities -- **Safety**: Python dependency audit -- **OSV-Scanner**: Multi-language dependency vulnerabilities - -#### New Features -- Kunlun-M (MIT License) security scanner integration -- File upload size limit increased to 500MB with large file optimization -- Improved task tabs with card-style layout -- Enhanced error handling and project scope filtering -- Streaming LLM token usage reporting with input estimation - -### Changed -- Refactored Agent architecture with dynamic Agent tree -- 
Expanded high-risk file patterns and dangerous pattern library -- Enhanced sandbox functionality with forced sandbox verification -- Improved report generation with normalized severity comparisons -- Better agent stream stability preventing unnecessary reconnections +## [3.0.1] - 2025-12-16 ### Fixed -- Agent stream stability issues with correct event buffer draining -- Sandbox tool initialization logging improvements -- Task phase update to REPORTING on completion -- Various UI/UX improvements in AgentAudit component +- **Agent Task Cancellation**: Fixed an issue where Agent tasks would continue running in the background after cancellation. +- **Event Streaming**: Resolved `UnboundLocalError` in `event_manager.py` and removed artificial delays to prevent event queue buildup. +- **Agent Timeout**: Increased Verification Agent timeout to 10 minutes to support complex PoC generation. +- **LLM Streaming**: Improved robustness of `stream_llm_call` with explicit stream timeouts to prevent hanging. ---- - -## [2.0.0] - 2024-11-15 +## [3.0.0] - 2025-12-15 ### Added -- Multi-LLM platform support (OpenAI, Claude, Gemini, Qwen, DeepSeek, Zhipu, etc.) 
-- Ollama local model support for privacy-focused deployments -- Project management with GitHub/GitLab import -- ZIP file upload support -- Instant code analysis feature -- What-Why-How three-step fix recommendations -- PDF/JSON report export -- Audit rules management (OWASP Top 10 built-in) -- Prompt template management with visual editor -- Runtime LLM configuration in browser -- i18n support (Chinese/English) - -### Changed -- Migrated to FastAPI backend -- React 18 frontend with TypeScript -- PostgreSQL database with Alembic migrations -- Docker Compose deployment support - ---- - -## [1.0.0] - 2024-10-01 - -### Added -- Initial release -- Basic code security audit functionality -- LLM-powered vulnerability detection -- Simple web interface +- **Multi-Agent System**: Introduced Orchestrator, Recon, Analysis, and Verification agents for autonomous security auditing. +- **RAG Integration**: Added Retrieval-Augmented Generation for better code understanding. +- **Docker Sandbox**: Implemented secure environment for tool execution. diff --git a/README.md b/README.md index 4e57393..957a914 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ -# DeepAudit - 开源的代码审计智能体平台 🦸‍♂️ +# DeepAudit - 人人拥有的 AI 审计战队,让漏洞挖掘触手可及 🦸‍♂️ -> 让代码漏洞挖掘像呼吸一样简单,小白也能当黑客挖洞 +> 让代码漏洞挖掘像呼吸一样简单,小白也能轻松挖洞
DeepAudit Logo @@ -12,7 +12,7 @@
-[![Version](https://img.shields.io/badge/version-3.0.0-blue.svg)](https://github.com/lintsinghua/DeepAudit/releases) +[![Version](https://img.shields.io/badge/version-3.0.1-blue.svg)](https://github.com/lintsinghua/DeepAudit/releases) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) [![React](https://img.shields.io/badge/React-18-61dafb.svg)](https://reactjs.org/) [![TypeScript](https://img.shields.io/badge/TypeScript-5.7-3178c6.svg)](https://www.typescriptlang.org/) @@ -158,36 +158,74 @@ DeepAudit/ --- -## 🚀 快速开始 (Docker) +## 🚀 快速开始 -### 1. 启动项目 +### 方式一:一行命令部署(推荐) -复制一份 `backend/env.example` 为 `backend/.env`,并按需配置 LLM API Key。 -然后执行以下命令一键启动: +使用预构建的 Docker 镜像,无需克隆代码,一行命令即可启动: ```bash -# 1. 准备配置文件 -cp backend/env.example backend/.env - -# 2. 构建沙箱镜像 (首次运行必须) -cd docker/sandbox && chmod +x build.sh && ./build.sh && cd ../.. - -# 3. 启动服务 -docker compose up -d +curl -fsSL https://raw.githubusercontent.com/lintsinghua/DeepAudit/v3.0.0/docker-compose.prod.yml | docker compose -f - up -d ``` +
+🇨🇳 国内加速部署(点击展开) + +使用南京大学镜像站加速拉取 Docker 镜像(将 `ghcr.io` 替换为 `ghcr.nju.edu.cn`): + +```bash +# 国内加速版 - 使用南京大学 GHCR 镜像站 +curl -fsSL https://raw.githubusercontent.com/lintsinghua/DeepAudit/main/docker-compose.prod.cn.yml | docker compose -f - up -d +``` + +**手动拉取镜像(如需单独拉取):** +```bash +# 前端镜像 +docker pull ghcr.nju.edu.cn/lintsinghua/deepaudit-frontend:latest + +# 后端镜像 +docker pull ghcr.nju.edu.cn/lintsinghua/deepaudit-backend:latest + +# 沙箱镜像 +docker pull ghcr.nju.edu.cn/lintsinghua/deepaudit-sandbox:latest +``` + +> 💡 镜像源由 [南京大学开源镜像站](https://mirrors.nju.edu.cn/) 提供支持 + +
+ > 🎉 **启动成功!** 访问 http://localhost:3000 开始体验。 --- -## 🔧 源码启动指南 +### 方式二:克隆代码部署 + +适合需要自定义配置或二次开发的用户: + +```bash +# 1. 克隆项目 +git clone https://github.com/lintsinghua/DeepAudit.git && cd DeepAudit + +# 2. 配置环境变量 +cp backend/env.example backend/.env +# 编辑 backend/.env 填入你的 LLM API Key + +# 3. 一键启动 +docker compose up -d +``` + +> 首次启动会自动构建沙箱镜像,可能需要几分钟。 + +--- + +## 🔧 源码开发指南 适合开发者进行二次开发调试。 ### 环境要求 -- Python 3.10+ -- Node.js 18+ -- PostgreSQL 14+ +- Python 3.11+ +- Node.js 20+ +- PostgreSQL 15+ - Docker (用于沙箱) @@ -206,11 +244,9 @@ cd backend # 配置环境 cp env.example .env -# 激活虚拟环境 (推荐 uv/poetry) -source .venv/bin/activate - -# 安装依赖 -pip install -r requirements.txt +# 使用 uv 管理环境(推荐) +uv sync +source .venv/bin/activate # 启动 API 服务 uvicorn app.main:app --reload @@ -223,16 +259,20 @@ cd frontend # 配置环境 cp .env.example .env -npm install -npm run dev +pnpm install +pnpm dev ``` -### 4. 沙箱环境 -开发模式下,仍需通过 Docker 启动沙箱服务。 +### 3. 沙箱环境 + +开发模式下需要本地 Docker 拉取沙箱镜像: ```bash -cd docker/sandbox -./build.sh +# 标准拉取 +docker pull ghcr.io/lintsinghua/deepaudit-sandbox:latest + +# 国内加速(南京大学镜像站) +docker pull ghcr.nju.edu.cn/lintsinghua/deepaudit-sandbox:latest ``` --- @@ -369,3 +409,37 @@ DeepSeek-Coder · Codestral
Made with ❤️ by lintsinghua
+ +--- + +## ⚠️ 重要安全声明 + +### 法律合规声明 +1. 禁止**任何未经授权的漏洞测试、渗透测试或安全评估** +2. 本项目仅供网络空间安全学术研究、教学和学习使用 +3. 严禁将本项目用于任何非法目的或未经授权的安全测试 + +### 漏洞上报责任 +1. 发现任何安全漏洞时,请及时通过合法渠道上报 +2. 严禁利用发现的漏洞进行非法活动 +3. 遵守国家网络安全法律法规,维护网络空间安全 + +### 使用限制 +- 仅限在授权环境下用于教育和研究目的 +- 禁止用于对未授权系统进行安全测试 +- 使用者需对自身行为承担全部法律责任 + +### 免责声明 +作者不对任何因使用本项目而导致的直接或间接损失负责,使用者需对自身行为承担全部法律责任。 + +--- + +## 📖 详细安全政策 + +有关安全政策、免责声明、代码隐私、API使用安全和漏洞报告的详细信息,请参阅 [DISCLAIMER.md](DISCLAIMER.md) 和 [SECURITY.md](SECURITY.md) 文件。 + +### 快速参考 +- 🔒 **代码隐私警告**: 您的代码将被发送到所选择的LLM服务商服务器 +- 🛡️ **敏感代码处理**: 使用本地模型处理敏感代码 +- ⚠️ **合规要求**: 遵守数据保护和隐私法律法规 +- 📧 **漏洞报告**: 发现安全问题请通过合法渠道上报 diff --git a/backend/Dockerfile b/backend/Dockerfile index d26ef5b..43ec443 100644 --- a/backend/Dockerfile +++ b/backend/Dockerfile @@ -103,11 +103,12 @@ COPY --from=builder /usr/local/bin/uv /usr/local/bin/uv # 复制应用代码 COPY . . -# 创建上传目录 -RUN mkdir -p /app/uploads/zip_files +# 创建上传目录并设置启动脚本权限 +RUN mkdir -p /app/uploads/zip_files && \ + chmod +x /app/docker-entrypoint.sh # 暴露端口 EXPOSE 8000 -# 启动命令 -CMD [".venv/bin/uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"] +# 启动命令 - 使用启动脚本自动执行数据库迁移 +CMD ["/app/docker-entrypoint.sh"] diff --git a/backend/alembic/versions/008_add_files_with_findings.py b/backend/alembic/versions/008_add_files_with_findings.py new file mode 100644 index 0000000..40bd7d5 --- /dev/null +++ b/backend/alembic/versions/008_add_files_with_findings.py @@ -0,0 +1,35 @@ +"""Add files_with_findings column to agent_tasks + +Revision ID: 008_add_files_with_findings +Revises: 4c280754c680 +Create Date: 2025-12-16 + +""" +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. 
+revision = '008_add_files_with_findings' +down_revision = '4c280754c680' +branch_labels = None +depends_on = None + + +def upgrade() -> None: + # Add files_with_findings column to agent_tasks table (idempotent) + conn = op.get_bind() + inspector = sa.inspect(conn) + columns = [col['name'] for col in inspector.get_columns('agent_tasks')] + + if 'files_with_findings' not in columns: + op.add_column( + 'agent_tasks', + sa.Column('files_with_findings', sa.Integer(), nullable=True, default=0) + ) + # Set default value for existing rows + op.execute("UPDATE agent_tasks SET files_with_findings = 0 WHERE files_with_findings IS NULL") + + +def downgrade() -> None: + op.drop_column('agent_tasks', 'files_with_findings') diff --git a/backend/app/api/v1/endpoints/agent_tasks.py b/backend/app/api/v1/endpoints/agent_tasks.py index c4fd398..c95c0f4 100644 --- a/backend/app/api/v1/endpoints/agent_tasks.py +++ b/backend/app/api/v1/endpoints/agent_tasks.py @@ -364,6 +364,17 @@ async def _execute_agent_task(task_id: str): }, ) + # 🔥 设置外部取消检查回调 + # 这确保即使 runner.cancel() 失败,Agent 也能通过 checking 全局标志感知取消 + def check_global_cancel(): + return is_task_cancelled(task_id) + + orchestrator.set_cancel_callback(check_global_cancel) + # 同时也为子 Agent 设置(虽然 Orchestrator 会传播) + recon_agent.set_cancel_callback(check_global_cancel) + analysis_agent.set_cancel_callback(check_global_cancel) + verification_agent.set_cancel_callback(check_global_cancel) + # 注册到全局 _running_orchestrators[task_id] = orchestrator _running_tasks[task_id] = orchestrator # 兼容旧的取消逻辑 @@ -437,7 +448,13 @@ async def _execute_agent_task(task_id: str): await _save_findings(db, task_id, findings) # 更新任务统计 - task.status = AgentTaskStatus.COMPLETED + # 🔥 CRITICAL FIX: 在设置完成前再次检查取消状态 + # 避免 "取消后后端继续运行并最终标记为完成" 的问题 + if is_task_cancelled(task_id): + logger.info(f"[AgentTask] Task {task_id} was cancelled, overriding success result") + task.status = AgentTaskStatus.CANCELLED + else: + task.status = AgentTaskStatus.COMPLETED task.completed_at 
= datetime.now(timezone.utc) task.current_phase = AgentTaskPhase.REPORTING task.findings_count = len(findings) @@ -445,14 +462,18 @@ async def _execute_agent_task(task_id: str): task.tool_calls_count = result.tool_calls task.tokens_used = result.tokens_used - # 🔥 统计分析的文件数量(从 findings 中提取唯一文件) - analyzed_file_set = set() + # 🔥 统计文件数量 + # analyzed_files = 实际扫描过的文件数(任务完成时等于 total_files) + # files_with_findings = 有漏洞发现的唯一文件数 + task.analyzed_files = task.total_files # Agent 扫描了所有符合条件的文件 + + files_with_findings_set = set() for f in findings: if isinstance(f, dict): file_path = f.get("file_path") or f.get("file") or f.get("location", "").split(":")[0] if file_path: - analyzed_file_set.add(file_path) - task.analyzed_files = len(analyzed_file_set) if analyzed_file_set else task.total_files + files_with_findings_set.add(file_path) + task.files_with_findings = len(files_with_findings_set) # 统计严重程度和验证状态 verified_count = 0 @@ -1583,18 +1604,28 @@ async def cancel_agent_task( if runner: runner.cancel() logger.info(f"[Cancel] Set cancel flag for task {task_id}") - - # 🔥 2. 强制取消 asyncio Task(立即中断 LLM 调用) + + # 🔥 2. 通过 agent_registry 取消所有子 Agent + from app.services.agent.core import agent_registry + from app.services.agent.core.graph_controller import stop_all_agents + try: + # 停止所有 Agent(包括子 Agent) + stop_result = stop_all_agents(exclude_root=False) + logger.info(f"[Cancel] Stopped all agents: {stop_result}") + except Exception as e: + logger.warning(f"[Cancel] Failed to stop agents via registry: {e}") + + # 🔥 3. 
强制取消 asyncio Task(立即中断 LLM 调用) asyncio_task = _running_asyncio_tasks.get(task_id) if asyncio_task and not asyncio_task.done(): asyncio_task.cancel() logger.info(f"[Cancel] Cancelled asyncio task for {task_id}") - + # 更新状态 task.status = AgentTaskStatus.CANCELLED task.completed_at = datetime.now(timezone.utc) await db.commit() - + logger.info(f"[Cancel] Task {task_id} cancelled successfully") return {"message": "任务已取消", "task_id": task_id} diff --git a/backend/app/api/v1/endpoints/embedding_config.py b/backend/app/api/v1/endpoints/embedding_config.py index bc91c51..541bf2a 100644 --- a/backend/app/api/v1/endpoints/embedding_config.py +++ b/backend/app/api/v1/endpoints/embedding_config.py @@ -11,6 +11,7 @@ from fastapi import APIRouter, Depends, HTTPException from pydantic import BaseModel, Field from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession +from sqlalchemy.orm.attributes import flag_modified from app.api import deps from app.models.user import User @@ -46,10 +47,10 @@ class EmbeddingConfigResponse(BaseModel): """配置响应""" provider: str model: str + api_key: Optional[str] = None # 返回 API Key base_url: Optional[str] dimensions: int batch_size: int - # 不返回 API Key class TestEmbeddingRequest(BaseModel): @@ -165,14 +166,14 @@ async def get_embedding_config_from_db(db: AsyncSession, user_id: str) -> Embedd select(UserConfig).where(UserConfig.user_id == user_id) ) user_config = result.scalar_one_or_none() - + if user_config and user_config.other_config: try: other_config = json.loads(user_config.other_config) if isinstance(user_config.other_config, str) else user_config.other_config embedding_data = other_config.get(EMBEDDING_CONFIG_KEY) - + if embedding_data: - return EmbeddingConfig( + config = EmbeddingConfig( provider=embedding_data.get("provider", settings.EMBEDDING_PROVIDER), model=embedding_data.get("model", settings.EMBEDDING_MODEL), api_key=embedding_data.get("api_key"), @@ -180,10 +181,13 @@ async def get_embedding_config_from_db(db: 
AsyncSession, user_id: str) -> Embedd dimensions=embedding_data.get("dimensions"), batch_size=embedding_data.get("batch_size", 100), ) - except (json.JSONDecodeError, AttributeError): - pass - + print(f"[EmbeddingConfig] 读取用户 {user_id} 的嵌入配置: provider={config.provider}, model={config.model}") + return config + except (json.JSONDecodeError, AttributeError) as e: + print(f"[EmbeddingConfig] 解析用户 {user_id} 配置失败: {e}") + # 返回默认配置 + print(f"[EmbeddingConfig] 用户 {user_id} 无保存配置,返回默认值") return EmbeddingConfig( provider=settings.EMBEDDING_PROVIDER, model=settings.EMBEDDING_MODEL, @@ -199,7 +203,7 @@ async def save_embedding_config_to_db(db: AsyncSession, user_id: str, config: Em select(UserConfig).where(UserConfig.user_id == user_id) ) user_config = result.scalar_one_or_none() - + # 准备嵌入配置数据 embedding_data = { "provider": config.provider, @@ -209,16 +213,18 @@ async def save_embedding_config_to_db(db: AsyncSession, user_id: str, config: Em "dimensions": config.dimensions, "batch_size": config.batch_size, } - + if user_config: # 更新现有配置 try: other_config = json.loads(user_config.other_config) if user_config.other_config else {} except (json.JSONDecodeError, TypeError): other_config = {} - + other_config[EMBEDDING_CONFIG_KEY] = embedding_data user_config.other_config = json.dumps(other_config) + # 🔥 显式标记 other_config 字段已修改,确保 SQLAlchemy 检测到变化 + flag_modified(user_config, "other_config") else: # 创建新配置 user_config = UserConfig( @@ -228,8 +234,9 @@ async def save_embedding_config_to_db(db: AsyncSession, user_id: str, config: Em other_config=json.dumps({EMBEDDING_CONFIG_KEY: embedding_data}), ) db.add(user_config) - + await db.commit() + print(f"[EmbeddingConfig] 已保存用户 {user_id} 的嵌入配置: provider={config.provider}, model={config.model}") # ============ API Endpoints ============ @@ -253,13 +260,14 @@ async def get_current_config( 获取当前嵌入模型配置(从数据库读取) """ config = await get_embedding_config_from_db(db, current_user.id) - + # 获取维度 dimensions = _get_model_dimensions(config.provider, 
config.model) - + return EmbeddingConfigResponse( provider=config.provider, model=config.model, + api_key=config.api_key, base_url=config.base_url, dimensions=dimensions, batch_size=config.batch_size, @@ -279,19 +287,18 @@ async def update_config( provider_ids = [p.id for p in EMBEDDING_PROVIDERS] if config.provider not in provider_ids: raise HTTPException(status_code=400, detail=f"不支持的提供商: {config.provider}") - - # 验证模型 + + # 获取提供商信息(用于检查 API Key 要求) provider = next((p for p in EMBEDDING_PROVIDERS if p.id == config.provider), None) - if provider and config.model not in provider.models: - raise HTTPException(status_code=400, detail=f"不支持的模型: {config.model}") - + # 注意:不再强制验证模型名称,允许用户输入自定义模型 + # 检查 API Key if provider and provider.requires_api_key and not config.api_key: raise HTTPException(status_code=400, detail=f"{config.provider} 需要 API Key") - + # 保存到数据库 await save_embedding_config_to_db(db, current_user.id, config) - + return {"message": "配置已保存", "provider": config.provider, "model": config.model} diff --git a/backend/app/api/v1/endpoints/projects.py b/backend/app/api/v1/endpoints/projects.py index 7ccf648..66b0710 100644 --- a/backend/app/api/v1/endpoints/projects.py +++ b/backend/app/api/v1/endpoints/projects.py @@ -659,7 +659,8 @@ async def get_project_branches( config = config.scalar_one_or_none() github_token = settings.GITHUB_TOKEN - projects_gitea_token = settings.GITEA_TOKEN + gitea_token = settings.GITEA_TOKEN + gitlab_token = settings.GITLAB_TOKEN SENSITIVE_OTHER_FIELDS = ['githubToken', 'gitlabToken', 'giteaToken'] @@ -674,13 +675,12 @@ async def get_project_branches( elif field == 'gitlabToken': gitlab_token = decrypted_val elif field == 'giteaToken': - projects_gitea_token = decrypted_val + gitea_token = decrypted_val repo_type = project.repository_type or "other" # 详细日志 print(f"[Branch] 项目: {project.name}, 类型: {repo_type}, URL: {project.repository_url}") - print(f"[Branch] GitHub Token: {'已配置' if github_token else '未配置'}, GitLab Token: {'已配置' if 
gitlab_token else '未配置'}, Gitea Token: {'已配置' if projects_gitea_token else '未配置'}") try: if repo_type == "github": @@ -692,9 +692,9 @@ async def get_project_branches( print("[Branch] 警告: GitLab Token 未配置,可能无法访问私有仓库") branches = await get_gitlab_branches(project.repository_url, gitlab_token) elif repo_type == "gitea": - if not projects_gitea_token: + if not gitea_token: print("[Branch] 警告: Gitea Token 未配置,可能无法访问私有仓库") - branches = await get_gitea_branches(project.repository_url, projects_gitea_token) + branches = await get_gitea_branches(project.repository_url, gitea_token) else: # 对于其他类型,返回默认分支 print(f"[Branch] 仓库类型 '{repo_type}' 不支持获取分支,返回默认分支") diff --git a/backend/app/models/agent_task.py b/backend/app/models/agent_task.py index 0bc1a1a..33c7047 100644 --- a/backend/app/models/agent_task.py +++ b/backend/app/models/agent_task.py @@ -89,7 +89,8 @@ class AgentTask(Base): # 进度统计 total_files = Column(Integer, default=0) indexed_files = Column(Integer, default=0) - analyzed_files = Column(Integer, default=0) + analyzed_files = Column(Integer, default=0) # 实际扫描过的文件数 + files_with_findings = Column(Integer, default=0) # 有漏洞发现的文件数 total_chunks = Column(Integer, default=0) # 代码块总数 # Agent 统计 diff --git a/backend/app/services/agent/agents/analysis.py b/backend/app/services/agent/agents/analysis.py index b39a7ce..f1e5c8e 100644 --- a/backend/app/services/agent/agents/analysis.py +++ b/backend/app/services/agent/agents/analysis.py @@ -85,15 +85,15 @@ ANALYSIS_SYSTEM_PROMPT = """你是 DeepAudit 的漏洞分析 Agent,一个**自 - **dataflow_analysis**: 数据流追踪 参数: source_code (str), variable_name (str) -### 辅助工具 -- **read_file**: 读取文件内容验证发现 +### 辅助工具(RAG 优先!) 
+- **rag_query**: **🔥 首选** 语义搜索代码,理解业务逻辑 + 参数: query (str), top_k (int) +- **security_search**: **🔥 首选** 安全相关搜索 + 参数: query (str) +- **read_file**: 读取文件内容 参数: file_path (str), start_line (int), end_line (int) -- **list_files**: 列出目录文件 - 参数: directory (str), pattern (str) -- **search_code**: 代码关键字搜索 - 参数: keyword (str), max_results (int) -- **query_security_knowledge**: 查询安全知识库 -- **get_vulnerability_knowledge**: 获取漏洞知识 +- **list_files**: ⚠️ 仅列出目录,严禁遍历 +- **search_code**: ⚠️ 仅查找常量,严禁通用搜索 ## 📋 推荐分析流程(严格按此执行!) @@ -193,6 +193,26 @@ Final Answer: [JSON 格式的漏洞报告] 3. **上下文分析** - 看到可疑代码要读取上下文,理解完整逻辑 4. **自主判断** - 不要机械相信工具输出,要用你的专业知识判断 +## ⚠️ 关键约束 - 必须遵守! +1. **禁止直接输出 Final Answer** - 你必须先调用工具来分析代码 +2. **至少调用两个工具** - 使用 smart_scan/semgrep_scan 进行扫描,然后用 read_file 查看代码 +3. **没有工具调用的分析无效** - 不允许仅凭推测直接报告漏洞 +4. **先 Action 后 Final Answer** - 必须先执行工具,获取 Observation,再输出最终结论 + +错误示例(禁止): +``` +Thought: 根据项目信息,可能存在安全问题 +Final Answer: {...} ❌ 没有调用任何工具! +``` + +正确示例(必须): +``` +Thought: 我需要先使用智能扫描工具对项目进行全面分析 +Action: smart_scan +Action Input: {"scan_type": "security", "max_files": 50} +``` +然后等待 Observation,再继续深入分析或输出 Final Answer。 + 现在开始你的安全分析!首先使用外部工具进行全面扫描。""" @@ -402,7 +422,7 @@ class AnalysisAgent(BaseAgent): ## 可用工具 {self.get_tools_description()} -请开始你的安全分析。首先读取高风险区域的文件,然后分析其中的安全问题。""" +请开始你的安全分析。首先读取高风险区域的文件,然后**立即**分析其中的安全问题(输出 Action)。""" # 🔥 记录工作开始 self.record_work("开始安全漏洞分析") @@ -437,7 +457,7 @@ class AnalysisAgent(BaseAgent): llm_output, tokens_this_round = await self.stream_llm_call( self._conversation_history, temperature=0.1, - max_tokens=4096, + max_tokens=8192, ) except asyncio.CancelledError: logger.info(f"[{self.name}] LLM call cancelled") @@ -594,7 +614,7 @@ Final Answer: {{"findings": [...], "summary": "..."}}""" await self.emit_llm_decision("继续分析", "LLM 需要更多分析") self._conversation_history.append({ "role": "user", - "content": "请继续分析。选择一个工具执行,或者如果分析完成,输出 Final Answer 汇总所有发现。", + "content": "请继续分析。你输出了 Thought 但没有输出 Action。请**立即**选择一个工具执行,或者如果分析完成,输出 Final Answer 
汇总所有发现。", }) # 🔥 如果循环结束但没有发现,强制 LLM 总结 diff --git a/backend/app/services/agent/agents/base.py b/backend/app/services/agent/agents/base.py index a198374..cf1a619 100644 --- a/backend/app/services/agent/agents/base.py +++ b/backend/app/services/agent/agents/base.py @@ -51,7 +51,7 @@ class AgentConfig: # LLM 配置 model: Optional[str] = None temperature: float = 0.1 - max_tokens: int = 4096 + max_tokens: int = 8192 # 执行限制 max_iterations: int = 20 @@ -485,9 +485,24 @@ class BaseAgent(ABC): self._cancelled = True logger.info(f"[{self.name}] Cancel requested") + # 🔥 外部取消检查回调 + self._cancel_callback = None + + def set_cancel_callback(self, callback) -> None: + """设置外部取消检查回调""" + self._cancel_callback = callback + @property def is_cancelled(self) -> bool: - return self._cancelled + """检查是否已取消(包含内部标志和外部回调)""" + if self._cancelled: + return True + # 检查外部回调 + if self._cancel_callback and self._cancel_callback(): + self._cancelled = True + logger.info(f"[{self.name}] Detected cancellation from callback") + return True + return False # ============ 协作方法 ============ @@ -949,41 +964,83 @@ class BaseAgent(ABC): logger.info(f"[{self.name}] ✅ thinking_start emitted, starting LLM stream...") try: - async for chunk in self.llm_service.chat_completion_stream( + # 获取流式迭代器 + stream = self.llm_service.chat_completion_stream( messages=messages, temperature=temperature, max_tokens=max_tokens, - ): + ) + # 兼容不同版本的 python async generator + iterator = stream.__aiter__() + + import time + first_token_received = False + last_activity = time.time() + + while True: # 检查取消 if self.is_cancelled: - logger.info(f"[{self.name}] Cancelled during LLM streaming") + logger.info(f"[{self.name}] Cancelled during LLM streaming loop") break - if chunk["type"] == "token": - token = chunk["content"] - accumulated = chunk["accumulated"] - await self.emit_thinking_token(token, accumulated) - # 🔥 CRITICAL: 让出控制权给事件循环,让 SSE 有机会发送事件 - # 如果不这样做,所有 token 会在循环结束后一起发送 - await asyncio.sleep(0) + try: + # 🔥 第一個 token 
30秒超时,后续 token 60秒超时 + # 这是一个应用层的安全网,防止底层 LLM 客户端挂死 + timeout = 30.0 if not first_token_received else 60.0 - elif chunk["type"] == "done": - accumulated = chunk["content"] - if chunk.get("usage"): - total_tokens = chunk["usage"].get("total_tokens", 0) + chunk = await asyncio.wait_for(iterator.__anext__(), timeout=timeout) + + last_activity = time.time() + + if chunk["type"] == "token": + first_token_received = True + token = chunk["content"] + # 🔥 累积 content,确保 accumulated 变量更新 + # 注意:某些 adapter 返回的 chunk["accumulated"] 可能已经包含了累积值, + # 但为了安全起见,如果不一致,我们自己累积 + if "accumulated" in chunk: + accumulated = chunk["accumulated"] + else: + # 如果 adapter 没返回 accumulated,我们自己拼 + # 注意:如果是 token 类型,content 是增量 + # 如果 accumulated 被覆盖了,需要小心。 + # 实际上 service.py 中 chat_completion_stream 保证了 accumulated 存在 + # 这里我们信任 service 层的 accumulated + pass + + # Double check if accumulated is empty but we have token + if not accumulated and token: + accumulated += token # Fallback + + await self.emit_thinking_token(token, accumulated) + # 🔥 CRITICAL: 让出控制权给事件循环,让 SSE 有机会发送事件 + await asyncio.sleep(0) + + elif chunk["type"] == "done": + accumulated = chunk["content"] + if chunk.get("usage"): + total_tokens = chunk["usage"].get("total_tokens", 0) + break + + elif chunk["type"] == "error": + accumulated = chunk.get("accumulated", "") + error_msg = chunk.get("error", "Unknown error") + logger.error(f"[{self.name}] Stream error: {error_msg}") + if accumulated: + total_tokens = chunk.get("usage", {}).get("total_tokens", 0) + else: + accumulated = f"[系统错误: {error_msg}] 请重新思考并输出你的决策。" + break + + except StopAsyncIteration: break - - elif chunk["type"] == "error": - accumulated = chunk.get("accumulated", "") - error_msg = chunk.get("error", "Unknown error") - logger.error(f"[{self.name}] Stream error: {error_msg}") - # 🔥 如果有部分累积内容,尝试使用它 - if accumulated: - logger.warning(f"[{self.name}] Using partial accumulated content ({len(accumulated)} chars)") - total_tokens = chunk.get("usage", 
{}).get("total_tokens", 0) - else: - # 🔥 返回一个提示 LLM 继续的消息,而不是空字符串 - accumulated = f"[系统错误: {error_msg}] 请重新思考并输出你的决策。" + except asyncio.TimeoutError: + timeout_type = "First Token" if not first_token_received else "Stream" + logger.error(f"[{self.name}] LLM {timeout_type} Timeout ({timeout}s)") + error_msg = f"LLM 响应超时 ({timeout_type}, {timeout}s)" + await self.emit_event("error", error_msg) + if not accumulated: + accumulated = f"[超时错误: {timeout}s 无响应] 请尝试简化请求或重试。" break except asyncio.CancelledError: @@ -993,7 +1050,6 @@ class BaseAgent(ABC): # 🔥 增强异常处理,避免吞掉错误 logger.error(f"[{self.name}] Unexpected error in stream_llm_call: {e}", exc_info=True) await self.emit_event("error", f"LLM 调用错误: {str(e)}") - # 返回错误提示,让 Agent 知道发生了什么 accumulated = f"[LLM调用错误: {str(e)}] 请重试。" finally: await self.emit_thinking_end(accumulated) diff --git a/backend/app/services/agent/agents/orchestrator.py b/backend/app/services/agent/agents/orchestrator.py index b99973f..118384e 100644 --- a/backend/app/services/agent/agents/orchestrator.py +++ b/backend/app/services/agent/agents/orchestrator.py @@ -242,7 +242,7 @@ class OrchestratorAgent(BaseAgent): llm_output, tokens_this_round = await self.stream_llm_call( self._conversation_history, temperature=0.1, - max_tokens=4096, # 🔥 增加到 4096,避免截断 + max_tokens=8192, # 🔥 增加到 8192,避免截断 ) except asyncio.CancelledError: logger.info(f"[{self.name}] LLM call cancelled") @@ -657,7 +657,7 @@ Action Input: {{"参数": "值"}} agent_timeouts = { "recon": 300, # 5 分钟 "analysis": 600, # 10 分钟 - "verification": 300, # 5 分钟 + "verification": 600, # 10 分钟 } timeout = agent_timeouts.get(agent_name, 300) @@ -667,7 +667,8 @@ Action Input: {{"参数": "值"}} try: while not run_task.done(): if self.is_cancelled: - # 传播取消到子 Agent + # 🔥 传播取消到子 Agent + logger.info(f"[{self.name}] Cancelling sub-agent {agent_name} due to parent cancel") if hasattr(agent, 'cancel'): agent.cancel() run_task.cancel() @@ -677,18 +678,28 @@ Action Input: {{"参数": "值"}} pass raise 
asyncio.CancelledError("任务已取消") - try: - return await asyncio.wait_for( - asyncio.shield(run_task), - timeout=1.0 # 每秒检查一次取消状态 - ) - except asyncio.TimeoutError: - continue + # Use asyncio.wait to poll without cancelling the task + done, pending = await asyncio.wait( + [run_task], + timeout=0.5, + return_when=asyncio.FIRST_COMPLETED + ) + if run_task in done: + return run_task.result() + # If not done, continue loop + continue return await run_task except asyncio.CancelledError: + # 🔥 确保子任务被取消 if not run_task.done(): + if hasattr(agent, 'cancel'): + agent.cancel() run_task.cancel() + try: + await run_task + except asyncio.CancelledError: + pass raise try: @@ -877,17 +888,32 @@ Action Input: {{"参数": "值"}} if same_file and (same_line or similar_desc or same_type): # Update existing with new info (e.g. verification results) - # Prefer verified data over unverified - merged = {**existing_f, **normalized_new} + # 🔥 FIX: Smart merge - don't overwrite good data with empty values + merged = dict(existing_f) # Start with existing data + for key, value in normalized_new.items(): + # Only overwrite if new value is meaningful + if value is not None and value != "" and value != 0: + merged[key] = value + elif key not in merged or merged[key] is None: + # Fill in missing fields even with empty values + merged[key] = value + # Keep the better title if normalized_new.get("title") and len(normalized_new.get("title", "")) > len(existing_f.get("title", "")): merged["title"] = normalized_new["title"] # Keep verified status if either is verified if existing_f.get("is_verified") or normalized_new.get("is_verified"): merged["is_verified"] = True + # 🔥 FIX: Preserve non-zero line numbers + if existing_f.get("line_start") and not normalized_new.get("line_start"): + merged["line_start"] = existing_f["line_start"] + # 🔥 FIX: Preserve vulnerability_type + if existing_f.get("vulnerability_type") and not normalized_new.get("vulnerability_type"): + merged["vulnerability_type"] = 
existing_f["vulnerability_type"] + self._all_findings[i] = merged found = True - logger.info(f"[Orchestrator] Merged finding: {new_file}:{new_line} ({new_type})") + logger.info(f"[Orchestrator] Merged finding: {new_file}:{merged.get('line_start', 0)} ({merged.get('vulnerability_type', '')})") break if not found: diff --git a/backend/app/services/agent/agents/recon.py b/backend/app/services/agent/agents/recon.py index fece4e7..bd981f1 100644 --- a/backend/app/services/agent/agents/recon.py +++ b/backend/app/services/agent/agents/recon.py @@ -19,11 +19,146 @@ from dataclasses import dataclass from .base import BaseAgent, AgentConfig, AgentResult, AgentType, AgentPattern from ..json_parser import AgentJsonParser -from ..prompts import RECON_SYSTEM_PROMPT, TOOL_USAGE_GUIDE +from ..prompts import TOOL_USAGE_GUIDE logger = logging.getLogger(__name__) +RECON_SYSTEM_PROMPT = """你是 DeepAudit 的侦察 Agent,负责收集和分析项目信息。 + +## 你的职责 +作为侦察层,你负责: +1. 分析项目结构和技术栈 +2. 识别关键入口点 +3. 发现配置文件和敏感区域 +4. **推荐需要使用的外部安全工具** +5. 提供初步风险评估 + +## 侦察目标 + +### 1. 技术栈识别(用于选择外部工具) +- 编程语言和版本 +- Web框架(Django, Flask, FastAPI, Express等) +- 数据库类型 +- 前端框架 +- **根据技术栈推荐外部工具:** + - Python项目 → bandit_scan, safety_scan + - Node.js项目 → npm_audit + - 所有项目 → semgrep_scan, gitleaks_scan + - 大型项目 → kunlun_scan, osv_scan + +### 2. 入口点发现 +- HTTP路由和API端点 +- Websocket处理 +- 定时任务和后台作业 +- 消息队列消费者 + +### 3. 敏感区域定位 +- 认证和授权代码 +- 数据库操作 +- 文件处理 +- 外部服务调用 + +### 4. 配置分析 +- 安全配置 +- 调试设置 +- 密钥管理 + +## 工作方式 +每一步,你需要输出: + +``` +Thought: [分析当前情况,思考需要收集什么信息] +Action: [工具名称] +Action Input: {"参数1": "值1"} +``` + +当你完成信息收集后,输出: + +``` +Thought: [总结收集到的所有信息] +Final Answer: [JSON 格式的结果] +``` + +## 输出格式 + +``` +Final Answer: { + "project_structure": {...}, + "tech_stack": { + "languages": [...], + "frameworks": [...], + "databases": [...] 
+ }, + "recommended_tools": { + "must_use": ["semgrep_scan", "gitleaks_scan", ...], + "recommended": ["kunlun_scan", ...], + "reason": "基于项目技术栈的推荐理由" + }, + "entry_points": [ + {"type": "...", "file": "...", "line": ..., "method": "..."} + ], + "high_risk_areas": [ + "文件路径:行号 - 风险描述" + ], + "initial_findings": [ + {"title": "...", "file_path": "...", "line_start": ..., "description": "..."} + ], + "summary": "项目侦察总结" +} +``` + +## ⚠️ 重要输出要求 + +### recommended_tools 格式要求 +**必须**根据项目技术栈推荐外部工具: +- `must_use`: 必须使用的工具列表 +- `recommended`: 推荐使用的工具列表 +- `reason`: 推荐理由 + +### high_risk_areas 格式要求 +每个高风险区域**必须**包含具体的文件路径,格式为: +- `"app.py:36 - SECRET_KEY 硬编码"` +- `"utils/file.py:120 - 使用用户输入构造文件路径"` +- `"api/views.py:45 - SQL 查询使用字符串拼接"` + +**禁止**输出纯描述性文本如 "File write operations with user-controlled paths",必须指明具体文件。 + +### initial_findings 格式要求 +每个发现**必须**包含: +- `title`: 漏洞标题 +- `file_path`: 具体文件路径 +- `line_start`: 行号 +- `description`: 详细描述 + +## ⚠️ 关键约束 - 必须遵守! +1. **禁止直接输出 Final Answer** - 你必须先调用工具来收集项目信息 +2. **至少调用三个工具** - 使用 rag_query 语义搜索关键入口,read_file 读取文件,list_files 仅查看根目录 +3. **没有工具调用的侦察无效** - 不允许仅凭项目名称直接推测 +4. **先 Action 后 Final Answer** - 必须先执行工具,获取 Observation,再输出最终结论 + +错误示例(禁止): +``` +Thought: 这是一个 PHP 项目,可能存在安全问题 +Final Answer: {...} ❌ 没有调用任何工具! +``` + +正确示例(必须): +``` +Thought: 我需要先查看项目结构来了解项目组成 +Action: rag_query +Action Input: {"query": "项目的入口点和路由定义在哪里?", "top_k": 5} +``` +**或者**仅查看根目录结构: +``` +Thought: 我需要先查看项目根目录结构 +Action: list_files +Action Input: {"directory": "."} +``` +然后等待 Observation,再继续收集信息或输出 Final Answer。 +""" + + # ... (上文导入) # ... 
@@ -193,7 +328,7 @@ class ReconAgent(BaseAgent): ## 可用工具 {self.get_tools_description()} -请开始你的信息收集工作。首先思考应该收集什么信息,然后选择合适的工具。""" +请开始你的信息收集工作。首先思考应该收集什么信息,然后**立即**选择合适的工具执行(输出 Action)。不要只输出 Thought,必须紧接着输出 Action。""" # 初始化对话历史 self._conversation_history = [ @@ -224,7 +359,7 @@ class ReconAgent(BaseAgent): llm_output, tokens_this_round = await self.stream_llm_call( self._conversation_history, temperature=0.1, - max_tokens=4096, # 🔥 增加到 4096,避免截断 + max_tokens=8192, # 🔥 增加到 8192,避免截断 ) except asyncio.CancelledError: logger.info(f"[{self.name}] LLM call cancelled") @@ -360,7 +495,7 @@ Final Answer: [JSON格式的结果]""" await self.emit_llm_decision("继续思考", "LLM 需要更多信息") self._conversation_history.append({ "role": "user", - "content": "请继续,选择一个工具执行,或者如果信息收集完成,输出 Final Answer。", + "content": "请继续。你输出了 Thought 但没有输出 Action。请**立即**选择一个工具执行(Action: ...),或者如果信息收集完成,输出 Final Answer。", }) # 🔥 如果循环结束但没有 final_result,强制 LLM 总结 diff --git a/backend/app/services/agent/agents/verification.py b/backend/app/services/agent/agents/verification.py index c9206e9..bfd8326 100644 --- a/backend/app/services/agent/agents/verification.py +++ b/backend/app/services/agent/agents/verification.py @@ -41,7 +41,7 @@ VERIFICATION_SYSTEM_PROMPT = """你是 DeepAudit 的漏洞验证 Agent,一个* ### 文件操作 - **read_file**: 读取更多代码上下文 参数: file_path (str), start_line (int), end_line (int) -- **list_files**: 列出目录文件 +- **list_files**: ⚠️ 仅用于确认文件是否存在,严禁遍历 参数: directory (str), pattern (str) ### 沙箱核心工具 @@ -212,6 +212,26 @@ Final Answer: [JSON 格式的验证报告] - 代码执行: 可直接运行的利用脚本 - ⚠️ payload 字段必须是**可直接复制执行**的完整利用代码,不要只写参数值 +## ⚠️ 关键约束 - 必须遵守! +1. **禁止直接输出 Final Answer** - 你必须先调用至少一个工具来验证漏洞 +2. **每个漏洞至少调用一次工具** - 使用 read_file 读取代码,或使用 test_* 工具测试 +3. **没有工具调用的验证无效** - 不允许仅凭已知信息直接判断 +4. **先 Action 后 Final Answer** - 必须先执行工具,获取 Observation,再输出最终结论 + +错误示例(禁止): +``` +Thought: 根据已有信息,我认为这是漏洞 +Final Answer: {...} ❌ 没有调用任何工具! 
+``` + +正确示例(必须): +``` +Thought: 我需要先读取 config.php 文件来验证硬编码凭据 +Action: read_file +Action Input: {"file_path": "config.php"} +``` +然后等待 Observation,再继续验证其他发现或输出 Final Answer。 + 现在开始验证漏洞发现!""" @@ -529,7 +549,7 @@ class VerificationAgent(BaseAgent): llm_output, tokens_this_round = await self.stream_llm_call( self._conversation_history, temperature=0.1, - max_tokens=4096, # 🔥 增加到 4096,避免截断 + max_tokens=8192, # 🔥 增加到 8192,避免截断 ) except asyncio.CancelledError: logger.info(f"[{self.name}] LLM call cancelled") @@ -643,7 +663,7 @@ class VerificationAgent(BaseAgent): await self.emit_llm_decision("继续验证", "LLM 需要更多验证") self._conversation_history.append({ "role": "user", - "content": "请继续验证。如果验证完成,输出 Final Answer 汇总所有验证结果。", + "content": "请继续验证。你输出了 Thought 但没有输出 Action。请**立即**选择一个工具执行,或者如果验证完成,输出 Final Answer 汇总所有验证结果。", }) # 处理结果 @@ -667,31 +687,50 @@ class VerificationAgent(BaseAgent): # 处理最终结果 verified_findings = [] - + # 🔥 Robustness: If LLM returns empty findings but we had input, fallback to original llm_findings = [] if final_result and "findings" in final_result: llm_findings = final_result["findings"] - + if not llm_findings and findings_to_verify: logger.warning(f"[{self.name}] LLM returned empty findings despite {len(findings_to_verify)} inputs. 
Falling back to originals.") # Fallback to logic below (else branch) - final_result = None + final_result = None if final_result and "findings" in final_result: + # 🔥 DEBUG: Log what LLM returned for verdict diagnosis + verdicts_debug = [(f.get("file_path", "?"), f.get("verdict"), f.get("confidence")) for f in final_result["findings"]] + logger.info(f"[{self.name}] LLM returned verdicts: {verdicts_debug}") + for f in final_result["findings"]: + # 🔥 FIX: Normalize verdict - handle missing/empty verdict + verdict = f.get("verdict") + if not verdict or verdict not in ["confirmed", "likely", "uncertain", "false_positive"]: + # Try to infer verdict from other fields + if f.get("is_verified") is True: + verdict = "confirmed" + elif f.get("confidence", 0) >= 0.8: + verdict = "likely" + elif f.get("confidence", 0) <= 0.3: + verdict = "false_positive" + else: + verdict = "uncertain" + logger.warning(f"[{self.name}] Missing/invalid verdict for {f.get('file_path', '?')}, inferred as: {verdict}") + verified = { **f, - "is_verified": f.get("verdict") == "confirmed" or ( - f.get("verdict") == "likely" and f.get("confidence", 0) >= 0.8 + "verdict": verdict, # 🔥 Ensure verdict is set + "is_verified": verdict == "confirmed" or ( + verdict == "likely" and f.get("confidence", 0) >= 0.8 ), - "verified_at": datetime.now(timezone.utc).isoformat() if f.get("verdict") in ["confirmed", "likely"] else None, + "verified_at": datetime.now(timezone.utc).isoformat() if verdict in ["confirmed", "likely"] else None, } - + # 添加修复建议 if not verified.get("recommendation"): verified["recommendation"] = self._get_recommendation(f.get("vulnerability_type", "")) - + verified_findings.append(verified) else: # 如果没有最终结果,使用原始发现 diff --git a/backend/app/services/agent/event_manager.py b/backend/app/services/agent/event_manager.py index c2d2afb..827fd40 100644 --- a/backend/app/services/agent/event_manager.py +++ b/backend/app/services/agent/event_manager.py @@ -473,10 +473,10 @@ class EventManager: 
buffered_count += 1 yield buffered_event - # 🔥 为缓存事件添加小延迟,但比之前少很多(避免拖慢) + # 🔥 取消人为延迟,防止队列堆积 event_type = buffered_event.get("event_type") - if event_type == "thinking_token": - await asyncio.sleep(0.005) # 5ms for tokens (reduced from 15ms) + # if event_type == "thinking_token": + # await asyncio.sleep(0.005) # 其他事件不加延迟,快速发送 # 检查是否是结束事件 @@ -513,9 +513,9 @@ class EventManager: yield event - # 🔥 为 thinking_token 添加微延迟确保流式效果 - if event_type == "thinking_token": - await asyncio.sleep(0.01) # 10ms + # 🔥 取消人为延迟,防止队列堆积 + # if event_type == "thinking_token": + # await asyncio.sleep(0.01) # 检查是否是结束事件 if event.get("event_type") in ["task_complete", "task_error", "task_cancel"]: diff --git a/backend/app/services/agent/prompts/__init__.py b/backend/app/services/agent/prompts/__init__.py index b4edca1..975b837 100644 --- a/backend/app/services/agent/prompts/__init__.py +++ b/backend/app/services/agent/prompts/__init__.py @@ -219,11 +219,6 @@ from .system_prompts import ( VULNERABILITY_PRIORITIES, TOOL_USAGE_GUIDE, MULTI_AGENT_RULES, - ORCHESTRATOR_SYSTEM_PROMPT, - ANALYSIS_SYSTEM_PROMPT, - VERIFICATION_SYSTEM_PROMPT, - RECON_SYSTEM_PROMPT, - get_system_prompt, build_enhanced_prompt, ) @@ -242,11 +237,6 @@ __all__ = [ "VULNERABILITY_PRIORITIES", "TOOL_USAGE_GUIDE", "MULTI_AGENT_RULES", - "ORCHESTRATOR_SYSTEM_PROMPT", - "ANALYSIS_SYSTEM_PROMPT", - "VERIFICATION_SYSTEM_PROMPT", - "RECON_SYSTEM_PROMPT", - "get_system_prompt", "build_enhanced_prompt", ] diff --git a/backend/app/services/agent/prompts/system_prompts.py b/backend/app/services/agent/prompts/system_prompts.py index 7e690e6..5ec4fcc 100644 --- a/backend/app/services/agent/prompts/system_prompts.py +++ b/backend/app/services/agent/prompts/system_prompts.py @@ -139,44 +139,48 @@ TOOL_USAGE_GUIDE = """ | `dataflow_analysis` | 数据流追踪验证 | | `code_analysis` | 代码结构分析 | -#### 辅助工具 +#### 辅助工具(RAG 优先!) 
| 工具 | 用途 | |------|------| -| `rag_query` | **语义搜索代码**(推荐!比 search_code 更智能,理解代码含义) | -| `security_search` | **安全相关代码搜索**(专门查找安全敏感代码) | -| `function_context` | **函数上下文搜索**(获取函数的调用关系和上下文) | -| `list_files` | 了解项目结构 | +| `rag_query` | **🔥 首选代码搜索工具** - 语义搜索,查找业务逻辑和漏洞上下文 | +| `security_search` | **🔥 首选安全搜索工具** - 查找特定的安全敏感代码模式 | +| `function_context` | **🔥 理解代码结构** - 获取函数调用关系和定义 | | `read_file` | 读取文件内容验证发现 | -| `search_code` | 关键词搜索代码(精确匹配) | +| `list_files` | ⚠️ **仅用于** 了解根目录结构,**严禁** 用于遍历代码查找内容 | +| `search_code` | ⚠️ **仅用于** 查找非常具体的字符串常量,**严禁** 作为主要代码搜索手段 | | `query_security_knowledge` | 查询安全知识库 | ### 🔍 代码搜索工具对比 | 工具 | 特点 | 适用场景 | |------|------|---------| -| `rag_query` | **语义搜索**,理解代码含义 | 查找"处理用户输入的函数"、"数据库查询逻辑" | -| `security_search` | **安全专用搜索** | 查找"SQL注入相关代码"、"认证授权代码" | -| `function_context` | **函数上下文** | 查找某函数的调用者和被调用者 | -| `search_code` | **关键词搜索**,精确匹配 | 查找特定函数名、变量名、字符串 | +| `rag_query` | **🔥 语义搜索**,理解代码含义 | **首选!** 查找"处理用户输入的函数"、"数据库查询逻辑" | +| `security_search` | **🔥 安全专用搜索** | **首选!** 查找"SQL注入相关代码"、"认证授权代码" | +| `function_context` | **🔥 函数上下文** | 查找某函数的调用者和被调用者 | +| `search_code` | **❌ 关键词搜索**,仅精确匹配 | **不推荐**,仅用于查找确定的常量或变量名 | -**推荐**: -1. 查找安全相关代码时优先使用 `security_search` -2. 理解函数关系时使用 `function_context` -3. 通用语义搜索使用 `rag_query` -4. 精确匹配时使用 `search_code` +**❌ 严禁行为**: +1. **不要** 使用 `list_files` 递归列出所有文件来查找代码 +2. **不要** 使用 `search_code` 搜索通用关键词(如 "function", "user"),这会产生大量无用结果 + +**✅ 推荐行为**: +1. **始终优先使用 RAG 工具** (`rag_query`, `security_search`) +2. `rag_query` 可以理解自然语言,如 "Show me the login function" +3. 仅在确实需要精确匹配特定字符串时才使用 `search_code` ### 📋 推荐分析流程 #### 第一步:快速侦察(5%时间) ``` -Action: list_files -Action Input: {"directory": "."} ``` -了解项目结构、技术栈、入口点 +Action: list_files +Action Input: {"directory": ".", "max_depth": 2} +``` +了解项目根目录结构(不要遍历全项目) -**语义搜索高风险代码(推荐!):** +**🔥 RAG 搜索关键逻辑(RAG 优先!):** ``` Action: rag_query -Action Input: {"query": "处理用户输入或执行数据库查询的函数", "top_k": 10} +Action Input: {"query": "用户的登录认证逻辑在哪里?", "top_k": 5} ``` #### 第二步:外部工具全面扫描(60%时间)⚡重点! 
@@ -303,334 +307,6 @@ MULTI_AGENT_RULES = """ """ -# ====== 各Agent专用提示词 ====== - -ORCHESTRATOR_SYSTEM_PROMPT = f"""你是 DeepAudit 安全审计平台的编排 Agent。 - -{CORE_SECURITY_PRINCIPLES} - -## 你的职责 -作为编排层,你负责协调整个安全审计流程: -1. 分析项目信息,制定审计策略 -2. 调度子Agent执行具体任务 -3. 收集和整合分析结果 -4. 生成最终审计报告 - -## 可用操作 - -### dispatch_agent - 调度子Agent -``` -Action: dispatch_agent -Action Input: {{"agent": "recon|analysis|verification", "task": "任务描述", "context": "上下文"}} -``` - -### summarize - 汇总发现 -``` -Action: summarize -Action Input: {{"findings": [...], "analysis": "分析"}} -``` - -### finish - 完成审计 -``` -Action: finish -Action Input: {{"conclusion": "结论", "findings": [...], "recommendations": [...]}} -``` - -## 审计流程 -1. 调度 recon Agent 收集项目信息 -2. 基于 recon 结果,调度 analysis Agent 进行漏洞分析 -3. 对高置信度发现,调度 verification Agent 验证 -4. 汇总所有发现,生成最终报告 - -{MULTI_AGENT_RULES} - -## 输出格式 -``` -Thought: [分析和决策过程] -Action: [操作名称] -Action Input: [JSON参数] -``` -""" - -ANALYSIS_SYSTEM_PROMPT = f"""你是 DeepAudit 的漏洞分析 Agent,一个专业的安全分析专家。 - -{CORE_SECURITY_PRINCIPLES} - -{VULNERABILITY_PRIORITIES} - -{TOOL_USAGE_GUIDE} - -## 你的职责 -作为分析层,你负责深度安全分析: -1. 识别代码中的安全漏洞 -2. 追踪数据流和攻击路径 -3. 评估漏洞的严重性和影响 -4. 提供专业的修复建议 - -## 分析策略 - -### ⚠️ 核心原则:外部工具优先! 
- -**必须首先使用外部专业安全工具进行扫描!** 这些工具有经过验证的规则库和更低的误报率。 - -### 第一步:外部工具全面扫描(最重要!)⭐⭐⭐ -**根据项目技术栈,选择并执行以下工具:** - -**所有项目必做:** -- `semgrep_scan`: 使用规则 "p/security-audit" 或 "p/owasp-top-ten" 进行全面扫描 -- `gitleaks_scan`: 检测密钥泄露 - -**Python项目必做:** -- `bandit_scan`: Python专用安全扫描 -- `safety_scan`: 依赖漏洞检查 - -**Node.js项目必做:** -- `npm_audit`: 依赖漏洞检查 - -**大型项目推荐:** -- `kunlun_scan`: Kunlun-M深度代码审计 -- `osv_scan`: 开源漏洞扫描 - -### 第二步:分析外部工具结果 -对外部工具发现的问题进行深入分析: -- 使用 `read_file` 查看完整代码上下文 -- 使用 `dataflow_analysis` 追踪数据流 -- 理解业务逻辑,排除误报 - -### 第三步:补充扫描(仅在需要时) -如果外部工具覆盖不足,使用内置工具补充: -- `smart_scan`: 综合智能扫描 -- `pattern_match`: 正则模式匹配 - -### 第四步:验证和报告 -- 确认漏洞可利用性 -- 评估实际影响 -- 输出结构化的漏洞报告 - -## 输出格式 - -### 中间步骤 -``` -Thought: [分析思考] -Action: [工具名称] -Action Input: {{"参数": "值"}} -``` - -### 最终输出 -``` -Final Answer: {{ - "findings": [ - {{ - "vulnerability_type": "漏洞类型", - "severity": "critical|high|medium|low", - "title": "漏洞标题", - "description": "详细描述", - "file_path": "文件路径", - "line_start": 行号, - "code_snippet": "代码片段", - "source": "污点来源", - "sink": "危险函数", - "suggestion": "修复建议", - "confidence": 0.9 - }} - ], - "summary": "分析总结" -}} -``` -""" - -VERIFICATION_SYSTEM_PROMPT = f"""你是 DeepAudit 的验证 Agent,负责验证分析Agent发现的潜在漏洞。 - -{CORE_SECURITY_PRINCIPLES} - -## 你的职责 -作为验证层,你负责: -1. 验证漏洞是否真实存在 -2. 分析漏洞的可利用性 -3. 评估实际安全影响 -4. 提供最终置信度评估 - -## 验证方法 - -### 1. 外部工具交叉验证 ⭐⭐⭐(推荐!) -使用不同的外部工具验证发现: -- 使用 `semgrep_scan` 配合特定规则验证 -- 使用 `bandit_scan` 交叉确认 Python 漏洞 -- 如果多个工具都报告同一问题,置信度更高 - -### 2. 上下文验证 -- 检查完整的代码上下文 -- 理解数据处理逻辑 -- 验证安全控制是否存在 - -### 3. 数据流验证 -- 追踪从输入到输出的完整路径 -- 识别中间的验证和过滤 -- 确认是否存在有效的安全控制 - -### 4. 配置验证 -- 检查安全配置 -- 验证框架安全特性 -- 评估防护措施 - -### 5. 
沙箱验证(高置信度漏洞) -- 使用 `sandbox_execute` 或漏洞专用测试工具 -- 构造 PoC 验证可利用性 -- 记录验证结果 - -## 输出格式 - -``` -Final Answer: {{ - "verified_findings": [ - {{ - "original_finding": {{...}}, - "is_verified": true/false, - "verification_method": "使用的验证方法", - "cross_tool_results": {{"semgrep": "...", "bandit": "..."}}, - "evidence": "验证证据", - "final_severity": "最终严重程度", - "final_confidence": 0.95, - "poc": "概念验证(如有)", - "remediation": "详细修复建议" - }} - ], - "summary": "验证总结" -}} -``` - -{TOOL_USAGE_GUIDE} -""" - -RECON_SYSTEM_PROMPT = f"""你是 DeepAudit 的侦察 Agent,负责收集和分析项目信息。 - -## 你的职责 -作为侦察层,你负责: -1. 分析项目结构和技术栈 -2. 识别关键入口点 -3. 发现配置文件和敏感区域 -4. **推荐需要使用的外部安全工具** -5. 提供初步风险评估 - -## 侦察目标 - -### 1. 技术栈识别(用于选择外部工具) -- 编程语言和版本 -- Web框架(Django, Flask, FastAPI, Express等) -- 数据库类型 -- 前端框架 -- **根据技术栈推荐外部工具:** - - Python项目 → bandit_scan, safety_scan - - Node.js项目 → npm_audit - - 所有项目 → semgrep_scan, gitleaks_scan - - 大型项目 → kunlun_scan, osv_scan - -### 2. 入口点发现 -- HTTP路由和API端点 -- Websocket处理 -- 定时任务和后台作业 -- 消息队列消费者 - -### 3. 敏感区域定位 -- 认证和授权代码 -- 数据库操作 -- 文件处理 -- 外部服务调用 - -### 4. 配置分析 -- 安全配置 -- 调试设置 -- 密钥管理 - -## 工作方式 -每一步,你需要输出: - -``` -Thought: [分析当前情况,思考需要收集什么信息] -Action: [工具名称] -Action Input: {{"参数1": "值1"}} -``` - -当你完成信息收集后,输出: - -``` -Thought: [总结收集到的所有信息] -Final Answer: [JSON 格式的结果] -``` - -## 输出格式 - -``` -Final Answer: {{ - "project_structure": {{...}}, - "tech_stack": {{ - "languages": [...], - "frameworks": [...], - "databases": [...] - }}, - "recommended_tools": {{ - "must_use": ["semgrep_scan", "gitleaks_scan", ...], - "recommended": ["kunlun_scan", ...], - "reason": "基于项目技术栈的推荐理由" - }}, - "entry_points": [ - {{"type": "...", "file": "...", "line": ..., "method": "..."}} - ], - "high_risk_areas": [ - "文件路径:行号 - 风险描述" - ], - "initial_findings": [ - {{"title": "...", "file_path": "...", "line_start": ..., "description": "..."}} - ], - "summary": "项目侦察总结" -}} -``` - -## ⚠️ 重要输出要求 - -### recommended_tools 格式要求(新增!) 
-**必须**根据项目技术栈推荐外部工具: -- `must_use`: 必须使用的工具列表 -- `recommended`: 推荐使用的工具列表 -- `reason`: 推荐理由 - -### high_risk_areas 格式要求 -每个高风险区域**必须**包含具体的文件路径,格式为: -- `"app.py:36 - SECRET_KEY 硬编码"` -- `"utils/file.py:120 - 使用用户输入构造文件路径"` -- `"api/views.py:45 - SQL 查询使用字符串拼接"` - -**禁止**输出纯描述性文本如 "File write operations with user-controlled paths",必须指明具体文件。 - -### initial_findings 格式要求 -每个发现**必须**包含: -- `title`: 漏洞标题 -- `file_path`: 具体文件路径 -- `line_start`: 行号 -- `description`: 详细描述 - -{TOOL_USAGE_GUIDE} -""" - - -def get_system_prompt(agent_type: str) -> str: - """ - 获取指定Agent类型的系统提示词 - - Args: - agent_type: Agent类型 (orchestrator, analysis, verification, recon) - - Returns: - 系统提示词 - """ - prompts = { - "orchestrator": ORCHESTRATOR_SYSTEM_PROMPT, - "analysis": ANALYSIS_SYSTEM_PROMPT, - "verification": VERIFICATION_SYSTEM_PROMPT, - "recon": RECON_SYSTEM_PROMPT, - } - return prompts.get(agent_type.lower(), ANALYSIS_SYSTEM_PROMPT) - def build_enhanced_prompt( base_prompt: str, @@ -640,39 +316,34 @@ def build_enhanced_prompt( ) -> str: """ 构建增强的提示词 - + Args: base_prompt: 基础提示词 include_principles: 是否包含核心原则 include_priorities: 是否包含漏洞优先级 include_tools: 是否包含工具指南 - + Returns: 增强后的提示词 """ parts = [base_prompt] - + if include_principles: parts.append(CORE_SECURITY_PRINCIPLES) - + if include_priorities: parts.append(VULNERABILITY_PRIORITIES) - + if include_tools: parts.append(TOOL_USAGE_GUIDE) - + return "\n\n".join(parts) __all__ = [ "CORE_SECURITY_PRINCIPLES", - "VULNERABILITY_PRIORITIES", + "VULNERABILITY_PRIORITIES", "TOOL_USAGE_GUIDE", "MULTI_AGENT_RULES", - "ORCHESTRATOR_SYSTEM_PROMPT", - "ANALYSIS_SYSTEM_PROMPT", - "VERIFICATION_SYSTEM_PROMPT", - "RECON_SYSTEM_PROMPT", - "get_system_prompt", "build_enhanced_prompt", ] diff --git a/backend/app/services/rag/indexer.py b/backend/app/services/rag/indexer.py index 168d489..d82ba68 100644 --- a/backend/app/services/rag/indexer.py +++ b/backend/app/services/rag/indexer.py @@ -992,6 +992,8 @@ class CodeIndexer: indexed_file_hashes = await 
self.vector_store.get_file_hashes() indexed_files = set(indexed_file_hashes.keys()) + logger.debug(f"📂 已索引文件数: {len(indexed_files)}, file_hashes: {list(indexed_file_hashes.keys())[:5]}...") + # 收集当前文件 current_files = self._collect_files(directory, exclude_patterns, include_patterns) current_file_map: Dict[str, str] = {} # relative_path -> absolute_path @@ -1002,11 +1004,15 @@ class CodeIndexer: current_file_set = set(current_file_map.keys()) + logger.debug(f"📁 当前文件数: {len(current_file_set)}, 示例: {list(current_file_set)[:5]}...") + # 计算差异 files_to_add = current_file_set - indexed_files files_to_delete = indexed_files - current_file_set files_to_check = current_file_set & indexed_files + logger.debug(f"📊 差异分析: 交集={len(files_to_check)}, 新增候选={len(files_to_add)}, 删除候选={len(files_to_delete)}") + # 检查需要更新的文件(hash 变化) files_to_update: Set[str] = set() for relative_path in files_to_check: diff --git a/backend/app/services/rag/splitter.py b/backend/app/services/rag/splitter.py index 4dbc89e..cb8b672 100644 --- a/backend/app/services/rag/splitter.py +++ b/backend/app/services/rag/splitter.py @@ -92,7 +92,7 @@ class CodeChunk: return len(self.content) // 4 def to_dict(self) -> Dict[str, Any]: - return { + result = { "id": self.id, "content": self.content, "file_path": self.file_path, @@ -110,8 +110,13 @@ class CodeChunk: "definitions": self.definitions, "security_indicators": self.security_indicators, "estimated_tokens": self.estimated_tokens, - "metadata": self.metadata, } + # 将 metadata 中的字段提升到顶级,确保 file_hash 等字段可以被正确检索 + if self.metadata: + for key, value in self.metadata.items(): + if key not in result: + result[key] = value + return result def to_embedding_text(self) -> str: """生成用于嵌入的文本""" @@ -244,20 +249,29 @@ class TreeSitterParser: """从 AST 提取定义""" if tree is None: return [] - + definitions = [] definition_types = self.DEFINITION_TYPES.get(language, {}) - + def traverse(node, parent_name=None): node_type = node.type - + # 检查是否是定义节点 + matched = False for def_category, 
types in definition_types.items(): if node_type in types: name = self._extract_name(node, language) - + + # 根据是否有 parent_name 来区分 function 和 method + actual_category = def_category + if def_category == "function" and parent_name: + actual_category = "method" + elif def_category == "method" and not parent_name: + # 跳过没有 parent 的 method 定义(由 function 类别处理) + continue + definitions.append({ - "type": def_category, + "type": actual_category, "name": name, "parent_name": parent_name, "start_point": node.start_point, @@ -266,17 +280,23 @@ class TreeSitterParser: "end_byte": node.end_byte, "node_type": node_type, }) - + + matched = True + # 对于类,继续遍历子节点找方法 if def_category == "class": for child in node.children: traverse(child, name) return - - # 继续遍历子节点 - for child in node.children: - traverse(child, parent_name) - + + # 匹配到一个类别后就不再匹配其他类别 + break + + # 如果没有匹配到定义,继续遍历子节点 + if not matched: + for child in node.children: + traverse(child, parent_name) + traverse(tree.root_node) return definitions diff --git a/backend/app/services/scanner.py b/backend/app/services/scanner.py index fea8968..abb1698 100644 --- a/backend/app/services/scanner.py +++ b/backend/app/services/scanner.py @@ -9,6 +9,7 @@ from datetime import datetime, timezone from urllib.parse import urlparse, quote from sqlalchemy.ext.asyncio import AsyncSession +from app.utils.repo_utils import parse_repository_url from app.models.audit import AuditTask, AuditIssue from app.models.project import Project from app.services.llm.service import LLMService @@ -149,17 +150,8 @@ async def fetch_file_content(url: str, headers: Dict[str, str] = None) -> Option async def get_github_branches(repo_url: str, token: str = None) -> List[str]: """获取GitHub仓库分支列表""" - match = repo_url.rstrip('/') - if match.endswith('.git'): - match = match[:-4] - if 'github.com/' in match: - parts = match.split('github.com/')[-1].split('/') - if len(parts) >= 2: - owner, repo = parts[0], parts[1] - else: - raise Exception("GitHub 仓库 URL 格式错误") - else: 
- raise Exception("GitHub 仓库 URL 格式错误") + repo_info = parse_repository_url(repo_url, "github") + owner, repo = repo_info['owner'], repo_info['repo'] branches_url = f"https://api.github.com/repos/{owner}/{repo}/branches?per_page=100" branches_data = await github_api(branches_url, token) @@ -172,20 +164,11 @@ async def get_github_branches(repo_url: str, token: str = None) -> List[str]: async def get_gitea_branches(repo_url: str, token: str = None) -> List[str]: """获取Gitea仓库分支列表""" - parsed = urlparse(repo_url) - base = f"{parsed.scheme}://{parsed.netloc}" + repo_info = parse_repository_url(repo_url, "gitea") + base_url = repo_info['base_url'] # This is {base}/api/v1 + owner, repo = repo_info['owner'], repo_info['repo'] - # 提取Owner和Repo: path通常是 /owner/repo.git 或 /owner/repo - path = parsed.path.strip('/') - if path.endswith('.git'): - path = path[:-4] - parts = path.split('/') - if len(parts) < 2: - raise Exception("Gitea 仓库 URL 格式错误") - - owner, repo = parts[0], parts[1] - - branches_url = f"{base}/api/v1/repos/{owner}/{repo}/branches" + branches_url = f"{base_url}/repos/{owner}/{repo}/branches" branches_data = await gitea_api(branches_url, token) return [b["name"] for b in branches_data] @@ -194,7 +177,6 @@ async def get_gitea_branches(repo_url: str, token: str = None) -> List[str]: async def get_gitlab_branches(repo_url: str, token: str = None) -> List[str]: """获取GitLab仓库分支列表""" parsed = urlparse(repo_url) - base = f"{parsed.scheme}://{parsed.netloc}" extracted_token = token if parsed.username: @@ -203,14 +185,11 @@ async def get_gitlab_branches(repo_url: str, token: str = None) -> List[str]: elif parsed.username and not parsed.password: extracted_token = parsed.username - path = parsed.path.strip('/') - if path.endswith('.git'): - path = path[:-4] - if not path: - raise Exception("GitLab 仓库 URL 格式错误") + repo_info = parse_repository_url(repo_url, "gitlab") + base_url = repo_info['base_url'] + project_path = quote(repo_info['project_path'], safe='') - project_path 
= quote(path, safe='') - branches_url = f"{base}/api/v4/projects/{project_path}/repository/branches?per_page=100" + branches_url = f"{base_url}/projects/{project_path}/repository/branches?per_page=100" branches_data = await gitlab_api(branches_url, extracted_token) return [b["name"] for b in branches_data] @@ -219,17 +198,8 @@ async def get_gitlab_branches(repo_url: str, token: str = None) -> List[str]: async def get_github_files(repo_url: str, branch: str, token: str = None, exclude_patterns: List[str] = None) -> List[Dict[str, str]]: """获取GitHub仓库文件列表""" # 解析仓库URL - match = repo_url.rstrip('/') - if match.endswith('.git'): - match = match[:-4] - if 'github.com/' in match: - parts = match.split('github.com/')[-1].split('/') - if len(parts) >= 2: - owner, repo = parts[0], parts[1] - else: - raise Exception("GitHub 仓库 URL 格式错误") - else: - raise Exception("GitHub 仓库 URL 格式错误") + repo_info = parse_repository_url(repo_url, "github") + owner, repo = repo_info['owner'], repo_info['repo'] # 获取仓库文件树 tree_url = f"https://api.github.com/repos/{owner}/{repo}/git/trees/{quote(branch)}?recursive=1" @@ -251,7 +221,6 @@ async def get_github_files(repo_url: str, branch: str, token: str = None, exclud async def get_gitlab_files(repo_url: str, branch: str, token: str = None, exclude_patterns: List[str] = None) -> List[Dict[str, str]]: """获取GitLab仓库文件列表""" parsed = urlparse(repo_url) - base = f"{parsed.scheme}://{parsed.netloc}" # 从URL中提取token(如果存在) extracted_token = token @@ -262,16 +231,12 @@ async def get_gitlab_files(repo_url: str, branch: str, token: str = None, exclud extracted_token = parsed.username # 解析项目路径 - path = parsed.path.strip('/') - if path.endswith('.git'): - path = path[:-4] - if not path: - raise Exception("GitLab 仓库 URL 格式错误") - - project_path = quote(path, safe='') + repo_info = parse_repository_url(repo_url, "gitlab") + base_url = repo_info['base_url'] # {base}/api/v4 + project_path = quote(repo_info['project_path'], safe='') # 获取仓库文件树 - tree_url = 
f"{base}/api/v4/projects/{project_path}/repository/tree?ref={quote(branch)}&recursive=true&per_page=100" + tree_url = f"{base_url}/projects/{project_path}/repository/tree?ref={quote(branch)}&recursive=true&per_page=100" tree_data = await gitlab_api(tree_url, extracted_token) files = [] @@ -279,7 +244,7 @@ async def get_gitlab_files(repo_url: str, branch: str, token: str = None, exclud if item.get("type") == "blob" and is_text_file(item["path"]) and not should_exclude(item["path"], exclude_patterns): files.append({ "path": item["path"], - "url": f"{base}/api/v4/projects/{project_path}/repository/files/{quote(item['path'], safe='')}/raw?ref={quote(branch)}", + "url": f"{base_url}/projects/{project_path}/repository/files/{quote(item['path'], safe='')}/raw?ref={quote(branch)}", "token": extracted_token }) @@ -289,40 +254,23 @@ async def get_gitlab_files(repo_url: str, branch: str, token: str = None, exclud async def get_gitea_files(repo_url: str, branch: str, token: str = None, exclude_patterns: List[str] = None) -> List[Dict[str, str]]: """获取Gitea仓库文件列表""" - parsed = urlparse(repo_url) - base = f"{parsed.scheme}://{parsed.netloc}" - - path = parsed.path.strip('/') - if path.endswith('.git'): - path = path[:-4] - parts = path.split('/') - if len(parts) < 2: - raise Exception("Gitea 仓库 URL 格式错误") - - owner, repo = parts[0], parts[1] + repo_info = parse_repository_url(repo_url, "gitea") + base_url = repo_info['base_url'] + owner, repo = repo_info['owner'], repo_info['repo'] # Gitea tree API: GET /repos/{owner}/{repo}/git/trees/{sha}?recursive=1 # 可以直接使用分支名作为sha - tree_url = f"{base}/api/v1/repos/{owner}/{repo}/git/trees/{quote(branch)}?recursive=1" + tree_url = f"{base_url}/repos/{owner}/{repo}/git/trees/{quote(branch)}?recursive=1" tree_data = await gitea_api(tree_url, token) files = [] for item in tree_data.get("tree", []): # Gitea API returns 'type': 'blob' for files if item.get("type") == "blob" and is_text_file(item["path"]) and not should_exclude(item["path"], 
exclude_patterns): - # Gitea raw file URL: {base}/{owner}/{repo}/raw/branch/{branch}/{path} - # 或者 API: /repos/{owner}/{repo}/contents/{filepath}?ref={branch} (get content, base64) - # 这里使用 raw URL 可能会更方便,但要注意私有仓库可能需要token访问raw - # Gitea raw URL usually works with token in header or query param. - # Standard Gitea: GET /repos/{owner}/{repo}/raw/{filepath}?ref={branch} (API) returns raw content? - # Actually Gitea raw url: {base}/{owner}/{repo}/raw/branch/{branch}/{path} or /raw/tag or /raw/commit - - # 使用API raw endpoint: GET /repos/{owner}/{repo}/raw/{filepath}?ref={branch} ==> 实际是 /repos/{owner}/{repo}/raw/{path} (ref通过query param?) - # 查阅文档,Gitea API v1 /repos/{owner}/{repo}/raw/{filepath} 接受 ref query param - # URL: {base}/api/v1/repos/{owner}/{repo}/raw/{quote(item['path'])}?ref={branch} + # 使用API raw endpoint: GET /repos/{owner}/{repo}/raw/{filepath}?ref={branch} files.append({ "path": item["path"], - "url": f"{base}/api/v1/repos/{owner}/{repo}/raw/{quote(item['path'])}?ref={quote(branch)}", + "url": f"{base_url}/repos/{owner}/{repo}/raw/{quote(item['path'])}?ref={quote(branch)}", "token": token # 传递token以便fetch_file_content使用 }) @@ -482,11 +430,11 @@ async def scan_repo_task(task_id: str, db_session_factory, user_config: dict = N # 使用提取的 token 或用户配置的 token if repo_type == "gitlab": - token_to_use = extracted_token or gitlab_token + token_to_use = file_info.get('token') or gitlab_token if token_to_use: headers["PRIVATE-TOKEN"] = token_to_use elif repo_type == "gitea": - token_to_use = extracted_token or gitea_token + token_to_use = file_info.get('token') or gitea_token if token_to_use: headers["Authorization"] = f"token {token_to_use}" elif repo_type == "github": diff --git a/backend/app/utils/__init__.py b/backend/app/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/backend/app/utils/repo_utils.py b/backend/app/utils/repo_utils.py new file mode 100644 index 0000000..58246df --- /dev/null +++ b/backend/app/utils/repo_utils.py @@ -0,0 
+1,77 @@ +from urllib.parse import urlparse, urlunparse +from typing import Dict, Optional + +def parse_repository_url(repo_url: str, repo_type: str) -> Dict[str, str]: + """ + Parses a repository URL and returns its components. + + Args: + repo_url: The repository URL. + repo_type: The type of repository ('github', 'gitlab', 'gitea'). + + Returns: + A dictionary containing parsed components: + - base_url: The API base URL (for self-hosted instances) or default API URL. + - owner: The owner/namespace of the repository. + - repo: The repository name. + - server_url: The base URL of the server (scheme + netloc). + + Raises: + ValueError: If the URL is invalid or schema/domain check fails. + """ + if not repo_url: + raise ValueError(f"{repo_type} 仓库 URL 不能为空") + + # Basic sanitization + repo_url = repo_url.strip() + + # Check scheme to prevent SSRF (only allow http and https) + parsed = urlparse(repo_url) + if parsed.scheme not in ('http', 'https'): + raise ValueError(f"{repo_type} 仓库 URL 必须使用 http 或 https 协议") + + # Remove .git suffix if present + path = parsed.path.strip('/') + if path.endswith('.git'): + path = path[:-4] + + path_parts = path.split('/') + if len(path_parts) < 2: + raise ValueError(f"{repo_type} 仓库 URL 格式错误") + + base = f"{parsed.scheme}://{parsed.netloc}" + + if repo_type == "github": + # Handle github.com specifically if needed, or assume path_parts are owner/repo + # Case: https://github.com/owner/repo + if 'github.com' in parsed.netloc: + owner, repo = path_parts[-2], path_parts[-1] + api_base = "https://api.github.com" + else: + # Enterprise GitHub or similar? 
+ owner, repo = path_parts[-2], path_parts[-1] + api_base = f"{base}/api/v3" # Assumption for GHE + + elif repo_type == "gitlab": + # GitLab supports subgroups, so path could be group/subgroup/repo + # But commonly we just need project path (URL encoded) + # We'll treat the full path as the project path identifier + repo = path_parts[-1] + owner = "/".join(path_parts[:-1]) + api_base = f"{base}/api/v4" + + elif repo_type == "gitea": + # Gitea: /owner/repo + owner, repo = path_parts[0], path_parts[1] + api_base = f"{base}/api/v1" + + else: + raise ValueError(f"不支持的仓库类型: {repo_type}") + + return { + "base_url": api_base, + "owner": owner, + "repo": repo, + "project_path": path, # Useful for GitLab + "server_url": base + } diff --git a/backend/docker-entrypoint.sh b/backend/docker-entrypoint.sh new file mode 100644 index 0000000..280cd5b --- /dev/null +++ b/backend/docker-entrypoint.sh @@ -0,0 +1,53 @@ +#!/bin/bash +set -e + +echo "🚀 DeepAudit 后端启动中..." + +# 等待 PostgreSQL 就绪 +echo "⏳ 等待数据库连接..." +max_retries=30 +retry_count=0 + +while [ $retry_count -lt $max_retries ]; do + if .venv/bin/python -c " +import asyncio +from sqlalchemy.ext.asyncio import create_async_engine +import os + +async def check_db(): + engine = create_async_engine(os.environ.get('DATABASE_URL', '')) + try: + async with engine.connect() as conn: + await conn.execute(text('SELECT 1')) + return True + except Exception: + return False + finally: + await engine.dispose() + +from sqlalchemy import text +exit(0 if asyncio.run(check_db()) else 1) +" 2>/dev/null; then + echo "✅ 数据库连接成功" + break + fi + + retry_count=$((retry_count + 1)) + echo " 重试 $retry_count/$max_retries..." + sleep 2 +done + +if [ $retry_count -eq $max_retries ]; then + echo "❌ 无法连接到数据库,请检查 DATABASE_URL 配置" + exit 1 +fi + +# 运行数据库迁移 +echo "📦 执行数据库迁移..." +.venv/bin/alembic upgrade head + +echo "✅ 数据库迁移完成" + +# 启动 uvicorn +echo "🌐 启动 API 服务..." 
+exec .venv/bin/uvicorn app.main:app --host 0.0.0.0 --port 8000 diff --git a/backend/pyproject.toml b/backend/pyproject.toml index 0b6187d..3424446 100644 --- a/backend/pyproject.toml +++ b/backend/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "deepaudit-backend" -version = "3.0.0" +version = "3.0.1" description = "DeepAudit Backend API - AI-Powered Code Security Audit Platform" requires-python = ">=3.11" readme = "README.md" @@ -202,7 +202,7 @@ exclude_lines = [ "if TYPE_CHECKING:", ] -# ============ UV Configuration ============ +# ============ Dependency Groups (PEP 735) ============ [dependency-groups] dev = [ diff --git a/docker-compose.prod.cn.yml b/docker-compose.prod.cn.yml new file mode 100644 index 0000000..6b756b3 --- /dev/null +++ b/docker-compose.prod.cn.yml @@ -0,0 +1,111 @@ +# ============================================= +# DeepAudit v3.0.0 生产环境一键部署配置(国内加速版) +# ============================================= +# 使用南京大学镜像站加速拉取 GHCR 镜像 +# 部署命令: curl -fsSL https://raw.githubusercontent.com/lintsinghua/DeepAudit/main/docker-compose.prod.cn.yml | docker compose -f - up -d +# +# 镜像加速说明: +# - 原始地址:ghcr.io +# - 加速地址:ghcr.nju.edu.cn(南京大学开源镜像站) + +services: + db: + image: postgres:15-alpine + restart: unless-stopped + volumes: + - postgres_data:/var/lib/postgresql/data + environment: + - POSTGRES_USER=postgres + - POSTGRES_PASSWORD=postgres + - POSTGRES_DB=deepaudit + healthcheck: + test: ["CMD-SHELL", "pg_isready -U postgres"] + interval: 5s + timeout: 5s + retries: 5 + networks: + - deepaudit-network + + redis: + image: redis:7-alpine + restart: unless-stopped + volumes: + - redis_data:/data + healthcheck: + test: ["CMD", "redis-cli", "ping"] + interval: 10s + timeout: 5s + retries: 5 + networks: + - deepaudit-network + + backend: + image: ghcr.nju.edu.cn/lintsinghua/deepaudit-backend:latest + restart: unless-stopped + volumes: + - backend_uploads:/app/uploads + - /var/run/docker.sock:/var/run/docker.sock + ports: + - "8000:8000" + environment: + - 
DATABASE_URL=postgresql+asyncpg://postgres:postgres@db:5432/deepaudit + - REDIS_URL=redis://redis:6379/0 + - AGENT_ENABLED=true + - SANDBOX_ENABLED=true + - SANDBOX_IMAGE=ghcr.nju.edu.cn/lintsinghua/deepaudit-sandbox:latest + # LLM 配置 - 请根据需要修改 + - LLM_PROVIDER=${LLM_PROVIDER:-openai} + - LLM_MODEL=${LLM_MODEL:-gpt-4o} + - LLM_API_KEY=${LLM_API_KEY:-your-api-key-here} + - LLM_BASE_URL=${LLM_BASE_URL:-} + # 禁用代理 + - HTTP_PROXY= + - HTTPS_PROXY= + - NO_PROXY=* + depends_on: + db: + condition: service_healthy + redis: + condition: service_healthy + db-migrate: + condition: service_completed_successfully + networks: + - deepaudit-network + + # 数据库迁移服务 - 在后端启动前自动执行 + db-migrate: + image: ghcr.nju.edu.cn/lintsinghua/deepaudit-backend:latest + restart: "no" + environment: + - DATABASE_URL=postgresql+asyncpg://postgres:postgres@db:5432/deepaudit + command: [".venv/bin/alembic", "upgrade", "head"] + depends_on: + db: + condition: service_healthy + networks: + - deepaudit-network + + frontend: + image: ghcr.nju.edu.cn/lintsinghua/deepaudit-frontend:latest + restart: unless-stopped + ports: + - "3000:80" + depends_on: + - backend + networks: + - deepaudit-network + + # 预拉取沙箱镜像(后端会按需调用) + sandbox-pull: + image: ghcr.nju.edu.cn/lintsinghua/deepaudit-sandbox:latest + restart: "no" + command: echo "Sandbox image ready" + +networks: + deepaudit-network: + driver: bridge + +volumes: + postgres_data: + backend_uploads: + redis_data: diff --git a/docker-compose.prod.yml b/docker-compose.prod.yml new file mode 100644 index 0000000..2d7baa9 --- /dev/null +++ b/docker-compose.prod.yml @@ -0,0 +1,107 @@ +# ============================================= +# DeepAudit v3.0.0 生产环境一键部署配置 +# ============================================= +# 使用预构建的 GHCR 镜像,无需本地构建 +# 部署命令: curl -fsSL https://raw.githubusercontent.com/lintsinghua/DeepAudit/main/docker-compose.prod.yml | docker compose -f - up -d + +services: + db: + image: postgres:15-alpine + restart: unless-stopped + volumes: + - 
postgres_data:/var/lib/postgresql/data + environment: + - POSTGRES_USER=postgres + - POSTGRES_PASSWORD=postgres + - POSTGRES_DB=deepaudit + healthcheck: + test: ["CMD-SHELL", "pg_isready -U postgres"] + interval: 5s + timeout: 5s + retries: 5 + networks: + - deepaudit-network + + redis: + image: redis:7-alpine + restart: unless-stopped + volumes: + - redis_data:/data + healthcheck: + test: ["CMD", "redis-cli", "ping"] + interval: 10s + timeout: 5s + retries: 5 + networks: + - deepaudit-network + + backend: + image: ghcr.io/lintsinghua/deepaudit-backend:latest + restart: unless-stopped + volumes: + - backend_uploads:/app/uploads + - /var/run/docker.sock:/var/run/docker.sock + ports: + - "8000:8000" + environment: + - DATABASE_URL=postgresql+asyncpg://postgres:postgres@db:5432/deepaudit + - REDIS_URL=redis://redis:6379/0 + - AGENT_ENABLED=true + - SANDBOX_ENABLED=true + - SANDBOX_IMAGE=ghcr.io/lintsinghua/deepaudit-sandbox:latest + # LLM 配置 - 请根据需要修改 + - LLM_PROVIDER=${LLM_PROVIDER:-openai} + - LLM_MODEL=${LLM_MODEL:-gpt-4o} + - LLM_API_KEY=${LLM_API_KEY:-your-api-key-here} + - LLM_BASE_URL=${LLM_BASE_URL:-} + # 禁用代理 + - HTTP_PROXY= + - HTTPS_PROXY= + - NO_PROXY=* + depends_on: + db: + condition: service_healthy + redis: + condition: service_healthy + db-migrate: + condition: service_completed_successfully + networks: + - deepaudit-network + + # 数据库迁移服务 - 在后端启动前自动执行 + db-migrate: + image: ghcr.io/lintsinghua/deepaudit-backend:latest + restart: "no" + environment: + - DATABASE_URL=postgresql+asyncpg://postgres:postgres@db:5432/deepaudit + command: [".venv/bin/alembic", "upgrade", "head"] + depends_on: + db: + condition: service_healthy + networks: + - deepaudit-network + + frontend: + image: ghcr.io/lintsinghua/deepaudit-frontend:latest + restart: unless-stopped + ports: + - "3000:80" + depends_on: + - backend + networks: + - deepaudit-network + + # 预拉取沙箱镜像(后端会按需调用) + sandbox-pull: + image: ghcr.io/lintsinghua/deepaudit-sandbox:latest + restart: "no" + command: echo 
"Sandbox image ready" + +networks: + deepaudit-network: + driver: bridge + +volumes: + postgres_data: + backend_uploads: + redis_data: diff --git a/docker-compose.yml b/docker-compose.yml index d643c0b..9b16ae2 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -80,6 +80,9 @@ services: - all_proxy= - ALL_PROXY= restart: unless-stopped + volumes: + - ./frontend/dist:/usr/share/nginx/html:ro # 挂载构建产物,本地 pnpm build 后自动生效 + - ./frontend/nginx.conf:/etc/nginx/conf.d/default.conf:ro # 挂载 nginx 配置 ports: - "3000:80" # Nginx 监听 80 端口 environment: @@ -110,14 +113,13 @@ services: - deepaudit-network # 沙箱镜像构建服务 (漏洞验证必须) - # 注意: 此服务仅用于构建镜像,不会持续运行 + # 注意: 此服务仅用于构建镜像,构建完成后自动退出 sandbox: build: context: ./docker/sandbox dockerfile: Dockerfile image: deepaudit/sandbox:latest - profiles: - - build-only + restart: "no" command: echo "Sandbox image built successfully" networks: diff --git a/frontend/Dockerfile b/frontend/Dockerfile index 3be62b4..1d40b48 100644 --- a/frontend/Dockerfile +++ b/frontend/Dockerfile @@ -3,7 +3,7 @@ # ============================================= # 使用 Nginx 提供静态文件和反向代理 (支持 SSE 流式传输) -FROM node:20-alpine AS builder +FROM node:20-slim AS builder WORKDIR /app @@ -25,7 +25,10 @@ RUN npm config set registry https://registry.npmmirror.com && \ # 复制依赖文件 COPY package.json pnpm-lock.yaml ./ -RUN pnpm install --no-frozen-lockfile +# 增加网络超时设置和并发数限制,防止 ARM 架构构建卡死 +RUN pnpm config set network-timeout 300000 && \ + pnpm config set fetch-retries 5 && \ + pnpm install --no-frozen-lockfile --network-concurrency 1 # 复制源代码 COPY . . 
diff --git a/frontend/docker-entrypoint.sh b/frontend/docker-entrypoint.sh index b082397..264f67e 100644 --- a/frontend/docker-entrypoint.sh +++ b/frontend/docker-entrypoint.sh @@ -9,7 +9,8 @@ echo "Injecting API URL: $API_URL" # 在所有 JS 文件中替换占位符 # 注意:这里路径必须是 nginx 实际存放文件的路径 -find /usr/share/nginx/html -name '*.js' -exec sed -i "s|__API_BASE_URL__|${API_URL}|g" {} \; +ESCAPED_API_URL=$(echo "${API_URL}" | sed 's/[&/|]/\\&/g') +find /usr/share/nginx/html -name '*.js' -exec sed -i "s|__API_BASE_URL__|${ESCAPED_API_URL}|g" {} \; # 执行原始命令 exec "$@" diff --git a/frontend/package.json b/frontend/package.json index 5ef44d3..6d039a6 100644 --- a/frontend/package.json +++ b/frontend/package.json @@ -1,6 +1,6 @@ { "name": "deep-audit", - "version": "3.0.0", + "version": "3.0.1", "type": "module", "scripts": { "dev": "vite", diff --git a/frontend/src/components/agent/EmbeddingConfig.tsx b/frontend/src/components/agent/EmbeddingConfig.tsx index a4f1c98..cbbda76 100644 --- a/frontend/src/components/agent/EmbeddingConfig.tsx +++ b/frontend/src/components/agent/EmbeddingConfig.tsx @@ -46,6 +46,7 @@ interface EmbeddingProvider { interface EmbeddingConfig { provider: string; model: string; + api_key: string | null; base_url: string | null; dimensions: number; batch_size: number; @@ -79,15 +80,15 @@ export default function EmbeddingConfigPanel() { loadData(); }, []); - // 当 provider 改变时更新模型 - useEffect(() => { - if (selectedProvider) { - const provider = providers.find((p) => p.id === selectedProvider); - if (provider) { - setSelectedModel(provider.default_model); - } + // 用户手动切换 provider 时更新为默认模型 + const handleProviderChange = (newProvider: string) => { + setSelectedProvider(newProvider); + // 切换 provider 时重置为该 provider 的默认模型 + const provider = providers.find((p) => p.id === newProvider); + if (provider) { + setSelectedModel(provider.default_model); } - }, [selectedProvider, providers]); + }; const loadData = async () => { try { @@ -104,6 +105,7 @@ export default function 
EmbeddingConfigPanel() { if (configRes.data) { setSelectedProvider(configRes.data.provider); setSelectedModel(configRes.data.model); + setApiKey(configRes.data.api_key || ""); setBaseUrl(configRes.data.base_url || ""); setBatchSize(configRes.data.batch_size); } @@ -230,7 +232,7 @@ export default function EmbeddingConfigPanel() { {/* 提供商选择 */}
- diff --git a/frontend/src/pages/AgentAudit/components/StatsPanel.tsx b/frontend/src/pages/AgentAudit/components/StatsPanel.tsx index 2b109b5..98956da 100644 --- a/frontend/src/pages/AgentAudit/components/StatsPanel.tsx +++ b/frontend/src/pages/AgentAudit/components/StatsPanel.tsx @@ -133,11 +133,20 @@ export const StatsPanel = memo(function StatsPanel({ task, findings }: StatsPane {/* File progress */}
- Files analyzed + Files scanned {task.analyzed_files}/{task.total_files}
+ {/* Files with findings */} + {task.files_with_findings > 0 && ( +
+ Files with findings + + {task.files_with_findings} + +
+ )}
{/* Metrics Grid */} diff --git a/frontend/src/shared/api/agentTasks.ts b/frontend/src/shared/api/agentTasks.ts index 777caf1..f8e28af 100644 --- a/frontend/src/shared/api/agentTasks.ts +++ b/frontend/src/shared/api/agentTasks.ts @@ -21,6 +21,7 @@ export interface AgentTask { total_files: number; indexed_files: number; analyzed_files: number; + files_with_findings: number; // 有漏洞发现的文件数 total_chunks: number; findings_count: number; verified_count: number; @@ -128,6 +129,7 @@ export interface AgentTaskSummary { total_files: number; indexed_files: number; analyzed_files: number; + files_with_findings: number; total_chunks: number; findings_count: number; verified_count: number;