From de88b69f869ef1dee3eae2fe33424ef0b6b4f6d8 Mon Sep 17 00:00:00 2001 From: vinland100 Date: Fri, 9 Jan 2026 16:41:40 +0800 Subject: [PATCH] The fast scan mode follows the file exclusion pattern used during RAG embedding. --- backend/app/api/v1/endpoints/agent_tasks.py | 2 +- backend/app/api/v1/endpoints/scan.py | 10 ++---- backend/app/services/scanner.py | 40 ++++----------------- 3 files changed, 11 insertions(+), 41 deletions(-) diff --git a/backend/app/api/v1/endpoints/agent_tasks.py b/backend/app/api/v1/endpoints/agent_tasks.py index e9ccfa2..5fbf5d3 100644 --- a/backend/app/api/v1/endpoints/agent_tasks.py +++ b/backend/app/api/v1/endpoints/agent_tasks.py @@ -1044,7 +1044,7 @@ async def _collect_project_info( continue # 使用通用的筛选逻辑 - if should_exclude(relative_path, f, exclude_patterns): + if not is_text_file(f) or should_exclude(relative_path, f, exclude_patterns): continue info["file_count"] += 1 diff --git a/backend/app/api/v1/endpoints/scan.py b/backend/app/api/v1/endpoints/scan.py index 73c8f1f..24f64aa 100644 --- a/backend/app/api/v1/endpoints/scan.py +++ b/backend/app/api/v1/endpoints/scan.py @@ -23,6 +23,7 @@ from app.services.llm.service import LLMService from app.services.scanner import task_control, is_text_file, should_exclude, get_language_from_path, get_analysis_config from app.services.zip_storage import load_project_zip, save_project_zip, has_project_zip from app.core.config import settings +from app.core.file_filter import EXCLUDE_DIRS router = APIRouter() @@ -36,12 +37,7 @@ def normalize_path(path: str) -> str: return path.replace("\\", "/") -# 支持的文件扩展名 -TEXT_EXTENSIONS = [ - ".js", ".ts", ".tsx", ".jsx", ".py", ".java", ".go", ".rs", - ".cpp", ".c", ".h", ".cc", ".hh", ".cs", ".php", ".rb", - ".kt", ".swift", ".sql", ".sh", ".json", ".yml", ".yaml" -] +# TEXT_EXTENSIONS has been moved to app.core.file_filter and is used via app.services.scanner.is_text_file async def process_zip_task(task_id: str, file_path: str, db_session_factory, user_config: dict = None): @@ -74,7 +70,7 @@ async def process_zip_task(task_id: str, file_path: str, db_session_factory, use files_to_scan = [] for root, dirs, files in os.walk(extract_dir): # 排除常见非代码目录 - dirs[:] = [d for d in dirs if d not in ['node_modules', '__pycache__', '.git', 'dist', 'build', 'vendor']] + dirs[:] = [d for d in dirs if d not in EXCLUDE_DIRS] for file in files: full_path = Path(root) / file diff --git a/backend/app/services/scanner.py b/backend/app/services/scanner.py index 7041d0e..9eb23a7 100644 --- a/backend/app/services/scanner.py +++ b/backend/app/services/scanner.py @@ -2,6 +2,7 @@ 仓库扫描服务 - 支持GitHub, GitLab 和 Gitea 仓库扫描 """ +import os import asyncio import httpx from typing import List, Dict, Any, Optional @@ -14,6 +15,7 @@ from app.models.audit import AuditTask, AuditIssue from app.models.project import Project from app.services.llm.service import LLMService from app.core.config import settings +from app.core.file_filter import is_text_file as core_is_text_file, should_exclude as core_should_exclude, TEXT_EXTENSIONS as CORE_TEXT_EXTENSIONS def get_analysis_config(user_config: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: @@ -35,46 +37,18 @@ def get_analysis_config(user_config: Optional[Dict[str, Any]] = None) -> Dict[st } -# 支持的文本文件扩展名 -TEXT_EXTENSIONS = [ - ".py", ".js", ".ts", ".tsx", ".jsx", ".java", ".go", ".rs", - ".cpp", ".c", ".h", ".cc", ".hh", ".hpp", ".hxx", ".cs", ".php", ".rb", - ".kt", ".swift", ".dart", ".scala", ".sc", ".groovy", ".ktm", ".kts", - ".cshtml", ".vb", ".fs", ".fsi", ".fsx", ".sln", ".csproj", ".vbproj", - ".fsproj", ".config", ".asax", ".master", ".ascx", ".asmx", ".svc", - ".json", ".yml", ".yaml", ".toml", ".xml", ".properties", ".conf", ".ini", - ".sh", ".bash", ".zsh", ".fish", ".ps1", ".bat", ".cmd", ".sql", ".pl", ".pm", ".t", - ".html", ".css", ".vue", ".svelte", ".md", ".proto", ".graphql", ".gql", - ".prisma", ".sol", ".dockerfile", ".tf", ".hcl", ".tfvars", - ".lua", ".hs", ".lhs", ".clj", ".cljs", ".cljc", ".edn", ".ex", ".exs", - ".erl", ".hrl", ".m", ".mm", ".r", ".rmd" -] - -# 排除的目录和文件模式 -EXCLUDE_PATTERNS = [ - # 常用目录 - "node_modules/", "vendor/", "dist/", "build/", "target/", "out/", "bin/", "obj/", - ".git/", ".svn/", ".hg/", ".vscode/", ".idea/", ".vs/", ".settings/", - ".gradle/", ".m2/", "venv/", "env/", ".env/", "__pycache__/", - ".pytest_cache/", "coverage/", ".nyc_output/", "bower_components/", - "packages/", "pkg/", "Pods/", "TestResults/", "_ReSharper.*", - # 常见锁文件与二进制 - "package-lock.json", "yarn.lock", "pnpm-lock.yaml", "Cargo.lock", - "poetry.lock", "composer.lock", "Gemfile.lock", "gradle.lockfile", - ".min.js", ".min.css", ".map", ".DS_Store", "*.pdb", "*.dll", "*.exe", - "*.o", "*.obj", "*.a", "*.lib", "*.jar", "*.war", "*.class" -] - +# 支持的文本文件扩展名使用全局定义 +TEXT_EXTENSIONS = list(CORE_TEXT_EXTENSIONS) def is_text_file(path: str) -> bool: """检查是否为文本文件""" - return any(path.lower().endswith(ext) for ext in TEXT_EXTENSIONS) + return core_is_text_file(path) def should_exclude(path: str, exclude_patterns: List[str] = None) -> bool: """检查是否应该排除该文件""" - all_patterns = EXCLUDE_PATTERNS + (exclude_patterns or []) - return any(pattern in path for pattern in all_patterns) + filename = os.path.basename(path) + return core_should_exclude(path, filename, exclude_patterns) def get_language_from_path(path: str) -> str: