From de88b69f869ef1dee3eae2fe33424ef0b6b4f6d8 Mon Sep 17 00:00:00 2001
From: vinland100 <wuyanbo5210@qq.com>
Date: Fri, 9 Jan 2026 16:41:40 +0800
Subject: [PATCH] Make fast scan mode follow the file exclusion patterns used
 during RAG embedding

---
 backend/app/api/v1/endpoints/agent_tasks.py |  2 +-
 backend/app/api/v1/endpoints/scan.py        | 10 ++----
 backend/app/services/scanner.py             | 40 ++++-----------------
 3 files changed, 11 insertions(+), 41 deletions(-)

diff --git a/backend/app/api/v1/endpoints/agent_tasks.py b/backend/app/api/v1/endpoints/agent_tasks.py
index e9ccfa2..5fbf5d3 100644
--- a/backend/app/api/v1/endpoints/agent_tasks.py
+++ b/backend/app/api/v1/endpoints/agent_tasks.py
@@ -1044,7 +1044,7 @@ async def _collect_project_info(
                     continue
                 
                 # 使用通用的筛选逻辑
-                if should_exclude(relative_path, f, exclude_patterns):
+                if not is_text_file(f) or should_exclude(relative_path, f, exclude_patterns):
                     continue
                 
                 info["file_count"] += 1
diff --git a/backend/app/api/v1/endpoints/scan.py b/backend/app/api/v1/endpoints/scan.py
index 73c8f1f..24f64aa 100644
--- a/backend/app/api/v1/endpoints/scan.py
+++ b/backend/app/api/v1/endpoints/scan.py
@@ -23,6 +23,7 @@ from app.services.llm.service import LLMService
 from app.services.scanner import task_control, is_text_file, should_exclude, get_language_from_path, get_analysis_config
 from app.services.zip_storage import load_project_zip, save_project_zip, has_project_zip
 from app.core.config import settings
+from app.core.file_filter import EXCLUDE_DIRS
 
 router = APIRouter()
 
@@ -36,12 +37,7 @@ def normalize_path(path: str) -> str:
     return path.replace("\\", "/")
 
 
-# 支持的文件扩展名
-TEXT_EXTENSIONS = [
-    ".js", ".ts", ".tsx", ".jsx", ".py", ".java", ".go", ".rs",
-    ".cpp", ".c", ".h", ".cc", ".hh", ".cs", ".php", ".rb",
-    ".kt", ".swift", ".sql", ".sh", ".json", ".yml", ".yaml"
-]
+# TEXT_EXTENSIONS has been moved to app.core.file_filter and is used via app.services.scanner.is_text_file
 
 
 async def process_zip_task(task_id: str, file_path: str, db_session_factory, user_config: dict = None):
@@ -74,7 +70,7 @@ async def process_zip_task(task_id: str, file_path: str, db_session_factory, use
             files_to_scan = []
             for root, dirs, files in os.walk(extract_dir):
                 # 排除常见非代码目录
-                dirs[:] = [d for d in dirs if d not in ['node_modules', '__pycache__', '.git', 'dist', 'build', 'vendor']]
+                dirs[:] = [d for d in dirs if d not in EXCLUDE_DIRS]
                 
                 for file in files:
                     full_path = Path(root) / file
diff --git a/backend/app/services/scanner.py b/backend/app/services/scanner.py
index 7041d0e..9eb23a7 100644
--- a/backend/app/services/scanner.py
+++ b/backend/app/services/scanner.py
@@ -2,6 +2,7 @@
 仓库扫描服务 - 支持GitHub, GitLab 和 Gitea 仓库扫描
 """
 
+import os
 import asyncio
 import httpx
 from typing import List, Dict, Any, Optional
@@ -14,6 +15,7 @@ from app.models.audit import AuditTask, AuditIssue
 from app.models.project import Project
 from app.services.llm.service import LLMService
 from app.core.config import settings
+from app.core.file_filter import is_text_file as core_is_text_file, should_exclude as core_should_exclude, TEXT_EXTENSIONS as CORE_TEXT_EXTENSIONS
 
 
 def get_analysis_config(user_config: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
@@ -35,46 +37,18 @@ def get_analysis_config(user_config: Optional[Dict[str, Any]] = None) -> Dict[st
     }
 
 
-# 支持的文本文件扩展名
-TEXT_EXTENSIONS = [
-    ".py", ".js", ".ts", ".tsx", ".jsx", ".java", ".go", ".rs",
-    ".cpp", ".c", ".h", ".cc", ".hh", ".hpp", ".hxx", ".cs", ".php", ".rb",
-    ".kt", ".swift", ".dart", ".scala", ".sc", ".groovy", ".ktm", ".kts",
-    ".cshtml", ".vb", ".fs", ".fsi", ".fsx", ".sln", ".csproj", ".vbproj",
-    ".fsproj", ".config", ".asax", ".master", ".ascx", ".asmx", ".svc",
-    ".json", ".yml", ".yaml", ".toml", ".xml", ".properties", ".conf", ".ini",
-    ".sh", ".bash", ".zsh", ".fish", ".ps1", ".bat", ".cmd", ".sql", ".pl", ".pm", ".t",
-    ".html", ".css", ".vue", ".svelte", ".md", ".proto", ".graphql", ".gql",
-    ".prisma", ".sol", ".dockerfile", ".tf", ".hcl", ".tfvars",
-    ".lua", ".hs", ".lhs", ".clj", ".cljs", ".cljc", ".edn", ".ex", ".exs",
-    ".erl", ".hrl", ".m", ".mm", ".r", ".rmd"
-]
-
-# 排除的目录和文件模式
-EXCLUDE_PATTERNS = [
-    # 常用目录
-    "node_modules/", "vendor/", "dist/", "build/", "target/", "out/", "bin/", "obj/",
-    ".git/", ".svn/", ".hg/", ".vscode/", ".idea/", ".vs/", ".settings/",
-    ".gradle/", ".m2/", "venv/", "env/", ".env/", "__pycache__/",
-    ".pytest_cache/", "coverage/", ".nyc_output/", "bower_components/",
-    "packages/", "pkg/", "Pods/", "TestResults/", "_ReSharper.*",
-    # 常见锁文件与二进制
-    "package-lock.json", "yarn.lock", "pnpm-lock.yaml", "Cargo.lock",
-    "poetry.lock", "composer.lock", "Gemfile.lock", "gradle.lockfile",
-    ".min.js", ".min.css", ".map", ".DS_Store", "*.pdb", "*.dll", "*.exe",
-    "*.o", "*.obj", "*.a", "*.lib", "*.jar", "*.war", "*.class"
-]
-
+# 支持的文本文件扩展名使用全局定义
+TEXT_EXTENSIONS = list(CORE_TEXT_EXTENSIONS)
 
 def is_text_file(path: str) -> bool:
     """检查是否为文本文件"""
-    return any(path.lower().endswith(ext) for ext in TEXT_EXTENSIONS)
+    return core_is_text_file(path)
 
 
 def should_exclude(path: str, exclude_patterns: List[str] = None) -> bool:
     """检查是否应该排除该文件"""
-    all_patterns = EXCLUDE_PATTERNS + (exclude_patterns or [])
-    return any(pattern in path for pattern in all_patterns)
+    filename = os.path.basename(path)
+    return core_should_exclude(path, filename, exclude_patterns)
 
 
 def get_language_from_path(path: str) -> str: