From a11542c4bfe491eb5c36b474e83857dc20068f2c Mon Sep 17 00:00:00 2001 From: vinland100 Date: Fri, 9 Jan 2026 16:28:27 +0800 Subject: [PATCH] Configure unified file extensions, exclude directories, and exclude files. --- backend/app/api/v1/endpoints/agent_tasks.py | 20 ++-- backend/app/core/file_filter.py | 100 ++++++++++++++++++ backend/app/services/agent/config.py | 19 +--- backend/app/services/agent/tools/file_tool.py | 51 +++------ .../services/agent/tools/smart_scan_tool.py | 38 ++----- backend/app/services/rag/indexer.py | 91 +--------------- 6 files changed, 137 insertions(+), 182 deletions(-) create mode 100644 backend/app/core/file_filter.py diff --git a/backend/app/api/v1/endpoints/agent_tasks.py b/backend/app/api/v1/endpoints/agent_tasks.py index 8640253..e9ccfa2 100644 --- a/backend/app/api/v1/endpoints/agent_tasks.py +++ b/backend/app/api/v1/endpoints/agent_tasks.py @@ -34,6 +34,7 @@ from app.services.agent.event_manager import EventManager from app.services.agent.streaming import StreamHandler, StreamEvent, StreamEventType from app.services.git_ssh_service import GitSSHOperations from app.core.encryption import decrypt_sensitive_data +from app.core.file_filter import EXCLUDE_DIRS, should_exclude, is_text_file logger = logging.getLogger(__name__) router = APIRouter() @@ -1008,11 +1009,8 @@ async def _collect_project_info( } try: - # 默认排除目录 - exclude_dirs = { - "node_modules", "__pycache__", ".git", "venv", ".venv", - "build", "dist", "target", ".idea", ".vscode", - } + # 使用通用的排除目录 + exclude_dirs = set(EXCLUDE_DIRS) # 从用户配置的排除模式中提取目录 if exclude_patterns: @@ -1045,14 +1043,8 @@ async def _collect_project_info( if target_files_set and relative_path not in target_files_set: continue - # 检查排除模式 - should_skip = False - if exclude_patterns: - for pattern in exclude_patterns: - if fnmatch.fnmatch(relative_path, pattern) or fnmatch.fnmatch(f, pattern): - should_skip = True - break - if should_skip: + # 使用通用的筛选逻辑 + if should_exclude(relative_path, f, exclude_patterns): continue info["file_count"] += 1 @@ -1085,7 +1077,7 @@ async def _collect_project_info( top_items = os.listdir(project_root) info["structure"] = { "directories": [d for d in top_items if os.path.isdir(os.path.join(project_root, d)) and d not in exclude_dirs], - "files": [f for f in top_items if os.path.isfile(os.path.join(project_root, f))][:20], + "files": [f for f in top_items if os.path.isfile(os.path.join(project_root, f)) and is_text_file(f)][:20], "scope_limited": False, } except Exception: diff --git a/backend/app/core/file_filter.py b/backend/app/core/file_filter.py new file mode 100644 index 0000000..dd8d1cc --- /dev/null +++ b/backend/app/core/file_filter.py @@ -0,0 +1,100 @@ +""" +Common file filtering logic for RAG and Audit Agents. +""" + +import os +import fnmatch +from typing import List, Set, Optional + +# Supported text file extensions (Code and Config) +TEXT_EXTENSIONS = { + # Core Languages + ".py", ".js", ".ts", ".tsx", ".jsx", ".java", ".go", ".rs", + ".cpp", ".c", ".h", ".cc", ".hh", ".hpp", ".hxx", ".cs", ".php", ".rb", + ".kt", ".swift", ".dart", ".scala", ".sc", ".groovy", ".ktm", ".kts", + # .NET (Logic code) + ".cshtml", ".vb", ".fs", ".fsi", ".fsx", ".asax", ".master", ".ascx", ".asmx", ".svc", + # Data & Config + ".json", ".yml", ".yaml", ".toml", ".xml", ".properties", ".conf", ".ini", + # Scripts & Commands + ".sh", ".bash", ".zsh", ".fish", ".ps1", ".bat", ".cmd", ".sql", ".pl", ".pm", ".t", + # Web + ".html", ".css", ".vue", ".svelte", ".md", ".proto", ".graphql", ".gql", + ".prisma", ".sol", ".dockerfile", ".tf", ".hcl", ".tfvars", + # Others + ".lua", ".hs", ".lhs", ".clj", ".cljs", ".cljc", ".edn", ".ex", ".exs", + ".erl", ".hrl", ".m", ".mm", ".r", ".rmd" +} + +# Directories to exclude +EXCLUDE_DIRS = { + # Build & Dependencies + "node_modules", "vendor", "dist", "build", "target", "out", "bin", "obj", + "bower_components", "packages", "pkg", "Pods", ".gradle", ".m2", + "vendor/bundle", ".bundle", "jspm_packages", "typings", + # Virtual Environments + "venv", "env", ".env", "virtualenv", ".venv", + # IDE & Metadata + ".git", ".svn", ".hg", ".vscode", ".idea", ".vs", "TestResults", + "_ReSharper.*", ".settings", ".project", ".classpath", ".metadata", + # Cache & Logs + "__pycache__", ".pytest_cache", "coverage", "htmlcov", ".nyc_output", + ".cache", ".next", ".nuxt", ".dart_tool", "htmlcov", "logs", "ipch", + # Cloud & Infrastructure + ".aws-sam", ".serverless", ".terraform", ".terraform.d", "_site", + # Others + "__MACOSX", "extern", "externals", "third-party", "3rdparty" +} + +# Files to exclude (supports glob patterns) +EXCLUDE_FILES = { + # Lock files + "package-lock.json", "yarn.lock", "pnpm-lock.yaml", "Cargo.lock", + "poetry.lock", "composer.lock", "Gemfile.lock", "gradle.lockfile", + "mix.lock", "pnpm-workspace.yaml", "shrinkwrap.yaml", + # Static & Binary + "*.min.js", "*.min.css", "jquery.js", "jquery.min.js", "*.map", + "*.pyc", "*.pyo", "*.pyd", "*.so", "*.dll", "*.exe", "*.o", "*.obj", + "*.a", "*.lib", "*.jar", "*.war", "*.ear", "*.class", + "*.svg", "*.ico", "*.woff*", "*.png", "*.jpg", "*.jpeg", "*.gif", + # System & Secrets + ".DS_Store", "thumbs.db", "desktop.ini", "*.pem", "*.crt", "*.key", + # Temp & Logs + "*.log", "*.bak", "*.swp", "*.tmp", "tags", + # IDE & Project Config (non-code) + "*.suo", "*.user", "*.sln", "*.csproj", "*.vbproj", "*.fsproj", + "*.props", "*.targets", "*.resx", "*.sln.docstates", "*.vshost.*", "*.pdb", + "launchSettings.json", "dotnet-tools.json", ".ruby-version", ".nvmrc", + # Generated code + "*.Designer.cs", "*.Designer.vb", "*ModelSnapshot.cs", "*.generated.cs", "*.g.cs", "*.g.i.cs", + # Large data files + "haarcascade_*.xml" +} + +def is_text_file(file_path: str) -> bool: + """Check if a file should be considered for indexing/scanning.""" + ext = os.path.splitext(file_path)[1].lower() + return ext in TEXT_EXTENSIONS + +def should_exclude(rel_path: str, filename: str, exclude_patterns: Optional[List[str]] = None) -> bool: + """ + Check if a file should be excluded based on default rules and custom patterns. + """ + # 1. Check EXCLUDE_FILES (glob patterns) + for pattern in EXCLUDE_FILES: + if fnmatch.fnmatch(filename, pattern): + return True + + # 2. Check EXCLUDE_DIRS in the path + path_parts = rel_path.replace('\\', '/').split('/') + for part in path_parts[:-1]: # Don't check filename as a directory + if part in EXCLUDE_DIRS: + return True + + # 3. Check custom exclude_patterns + if exclude_patterns: + for pattern in exclude_patterns: + if fnmatch.fnmatch(rel_path, pattern) or fnmatch.fnmatch(filename, pattern): + return True + + return False diff --git a/backend/app/services/agent/config.py b/backend/app/services/agent/config.py index 4a2c43b..52d82a5 100644 --- a/backend/app/services/agent/config.py +++ b/backend/app/services/agent/config.py @@ -14,6 +14,8 @@ from functools import lru_cache from pydantic import Field from pydantic_settings import BaseSettings +from app.core.file_filter import TEXT_EXTENSIONS, EXCLUDE_DIRS + class LogLevel(str, Enum): """Logging levels""" @@ -279,24 +281,11 @@ class AgentConfig(BaseSettings): # ============ Security ============ allowed_file_extensions: Set[str] = Field( - default={ - ".py", ".js", ".ts", ".jsx", ".tsx", ".java", ".go", ".rb", ".php", - ".c", ".cpp", ".h", ".hpp", ".cs", ".swift", ".kt", ".rs", ".scala", - ".vue", ".svelte", ".html", ".css", ".scss", ".sass", ".less", - ".json", ".yaml", ".yml", ".xml", ".toml", ".ini", ".conf", - ".sql", ".graphql", ".proto", ".sh", ".bash", ".zsh", ".ps1", - ".md", ".txt", ".rst", ".env.example", ".gitignore", - }, + default=TEXT_EXTENSIONS, description="Allowed file extensions for analysis" ) blocked_directories: Set[str] = Field( - default={ - "node_modules", "__pycache__", ".git", ".svn", ".hg", - "venv", ".venv", "env", ".env", "virtualenv", - "dist", "build", "target", "out", "bin", "obj", - ".idea", ".vscode", ".vs", ".pytest_cache", ".mypy_cache", - "coverage", ".coverage", "htmlcov", ".tox", ".nox", - }, + default=EXCLUDE_DIRS, description="Directories to exclude from scanning" ) max_path_depth: int = Field( diff --git a/backend/app/services/agent/tools/file_tool.py b/backend/app/services/agent/tools/file_tool.py index 009cf44..62bce6c 100644 --- a/backend/app/services/agent/tools/file_tool.py +++ b/backend/app/services/agent/tools/file_tool.py @@ -11,6 +11,7 @@ from typing import Optional, List, Dict, Any from pydantic import BaseModel, Field from .base import AgentTool, ToolResult +from app.core.file_filter import TEXT_EXTENSIONS, EXCLUDE_DIRS, EXCLUDE_FILES, is_text_file, should_exclude class FileReadInput(BaseModel): @@ -107,15 +108,9 @@ class FileReadTool(AgentTool): if self.target_files and file_path not in self.target_files: return True - # 检查排除模式 - for pattern in self.exclude_patterns: - if fnmatch.fnmatch(file_path, pattern): - return True - # 也检查文件名 - if fnmatch.fnmatch(os.path.basename(file_path), pattern): - return True - - return False + # 使用通用的筛选逻辑 + filename = os.path.basename(file_path) + return should_exclude(file_path, filename, self.exclude_patterns) async def _execute( self, @@ -275,7 +270,7 @@ class FileSearchTool(AgentTool): self.target_files = set(target_files) if target_files else None # 从 exclude_patterns 中提取目录排除 - self.exclude_dirs = set(self.DEFAULT_EXCLUDE_DIRS) + self.exclude_dirs = set(EXCLUDE_DIRS) for pattern in self.exclude_patterns: if pattern.endswith("/**"): self.exclude_dirs.add(pattern[:-3]) @@ -370,13 +365,8 @@ class FileSearchTool(AgentTool): if self.target_files and relative_path not in self.target_files: continue - # 检查排除模式 - should_skip = False - for excl_pattern in self.exclude_patterns: - if fnmatch.fnmatch(relative_path, excl_pattern) or fnmatch.fnmatch(filename, excl_pattern): - should_skip = True - break - if should_skip: + # 使用通用的筛选逻辑 + if not is_text_file(filename) or should_exclude(relative_path, filename, self.exclude_patterns): continue try: @@ -466,11 +456,6 @@ class ListFilesTool(AgentTool): 列出目录中的文件 """ - DEFAULT_EXCLUDE_DIRS = { - "node_modules", "vendor", "dist", "build", ".git", - "__pycache__", ".pytest_cache", "coverage", - } - def __init__( self, project_root: str, @@ -482,8 +467,8 @@ class ListFilesTool(AgentTool): self.exclude_patterns = exclude_patterns or [] self.target_files = set(target_files) if target_files else None - # 从 exclude_patterns 中提取目录排除 - self.exclude_dirs = set(self.DEFAULT_EXCLUDE_DIRS) + # 使用通用的排除目录 + self.exclude_dirs = set(EXCLUDE_DIRS) for pattern in self.exclude_patterns: # 如果是目录模式(如 node_modules/**),提取目录名 if pattern.endswith("/**"): @@ -560,13 +545,8 @@ class ListFilesTool(AgentTool): if self.target_files and relative_path not in self.target_files: continue - # 检查排除模式 - should_skip = False - for excl_pattern in self.exclude_patterns: - if fnmatch.fnmatch(relative_path, excl_pattern) or fnmatch.fnmatch(filename, excl_pattern): - should_skip = True - break - if should_skip: + # 使用通用的筛选逻辑 + if should_exclude(relative_path, filename, self.exclude_patterns): continue files.append(relative_path) @@ -628,13 +608,8 @@ class ListFilesTool(AgentTool): if pattern and not fnmatch.fnmatch(item, pattern): continue - # 检查排除模式 - should_skip = False - for excl_pattern in self.exclude_patterns: - if fnmatch.fnmatch(relative_path, excl_pattern) or fnmatch.fnmatch(item, excl_pattern): - should_skip = True - break - if should_skip: + # 使用通用的筛选逻辑 + if should_exclude(relative_path, item, self.exclude_patterns): continue files.append(relative_path) diff --git a/backend/app/services/agent/tools/smart_scan_tool.py b/backend/app/services/agent/tools/smart_scan_tool.py index 3898f90..3514188 100644 --- a/backend/app/services/agent/tools/smart_scan_tool.py +++ b/backend/app/services/agent/tools/smart_scan_tool.py @@ -17,6 +17,7 @@ from pydantic import BaseModel, Field from dataclasses import dataclass, field from .base import AgentTool, ToolResult +from app.core.file_filter import TEXT_EXTENSIONS, EXCLUDE_DIRS, EXCLUDE_FILES, is_text_file, should_exclude logger = logging.getLogger(__name__) @@ -213,43 +214,24 @@ class SmartScanTool(AgentTool): if not full_path.startswith(os.path.normpath(self.project_root)): return [] - files = [] - - # 排除目录 - exclude_dirs = { - 'node_modules', '__pycache__', '.git', 'venv', '.venv', - 'build', 'dist', 'target', '.idea', '.vscode', 'vendor', - 'coverage', '.pytest_cache', '.mypy_cache', - } - - # 支持的代码文件扩展名 - code_extensions = { - '.py', '.js', '.ts', '.jsx', '.tsx', '.java', '.php', - '.go', '.rb', '.cs', '.c', '.cpp', '.h', '.hpp', - '.swift', '.m', '.mm', '.kt', '.rs', '.sh', '.bat', - '.vue', '.html', '.htm', '.xml', '.gradle', '.properties' - } - - # 配置文件扩展名 - config_extensions = {'.json', '.yaml', '.yml', '.env', '.ini', '.cfg', '.plist', '.conf'} - - all_extensions = code_extensions | config_extensions - if os.path.isfile(full_path): - return [os.path.relpath(full_path, self.project_root)] + rel_path = os.path.relpath(full_path, self.project_root) + return [rel_path] if is_text_file(full_path) else [] + + files = [] for root, dirs, filenames in os.walk(full_path): # 过滤排除目录 - dirs[:] = [d for d in dirs if d not in exclude_dirs] + dirs[:] = [d for d in dirs if d not in EXCLUDE_DIRS] for filename in filenames: - ext = os.path.splitext(filename)[1].lower() - if ext not in all_extensions: - continue - file_path = os.path.join(root, filename) rel_path = os.path.relpath(file_path, self.project_root) + # 使用通用的筛选逻辑 + if not is_text_file(filename) or should_exclude(rel_path, filename): + continue + # 快速模式:只扫描高风险文件 if quick_mode: is_high_risk = any( diff --git a/backend/app/services/rag/indexer.py b/backend/app/services/rag/indexer.py index 3ece51c..a983c1a 100644 --- a/backend/app/services/rag/indexer.py +++ b/backend/app/services/rag/indexer.py @@ -29,70 +29,7 @@ logger = logging.getLogger(__name__) INDEX_VERSION = "2.0" -# 支持的文本文件扩展名 -TEXT_EXTENSIONS = { - # 核心语言 - ".py", ".js", ".ts", ".tsx", ".jsx", ".java", ".go", ".rs", - ".cpp", ".c", ".h", ".cc", ".hh", ".hpp", ".hxx", ".cs", ".php", ".rb", - ".kt", ".swift", ".dart", ".scala", ".sc", ".groovy", ".ktm", ".kts", - # .NET (逻辑代码) - ".cshtml", ".vb", ".fs", ".fsi", ".fsx", ".asax", ".master", ".ascx", ".asmx", ".svc", - # 数据与配置 - ".json", ".yml", ".yaml", ".toml", ".xml", ".properties", ".conf", ".ini", - # 脚本与命令 - ".sh", ".bash", ".zsh", ".fish", ".ps1", ".bat", ".cmd", ".sql", ".pl", ".pm", ".t", - # Web - ".html", ".css", ".vue", ".svelte", ".md", ".proto", ".graphql", ".gql", - ".prisma", ".sol", ".dockerfile", ".tf", ".hcl", ".tfvars", - # 其他 - ".lua", ".hs", ".lhs", ".clj", ".cljs", ".cljc", ".edn", ".ex", ".exs", - ".erl", ".hrl", ".m", ".mm", ".r", ".rmd" -} - -# 排除的目录 -EXCLUDE_DIRS = { - # 构建与依赖 - "node_modules", "vendor", "dist", "build", "target", "out", "bin", "obj", - "bower_components", "packages", "pkg", "Pods", ".gradle", ".m2", - "vendor/bundle", ".bundle", "jspm_packages", "typings", - # 虚拟环境 - "venv", "env", ".env", "virtualenv", ".venv", - # IDE 与元数据 - ".git", ".svn", ".hg", ".vscode", ".idea", ".vs", "TestResults", - "_ReSharper.*", ".settings", ".project", ".classpath", ".metadata", - # 缓存与日志 - "__pycache__", ".pytest_cache", "coverage", "htmlcov", ".nyc_output", - ".cache", ".next", ".nuxt", ".dart_tool", "htmlcov", "logs", "ipch", - # 云与基础设施 - ".aws-sam", ".serverless", ".terraform", ".terraform.d", "_site", - # 其他 - "__MACOSX", "extern", "externals", "third-party", "3rdparty" -} - -# 排除的文件 -EXCLUDE_FILES = { - # 锁文件 (通常不索引,因为内容太长且无语义) - "package-lock.json", "yarn.lock", "pnpm-lock.yaml", "Cargo.lock", - "poetry.lock", "composer.lock", "Gemfile.lock", "gradle.lockfile", - "mix.lock", "pnpm-workspace.yaml", "shrinkwrap.yaml", - # 静态资源与二进制 - "*.min.js", "*.min.css", "jquery.js", "jquery.min.js", "*.map", - "*.pyc", "*.pyo", "*.pyd", "*.so", "*.dll", "*.exe", "*.o", "*.obj", - "*.a", "*.lib", "*.jar", "*.war", "*.ear", "*.class", - "*.svg", "*.ico", "*.woff*", "*.png", "*.jpg", "*.jpeg", "*.gif", - # 系统与秘密 - ".DS_Store", "thumbs.db", "desktop.ini", "*.pem", "*.crt", "*.key", - # 临时与日志 - "*.log", "*.bak", "*.swp", "*.tmp", "tags", - # IDE 与项目配置 (非代码) - "*.suo", "*.user", "*.sln", "*.csproj", "*.vbproj", "*.fsproj", - "*.props", "*.targets", "*.resx", "*.sln.docstates", "*.vshost.*", "*.pdb", - "launchSettings.json", "dotnet-tools.json", ".ruby-version", ".nvmrc", - # 自动生成的代码 (噪声) - "*.Designer.cs", "*.Designer.vb", "*ModelSnapshot.cs", "*.generated.cs", "*.g.cs", "*.g.i.cs", - # 大型数据文件 (非代码) - "haarcascade_*.xml" -} +from app.core.file_filter import TEXT_EXTENSIONS, EXCLUDE_DIRS, EXCLUDE_FILES, is_text_file, should_exclude @@ -1475,18 +1412,10 @@ class CodeIndexer: dirs[:] = [d for d in dirs if d not in EXCLUDE_DIRS] for filename in filenames: - # 检查扩展名 - ext = os.path.splitext(filename)[1].lower() - if ext not in TEXT_EXTENSIONS: - continue + relative_path = os.path.relpath(os.path.join(root, filename), directory) - # 检查排除文件 (支持通配符) - should_skip_file = False - for pattern in EXCLUDE_FILES: - if fnmatch.fnmatch(filename, pattern): - should_skip_file = True - break - if should_skip_file: + # 检查是否为文本文件且不应排除 + if not is_text_file(filename) or should_exclude(relative_path, filename, exclude_patterns): continue # 排除疑似压缩过的文件 (通过行长度和内容分析) @@ -1509,18 +1438,6 @@ class CodeIndexer: except Exception: pass - relative_path = os.path.relpath(file_path, directory) - - # 检查排除模式 - excluded = False - for pattern in exclude_patterns: - if fnmatch.fnmatch(relative_path, pattern) or fnmatch.fnmatch(filename, pattern): - excluded = True - break - - if excluded: - continue - # 检查包含模式 if include_patterns: included = False