Configure unified file extensions, exclude directories, and exclude files.
Build and Push CodeReview / build (push) Waiting to run Details

This commit is contained in:
vinland100 2026-01-09 16:28:27 +08:00
parent cd8fb49a56
commit a11542c4bf
6 changed files with 137 additions and 182 deletions

View File

@ -34,6 +34,7 @@ from app.services.agent.event_manager import EventManager
from app.services.agent.streaming import StreamHandler, StreamEvent, StreamEventType from app.services.agent.streaming import StreamHandler, StreamEvent, StreamEventType
from app.services.git_ssh_service import GitSSHOperations from app.services.git_ssh_service import GitSSHOperations
from app.core.encryption import decrypt_sensitive_data from app.core.encryption import decrypt_sensitive_data
from app.core.file_filter import EXCLUDE_DIRS, should_exclude, is_text_file
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
router = APIRouter() router = APIRouter()
@ -1008,11 +1009,8 @@ async def _collect_project_info(
} }
try: try:
# 默认排除目录 # 使用通用的排除目录
exclude_dirs = { exclude_dirs = set(EXCLUDE_DIRS)
"node_modules", "__pycache__", ".git", "venv", ".venv",
"build", "dist", "target", ".idea", ".vscode",
}
# 从用户配置的排除模式中提取目录 # 从用户配置的排除模式中提取目录
if exclude_patterns: if exclude_patterns:
@ -1045,14 +1043,8 @@ async def _collect_project_info(
if target_files_set and relative_path not in target_files_set: if target_files_set and relative_path not in target_files_set:
continue continue
# 检查排除模式 # 使用通用的筛选逻辑
should_skip = False if should_exclude(relative_path, f, exclude_patterns):
if exclude_patterns:
for pattern in exclude_patterns:
if fnmatch.fnmatch(relative_path, pattern) or fnmatch.fnmatch(f, pattern):
should_skip = True
break
if should_skip:
continue continue
info["file_count"] += 1 info["file_count"] += 1
@ -1085,7 +1077,7 @@ async def _collect_project_info(
top_items = os.listdir(project_root) top_items = os.listdir(project_root)
info["structure"] = { info["structure"] = {
"directories": [d for d in top_items if os.path.isdir(os.path.join(project_root, d)) and d not in exclude_dirs], "directories": [d for d in top_items if os.path.isdir(os.path.join(project_root, d)) and d not in exclude_dirs],
"files": [f for f in top_items if os.path.isfile(os.path.join(project_root, f))][:20], "files": [f for f in top_items if os.path.isfile(os.path.join(project_root, f)) and is_text_file(f)][:20],
"scope_limited": False, "scope_limited": False,
} }
except Exception: except Exception:

View File

@ -0,0 +1,100 @@
"""
Common file filtering logic for RAG and Audit Agents.
"""
import os
import fnmatch
from typing import List, Set, Optional
# File extensions (lower-case, with leading dot) that are treated as text
# source or configuration files. Grouped by category purely for readability;
# the result is a single flat set.
TEXT_EXTENSIONS = (
    # Core languages
    {
        ".py", ".js", ".ts", ".tsx", ".jsx", ".java", ".go", ".rs",
        ".cpp", ".c", ".h", ".cc", ".hh", ".hpp", ".hxx", ".cs", ".php", ".rb",
        ".kt", ".swift", ".dart", ".scala", ".sc", ".groovy", ".ktm", ".kts",
    }
    # .NET (logic code)
    | {
        ".cshtml", ".vb", ".fs", ".fsi", ".fsx",
        ".asax", ".master", ".ascx", ".asmx", ".svc",
    }
    # Data and configuration
    | {".json", ".yml", ".yaml", ".toml", ".xml", ".properties", ".conf", ".ini"}
    # Scripts and commands
    | {
        ".sh", ".bash", ".zsh", ".fish", ".ps1", ".bat", ".cmd",
        ".sql", ".pl", ".pm", ".t",
    }
    # Web, schema and infrastructure definitions
    | {
        ".html", ".css", ".vue", ".svelte", ".md", ".proto", ".graphql", ".gql",
        ".prisma", ".sol", ".dockerfile", ".tf", ".hcl", ".tfvars",
    }
    # Other languages
    | {
        ".lua", ".hs", ".lhs", ".clj", ".cljs", ".cljc", ".edn", ".ex", ".exs",
        ".erl", ".hrl", ".m", ".mm", ".r", ".rmd",
    }
)
# Directory names that are skipped when walking a project tree.
#
# NOTE: almost every entry is a plain directory name. Two entries are not:
# "_ReSharper.*" is a glob pattern and "vendor/bundle" spans two path
# segments, so a plain `name in EXCLUDE_DIRS` membership test on a single
# directory name will never match either of them.
EXCLUDE_DIRS = {
    # Build & dependencies
    "node_modules", "vendor", "dist", "build", "target", "out", "bin", "obj",
    "bower_components", "packages", "pkg", "Pods", ".gradle", ".m2",
    "vendor/bundle", ".bundle", "jspm_packages", "typings",
    # Virtual environments
    "venv", "env", ".env", "virtualenv", ".venv",
    # IDE & metadata
    ".git", ".svn", ".hg", ".vscode", ".idea", ".vs", "TestResults",
    "_ReSharper.*", ".settings", ".project", ".classpath", ".metadata",
    # Cache & logs ("htmlcov" was previously listed twice; once is enough)
    "__pycache__", ".pytest_cache", "coverage", "htmlcov", ".nyc_output",
    ".cache", ".next", ".nuxt", ".dart_tool", "logs", "ipch",
    # Cloud & infrastructure
    ".aws-sam", ".serverless", ".terraform", ".terraform.d", "_site",
    # Others
    "__MACOSX", "extern", "externals", "third-party", "3rdparty",
}
# File-name patterns that are always skipped. Entries are either literal
# names or glob patterns; grouped by category purely for readability, the
# result is a single flat set.
EXCLUDE_FILES = (
    # Lock files and package-manager manifests
    {
        "package-lock.json", "yarn.lock", "pnpm-lock.yaml", "Cargo.lock",
        "poetry.lock", "composer.lock", "Gemfile.lock", "gradle.lockfile",
        "mix.lock", "pnpm-workspace.yaml", "shrinkwrap.yaml",
    }
    # Static assets and binaries
    | {
        "*.min.js", "*.min.css", "jquery.js", "jquery.min.js", "*.map",
        "*.pyc", "*.pyo", "*.pyd", "*.so", "*.dll", "*.exe", "*.o", "*.obj",
        "*.a", "*.lib", "*.jar", "*.war", "*.ear", "*.class",
        "*.svg", "*.ico", "*.woff*", "*.png", "*.jpg", "*.jpeg", "*.gif",
    }
    # System files and secrets
    | {".DS_Store", "thumbs.db", "desktop.ini", "*.pem", "*.crt", "*.key"}
    # Temporary files and logs
    | {"*.log", "*.bak", "*.swp", "*.tmp", "tags"}
    # IDE and project configuration (non-code)
    | {
        "*.suo", "*.user", "*.sln", "*.csproj", "*.vbproj", "*.fsproj",
        "*.props", "*.targets", "*.resx", "*.sln.docstates", "*.vshost.*", "*.pdb",
        "launchSettings.json", "dotnet-tools.json", ".ruby-version", ".nvmrc",
    }
    # Generated code
    | {
        "*.Designer.cs", "*.Designer.vb", "*ModelSnapshot.cs",
        "*.generated.cs", "*.g.cs", "*.g.i.cs",
    }
    # Large data files
    | {"haarcascade_*.xml"}
)
def is_text_file(file_path: str) -> bool:
    """Return True when the file's extension is listed in TEXT_EXTENSIONS.

    Only the final extension of ``file_path`` is examined, and the comparison
    is case-insensitive. A name with no extension yields False.
    """
    _, extension = os.path.splitext(file_path)
    return extension.lower() in TEXT_EXTENSIONS
def should_exclude(rel_path: str, filename: str, exclude_patterns: Optional[List[str]] = None) -> bool:
    """Decide whether a file should be skipped.

    Checks are applied in this order and the first hit wins:

    1. ``filename`` matches one of the glob patterns in ``EXCLUDE_FILES``.
    2. Any directory segment of ``rel_path`` is excluded by ``EXCLUDE_DIRS``,
       either by exact name or, for entries containing glob wildcards
       (e.g. ``_ReSharper.*``), by ``fnmatch``.
    3. ``rel_path`` or ``filename`` matches one of the caller-supplied
       ``exclude_patterns``.

    Args:
        rel_path: Path of the file relative to the project root; both ``/``
            and ``\\`` separators are accepted.
        filename: Base name of the file.
        exclude_patterns: Optional extra glob patterns supplied by the caller.

    Returns:
        True if the file should be excluded, False otherwise.
    """
    # 1. Built-in file-name patterns.
    if any(fnmatch.fnmatch(filename, pattern) for pattern in EXCLUDE_FILES):
        return True

    # 2. Built-in directory rules. The last segment is the file itself, so
    #    only the segments before it are treated as directories.
    dir_parts = rel_path.replace("\\", "/").split("/")[:-1]
    if dir_parts:
        # Entries with wildcards can never equal a real directory name, so
        # they need an fnmatch pass in addition to the membership test.
        glob_dirs = [
            entry for entry in EXCLUDE_DIRS
            if any(ch in entry for ch in "*?[")
        ]
        for part in dir_parts:
            if part in EXCLUDE_DIRS:
                return True
            if any(fnmatch.fnmatch(part, pattern) for pattern in glob_dirs):
                return True

    # 3. Caller-supplied patterns, tried against both the relative path and
    #    the bare file name.
    if exclude_patterns:
        for pattern in exclude_patterns:
            if fnmatch.fnmatch(rel_path, pattern) or fnmatch.fnmatch(filename, pattern):
                return True

    return False

View File

@ -14,6 +14,8 @@ from functools import lru_cache
from pydantic import Field from pydantic import Field
from pydantic_settings import BaseSettings from pydantic_settings import BaseSettings
from app.core.file_filter import TEXT_EXTENSIONS, EXCLUDE_DIRS
class LogLevel(str, Enum): class LogLevel(str, Enum):
"""Logging levels""" """Logging levels"""
@ -279,24 +281,11 @@ class AgentConfig(BaseSettings):
# ============ Security ============ # ============ Security ============
allowed_file_extensions: Set[str] = Field( allowed_file_extensions: Set[str] = Field(
default={ default=TEXT_EXTENSIONS,
".py", ".js", ".ts", ".jsx", ".tsx", ".java", ".go", ".rb", ".php",
".c", ".cpp", ".h", ".hpp", ".cs", ".swift", ".kt", ".rs", ".scala",
".vue", ".svelte", ".html", ".css", ".scss", ".sass", ".less",
".json", ".yaml", ".yml", ".xml", ".toml", ".ini", ".conf",
".sql", ".graphql", ".proto", ".sh", ".bash", ".zsh", ".ps1",
".md", ".txt", ".rst", ".env.example", ".gitignore",
},
description="Allowed file extensions for analysis" description="Allowed file extensions for analysis"
) )
blocked_directories: Set[str] = Field( blocked_directories: Set[str] = Field(
default={ default=EXCLUDE_DIRS,
"node_modules", "__pycache__", ".git", ".svn", ".hg",
"venv", ".venv", "env", ".env", "virtualenv",
"dist", "build", "target", "out", "bin", "obj",
".idea", ".vscode", ".vs", ".pytest_cache", ".mypy_cache",
"coverage", ".coverage", "htmlcov", ".tox", ".nox",
},
description="Directories to exclude from scanning" description="Directories to exclude from scanning"
) )
max_path_depth: int = Field( max_path_depth: int = Field(

View File

@ -11,6 +11,7 @@ from typing import Optional, List, Dict, Any
from pydantic import BaseModel, Field from pydantic import BaseModel, Field
from .base import AgentTool, ToolResult from .base import AgentTool, ToolResult
from app.core.file_filter import TEXT_EXTENSIONS, EXCLUDE_DIRS, EXCLUDE_FILES, is_text_file, should_exclude
class FileReadInput(BaseModel): class FileReadInput(BaseModel):
@ -107,15 +108,9 @@ class FileReadTool(AgentTool):
if self.target_files and file_path not in self.target_files: if self.target_files and file_path not in self.target_files:
return True return True
# 检查排除模式 # 使用通用的筛选逻辑
for pattern in self.exclude_patterns: filename = os.path.basename(file_path)
if fnmatch.fnmatch(file_path, pattern): return should_exclude(file_path, filename, self.exclude_patterns)
return True
# 也检查文件名
if fnmatch.fnmatch(os.path.basename(file_path), pattern):
return True
return False
async def _execute( async def _execute(
self, self,
@ -275,7 +270,7 @@ class FileSearchTool(AgentTool):
self.target_files = set(target_files) if target_files else None self.target_files = set(target_files) if target_files else None
# 从 exclude_patterns 中提取目录排除 # 从 exclude_patterns 中提取目录排除
self.exclude_dirs = set(self.DEFAULT_EXCLUDE_DIRS) self.exclude_dirs = set(EXCLUDE_DIRS)
for pattern in self.exclude_patterns: for pattern in self.exclude_patterns:
if pattern.endswith("/**"): if pattern.endswith("/**"):
self.exclude_dirs.add(pattern[:-3]) self.exclude_dirs.add(pattern[:-3])
@ -370,13 +365,8 @@ class FileSearchTool(AgentTool):
if self.target_files and relative_path not in self.target_files: if self.target_files and relative_path not in self.target_files:
continue continue
# 检查排除模式 # 使用通用的筛选逻辑
should_skip = False if not is_text_file(filename) or should_exclude(relative_path, filename, self.exclude_patterns):
for excl_pattern in self.exclude_patterns:
if fnmatch.fnmatch(relative_path, excl_pattern) or fnmatch.fnmatch(filename, excl_pattern):
should_skip = True
break
if should_skip:
continue continue
try: try:
@ -466,11 +456,6 @@ class ListFilesTool(AgentTool):
列出目录中的文件 列出目录中的文件
""" """
DEFAULT_EXCLUDE_DIRS = {
"node_modules", "vendor", "dist", "build", ".git",
"__pycache__", ".pytest_cache", "coverage",
}
def __init__( def __init__(
self, self,
project_root: str, project_root: str,
@ -482,8 +467,8 @@ class ListFilesTool(AgentTool):
self.exclude_patterns = exclude_patterns or [] self.exclude_patterns = exclude_patterns or []
self.target_files = set(target_files) if target_files else None self.target_files = set(target_files) if target_files else None
# 从 exclude_patterns 中提取目录排除 # 使用通用的排除目录
self.exclude_dirs = set(self.DEFAULT_EXCLUDE_DIRS) self.exclude_dirs = set(EXCLUDE_DIRS)
for pattern in self.exclude_patterns: for pattern in self.exclude_patterns:
# 如果是目录模式(如 node_modules/**),提取目录名 # 如果是目录模式(如 node_modules/**),提取目录名
if pattern.endswith("/**"): if pattern.endswith("/**"):
@ -560,13 +545,8 @@ class ListFilesTool(AgentTool):
if self.target_files and relative_path not in self.target_files: if self.target_files and relative_path not in self.target_files:
continue continue
# 检查排除模式 # 使用通用的筛选逻辑
should_skip = False if should_exclude(relative_path, filename, self.exclude_patterns):
for excl_pattern in self.exclude_patterns:
if fnmatch.fnmatch(relative_path, excl_pattern) or fnmatch.fnmatch(filename, excl_pattern):
should_skip = True
break
if should_skip:
continue continue
files.append(relative_path) files.append(relative_path)
@ -628,13 +608,8 @@ class ListFilesTool(AgentTool):
if pattern and not fnmatch.fnmatch(item, pattern): if pattern and not fnmatch.fnmatch(item, pattern):
continue continue
# 检查排除模式 # 使用通用的筛选逻辑
should_skip = False if should_exclude(relative_path, item, self.exclude_patterns):
for excl_pattern in self.exclude_patterns:
if fnmatch.fnmatch(relative_path, excl_pattern) or fnmatch.fnmatch(item, excl_pattern):
should_skip = True
break
if should_skip:
continue continue
files.append(relative_path) files.append(relative_path)

View File

@ -17,6 +17,7 @@ from pydantic import BaseModel, Field
from dataclasses import dataclass, field from dataclasses import dataclass, field
from .base import AgentTool, ToolResult from .base import AgentTool, ToolResult
from app.core.file_filter import TEXT_EXTENSIONS, EXCLUDE_DIRS, EXCLUDE_FILES, is_text_file, should_exclude
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -213,43 +214,24 @@ class SmartScanTool(AgentTool):
if not full_path.startswith(os.path.normpath(self.project_root)): if not full_path.startswith(os.path.normpath(self.project_root)):
return [] return []
files = []
# 排除目录
exclude_dirs = {
'node_modules', '__pycache__', '.git', 'venv', '.venv',
'build', 'dist', 'target', '.idea', '.vscode', 'vendor',
'coverage', '.pytest_cache', '.mypy_cache',
}
# 支持的代码文件扩展名
code_extensions = {
'.py', '.js', '.ts', '.jsx', '.tsx', '.java', '.php',
'.go', '.rb', '.cs', '.c', '.cpp', '.h', '.hpp',
'.swift', '.m', '.mm', '.kt', '.rs', '.sh', '.bat',
'.vue', '.html', '.htm', '.xml', '.gradle', '.properties'
}
# 配置文件扩展名
config_extensions = {'.json', '.yaml', '.yml', '.env', '.ini', '.cfg', '.plist', '.conf'}
all_extensions = code_extensions | config_extensions
if os.path.isfile(full_path): if os.path.isfile(full_path):
return [os.path.relpath(full_path, self.project_root)] rel_path = os.path.relpath(full_path, self.project_root)
return [rel_path] if is_text_file(full_path) else []
files = []
for root, dirs, filenames in os.walk(full_path): for root, dirs, filenames in os.walk(full_path):
# 过滤排除目录 # 过滤排除目录
dirs[:] = [d for d in dirs if d not in exclude_dirs] dirs[:] = [d for d in dirs if d not in EXCLUDE_DIRS]
for filename in filenames: for filename in filenames:
ext = os.path.splitext(filename)[1].lower()
if ext not in all_extensions:
continue
file_path = os.path.join(root, filename) file_path = os.path.join(root, filename)
rel_path = os.path.relpath(file_path, self.project_root) rel_path = os.path.relpath(file_path, self.project_root)
# 使用通用的筛选逻辑
if not is_text_file(filename) or should_exclude(rel_path, filename):
continue
# 快速模式:只扫描高风险文件 # 快速模式:只扫描高风险文件
if quick_mode: if quick_mode:
is_high_risk = any( is_high_risk = any(

View File

@ -29,70 +29,7 @@ logger = logging.getLogger(__name__)
INDEX_VERSION = "2.0" INDEX_VERSION = "2.0"
# 支持的文本文件扩展名 from app.core.file_filter import TEXT_EXTENSIONS, EXCLUDE_DIRS, EXCLUDE_FILES, is_text_file, should_exclude
TEXT_EXTENSIONS = {
# 核心语言
".py", ".js", ".ts", ".tsx", ".jsx", ".java", ".go", ".rs",
".cpp", ".c", ".h", ".cc", ".hh", ".hpp", ".hxx", ".cs", ".php", ".rb",
".kt", ".swift", ".dart", ".scala", ".sc", ".groovy", ".ktm", ".kts",
# .NET (逻辑代码)
".cshtml", ".vb", ".fs", ".fsi", ".fsx", ".asax", ".master", ".ascx", ".asmx", ".svc",
# 数据与配置
".json", ".yml", ".yaml", ".toml", ".xml", ".properties", ".conf", ".ini",
# 脚本与命令
".sh", ".bash", ".zsh", ".fish", ".ps1", ".bat", ".cmd", ".sql", ".pl", ".pm", ".t",
# Web
".html", ".css", ".vue", ".svelte", ".md", ".proto", ".graphql", ".gql",
".prisma", ".sol", ".dockerfile", ".tf", ".hcl", ".tfvars",
# 其他
".lua", ".hs", ".lhs", ".clj", ".cljs", ".cljc", ".edn", ".ex", ".exs",
".erl", ".hrl", ".m", ".mm", ".r", ".rmd"
}
# 排除的目录
EXCLUDE_DIRS = {
# 构建与依赖
"node_modules", "vendor", "dist", "build", "target", "out", "bin", "obj",
"bower_components", "packages", "pkg", "Pods", ".gradle", ".m2",
"vendor/bundle", ".bundle", "jspm_packages", "typings",
# 虚拟环境
"venv", "env", ".env", "virtualenv", ".venv",
# IDE 与元数据
".git", ".svn", ".hg", ".vscode", ".idea", ".vs", "TestResults",
"_ReSharper.*", ".settings", ".project", ".classpath", ".metadata",
# 缓存与日志
"__pycache__", ".pytest_cache", "coverage", "htmlcov", ".nyc_output",
".cache", ".next", ".nuxt", ".dart_tool", "htmlcov", "logs", "ipch",
# 云与基础设施
".aws-sam", ".serverless", ".terraform", ".terraform.d", "_site",
# 其他
"__MACOSX", "extern", "externals", "third-party", "3rdparty"
}
# 排除的文件
EXCLUDE_FILES = {
# 锁文件 (通常不索引,因为内容太长且无语义)
"package-lock.json", "yarn.lock", "pnpm-lock.yaml", "Cargo.lock",
"poetry.lock", "composer.lock", "Gemfile.lock", "gradle.lockfile",
"mix.lock", "pnpm-workspace.yaml", "shrinkwrap.yaml",
# 静态资源与二进制
"*.min.js", "*.min.css", "jquery.js", "jquery.min.js", "*.map",
"*.pyc", "*.pyo", "*.pyd", "*.so", "*.dll", "*.exe", "*.o", "*.obj",
"*.a", "*.lib", "*.jar", "*.war", "*.ear", "*.class",
"*.svg", "*.ico", "*.woff*", "*.png", "*.jpg", "*.jpeg", "*.gif",
# 系统与秘密
".DS_Store", "thumbs.db", "desktop.ini", "*.pem", "*.crt", "*.key",
# 临时与日志
"*.log", "*.bak", "*.swp", "*.tmp", "tags",
# IDE 与项目配置 (非代码)
"*.suo", "*.user", "*.sln", "*.csproj", "*.vbproj", "*.fsproj",
"*.props", "*.targets", "*.resx", "*.sln.docstates", "*.vshost.*", "*.pdb",
"launchSettings.json", "dotnet-tools.json", ".ruby-version", ".nvmrc",
# 自动生成的代码 (噪声)
"*.Designer.cs", "*.Designer.vb", "*ModelSnapshot.cs", "*.generated.cs", "*.g.cs", "*.g.i.cs",
# 大型数据文件 (非代码)
"haarcascade_*.xml"
}
@ -1475,18 +1412,10 @@ class CodeIndexer:
dirs[:] = [d for d in dirs if d not in EXCLUDE_DIRS] dirs[:] = [d for d in dirs if d not in EXCLUDE_DIRS]
for filename in filenames: for filename in filenames:
# 检查扩展名 relative_path = os.path.relpath(os.path.join(root, filename), directory)
ext = os.path.splitext(filename)[1].lower()
if ext not in TEXT_EXTENSIONS:
continue
# 检查排除文件 (支持通配符) # 检查是否为文本文件且不应排除
should_skip_file = False if not is_text_file(filename) or should_exclude(relative_path, filename, exclude_patterns):
for pattern in EXCLUDE_FILES:
if fnmatch.fnmatch(filename, pattern):
should_skip_file = True
break
if should_skip_file:
continue continue
# 排除疑似压缩过的文件 (通过行长度和内容分析) # 排除疑似压缩过的文件 (通过行长度和内容分析)
@ -1509,18 +1438,6 @@ class CodeIndexer:
except Exception: except Exception:
pass pass
relative_path = os.path.relpath(file_path, directory)
# 检查排除模式
excluded = False
for pattern in exclude_patterns:
if fnmatch.fnmatch(relative_path, pattern) or fnmatch.fnmatch(filename, pattern):
excluded = True
break
if excluded:
continue
# 检查包含模式 # 检查包含模式
if include_patterns: if include_patterns:
included = False included = False