Configure unified file extensions, exclude directories, and exclude files.
Build and Push CodeReview / build (push) Waiting to run
Details
Build and Push CodeReview / build (push) Waiting to run
Details
This commit is contained in:
parent
cd8fb49a56
commit
a11542c4bf
|
|
@ -34,6 +34,7 @@ from app.services.agent.event_manager import EventManager
|
||||||
from app.services.agent.streaming import StreamHandler, StreamEvent, StreamEventType
|
from app.services.agent.streaming import StreamHandler, StreamEvent, StreamEventType
|
||||||
from app.services.git_ssh_service import GitSSHOperations
|
from app.services.git_ssh_service import GitSSHOperations
|
||||||
from app.core.encryption import decrypt_sensitive_data
|
from app.core.encryption import decrypt_sensitive_data
|
||||||
|
from app.core.file_filter import EXCLUDE_DIRS, should_exclude, is_text_file
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
router = APIRouter()
|
router = APIRouter()
|
||||||
|
|
@ -1008,11 +1009,8 @@ async def _collect_project_info(
|
||||||
}
|
}
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# 默认排除目录
|
# 使用通用的排除目录
|
||||||
exclude_dirs = {
|
exclude_dirs = set(EXCLUDE_DIRS)
|
||||||
"node_modules", "__pycache__", ".git", "venv", ".venv",
|
|
||||||
"build", "dist", "target", ".idea", ".vscode",
|
|
||||||
}
|
|
||||||
|
|
||||||
# 从用户配置的排除模式中提取目录
|
# 从用户配置的排除模式中提取目录
|
||||||
if exclude_patterns:
|
if exclude_patterns:
|
||||||
|
|
@ -1045,14 +1043,8 @@ async def _collect_project_info(
|
||||||
if target_files_set and relative_path not in target_files_set:
|
if target_files_set and relative_path not in target_files_set:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# 检查排除模式
|
# 使用通用的筛选逻辑
|
||||||
should_skip = False
|
if should_exclude(relative_path, f, exclude_patterns):
|
||||||
if exclude_patterns:
|
|
||||||
for pattern in exclude_patterns:
|
|
||||||
if fnmatch.fnmatch(relative_path, pattern) or fnmatch.fnmatch(f, pattern):
|
|
||||||
should_skip = True
|
|
||||||
break
|
|
||||||
if should_skip:
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
info["file_count"] += 1
|
info["file_count"] += 1
|
||||||
|
|
@ -1085,7 +1077,7 @@ async def _collect_project_info(
|
||||||
top_items = os.listdir(project_root)
|
top_items = os.listdir(project_root)
|
||||||
info["structure"] = {
|
info["structure"] = {
|
||||||
"directories": [d for d in top_items if os.path.isdir(os.path.join(project_root, d)) and d not in exclude_dirs],
|
"directories": [d for d in top_items if os.path.isdir(os.path.join(project_root, d)) and d not in exclude_dirs],
|
||||||
"files": [f for f in top_items if os.path.isfile(os.path.join(project_root, f))][:20],
|
"files": [f for f in top_items if os.path.isfile(os.path.join(project_root, f)) and is_text_file(f)][:20],
|
||||||
"scope_limited": False,
|
"scope_limited": False,
|
||||||
}
|
}
|
||||||
except Exception:
|
except Exception:
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,100 @@
|
||||||
|
"""
|
||||||
|
Common file filtering logic for RAG and Audit Agents.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import fnmatch
|
||||||
|
from typing import List, Set, Optional
|
||||||
|
|
||||||
|
# Supported text file extensions (code and config).
#
# Entries are compared against the lower-cased suffix produced by
# os.path.splitext (see is_text_file), so each one must be lower-case and
# start with a dot.
TEXT_EXTENSIONS = {
    # Core languages
    ".py", ".js", ".ts", ".tsx", ".jsx", ".java", ".go", ".rs",
    ".cpp", ".c", ".h", ".cc", ".hh", ".hpp", ".hxx", ".cs", ".php", ".rb",
    ".kt", ".swift", ".dart", ".scala", ".sc", ".groovy", ".ktm", ".kts",
    # .NET (logic code)
    ".cshtml", ".vb", ".fs", ".fsi", ".fsx", ".asax", ".master", ".ascx", ".asmx", ".svc",
    # Data & config
    ".json", ".yml", ".yaml", ".toml", ".xml", ".properties", ".conf", ".ini",
    # Kept for parity with the scanner-specific lists this set replaces.
    ".cfg", ".plist", ".gradle",
    # Scripts & commands
    ".sh", ".bash", ".zsh", ".fish", ".ps1", ".bat", ".cmd", ".sql", ".pl", ".pm", ".t",
    # Web
    ".html", ".htm", ".css", ".vue", ".svelte", ".md", ".proto", ".graphql", ".gql",
    ".prisma", ".sol", ".dockerfile", ".tf", ".hcl", ".tfvars",
    # Stylesheet dialects (previously accepted by the agent configuration)
    ".scss", ".sass", ".less",
    # Others
    ".lua", ".hs", ".lhs", ".clj", ".cljs", ".cljc", ".edn", ".ex", ".exs",
    ".erl", ".hrl", ".m", ".mm", ".r", ".rmd",
}
|
||||||
|
|
||||||
|
# Directories to exclude.
#
# Most entries are plain directory names. Two kinds of entry need special
# handling by consumers, because a plain ``name in EXCLUDE_DIRS`` test compares
# a single path component literally:
#   * glob entries (e.g. "_ReSharper.*") only match when applied with fnmatch;
#   * multi-segment entries (e.g. "vendor/bundle") never equal one component
#     (here the parent "vendor" is excluded anyway).
EXCLUDE_DIRS = {
    # Build & dependencies
    "node_modules", "vendor", "dist", "build", "target", "out", "bin", "obj",
    "bower_components", "packages", "pkg", "Pods", ".gradle", ".m2",
    "vendor/bundle", ".bundle", "jspm_packages", "typings",
    # Virtual environments
    "venv", "env", ".env", "virtualenv", ".venv",
    # IDE & metadata
    ".git", ".svn", ".hg", ".vscode", ".idea", ".vs", "TestResults",
    "_ReSharper.*", ".settings", ".project", ".classpath", ".metadata",
    # Cache & logs
    "__pycache__", ".pytest_cache", "coverage", "htmlcov", ".nyc_output",
    ".cache", ".next", ".nuxt", ".dart_tool", "logs", "ipch",
    # Kept for parity with the per-tool block lists this set replaces.
    ".mypy_cache", ".tox", ".nox", ".coverage",
    # Cloud & infrastructure
    ".aws-sam", ".serverless", ".terraform", ".terraform.d", "_site",
    # Others
    "__MACOSX", "extern", "externals", "third-party", "3rdparty",
}
|
||||||
|
|
||||||
|
# Files to exclude (glob patterns, matched against the bare file name).
#
# fnmatch only folds case on case-insensitive platforms, so names that occur
# with different casing in the wild are listed in each spelling.
EXCLUDE_FILES = {
    # Lock files
    "package-lock.json", "npm-shrinkwrap.json", "yarn.lock", "pnpm-lock.yaml", "Cargo.lock",
    "poetry.lock", "composer.lock", "Gemfile.lock", "gradle.lockfile",
    "mix.lock", "pnpm-workspace.yaml", "shrinkwrap.yaml",
    # Static & binary
    "*.min.js", "*.min.css", "jquery.js", "jquery.min.js", "*.map",
    "*.pyc", "*.pyo", "*.pyd", "*.so", "*.dll", "*.exe", "*.o", "*.obj",
    "*.a", "*.lib", "*.jar", "*.war", "*.ear", "*.class",
    "*.svg", "*.ico", "*.woff*", "*.png", "*.jpg", "*.jpeg", "*.gif",
    # System & secrets
    ".DS_Store", "thumbs.db", "Thumbs.db", "desktop.ini", "*.pem", "*.crt", "*.key",
    # Temp & logs
    "*.log", "*.bak", "*.swp", "*.tmp", "tags",
    # IDE & project config (non-code)
    "*.suo", "*.user", "*.sln", "*.csproj", "*.vbproj", "*.fsproj",
    "*.props", "*.targets", "*.resx", "*.sln.docstates", "*.vshost.*", "*.pdb",
    "launchSettings.json", "dotnet-tools.json", ".ruby-version", ".nvmrc",
    # Generated code
    "*.Designer.cs", "*.Designer.vb", "*ModelSnapshot.cs", "*.generated.cs", "*.g.cs", "*.g.i.cs",
    # Large data files
    "haarcascade_*.xml",
}
|
||||||
|
|
||||||
|
def is_text_file(file_path: str) -> bool:
    """Return True when the path's extension is a supported text/code type.

    Only the final suffix of ``file_path`` is inspected, case-insensitively;
    the file itself is never opened.
    """
    _, suffix = os.path.splitext(file_path)
    return suffix.lower() in TEXT_EXTENSIONS
|
||||||
|
|
||||||
|
def should_exclude(rel_path: str, filename: str, exclude_patterns: Optional[List[str]] = None) -> bool:
    """Decide whether a file is filtered out by default rules or custom patterns.

    Checks are applied in order, and the first hit wins:

    1. ``filename`` matches one of the ``EXCLUDE_FILES`` glob patterns.
    2. Any directory component of ``rel_path`` is listed in ``EXCLUDE_DIRS``.
       Entries containing glob characters (e.g. ``_ReSharper.*``) are matched
       with ``fnmatch``. Entries containing ``/`` (e.g. ``vendor/bundle``) are
       matched against consecutive directory components.
    3. ``rel_path`` or ``filename`` matches one of the caller-supplied
       ``exclude_patterns``.

    Args:
        rel_path: File path relative to the project root; ``/`` and ``\\``
            separators are both accepted.
        filename: Base name of the file.
        exclude_patterns: Optional extra glob patterns; ``None`` or an empty
            list disables step 3.

    Returns:
        True if the file should be skipped, False otherwise.
    """
    # 1. Default file-name patterns.
    if any(fnmatch.fnmatch(filename, pattern) for pattern in EXCLUDE_FILES):
        return True

    # 2. Default directory rules. The last component is the file name itself
    #    and must not be treated as a directory.
    dir_parts = [part for part in rel_path.replace("\\", "/").split("/")[:-1] if part]
    if dir_parts:
        # Fast path: literal directory names.
        if any(part in EXCLUDE_DIRS for part in dir_parts):
            return True

        # Slow path: entries that a literal membership test can never match.
        dir_path = "/" + "/".join(dir_parts) + "/"
        for entry in EXCLUDE_DIRS:
            if "/" in entry:
                # Multi-segment entry: look for it as a run of whole components.
                if "/" + entry.strip("/") + "/" in dir_path:
                    return True
            elif any(ch in entry for ch in "*?["):
                if any(fnmatch.fnmatch(part, entry) for part in dir_parts):
                    return True

    # 3. Caller-supplied patterns, tried against both the path and the name.
    if exclude_patterns:
        for pattern in exclude_patterns:
            if fnmatch.fnmatch(rel_path, pattern) or fnmatch.fnmatch(filename, pattern):
                return True

    return False
|
||||||
|
|
@ -14,6 +14,8 @@ from functools import lru_cache
|
||||||
from pydantic import Field
|
from pydantic import Field
|
||||||
from pydantic_settings import BaseSettings
|
from pydantic_settings import BaseSettings
|
||||||
|
|
||||||
|
from app.core.file_filter import TEXT_EXTENSIONS, EXCLUDE_DIRS
|
||||||
|
|
||||||
|
|
||||||
class LogLevel(str, Enum):
|
class LogLevel(str, Enum):
|
||||||
"""Logging levels"""
|
"""Logging levels"""
|
||||||
|
|
@ -279,24 +281,11 @@ class AgentConfig(BaseSettings):
|
||||||
|
|
||||||
# ============ Security ============
|
# ============ Security ============
|
||||||
allowed_file_extensions: Set[str] = Field(
|
allowed_file_extensions: Set[str] = Field(
|
||||||
default={
|
default=TEXT_EXTENSIONS,
|
||||||
".py", ".js", ".ts", ".jsx", ".tsx", ".java", ".go", ".rb", ".php",
|
|
||||||
".c", ".cpp", ".h", ".hpp", ".cs", ".swift", ".kt", ".rs", ".scala",
|
|
||||||
".vue", ".svelte", ".html", ".css", ".scss", ".sass", ".less",
|
|
||||||
".json", ".yaml", ".yml", ".xml", ".toml", ".ini", ".conf",
|
|
||||||
".sql", ".graphql", ".proto", ".sh", ".bash", ".zsh", ".ps1",
|
|
||||||
".md", ".txt", ".rst", ".env.example", ".gitignore",
|
|
||||||
},
|
|
||||||
description="Allowed file extensions for analysis"
|
description="Allowed file extensions for analysis"
|
||||||
)
|
)
|
||||||
blocked_directories: Set[str] = Field(
|
blocked_directories: Set[str] = Field(
|
||||||
default={
|
default=EXCLUDE_DIRS,
|
||||||
"node_modules", "__pycache__", ".git", ".svn", ".hg",
|
|
||||||
"venv", ".venv", "env", ".env", "virtualenv",
|
|
||||||
"dist", "build", "target", "out", "bin", "obj",
|
|
||||||
".idea", ".vscode", ".vs", ".pytest_cache", ".mypy_cache",
|
|
||||||
"coverage", ".coverage", "htmlcov", ".tox", ".nox",
|
|
||||||
},
|
|
||||||
description="Directories to exclude from scanning"
|
description="Directories to exclude from scanning"
|
||||||
)
|
)
|
||||||
max_path_depth: int = Field(
|
max_path_depth: int = Field(
|
||||||
|
|
|
||||||
|
|
@ -11,6 +11,7 @@ from typing import Optional, List, Dict, Any
|
||||||
from pydantic import BaseModel, Field
|
from pydantic import BaseModel, Field
|
||||||
|
|
||||||
from .base import AgentTool, ToolResult
|
from .base import AgentTool, ToolResult
|
||||||
|
from app.core.file_filter import TEXT_EXTENSIONS, EXCLUDE_DIRS, EXCLUDE_FILES, is_text_file, should_exclude
|
||||||
|
|
||||||
|
|
||||||
class FileReadInput(BaseModel):
|
class FileReadInput(BaseModel):
|
||||||
|
|
@ -107,15 +108,9 @@ class FileReadTool(AgentTool):
|
||||||
if self.target_files and file_path not in self.target_files:
|
if self.target_files and file_path not in self.target_files:
|
||||||
return True
|
return True
|
||||||
|
|
||||||
# 检查排除模式
|
# 使用通用的筛选逻辑
|
||||||
for pattern in self.exclude_patterns:
|
filename = os.path.basename(file_path)
|
||||||
if fnmatch.fnmatch(file_path, pattern):
|
return should_exclude(file_path, filename, self.exclude_patterns)
|
||||||
return True
|
|
||||||
# 也检查文件名
|
|
||||||
if fnmatch.fnmatch(os.path.basename(file_path), pattern):
|
|
||||||
return True
|
|
||||||
|
|
||||||
return False
|
|
||||||
|
|
||||||
async def _execute(
|
async def _execute(
|
||||||
self,
|
self,
|
||||||
|
|
@ -275,7 +270,7 @@ class FileSearchTool(AgentTool):
|
||||||
self.target_files = set(target_files) if target_files else None
|
self.target_files = set(target_files) if target_files else None
|
||||||
|
|
||||||
# 从 exclude_patterns 中提取目录排除
|
# 从 exclude_patterns 中提取目录排除
|
||||||
self.exclude_dirs = set(self.DEFAULT_EXCLUDE_DIRS)
|
self.exclude_dirs = set(EXCLUDE_DIRS)
|
||||||
for pattern in self.exclude_patterns:
|
for pattern in self.exclude_patterns:
|
||||||
if pattern.endswith("/**"):
|
if pattern.endswith("/**"):
|
||||||
self.exclude_dirs.add(pattern[:-3])
|
self.exclude_dirs.add(pattern[:-3])
|
||||||
|
|
@ -370,13 +365,8 @@ class FileSearchTool(AgentTool):
|
||||||
if self.target_files and relative_path not in self.target_files:
|
if self.target_files and relative_path not in self.target_files:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# 检查排除模式
|
# 使用通用的筛选逻辑
|
||||||
should_skip = False
|
if not is_text_file(filename) or should_exclude(relative_path, filename, self.exclude_patterns):
|
||||||
for excl_pattern in self.exclude_patterns:
|
|
||||||
if fnmatch.fnmatch(relative_path, excl_pattern) or fnmatch.fnmatch(filename, excl_pattern):
|
|
||||||
should_skip = True
|
|
||||||
break
|
|
||||||
if should_skip:
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
|
@ -466,11 +456,6 @@ class ListFilesTool(AgentTool):
|
||||||
列出目录中的文件
|
列出目录中的文件
|
||||||
"""
|
"""
|
||||||
|
|
||||||
DEFAULT_EXCLUDE_DIRS = {
|
|
||||||
"node_modules", "vendor", "dist", "build", ".git",
|
|
||||||
"__pycache__", ".pytest_cache", "coverage",
|
|
||||||
}
|
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
project_root: str,
|
project_root: str,
|
||||||
|
|
@ -482,8 +467,8 @@ class ListFilesTool(AgentTool):
|
||||||
self.exclude_patterns = exclude_patterns or []
|
self.exclude_patterns = exclude_patterns or []
|
||||||
self.target_files = set(target_files) if target_files else None
|
self.target_files = set(target_files) if target_files else None
|
||||||
|
|
||||||
# 从 exclude_patterns 中提取目录排除
|
# 使用通用的排除目录
|
||||||
self.exclude_dirs = set(self.DEFAULT_EXCLUDE_DIRS)
|
self.exclude_dirs = set(EXCLUDE_DIRS)
|
||||||
for pattern in self.exclude_patterns:
|
for pattern in self.exclude_patterns:
|
||||||
# 如果是目录模式(如 node_modules/**),提取目录名
|
# 如果是目录模式(如 node_modules/**),提取目录名
|
||||||
if pattern.endswith("/**"):
|
if pattern.endswith("/**"):
|
||||||
|
|
@ -560,13 +545,8 @@ class ListFilesTool(AgentTool):
|
||||||
if self.target_files and relative_path not in self.target_files:
|
if self.target_files and relative_path not in self.target_files:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# 检查排除模式
|
# 使用通用的筛选逻辑
|
||||||
should_skip = False
|
if should_exclude(relative_path, filename, self.exclude_patterns):
|
||||||
for excl_pattern in self.exclude_patterns:
|
|
||||||
if fnmatch.fnmatch(relative_path, excl_pattern) or fnmatch.fnmatch(filename, excl_pattern):
|
|
||||||
should_skip = True
|
|
||||||
break
|
|
||||||
if should_skip:
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
files.append(relative_path)
|
files.append(relative_path)
|
||||||
|
|
@ -628,13 +608,8 @@ class ListFilesTool(AgentTool):
|
||||||
if pattern and not fnmatch.fnmatch(item, pattern):
|
if pattern and not fnmatch.fnmatch(item, pattern):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# 检查排除模式
|
# 使用通用的筛选逻辑
|
||||||
should_skip = False
|
if should_exclude(relative_path, item, self.exclude_patterns):
|
||||||
for excl_pattern in self.exclude_patterns:
|
|
||||||
if fnmatch.fnmatch(relative_path, excl_pattern) or fnmatch.fnmatch(item, excl_pattern):
|
|
||||||
should_skip = True
|
|
||||||
break
|
|
||||||
if should_skip:
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
files.append(relative_path)
|
files.append(relative_path)
|
||||||
|
|
|
||||||
|
|
@ -17,6 +17,7 @@ from pydantic import BaseModel, Field
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
|
|
||||||
from .base import AgentTool, ToolResult
|
from .base import AgentTool, ToolResult
|
||||||
|
from app.core.file_filter import TEXT_EXTENSIONS, EXCLUDE_DIRS, EXCLUDE_FILES, is_text_file, should_exclude
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
@ -213,43 +214,24 @@ class SmartScanTool(AgentTool):
|
||||||
if not full_path.startswith(os.path.normpath(self.project_root)):
|
if not full_path.startswith(os.path.normpath(self.project_root)):
|
||||||
return []
|
return []
|
||||||
|
|
||||||
files = []
|
|
||||||
|
|
||||||
# 排除目录
|
|
||||||
exclude_dirs = {
|
|
||||||
'node_modules', '__pycache__', '.git', 'venv', '.venv',
|
|
||||||
'build', 'dist', 'target', '.idea', '.vscode', 'vendor',
|
|
||||||
'coverage', '.pytest_cache', '.mypy_cache',
|
|
||||||
}
|
|
||||||
|
|
||||||
# 支持的代码文件扩展名
|
|
||||||
code_extensions = {
|
|
||||||
'.py', '.js', '.ts', '.jsx', '.tsx', '.java', '.php',
|
|
||||||
'.go', '.rb', '.cs', '.c', '.cpp', '.h', '.hpp',
|
|
||||||
'.swift', '.m', '.mm', '.kt', '.rs', '.sh', '.bat',
|
|
||||||
'.vue', '.html', '.htm', '.xml', '.gradle', '.properties'
|
|
||||||
}
|
|
||||||
|
|
||||||
# 配置文件扩展名
|
|
||||||
config_extensions = {'.json', '.yaml', '.yml', '.env', '.ini', '.cfg', '.plist', '.conf'}
|
|
||||||
|
|
||||||
all_extensions = code_extensions | config_extensions
|
|
||||||
|
|
||||||
if os.path.isfile(full_path):
|
if os.path.isfile(full_path):
|
||||||
return [os.path.relpath(full_path, self.project_root)]
|
rel_path = os.path.relpath(full_path, self.project_root)
|
||||||
|
return [rel_path] if is_text_file(full_path) else []
|
||||||
|
|
||||||
|
files = []
|
||||||
|
|
||||||
for root, dirs, filenames in os.walk(full_path):
|
for root, dirs, filenames in os.walk(full_path):
|
||||||
# 过滤排除目录
|
# 过滤排除目录
|
||||||
dirs[:] = [d for d in dirs if d not in exclude_dirs]
|
dirs[:] = [d for d in dirs if d not in EXCLUDE_DIRS]
|
||||||
|
|
||||||
for filename in filenames:
|
for filename in filenames:
|
||||||
ext = os.path.splitext(filename)[1].lower()
|
|
||||||
if ext not in all_extensions:
|
|
||||||
continue
|
|
||||||
|
|
||||||
file_path = os.path.join(root, filename)
|
file_path = os.path.join(root, filename)
|
||||||
rel_path = os.path.relpath(file_path, self.project_root)
|
rel_path = os.path.relpath(file_path, self.project_root)
|
||||||
|
|
||||||
|
# 使用通用的筛选逻辑
|
||||||
|
if not is_text_file(filename) or should_exclude(rel_path, filename):
|
||||||
|
continue
|
||||||
|
|
||||||
# 快速模式:只扫描高风险文件
|
# 快速模式:只扫描高风险文件
|
||||||
if quick_mode:
|
if quick_mode:
|
||||||
is_high_risk = any(
|
is_high_risk = any(
|
||||||
|
|
|
||||||
|
|
@ -29,70 +29,7 @@ logger = logging.getLogger(__name__)
|
||||||
INDEX_VERSION = "2.0"
|
INDEX_VERSION = "2.0"
|
||||||
|
|
||||||
|
|
||||||
# 支持的文本文件扩展名
|
from app.core.file_filter import TEXT_EXTENSIONS, EXCLUDE_DIRS, EXCLUDE_FILES, is_text_file, should_exclude
|
||||||
TEXT_EXTENSIONS = {
|
|
||||||
# 核心语言
|
|
||||||
".py", ".js", ".ts", ".tsx", ".jsx", ".java", ".go", ".rs",
|
|
||||||
".cpp", ".c", ".h", ".cc", ".hh", ".hpp", ".hxx", ".cs", ".php", ".rb",
|
|
||||||
".kt", ".swift", ".dart", ".scala", ".sc", ".groovy", ".ktm", ".kts",
|
|
||||||
# .NET (逻辑代码)
|
|
||||||
".cshtml", ".vb", ".fs", ".fsi", ".fsx", ".asax", ".master", ".ascx", ".asmx", ".svc",
|
|
||||||
# 数据与配置
|
|
||||||
".json", ".yml", ".yaml", ".toml", ".xml", ".properties", ".conf", ".ini",
|
|
||||||
# 脚本与命令
|
|
||||||
".sh", ".bash", ".zsh", ".fish", ".ps1", ".bat", ".cmd", ".sql", ".pl", ".pm", ".t",
|
|
||||||
# Web
|
|
||||||
".html", ".css", ".vue", ".svelte", ".md", ".proto", ".graphql", ".gql",
|
|
||||||
".prisma", ".sol", ".dockerfile", ".tf", ".hcl", ".tfvars",
|
|
||||||
# 其他
|
|
||||||
".lua", ".hs", ".lhs", ".clj", ".cljs", ".cljc", ".edn", ".ex", ".exs",
|
|
||||||
".erl", ".hrl", ".m", ".mm", ".r", ".rmd"
|
|
||||||
}
|
|
||||||
|
|
||||||
# 排除的目录
|
|
||||||
EXCLUDE_DIRS = {
|
|
||||||
# 构建与依赖
|
|
||||||
"node_modules", "vendor", "dist", "build", "target", "out", "bin", "obj",
|
|
||||||
"bower_components", "packages", "pkg", "Pods", ".gradle", ".m2",
|
|
||||||
"vendor/bundle", ".bundle", "jspm_packages", "typings",
|
|
||||||
# 虚拟环境
|
|
||||||
"venv", "env", ".env", "virtualenv", ".venv",
|
|
||||||
# IDE 与元数据
|
|
||||||
".git", ".svn", ".hg", ".vscode", ".idea", ".vs", "TestResults",
|
|
||||||
"_ReSharper.*", ".settings", ".project", ".classpath", ".metadata",
|
|
||||||
# 缓存与日志
|
|
||||||
"__pycache__", ".pytest_cache", "coverage", "htmlcov", ".nyc_output",
|
|
||||||
".cache", ".next", ".nuxt", ".dart_tool", "htmlcov", "logs", "ipch",
|
|
||||||
# 云与基础设施
|
|
||||||
".aws-sam", ".serverless", ".terraform", ".terraform.d", "_site",
|
|
||||||
# 其他
|
|
||||||
"__MACOSX", "extern", "externals", "third-party", "3rdparty"
|
|
||||||
}
|
|
||||||
|
|
||||||
# 排除的文件
|
|
||||||
EXCLUDE_FILES = {
|
|
||||||
# 锁文件 (通常不索引,因为内容太长且无语义)
|
|
||||||
"package-lock.json", "yarn.lock", "pnpm-lock.yaml", "Cargo.lock",
|
|
||||||
"poetry.lock", "composer.lock", "Gemfile.lock", "gradle.lockfile",
|
|
||||||
"mix.lock", "pnpm-workspace.yaml", "shrinkwrap.yaml",
|
|
||||||
# 静态资源与二进制
|
|
||||||
"*.min.js", "*.min.css", "jquery.js", "jquery.min.js", "*.map",
|
|
||||||
"*.pyc", "*.pyo", "*.pyd", "*.so", "*.dll", "*.exe", "*.o", "*.obj",
|
|
||||||
"*.a", "*.lib", "*.jar", "*.war", "*.ear", "*.class",
|
|
||||||
"*.svg", "*.ico", "*.woff*", "*.png", "*.jpg", "*.jpeg", "*.gif",
|
|
||||||
# 系统与秘密
|
|
||||||
".DS_Store", "thumbs.db", "desktop.ini", "*.pem", "*.crt", "*.key",
|
|
||||||
# 临时与日志
|
|
||||||
"*.log", "*.bak", "*.swp", "*.tmp", "tags",
|
|
||||||
# IDE 与项目配置 (非代码)
|
|
||||||
"*.suo", "*.user", "*.sln", "*.csproj", "*.vbproj", "*.fsproj",
|
|
||||||
"*.props", "*.targets", "*.resx", "*.sln.docstates", "*.vshost.*", "*.pdb",
|
|
||||||
"launchSettings.json", "dotnet-tools.json", ".ruby-version", ".nvmrc",
|
|
||||||
# 自动生成的代码 (噪声)
|
|
||||||
"*.Designer.cs", "*.Designer.vb", "*ModelSnapshot.cs", "*.generated.cs", "*.g.cs", "*.g.i.cs",
|
|
||||||
# 大型数据文件 (非代码)
|
|
||||||
"haarcascade_*.xml"
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -1475,18 +1412,10 @@ class CodeIndexer:
|
||||||
dirs[:] = [d for d in dirs if d not in EXCLUDE_DIRS]
|
dirs[:] = [d for d in dirs if d not in EXCLUDE_DIRS]
|
||||||
|
|
||||||
for filename in filenames:
|
for filename in filenames:
|
||||||
# 检查扩展名
|
relative_path = os.path.relpath(os.path.join(root, filename), directory)
|
||||||
ext = os.path.splitext(filename)[1].lower()
|
|
||||||
if ext not in TEXT_EXTENSIONS:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# 检查排除文件 (支持通配符)
|
# 检查是否为文本文件且不应排除
|
||||||
should_skip_file = False
|
if not is_text_file(filename) or should_exclude(relative_path, filename, exclude_patterns):
|
||||||
for pattern in EXCLUDE_FILES:
|
|
||||||
if fnmatch.fnmatch(filename, pattern):
|
|
||||||
should_skip_file = True
|
|
||||||
break
|
|
||||||
if should_skip_file:
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# 排除疑似压缩过的文件 (通过行长度和内容分析)
|
# 排除疑似压缩过的文件 (通过行长度和内容分析)
|
||||||
|
|
@ -1509,18 +1438,6 @@ class CodeIndexer:
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
relative_path = os.path.relpath(file_path, directory)
|
|
||||||
|
|
||||||
# 检查排除模式
|
|
||||||
excluded = False
|
|
||||||
for pattern in exclude_patterns:
|
|
||||||
if fnmatch.fnmatch(relative_path, pattern) or fnmatch.fnmatch(filename, pattern):
|
|
||||||
excluded = True
|
|
||||||
break
|
|
||||||
|
|
||||||
if excluded:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# 检查包含模式
|
# 检查包含模式
|
||||||
if include_patterns:
|
if include_patterns:
|
||||||
included = False
|
included = False
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue