Fast scan mode now applies the same file-exclusion patterns that are used during RAG embedding.
Build and Push CodeReview / build (push) Waiting to run Details

This commit is contained in:
vinland100 2026-01-09 16:41:40 +08:00
parent a11542c4bf
commit de88b69f86
3 changed files with 11 additions and 41 deletions

View File

@ -1044,7 +1044,7 @@ async def _collect_project_info(
continue
# 使用通用的筛选逻辑
if should_exclude(relative_path, f, exclude_patterns):
if not is_text_file(f) or should_exclude(relative_path, f, exclude_patterns):
continue
info["file_count"] += 1

View File

@ -23,6 +23,7 @@ from app.services.llm.service import LLMService
from app.services.scanner import task_control, is_text_file, should_exclude, get_language_from_path, get_analysis_config
from app.services.zip_storage import load_project_zip, save_project_zip, has_project_zip
from app.core.config import settings
from app.core.file_filter import EXCLUDE_DIRS
router = APIRouter()
@ -36,12 +37,7 @@ def normalize_path(path: str) -> str:
return path.replace("\\", "/")
# 支持的文件扩展名
TEXT_EXTENSIONS = [
".js", ".ts", ".tsx", ".jsx", ".py", ".java", ".go", ".rs",
".cpp", ".c", ".h", ".cc", ".hh", ".cs", ".php", ".rb",
".kt", ".swift", ".sql", ".sh", ".json", ".yml", ".yaml"
]
# TEXT_EXTENSIONS has been moved to app.core.file_filter and is used via app.services.scanner.is_text_file
async def process_zip_task(task_id: str, file_path: str, db_session_factory, user_config: dict = None):
@ -74,7 +70,7 @@ async def process_zip_task(task_id: str, file_path: str, db_session_factory, use
files_to_scan = []
for root, dirs, files in os.walk(extract_dir):
# 排除常见非代码目录
dirs[:] = [d for d in dirs if d not in ['node_modules', '__pycache__', '.git', 'dist', 'build', 'vendor']]
dirs[:] = [d for d in dirs if d not in EXCLUDE_DIRS]
for file in files:
full_path = Path(root) / file

View File

@ -2,6 +2,7 @@
仓库扫描服务 - 支持GitHub, GitLab, Gitea 仓库扫描
"""
import os
import asyncio
import httpx
from typing import List, Dict, Any, Optional
@ -14,6 +15,7 @@ from app.models.audit import AuditTask, AuditIssue
from app.models.project import Project
from app.services.llm.service import LLMService
from app.core.config import settings
from app.core.file_filter import is_text_file as core_is_text_file, should_exclude as core_should_exclude, TEXT_EXTENSIONS as CORE_TEXT_EXTENSIONS
def get_analysis_config(user_config: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
@ -35,46 +37,18 @@ def get_analysis_config(user_config: Optional[Dict[str, Any]] = None) -> Dict[st
}
# 支持的文本文件扩展名
TEXT_EXTENSIONS = [
".py", ".js", ".ts", ".tsx", ".jsx", ".java", ".go", ".rs",
".cpp", ".c", ".h", ".cc", ".hh", ".hpp", ".hxx", ".cs", ".php", ".rb",
".kt", ".swift", ".dart", ".scala", ".sc", ".groovy", ".ktm", ".kts",
".cshtml", ".vb", ".fs", ".fsi", ".fsx", ".sln", ".csproj", ".vbproj",
".fsproj", ".config", ".asax", ".master", ".ascx", ".asmx", ".svc",
".json", ".yml", ".yaml", ".toml", ".xml", ".properties", ".conf", ".ini",
".sh", ".bash", ".zsh", ".fish", ".ps1", ".bat", ".cmd", ".sql", ".pl", ".pm", ".t",
".html", ".css", ".vue", ".svelte", ".md", ".proto", ".graphql", ".gql",
".prisma", ".sol", ".dockerfile", ".tf", ".hcl", ".tfvars",
".lua", ".hs", ".lhs", ".clj", ".cljs", ".cljc", ".edn", ".ex", ".exs",
".erl", ".hrl", ".m", ".mm", ".r", ".rmd"
]
# 排除的目录和文件模式
EXCLUDE_PATTERNS = [
# 常用目录
"node_modules/", "vendor/", "dist/", "build/", "target/", "out/", "bin/", "obj/",
".git/", ".svn/", ".hg/", ".vscode/", ".idea/", ".vs/", ".settings/",
".gradle/", ".m2/", "venv/", "env/", ".env/", "__pycache__/",
".pytest_cache/", "coverage/", ".nyc_output/", "bower_components/",
"packages/", "pkg/", "Pods/", "TestResults/", "_ReSharper.*",
# 常见锁文件与二进制
"package-lock.json", "yarn.lock", "pnpm-lock.yaml", "Cargo.lock",
"poetry.lock", "composer.lock", "Gemfile.lock", "gradle.lockfile",
".min.js", ".min.css", ".map", ".DS_Store", "*.pdb", "*.dll", "*.exe",
"*.o", "*.obj", "*.a", "*.lib", "*.jar", "*.war", "*.class"
]
# 支持的文本文件扩展名使用全局定义
TEXT_EXTENSIONS = list(CORE_TEXT_EXTENSIONS)
def is_text_file(path: str) -> bool:
"""检查是否为文本文件"""
return any(path.lower().endswith(ext) for ext in TEXT_EXTENSIONS)
return core_is_text_file(path)
def should_exclude(path: str, exclude_patterns: List[str] = None) -> bool:
"""检查是否应该排除该文件"""
all_patterns = EXCLUDE_PATTERNS + (exclude_patterns or [])
return any(pattern in path for pattern in all_patterns)
filename = os.path.basename(path)
return core_should_exclude(path, filename, exclude_patterns)
def get_language_from_path(path: str) -> str: