Fast scan mode now applies the same file-exclusion rules that are used during RAG embedding.
Build and Push CodeReview / build (push) Waiting to run Details

This commit is contained in:
vinland100 2026-01-09 16:41:40 +08:00
parent a11542c4bf
commit de88b69f86
3 changed files with 11 additions and 41 deletions

View File

@ -1044,7 +1044,7 @@ async def _collect_project_info(
continue continue
# 使用通用的筛选逻辑 # 使用通用的筛选逻辑
if should_exclude(relative_path, f, exclude_patterns): if not is_text_file(f) or should_exclude(relative_path, f, exclude_patterns):
continue continue
info["file_count"] += 1 info["file_count"] += 1

View File

@ -23,6 +23,7 @@ from app.services.llm.service import LLMService
from app.services.scanner import task_control, is_text_file, should_exclude, get_language_from_path, get_analysis_config from app.services.scanner import task_control, is_text_file, should_exclude, get_language_from_path, get_analysis_config
from app.services.zip_storage import load_project_zip, save_project_zip, has_project_zip from app.services.zip_storage import load_project_zip, save_project_zip, has_project_zip
from app.core.config import settings from app.core.config import settings
from app.core.file_filter import EXCLUDE_DIRS
router = APIRouter() router = APIRouter()
@ -36,12 +37,7 @@ def normalize_path(path: str) -> str:
return path.replace("\\", "/") return path.replace("\\", "/")
# 支持的文件扩展名 # TEXT_EXTENSIONS has been moved to app.core.file_filter and is used via app.services.scanner.is_text_file
TEXT_EXTENSIONS = [
".js", ".ts", ".tsx", ".jsx", ".py", ".java", ".go", ".rs",
".cpp", ".c", ".h", ".cc", ".hh", ".cs", ".php", ".rb",
".kt", ".swift", ".sql", ".sh", ".json", ".yml", ".yaml"
]
async def process_zip_task(task_id: str, file_path: str, db_session_factory, user_config: dict = None): async def process_zip_task(task_id: str, file_path: str, db_session_factory, user_config: dict = None):
@ -74,7 +70,7 @@ async def process_zip_task(task_id: str, file_path: str, db_session_factory, use
files_to_scan = [] files_to_scan = []
for root, dirs, files in os.walk(extract_dir): for root, dirs, files in os.walk(extract_dir):
# 排除常见非代码目录 # 排除常见非代码目录
dirs[:] = [d for d in dirs if d not in ['node_modules', '__pycache__', '.git', 'dist', 'build', 'vendor']] dirs[:] = [d for d in dirs if d not in EXCLUDE_DIRS]
for file in files: for file in files:
full_path = Path(root) / file full_path = Path(root) / file

View File

@ -2,6 +2,7 @@
仓库扫描服务 - 支持GitHub, GitLab Gitea 仓库扫描 仓库扫描服务 - 支持GitHub, GitLab Gitea 仓库扫描
""" """
import os
import asyncio import asyncio
import httpx import httpx
from typing import List, Dict, Any, Optional from typing import List, Dict, Any, Optional
@ -14,6 +15,7 @@ from app.models.audit import AuditTask, AuditIssue
from app.models.project import Project from app.models.project import Project
from app.services.llm.service import LLMService from app.services.llm.service import LLMService
from app.core.config import settings from app.core.config import settings
from app.core.file_filter import is_text_file as core_is_text_file, should_exclude as core_should_exclude, TEXT_EXTENSIONS as CORE_TEXT_EXTENSIONS
def get_analysis_config(user_config: Optional[Dict[str, Any]] = None) -> Dict[str, Any]: def get_analysis_config(user_config: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
@ -35,46 +37,18 @@ def get_analysis_config(user_config: Optional[Dict[str, Any]] = None) -> Dict[st
} }
# 支持的文本文件扩展名 # 支持的文本文件扩展名使用全局定义
TEXT_EXTENSIONS = [ TEXT_EXTENSIONS = list(CORE_TEXT_EXTENSIONS)
".py", ".js", ".ts", ".tsx", ".jsx", ".java", ".go", ".rs",
".cpp", ".c", ".h", ".cc", ".hh", ".hpp", ".hxx", ".cs", ".php", ".rb",
".kt", ".swift", ".dart", ".scala", ".sc", ".groovy", ".ktm", ".kts",
".cshtml", ".vb", ".fs", ".fsi", ".fsx", ".sln", ".csproj", ".vbproj",
".fsproj", ".config", ".asax", ".master", ".ascx", ".asmx", ".svc",
".json", ".yml", ".yaml", ".toml", ".xml", ".properties", ".conf", ".ini",
".sh", ".bash", ".zsh", ".fish", ".ps1", ".bat", ".cmd", ".sql", ".pl", ".pm", ".t",
".html", ".css", ".vue", ".svelte", ".md", ".proto", ".graphql", ".gql",
".prisma", ".sol", ".dockerfile", ".tf", ".hcl", ".tfvars",
".lua", ".hs", ".lhs", ".clj", ".cljs", ".cljc", ".edn", ".ex", ".exs",
".erl", ".hrl", ".m", ".mm", ".r", ".rmd"
]
# 排除的目录和文件模式
EXCLUDE_PATTERNS = [
# 常用目录
"node_modules/", "vendor/", "dist/", "build/", "target/", "out/", "bin/", "obj/",
".git/", ".svn/", ".hg/", ".vscode/", ".idea/", ".vs/", ".settings/",
".gradle/", ".m2/", "venv/", "env/", ".env/", "__pycache__/",
".pytest_cache/", "coverage/", ".nyc_output/", "bower_components/",
"packages/", "pkg/", "Pods/", "TestResults/", "_ReSharper.*",
# 常见锁文件与二进制
"package-lock.json", "yarn.lock", "pnpm-lock.yaml", "Cargo.lock",
"poetry.lock", "composer.lock", "Gemfile.lock", "gradle.lockfile",
".min.js", ".min.css", ".map", ".DS_Store", "*.pdb", "*.dll", "*.exe",
"*.o", "*.obj", "*.a", "*.lib", "*.jar", "*.war", "*.class"
]
def is_text_file(path: str) -> bool: def is_text_file(path: str) -> bool:
"""检查是否为文本文件""" """检查是否为文本文件"""
return any(path.lower().endswith(ext) for ext in TEXT_EXTENSIONS) return core_is_text_file(path)
def should_exclude(path: str, exclude_patterns: List[str] = None) -> bool: def should_exclude(path: str, exclude_patterns: List[str] = None) -> bool:
"""检查是否应该排除该文件""" """检查是否应该排除该文件"""
all_patterns = EXCLUDE_PATTERNS + (exclude_patterns or []) filename = os.path.basename(path)
return any(pattern in path for pattern in all_patterns) return core_should_exclude(path, filename, exclude_patterns)
def get_language_from_path(path: str) -> str: def get_language_from_path(path: str) -> str: