Fast scan mode now applies the same file-exclusion rules that are used during RAG embedding.
Build and Push CodeReview / build (push) Waiting to run
Details
Build and Push CodeReview / build (push) Waiting to run
Details
This commit is contained in:
parent
a11542c4bf
commit
de88b69f86
|
|
@ -1044,7 +1044,7 @@ async def _collect_project_info(
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# 使用通用的筛选逻辑
|
# 使用通用的筛选逻辑
|
||||||
if should_exclude(relative_path, f, exclude_patterns):
|
if not is_text_file(f) or should_exclude(relative_path, f, exclude_patterns):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
info["file_count"] += 1
|
info["file_count"] += 1
|
||||||
|
|
|
||||||
|
|
@ -23,6 +23,7 @@ from app.services.llm.service import LLMService
|
||||||
from app.services.scanner import task_control, is_text_file, should_exclude, get_language_from_path, get_analysis_config
|
from app.services.scanner import task_control, is_text_file, should_exclude, get_language_from_path, get_analysis_config
|
||||||
from app.services.zip_storage import load_project_zip, save_project_zip, has_project_zip
|
from app.services.zip_storage import load_project_zip, save_project_zip, has_project_zip
|
||||||
from app.core.config import settings
|
from app.core.config import settings
|
||||||
|
from app.core.file_filter import EXCLUDE_DIRS
|
||||||
|
|
||||||
router = APIRouter()
|
router = APIRouter()
|
||||||
|
|
||||||
|
|
@ -36,12 +37,7 @@ def normalize_path(path: str) -> str:
|
||||||
return path.replace("\\", "/")
|
return path.replace("\\", "/")
|
||||||
|
|
||||||
|
|
||||||
# 支持的文件扩展名
|
# TEXT_EXTENSIONS has been moved to app.core.file_filter and is used via app.services.scanner.is_text_file
|
||||||
TEXT_EXTENSIONS = [
|
|
||||||
".js", ".ts", ".tsx", ".jsx", ".py", ".java", ".go", ".rs",
|
|
||||||
".cpp", ".c", ".h", ".cc", ".hh", ".cs", ".php", ".rb",
|
|
||||||
".kt", ".swift", ".sql", ".sh", ".json", ".yml", ".yaml"
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
async def process_zip_task(task_id: str, file_path: str, db_session_factory, user_config: dict = None):
|
async def process_zip_task(task_id: str, file_path: str, db_session_factory, user_config: dict = None):
|
||||||
|
|
@ -74,7 +70,7 @@ async def process_zip_task(task_id: str, file_path: str, db_session_factory, use
|
||||||
files_to_scan = []
|
files_to_scan = []
|
||||||
for root, dirs, files in os.walk(extract_dir):
|
for root, dirs, files in os.walk(extract_dir):
|
||||||
# 排除常见非代码目录
|
# 排除常见非代码目录
|
||||||
dirs[:] = [d for d in dirs if d not in ['node_modules', '__pycache__', '.git', 'dist', 'build', 'vendor']]
|
dirs[:] = [d for d in dirs if d not in EXCLUDE_DIRS]
|
||||||
|
|
||||||
for file in files:
|
for file in files:
|
||||||
full_path = Path(root) / file
|
full_path = Path(root) / file
|
||||||
|
|
|
||||||
|
|
@ -2,6 +2,7 @@
|
||||||
仓库扫描服务 - 支持GitHub, GitLab 和 Gitea 仓库扫描
|
仓库扫描服务 - 支持GitHub, GitLab 和 Gitea 仓库扫描
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
import asyncio
|
import asyncio
|
||||||
import httpx
|
import httpx
|
||||||
from typing import List, Dict, Any, Optional
|
from typing import List, Dict, Any, Optional
|
||||||
|
|
@ -14,6 +15,7 @@ from app.models.audit import AuditTask, AuditIssue
|
||||||
from app.models.project import Project
|
from app.models.project import Project
|
||||||
from app.services.llm.service import LLMService
|
from app.services.llm.service import LLMService
|
||||||
from app.core.config import settings
|
from app.core.config import settings
|
||||||
|
from app.core.file_filter import is_text_file as core_is_text_file, should_exclude as core_should_exclude, TEXT_EXTENSIONS as CORE_TEXT_EXTENSIONS
|
||||||
|
|
||||||
|
|
||||||
def get_analysis_config(user_config: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
|
def get_analysis_config(user_config: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
|
||||||
|
|
@ -35,46 +37,18 @@ def get_analysis_config(user_config: Optional[Dict[str, Any]] = None) -> Dict[st
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
# 支持的文本文件扩展名
|
# 支持的文本文件扩展名使用全局定义
|
||||||
TEXT_EXTENSIONS = [
|
TEXT_EXTENSIONS = list(CORE_TEXT_EXTENSIONS)
|
||||||
".py", ".js", ".ts", ".tsx", ".jsx", ".java", ".go", ".rs",
|
|
||||||
".cpp", ".c", ".h", ".cc", ".hh", ".hpp", ".hxx", ".cs", ".php", ".rb",
|
|
||||||
".kt", ".swift", ".dart", ".scala", ".sc", ".groovy", ".ktm", ".kts",
|
|
||||||
".cshtml", ".vb", ".fs", ".fsi", ".fsx", ".sln", ".csproj", ".vbproj",
|
|
||||||
".fsproj", ".config", ".asax", ".master", ".ascx", ".asmx", ".svc",
|
|
||||||
".json", ".yml", ".yaml", ".toml", ".xml", ".properties", ".conf", ".ini",
|
|
||||||
".sh", ".bash", ".zsh", ".fish", ".ps1", ".bat", ".cmd", ".sql", ".pl", ".pm", ".t",
|
|
||||||
".html", ".css", ".vue", ".svelte", ".md", ".proto", ".graphql", ".gql",
|
|
||||||
".prisma", ".sol", ".dockerfile", ".tf", ".hcl", ".tfvars",
|
|
||||||
".lua", ".hs", ".lhs", ".clj", ".cljs", ".cljc", ".edn", ".ex", ".exs",
|
|
||||||
".erl", ".hrl", ".m", ".mm", ".r", ".rmd"
|
|
||||||
]
|
|
||||||
|
|
||||||
# 排除的目录和文件模式
|
|
||||||
EXCLUDE_PATTERNS = [
|
|
||||||
# 常用目录
|
|
||||||
"node_modules/", "vendor/", "dist/", "build/", "target/", "out/", "bin/", "obj/",
|
|
||||||
".git/", ".svn/", ".hg/", ".vscode/", ".idea/", ".vs/", ".settings/",
|
|
||||||
".gradle/", ".m2/", "venv/", "env/", ".env/", "__pycache__/",
|
|
||||||
".pytest_cache/", "coverage/", ".nyc_output/", "bower_components/",
|
|
||||||
"packages/", "pkg/", "Pods/", "TestResults/", "_ReSharper.*",
|
|
||||||
# 常见锁文件与二进制
|
|
||||||
"package-lock.json", "yarn.lock", "pnpm-lock.yaml", "Cargo.lock",
|
|
||||||
"poetry.lock", "composer.lock", "Gemfile.lock", "gradle.lockfile",
|
|
||||||
".min.js", ".min.css", ".map", ".DS_Store", "*.pdb", "*.dll", "*.exe",
|
|
||||||
"*.o", "*.obj", "*.a", "*.lib", "*.jar", "*.war", "*.class"
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
def is_text_file(path: str) -> bool:
|
def is_text_file(path: str) -> bool:
|
||||||
"""检查是否为文本文件"""
|
"""检查是否为文本文件"""
|
||||||
return any(path.lower().endswith(ext) for ext in TEXT_EXTENSIONS)
|
return core_is_text_file(path)
|
||||||
|
|
||||||
|
|
||||||
def should_exclude(path: str, exclude_patterns: List[str] = None) -> bool:
|
def should_exclude(path: str, exclude_patterns: List[str] = None) -> bool:
|
||||||
"""检查是否应该排除该文件"""
|
"""检查是否应该排除该文件"""
|
||||||
all_patterns = EXCLUDE_PATTERNS + (exclude_patterns or [])
|
filename = os.path.basename(path)
|
||||||
return any(pattern in path for pattern in all_patterns)
|
return core_should_exclude(path, filename, exclude_patterns)
|
||||||
|
|
||||||
|
|
||||||
def get_language_from_path(path: str) -> str:
|
def get_language_from_path(path: str) -> str:
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue