feat: Centralize repository synchronization and indexing into `_ensure_indexed` for PR and chat events, add empty chat query handling, and improve Gitea URL rewriting.

This commit is contained in:
vinland100 2025-12-31 16:57:42 +08:00
parent b401a26b10
commit df8796e6e3
1 changed files with 80 additions and 34 deletions

View File

@ -72,28 +72,12 @@ class CIService:
logger.error(f"Error creating project: {e}")
return
# 2. Clone/Update Repo & Indexing (RAG)
try:
repo_path = await self._prepare_repository(project, repo_url, branch, settings.GITEA_BOT_TOKEN)
except Exception as e:
logger.error(f"Git operation failed: {e}")
# If clone fails, we can't proceed with RAG, but we shouldn't crash
# 2. Sync Repository and Index
repo_path = await self._ensure_indexed(project, repo, branch)
if not repo_path:
return
try:
# 3. Incremental Indexing
indexer = CodeIndexer(
collection_name=f"ci_{project.id}",
persist_directory=str(CI_VECTOR_DB_DIR / project.id)
)
# Iterate over the generator to execute indexing
async for progress in indexer.smart_index_directory(
directory=repo_path,
update_mode=IndexUpdateMode.INCREMENTAL
):
if progress.processed_files % 10 == 0:
logger.info(f"Indexing progress: {progress.processed_files}/{progress.total_files}")
# 4. Analyze Diff & Retrieve Context
diff_text = await self._get_pr_diff(repo, pr_number)
if not diff_text:
@ -179,20 +163,16 @@ class CIService:
# We need a dummy PR object if we are creating project from chat, or we just fetch by repo
# Since _get_or_create_project needs PR info to determine branch/owner, we might need a distinct method
# or simplified flow.
project = await self._get_project_by_repo(repo.get("clone_url"))
# 1. Get Project (or Create if discovered via Chat first)
repo_url = repo.get("clone_url")
project = await self._get_project_by_repo(repo_url)
if not project:
# If project doesn't exist, we try to create it using available repo info
# We construct a minimal "pseudo-PR" dict if needed, or better:
# We assume if we are chatting on a PR, we can get PR details via API later
# For now, let's just Try to Find Project. If not found, we CANNOT proceed easily without syncing.
# But user wants "Auto Discovery".
# Let's try to create it.
try:
# Mock a PR object for creation purposes (minimal fields)
# Mock a PR object for creation
mock_pr = {
"number": issue.get("number"),
"head": {"ref": repo.get("default_branch", "main"), "sha": "HEAD"}, # Fallback
"head": {"ref": repo.get("default_branch", "main"), "sha": "HEAD"},
"base": {"ref": repo.get("default_branch", "main")}
}
project = await self._get_or_create_project(repo, mock_pr)
@ -204,22 +184,36 @@ class CIService:
logger.warning("Project could not be determined for chat event")
return
# 2. Retrieve Context (RAG)
# 2. Ensure Indexed (Important for first-time chat or if project auto-created)
branch = repo.get("default_branch", "main")
repo_path = await self._ensure_indexed(project, repo, branch)
if not repo_path:
logger.error("Failed to sync/index repository for chat")
return
# 3. Retrieve Context (RAG)
retriever = CodeRetriever(
collection_name=f"ci_{project.id}",
persist_directory=str(CI_VECTOR_DB_DIR / project.id)
)
# Use the user comment as query
query = body.replace("@ai-bot", "").strip()
# Handle empty query
if not query:
msg = "你好!我是 DeepAudit AI 助手。你可以问我关于此 PR 或项目代码的任何安全及逻辑问题。例如:\n- '这段代码有 SQL 注入风险吗?'\n- '这个 PR 修改了哪些核心组件?'\n\n请提供具体问题以便我通过代码上下文为你解答。"
await self._post_gitea_comment(repo, issue.get("number"), msg)
return
context_results = await retriever.retrieve(query, top_k=5)
repo_context = "\n".join([r.to_context_string() for r in context_results])
# 3. Build Prompt
# 4. Build Prompt
# Fetch conversation history (simplified: just current comment)
history = f"User: {query}"
prompt = build_chat_prompt(query, repo_context, history)
# 4. Generate Answer
# 5. Generate Answer
response = await self.llm_service.chat_completion_raw(
messages=[{"role": "user", "content": prompt}],
temperature=0.4
@ -227,9 +221,10 @@ class CIService:
answer = response["content"]
# 5. Reply
# 6. Reply
# Append context info footer
footer = "\n\n---\n*Context used: " + ", ".join([f"`{r.file_path}`" for r in context_results]) + "*"
footer_parts = [f"`{r.file_path}`" for r in context_results]
footer = "\n\n---\n*Context used: " + (", ".join(footer_parts) if footer_parts else "None (General knowledge used)") + "*"
await self._post_gitea_comment(repo, issue.get("number"), answer + footer)
# 6. Record (Optional, maybe just log)
@ -286,6 +281,40 @@ class CIService:
return project
async def _ensure_indexed(self, project: Project, repo: Dict, branch: str) -> Optional[str]:
    """
    Sync the repository locally and run an incremental index over it.

    Args:
        project: Project whose ``id`` selects the vector collection and
            persist directory, and whose ``name`` is used in log messages.
        repo: Repository payload dict; only ``clone_url`` is read here.
        branch: Branch name forwarded to ``_prepare_repository``.

    Returns:
        The local repository path when the repository could be prepared
        (even if indexing subsequently failed), or ``None`` when the clone
        URL is missing or the repository could not be prepared. Callers
        treat a falsy result as "cannot proceed".
    """
    repo_url = repo.get("clone_url")
    if not repo_url:
        # Without a clone URL there is nothing to sync; fail the same way a
        # failed clone does so callers take their existing early-return path.
        logger.error(f"Failed to prepare repository for project {project.id}: missing clone_url")
        return None

    # 1. Prepare Repository (Clone/Pull)
    # A failing git operation must not crash the event handler; callers only
    # check for a falsy return value, so convert exceptions into None.
    try:
        repo_path = await self._prepare_repository(project, repo_url, branch, settings.GITEA_BOT_TOKEN)
    except Exception as e:
        logger.error(f"Git operation failed: {e}")
        return None

    if not repo_path:
        logger.error(f"Failed to prepare repository for project {project.id}")
        return None

    try:
        # 2. Incremental Indexing
        indexer = CodeIndexer(
            collection_name=f"ci_{project.id}",
            persist_directory=str(CI_VECTOR_DB_DIR / project.id)
        )

        # smart_index_directory is an async generator: it has to be iterated
        # for the indexing work to actually run.
        async for progress in indexer.smart_index_directory(
            directory=repo_path,
            update_mode=IndexUpdateMode.INCREMENTAL
        ):
            # Log progress occasionally
            if progress.total_files > 0 and progress.processed_files % 20 == 0:
                logger.info(f"[{project.name}] Indexing: {progress.processed_files}/{progress.total_files}")

        logger.info(f"✅ Project {project.name} indexing complete.")
        return repo_path
    except Exception as e:
        # Best effort: indexing problems are logged but do not block the
        # caller, since the checkout itself is available.
        logger.error(f"Indexing error for project {project.id}: {e}")
        return repo_path  # Return path anyway, maybe some files are present
async def _get_project_by_repo(self, repo_url: str) -> Optional[Project]:
stmt = select(Project).where(Project.repository_url == repo_url)
result = await self.db.execute(stmt)
@ -297,7 +326,24 @@ class CIService:
"""
target_dir = CI_WORKSPACE_DIR / project.id
# Inject Token into URL for auth
# 1. Rewrite URL to use configured Host if necessary
# Gitea might send 'localhost:3000' in payload, but we need settings.GITEA_HOST_URL
if settings.GITEA_HOST_URL and "://" in repo_url:
from urllib.parse import urlparse, urlunparse
payload_url = urlparse(repo_url)
config_url = urlparse(settings.GITEA_HOST_URL)
# Use host (and port) from config, keep path from payload
repo_url = urlunparse((
config_url.scheme or payload_url.scheme,
config_url.netloc,
payload_url.path,
payload_url.params,
payload_url.query,
payload_url.fragment
))
logger.info(f"🔗 Rewrote Clone URL: {repo_url}")
# 2. Inject Token into URL for auth
# Format: http://token@host/repo.git
if "://" in repo_url:
protocol, rest = repo_url.split("://", 1)