feat: Add Gitea as a default repository type, refine ChromaDB metadata updates, and enhance indexer logging and file deletion logic.

This commit is contained in:
vinland100 2026-01-06 16:21:21 +08:00
parent 0656062a4f
commit fcb40db556
3 changed files with 45 additions and 7 deletions

View File

@ -18,6 +18,7 @@ from pathlib import Path
from dataclasses import dataclass, field from dataclasses import dataclass, field
from enum import Enum from enum import Enum
import json import json
import fnmatch
from .splitter import CodeSplitter, CodeChunk from .splitter import CodeSplitter, CodeChunk
from .embeddings import EmbeddingService from .embeddings import EmbeddingService
@ -556,10 +557,14 @@ class ChromaVectorStore(VectorStore):
current_metadata["updated_at"] = time.time() current_metadata["updated_at"] = time.time()
# Chroma 不支持直接更新元数据,需要通过修改 collection # Chroma 不支持直接更新元数据,需要通过修改 collection
# 🔥 FIX: 不能在 modify 中传递 "hnsw:space",因为 Chroma 不支持在创建后修改距离函数
# 即使值相同也可能导致某些版本报错
modified_metadata = {k: v for k, v in current_metadata.items() if k != "hnsw:space"}
# 这里我们使用 modify 方法 # 这里我们使用 modify 方法
await asyncio.to_thread( await asyncio.to_thread(
self._collection.modify, self._collection.modify,
metadata=current_metadata, metadata=modified_metadata,
) )
except Exception as e: except Exception as e:
logger.warning(f"更新 collection 元数据失败: {e}") logger.warning(f"更新 collection 元数据失败: {e}")
@ -986,6 +991,12 @@ class CodeIndexer:
progress.total_files = len(files) progress.total_files = len(files)
logger.info(f"📁 发现 {len(files)} 个文件待索引") logger.info(f"📁 发现 {len(files)} 个文件待索引")
# 🔥 详细打印所有待索引的文件名,方便调试 (满足用户需求)
if files:
relative_files = [os.path.relpath(f, directory) for f in files]
logger.info(f"📄 待索引文件列表: {', '.join(sorted(relative_files))}")
yield progress yield progress
semaphore = asyncio.Semaphore(10) # 降低并行度以平衡 CPU 和内存 semaphore = asyncio.Semaphore(10) # 降低并行度以平衡 CPU 和内存
@ -1087,7 +1098,7 @@ class CodeIndexer:
files_to_delete = indexed_files - current_file_set files_to_delete = indexed_files - current_file_set
files_to_check = current_file_set & indexed_files files_to_check = current_file_set & indexed_files
logger.debug(f"📊 差异分析: 交集={len(files_to_check)}, 新增候选={len(files_to_add)}, 删除候选={len(files_to_delete)}") logger.debug(f"📊 差异分析: 交集={len(files_to_check)}, 新增候选={len(files_to_add)}, 删除候选={len(files_to_delete)}")
# 检查需要更新的文件(hash 变化) # 检查需要更新的文件(hash 变化)
files_to_update: Set[str] = set() files_to_update: Set[str] = set()
@ -1104,10 +1115,37 @@ class CodeIndexer:
except Exception: except Exception:
files_to_update.add(relative_path) files_to_update.add(relative_path)
# 只有当指定了 include_patterns 时,才需要特殊处理 files_to_delete
# 如果当前扫描是限定范围的(例如只审计某几个文件),不应该删除范围外的已有索引
if include_patterns:
actual_files_to_delete = set()
for rel_path in files_to_delete:
filename = os.path.basename(rel_path)
is_in_scope = False
for pattern in include_patterns:
if fnmatch.fnmatch(rel_path, pattern) or fnmatch.fnmatch(filename, pattern):
is_in_scope = True
break
if is_in_scope:
actual_files_to_delete.add(rel_path)
if len(actual_files_to_delete) < len(files_to_delete):
logger.debug(f"Scope limited: reducing files_to_delete from {len(files_to_delete)} to {len(actual_files_to_delete)}")
files_to_delete = actual_files_to_delete
total_operations = len(files_to_add) + len(files_to_delete) + len(files_to_update) total_operations = len(files_to_add) + len(files_to_delete) + len(files_to_update)
progress.total_files = total_operations progress.total_files = total_operations
logger.info(f"📊 增量更新: 新增 {len(files_to_add)}, 删除 {len(files_to_delete)}, 更新 {len(files_to_update)}") logger.info(f"📊 增量更新摘要: 新增 {len(files_to_add)}, 删除 {len(files_to_delete)}, 更新 {len(files_to_update)}")
# 🔥 详细打印新增、更新和删除的文件名,方便调试 (满足用户需求)
if files_to_add:
logger.info(f"🆕 新增文件 ({len(files_to_add)}): {', '.join(sorted(list(files_to_add)))}")
if files_to_update:
logger.info(f"🔄 更新文件 ({len(files_to_update)}): {', '.join(sorted(list(files_to_update)))}")
if files_to_delete:
logger.info(f"🗑️ 删除文件 ({len(files_to_delete)}): {', '.join(sorted(list(files_to_delete)))}")
yield progress yield progress
# 删除已移除的文件 # 删除已移除的文件

View File

@ -68,7 +68,7 @@ export default function Projects() {
description: "", description: "",
source_type: "repository", source_type: "repository",
repository_url: "", repository_url: "",
repository_type: "github", repository_type: "gitea",
default_branch: "main", default_branch: "main",
programming_languages: [] programming_languages: []
}); });
@ -77,7 +77,7 @@ export default function Projects() {
description: "", description: "",
source_type: "repository", source_type: "repository",
repository_url: "", repository_url: "",
repository_type: "github", repository_type: "gitea",
default_branch: "main", default_branch: "main",
programming_languages: [] programming_languages: []
}); });
@ -172,7 +172,7 @@ export default function Projects() {
description: "", description: "",
source_type: "repository", source_type: "repository",
repository_url: "", repository_url: "",
repository_type: "github", repository_type: "gitea",
default_branch: "main", default_branch: "main",
programming_languages: [] programming_languages: []
}); });

View File

@ -24,9 +24,9 @@ export const PROJECT_SOURCE_TYPES: Array<{
// 仓库平台显示名称 // 仓库平台显示名称
export const REPOSITORY_PLATFORM_LABELS: Record<RepositoryPlatform, string> = { export const REPOSITORY_PLATFORM_LABELS: Record<RepositoryPlatform, string> = {
gitea: 'Gitea',
github: 'GitHub', github: 'GitHub',
gitlab: 'GitLab', gitlab: 'GitLab',
gitea: 'Gitea',
other: '其他', other: '其他',
}; };