feat: Add Gitea as a default repository type, refine ChromaDB metadata updates, and enhance indexer logging and file deletion logic.
This commit is contained in:
parent
0656062a4f
commit
fcb40db556
|
|
@ -18,6 +18,7 @@ from pathlib import Path
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
import json
|
import json
|
||||||
|
import fnmatch
|
||||||
|
|
||||||
from .splitter import CodeSplitter, CodeChunk
|
from .splitter import CodeSplitter, CodeChunk
|
||||||
from .embeddings import EmbeddingService
|
from .embeddings import EmbeddingService
|
||||||
|
|
@ -556,10 +557,14 @@ class ChromaVectorStore(VectorStore):
|
||||||
current_metadata["updated_at"] = time.time()
|
current_metadata["updated_at"] = time.time()
|
||||||
|
|
||||||
# Chroma 不支持直接更新元数据,需要通过修改 collection
|
# Chroma 不支持直接更新元数据,需要通过修改 collection
|
||||||
|
# 🔥 FIX: 不能在 modify 中传递 "hnsw:space",因为 Chroma 不支持在创建后修改距离函数
|
||||||
|
# 即使值相同也可能导致某些版本报错
|
||||||
|
modified_metadata = {k: v for k, v in current_metadata.items() if k != "hnsw:space"}
|
||||||
|
|
||||||
# 这里我们使用 modify 方法
|
# 这里我们使用 modify 方法
|
||||||
await asyncio.to_thread(
|
await asyncio.to_thread(
|
||||||
self._collection.modify,
|
self._collection.modify,
|
||||||
metadata=current_metadata,
|
metadata=modified_metadata,
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"更新 collection 元数据失败: {e}")
|
logger.warning(f"更新 collection 元数据失败: {e}")
|
||||||
|
|
@ -986,6 +991,12 @@ class CodeIndexer:
|
||||||
progress.total_files = len(files)
|
progress.total_files = len(files)
|
||||||
|
|
||||||
logger.info(f"📁 发现 {len(files)} 个文件待索引")
|
logger.info(f"📁 发现 {len(files)} 个文件待索引")
|
||||||
|
|
||||||
|
# 🔥 详细打印所有待索引的文件名,方便调试 (满足用户需求)
|
||||||
|
if files:
|
||||||
|
relative_files = [os.path.relpath(f, directory) for f in files]
|
||||||
|
logger.info(f"📄 待索引文件列表: {', '.join(sorted(relative_files))}")
|
||||||
|
|
||||||
yield progress
|
yield progress
|
||||||
|
|
||||||
semaphore = asyncio.Semaphore(10) # 降低并行度以平衡 CPU 和内存
|
semaphore = asyncio.Semaphore(10) # 降低并行度以平衡 CPU 和内存
|
||||||
|
|
@ -1087,7 +1098,7 @@ class CodeIndexer:
|
||||||
files_to_delete = indexed_files - current_file_set
|
files_to_delete = indexed_files - current_file_set
|
||||||
files_to_check = current_file_set & indexed_files
|
files_to_check = current_file_set & indexed_files
|
||||||
|
|
||||||
logger.debug(f"📊 差异分析: 交集={len(files_to_check)}, 新增候选={len(files_to_add)}, 删除候选={len(files_to_delete)}")
|
logger.debug(f"📊 差异分析: 交集={len(files_to_check)}, 新增候选={len(files_to_add)}, 删除候选={len(files_to_delete)}")
|
||||||
|
|
||||||
# 检查需要更新的文件(hash 变化)
|
# 检查需要更新的文件(hash 变化)
|
||||||
files_to_update: Set[str] = set()
|
files_to_update: Set[str] = set()
|
||||||
|
|
@ -1104,10 +1115,37 @@ class CodeIndexer:
|
||||||
except Exception:
|
except Exception:
|
||||||
files_to_update.add(relative_path)
|
files_to_update.add(relative_path)
|
||||||
|
|
||||||
|
# 只有当指定了 include_patterns 时,才需要特殊处理 files_to_delete
|
||||||
|
# 如果当前扫描是限定范围的(例如只审计某几个文件),不应该删除范围外的已有索引
|
||||||
|
if include_patterns:
|
||||||
|
actual_files_to_delete = set()
|
||||||
|
for rel_path in files_to_delete:
|
||||||
|
filename = os.path.basename(rel_path)
|
||||||
|
is_in_scope = False
|
||||||
|
for pattern in include_patterns:
|
||||||
|
if fnmatch.fnmatch(rel_path, pattern) or fnmatch.fnmatch(filename, pattern):
|
||||||
|
is_in_scope = True
|
||||||
|
break
|
||||||
|
if is_in_scope:
|
||||||
|
actual_files_to_delete.add(rel_path)
|
||||||
|
|
||||||
|
if len(actual_files_to_delete) < len(files_to_delete):
|
||||||
|
logger.debug(f"Scope limited: reducing files_to_delete from {len(files_to_delete)} to {len(actual_files_to_delete)}")
|
||||||
|
files_to_delete = actual_files_to_delete
|
||||||
|
|
||||||
total_operations = len(files_to_add) + len(files_to_delete) + len(files_to_update)
|
total_operations = len(files_to_add) + len(files_to_delete) + len(files_to_update)
|
||||||
progress.total_files = total_operations
|
progress.total_files = total_operations
|
||||||
|
|
||||||
logger.info(f"📊 增量更新: 新增 {len(files_to_add)}, 删除 {len(files_to_delete)}, 更新 {len(files_to_update)}")
|
logger.info(f"📊 增量更新摘要: 新增 {len(files_to_add)}, 删除 {len(files_to_delete)}, 更新 {len(files_to_update)}")
|
||||||
|
|
||||||
|
# 🔥 详细打印新增、更新和删除的文件名,方便调试 (满足用户需求)
|
||||||
|
if files_to_add:
|
||||||
|
logger.info(f"🆕 新增文件 ({len(files_to_add)}): {', '.join(sorted(list(files_to_add)))}")
|
||||||
|
if files_to_update:
|
||||||
|
logger.info(f"🔄 更新文件 ({len(files_to_update)}): {', '.join(sorted(list(files_to_update)))}")
|
||||||
|
if files_to_delete:
|
||||||
|
logger.info(f"🗑️ 删除文件 ({len(files_to_delete)}): {', '.join(sorted(list(files_to_delete)))}")
|
||||||
|
|
||||||
yield progress
|
yield progress
|
||||||
|
|
||||||
# 删除已移除的文件
|
# 删除已移除的文件
|
||||||
|
|
|
||||||
|
|
@ -68,7 +68,7 @@ export default function Projects() {
|
||||||
description: "",
|
description: "",
|
||||||
source_type: "repository",
|
source_type: "repository",
|
||||||
repository_url: "",
|
repository_url: "",
|
||||||
repository_type: "github",
|
repository_type: "gitea",
|
||||||
default_branch: "main",
|
default_branch: "main",
|
||||||
programming_languages: []
|
programming_languages: []
|
||||||
});
|
});
|
||||||
|
|
@ -77,7 +77,7 @@ export default function Projects() {
|
||||||
description: "",
|
description: "",
|
||||||
source_type: "repository",
|
source_type: "repository",
|
||||||
repository_url: "",
|
repository_url: "",
|
||||||
repository_type: "github",
|
repository_type: "gitea",
|
||||||
default_branch: "main",
|
default_branch: "main",
|
||||||
programming_languages: []
|
programming_languages: []
|
||||||
});
|
});
|
||||||
|
|
@ -172,7 +172,7 @@ export default function Projects() {
|
||||||
description: "",
|
description: "",
|
||||||
source_type: "repository",
|
source_type: "repository",
|
||||||
repository_url: "",
|
repository_url: "",
|
||||||
repository_type: "github",
|
repository_type: "gitea",
|
||||||
default_branch: "main",
|
default_branch: "main",
|
||||||
programming_languages: []
|
programming_languages: []
|
||||||
});
|
});
|
||||||
|
|
|
||||||
|
|
@ -24,9 +24,9 @@ export const PROJECT_SOURCE_TYPES: Array<{
|
||||||
|
|
||||||
// 仓库平台显示名称
|
// 仓库平台显示名称
|
||||||
export const REPOSITORY_PLATFORM_LABELS: Record<RepositoryPlatform, string> = {
|
export const REPOSITORY_PLATFORM_LABELS: Record<RepositoryPlatform, string> = {
|
||||||
|
gitea: 'Gitea',
|
||||||
github: 'GitHub',
|
github: 'GitHub',
|
||||||
gitlab: 'GitLab',
|
gitlab: 'GitLab',
|
||||||
gitea: 'Gitea',
|
|
||||||
other: '其他',
|
other: '其他',
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue