feat: Enhance indexer smart mode to always full-index new collections and fix ChromaDB query limits, adding comprehensive tests for indexer logic.

This commit is contained in:
vinland100 2026-01-06 13:48:14 +08:00
parent 7f74551b2d
commit b8e5c96541
5 changed files with 207 additions and 6 deletions

View File

@ -462,9 +462,11 @@ class ChromaVectorStore(VectorStore):
try:
# 获取所有文档的元数据
# 🔥 FIX: Chroma 默认 limit 是 10，这里设置为一个较大的值以获取完整列表
result = await asyncio.to_thread(
self._collection.get,
include=["metadatas"],
limit=10000,
)
file_paths = set()
@ -483,9 +485,11 @@ class ChromaVectorStore(VectorStore):
return {}
try:
# 🔥 FIX: Chroma 默认 limit 是 10，这里设置为一个较大的值以获取完整列表
result = await asyncio.to_thread(
self._collection.get,
include=["metadatas"],
limit=10000,
)
file_hashes = {}
@ -901,9 +905,13 @@ class CodeIndexer:
# 确定实际的更新模式
if update_mode == IndexUpdateMode.SMART:
if needs_rebuild:
# 🔥 FIX: 如果是新 collection，即使没有配置变更，也要用 FULL 模式
is_new = hasattr(self.vector_store, 'is_new_collection') and self.vector_store.is_new_collection
if needs_rebuild or is_new:
actual_mode = IndexUpdateMode.FULL
logger.info(f"🔄 智能模式: 选择全量重建 (原因: {rebuild_reason})")
reason = rebuild_reason if needs_rebuild else "新集合"
logger.info(f"🔄 智能模式: 选择全量重建 (原因: {reason})")
else:
actual_mode = IndexUpdateMode.INCREMENTAL
logger.info("📝 智能模式: 选择增量更新")

View File

@ -0,0 +1,88 @@
import sys
from unittest.mock import MagicMock, AsyncMock
import unittest
# Replace third-party modules that might be missing, or fail on import, with
# MagicMock stand-ins. Registering them in sys.modules makes any later
# `import chromadb` (etc.) resolve to the stub instead of the real package.
# NOTE(review): these entries stay in sys.modules for the rest of the process,
# so other test modules imported afterwards will see the stubs too.
mock_chromadb = MagicMock()
sys.modules['chromadb'] = mock_chromadb
sys.modules['chromadb.config'] = MagicMock()
sys.modules['httpx'] = MagicMock()
sys.modules['tiktoken'] = MagicMock()
sys.modules['tree_sitter'] = MagicMock()
sys.modules['tree_sitter_language_pack'] = MagicMock()
# Only now import the class under test: the stubs above must already be
# registered in sys.modules before this import statement runs.
from app.services.rag.indexer import CodeIndexer, IndexUpdateMode
class TestIndexerLogicIsolated(unittest.IsolatedAsyncioTestCase):
    """Check which indexing path ``smart_index_directory`` selects.

    ``CodeIndexer.initialize`` and both indexing paths are replaced with
    mocks, so only the mode-selection branch is exercised. ``initialize`` is
    stubbed to return a ``(needs_rebuild, rebuild_reason)`` tuple.
    """

    @staticmethod
    async def _fake_progress(*args, **kwargs):
        """Async-generator stand-in for an indexing path; yields one dummy item."""
        yield MagicMock()

    def _make_indexer(self, vector_store, *, needs_rebuild, rebuild_reason=""):
        """Build a ``CodeIndexer`` whose collaborators are all mocked.

        Args:
            vector_store: Mock vector store handed to the indexer.
            needs_rebuild: First element of the stubbed ``initialize`` result.
            rebuild_reason: Second element of the stubbed ``initialize`` result.

        Returns:
            The indexer with ``initialize``, ``_full_index`` and
            ``_incremental_index`` replaced by mocks.
        """
        indexer = CodeIndexer(
            collection_name="test_collection",
            embedding_service=MagicMock(),
            vector_store=vector_store,
        )
        indexer.initialize = AsyncMock(return_value=(needs_rebuild, rebuild_reason))
        # The indexing paths are async generators, so they must be plain
        # MagicMocks: calling an AsyncMock returns a coroutine, which cannot be
        # consumed by `async for`. A MagicMock returns whatever side_effect
        # returns, i.e. the async generator object itself.
        # NOTE(review): assumes smart_index_directory iterates these with
        # `async for` -- confirm against indexer.py.
        indexer._full_index = MagicMock(side_effect=self._fake_progress)
        indexer._incremental_index = MagicMock(side_effect=self._fake_progress)
        return indexer

    @staticmethod
    async def _drain(indexer):
        """Run ``smart_index_directory`` to completion, discarding its output."""
        async for _ in indexer.smart_index_directory(directory="/tmp/test"):
            pass

    async def test_smart_index_mode_selection(self):
        """New collection -> FULL; existing collection, no rebuild -> INCREMENTAL."""
        vector_store = MagicMock()
        indexer = self._make_indexer(vector_store, needs_rebuild=False)

        # 1. New collection: FULL must be chosen even though no rebuild is needed.
        vector_store.is_new_collection = True
        await self._drain(indexer)
        indexer._full_index.assert_called_once()
        indexer._incremental_index.assert_not_called()

        # 2. Existing collection, no rebuild needed: INCREMENTAL must be chosen.
        # reset_mock() clears the call history but keeps side_effect in place.
        indexer._full_index.reset_mock()
        indexer._incremental_index.reset_mock()
        vector_store.is_new_collection = False
        await self._drain(indexer)
        indexer._full_index.assert_not_called()
        indexer._incremental_index.assert_called_once()

    async def test_needs_rebuild_selection(self):
        """Existing collection with ``needs_rebuild=True`` -> FULL."""
        vector_store = MagicMock()
        indexer = self._make_indexer(
            vector_store,
            needs_rebuild=True,
            rebuild_reason="Config changed",
        )
        vector_store.is_new_collection = False

        await self._drain(indexer)

        indexer._full_index.assert_called_once()
        # The incremental path must be skipped when a rebuild is required.
        indexer._incremental_index.assert_not_called()
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()

View File

@ -0,0 +1,58 @@
import asyncio
import unittest
from unittest.mock import MagicMock, AsyncMock
from app.services.rag.indexer import CodeIndexer, IndexUpdateMode
class TestIndexerLogic(unittest.IsolatedAsyncioTestCase):
    """Mode-selection test for ``CodeIndexer.smart_index_directory``.

    Unlike a fully isolated variant, ``CodeIndexer.initialize`` is left
    un-mocked here; only the vector store and the two indexing paths are
    replaced.
    """

    async def test_smart_index_mode_selection(self):
        """New collection -> FULL; existing collection -> INCREMENTAL."""
        # Mock dependencies.
        mock_vector_store = MagicMock()
        mock_embedding_service = MagicMock()
        indexer = CodeIndexer(
            collection_name="test_collection",
            embedding_service=mock_embedding_service,
            vector_store=mock_vector_store,
        )

        # 1. New collection (should be FULL).
        mock_vector_store.is_new_collection = True
        mock_vector_store.initialize = AsyncMock()
        mock_vector_store.get_embedding_config = MagicMock(return_value={})
        mock_vector_store.get_collection_metadata = MagicMock(return_value={})
        # NOTE(review): the rebuild check is allowed to run for real; it is
        # presumed to report "no rebuild needed" with the stubs above --
        # confirm against indexer.py.

        async def fake_progress(*args, **kwargs):
            """Async-generator stand-in for an indexing path."""
            yield MagicMock()

        # The indexing paths are async generators, so they must be plain
        # MagicMocks: calling an AsyncMock returns a coroutine, which cannot be
        # consumed by `async for`. A MagicMock returns the async generator
        # object produced by side_effect directly.
        indexer._full_index = MagicMock(side_effect=fake_progress)
        indexer._incremental_index = MagicMock(side_effect=fake_progress)

        # Run smart_index_directory to completion.
        async for _ in indexer.smart_index_directory(directory="/tmp/test"):
            pass

        # FULL mode must be selected because the collection is new.
        indexer._full_index.assert_called_once()
        indexer._incremental_index.assert_not_called()

        # 2. Existing collection, no rebuild needed (should be INCREMENTAL).
        # reset_mock() clears the call history but keeps side_effect in place.
        indexer._full_index.reset_mock()
        indexer._incremental_index.reset_mock()
        mock_vector_store.is_new_collection = False

        async for _ in indexer.smart_index_directory(directory="/tmp/test"):
            pass

        indexer._full_index.assert_not_called()
        indexer._incremental_index.assert_called_once()
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()

View File

@ -0,0 +1,45 @@
import unittest
from enum import Enum
class IndexUpdateMode(Enum):
    """Index update modes used by the mode-selection simulation."""

    # Rebuild the index from scratch.
    FULL = "full"
    # Update the existing index.
    INCREMENTAL = "incremental"
    # Resolve to FULL or INCREMENTAL at run time.
    SMART = "smart"
def select_mode_simulation(update_mode, needs_rebuild, is_new_collection, rebuild_reason=""):
    """Resolve the effective update mode the way the smart-index branch does.

    Args:
        update_mode: Requested mode. Anything other than
            ``IndexUpdateMode.SMART`` is returned unchanged.
        needs_rebuild: Truthy when a rebuild is required.
        is_new_collection: Truthy when the collection is new.
        rebuild_reason: Text printed as the reason when ``needs_rebuild`` is
            truthy.

    Returns:
        ``IndexUpdateMode.FULL`` or ``IndexUpdateMode.INCREMENTAL`` for SMART
        mode, otherwise ``update_mode`` itself.
    """
    # Explicitly requested modes pass straight through.
    if update_mode != IndexUpdateMode.SMART:
        return update_mode

    # Nothing forces a rebuild: take the incremental path.
    if not (needs_rebuild or is_new_collection):
        print("📝 智能模式: 选择增量更新")
        return IndexUpdateMode.INCREMENTAL

    # A required rebuild takes precedence when reporting the reason.
    cause = "新集合" if not needs_rebuild else rebuild_reason
    print(f"🔄 智能模式: 选择全量重建 (原因: {cause})")
    return IndexUpdateMode.FULL
class TestSimulation(unittest.TestCase):
    """Covers the three SMART-mode outcomes of ``select_mode_simulation``."""

    def test_new_collection(self):
        """SMART + new collection + no config change resolves to FULL."""
        chosen = select_mode_simulation(
            IndexUpdateMode.SMART,
            needs_rebuild=False,
            is_new_collection=True,
        )
        self.assertEqual(chosen, IndexUpdateMode.FULL)

    def test_existing_rebuild(self):
        """SMART + existing collection + config change resolves to FULL."""
        chosen = select_mode_simulation(
            IndexUpdateMode.SMART,
            needs_rebuild=True,
            is_new_collection=False,
            rebuild_reason="Model changed",
        )
        self.assertEqual(chosen, IndexUpdateMode.FULL)

    def test_existing_no_rebuild(self):
        """SMART + existing collection + no change resolves to INCREMENTAL."""
        chosen = select_mode_simulation(
            IndexUpdateMode.SMART,
            needs_rebuild=False,
            is_new_collection=False,
        )
        self.assertEqual(chosen, IndexUpdateMode.INCREMENTAL)
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()

View File

@ -39,6 +39,7 @@ services:
volumes:
# - ./backend/app:/app/app:ro # 挂载代码目录,修改后自动生效
- backend_uploads:/app/uploads
- chroma_data:/app/data/vector_db
- /var/run/docker.sock:/var/run/docker.sock # 沙箱执行必须
ports:
- "8000:8000"
@ -49,7 +50,7 @@ services:
- REDIS_URL=redis://redis:6379/0
- AGENT_ENABLED=true
- SANDBOX_ENABLED=true
- SANDBOX_IMAGE=deepaudit/sandbox:latest # 使用本地构建的沙箱镜像
- SANDBOX_IMAGE=deepaudit/sandbox:latest # 使用本地构建的沙箱镜像
# 指定 embedding 服务地址
- EMBEDDING_BASE_URL=http://host.docker.internal:8003/v1
# Gitea 配置
@ -71,7 +72,7 @@ services:
restart: unless-stopped
volumes:
# - ./frontend/dist:/usr/share/nginx/html:ro # 挂载构建产物,本地 pnpm build 后自动生效
- ./frontend/nginx.conf:/etc/nginx/conf.d/default.conf:ro # 挂载 nginx 配置
- ./frontend/nginx.conf:/etc/nginx/conf.d/default.conf:ro # 挂载 nginx 配置
ports:
- "83:80" # Nginx 监听 80 端口
environment:
@ -115,9 +116,10 @@ networks:
deepaudit-network:
driver: bridge
driver_opts:
com.docker.network.bridge.name: br-deepaudit # 指定宿主机网卡名称
com.docker.network.bridge.name: br-deepaudit # 指定宿主机网卡名称
volumes:
postgres_data:
backend_uploads:
chroma_data:
redis_data: