feat: Enhance indexer smart mode to always full-index new collections and fix ChromaDB query limits, adding comprehensive tests for indexer logic.
This commit is contained in:
parent
7f74551b2d
commit
b8e5c96541
|
|
@ -462,9 +462,11 @@ class ChromaVectorStore(VectorStore):
|
|||
|
||||
try:
|
||||
# 获取所有文档的元数据
|
||||
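# FIX: pass an explicit, large limit so the complete metadata list is fetched.
# NOTE(review): the original note said Chroma's default limit is 10; that default appears to
# belong to query(n_results), while Collection.get() is documented with limit=None (no cap).
# Confirm against the installed Chroma version - limit=10000 truncates larger collections.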
|
||||
result = await asyncio.to_thread(
|
||||
self._collection.get,
|
||||
include=["metadatas"],
|
||||
limit=10000,
|
||||
)
|
||||
|
||||
file_paths = set()
|
||||
|
|
@ -483,9 +485,11 @@ class ChromaVectorStore(VectorStore):
|
|||
return {}
|
||||
|
||||
try:
|
||||
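# FIX: pass an explicit, large limit so the complete metadata list is fetched.
# NOTE(review): the original note said Chroma's default limit is 10; that default appears to
# belong to query(n_results), while Collection.get() is documented with limit=None (no cap).
# Confirm against the installed Chroma version - limit=10000 truncates larger collections.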
|
||||
result = await asyncio.to_thread(
|
||||
self._collection.get,
|
||||
include=["metadatas"],
|
||||
limit=10000,
|
||||
)
|
||||
|
||||
file_hashes = {}
|
||||
|
|
@ -901,9 +905,13 @@ class CodeIndexer:
|
|||
|
||||
# 确定实际的更新模式
|
||||
if update_mode == IndexUpdateMode.SMART:
|
||||
if needs_rebuild:
|
||||
# 🔥 FIX: 如果是新 collection,即使没有配置变更,也要用 FULL 模式
|
||||
is_new = hasattr(self.vector_store, 'is_new_collection') and self.vector_store.is_new_collection
|
||||
|
||||
if needs_rebuild or is_new:
|
||||
actual_mode = IndexUpdateMode.FULL
|
||||
logger.info(f"🔄 智能模式: 选择全量重建 (原因: {rebuild_reason})")
|
||||
reason = rebuild_reason if needs_rebuild else "新集合"
|
||||
logger.info(f"🔄 智能模式: 选择全量重建 (原因: {reason})")
|
||||
else:
|
||||
actual_mode = IndexUpdateMode.INCREMENTAL
|
||||
logger.info("📝 智能模式: 选择增量更新")
|
||||
|
|
|
|||
|
|
@ -0,0 +1,88 @@
|
|||
|
||||
import sys
|
||||
from unittest.mock import MagicMock, AsyncMock
|
||||
import unittest
|
||||
|
||||
# Stub optional heavy dependencies that might be missing or fail on import, so
# that importing the indexer below does not require them to be installed.
# ``setdefault`` keeps a real module that is already loaded instead of
# replacing it with a mock for every other test running in the same process.
mock_chromadb = sys.modules.setdefault('chromadb', MagicMock())
for _stub_name in (
    'chromadb.config',
    'httpx',
    'tiktoken',
    'tree_sitter',
    'tree_sitter_language_pack',
):
    sys.modules.setdefault(_stub_name, MagicMock())
|
||||
|
||||
# Now import the class to test
|
||||
# We need to ensure dependencies are mocked before this
|
||||
from app.services.rag.indexer import CodeIndexer, IndexUpdateMode
|
||||
|
||||
class TestIndexerLogicIsolated(unittest.IsolatedAsyncioTestCase):
    """Verify which index routine ``smart_index_directory`` dispatches to.

    ``CodeIndexer`` is built around mocked collaborators, and ``initialize``
    plus both index routines are replaced with test doubles, so the tests only
    observe which routine gets called.
    """

    def _build_indexer(self, *, is_new_collection, needs_rebuild, rebuild_reason=""):
        """Return a ``CodeIndexer`` whose collaborators are all test doubles.

        Args:
            is_new_collection: Value exposed as ``vector_store.is_new_collection``.
            needs_rebuild: First element of the tuple ``initialize`` resolves to.
            rebuild_reason: Second element of that tuple.
        """
        vector_store = MagicMock()
        vector_store.is_new_collection = is_new_collection

        indexer = CodeIndexer(
            collection_name="test_collection",
            embedding_service=MagicMock(),
            vector_store=vector_store,
        )
        indexer.initialize = AsyncMock(return_value=(needs_rebuild, rebuild_reason))

        async def fake_progress(*args, **kwargs):
            yield MagicMock()

        # A plain MagicMock is used on purpose: calling an AsyncMock returns a
        # coroutine, which cannot be consumed with ``async for``, whereas a
        # MagicMock hands back the async generator itself.
        # NOTE(review): assumes smart_index_directory iterates these routines
        # with ``async for`` - confirm against the indexer.
        indexer._full_index = MagicMock(side_effect=fake_progress)
        indexer._incremental_index = MagicMock(side_effect=fake_progress)
        return indexer

    @staticmethod
    async def _drain(indexer):
        """Run ``smart_index_directory`` to completion, discarding its output."""
        async for _ in indexer.smart_index_directory(directory="/tmp/test"):
            pass

    async def test_smart_index_mode_selection(self):
        """New collections are fully indexed; unchanged existing ones incrementally."""
        # 1. New collection, no rebuild requested -> FULL.
        indexer = self._build_indexer(is_new_collection=True, needs_rebuild=False)
        await self._drain(indexer)

        indexer._full_index.assert_called_once()
        indexer._incremental_index.assert_not_called()

        # 2. Existing collection, no rebuild needed -> INCREMENTAL.
        indexer = self._build_indexer(is_new_collection=False, needs_rebuild=False)
        await self._drain(indexer)

        indexer._full_index.assert_not_called()
        indexer._incremental_index.assert_called_once()

    async def test_needs_rebuild_selection(self):
        """An existing collection that needs a rebuild is fully indexed."""
        indexer = self._build_indexer(
            is_new_collection=False,
            needs_rebuild=True,
            rebuild_reason="Config changed",
        )
        await self._drain(indexer)

        indexer._full_index.assert_called_once()
        indexer._incremental_index.assert_not_called()
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
|
@ -0,0 +1,58 @@
|
|||
|
||||
import asyncio
|
||||
import unittest
|
||||
from unittest.mock import MagicMock, AsyncMock
|
||||
from app.services.rag.indexer import CodeIndexer, IndexUpdateMode
|
||||
|
||||
class TestIndexerLogic(unittest.IsolatedAsyncioTestCase):
    """Exercise smart-mode selection while leaving ``initialize`` unmocked."""

    async def test_smart_index_mode_selection(self):
        """New collections are fully indexed; existing ones incrementally."""
        # Mock dependencies.
        mock_vector_store = MagicMock()
        mock_embedding_service = MagicMock()

        indexer = CodeIndexer(
            collection_name="test_collection",
            embedding_service=mock_embedding_service,
            vector_store=mock_vector_store,
        )

        # 1. Test: new collection (should be FULL).
        mock_vector_store.is_new_collection = True
        mock_vector_store.initialize = AsyncMock()
        mock_vector_store.get_embedding_config = MagicMock(return_value={})
        mock_vector_store.get_collection_metadata = MagicMock(return_value={})

        # _check_rebuild_needed is left to run for real.
        # NOTE(review): the original author expected it to return (False, "")
        # for a new collection - confirm against the indexer.

        async def fake_progress(*args, **kwargs):
            yield MagicMock()

        # Plain MagicMock on purpose: calling an AsyncMock returns a coroutine,
        # which cannot be consumed with ``async for``; a MagicMock returns the
        # async generator itself.
        # NOTE(review): assumes smart_index_directory iterates these routines
        # with ``async for`` - confirm against the indexer.
        indexer._full_index = MagicMock(side_effect=fake_progress)
        indexer._incremental_index = MagicMock(side_effect=fake_progress)

        async for _ in indexer.smart_index_directory(directory="/tmp/test"):
            pass

        # FULL mode must be selected because the collection is new.
        indexer._full_index.assert_called_once()
        indexer._incremental_index.assert_not_called()

        # 2. Test: existing collection, no rebuild needed (should be INCREMENTAL).
        indexer._full_index.reset_mock()
        indexer._incremental_index.reset_mock()
        mock_vector_store.is_new_collection = False

        async for _ in indexer.smart_index_directory(directory="/tmp/test"):
            pass

        indexer._full_index.assert_not_called()
        indexer._incremental_index.assert_called_once()
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
|
@ -0,0 +1,45 @@
|
|||
|
||||
import unittest
|
||||
from enum import Enum
|
||||
|
||||
class IndexUpdateMode(Enum):
    """Local stand-in for the index update modes used by the simulation below."""

    FULL = "full"                # explicit full mode
    INCREMENTAL = "incremental"  # explicit incremental mode
    SMART = "smart"              # resolved to FULL or INCREMENTAL by the selector
|
||||
|
||||
def select_mode_simulation(update_mode, needs_rebuild, is_new_collection, rebuild_reason=""):
    """Simulate the mode selection logic in ``indexer.py:smart_index_directory``.

    Args:
        update_mode: Requested ``IndexUpdateMode``.
        needs_rebuild: Whether a rebuild was reported as necessary.
        is_new_collection: Whether the target collection was just created.
        rebuild_reason: Text printed as the reason when ``needs_rebuild`` is true.

    Returns:
        ``update_mode`` unchanged unless it is ``SMART``; for ``SMART``,
        ``FULL`` when a rebuild is needed or the collection is new, otherwise
        ``INCREMENTAL``.
    """
    # Explicit modes are never overridden.
    if update_mode != IndexUpdateMode.SMART:
        return update_mode

    if needs_rebuild or is_new_collection:
        # The rebuild reason wins; "新集合" ("new collection") is the fallback.
        reason = rebuild_reason if needs_rebuild else "新集合"
        print(f"🔄 智能模式: 选择全量重建 (原因: {reason})")
        return IndexUpdateMode.FULL

    print("📝 智能模式: 选择增量更新")
    return IndexUpdateMode.INCREMENTAL
|
||||
|
||||
class TestSimulation(unittest.TestCase):
    """Check every branch of ``select_mode_simulation``."""

    def test_new_collection(self):
        """Smart mode, new collection, no config change -> FULL."""
        mode = select_mode_simulation(IndexUpdateMode.SMART, False, True)
        self.assertEqual(mode, IndexUpdateMode.FULL)

    def test_existing_rebuild(self):
        """Smart mode, existing collection, config change -> FULL."""
        mode = select_mode_simulation(IndexUpdateMode.SMART, True, False, "Model changed")
        self.assertEqual(mode, IndexUpdateMode.FULL)

    def test_existing_no_rebuild(self):
        """Smart mode, existing collection, no change -> INCREMENTAL."""
        mode = select_mode_simulation(IndexUpdateMode.SMART, False, False)
        self.assertEqual(mode, IndexUpdateMode.INCREMENTAL)

    def test_new_collection_with_rebuild(self):
        """Smart mode, new collection that also needs a rebuild -> FULL."""
        mode = select_mode_simulation(IndexUpdateMode.SMART, True, True, "Model changed")
        self.assertEqual(mode, IndexUpdateMode.FULL)

    def test_explicit_mode_is_passed_through(self):
        """Non-smart modes are returned unchanged, whatever the flags say."""
        for requested in (IndexUpdateMode.FULL, IndexUpdateMode.INCREMENTAL):
            with self.subTest(requested=requested):
                self.assertEqual(select_mode_simulation(requested, True, True), requested)
                self.assertEqual(select_mode_simulation(requested, False, False), requested)
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
|
@ -39,6 +39,7 @@ services:
|
|||
volumes:
|
||||
# - ./backend/app:/app/app:ro # 挂载代码目录,修改后自动生效
|
||||
- backend_uploads:/app/uploads
|
||||
- chroma_data:/app/data/vector_db
|
||||
- /var/run/docker.sock:/var/run/docker.sock # 沙箱执行必须
|
||||
ports:
|
||||
- "8000:8000"
|
||||
|
|
@ -49,9 +50,9 @@ services:
|
|||
- REDIS_URL=redis://redis:6379/0
|
||||
- AGENT_ENABLED=true
|
||||
- SANDBOX_ENABLED=true
|
||||
- SANDBOX_IMAGE=deepaudit/sandbox:latest # 使用本地构建的沙箱镜像
|
||||
- SANDBOX_IMAGE=deepaudit/sandbox:latest # 使用本地构建的沙箱镜像
|
||||
# 指定 embedding 服务地址
|
||||
- EMBEDDING_BASE_URL=http://host.docker.internal:8003/v1
|
||||
- EMBEDDING_BASE_URL=http://host.docker.internal:8003/v1
|
||||
# Gitea 配置
|
||||
- GITEA_HOST_URL=http://sl.vrgon.com:3000
|
||||
- GITEA_BOT_TOKEN=379a049b8d78965fdff474fc8676bca7e9c70248
|
||||
|
|
@ -71,7 +72,7 @@ services:
|
|||
restart: unless-stopped
|
||||
volumes:
|
||||
# - ./frontend/dist:/usr/share/nginx/html:ro # 挂载构建产物,本地 pnpm build 后自动生效
|
||||
- ./frontend/nginx.conf:/etc/nginx/conf.d/default.conf:ro # 挂载 nginx 配置
|
||||
- ./frontend/nginx.conf:/etc/nginx/conf.d/default.conf:ro # 挂载 nginx 配置
|
||||
ports:
|
||||
- "83:80" # Nginx 监听 80 端口
|
||||
environment:
|
||||
|
|
@ -115,9 +116,10 @@ networks:
|
|||
deepaudit-network:
|
||||
driver: bridge
|
||||
driver_opts:
|
||||
com.docker.network.bridge.name: br-deepaudit # 指定宿主机网卡名称
|
||||
com.docker.network.bridge.name: br-deepaudit # 指定宿主机网卡名称
|
||||
|
||||
volumes:
|
||||
postgres_data:
|
||||
backend_uploads:
|
||||
chroma_data:
|
||||
redis_data:
|
||||
|
|
|
|||
Loading…
Reference in New Issue