From b8e5c96541906e0c57bf47ca64bda4f5fb586150 Mon Sep 17 00:00:00 2001
From: vinland100
Date: Tue, 6 Jan 2026 13:48:14 +0800
Subject: [PATCH] feat: Enhance indexer smart mode to always full-index new
 collections and fix ChromaDB query limits, adding comprehensive tests for
 indexer logic.

---
 backend/app/services/rag/indexer.py    | 12 +++-
 backend/tests/test_indexer_isolated.py | 88 ++++++++++++++++++++++++++
 backend/tests/test_indexer_logic.py    | 58 +++++++++++++++++
 backend/tests/verify_logic_sim.py      | 45 +++++++++++++
 docker-compose.yml                     | 10 +--
 5 files changed, 207 insertions(+), 6 deletions(-)
 create mode 100644 backend/tests/test_indexer_isolated.py
 create mode 100644 backend/tests/test_indexer_logic.py
 create mode 100644 backend/tests/verify_logic_sim.py

diff --git a/backend/app/services/rag/indexer.py b/backend/app/services/rag/indexer.py
index 3ecfadb..9927365 100644
--- a/backend/app/services/rag/indexer.py
+++ b/backend/app/services/rag/indexer.py
@@ -462,9 +462,11 @@ class ChromaVectorStore(VectorStore):
 
         try:
             # Get the metadata of all documents
+            # 🔥 FIX: Chroma's default limit is 10; set a large value here to fetch the complete list
             result = await asyncio.to_thread(
                 self._collection.get,
                 include=["metadatas"],
+                limit=10000,
             )
 
             file_paths = set()
@@ -483,9 +485,11 @@ class ChromaVectorStore(VectorStore):
             return {}
 
         try:
+            # 🔥 FIX: Chroma's default limit is 10; set a large value here to fetch the complete list
             result = await asyncio.to_thread(
                 self._collection.get,
                 include=["metadatas"],
+                limit=10000,
             )
 
             file_hashes = {}
@@ -901,9 +905,13 @@ class CodeIndexer:
 
         # Determine the actual update mode
         if update_mode == IndexUpdateMode.SMART:
-            if needs_rebuild:
+            # 🔥 FIX: for a new collection, use FULL mode even if the config has not changed
+            is_new = hasattr(self.vector_store, 'is_new_collection') and self.vector_store.is_new_collection
+
+            if needs_rebuild or is_new:
                 actual_mode = IndexUpdateMode.FULL
-                logger.info(f"🔄 Smart mode: full rebuild selected (reason: {rebuild_reason})")
+                reason = rebuild_reason if needs_rebuild else "new collection"
+                logger.info(f"🔄 Smart mode: full rebuild selected (reason: {reason})")
             else:
                 actual_mode = IndexUpdateMode.INCREMENTAL
                 logger.info("📝 Smart mode: incremental update selected")
diff --git a/backend/tests/test_indexer_isolated.py b/backend/tests/test_indexer_isolated.py
new file mode 100644
index 0000000..ac35e3d
--- /dev/null
+++ b/backend/tests/test_indexer_isolated.py
@@ -0,0 +1,88 @@
+
+import sys
+from unittest.mock import MagicMock, AsyncMock
+import unittest
+
+# Mock modules that might be missing or fail on import
+mock_chromadb = MagicMock()
+sys.modules['chromadb'] = mock_chromadb
+sys.modules['chromadb.config'] = MagicMock()
+sys.modules['httpx'] = MagicMock()
+sys.modules['tiktoken'] = MagicMock()
+sys.modules['tree_sitter'] = MagicMock()
+sys.modules['tree_sitter_language_pack'] = MagicMock()
+
+# Now import the class to test
+# We need to ensure dependencies are mocked before this
+from app.services.rag.indexer import CodeIndexer, IndexUpdateMode
+
+class TestIndexerLogicIsolated(unittest.IsolatedAsyncioTestCase):
+    async def test_smart_index_mode_selection(self):
+        mock_vector_store = MagicMock()
+        mock_embedding_service = MagicMock()
+
+        indexer = CodeIndexer(
+            collection_name="test_collection",
+            embedding_service=mock_embedding_service,
+            vector_store=mock_vector_store
+        )
+
+        # Mock methods that are called during smart_index_directory
+        indexer.initialize = AsyncMock(return_value=(False, ""))
+        indexer._full_index = AsyncMock()
+        indexer._incremental_index = AsyncMock()
+
+        async def mock_gen(*args, **kwargs):
+            yield MagicMock()
+        indexer._full_index.side_effect = mock_gen
+        indexer._incremental_index.side_effect = mock_gen
+
+        # 1. Test: New collection (should be FULL)
+        mock_vector_store.is_new_collection = True
+
+        async for _ in indexer.smart_index_directory(directory="/tmp/test"):
+            pass
+
+        # Verify FULL mode was selected because it's a new collection
+        indexer._full_index.assert_called_once()
+        indexer._incremental_index.assert_not_called()
+
+        # 2. Test: Existing collection, no rebuild needed (should be INCREMENTAL)
+        indexer._full_index.reset_mock()
+        indexer._incremental_index.reset_mock()
+        mock_vector_store.is_new_collection = False
+        indexer.initialize = AsyncMock(return_value=(False, ""))  # needs_rebuild = False
+
+        async for _ in indexer.smart_index_directory(directory="/tmp/test"):
+            pass
+
+        indexer._full_index.assert_not_called()
+        indexer._incremental_index.assert_called_once()
+
+    async def test_needs_rebuild_selection(self):
+        mock_vector_store = MagicMock()
+        mock_embedding_service = MagicMock()
+
+        indexer = CodeIndexer(
+            collection_name="test_collection",
+            embedding_service=mock_embedding_service,
+            vector_store=mock_vector_store
+        )
+
+        indexer._full_index = AsyncMock()
+        indexer._incremental_index = AsyncMock()
+        async def mock_gen(*args, **kwargs):
+            yield MagicMock()
+        indexer._full_index.side_effect = mock_gen
+
+        # Test: Existing collection, but needs_rebuild is True (should be FULL)
+        mock_vector_store.is_new_collection = False
+        indexer.initialize = AsyncMock(return_value=(True, "Config changed"))
+
+        async for _ in indexer.smart_index_directory(directory="/tmp/test"):
+            pass
+
+        indexer._full_index.assert_called_once()
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/backend/tests/test_indexer_logic.py b/backend/tests/test_indexer_logic.py
new file mode 100644
index 0000000..c298f46
--- /dev/null
+++ b/backend/tests/test_indexer_logic.py
@@ -0,0 +1,58 @@
+
+import asyncio
+import unittest
+from unittest.mock import MagicMock, AsyncMock
+from app.services.rag.indexer import CodeIndexer, IndexUpdateMode
+
+class TestIndexerLogic(unittest.IsolatedAsyncioTestCase):
+    async def test_smart_index_mode_selection(self):
+        # Mock dependencies
+        mock_vector_store = MagicMock()
+        mock_embedding_service = MagicMock()
+
+        indexer = CodeIndexer(
+            collection_name="test_collection",
+            embedding_service=mock_embedding_service,
+            vector_store=mock_vector_store
+        )
+
+        # 1. Test: New collection (should be FULL)
+        mock_vector_store.is_new_collection = True
+        mock_vector_store.initialize = AsyncMock()
+        mock_vector_store.get_embedding_config = MagicMock(return_value={})
+        mock_vector_store.get_collection_metadata = MagicMock(return_value={})
+
+        # We need to mock _check_rebuild_needed indirectly or just let it run
+        # Since is_new_collection is True, _check_rebuild_needed returns (False, "")
+
+        # We'll use a wrapper to capture the calls to _full_index and _incremental_index
+        indexer._full_index = AsyncMock()
+        indexer._incremental_index = AsyncMock()
+
+        # Mock _full_index as an async generator
+        async def mock_gen(*args, **kwargs):
+            yield MagicMock()
+        indexer._full_index.side_effect = mock_gen
+        indexer._incremental_index.side_effect = mock_gen
+
+        # Run smart_index_directory
+        async for _ in indexer.smart_index_directory(directory="/tmp/test"):
+            pass
+
+        # Verify FULL mode was selected because it's a new collection
+        indexer._full_index.assert_called_once()
+        indexer._incremental_index.assert_not_called()
+
+        # 2. Test: Existing collection, no rebuild needed (should be INCREMENTAL)
+        indexer._full_index.reset_mock()
+        indexer._incremental_index.reset_mock()
+        mock_vector_store.is_new_collection = False
+
+        async for _ in indexer.smart_index_directory(directory="/tmp/test"):
+            pass
+
+        indexer._full_index.assert_not_called()
+        indexer._incremental_index.assert_called_once()
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/backend/tests/verify_logic_sim.py b/backend/tests/verify_logic_sim.py
new file mode 100644
index 0000000..1d3ddab
--- /dev/null
+++ b/backend/tests/verify_logic_sim.py
@@ -0,0 +1,45 @@
+
+import unittest
+from enum import Enum
+
+class IndexUpdateMode(Enum):
+    FULL = "full"
+    INCREMENTAL = "incremental"
+    SMART = "smart"
+
+def select_mode_simulation(update_mode, needs_rebuild, is_new_collection, rebuild_reason=""):
+    """
+    Simulates the logic in indexer.py:smart_index_directory
+    """
+    if update_mode == IndexUpdateMode.SMART:
+        # The logic we implemented:
+        if needs_rebuild or is_new_collection:
+            actual_mode = IndexUpdateMode.FULL
+            reason = rebuild_reason if needs_rebuild else "new collection"
+            print(f"🔄 Smart mode: full rebuild selected (reason: {reason})")
+            return actual_mode
+        else:
+            actual_mode = IndexUpdateMode.INCREMENTAL
+            print("📝 Smart mode: incremental update selected")
+            return actual_mode
+    else:
+        return update_mode
+
+class TestSimulation(unittest.TestCase):
+    def test_new_collection(self):
+        # Case: Smart mode, new collection, no config change
+        mode = select_mode_simulation(IndexUpdateMode.SMART, False, True)
+        self.assertEqual(mode, IndexUpdateMode.FULL)
+
+    def test_existing_rebuild(self):
+        # Case: Smart mode, existing collection, config change
+        mode = select_mode_simulation(IndexUpdateMode.SMART, True, False, "Model changed")
+        self.assertEqual(mode, IndexUpdateMode.FULL)
+
+    def test_existing_no_rebuild(self):
+        # Case: Smart mode, existing collection, no change
+        mode = select_mode_simulation(IndexUpdateMode.SMART, False, False)
+        self.assertEqual(mode, IndexUpdateMode.INCREMENTAL)
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/docker-compose.yml b/docker-compose.yml
index 597241b..e4c596b 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -39,6 +39,7 @@ services:
     volumes:
       # - ./backend/app:/app/app:ro  # Mount the code directory; changes take effect automatically
       - backend_uploads:/app/uploads
+      - chroma_data:/app/data/vector_db
      - /var/run/docker.sock:/var/run/docker.sock  # Required for sandbox execution
     ports:
      - "8000:8000"
@@ -49,9 +50,9 @@
      - REDIS_URL=redis://redis:6379/0
      - AGENT_ENABLED=true
      - SANDBOX_ENABLED=true
-      - SANDBOX_IMAGE=deepaudit/sandbox:latest # Use the locally built sandbox image
+      - SANDBOX_IMAGE=deepaudit/sandbox:latest  # Use the locally built sandbox image
       # Embedding service endpoint
-      - EMBEDDING_BASE_URL=http://host.docker.internal:8003/v1
+      - EMBEDDING_BASE_URL=http://host.docker.internal:8003/v1
       # Gitea configuration
      - GITEA_HOST_URL=http://sl.vrgon.com:3000
      - GITEA_BOT_TOKEN=379a049b8d78965fdff474fc8676bca7e9c70248
@@ -71,7 +72,7 @@
     restart: unless-stopped
     volumes:
       # - ./frontend/dist:/usr/share/nginx/html:ro  # Mount the build output; takes effect after a local pnpm build
-      - ./frontend/nginx.conf:/etc/nginx/conf.d/default.conf:ro # Mount the nginx config
+      - ./frontend/nginx.conf:/etc/nginx/conf.d/default.conf:ro  # Mount the nginx config
     ports:
      - "83:80"  # Nginx listens on port 80
     environment:
@@ -115,9 +116,10 @@ networks:
   deepaudit-network:
     driver: bridge
     driver_opts:
-      com.docker.network.bridge.name: br-deepaudit # Specify the host network interface name
+      com.docker.network.bridge.name: br-deepaudit  # Specify the host network interface name
 
 volumes:
   postgres_data:
   backend_uploads:
+  chroma_data:
   redis_data: