From 7efb89d2d2938c97d0848438682b2ee82c91e7fb Mon Sep 17 00:00:00 2001
From: lintsinghua
Date: Tue, 16 Dec 2025 18:49:30 +0800
Subject: [PATCH] fix(rag): fix duplicate code chunk IDs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add deduplication logic in CodeIndexer so that no code chunks with
duplicate IDs are indexed.
Generate the chunk ID from a hash of the full content to improve uniqueness.

---
 backend/app/services/rag/indexer.py  | 15 +++++++++++++++
 backend/app/services/rag/splitter.py |  3 ++-
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/backend/app/services/rag/indexer.py b/backend/app/services/rag/indexer.py
index 2f22c60..168d489 100644
--- a/backend/app/services/rag/indexer.py
+++ b/backend/app/services/rag/indexer.py
@@ -1241,6 +1241,21 @@ class CodeIndexer:
         if not chunks:
             return
 
+        # Deduplicate: make sure no two chunks share the same ID
+        seen_ids: Set[str] = set()
+        unique_chunks: List[CodeChunk] = []
+        for chunk in chunks:
+            if chunk.id not in seen_ids:
+                seen_ids.add(chunk.id)
+                unique_chunks.append(chunk)
+            else:
+                logger.warning(f"Skipping chunk with duplicate ID: {chunk.id} ({chunk.file_path}:{chunk.line_start})")
+
+        if len(unique_chunks) < len(chunks):
+            logger.info(f"🔄 Dedup: {len(chunks)} -> {len(unique_chunks)} chunks")
+
+        chunks = unique_chunks
+
         # Prepare embedding texts
         texts = [chunk.to_embedding_text() for chunk in chunks]
 
diff --git a/backend/app/services/rag/splitter.py b/backend/app/services/rag/splitter.py
index 2144f1c..4dbc89e 100644
--- a/backend/app/services/rag/splitter.py
+++ b/backend/app/services/rag/splitter.py
@@ -78,7 +78,8 @@ class CodeChunk:
         self.estimated_tokens = self._estimate_tokens()
 
     def _generate_id(self) -> str:
-        content = f"{self.file_path}:{self.line_start}:{self.line_end}:{self.content[:100]}"
+        # Hash the full content to ensure uniqueness
+        content = f"{self.file_path}:{self.line_start}:{self.line_end}:{self.content}"
        return hashlib.sha256(content.encode()).hexdigest()[:16]
 
     def _estimate_tokens(self) -> int:
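
For reference, below is a minimal standalone sketch of the two changes in this
patch, assuming nothing beyond the Python standard library: the chunk ID is a
SHA-256 hash of the full content (not content[:100]), and a dedup pass keeps
only the first chunk seen for each ID. DummyChunk and dedup_chunks are
illustrative names, not code from the repository; the real CodeChunk and
CodeIndexer carry more fields and state.

# Illustrative sketch only; mirrors the patch above, not the actual classes.
import hashlib
from dataclasses import dataclass, field
from typing import List, Set


@dataclass
class DummyChunk:
    """Stripped-down stand-in for CodeChunk, just enough to show ID generation."""
    file_path: str
    line_start: int
    line_end: int
    content: str
    id: str = field(init=False)

    def __post_init__(self) -> None:
        # Hash the *full* content, so chunks that share a long common prefix
        # still get distinct IDs (the old scheme hashed only content[:100]).
        raw = f"{self.file_path}:{self.line_start}:{self.line_end}:{self.content}"
        self.id = hashlib.sha256(raw.encode()).hexdigest()[:16]


def dedup_chunks(chunks: List[DummyChunk]) -> List[DummyChunk]:
    """Keep the first chunk for each ID, mirroring the indexer's new dedup pass."""
    seen_ids: Set[str] = set()
    unique: List[DummyChunk] = []
    for chunk in chunks:
        if chunk.id not in seen_ids:
            seen_ids.add(chunk.id)
            unique.append(chunk)
    return unique


if __name__ == "__main__":
    prefix = "#" * 120  # identical 120-char prefix, different suffixes
    a = DummyChunk("app.py", 1, 10, prefix + "def a(): ...")
    b = DummyChunk("app.py", 1, 10, prefix + "def b(): ...")
    assert a.id != b.id                      # distinct IDs under full-content hashing
    assert len(dedup_chunks([a, a, b])) == 2  # exact duplicate of `a` is dropped

The two sample chunks share a 120-character prefix, so under the old
content[:100] scheme they would have received the same ID; with the
full-content hash they stay distinct, and only the exact duplicate is dropped
by the dedup pass.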