fix(rag): 修复代码块ID重复问题
在CodeIndexer中添加去重逻辑,确保没有重复ID的代码块被索引 使用完整内容生成hash作为ID,提高唯一性
This commit is contained in:
parent
96560e6474
commit
7efb89d2d2
|
|
@@ -1241,6 +1241,21 @@ class CodeIndexer:
|
|||
if not chunks:
|
||||
return
|
||||
|
||||
# 去重:确保没有重复的 ID
|
||||
seen_ids: Set[str] = set()
|
||||
unique_chunks: List[CodeChunk] = []
|
||||
for chunk in chunks:
|
||||
if chunk.id not in seen_ids:
|
||||
seen_ids.add(chunk.id)
|
||||
unique_chunks.append(chunk)
|
||||
else:
|
||||
logger.warning(f"跳过重复 ID 的代码块: {chunk.id} ({chunk.file_path}:{chunk.line_start})")
|
||||
|
||||
if len(unique_chunks) < len(chunks):
|
||||
logger.info(f"🔄 去重: {len(chunks)} -> {len(unique_chunks)} 个代码块")
|
||||
|
||||
chunks = unique_chunks
|
||||
|
||||
# 准备嵌入文本
|
||||
texts = [chunk.to_embedding_text() for chunk in chunks]
|
||||
|
||||
|
|
|
|||
|
|
@@ -78,7 +78,8 @@ class CodeChunk:
|
|||
self.estimated_tokens = self._estimate_tokens()
|
||||
|
||||
def _generate_id(self) -> str:
|
||||
content = f"{self.file_path}:{self.line_start}:{self.line_end}:{self.content[:100]}"
|
||||
# 使用完整内容的 hash 确保唯一性
|
||||
content = f"{self.file_path}:{self.line_start}:{self.line_end}:{self.content}"
|
||||
return hashlib.sha256(content.encode()).hexdigest()[:16]
|
||||
|
||||
def _estimate_tokens(self) -> int:
|
||||
|
|
|
|||
Loading…
Reference in New Issue