diff --git a/backend/app/services/rag/indexer.py b/backend/app/services/rag/indexer.py
index 6d1ea48..3ece51c 100644
--- a/backend/app/services/rag/indexer.py
+++ b/backend/app/services/rag/indexer.py
@@ -35,9 +35,8 @@ TEXT_EXTENSIONS = {
     ".py", ".js", ".ts", ".tsx", ".jsx", ".java", ".go", ".rs",
     ".cpp", ".c", ".h", ".cc", ".hh", ".hpp", ".hxx", ".cs", ".php", ".rb",
     ".kt", ".swift", ".dart", ".scala", ".sc", ".groovy", ".ktm", ".kts",
-    # .NET
-    ".cshtml", ".vb", ".fs", ".fsi", ".fsx", ".sln", ".csproj", ".vbproj",
-    ".fsproj", ".config", ".asax", ".master", ".ascx", ".asmx", ".svc",
+    # .NET (logic code only; project/solution files are excluded below)
+    ".cshtml", ".vb", ".fs", ".fsi", ".fsx", ".asax", ".master", ".ascx", ".asmx", ".svc",
     # 数据与配置
     ".json", ".yml", ".yaml", ".toml", ".xml", ".properties", ".conf", ".ini",
     # 脚本与命令
@@ -47,7 +46,7 @@ TEXT_EXTENSIONS = {
     ".prisma", ".sol", ".dockerfile", ".tf", ".hcl", ".tfvars",
     # 其他
     ".lua", ".hs", ".lhs", ".clj", ".cljs", ".cljc", ".edn", ".ex", ".exs",
-    ".erl", ".hrl", ".m", ".mm", ".r", ".rmd", ".properties"
+    ".erl", ".hrl", ".m", ".mm", ".r", ".rmd"
 }
 
 # 排除的目录
@@ -85,12 +84,18 @@ EXCLUDE_FILES = {
     ".DS_Store", "thumbs.db", "desktop.ini", "*.pem", "*.crt", "*.key",
     # 临时与日志
     "*.log", "*.bak", "*.swp", "*.tmp", "tags",
-    # IDRE 与特定配置
-    "*.suo", "*.user", "*.sln.docstates", "*.vshost.*", "*.pdb",
-    ".ruby-version", ".nvmrc"
+    # IDE and project configuration (not code)
+    "*.suo", "*.user", "*.sln", "*.csproj", "*.vbproj", "*.fsproj",
+    "*.props", "*.targets", "*.resx", "*.sln.docstates", "*.vshost.*", "*.pdb",
+    "launchSettings.json", "dotnet-tools.json", ".ruby-version", ".nvmrc",
+    # Auto-generated code (noise for the index)
+    "*.Designer.cs", "*.Designer.vb", "*ModelSnapshot.cs", "*.generated.cs", "*.g.cs", "*.g.i.cs",
+    # Large data files (not code)
+    "haarcascade_*.xml"
 }
 
 
+
 class IndexUpdateMode(Enum):
     """索引更新模式"""
     FULL = "full"  # 全量重建:删除旧索引,完全重新索引
@@ -994,8 +999,15 @@ class CodeIndexer:
 
         # 🔥 详细打印所有待索引的文件名,方便调试 (满足用户需求)
         if files:
-            relative_files = [os.path.relpath(f, directory) for f in files]
-            logger.info(f"📄 待索引文件列表: {', '.join(sorted(relative_files))}")
+            # Resolve each relative path once, then sort by it
+            rel_and_abs = sorted(
+                (os.path.relpath(f, directory), f) for f in files
+            )
+            file_infos = [
+                f"{rel} ({self._get_file_size_str(path)})"
+                for rel, path in rel_and_abs
+            ]
+            logger.info(f"📄 待索引文件列表: {', '.join(file_infos)}")
 
         yield progress
 
@@ -1009,6 +1021,8 @@ class CodeIndexer:
                 progress.current_file = relative_path
 
                 # 异步读取文件
+                size_str = self._get_file_size_str(file_path)
+                logger.info(f"📄 正在处理: {relative_path} (大小: {size_str})")
                 content = await asyncio.to_thread(self._read_file_sync, file_path)
                 if not content.strip():
                     progress.processed_files += 1
@@ -1140,9 +1154,11 @@ class CodeIndexer:
 
         # 🔥 详细打印新增、更新和删除的文件名,方便调试 (满足用户需求)
         if files_to_add:
-            logger.info(f"🆕 新增文件 ({len(files_to_add)}): {', '.join(sorted(list(files_to_add)))}")
+            file_infos = [f"{rel} ({self._get_file_size_str(current_file_map.get(rel))})" for rel in sorted(files_to_add)]
+            logger.info(f"🆕 新增文件 ({len(files_to_add)}): {', '.join(file_infos)}")
         if files_to_update:
-            logger.info(f"🔄 更新文件 ({len(files_to_update)}): {', '.join(sorted(list(files_to_update)))}")
+            file_infos = [f"{rel} ({self._get_file_size_str(current_file_map.get(rel))})" for rel in sorted(files_to_update)]
+            logger.info(f"🔄 更新文件 ({len(files_to_update)}): {', '.join(file_infos)}")
         if files_to_delete:
             logger.info(f"🗑️ 删除文件 ({len(files_to_delete)}): {', '.join(sorted(list(files_to_delete)))}")
 
@@ -1171,6 +1187,8 @@ class CodeIndexer:
 
             try:
                 # 异步读取文件
+                size_str = self._get_file_size_str(file_path)
+                logger.info(f"📄 正在处理: {relative_path} (大小: {size_str})")
                 content = await asyncio.to_thread(self._read_file_sync, file_path)
 
                 if not content.strip():
@@ -1305,6 +1323,8 @@ class CodeIndexer:
             progress.current_file = file_path
 
             try:
+                file_size = len(content.encode('utf-8'))
+                logger.info(f"📄 正在处理: {file_path} (大小: {file_size / 1024:.2f} KB)")
                 if not content.strip():
                     progress.processed_files += 1
                     progress.skipped_files += 1
@@ -1381,10 +1401,14 @@ class CodeIndexer:
 
         chunks = unique_chunks
 
+        # Collect the distinct source files involved, for the log lines below
+        file_paths = sorted({chunk.file_path for chunk in chunks})
+        file_label = f"[{file_paths[0]}]" if len(file_paths) == 1 else f"[{len(file_paths)} 个文件]"
+
         # 准备嵌入文本
         texts = [chunk.to_embedding_text() for chunk in chunks]
 
-        logger.info(f"🔢 生成 {len(texts)} 个代码块的嵌入向量...")
+        logger.info(f"🔢 文件: {file_label}, 生成 {len(texts)} 个代码块的嵌入向量...")
 
         # 批量嵌入(带进度回调和取消检查)
         embeddings = await self.embedding_service.embed_batch(
@@ -1400,7 +1424,7 @@ class CodeIndexer:
         metadatas = [chunk.to_dict() for chunk in chunks]
 
         # 添加到向量存储
-        logger.info(f"💾 添加 {len(chunks)} 个代码块到向量存储...")
+        logger.info(f"💾 文件: {file_label} ,添加 {len(chunks)} 个代码块到向量存储...")
 
         if use_upsert:
             await self.vector_store.upsert_documents(
@@ -1417,7 +1441,28 @@ class CodeIndexer:
                 metadatas=metadatas,
             )
 
-        logger.info(f"✅ 索引 {len(chunks)} 个代码块成功")
+        logger.info(f"✅ 文件:{file_label}, 索引 {len(chunks)} 个代码块成功")
+
+    @staticmethod
+    def _get_file_size_str(file_path: Optional[str]) -> str:
+        """Return a human-readable size string for ``file_path``.
+
+        Sizes are rendered as "<n> B", "<n.nn> KB" or "<n.nn> MB" (1024-based).
+        Returns "unknown" when the path is empty/None or its size cannot be
+        read; never raises, so it is safe to call while building log lines.
+        """
+        if not file_path:
+            return "unknown"
+        try:
+            size = os.path.getsize(file_path)
+        except (OSError, ValueError):
+            # Missing, unreadable or malformed path: the size is only cosmetic
+            return "unknown"
+        if size < 1024:
+            return f"{size} B"
+        if size < 1024 * 1024:
+            return f"{size / 1024:.2f} KB"
+        return f"{size / (1024 * 1024):.2f} MB"
 
     def _collect_files(
         self,