diff --git a/backend/app/services/rag/embeddings.py b/backend/app/services/rag/embeddings.py index ba9ebfb..894024b 100644 --- a/backend/app/services/rag/embeddings.py +++ b/backend/app/services/rag/embeddings.py @@ -553,14 +553,16 @@ class QwenEmbedding(EmbeddingProvider): payload = { "model": self.model, "input": truncated_texts, - "encoding_format": "float", } - url = f"{self.base_url.rstrip('/')}/embeddings" try: - async with httpx.AsyncClient(timeout=60) as client: - response = await client.post(url, headers=headers, json=payload) + async with httpx.AsyncClient(timeout=60.0) as client: + response = await client.post( + url, + headers=headers, + json=payload, + ) response.raise_for_status() data = response.json() diff --git a/backend/app/services/rag/indexer.py b/backend/app/services/rag/indexer.py index 775b625..8e2bd36 100644 --- a/backend/app/services/rag/indexer.py +++ b/backend/app/services/rag/indexer.py @@ -30,24 +30,63 @@ INDEX_VERSION = "2.0" # 支持的文本文件扩展名 TEXT_EXTENSIONS = { + # 核心语言 ".py", ".js", ".ts", ".tsx", ".jsx", ".java", ".go", ".rs", - ".cpp", ".c", ".h", ".cc", ".hh", ".cs", ".php", ".rb", - ".kt", ".swift", ".sql", ".sh", ".json", ".yml", ".yaml", - ".xml", ".html", ".css", ".vue", ".svelte", ".md", + ".cpp", ".c", ".h", ".cc", ".hh", ".hpp", ".hxx", ".cs", ".php", ".rb", + ".kt", ".swift", ".dart", ".scala", ".sc", ".groovy", ".ktm", ".kts", + # .NET + ".cshtml", ".vb", ".fs", ".fsi", ".fsx", ".sln", ".csproj", ".vbproj", + ".fsproj", ".config", ".asax", ".master", ".ascx", ".asmx", ".svc", + # 数据与配置 + ".json", ".yml", ".yaml", ".toml", ".xml", ".properties", ".conf", ".ini", + # 脚本与命令 + ".sh", ".bash", ".zsh", ".fish", ".ps1", ".bat", ".cmd", ".sql", ".pl", ".pm", ".t", + # Web + ".html", ".css", ".vue", ".svelte", ".md", ".proto", ".graphql", ".gql", + ".prisma", ".sol", ".dockerfile", ".tf", ".hcl", ".tfvars", + # 其他 + ".lua", ".hs", ".lhs", ".clj", ".cljs", ".cljc", ".edn", ".ex", ".exs", + ".erl", ".hrl", ".m", ".mm", ".r", ".rmd", ".properties" } # 排除的目录 EXCLUDE_DIRS = { - "node_modules", "vendor", "dist", "build", ".git", - "__pycache__", ".pytest_cache", "coverage", ".nyc_output", - ".vscode", ".idea", ".vs", "target", "out", "bin", "obj", - "__MACOSX", ".next", ".nuxt", "venv", "env", ".env", + # 构建与依赖 + "node_modules", "vendor", "dist", "build", "target", "out", "bin", "obj", + "bower_components", "packages", "pkg", "Pods", ".gradle", ".m2", + "vendor/bundle", ".bundle", "jspm_packages", "typings", + # 虚拟环境 + "venv", "env", ".env", "virtualenv", ".venv", + # IDE 与元数据 + ".git", ".svn", ".hg", ".vscode", ".idea", ".vs", "TestResults", + "_ReSharper.*", ".settings", ".project", ".classpath", ".metadata", + # 缓存与日志 + "__pycache__", ".pytest_cache", "coverage", "htmlcov", ".nyc_output", + ".cache", ".next", ".nuxt", ".dart_tool", "htmlcov", "logs", "ipch", + # 云与基础设施 + ".aws-sam", ".serverless", ".terraform", ".terraform.d", "_site", + # 其他 + "__MACOSX", "extern", "externals", "third-party", "3rdparty" } # 排除的文件 EXCLUDE_FILES = { - ".DS_Store", "package-lock.json", "yarn.lock", "pnpm-lock.yaml", - "Cargo.lock", "poetry.lock", "composer.lock", "Gemfile.lock", + # 锁文件 (通常不索引,因为内容太长且无语义) + "package-lock.json", "yarn.lock", "pnpm-lock.yaml", "Cargo.lock", + "poetry.lock", "composer.lock", "Gemfile.lock", "gradle.lockfile", + "mix.lock", "pnpm-workspace.yaml", "shrinkwrap.yaml", + # 静态资源与二进制 + "*.min.js", "*.min.css", "jquery.js", "jquery.min.js", "*.map", + "*.pyc", "*.pyo", "*.pyd", "*.so", "*.dll", "*.exe", "*.o", "*.obj", + "*.a", "*.lib", "*.jar", "*.war", "*.ear", "*.class", + "*.svg", "*.ico", "*.woff*", "*.png", "*.jpg", "*.jpeg", "*.gif", + # 系统与秘密 + ".DS_Store", "thumbs.db", "desktop.ini", "*.pem", "*.crt", "*.key", + # 临时与日志 + "*.log", "*.bak", "*.swp", "*.tmp", "tags", + # IDRE 与特定配置 + "*.suo", "*.user", "*.sln.docstates", "*.vshost.*", "*.pdb", + ".ruby-version", ".nvmrc" } @@ -949,7 +988,7 @@ class CodeIndexer: logger.info(f"📁 发现 {len(files)} 个文件待索引") yield progress - semaphore = asyncio.Semaphore(20) # 控制文件处理并发 + semaphore = asyncio.Semaphore(10) # 降低并行度以平衡 CPU 和内存 file_hashes: Dict[str, str] = {} async def process_file(file_path: str): @@ -1083,7 +1122,7 @@ class CodeIndexer: progress_callback(progress) yield progress - semaphore = asyncio.Semaphore(20) + semaphore = asyncio.Semaphore(10) file_hashes: Dict[str, str] = dict(indexed_file_hashes) async def process_incremental_file(relative_path: str): @@ -1363,11 +1402,35 @@ class CodeIndexer: if ext not in TEXT_EXTENSIONS: continue - # 检查排除文件 - if filename in EXCLUDE_FILES: + # 检查排除文件 (支持通配符) + should_skip_file = False + for pattern in EXCLUDE_FILES: + if fnmatch.fnmatch(filename, pattern): + should_skip_file = True + break + if should_skip_file: continue + # 排除疑似压缩过的文件 (通过行长度和内容分析) file_path = os.path.join(root, filename) + try: + if os.path.getsize(file_path) > 50000: # > 50KB + with open(file_path, 'r', encoding='utf-8', errors='ignore') as f: + # 检查前 5 行,防止第一行是版权注释的情况 + is_minified = False + for _ in range(5): + line = f.readline() + if not line: break + if len(line) > 1000: + is_minified = True + break + + if is_minified: + logger.info(f"⏩ 跳过疑似压缩或非代码文件: {filename}") + continue + except Exception: + pass + relative_path = os.path.relpath(file_path, directory) # 检查排除模式 diff --git a/backend/app/services/rag/splitter.py b/backend/app/services/rag/splitter.py index 5c350b0..cc62344 100644 --- a/backend/app/services/rag/splitter.py +++ b/backend/app/services/rag/splitter.py @@ -155,11 +155,29 @@ class TreeSitterParser: ".c": "c", ".h": "c", ".hpp": "cpp", + ".hxx": "cpp", ".cs": "csharp", ".php": "php", ".rb": "ruby", ".kt": "kotlin", + ".ktm": "kotlin", + ".kts": "kotlin", ".swift": "swift", + ".dart": "dart", + ".scala": "scala", + ".sc": "scala", + ".groovy": "groovy", + ".lua": "lua", + ".hs": "haskell", + ".clj": "clojure", + ".ex": "elixir", + ".erl": "erlang", + ".m": "objective-c", + ".mm": "objective-c", + ".sh": "bash", + ".bash": "bash", + ".zsh": "bash", + ".sql": "sql", } # 各语言的函数/类节点类型 @@ -182,24 +200,55 @@ class TreeSitterParser: "import": ["import_statement"], }, "java": { - "class": ["class_declaration"], + "class": ["class_declaration", "enum_declaration", "record_declaration"], "method": ["method_declaration", "constructor_declaration"], - "interface": ["interface_declaration"], + "interface": ["interface_declaration", "annotation_type_declaration"], "import": ["import_declaration"], }, + "csharp": { + "class": ["class_declaration", "record_declaration", "struct_declaration", "enum_declaration"], + "method": ["method_declaration", "constructor_declaration", "destructor_declaration"], + "interface": ["interface_declaration"], + "import": ["using_directive"], + }, + "cpp": { + "class": ["class_specifier", "struct_specifier", "enum_specifier"], + "function": ["function_definition"], + }, "go": { "struct": ["type_declaration"], "function": ["function_declaration", "method_declaration"], "interface": ["type_declaration"], "import": ["import_declaration"], }, + "rust": { + "struct": ["struct_item", "union_item"], + "enum": ["enum_item"], + "function": ["function_item"], + "class": ["impl_item", "trait_item"], + }, + "php": { + "class": ["class_declaration"], + "function": ["function_definition", "method_definition"], + "interface": ["interface_declaration"], + }, + "ruby": { + "class": ["class", "module"], + "function": ["method"], + }, + "swift": { + "class": ["class_declaration", "struct_declaration", "enum_declaration"], + "function": ["function_declaration"], + "interface": ["protocol_declaration"], + }, } # tree-sitter-languages 支持的语言列表 SUPPORTED_LANGUAGES = { "python", "javascript", "typescript", "tsx", "java", "go", "rust", "c", "cpp", "csharp", "php", "ruby", "kotlin", "swift", "bash", - "json", "yaml", "html", "css", "sql", "markdown", + "json", "yaml", "html", "css", "sql", "markdown", "dart", "scala", + "lua", "haskell", "clojure", "elixir", "erlang", "objective-c" } def __init__(self): @@ -384,6 +433,26 @@ class CodeSplitter: (r"\$_POST\[", "post_input"), (r"\$_REQUEST\[", "request_input"), ], + "csharp": [ + (r"Process\.Start\s*\(", "process_start"), + (r"SqlCommand\s*\(.*\+", "sql_concat"), + (r"Deserialize\s*\(", "deserialization"), + (r"AllowHtml\s*=", "unsafe_html"), + (r"password\s*=", "password_assign"), + ], + "cpp": [ + (r"\bsystem\s*\(", "system_call"), + (r"\bpopen\s*\(", "popen"), + (r"\bstrcpy\s*\(", "unsafe_string_copy"), + (r"\bsprintf\s*\(", "unsafe_string_format"), + (r"\bmalloc\s*\(", "memory_allocation"), + ], + "ruby": [ + (r"\beval\s*\(", "eval"), + (r"`.*`", "shell_execution"), + (r"system\s*\(", "system_call"), + (r"send\s*\(", "dynamic_method_call"), + ], } def __init__( @@ -585,6 +654,25 @@ class CodeSplitter: (r"^(\s*)interface\s+(\w+)", ChunkType.INTERFACE), (r"^(\s*)(?:public|private|protected)?\s*(?:static\s+)?function\s+(\w+)", ChunkType.FUNCTION), ], + "csharp": [ + (r"^(\s*)(?:public|private|protected|internal)?\s*(?:static\s+)?(?:partial\s+)?(?:class|record|struct|enum)\s+(\w+)", ChunkType.CLASS), + (r"^(\s*)(?:public|private|protected|internal)?\s*interface\s+(\w+)", ChunkType.INTERFACE), + (r"^(\s*)(?:public|private|protected|internal)?\s*(?:async\s+)?(?:static\s+)?[\w<>\[\],\s]+\s+(\w+)\s*\([^)]*\)", ChunkType.METHOD), + ], + "cpp": [ + (r"^(\s*)(?:class|struct)\s+(\w+)", ChunkType.CLASS), + (r"^(\s*)[\w<>:]+\s+(\w+)\s*\([^)]*\)\s*\{", ChunkType.FUNCTION), + ], + "ruby": [ + (r"^(\s*)(?:class|module)\s+(\w+)", ChunkType.CLASS), + (r"^(\s*)def\s+(\w+)", ChunkType.FUNCTION), + ], + "rust": [ + (r"^(\s*)(?:pub\s+)?(?:struct|enum|union)\s+(\w+)", ChunkType.CLASS), + (r"^(\s*)(?:pub\s+)?(?:async\s+)?fn\s+(\w+)", ChunkType.FUNCTION), + (r"^(\s*)(?:pub\s+)?impl", ChunkType.CLASS), + (r"^(\s*)(?:pub\s+)?trait\s+(\w+)", ChunkType.INTERFACE), + ], } lang_patterns = patterns.get(language, []) @@ -788,9 +876,23 @@ class CodeSplitter: "java": [ r"^import\s+([\w.]+);", ], + "csharp": [ + r"^using\s+([\w.]+);", + ], "go": [ r"['\"]([^'\"]+)['\"]", ], + "cpp": [ + r'^#include\s+["<]([^">]+)[">]', + ], + "php": [ + r"^use\s+([\w\\]+);", + r"require(?:_once)?\s*\(['\"]([^'\"]+)['\"]\)", + ], + "ruby": [ + r"require\s+['\"]([^'\"]+)['\"]", + r"require_relative\s+['\"]([^'\"]+)['\"]", + ], } for pattern in patterns.get(language, []): @@ -835,10 +937,21 @@ class CodeSplitter: r"class\s+(\w+)", r"(\w+)\s*=\s*", ], - "javascript": [ - r"function\s+(\w+)", - r"(?:const|let|var)\s+(\w+)", - r"class\s+(\w+)", + "java": [ + r"(?:public|private|protected)?\s*(?:static\s+)?(?:final\s+)?(?:class|interface|enum|record)\s+(\w+)", + r"(?:public|private|protected)?\s*(?:static\s+)?[\w<>\[\],\s]+\s+(\w+)\s*\([^)]*\)", + ], + "csharp": [ + r"(?:class|record|struct|enum|interface)\s+(\w+)", + r"[\w<>\[\],\s]+\s+(\w+)\s*\([^)]*\)", + ], + "cpp": [ + r"(?:class|struct)\s+(\w+)", + r"(?:[\w<>:]+)\s+(\w+)\s*\([^)]*\)\s*\{", + ], + "rust": [ + r"(?:struct|enum|union|trait)\s+(\w+)", + r"fn\s+(\w+)", ], } diff --git a/backend/app/services/scanner.py b/backend/app/services/scanner.py index f3459f0..be046f5 100644 --- a/backend/app/services/scanner.py +++ b/backend/app/services/scanner.py @@ -37,18 +37,32 @@ def get_analysis_config(user_config: Optional[Dict[str, Any]] = None) -> Dict[st # 支持的文本文件扩展名 TEXT_EXTENSIONS = [ - ".js", ".ts", ".tsx", ".jsx", ".py", ".java", ".go", ".rs", - ".cpp", ".c", ".h", ".cc", ".hh", ".cs", ".php", ".rb", - ".kt", ".swift", ".sql", ".sh", ".json", ".yml", ".yaml" + ".py", ".js", ".ts", ".tsx", ".jsx", ".java", ".go", ".rs", + ".cpp", ".c", ".h", ".cc", ".hh", ".hpp", ".hxx", ".cs", ".php", ".rb", + ".kt", ".swift", ".dart", ".scala", ".sc", ".groovy", ".ktm", ".kts", + ".cshtml", ".vb", ".fs", ".fsi", ".fsx", ".sln", ".csproj", ".vbproj", + ".fsproj", ".config", ".asax", ".master", ".ascx", ".asmx", ".svc", + ".json", ".yml", ".yaml", ".toml", ".xml", ".properties", ".conf", ".ini", + ".sh", ".bash", ".zsh", ".fish", ".ps1", ".bat", ".cmd", ".sql", ".pl", ".pm", ".t", + ".html", ".css", ".vue", ".svelte", ".md", ".proto", ".graphql", ".gql", + ".prisma", ".sol", ".dockerfile", ".tf", ".hcl", ".tfvars", + ".lua", ".hs", ".lhs", ".clj", ".cljs", ".cljc", ".edn", ".ex", ".exs", + ".erl", ".hrl", ".m", ".mm", ".r", ".rmd" ] # 排除的目录和文件模式 EXCLUDE_PATTERNS = [ - "node_modules/", "vendor/", "dist/", "build/", ".git/", - "__pycache__/", ".pytest_cache/", "coverage/", ".nyc_output/", - ".vscode/", ".idea/", ".vs/", "target/", "out/", - "__MACOSX/", ".DS_Store", "package-lock.json", "yarn.lock", - "pnpm-lock.yaml", ".min.js", ".min.css", ".map" + # 常用目录 + "node_modules/", "vendor/", "dist/", "build/", "target/", "out/", "bin/", "obj/", + ".git/", ".svn/", ".hg/", ".vscode/", ".idea/", ".vs/", ".settings/", + ".gradle/", ".m2/", "venv/", "env/", ".env/", "__pycache__/", + ".pytest_cache/", "coverage/", ".nyc_output/", "bower_components/", + "packages/", "pkg/", "Pods/", "TestResults/", "_ReSharper.*", + # 常见锁文件与二进制 + "package-lock.json", "yarn.lock", "pnpm-lock.yaml", "Cargo.lock", + "poetry.lock", "composer.lock", "Gemfile.lock", "gradle.lockfile", + ".min.js", ".min.css", ".map", ".DS_Store", "*.pdb", "*.dll", "*.exe", + "*.o", "*.obj", "*.a", "*.lib", "*.jar", "*.war", "*.class" ] @@ -67,13 +81,26 @@ def get_language_from_path(path: str) -> str: """从文件路径获取语言类型""" ext = path.split('.')[-1].lower() if '.' in path else '' language_map = { + 'py': 'python', 'js': 'javascript', 'jsx': 'javascript', 'ts': 'typescript', 'tsx': 'typescript', - 'py': 'python', 'java': 'java', 'go': 'go', - 'rs': 'rust', 'cpp': 'cpp', 'c': 'cpp', - 'cc': 'cpp', 'h': 'cpp', 'hh': 'cpp', + 'java': 'java', 'go': 'go', 'rs': 'rust', + 'cpp': 'cpp', 'c': 'c', 'cc': 'cpp', 'h': 'c', 'hh': 'cpp', + 'hpp': 'cpp', 'hxx': 'cpp', 'cs': 'csharp', 'php': 'php', 'rb': 'ruby', - 'kt': 'kotlin', 'swift': 'swift' + 'kt': 'kotlin', 'ktm': 'kotlin', 'kts': 'kotlin', + 'swift': 'swift', 'dart': 'dart', + 'scala': 'scala', 'sc': 'scala', + 'groovy': 'groovy', 'gsh': 'groovy', 'gvy': 'groovy', 'gy': 'groovy', + 'sql': 'sql', 'sh': 'bash', 'bash': 'bash', 'zsh': 'bash', + 'pl': 'perl', 'pm': 'perl', 't': 'perl', + 'lua': 'lua', 'hs': 'haskell', 'lhs': 'haskell', + 'clj': 'clojure', 'cljs': 'clojure', 'cljc': 'clojure', 'edn': 'clojure', + 'ex': 'elixir', 'exs': 'elixir', 'erl': 'erlang', 'hrl': 'erlang', + 'm': 'objective-c', 'mm': 'objective-c', + 'r': 'r', 'rmd': 'r', + 'vb': 'visual-basic', 'fs': 'fsharp', 'fsi': 'fsharp', 'fsx': 'fsharp', + 'tf': 'hcl', 'hcl': 'hcl', 'dockerfile': 'dockerfile' } return language_map.get(ext, 'text')