feat: Add extensive language support for code splitting, scanning, and vulnerability detection by expanding file extensions, Tree-sitter node types, vulnerability patterns, and exclusion rules.

This commit is contained in:
vinland100 2026-01-06 15:18:38 +08:00
parent 969d899476
commit f743357bd7
4 changed files with 241 additions and 36 deletions

View File

@ -553,14 +553,16 @@ class QwenEmbedding(EmbeddingProvider):
payload = {
"model": self.model,
"input": truncated_texts,
"encoding_format": "float",
}
url = f"{self.base_url.rstrip('/')}/embeddings"
try:
async with httpx.AsyncClient(timeout=60) as client:
response = await client.post(url, headers=headers, json=payload)
async with httpx.AsyncClient(timeout=60.0) as client:
response = await client.post(
url,
headers=headers,
json=payload,
)
response.raise_for_status()
data = response.json()

View File

@ -30,24 +30,63 @@ INDEX_VERSION = "2.0"
# Supported text file extensions.
# The file walk in this module skips any file whose extension is not in this
# set (see the `ext not in TEXT_EXTENSIONS` check).
TEXT_EXTENSIONS = {
# Core languages
".py", ".js", ".ts", ".tsx", ".jsx", ".java", ".go", ".rs",
".cpp", ".c", ".h", ".cc", ".hh", ".cs", ".php", ".rb",
".kt", ".swift", ".sql", ".sh", ".json", ".yml", ".yaml",
".xml", ".html", ".css", ".vue", ".svelte", ".md",
".cpp", ".c", ".h", ".cc", ".hh", ".hpp", ".hxx", ".cs", ".php", ".rb",
".kt", ".swift", ".dart", ".scala", ".sc", ".groovy", ".ktm", ".kts",
# .NET
".cshtml", ".vb", ".fs", ".fsi", ".fsx", ".sln", ".csproj", ".vbproj",
".fsproj", ".config", ".asax", ".master", ".ascx", ".asmx", ".svc",
# Data and configuration
".json", ".yml", ".yaml", ".toml", ".xml", ".properties", ".conf", ".ini",
# Scripts and shell commands
".sh", ".bash", ".zsh", ".fish", ".ps1", ".bat", ".cmd", ".sql", ".pl", ".pm", ".t",
# Web
".html", ".css", ".vue", ".svelte", ".md", ".proto", ".graphql", ".gql",
# NOTE(review): ".dockerfile" only matches names such as "app.dockerfile";
# a plain "Dockerfile" has no extension - confirm how `ext` is derived.
".prisma", ".sol", ".dockerfile", ".tf", ".hcl", ".tfvars",
# Other
".lua", ".hs", ".lhs", ".clj", ".cljs", ".cljc", ".edn", ".ex", ".exs",
# NOTE(review): ".properties" is already listed under data/configuration
# above; the repeat is harmless in a set but can be dropped.
".erl", ".hrl", ".m", ".mm", ".r", ".rmd", ".properties"
}
# Excluded directories.
# NOTE(review): the code that consumes this set is not visible here. Entries
# such as "_ReSharper.*" (glob) and "vendor/bundle" (contains a path
# separator) only work if directories are matched by pattern/path rather than
# by exact name membership - confirm against the directory check.
EXCLUDE_DIRS = {
"node_modules", "vendor", "dist", "build", ".git",
"__pycache__", ".pytest_cache", "coverage", ".nyc_output",
".vscode", ".idea", ".vs", "target", "out", "bin", "obj",
"__MACOSX", ".next", ".nuxt", "venv", "env", ".env",
# Build output and dependencies
"node_modules", "vendor", "dist", "build", "target", "out", "bin", "obj",
"bower_components", "packages", "pkg", "Pods", ".gradle", ".m2",
"vendor/bundle", ".bundle", "jspm_packages", "typings",
# Virtual environments
"venv", "env", ".env", "virtualenv", ".venv",
# IDE and metadata
".git", ".svn", ".hg", ".vscode", ".idea", ".vs", "TestResults",
"_ReSharper.*", ".settings", ".project", ".classpath", ".metadata",
# Caches and logs
# NOTE(review): "htmlcov" appears on both of the next two lines; the repeat
# is harmless in a set.
"__pycache__", ".pytest_cache", "coverage", "htmlcov", ".nyc_output",
".cache", ".next", ".nuxt", ".dart_tool", "htmlcov", "logs", "ipch",
# Cloud and infrastructure
".aws-sam", ".serverless", ".terraform", ".terraform.d", "_site",
# Other
"__MACOSX", "extern", "externals", "third-party", "3rdparty"
}
# Excluded files.
# Each entry is matched against the bare file name with fnmatch.fnmatch in the
# file walk of this module, so both exact names and glob patterns are accepted.
EXCLUDE_FILES = {
".DS_Store", "package-lock.json", "yarn.lock", "pnpm-lock.yaml",
"Cargo.lock", "poetry.lock", "composer.lock", "Gemfile.lock",
# Lock files (usually not indexed: their content is very long and carries
# no semantic value)
"package-lock.json", "yarn.lock", "pnpm-lock.yaml", "Cargo.lock",
"poetry.lock", "composer.lock", "Gemfile.lock", "gradle.lockfile",
"mix.lock", "pnpm-workspace.yaml", "shrinkwrap.yaml",
# Static assets and binaries
"*.min.js", "*.min.css", "jquery.js", "jquery.min.js", "*.map",
"*.pyc", "*.pyo", "*.pyd", "*.so", "*.dll", "*.exe", "*.o", "*.obj",
"*.a", "*.lib", "*.jar", "*.war", "*.ear", "*.class",
"*.svg", "*.ico", "*.woff*", "*.png", "*.jpg", "*.jpeg", "*.gif",
# System files and secrets
".DS_Store", "thumbs.db", "desktop.ini", "*.pem", "*.crt", "*.key",
# Temporary files and logs
"*.log", "*.bak", "*.swp", "*.tmp", "tags",
# IDE and tool-specific configuration
"*.suo", "*.user", "*.sln.docstates", "*.vshost.*", "*.pdb",
".ruby-version", ".nvmrc"
}
@ -949,7 +988,7 @@ class CodeIndexer:
logger.info(f"📁 发现 {len(files)} 个文件待索引")
yield progress
semaphore = asyncio.Semaphore(20) # 控制文件处理并发
semaphore = asyncio.Semaphore(10) # 降低并行度以平衡 CPU 和内存
file_hashes: Dict[str, str] = {}
async def process_file(file_path: str):
@ -1083,7 +1122,7 @@ class CodeIndexer:
progress_callback(progress)
yield progress
semaphore = asyncio.Semaphore(20)
semaphore = asyncio.Semaphore(10)
file_hashes: Dict[str, str] = dict(indexed_file_hashes)
async def process_incremental_file(relative_path: str):
@ -1363,11 +1402,35 @@ class CodeIndexer:
if ext not in TEXT_EXTENSIONS:
continue
# 检查排除文件
if filename in EXCLUDE_FILES:
# 检查排除文件 (支持通配符)
should_skip_file = False
for pattern in EXCLUDE_FILES:
if fnmatch.fnmatch(filename, pattern):
should_skip_file = True
break
if should_skip_file:
continue
# 排除疑似压缩过的文件 (通过行长度和内容分析)
file_path = os.path.join(root, filename)
try:
if os.path.getsize(file_path) > 50000: # > 50KB
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
# 检查前 5 行,防止第一行是版权注释的情况
is_minified = False
for _ in range(5):
line = f.readline()
if not line: break
if len(line) > 1000:
is_minified = True
break
if is_minified:
logger.info(f"⏩ 跳过疑似压缩或非代码文件: {filename}")
continue
except Exception:
pass
relative_path = os.path.relpath(file_path, directory)
# 检查排除模式

View File

@ -155,11 +155,29 @@ class TreeSitterParser:
".c": "c",
".h": "c",
".hpp": "cpp",
".hxx": "cpp",
".cs": "csharp",
".php": "php",
".rb": "ruby",
".kt": "kotlin",
".ktm": "kotlin",
".kts": "kotlin",
".swift": "swift",
".dart": "dart",
".scala": "scala",
".sc": "scala",
".groovy": "groovy",
".lua": "lua",
".hs": "haskell",
".clj": "clojure",
".ex": "elixir",
".erl": "erlang",
".m": "objective-c",
".mm": "objective-c",
".sh": "bash",
".bash": "bash",
".zsh": "bash",
".sql": "sql",
}
# 各语言的函数/类节点类型
@ -182,24 +200,55 @@ class TreeSitterParser:
"import": ["import_statement"],
},
"java": {
"class": ["class_declaration"],
"class": ["class_declaration", "enum_declaration", "record_declaration"],
"method": ["method_declaration", "constructor_declaration"],
"interface": ["interface_declaration"],
"interface": ["interface_declaration", "annotation_type_declaration"],
"import": ["import_declaration"],
},
"csharp": {
"class": ["class_declaration", "record_declaration", "struct_declaration", "enum_declaration"],
"method": ["method_declaration", "constructor_declaration", "destructor_declaration"],
"interface": ["interface_declaration"],
"import": ["using_directive"],
},
"cpp": {
"class": ["class_specifier", "struct_specifier", "enum_specifier"],
"function": ["function_definition"],
},
"go": {
"struct": ["type_declaration"],
"function": ["function_declaration", "method_declaration"],
"interface": ["type_declaration"],
"import": ["import_declaration"],
},
"rust": {
"struct": ["struct_item", "union_item"],
"enum": ["enum_item"],
"function": ["function_item"],
"class": ["impl_item", "trait_item"],
},
"php": {
"class": ["class_declaration"],
"function": ["function_definition", "method_definition"],
"interface": ["interface_declaration"],
},
"ruby": {
"class": ["class", "module"],
"function": ["method"],
},
"swift": {
"class": ["class_declaration", "struct_declaration", "enum_declaration"],
"function": ["function_declaration"],
"interface": ["protocol_declaration"],
},
}
# Languages supported by tree-sitter-languages.
# NOTE(review): the extension map in this class maps ".groovy" to "groovy",
# which is not listed in this set - confirm whether that omission is intended.
SUPPORTED_LANGUAGES = {
"python", "javascript", "typescript", "tsx", "java", "go", "rust",
"c", "cpp", "csharp", "php", "ruby", "kotlin", "swift", "bash",
"json", "yaml", "html", "css", "sql", "markdown",
"json", "yaml", "html", "css", "sql", "markdown", "dart", "scala",
"lua", "haskell", "clojure", "elixir", "erlang", "objective-c"
}
def __init__(self):
@ -384,6 +433,26 @@ class CodeSplitter:
(r"\$_POST\[", "post_input"),
(r"\$_REQUEST\[", "request_input"),
],
"csharp": [
(r"Process\.Start\s*\(", "process_start"),
(r"SqlCommand\s*\(.*\+", "sql_concat"),
(r"Deserialize\s*\(", "deserialization"),
(r"AllowHtml\s*=", "unsafe_html"),
(r"password\s*=", "password_assign"),
],
"cpp": [
(r"\bsystem\s*\(", "system_call"),
(r"\bpopen\s*\(", "popen"),
(r"\bstrcpy\s*\(", "unsafe_string_copy"),
(r"\bsprintf\s*\(", "unsafe_string_format"),
(r"\bmalloc\s*\(", "memory_allocation"),
],
"ruby": [
(r"\beval\s*\(", "eval"),
(r"`.*`", "shell_execution"),
(r"system\s*\(", "system_call"),
(r"send\s*\(", "dynamic_method_call"),
],
}
def __init__(
@ -585,6 +654,25 @@ class CodeSplitter:
(r"^(\s*)interface\s+(\w+)", ChunkType.INTERFACE),
(r"^(\s*)(?:public|private|protected)?\s*(?:static\s+)?function\s+(\w+)", ChunkType.FUNCTION),
],
"csharp": [
(r"^(\s*)(?:public|private|protected|internal)?\s*(?:static\s+)?(?:partial\s+)?(?:class|record|struct|enum)\s+(\w+)", ChunkType.CLASS),
(r"^(\s*)(?:public|private|protected|internal)?\s*interface\s+(\w+)", ChunkType.INTERFACE),
(r"^(\s*)(?:public|private|protected|internal)?\s*(?:async\s+)?(?:static\s+)?[\w<>\[\],\s]+\s+(\w+)\s*\([^)]*\)", ChunkType.METHOD),
],
"cpp": [
(r"^(\s*)(?:class|struct)\s+(\w+)", ChunkType.CLASS),
(r"^(\s*)[\w<>:]+\s+(\w+)\s*\([^)]*\)\s*\{", ChunkType.FUNCTION),
],
"ruby": [
(r"^(\s*)(?:class|module)\s+(\w+)", ChunkType.CLASS),
(r"^(\s*)def\s+(\w+)", ChunkType.FUNCTION),
],
"rust": [
(r"^(\s*)(?:pub\s+)?(?:struct|enum|union)\s+(\w+)", ChunkType.CLASS),
(r"^(\s*)(?:pub\s+)?(?:async\s+)?fn\s+(\w+)", ChunkType.FUNCTION),
(r"^(\s*)(?:pub\s+)?impl", ChunkType.CLASS),
(r"^(\s*)(?:pub\s+)?trait\s+(\w+)", ChunkType.INTERFACE),
],
}
lang_patterns = patterns.get(language, [])
@ -788,9 +876,23 @@ class CodeSplitter:
"java": [
r"^import\s+([\w.]+);",
],
"csharp": [
r"^using\s+([\w.]+);",
],
"go": [
r"['\"]([^'\"]+)['\"]",
],
"cpp": [
r'^#include\s+["<]([^">]+)[">]',
],
"php": [
r"^use\s+([\w\\]+);",
r"require(?:_once)?\s*\(['\"]([^'\"]+)['\"]\)",
],
"ruby": [
r"require\s+['\"]([^'\"]+)['\"]",
r"require_relative\s+['\"]([^'\"]+)['\"]",
],
}
for pattern in patterns.get(language, []):
@ -835,10 +937,21 @@ class CodeSplitter:
r"class\s+(\w+)",
r"(\w+)\s*=\s*",
],
"javascript": [
r"function\s+(\w+)",
r"(?:const|let|var)\s+(\w+)",
r"class\s+(\w+)",
"java": [
r"(?:public|private|protected)?\s*(?:static\s+)?(?:final\s+)?(?:class|interface|enum|record)\s+(\w+)",
r"(?:public|private|protected)?\s*(?:static\s+)?[\w<>\[\],\s]+\s+(\w+)\s*\([^)]*\)",
],
"csharp": [
r"(?:class|record|struct|enum|interface)\s+(\w+)",
r"[\w<>\[\],\s]+\s+(\w+)\s*\([^)]*\)",
],
"cpp": [
r"(?:class|struct)\s+(\w+)",
r"(?:[\w<>:]+)\s+(\w+)\s*\([^)]*\)\s*\{",
],
"rust": [
r"(?:struct|enum|union|trait)\s+(\w+)",
r"fn\s+(\w+)",
],
}

View File

@ -37,18 +37,32 @@ def get_analysis_config(user_config: Optional[Dict[str, Any]] = None) -> Dict[st
# Supported text file extensions.
TEXT_EXTENSIONS = [
".js", ".ts", ".tsx", ".jsx", ".py", ".java", ".go", ".rs",
".cpp", ".c", ".h", ".cc", ".hh", ".cs", ".php", ".rb",
".kt", ".swift", ".sql", ".sh", ".json", ".yml", ".yaml"
".py", ".js", ".ts", ".tsx", ".jsx", ".java", ".go", ".rs",
".cpp", ".c", ".h", ".cc", ".hh", ".hpp", ".hxx", ".cs", ".php", ".rb",
".kt", ".swift", ".dart", ".scala", ".sc", ".groovy", ".ktm", ".kts",
".cshtml", ".vb", ".fs", ".fsi", ".fsx", ".sln", ".csproj", ".vbproj",
".fsproj", ".config", ".asax", ".master", ".ascx", ".asmx", ".svc",
".json", ".yml", ".yaml", ".toml", ".xml", ".properties", ".conf", ".ini",
".sh", ".bash", ".zsh", ".fish", ".ps1", ".bat", ".cmd", ".sql", ".pl", ".pm", ".t",
".html", ".css", ".vue", ".svelte", ".md", ".proto", ".graphql", ".gql",
".prisma", ".sol", ".dockerfile", ".tf", ".hcl", ".tfvars",
".lua", ".hs", ".lhs", ".clj", ".cljs", ".cljc", ".edn", ".ex", ".exs",
".erl", ".hrl", ".m", ".mm", ".r", ".rmd"
]
# Excluded directory and file patterns.
# NOTE(review): the list mixes three shapes - directory prefixes ending in
# "/", bare suffixes such as ".min.js", and globs such as "*.pdb" or
# "_ReSharper.*". The matching code is not visible here; confirm it handles
# all three (a plain substring test would never match the "*" entries).
EXCLUDE_PATTERNS = [
"node_modules/", "vendor/", "dist/", "build/", ".git/",
"__pycache__/", ".pytest_cache/", "coverage/", ".nyc_output/",
".vscode/", ".idea/", ".vs/", "target/", "out/",
"__MACOSX/", ".DS_Store", "package-lock.json", "yarn.lock",
"pnpm-lock.yaml", ".min.js", ".min.css", ".map"
# Common directories
"node_modules/", "vendor/", "dist/", "build/", "target/", "out/", "bin/", "obj/",
".git/", ".svn/", ".hg/", ".vscode/", ".idea/", ".vs/", ".settings/",
".gradle/", ".m2/", "venv/", "env/", ".env/", "__pycache__/",
".pytest_cache/", "coverage/", ".nyc_output/", "bower_components/",
"packages/", "pkg/", "Pods/", "TestResults/", "_ReSharper.*",
# Common lock files and binaries
"package-lock.json", "yarn.lock", "pnpm-lock.yaml", "Cargo.lock",
"poetry.lock", "composer.lock", "Gemfile.lock", "gradle.lockfile",
".min.js", ".min.css", ".map", ".DS_Store", "*.pdb", "*.dll", "*.exe",
"*.o", "*.obj", "*.a", "*.lib", "*.jar", "*.war", "*.class"
]
@ -67,13 +81,26 @@ def get_language_from_path(path: str) -> str:
"""从文件路径获取语言类型"""
ext = path.split('.')[-1].lower() if '.' in path else ''
language_map = {
'py': 'python',
'js': 'javascript', 'jsx': 'javascript',
'ts': 'typescript', 'tsx': 'typescript',
'py': 'python', 'java': 'java', 'go': 'go',
'rs': 'rust', 'cpp': 'cpp', 'c': 'cpp',
'cc': 'cpp', 'h': 'cpp', 'hh': 'cpp',
'java': 'java', 'go': 'go', 'rs': 'rust',
'cpp': 'cpp', 'c': 'c', 'cc': 'cpp', 'h': 'c', 'hh': 'cpp',
'hpp': 'cpp', 'hxx': 'cpp',
'cs': 'csharp', 'php': 'php', 'rb': 'ruby',
'kt': 'kotlin', 'swift': 'swift'
'kt': 'kotlin', 'ktm': 'kotlin', 'kts': 'kotlin',
'swift': 'swift', 'dart': 'dart',
'scala': 'scala', 'sc': 'scala',
'groovy': 'groovy', 'gsh': 'groovy', 'gvy': 'groovy', 'gy': 'groovy',
'sql': 'sql', 'sh': 'bash', 'bash': 'bash', 'zsh': 'bash',
'pl': 'perl', 'pm': 'perl', 't': 'perl',
'lua': 'lua', 'hs': 'haskell', 'lhs': 'haskell',
'clj': 'clojure', 'cljs': 'clojure', 'cljc': 'clojure', 'edn': 'clojure',
'ex': 'elixir', 'exs': 'elixir', 'erl': 'erlang', 'hrl': 'erlang',
'm': 'objective-c', 'mm': 'objective-c',
'r': 'r', 'rmd': 'r',
'vb': 'visual-basic', 'fs': 'fsharp', 'fsi': 'fsharp', 'fsx': 'fsharp',
'tf': 'hcl', 'hcl': 'hcl', 'dockerfile': 'dockerfile'
}
return language_map.get(ext, 'text')