CodeReview/backend/app/utils/repo_utils.py

78 lines
2.6 KiB
Python

from urllib.parse import urlparse, urlunparse
from typing import Dict, Optional
def parse_repository_url(repo_url: str, repo_type: str) -> Dict[str, str]:
    """
    Parse a repository URL into the components needed to address its API.

    Surrounding whitespace, leading/trailing slashes, a trailing ``.git``
    suffix and empty path segments (``owner//repo``) are ignored. Query
    string and fragment are not used.

    Args:
        repo_url: The repository URL, e.g. ``https://github.com/owner/repo.git``.
        repo_type: The type of repository ('github', 'gitlab', 'gitea').

    Returns:
        A dictionary containing parsed components:
        - base_url: The API base URL. ``https://api.github.com`` for
          github.com (and its subdomains such as ``www.github.com``);
          otherwise ``<server_url>/api/v3`` (github, assumed GitHub
          Enterprise layout), ``<server_url>/api/v4`` (gitlab) or
          ``<server_url>/api/v1`` (gitea).
        - owner: The owner/namespace of the repository. For gitlab this is
          every path segment except the last, joined with ``/`` (subgroups).
        - repo: The repository name.
        - project_path: The normalized full path (``owner/repo`` or
          ``group/subgroup/repo``); useful as the GitLab project identifier.
        - server_url: The base URL of the server (scheme + netloc).

    Raises:
        ValueError: If the URL is empty, does not use http/https, has no
            host, does not contain at least an owner and a repository
            segment, or if ``repo_type`` is not supported.
    """
    # Strip first so that a whitespace-only value is reported as "empty"
    # rather than as a protocol error.
    repo_url = repo_url.strip() if repo_url else ""
    if not repo_url:
        raise ValueError(f"{repo_type} 仓库 URL 不能为空")

    # Check scheme to prevent SSRF (only allow http and https).
    parsed = urlparse(repo_url)
    if parsed.scheme not in ('http', 'https'):
        raise ValueError(f"{repo_type} 仓库 URL 必须使用 http 或 https 协议")

    # A URL such as "https:///owner/repo" has a valid scheme but no host;
    # reject it instead of returning a server_url of "https://".
    host = parsed.hostname  # lower-cased, without userinfo or port
    if not host:
        raise ValueError(f"{repo_type} 仓库 URL 格式错误")

    # Remove .git suffix if present, then drop empty segments so inputs like
    # "owner//repo" or "owner/.git" cannot produce an empty owner/repo.
    path = parsed.path.strip('/')
    if path.endswith('.git'):
        path = path[:-len('.git')]
    path_parts = [segment for segment in path.split('/') if segment]
    if len(path_parts) < 2:
        raise ValueError(f"{repo_type} 仓库 URL 格式错误")
    path = "/".join(path_parts)

    base = f"{parsed.scheme}://{parsed.netloc}"

    if repo_type == "github":
        owner, repo = path_parts[-2], path_parts[-1]
        # Compare the host exactly (or as a subdomain). A substring test on
        # the netloc would also match hosts like "github.company.com" and
        # would miss "GitHub.com".
        if host == 'github.com' or host.endswith('.github.com'):
            api_base = "https://api.github.com"
        else:
            api_base = f"{base}/api/v3"  # Assumption for GitHub Enterprise
    elif repo_type == "gitlab":
        # GitLab supports subgroups, so the path may be group/subgroup/repo;
        # everything before the last segment is treated as the namespace.
        repo = path_parts[-1]
        owner = "/".join(path_parts[:-1])
        api_base = f"{base}/api/v4"
    elif repo_type == "gitea":
        # Gitea: /owner/repo
        owner, repo = path_parts[0], path_parts[1]
        api_base = f"{base}/api/v1"
    else:
        raise ValueError(f"不支持的仓库类型: {repo_type}")

    return {
        "base_url": api_base,
        "owner": owner,
        "repo": repo,
        "project_path": path,  # Useful for GitLab
        "server_url": base,
    }