# encoding:utf-8
import json
import os
import html
from urllib.parse import urlparse
import re

import requests
import plugins
from bridge.context import ContextType
from bridge.reply import Reply, ReplyType
from common.log import logger
from plugins import *
from lib.dify.dify_client import ChatClient


@plugins.register(
    name="VLink2Text",
    desire_priority=10,
    hidden=False,
    enabled=True,
    desc="Sum url link content with jina reader and llm",
    version="0.0.1",
    author="wangzixiang",
)
class VLink2Text(Plugin):
    """Review the content behind a URL message.

    When an incoming TEXT or SHARING context is a URL that passes the
    white/black list check, the page is fetched through the configured
    jina reader endpoint, images and links are stripped from the returned
    text, and the result is sent to the Dify chat client. The "answer"
    field of the Dify response becomes the reply.
    """

    # Defaults; each can be overridden by the same-named key in the config.
    jina_reader_base = "https://r.jina.ai"
    # Upper bound on the number of *characters* forwarded to the LLM
    # (the fetched text is sliced, not tokenized into words).
    max_words = 8000
    # URL prefixes that are allowed; an empty list allows every URL.
    white_url_list = []
    # URL prefixes that are always skipped.
    black_url_list = [
        "https://support.weixin.qq.com",  # WeChat Channels video
        "https://channels-aladin.wxqcloud.qq.com",  # WeChat Channels music
    ]

    def __init__(self):
        """Load the plugin config and register the context handler.

        Raises:
            RuntimeError: if the configuration cannot be loaded or any other
                step of the initialisation fails. The original error is
                logged and chained as the cause.
        """
        super().__init__()
        try:
            self.config = super().load_config()
            if not self.config:
                self.config = self._load_config_template()
            if self.config is None:
                # Fail with a clear message instead of an AttributeError on
                # the first `self.config.get(...)` below.
                raise ValueError("neither config.json nor config.json.template could be loaded")
            self.first_reply = "🎉正在为您审核,请稍候..."
            self.dify = ChatClient(
                base_url=self.config.get("dify_base_url", ""),
                api_key=self.config.get("dify_api_key", ""),
            )
            self.jina_reader_base = self.config.get("jina_reader_base", self.jina_reader_base)
            self.max_words = self.config.get("max_words", self.max_words)
            self.first_reply = self.config.get("first_reply", self.first_reply)
            self.white_url_list = self.config.get("white_url_list", self.white_url_list)
            self.black_url_list = self.config.get("black_url_list", self.black_url_list)
            # Mask credentials so the API key does not end up in the log file.
            loggable_config = {
                key: ("***" if "key" in str(key).lower() else value)
                for key, value in self.config.items()
            }
            logger.info(f"[VLink2Text] inited, config={loggable_config}")
            self.handlers[Event.ON_HANDLE_CONTEXT] = self.on_handle_context
        except Exception as e:
            logger.error(f"[VLink2Text] 初始化异常:{e}")
            # Raising a plain string is itself a TypeError in Python 3;
            # raise a real exception and keep the original cause attached.
            raise RuntimeError(f"[VLink2Text] init failed, ignore: {e}") from e

    def on_handle_context(self, e_context: EventContext, retry_count: int = 0):
        """Handle a context event: fetch the linked page and have it reviewed.

        Contexts that are neither TEXT nor SHARING, or whose content is not
        an accepted URL, are left untouched. On any exception the whole
        pipeline is retried (up to 3 retries); after the last failure an
        ERROR reply is set on the event context.

        Args:
            e_context: event context; "context" and "channel" are read from
                it, "reply" and ``action`` are written on completion.
            retry_count: number of retries already performed; callers
                normally leave this at its default of 0.
        """
        try:
            context = e_context["context"]
            content = context.content
            if context.type != ContextType.SHARING and context.type != ContextType.TEXT:
                return
            if not self._check_url(content):
                logger.debug(f"[VLink2Text] {content} is not a valid url, skip")
                return
            if retry_count == 0:
                # Only acknowledge the request once, not on every retry.
                logger.debug("[VLink2Text] on_handle_context. content: %s" % content)
                reply = Reply(ReplyType.TEXT, self.first_reply)
                channel = e_context["channel"]
                channel.send(reply, context)

            # Works around link validation problems with official-account card
            # links, see
            # https://github.com/fatwang2/sum4all/commit/b983c49473fc55f13ba2c44e4d8b226db3517c45
            # The content is stripped first because _check_url validated the
            # stripped form; stray whitespace would corrupt the reader URL.
            target_url = html.unescape(content.strip())
            jina_url = self._get_jina_url(target_url)
            headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"}
            response = requests.get(jina_url, headers=headers, timeout=60)
            response.raise_for_status()
            # Drop markdown images and bare links with a regex, then keep at
            # most max_words characters.
            target_url_content = re.sub(r'!\[.*?\]\(.*?\)|http\S+|www\.\S+', '', response.text)[:self.max_words]
            response = self.dify.create_chat_message(
                inputs={},
                query="【审核】" + target_url_content,
                conversation_id="",
                user="vlink2text",
            )
            response.raise_for_status()
            result = json.loads(response.content)["answer"]
            logger.info(f"[VLink2Text] response: {result}")
            reply = Reply(ReplyType.TEXT, result)
            e_context["reply"] = reply
            e_context.action = EventAction.BREAK_PASS

        except Exception as e:
            if retry_count < 3:
                logger.warning(f"[VLink2Text] {str(e)}, retry {retry_count + 1}")
                self.on_handle_context(e_context, retry_count + 1)
                return

            logger.exception(f"[VLink2Text] {str(e)}")
            reply = Reply(ReplyType.ERROR, "我暂时无法审核链接,请稍后再试")
            e_context["reply"] = reply
            e_context.action = EventAction.BREAK_PASS

    def get_help_text(self, verbose, **kwargs):
        """Return the one-line help text shown for this plugin."""
        return '使用jina reader和ChatGPT审核网页链接内容'

    def _load_config_template(self):
        """Load ``config.json.template`` from the plugin directory.

        Returns:
            The parsed JSON content, or ``None`` when the template does not
            exist or cannot be read/parsed (the error is logged).
        """
        logger.debug("No Suno plugin config.json, use plugins/jina_sum/config.json.template")
        try:
            plugin_config_path = os.path.join(self.path, "config.json.template")
            if os.path.exists(plugin_config_path):
                with open(plugin_config_path, "r", encoding="utf-8") as f:
                    return json.load(f)
        except Exception as e:
            logger.exception(e)
        return None

    def _get_jina_url(self, target_url):
        """Return the jina reader URL for *target_url* (base + "/" + target)."""
        return self.jina_reader_base + "/" + target_url

    def _check_url(self, target_url: str):
        """Return True when *target_url* is an http(s) URL allowed by the lists.

        A URL is accepted when it starts with ``http://`` or ``https://``,
        matches a whitelist prefix (only if the whitelist is non-empty), and
        matches no blacklist prefix. The blacklist therefore always wins over
        the whitelist.
        """
        stripped_url = target_url.strip()
        # Cheap sanity check that this looks like a URL at all.
        if not stripped_url.startswith(("http://", "https://")):
            return False

        # Whitelist: only enforced when it is non-empty.
        if self.white_url_list:
            if not any(stripped_url.startswith(white_url) for white_url in self.white_url_list):
                return False

        # Blacklist takes priority over the whitelist.
        if any(stripped_url.startswith(black_url) for black_url in self.black_url_list):
            return False

        return True