# vlinktotext/vlink_to_text.py
# encoding:utf-8
import json
import os
import html
from urllib.parse import urlparse
import re
import requests
import plugins
from bridge.context import ContextType
from bridge.reply import Reply, ReplyType
from common.log import logger
from plugins import *
from lib.dify.dify_client import ChatClient
@plugins.register(
    name="VLink2Text",
    desire_priority=10,
    hidden=False,
    enabled=True,
    desc="Sum url link content with jina reader and llm",
    version="0.0.1",
    author="wangzixiang",
)
class VLink2Text(Plugin):
    """Fetch the content behind a shared URL through the jina reader proxy,
    strip images/links, and send the text to a Dify LLM app for review,
    replying to the user with the model's answer.
    """

    # Defaults below are class-level fallbacks; each is overridable via config.json.
    jina_reader_base = "https://r.jina.ai"
    max_words = 8000
    white_url_list = []
    black_url_list = [
        "https://support.weixin.qq.com",  # WeChat Channels video
        "https://channels-aladin.wxqcloud.qq.com",  # WeChat Channels music
    ]

    def __init__(self):
        """Load config (falling back to the bundled template), build the Dify
        client, and register the context handler.

        Raises:
            RuntimeError: if any part of initialization fails.
        """
        super().__init__()
        try:
            self.config = super().load_config()
            if not self.config:
                self.config = self._load_config_template()
            self.first_reply = "🎉正在为您审核,请稍候..."
            self.dify = ChatClient(
                base_url=self.config.get("dify_base_url", ""),
                api_key=self.config.get("dify_api_key", ""),
            )
            self.jina_reader_base = self.config.get("jina_reader_base", self.jina_reader_base)
            self.max_words = self.config.get("max_words", self.max_words)
            self.first_reply = self.config.get("first_reply", self.first_reply)
            self.white_url_list = self.config.get("white_url_list", self.white_url_list)
            self.black_url_list = self.config.get("black_url_list", self.black_url_list)
            logger.info(f"[VLink2Text] inited, config={self.config}")
            self.handlers[Event.ON_HANDLE_CONTEXT] = self.on_handle_context
        except Exception as e:
            logger.error(f"[VLink2Text] 初始化异常:{e}")
            # BUG FIX: the original raised a plain string, which is itself a
            # TypeError in Python 3. Raise a real exception and chain the cause.
            raise RuntimeError("[VLink2Text] init failed") from e

    def on_handle_context(self, e_context: EventContext, retry_count: int = 0):
        """Handle a SHARING/TEXT context containing a URL: fetch it via the
        jina reader, ask the Dify app to review the text, and reply.

        Args:
            e_context: the event context carrying the incoming message.
            retry_count: internal recursion counter; the whole pipeline is
                retried up to 3 times on any exception.
        """
        try:
            context = e_context["context"]
            content = context.content
            if context.type != ContextType.SHARING and context.type != ContextType.TEXT:
                return
            if not self._check_url(content):
                logger.debug(f"[VLink2Text] {content} is not a valid url, skip")
                return
            if retry_count == 0:
                # Only announce once; retries stay silent to avoid duplicates.
                logger.debug("[VLink2Text] on_handle_context. content: %s" % content)
                reply = Reply(ReplyType.TEXT, self.first_reply)
                channel = e_context["channel"]
                channel.send(reply, context)

            # Unescape HTML entities so official-account card links validate,
            # see https://github.com/fatwang2/sum4all/commit/b983c49473fc55f13ba2c44e4d8b226db3517c45
            target_url = html.unescape(content)
            jina_url = self._get_jina_url(target_url)
            headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"}
            response = requests.get(jina_url, headers=headers, timeout=60)
            response.raise_for_status()

            # Strip markdown images and raw links, then cap at max_words chars.
            # BUG FIX: 'www.' had an unescaped dot that matched any character.
            target_url_content = re.sub(r'!\[.*?\]\(.*?\)|http\S+|www\.\S+', '', response.text)[:self.max_words]

            response = self.dify.create_chat_message(inputs={}, query="【审核】" + target_url_content, conversation_id="", user="vlink2text")
            response.raise_for_status()
            result = json.loads(response.content)["answer"]
            logger.info(f"[VLink2Text] response: {result}")

            reply = Reply(ReplyType.TEXT, result)
            e_context["reply"] = reply
            e_context.action = EventAction.BREAK_PASS
        except Exception as e:
            if retry_count < 3:
                logger.warning(f"[VLink2Text] {str(e)}, retry {retry_count + 1}")
                self.on_handle_context(e_context, retry_count + 1)
                return
            logger.exception(f"[VLink2Text] {str(e)}")
            reply = Reply(ReplyType.ERROR, "我暂时无法审核链接,请稍后再试")
            e_context["reply"] = reply
            e_context.action = EventAction.BREAK_PASS

    def get_help_text(self, verbose, **kwargs):
        """Return the one-line help string shown to users."""
        return f'使用jina reader和ChatGPT审核网页链接内容'

    def _load_config_template(self):
        """Load plugins/<this plugin>/config.json.template as the config.

        Returns:
            dict: the parsed template, or {} when the template is missing or
            unreadable (BUG FIX: previously returned None, which crashed the
            subsequent ``self.config.get(...)`` calls in ``__init__``).
        """
        # BUG FIX: the original message referenced the wrong plugin (Suno/jina_sum).
        logger.debug("[VLink2Text] No config.json, use config.json.template")
        try:
            plugin_config_path = os.path.join(self.path, "config.json.template")
            if os.path.exists(plugin_config_path):
                with open(plugin_config_path, "r", encoding="utf-8") as f:
                    return json.load(f)
        except Exception as e:
            logger.exception(e)
        return {}

    def _get_jina_url(self, target_url):
        """Prefix target_url with the jina reader base to get the fetch URL."""
        return self.jina_reader_base + "/" + target_url

    def _check_url(self, target_url: str):
        """Return True if target_url is an http(s) URL allowed by the
        white/black lists (blacklist wins over whitelist)."""
        stripped_url = target_url.strip()
        # Cheap scheme check; anything else is not treated as a URL.
        if not stripped_url.startswith(("http://", "https://")):
            return False
        # When a whitelist is configured, the URL must match one of its prefixes.
        if len(self.white_url_list):
            if not any(stripped_url.startswith(white_url) for white_url in self.white_url_list):
                return False
        # Blacklist has priority over the whitelist.
        for black_url in self.black_url_list:
            if stripped_url.startswith(black_url):
                return False
        return True