# vlinktotext/vlink_to_text.py
# encoding:utf-8
import json
import os
import html
from urllib.parse import urlparse
import re
import requests
import plugins
from bridge.context import ContextType
from bridge.reply import Reply, ReplyType
from common.log import logger
from plugins import *
from lib.dify.dify_client import ChatClient
@plugins.register(
    name="VLink2Text",
    desire_priority=10,
    hidden=False,
    enabled=True,
    desc="Sum url link content with jina reader and llm",
    version="0.0.1",
    author="wangzixiang",
)
class VLink2Text(Plugin):
    """Fetch the content behind a shared URL through the jina reader proxy,
    strip images/links, and send the text to a Dify LLM app for review,
    replying to the user with the model's answer.
    """

    # Defaults below are class-level fallbacks; each is overridable via config.json.
    jina_reader_base = "https://r.jina.ai"
    max_words = 8000
    white_url_list = []
    black_url_list = [
        "https://support.weixin.qq.com",  # WeChat Channels video
        "https://channels-aladin.wxqcloud.qq.com",  # WeChat Channels music
    ]

    def __init__(self):
        """Load config (falling back to the bundled template), build the Dify
        client, and register the context handler.

        Raises:
            RuntimeError: if any part of initialization fails.
        """
        super().__init__()
        try:
            self.config = super().load_config()
            if not self.config:
                self.config = self._load_config_template()
            self.first_reply = "🎉正在为您审核,请稍候..."
            self.dify = ChatClient(
                base_url=self.config.get("dify_base_url", ""),
                api_key=self.config.get("dify_api_key", ""),
            )
            self.jina_reader_base = self.config.get("jina_reader_base", self.jina_reader_base)
            self.max_words = self.config.get("max_words", self.max_words)
            self.first_reply = self.config.get("first_reply", self.first_reply)
            self.white_url_list = self.config.get("white_url_list", self.white_url_list)
            self.black_url_list = self.config.get("black_url_list", self.black_url_list)
            logger.info(f"[VLink2Text] inited, config={self.config}")
            self.handlers[Event.ON_HANDLE_CONTEXT] = self.on_handle_context
        except Exception as e:
            logger.error(f"[VLink2Text] 初始化异常:{e}")
            # BUG FIX: the original raised a plain string, which is itself a
            # TypeError in Python 3. Raise a real exception and chain the cause.
            raise RuntimeError("[VLink2Text] init failed") from e

    def on_handle_context(self, e_context: EventContext, retry_count: int = 0):
        """Handle a SHARING/TEXT context containing a URL: fetch it via the
        jina reader, ask the Dify app to review the text, and reply.

        Args:
            e_context: the event context carrying the incoming message.
            retry_count: internal recursion counter; the whole pipeline is
                retried up to 3 times on any exception.
        """
        try:
            context = e_context["context"]
            content = context.content
            if context.type != ContextType.SHARING and context.type != ContextType.TEXT:
                return
            if not self._check_url(content):
                logger.debug(f"[VLink2Text] {content} is not a valid url, skip")
                return
            if retry_count == 0:
                # Only announce once; retries stay silent to avoid duplicates.
                logger.debug("[VLink2Text] on_handle_context. content: %s" % content)
                reply = Reply(ReplyType.TEXT, self.first_reply)
                channel = e_context["channel"]
                channel.send(reply, context)

            # Unescape HTML entities so official-account card links validate,
            # see https://github.com/fatwang2/sum4all/commit/b983c49473fc55f13ba2c44e4d8b226db3517c45
            target_url = html.unescape(content)
            jina_url = self._get_jina_url(target_url)
            headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"}
            response = requests.get(jina_url, headers=headers, timeout=60)
            response.raise_for_status()

            # Strip markdown images and raw links, then cap at max_words chars.
            # BUG FIX: 'www.' had an unescaped dot that matched any character.
            target_url_content = re.sub(r'!\[.*?\]\(.*?\)|http\S+|www\.\S+', '', response.text)[:self.max_words]

            response = self.dify.create_chat_message(inputs={}, query="【审核】" + target_url_content, conversation_id="", user="vlink2text")
            response.raise_for_status()
            result = json.loads(response.content)["answer"]
            logger.info(f"[VLink2Text] response: {result}")

            reply = Reply(ReplyType.TEXT, result)
            e_context["reply"] = reply
            e_context.action = EventAction.BREAK_PASS
        except Exception as e:
            if retry_count < 3:
                logger.warning(f"[VLink2Text] {str(e)}, retry {retry_count + 1}")
                self.on_handle_context(e_context, retry_count + 1)
                return
            logger.exception(f"[VLink2Text] {str(e)}")
            reply = Reply(ReplyType.ERROR, "我暂时无法审核链接,请稍后再试")
            e_context["reply"] = reply
            e_context.action = EventAction.BREAK_PASS

    def get_help_text(self, verbose, **kwargs):
        """Return the one-line help string shown to users."""
        return f'使用jina reader和ChatGPT审核网页链接内容'

    def _load_config_template(self):
        """Load plugins/<this plugin>/config.json.template as the config.

        Returns:
            dict: the parsed template, or {} when the template is missing or
            unreadable (BUG FIX: previously returned None, which crashed the
            subsequent ``self.config.get(...)`` calls in ``__init__``).
        """
        # BUG FIX: the original message referenced the wrong plugin (Suno/jina_sum).
        logger.debug("[VLink2Text] No config.json, use config.json.template")
        try:
            plugin_config_path = os.path.join(self.path, "config.json.template")
            if os.path.exists(plugin_config_path):
                with open(plugin_config_path, "r", encoding="utf-8") as f:
                    return json.load(f)
        except Exception as e:
            logger.exception(e)
        return {}

    def _get_jina_url(self, target_url):
        """Prefix target_url with the jina reader base to get the fetch URL."""
        return self.jina_reader_base + "/" + target_url

    def _check_url(self, target_url: str):
        """Return True if target_url is an http(s) URL allowed by the
        white/black lists (blacklist wins over whitelist)."""
        stripped_url = target_url.strip()
        # Cheap scheme check; anything else is not treated as a URL.
        if not stripped_url.startswith(("http://", "https://")):
            return False
        # When a whitelist is configured, the URL must match one of its prefixes.
        if len(self.white_url_list):
            if not any(stripped_url.startswith(white_url) for white_url in self.white_url_list):
                return False
        # Blacklist has priority over the whitelist.
        for black_url in self.black_url_list:
            if stripped_url.startswith(black_url):
                return False
        return True