commit 127854bd104ea8d021473a7915e8554bc59c0e8c
Author: wzx <17839623189@163.com>
Date:   Thu Oct 17 09:56:32 2024 +0800

    init

diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..317634a
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024 Han Fangyuan
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..17e1ba8
--- /dev/null
+++ b/README.md
@@ -0,0 +1,20 @@
+# jina_summary
+ChatGPT on WeChat项目插件, 使用jina reader和ChatGPT解析网页内容
+
+支持解析公众号、小红书、csdn等分享卡片链接(有的卡片链接会触发验证,一般直链没有此问题)
+
+![wechat_mp](./docs/images/wechat_mp.jpg)
+![red](./docs/images/red.jpg)
+![csdn](./docs/images/csdn.jpg)
+
+config.json 配置说明(下方 # 注释仅用于说明;实际 config.json 必须是标准 JSON,不能包含注释)
+```bash
+{
+  "jina_reader_base": "https://r.jina.ai",  # jina reader链接,默认为https://r.jina.ai
+  "dify_base_url": "",  # dify api链接,默认为空
+  "dify_api_key": "app-XXXX",  # dify api key
+  "max_words": 8000,  # 网页链接内容的最大字数,防止超过最大输入token,使用字符串长度简单计数
+  "white_url_list": [],  # url白名单, 列表为空时不做限制,黑名单优先级大于白名单,即当一个url既在白名单又在黑名单时,黑名单生效
+  "black_url_list": ["https://support.weixin.qq.com", "https://channels-aladin.wxqcloud.qq.com"]  # url黑名单,排除不支持总结的视频号等链接
+}
+```
diff --git a/__init__.py b/__init__.py
new file mode 100644
index 0000000..58635bd
--- /dev/null
+++ b/__init__.py
@@ -0,0 +1 @@
+from .vlink_to_text import *
diff --git a/__pycache__/__init__.cpython-310.pyc b/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000..94299fd
Binary files /dev/null and b/__pycache__/__init__.cpython-310.pyc differ
diff --git a/__pycache__/jina_sum.cpython-310.pyc b/__pycache__/jina_sum.cpython-310.pyc
new file mode 100644
index 0000000..26c8ea8
Binary files /dev/null and b/__pycache__/jina_sum.cpython-310.pyc differ
diff --git a/__pycache__/vlink_to_text.cpython-310.pyc b/__pycache__/vlink_to_text.cpython-310.pyc
new file mode 100644
index 0000000..a6b8538
Binary files /dev/null and b/__pycache__/vlink_to_text.cpython-310.pyc differ
diff --git a/config.json b/config.json
new file mode 100644
index 0000000..86bb0b9
--- /dev/null
+++ b/config.json
@@ -0,0 +1,8 @@
+{
+  "jina_reader_base": "https://r.jina.ai",
+  "dify_base_url": "http://sl.vrgon.com:6062/v1",
+  "dify_api_key": "app-MHDQgABuewVvmTChGVXUcl3S",
+  "max_words": 8000,
+  "white_url_list": [],
+  "black_url_list": ["https://support.weixin.qq.com", "https://channels-aladin.wxqcloud.qq.com"]
+}
diff --git a/config.json.template b/config.json.template
new file mode 100644
index 0000000..e603504
--- /dev/null
+++ b/config.json.template
@@ -0,0 +1,8 @@
+{
+  "jina_reader_base": "https://r.jina.ai",
+  "dify_base_url": "",
+  "dify_api_key": "app-XXXX",
+  "max_words": 8000,
+  "white_url_list": [],
+  "black_url_list": ["https://support.weixin.qq.com", "https://channels-aladin.wxqcloud.qq.com"]
+}
diff --git a/docs/images/csdn.jpg b/docs/images/csdn.jpg
new file mode 100644
index 0000000..ab98e3b
Binary files /dev/null and b/docs/images/csdn.jpg differ
diff --git a/docs/images/red.jpg b/docs/images/red.jpg
new file mode 100644
index 0000000..e8fb740
Binary files /dev/null and b/docs/images/red.jpg differ
diff --git a/docs/images/wechat_mp.jpg b/docs/images/wechat_mp.jpg
new file mode 100644
index 0000000..42a1359
Binary files /dev/null and b/docs/images/wechat_mp.jpg differ
diff --git a/vlink_to_text.py b/vlink_to_text.py
new file mode 100644
index 0000000..2b2f6f4
--- /dev/null
+++ b/vlink_to_text.py
@@ -0,0 +1,183 @@
+# encoding:utf-8
+import json
+import os
+import html
+from urllib.parse import urlparse
+import re
+import requests
+
+import plugins
+from bridge.context import ContextType
+from bridge.reply import Reply, ReplyType
+from common.log import logger
+from plugins import *
+
+from lib.dify.dify_client import ChatClient
+
+
+@plugins.register(
+    name="VLink2Text",
+    desire_priority=10,
+    hidden=False,
+    enabled=True,
+    desc="Sum url link content with jina reader and llm",
+    version="0.0.1",
+    author="wangzixiang",
+)
+class VLink2Text(Plugin):
+    """Review the content behind a shared link with jina reader and a Dify chat app.
+
+    For SHARING/TEXT messages whose content is an allowed URL, the page text is
+    fetched through jina reader, stripped of images and links, truncated to
+    ``max_words`` characters and sent to Dify; Dify's answer becomes the reply.
+    """
+
+    # Defaults; each is overridden by the same-named key of the plugin config.
+    jina_reader_base = "https://r.jina.ai"
+    max_words = 8000
+    white_url_list = []
+    black_url_list = [
+        "https://support.weixin.qq.com",  # WeChat Channels videos
+        "https://channels-aladin.wxqcloud.qq.com",  # WeChat Channels music
+    ]
+    # Number of retries after the first failed attempt.
+    max_retries = 3
+    # Markdown images, bare http(s) links and www. links removed from the page text.
+    _noise_pattern = re.compile(r"!\[.*?\]\(.*?\)|http\S+|www\.\S+")
+
+    def __init__(self):
+        """Load the plugin config, build the Dify client and register the handler.
+
+        Raises:
+            RuntimeError: if no config is available or any setup step fails.
+        """
+        super().__init__()
+        try:
+            # Fall back to the bundled template when the base-class loader returns nothing.
+            self.config = super().load_config() or self._load_config_template()
+            if not self.config:
+                raise ValueError("neither config.json nor config.json.template could be loaded")
+            self.dify = ChatClient(
+                base_url=self.config.get("dify_base_url", ""),
+                api_key=self.config.get("dify_api_key", ""),
+            )
+            self.jina_reader_base = self.config.get("jina_reader_base", self.jina_reader_base)
+            self.max_words = self.config.get("max_words", self.max_words)
+            self.first_reply = self.config.get("first_reply", "🎉正在为您审核,请稍候...")
+            self.white_url_list = self.config.get("white_url_list", self.white_url_list)
+            self.black_url_list = self.config.get("black_url_list", self.black_url_list)
+            # Never write the API key to the log.
+            logger.info(f"[VLink2Text] inited, config={self._masked_config()}")
+            self.handlers[Event.ON_HANDLE_CONTEXT] = self.on_handle_context
+        except Exception as e:
+            logger.error(f"[VLink2Text] 初始化异常:{e}")
+            # Raise a real exception: raising a plain string is a TypeError in Python 3.
+            raise RuntimeError("[VLink2Text] init failed, ignore ") from e
+
+    def _masked_config(self):
+        """Return a copy of the config with the Dify API key redacted, for logging."""
+        masked = dict(self.config)
+        if masked.get("dify_api_key"):
+            masked["dify_api_key"] = "***"
+        return masked
+
+    def on_handle_context(self, e_context: EventContext, retry_count: int = 0):
+        """Handle a message: review the linked page and set the reply.
+
+        Args:
+            e_context: event context; ``e_context["context"]`` is the incoming
+                message and ``e_context["channel"]`` is used to send the
+                "please wait" notice.
+            retry_count: number of failed attempts so far; callers leave it at 0.
+        """
+        try:
+            context = e_context["context"]
+            if context.type not in (ContextType.SHARING, ContextType.TEXT):
+                return
+            content = context.content
+            if not self._check_url(content):
+                logger.debug(f"[VLink2Text] {content} is not a valid url, skip")
+                return
+            if retry_count == 0:
+                # Send the "please wait" notice only once, not on every retry.
+                logger.debug("[VLink2Text] on_handle_context. content: %s" % content)
+                e_context["channel"].send(Reply(ReplyType.TEXT, self.first_reply), context)
+
+            # Unescape HTML entities; works around the official-account card link verification problem.
+            # See https://github.com/fatwang2/sum4all/commit/b983c49473fc55f13ba2c44e4d8b226db3517c45
+            target_url = html.unescape(content).strip()
+            target_url_content = self._fetch_page_text(target_url)
+
+            response = self.dify.create_chat_message(
+                inputs={},
+                query="【审核】" + target_url_content,
+                conversation_id="",
+                user="vlink2text",
+            )
+            response.raise_for_status()
+            result = json.loads(response.content)["answer"]
+            logger.info(f"[VLink2Text] response: {result}")
+
+            e_context["reply"] = Reply(ReplyType.TEXT, result)
+            e_context.action = EventAction.BREAK_PASS
+        except Exception as e:
+            if retry_count < self.max_retries:
+                logger.warning(f"[VLink2Text] {str(e)}, retry {retry_count + 1}")
+                self.on_handle_context(e_context, retry_count + 1)
+                return
+
+            logger.exception(f"[VLink2Text] {str(e)}")
+            e_context["reply"] = Reply(ReplyType.ERROR, "我暂时无法审核链接,请稍后再试")
+            e_context.action = EventAction.BREAK_PASS
+
+    def _fetch_page_text(self, target_url):
+        """Fetch ``target_url`` through jina reader and return its cleaned, truncated text.
+
+        Raises:
+            requests.RequestException: on network errors or a non-2xx response.
+        """
+        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"}
+        response = requests.get(self._get_jina_url(target_url), headers=headers, timeout=60)
+        response.raise_for_status()
+        # Drop images and links, then cap the length so the prompt stays within the input limit.
+        return self._noise_pattern.sub("", response.text)[: self.max_words]
+
+    def get_help_text(self, verbose, **kwargs):
+        """Return the one-line help text shown for this plugin."""
+        return "使用jina reader和ChatGPT审核网页链接内容"
+
+    def _load_config_template(self):
+        """Load ``config.json.template`` from the plugin directory (``self.path``).
+
+        Returns:
+            The parsed template, or ``None`` if it is missing or cannot be parsed.
+        """
+        logger.debug("[VLink2Text] no config.json, use config.json.template")
+        try:
+            plugin_config_path = os.path.join(self.path, "config.json.template")
+            if os.path.exists(plugin_config_path):
+                with open(plugin_config_path, "r", encoding="utf-8") as f:
+                    return json.load(f)
+        except Exception as e:
+            logger.exception(e)
+        return None
+
+    def _get_jina_url(self, target_url):
+        """Return the jina reader URL for ``target_url`` (base + "/" + target)."""
+        # Tolerate a configured base that already ends with "/".
+        return self.jina_reader_base.rstrip("/") + "/" + target_url
+
+    def _check_url(self, target_url: str):
+        """Return True if ``target_url`` is an http(s) URL allowed by the white/black lists.
+
+        An empty whitelist allows everything; the blacklist always wins.
+        """
+        if not isinstance(target_url, str):
+            return False
+        stripped_url = target_url.strip()
+        # Cheap sanity check that this looks like a URL at all.
+        if not stripped_url.startswith(("http://", "https://")):
+            return False
+        if self.white_url_list and not stripped_url.startswith(tuple(self.white_url_list)):
+            return False
+        return not stripped_url.startswith(tuple(self.black_url_list))