This commit is contained in:
wzx 2024-10-17 09:56:32 +08:00
commit 127854bd10
12 changed files with 193 additions and 0 deletions

21
LICENSE Normal file
View File

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2024 Han Fangyuan
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

20
README.md Normal file
View File

@ -0,0 +1,20 @@
# jina_summary
ChatGPT on WeChat项目插件, 使用jina reader和ChatGPT解析网页内容
支持解析公众号、小红书、csdn等分享卡片链接(有的卡片链接会触发验证,一般直链没有此问题)
![wechat_mp](./docs/images/wechat_mp.jpg)
![red](./docs/images/red.jpg)
![csdn](./docs/images/csdn.jpg)
config.json 配置说明
```bash
{
"jina_reader_base": "https://r.jina.ai", # jina reader链接,默认为https://r.jina.ai
"dify_base_url": "", # dify api链接,默认为空
"dify_api_key": "app-XXXX", # dify api key
"max_words": 8000, # 网页链接内容的最大字数,防止超过最大输入token,使用字符串长度简单计数
"white_url_list": [], # url白名单,列表为空时不做限制;黑名单优先级大于白名单,即当一个url既在白名单又在黑名单时,黑名单生效
"black_url_list": ["https://support.weixin.qq.com", "https://channels-aladin.wxqcloud.qq.com"] # url黑名单,排除不支持总结的视频号等链接
}
```

1
__init__.py Normal file
View File

@ -0,0 +1 @@
from .vlink_to_text import *

Binary file not shown.

Binary file not shown.

Binary file not shown.

8
config.json Normal file
View File

@ -0,0 +1,8 @@
{
"jina_reader_base": "https://r.jina.ai",
"dify_base_url": "http://sl.vrgon.com:6062/v1",
"dify_api_key": "app-MHDQgABuewVvmTChGVXUcl3S",
"max_words": 8000,
"white_url_list": [],
"black_url_list": ["https://support.weixin.qq.com", "https://channels-aladin.wxqcloud.qq.com"]
}

8
config.json.template Normal file
View File

@ -0,0 +1,8 @@
{
"jina_reader_base": "https://r.jina.ai",
"dify_base_url": "",
"dify_api_key": "app-XXXX",
"max_words": 8000,
"white_url_list": [],
"black_url_list": ["https://support.weixin.qq.com", "https://channels-aladin.wxqcloud.qq.com"]
}

BIN
docs/images/csdn.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 107 KiB

BIN
docs/images/red.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 88 KiB

BIN
docs/images/wechat_mp.jpg Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 115 KiB

135
vlink_to_text.py Normal file
View File

@ -0,0 +1,135 @@
# encoding:utf-8
import json
import os
import html
from urllib.parse import urlparse
import re
import requests
import plugins
from bridge.context import ContextType
from bridge.reply import Reply, ReplyType
from common.log import logger
from plugins import *
from lib.dify.dify_client import ChatClient
@plugins.register(
    name="VLink2Text",
    desire_priority=10,
    hidden=False,
    enabled=True,
    desc="Sum url link content with jina reader and llm",
    version="0.0.1",
    author="wangzixiang",
)
class VLink2Text(Plugin):
    """Plugin that reviews the content behind a shared or pasted URL.

    For a TEXT or SHARING message whose content is an allowed http(s) URL,
    the page is fetched through a jina reader endpoint, stripped of images
    and links, truncated to ``max_words`` characters and sent to a Dify chat
    application; the ``answer`` field of the Dify response becomes the reply.

    The class attributes below are the defaults used when the loaded
    configuration does not provide the corresponding key.
    """

    jina_reader_base = "https://r.jina.ai"
    max_words = 8000
    # Acknowledgement sent to the user before the (slow) fetch/review starts.
    first_reply = "🎉正在为您审核,请稍候..."
    white_url_list = []
    black_url_list = [
        "https://support.weixin.qq.com",  # WeChat Channels videos
        "https://channels-aladin.wxqcloud.qq.com",  # WeChat Channels music
    ]

    # Number of additional attempts made after the first failure.
    _MAX_RETRIES = 3
    # Markdown images, bare http(s) links and www.* links are removed from the
    # fetched page before it is sent to the LLM.
    _NOISE_PATTERN = re.compile(r"!\[.*?\]\(.*?\)|http\S+|www\.\S+")

    def __init__(self):
        """Load the configuration, build the Dify client and register the handler.

        Raises:
            RuntimeError: if any step of the initialisation fails; the
                original exception is attached as ``__cause__``.
        """
        super().__init__()
        try:
            self.config = super().load_config()
            if not self.config:
                self.config = self._load_config_template()
            self.dify = ChatClient(
                base_url=self.config.get("dify_base_url", ""),
                api_key=self.config.get("dify_api_key", ""),
            )
            self.jina_reader_base = self.config.get("jina_reader_base", self.jina_reader_base)
            self.max_words = self.config.get("max_words", self.max_words)
            self.first_reply = self.config.get("first_reply", self.first_reply)
            self.white_url_list = self.config.get("white_url_list", self.white_url_list)
            self.black_url_list = self.config.get("black_url_list", self.black_url_list)
            if not self.config.get("dify_api_key"):
                logger.warning("[VLink2Text] dify_api_key is not configured")
            # Never write the API key to the log; log a redacted copy instead.
            loggable_config = dict(self.config)
            if loggable_config.get("dify_api_key"):
                loggable_config["dify_api_key"] = "***"
            logger.info(f"[VLink2Text] inited, config={loggable_config}")
            self.handlers[Event.ON_HANDLE_CONTEXT] = self.on_handle_context
        except Exception as e:
            logger.error(f"[VLink2Text] 初始化异常:{e}")
            # Raising a plain string is itself a TypeError in Python 3, so a
            # real exception is raised and chained to the original cause.
            raise RuntimeError("[VLink2Text] init failed, ignore ") from e

    def on_handle_context(self, e_context: EventContext, retry_count: int = 0):
        """Handle a message: review the linked page and set the reply.

        Messages that are neither TEXT nor SHARING, or whose content is not
        an allowed URL, are left untouched. On failure the whole handling is
        retried up to ``_MAX_RETRIES`` times; after that an error reply is set.

        Args:
            e_context: event context; ``e_context["context"]`` and
                ``e_context["channel"]`` are read, ``e_context["reply"]`` and
                ``e_context.action`` are written.
            retry_count: number of attempts already made. Callers normally
                leave this at 0; it is incremented on internal retries.
        """
        try:
            context = e_context["context"]
            content = context.content
            if context.type not in (ContextType.SHARING, ContextType.TEXT):
                return
            if not self._check_url(content):
                logger.debug(f"[VLink2Text] {content} is not a valid url, skip")
                return
            if retry_count == 0:
                # Only acknowledge on the first attempt so retries stay silent.
                logger.debug("[VLink2Text] on_handle_context. content: %s" % content)
                reply = Reply(ReplyType.TEXT, self.first_reply)
                channel = e_context["channel"]
                channel.send(reply, context)
            # Unescaping HTML entities works around link validation on
            # official-account share cards, see
            # https://github.com/fatwang2/sum4all/commit/b983c49473fc55f13ba2c44e4d8b226db3517c45
            # The URL is stripped so that it matches what _check_url validated.
            target_url = html.unescape(content).strip()
            page_text = self._fetch_page_text(target_url)
            result = self._request_review(page_text)
            logger.info(f"[VLink2Text] response: {result}")
            e_context["reply"] = Reply(ReplyType.TEXT, result)
            e_context.action = EventAction.BREAK_PASS
        except Exception as e:
            if retry_count < self._MAX_RETRIES:
                logger.warning(f"[VLink2Text] {str(e)}, retry {retry_count + 1}")
                self.on_handle_context(e_context, retry_count + 1)
                return
            logger.exception(f"[VLink2Text] {str(e)}")
            e_context["reply"] = Reply(ReplyType.ERROR, "我暂时无法审核链接,请稍后再试")
            e_context.action = EventAction.BREAK_PASS

    def get_help_text(self, verbose, **kwargs):
        """Return the one-line help text of the plugin (``verbose`` is unused)."""
        return "使用jina reader和ChatGPT审核网页链接内容"

    def _fetch_page_text(self, target_url):
        """Fetch ``target_url`` through the jina reader and return cleaned text.

        Images and links are removed and the result is truncated to
        ``max_words`` characters. Raises whatever ``requests`` raises on
        network errors or non-success HTTP status codes.
        """
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"}
        response = requests.get(self._get_jina_url(target_url), headers=headers, timeout=60)
        response.raise_for_status()
        return self._NOISE_PATTERN.sub("", response.text)[: self.max_words]

    def _request_review(self, page_text):
        """Send ``page_text`` to the Dify chat app and return its ``answer``."""
        response = self.dify.create_chat_message(
            inputs={},
            query="【审核】" + page_text,
            conversation_id="",
            user="vlink2text",
        )
        response.raise_for_status()
        return json.loads(response.content)["answer"]

    def _load_config_template(self):
        """Load ``config.json.template`` from the plugin directory.

        Returns:
            The parsed template as a dict, or an empty dict when the file is
            missing, unreadable or does not contain a JSON object, so that
            callers can always use ``.get`` on the result.
        """
        logger.debug("[VLink2Text] No config.json, use config.json.template")
        try:
            plugin_config_path = os.path.join(self.path, "config.json.template")
            if os.path.exists(plugin_config_path):
                with open(plugin_config_path, "r", encoding="utf-8") as f:
                    plugin_conf = json.load(f)
                if isinstance(plugin_conf, dict):
                    return plugin_conf
                logger.warning("[VLink2Text] config.json.template is not a JSON object, ignored")
        except Exception as e:
            logger.exception(e)
        return {}

    def _get_jina_url(self, target_url):
        """Return the jina reader URL for ``target_url``.

        A trailing slash on the configured base is tolerated so the result
        never contains a doubled separator.
        """
        return self.jina_reader_base.rstrip("/") + "/" + target_url

    def _check_url(self, target_url: str):
        """Return True when ``target_url`` is an http(s) URL that may be processed.

        A non-empty whitelist restricts processing to URLs starting with one
        of its entries. The blacklist is checked afterwards and always wins.
        """
        if not isinstance(target_url, str):
            return False
        stripped_url = target_url.strip()
        # Cheap sanity check that the content is a URL at all.
        if not stripped_url.startswith(("http://", "https://")):
            return False
        # Whitelist: only enforced when it is non-empty.
        if self.white_url_list and not stripped_url.startswith(tuple(self.white_url_list)):
            return False
        # Blacklist takes precedence over the whitelist.
        if stripped_url.startswith(tuple(self.black_url_list)):
            return False
        return True