init
This commit is contained in:
commit
127854bd10
|
@ -0,0 +1,21 @@
|
|||
MIT License
|
||||
|
||||
Copyright (c) 2024 Han Fangyuan
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in all
|
||||
copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
|
@ -0,0 +1,20 @@
|
|||
# jina_summary
|
||||
ChatGPT on WeChat项目插件, 使用jina reader和ChatGPT解析网页内容
|
||||
|
||||
支持解析公众号、小红书、csdn等分享卡片链接(有的卡片链接会触发验证,一般直链没有此问题)
|
||||
|
||||
![wechat_mp](./docs/images/wechat_mp.jpg)
|
||||
![red](./docs/images/red.jpg)
|
||||
![csdn](./docs/images/csdn.jpg)
|
||||
|
||||
config.json 配置说明
|
||||
```bash
|
||||
{
|
||||
"jina_reader_base": "https://r.jina.ai", # jina reader链接,默认为https://r.jina.ai
|
||||
"dify_base_url": "", # dify api链接,默认为空
|
||||
"dify_api_key": "app-XXXX", # dify api key
|
||||
"max_words": 8000, # 网页链接内容的最大字数,防止超过最大输入token,使用字符串长度简单计数
|
||||
"white_url_list": [], # url白名单, 列表为空时不做限制,黑名单优先级大于白名单,即当一个url既在白名单又在黑名单时,黑名单生效
|
||||
"black_url_list": ["https://support.weixin.qq.com", "https://channels-aladin.wxqcloud.qq.com"] # url黑名单,排除不支持总结的视频号等链接
|
||||
}
|
||||
```
|
|
@ -0,0 +1 @@
|
|||
from .vlink_to_text import *
|
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
@ -0,0 +1,8 @@
|
|||
{
|
||||
"jina_reader_base": "https://r.jina.ai",
|
||||
"dify_base_url": "http://sl.vrgon.com:6062/v1",
|
||||
"dify_api_key": "app-MHDQgABuewVvmTChGVXUcl3S",
|
||||
"max_words": 8000,
|
||||
"white_url_list": [],
|
||||
"black_url_list": ["https://support.weixin.qq.com", "https://channels-aladin.wxqcloud.qq.com"]
|
||||
}
|
|
@ -0,0 +1,8 @@
|
|||
{
|
||||
"jina_reader_base": "https://r.jina.ai",
|
||||
"dify_base_url": "",
|
||||
"dify_api_key": "app-XXXX",
|
||||
"max_words": 8000,
|
||||
"white_url_list": [],
|
||||
"black_url_list": ["https://support.weixin.qq.com", "https://channels-aladin.wxqcloud.qq.com"]
|
||||
}
|
Binary file not shown.
After Width: | Height: | Size: 107 KiB |
Binary file not shown.
After Width: | Height: | Size: 88 KiB |
Binary file not shown.
After Width: | Height: | Size: 115 KiB |
|
@ -0,0 +1,135 @@
|
|||
# encoding:utf-8
|
||||
import json
|
||||
import os
|
||||
import html
|
||||
from urllib.parse import urlparse
|
||||
import re
|
||||
import requests
|
||||
|
||||
import plugins
|
||||
from bridge.context import ContextType
|
||||
from bridge.reply import Reply, ReplyType
|
||||
from common.log import logger
|
||||
from plugins import *
|
||||
|
||||
from lib.dify.dify_client import ChatClient
|
||||
|
||||
|
||||
@plugins.register(
    name="VLink2Text",
    desire_priority=10,
    hidden=False,
    enabled=True,
    desc="Sum url link content with jina reader and llm",
    version="0.0.1",
    author="wangzixiang",
)
class VLink2Text(Plugin):
    """Plugin that reviews the content behind a shared or pasted URL.

    For TEXT and SHARING messages whose content is an http(s) URL allowed by
    the white/black lists, the page is fetched through a Jina Reader endpoint,
    stripped of image markup and links, truncated to ``max_words`` characters
    and sent to a Dify chat application. The ``answer`` field of the Dify
    response becomes the reply.
    """

    # Defaults used when the corresponding key is absent from the config.
    jina_reader_base = "https://r.jina.ai"
    max_words = 8000
    white_url_list = []
    black_url_list = [
        "https://support.weixin.qq.com",  # WeChat Channels video
        "https://channels-aladin.wxqcloud.qq.com",  # WeChat Channels music
    ]

    # Number of retries after the first failed attempt (4 attempts in total).
    _MAX_RETRIES = 3
    # Markdown images, bare http(s) links and "www." links removed before
    # the page text is sent to the LLM.
    _NOISE_PATTERN = re.compile(r'!\[.*?\]\(.*?\)|http\S+|www\.\S+')

    def __init__(self):
        """Load the configuration, build the Dify client and register the handler.

        Raises:
            RuntimeError: if anything goes wrong during initialisation. The
                original error is logged and chained as the cause.
        """
        super().__init__()
        try:
            config = super().load_config()
            if not config:
                config = self._load_config_template()
            if config is None:
                # Fail with an explicit message instead of an AttributeError
                # on ``None.get`` further down.
                raise ValueError("neither config.json nor config.json.template could be loaded")
            self.config = config

            self.first_reply = "🎉正在为您审核,请稍候..."
            self.dify = ChatClient(
                base_url=self.config.get("dify_base_url", ""),
                api_key=self.config.get("dify_api_key", ""),
            )
            self.jina_reader_base = self.config.get("jina_reader_base", self.jina_reader_base)
            self.max_words = self.config.get("max_words", self.max_words)
            self.first_reply = self.config.get("first_reply", self.first_reply)
            self.white_url_list = self.config.get("white_url_list", self.white_url_list)
            self.black_url_list = self.config.get("black_url_list", self.black_url_list)

            # Never write the API key to the log.
            loggable_config = dict(self.config)
            if "dify_api_key" in loggable_config:
                loggable_config["dify_api_key"] = "***"
            logger.info(f"[VLink2Text] inited, config={loggable_config}")

            self.handlers[Event.ON_HANDLE_CONTEXT] = self.on_handle_context
        except Exception as e:
            logger.error(f"[VLink2Text] 初始化异常:{e}")
            # Raising a plain string is itself a TypeError in Python 3; raise
            # a real exception that carries the intended message.
            raise RuntimeError("[VLink2Text] init failed, ignore ") from e

    def on_handle_context(self, e_context: EventContext, retry_count: int = 0):
        """Handle an incoming message, retrying on failure.

        Args:
            e_context: event context; ``e_context["context"]`` is the message
                and ``e_context["channel"]`` is used to send the interim reply.
            retry_count: number of attempts already made. The interim
                "please wait" reply is only sent when this is 0.

        When every attempt fails, an ERROR reply is set and the event is
        marked ``BREAK_PASS``. Messages that are not TEXT/SHARING or whose
        content does not pass ``_check_url`` are left untouched.
        """
        attempt = retry_count
        while True:
            try:
                self._process_link(e_context, announce=(attempt == 0))
                return
            except Exception as e:
                if attempt < self._MAX_RETRIES:
                    logger.warning(f"[VLink2Text] {str(e)}, retry {attempt + 1}")
                    attempt += 1
                    continue

                logger.exception(f"[VLink2Text] {str(e)}")
                e_context["reply"] = Reply(ReplyType.ERROR, "我暂时无法审核链接,请稍后再试")
                e_context.action = EventAction.BREAK_PASS
                return

    def _process_link(self, e_context, announce):
        """Run a single fetch-and-review attempt; exceptions propagate to the caller."""
        context = e_context["context"]
        content = context.content
        if context.type not in (ContextType.SHARING, ContextType.TEXT):
            return
        if not self._check_url(content):
            logger.debug(f"[VLink2Text] {content} is not a valid url, skip")
            return

        if announce:
            logger.debug("[VLink2Text] on_handle_context. content: %s" % content)
            channel = e_context["channel"]
            channel.send(Reply(ReplyType.TEXT, self.first_reply), context)

        # Unescape HTML entities to work around link validation on official
        # account share cards, see
        # https://github.com/fatwang2/sum4all/commit/b983c49473fc55f13ba2c44e4d8b226db3517c45
        # The URL is stripped so that the value requested matches the value
        # validated by _check_url.
        target_url = html.unescape(content.strip())
        jina_url = self._get_jina_url(target_url)
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"}
        page = requests.get(jina_url, headers=headers, timeout=60)
        page.raise_for_status()

        # Drop images and links, then cap the length to stay within the
        # model's input budget (plain character count).
        target_url_content = self._NOISE_PATTERN.sub('', page.text)[:self.max_words]

        response = self.dify.create_chat_message(
            inputs={},
            query="【审核】" + target_url_content,
            conversation_id="",
            user="vlink2text",
        )
        response.raise_for_status()

        result = json.loads(response.content)["answer"]
        logger.info(f"[VLink2Text] response: {result}")

        e_context["reply"] = Reply(ReplyType.TEXT, result)
        e_context.action = EventAction.BREAK_PASS

    def get_help_text(self, verbose, **kwargs):
        """Return the one-line help text shown for this plugin."""
        return '使用jina reader和ChatGPT审核网页链接内容'

    def _load_config_template(self):
        """Load ``config.json.template`` from the plugin directory.

        Returns:
            The parsed template, or ``None`` when the file does not exist or
            cannot be read/parsed (the error is logged).
        """
        logger.debug("No VLink2Text plugin config.json, use config.json.template")
        try:
            # NOTE(review): self.path is not set in this class; presumably it
            # is provided by the plugin framework - confirm.
            plugin_config_path = os.path.join(self.path, "config.json.template")
            if os.path.exists(plugin_config_path):
                with open(plugin_config_path, "r", encoding="utf-8") as f:
                    return json.load(f)
        except Exception as e:
            logger.exception(e)
        return None

    def _get_jina_url(self, target_url):
        """Return the Jina Reader URL for ``target_url``.

        Trailing slashes on the configured base are ignored so that exactly
        one ``/`` separates the base from the target.
        """
        return self.jina_reader_base.rstrip("/") + "/" + target_url

    def _check_url(self, target_url: str):
        """Return True when ``target_url`` should be processed.

        The URL must start with ``http://`` or ``https://``, must match a
        whitelist prefix when the whitelist is non-empty, and must not match
        any blacklist prefix. The blacklist takes precedence over the
        whitelist. Lists set to ``None`` are treated as empty.
        """
        stripped_url = target_url.strip()
        # Cheap sanity check that the content is a URL at all.
        if not stripped_url.startswith(("http://", "https://")):
            return False

        # Whitelist: only enforced when it has entries.
        allowed_prefixes = tuple(self.white_url_list or ())
        if allowed_prefixes and not stripped_url.startswith(allowed_prefixes):
            return False

        # Blacklist: wins over the whitelist.
        blocked_prefixes = tuple(self.black_url_list or ())
        if blocked_prefixes and stripped_url.startswith(blocked_prefixes):
            return False

        return True
|
Loading…
Reference in New Issue