commit 127854bd104ea8d021473a7915e8554bc59c0e8c
Author: wzx <17839623189@163.com>
Date:   Thu Oct 17 09:56:32 2024 +0800

    init

diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..317634a
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024 Han Fangyuan
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..17e1ba8
--- /dev/null
+++ b/README.md
@@ -0,0 +1,20 @@
+# jina_summary
+ChatGPT on WeChat项目插件, 使用jina reader和ChatGPT解析网页内容
+
+支持解析公众号、小红书、csdn等分享卡片链接(有的卡片链接会触发验证，一般直链没有此问题)
+
+![wechat_mp](./docs/images/wechat_mp.jpg)
+![red](./docs/images/red.jpg)
+![csdn](./docs/images/csdn.jpg)
+
+config.json 配置说明
+```bash
+{
+  "jina_reader_base": "https://r.jina.ai",    # jina reader链接，默认为https://r.jina.ai
+  "dify_base_url": "",                        # dify api链接，默认为空
+  "dify_api_key": "app-XXXX",                 # dify api key
+  "max_words": 8000,                          # 网页链接内容的最大字数，防止超过最大输入token，使用字符串长度简单计数
+  "white_url_list": [],                       # url白名单, 列表为空时不做限制，黑名单优先级大于白名单，即当一个url既在白名单又在黑名单时，黑名单生效
+  "black_url_list": ["https://support.weixin.qq.com", "https://channels-aladin.wxqcloud.qq.com"]   # url黑名单，排除不支持总结的视频号等链接
+}
+```
diff --git a/__init__.py b/__init__.py
new file mode 100644
index 0000000..58635bd
--- /dev/null
+++ b/__init__.py
@@ -0,0 +1 @@
+from .vlink_to_text import *
diff --git a/__pycache__/__init__.cpython-310.pyc b/__pycache__/__init__.cpython-310.pyc
new file mode 100644
index 0000000..94299fd
Binary files /dev/null and b/__pycache__/__init__.cpython-310.pyc differ
diff --git a/__pycache__/jina_sum.cpython-310.pyc b/__pycache__/jina_sum.cpython-310.pyc
new file mode 100644
index 0000000..26c8ea8
Binary files /dev/null and b/__pycache__/jina_sum.cpython-310.pyc differ
diff --git a/__pycache__/vlink_to_text.cpython-310.pyc b/__pycache__/vlink_to_text.cpython-310.pyc
new file mode 100644
index 0000000..a6b8538
Binary files /dev/null and b/__pycache__/vlink_to_text.cpython-310.pyc differ
diff --git a/config.json b/config.json
new file mode 100644
index 0000000..86bb0b9
--- /dev/null
+++ b/config.json
@@ -0,0 +1,8 @@
+{
+  "jina_reader_base": "https://r.jina.ai",
+  "dify_base_url": "http://sl.vrgon.com:6062/v1",
+  "dify_api_key": "app-MHDQgABuewVvmTChGVXUcl3S",
+  "max_words": 8000,
+  "white_url_list": [],
+  "black_url_list": ["https://support.weixin.qq.com", "https://channels-aladin.wxqcloud.qq.com"]
+}
diff --git a/config.json.template b/config.json.template
new file mode 100644
index 0000000..e603504
--- /dev/null
+++ b/config.json.template
@@ -0,0 +1,8 @@
+{
+  "jina_reader_base": "https://r.jina.ai",
+  "dify_base_url": "",
+  "dify_api_key": "app-XXXX",
+  "max_words": 8000,
+  "white_url_list": [],
+  "black_url_list": ["https://support.weixin.qq.com", "https://channels-aladin.wxqcloud.qq.com"]
+}
diff --git a/docs/images/csdn.jpg b/docs/images/csdn.jpg
new file mode 100644
index 0000000..ab98e3b
Binary files /dev/null and b/docs/images/csdn.jpg differ
diff --git a/docs/images/red.jpg b/docs/images/red.jpg
new file mode 100644
index 0000000..e8fb740
Binary files /dev/null and b/docs/images/red.jpg differ
diff --git a/docs/images/wechat_mp.jpg b/docs/images/wechat_mp.jpg
new file mode 100644
index 0000000..42a1359
Binary files /dev/null and b/docs/images/wechat_mp.jpg differ
diff --git a/vlink_to_text.py b/vlink_to_text.py
new file mode 100644
index 0000000..2b2f6f4
--- /dev/null
+++ b/vlink_to_text.py
@@ -0,0 +1,135 @@
+# encoding:utf-8
+import json
+import os
+import html
+from urllib.parse import urlparse
+import re
+import requests
+
+import plugins
+from bridge.context import ContextType
+from bridge.reply import Reply, ReplyType
+from common.log import logger
+from plugins import *
+
+from lib.dify.dify_client import ChatClient
+
+
+@plugins.register(
+    name="VLink2Text",
+    desire_priority=10,
+    hidden=False,
+    enabled=True,
+    desc="Sum url link content with jina reader and llm",
+    version="0.0.1",
+    author="wangzixiang",
+)
+
+
+class VLink2Text(Plugin):
+    jina_reader_base = "https://r.jina.ai"
+    max_words = 8000
+    white_url_list = []
+    black_url_list = [
+        "https://support.weixin.qq.com", # 视频号视频
+        "https://channels-aladin.wxqcloud.qq.com", # 视频号音乐
+    ]
+
+    def __init__(self):
+        super().__init__()
+        try:
+            self.config = super().load_config()
+            if not self.config:
+                self.config = self._load_config_template()
+            self.first_reply = "🎉正在为您审核，请稍候..."
+            self.dify = ChatClient(base_url=self.config.get("dify_base_url", ""), api_key=self.config.get("dify_api_key", ""))
+            self.jina_reader_base = self.config.get("jina_reader_base", self.jina_reader_base)
+            self.max_words = self.config.get("max_words", self.max_words)
+            self.first_reply = self.config.get("first_reply", self.first_reply)
+            self.white_url_list = self.config.get("white_url_list", self.white_url_list)
+            self.black_url_list = self.config.get("black_url_list", self.black_url_list)
+            logger.info(f"[VLink2Text] inited, config={self.config}")
+            self.handlers[Event.ON_HANDLE_CONTEXT] = self.on_handle_context
+        except Exception as e:
+            logger.error(f"[VLink2Text] 初始化异常：{e}")
+            raise "[VLink2Text] init failed, ignore "
+
+    def on_handle_context(self, e_context: EventContext, retry_count: int = 0):
+        try:
+            context = e_context["context"]
+            content = context.content
+            if context.type != ContextType.SHARING and context.type != ContextType.TEXT:
+                return
+            if not self._check_url(content):
+                logger.debug(f"[VLink2Text] {content} is not a valid url, skip")
+                return
+            if retry_count == 0:
+                logger.debug("[VLink2Text] on_handle_context. content: %s" % content)
+                reply = Reply(ReplyType.TEXT, self.first_reply)
+                channel = e_context["channel"]
+                channel.send(reply, context)
+
+            target_url = html.unescape(content) # 解决公众号卡片链接校验问题，参考 https://github.com/fatwang2/sum4all/commit/b983c49473fc55f13ba2c44e4d8b226db3517c45
+            jina_url = self._get_jina_url(target_url)
+            headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"}
+            response = requests.get(jina_url, headers=headers, timeout=60)
+            response.raise_for_status()
+            # 通过正则过滤掉图片以及链接，并保留最大字数
+            target_url_content = re.sub(r'!\[.*?\]\(.*?\)|http\S+|www.\S+', '', response.text)[:self.max_words]
+            
+
+            response = self.dify.create_chat_message(inputs={}, query="【审核】"+target_url_content, conversation_id="", user="vlink2text")
+            response.raise_for_status()
+
+            result = json.loads(response.content)["answer"]
+            logger.info(f"[VLink2Text] response: {result}")
+
+            reply = Reply(ReplyType.TEXT, result)
+            e_context["reply"] = reply
+            e_context.action = EventAction.BREAK_PASS
+
+        except Exception as e:
+            if retry_count < 3:
+                logger.warning(f"[VLink2Text] {str(e)}, retry {retry_count + 1}")
+                self.on_handle_context(e_context, retry_count + 1)
+                return
+
+            logger.exception(f"[VLink2Text] {str(e)}")
+            reply = Reply(ReplyType.ERROR, "我暂时无法审核链接，请稍后再试")
+            e_context["reply"] = reply
+            e_context.action = EventAction.BREAK_PASS
+
+    def get_help_text(self, verbose, **kwargs):
+        return f'使用jina reader和ChatGPT审核网页链接内容'
+
+    def _load_config_template(self):
+        logger.debug("No Suno plugin config.json, use plugins/jina_sum/config.json.template")
+        try:
+            plugin_config_path = os.path.join(self.path, "config.json.template")
+            if os.path.exists(plugin_config_path):
+                with open(plugin_config_path, "r", encoding="utf-8") as f:
+                    plugin_conf = json.load(f)
+                    return plugin_conf
+        except Exception as e:
+            logger.exception(e)
+
+    def _get_jina_url(self, target_url):
+        return self.jina_reader_base + "/" + target_url
+
+    def _check_url(self, target_url: str):
+        stripped_url = target_url.strip()
+        # 简单校验是否是url
+        if not stripped_url.startswith("http://") and not stripped_url.startswith("https://"):
+            return False
+
+        # 检查白名单
+        if len(self.white_url_list):
+            if not any(stripped_url.startswith(white_url) for white_url in self.white_url_list):
+                return False
+
+        # 排除黑名单，黑名单优先级>白名单
+        for black_url in self.black_url_list:
+            if stripped_url.startswith(black_url):
+                return False
+
+        return True