① Added an OCR component, which requires installing and configuring tesseract first; ② added an image-PDF summarization plugin; ③ added an internet-connected ChatGPT (Baidu edition) #1633

Open · wants to merge 1 commit into base: master
3 changes: 3 additions & 0 deletions config.py
@@ -271,6 +271,9 @@
NUM_CUSTOM_BASIC_BTN = 4


# Path to the tesseract executable (tesseract must be installed separately)
TESSERACT_PATH = "path/to/your/tesseract.exe"
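# Illustrative examples only (adjust to your own installation):
#   Windows:     TESSERACT_PATH = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
#   Linux/macOS: TESSERACT_PATH = "/usr/bin/tesseract"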


"""
--------------- 配置关联关系说明 ---------------
23 changes: 23 additions & 0 deletions crazy_functional.py
@@ -35,6 +35,7 @@ def get_crazy_functions():
from crazy_functions.批量Markdown翻译 import Markdown中译英
from crazy_functions.虚空终端 import 虚空终端
from crazy_functions.生成多种Mermaid图表 import 生成多种Mermaid图表
from crazy_functions.批量总结图像PDF文档 import 批量总结图像PDF文档

function_plugins = {
"虚空终端": {
@@ -224,6 +225,15 @@ def get_crazy_functions():
"Info": "批量总结PDF文档的内容 | 输入参数为路径",
"Function": HotReload(批量总结PDF文档),
},
"批量总结PDF文档(图像PDF)": {
"Group": "学术",
"Color": "stop",
"AsButton": False, # 加入下拉菜单中
"Info": "批量总结图像PDF文档的内容 | 输入参数为路径",
"Function": HotReload(批量总结图像PDF文档),
"AdvancedArgs": True,
"ArgsReminder": "请输入要识别语言的代码,支持的语言代码详见此网页:https://gitee.com/dalaomai/tessdata_fast。示例:①简体中文:chi_sim;②简体中文和英文:chi_sim+eng;③竖版繁体中文:chi_tra_vert",
},
"谷歌学术检索助手(输入谷歌学术搜索页url)": {
"Group": "学术",
"Color": "stop",
@@ -332,6 +342,19 @@
}
}
)
from crazy_functions.联网的ChatGPT_百度版 import 连接百度搜索回答问题

function_plugins.update(
{
"连接网络回答问题(百度版,输入问题后点击该插件)": {
"Group": "对话",
"Color": "stop",
"AsButton": False, # 加入下拉菜单中
"Info": "连接网络回答问题| 输入参数是一个问题",
"Function": HotReload(连接百度搜索回答问题),
},
}
)
except:
print(trimmed_format_exc())
print("Load function plugin failed")
64 changes: 64 additions & 0 deletions crazy_functions/ocr_fns/tesseract.py
@@ -0,0 +1,64 @@
import subprocess, os, urllib.request
from toolbox import get_conf

TESSERACT_PATH = get_conf("TESSERACT_PATH")

lang_list = ["afr","amh","ara","asm","aze","aze_cyrl","bel","ben","bod","bos","bre","bul","cat","ceb","ces","chi_sim","chi_sim_vert","chi_tra","chi_tra_vert","chr","cos","cym","dan",
"deu","div","dzo","ell","eng","enm","epo","equ","est","eus","fao","fas","fil","fin","fra","frk","frm","fry","gla","gle","glg","grc","guj","hat","heb","hin","hrv","hun",
"hye","iku","ind","isl","ita","ita_old","jav","jpn","jpn_vert","kan","kat","kat_old","kaz","khm","kir","kmr","kor","kor_vert","lao","lat","lav","lit","ltz","mal","mar",
"mkd","mlt","mon","mri","msa","mya","nep","nld","nor","oci","ori","pan","pol","por","pus","que","ron","rus","san","sin","slk","slv","snd","spa","spa_old","sqi","srp",
"srp_latn","sun","swa","swe","syr","tam","tat","tel","tgk","tha","tir","ton","tur","uig","ukr","urd","uzb","uzb_cyrl","vie","yid","yor"]

def download_lang(lang):
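    """
    Download the tesseract traineddata file for `lang` into the `tessdata`
    folder that sits next to the tesseract executable (TESSERACT_PATH).
    """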
    # Download from a Gitee mirror because GitHub is slow to reach; swap the URL if this link ever goes dead
url = f"https://gitee.com/dalaomai/tessdata_fast/raw/main/{lang}.traineddata"

path = os.path.dirname(TESSERACT_PATH)
path = os.path.join(path, "tessdata")
path = os.path.join(path, f"{lang}.traineddata")

response = urllib.request.urlopen(url)
if response.status == 200:
with open(path, 'wb') as file:
file.write(response.read())
print(f'已将{lang}语言包下载至{path}')
else:
        print(f'未能成功从{url}下载语言包')

def lang_exists(lang):
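    """Return True if the traineddata file for `lang` already exists in the local tessdata folder."""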
path = os.path.dirname(TESSERACT_PATH)
path = os.path.join(path, "tessdata")
path = os.path.join(path, f"{lang}.traineddata")
return os.path.isfile(path)

def normalize_lang(text):
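    """
    Turn a free-form language hint (e.g. "chi_sim+eng") into a tesseract `-l` argument:
    collect the recognized language codes, download any missing packs, and fall back to
    "chi_sim"+"eng" (or "osd") when nothing usable is requested or available.
    """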
    langs = [l for l in lang_list if l in text]
    if len(langs) == 0:
        langs = ["chi_sim", "eng"]

    invalid_langs = []
    for lang in langs:
        if not lang_exists(lang):
            try:
                download_lang(lang)
            except Exception as e:
                print(f"下载语言包失败: {e}")
                invalid_langs.append(lang)
    for lang in invalid_langs:
        langs.remove(lang)

    if len(langs) == 0:
langs = ["osd"]

return "+".join(langs)

def tesseract_ocr(img_path, output_path, lang):
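    """
    Run the tesseract CLI on `img_path` and leave the recognized text at `output_path`.
    tesseract treats its output argument as a base name and appends ".txt" itself,
    which is why the result is renamed afterwards.
    """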
    subprocess.run([TESSERACT_PATH, img_path, output_path, "-l", lang])
if os.path.isfile(output_path):
os.remove(output_path)
os.rename(output_path+".txt", output_path)
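# Minimal usage sketch (illustrative only; assumes TESSERACT_PATH in config.py points to a
# working tesseract binary and that "page.png" exists):
#   lang = normalize_lang("chi_sim+eng")          # downloads missing language packs if needed
#   tesseract_ocr("page.png", "page.txt", lang)   # leaves the recognized text in page.txt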
172 changes: 172 additions & 0 deletions crazy_functions/批量总结图像PDF文档.py
@@ -0,0 +1,172 @@
from toolbox import update_ui, promote_file_to_downloadzone
from toolbox import CatchException, report_exception
from toolbox import write_history_to_file
from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
from .crazy_utils import read_and_clean_pdf_text
from .crazy_utils import input_clipping



def 解析PDF(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt):
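    """
    OCR every page of each PDF in file_manifest with tesseract, then iteratively
    summarize the recognized text with the configured LLM and promote the final
    summary to the download zone.
    """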
file_write_buffer = []
for file_name in file_manifest:



print('begin ocr on:', file_name)
from crazy_functions.ocr_fns.tesseract import normalize_lang, tesseract_ocr
import fitz, os
        lang = normalize_lang(str(plugin_kwargs.get('advanced_arg', '')))
img_temp = os.path.join("./", "gpt_log", "default_user", "shared", "tmp.png")
txt_temp = os.path.join("./", "gpt_log", "default_user", "shared", "tmp.txt")
pdf_temp = os.path.join("./", "gpt_log", "default_user", "shared", "tmp.pdf")
pages = []
pdf = fitz.open(file_name)
for idx in range(0, pdf.page_count):
page = pdf[idx]
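            # render the page at 2x zoom (fitz.Matrix(2, 2)); a higher-resolution image generally improves OCR accuracy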
trans = fitz.Matrix(2, 2).prerotate(0)
pm = page.get_pixmap(matrix=trans, alpha=False)
pm.save(img_temp)
tesseract_ocr(img_temp, txt_temp, lang)
with open(txt_temp, "r", encoding="utf-8") as f:
pages.append(f.read())
pdf.close()



print('begin analysis on:', file_name)
        ############################## <Step 0: split the PDF> ##################################
        # Recursively split the PDF into chunks (ideally one complete section each, e.g. introduction,
        # experiments), splitting further only when necessary, so every chunk stays under the token limit.
        #file_content, page_one = read_and_clean_pdf_text(file_name) # (attempt to) split the PDF by section
        #file_content = file_content.encode('utf-8', 'ignore').decode() # avoid reading non-utf8 chars
        #page_one = str(page_one).encode('utf-8', 'ignore').decode() # avoid reading non-utf8 chars
page_one = [pages[0]]
file_content = "\n".join(pages[1:])

TOKEN_LIMIT_PER_FRAGMENT = 1000

from crazy_functions.pdf_fns.breakdown_txt import breakdown_text_to_satisfy_token_limit
paper_fragments = breakdown_text_to_satisfy_token_limit(txt=file_content, limit=TOKEN_LIMIT_PER_FRAGMENT, llm_model=llm_kwargs['llm_model'])
page_one_fragments = breakdown_text_to_satisfy_token_limit(txt=str(page_one), limit=TOKEN_LIMIT_PER_FRAGMENT//4, llm_model=llm_kwargs['llm_model'])
        # For better results, strip everything after the Introduction (if present)
paper_meta = page_one_fragments[0].split('introduction')[0].split('Introduction')[0].split('INTRODUCTION')[0]

        ############################## <Step 1: extract high-value information from the abstract into history> ##################################
final_results = []
final_results.append(paper_meta)

        ############################## <Step 2: iterate over the whole paper and distill it> ##################################
        i_say_show_user = '首先你在中文语境下通读整篇论文。'; gpt_say = "[Local Message] 收到。"  # prompt shown to the user
        chatbot.append([i_say_show_user, gpt_say]); yield from update_ui(chatbot=chatbot, history=[])  # refresh the UI

iteration_results = []
        last_iteration_result = paper_meta  # initialized with the abstract/metadata
MAX_WORD_TOTAL = 1000
n_fragment = len(paper_fragments)
if n_fragment >= 20: print('文章极长,不能达到预期效果')
for i in range(n_fragment):
NUM_OF_WORD = MAX_WORD_TOTAL // n_fragment
i_say = f"Read this section, recapitulate the content of this section with less than {NUM_OF_WORD} Chinese characters: {paper_fragments[i]}"
i_say_show_user = f"[{i+1}/{n_fragment}] Read this section, recapitulate the content of this section with less than {NUM_OF_WORD} Chinese characters: {paper_fragments[i][:200]}"
            gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(i_say, i_say_show_user,  # i_say = prompt actually sent to the model; i_say_show_user = prompt shown to the user
                llm_kwargs, chatbot,
                history=["The main idea of the previous section is?", last_iteration_result],  # carry over the previous iteration's result
                sys_prompt="Extract the main idea of this section in Chinese."  # system prompt
)
iteration_results.append(gpt_say)
last_iteration_result = gpt_say

        ############################## <Step 3: consolidate the history and produce the final summary> ##################################
final_results.extend(iteration_results)
        final_results.append('Please conclude the paper discussed above.')
# This prompt is from https://github.com/kaixindelele/ChatPaper/blob/main/chat_paper.py
NUM_OF_WORD = 1000
i_say = """
1. Mark the title of the paper (with Chinese translation)
2. list all the authors' names (use English)
3. mark the first author's affiliation (output Chinese translation only)
4. mark the keywords of this article (use English)
5. link to the paper, Github code link (if available, fill in Github:None if not)
6. summarize according to the following four points.Be sure to use Chinese answers (proper nouns need to be marked in English)
- (1):What is the research background of this article?
- (2):What are the past methods? What are the problems with them? Is the approach well motivated?
- (3):What is the research methodology proposed in this paper?
- (4):On what task and what performance is achieved by the methods in this paper? Can the performance support their goals?
Follow the format of the output that follows:
1. Title: xxx\n\n
2. Authors: xxx\n\n
3. Affiliation: xxx\n\n
4. Keywords: xxx\n\n
5. Urls: xxx or xxx , xxx \n\n
6. Summary: \n\n
- (1):xxx;\n
- (2):xxx;\n
- (3):xxx;\n
- (4):xxx.\n\n
Be sure to use Chinese answers (proper nouns need to be marked in English), statements as concise and academic as possible,
do not have too much repetitive information, numerical values using the original numbers.
"""
# This prompt is from https://github.com/kaixindelele/ChatPaper/blob/main/chat_paper.py
file_write_buffer.extend(final_results)
i_say, final_results = input_clipping(i_say, final_results, max_token_limit=2000)
gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(
inputs=i_say, inputs_show_user='开始最终总结',
llm_kwargs=llm_kwargs, chatbot=chatbot, history=final_results,
sys_prompt= f"Extract the main idea of this paper with less than {NUM_OF_WORD} Chinese characters"
)
final_results.append(gpt_say)
file_write_buffer.extend([i_say, gpt_say])
        ############################## <Step 4: enforce a token cap> ##################################
_, final_results = input_clipping("", final_results, max_token_limit=3200)
        yield from update_ui(chatbot=chatbot, history=final_results)  # note: the chat history is replaced here, not appended to

res = write_history_to_file(file_write_buffer)
promote_file_to_downloadzone(res, chatbot=chatbot)
    yield from update_ui(chatbot=chatbot, history=final_results)  # refresh the UI


@CatchException
def 批量总结图像PDF文档(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request):
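    """
    txt             text from the input box, here a path to a folder containing image-based (scanned) PDFs
    llm_kwargs      LLM parameters such as temperature and top_p, usually passed through unchanged
    plugin_kwargs   plugin parameters; 'advanced_arg' carries the tesseract language codes (see ArgsReminder)
    chatbot         handle of the chat display, used to show output to the user
    history         chat history (context from earlier turns)
    system_prompt   silent system prompt for the LLM
    user_request    information about the current user's request (IP address etc.)
    """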

import glob, os

    # Basic information: what the plugin does and who contributed it
    chatbot.append([
        "函数插件功能?",
        "批量总结图像PDF文档。函数插件贡献者: ValeriaWong,Eralien,ZeeChung"])
    yield from update_ui(chatbot=chatbot, history=history)  # refresh the UI

    # Try to import the dependency; if it is missing, suggest how to install it
    try:
        import fitz
    except:
        report_exception(chatbot, history,
                         a = f"解析项目: {txt}",
                         b = f"导入软件依赖失败。使用该模块需要额外依赖,安装方法```pip install --upgrade pymupdf```。")
        yield from update_ui(chatbot=chatbot, history=history)  # refresh the UI
        return

    # Clear the history to avoid overflowing the input
history = []

    # Validate the input argument; exit early if none was given
    if os.path.exists(txt):
        project_folder = txt
    else:
        if txt == "": txt = '空空如也的输入栏'
        report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到本地项目或无权访问: {txt}")
        yield from update_ui(chatbot=chatbot, history=history)  # refresh the UI
        return

    # Collect the list of PDF files to process
file_manifest = [f for f in glob.glob(f'{project_folder}/**/*.pdf', recursive=True)]

    # If no PDF files were found
    if len(file_manifest) == 0:
        report_exception(chatbot, history, a = f"解析项目: {txt}", b = f"找不到任何.pdf文件: {txt}")
        yield from update_ui(chatbot=chatbot, history=history)  # refresh the UI
return

    # Start the actual task
yield from 解析PDF(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt)
93 changes: 93 additions & 0 deletions crazy_functions/联网的ChatGPT_百度版.py
@@ -0,0 +1,93 @@
from toolbox import CatchException, update_ui
from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive, input_clipping
import requests
from bs4 import BeautifulSoup
from request_llms.bridge_all import model_info
import jieba


def baidu_search(query):
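    """Query Baidu web search for `query` and return the result URLs scraped from the response."""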
url = f"http://www.baidu.com/s?wd={query}&cl=3&pn=1&ie=utf-8&rn=20&tn=baidurt"
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36'}
response = requests.get(url, headers=headers)

urls = []
soup = BeautifulSoup(response.text, 'html.parser')
for paragraph in soup.find_all('a'):
if "href" in paragraph.attrs and "onmousedown" in paragraph.attrs and "\'fm\':\'baidurt\'" in paragraph["onmousedown"] and "http" in paragraph["href"] and "tab" not in paragraph["onmousedown"]:
urls.append(paragraph["href"])
return urls


def scrape_text(key_words, url) -> str:
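    """Fetch `url` and return only the lines of its body text that contain at least one of `key_words`."""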
try:
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36'}
response = requests.get(url, headers=headers)
if response.encoding == "ISO-8859-1": response.encoding = response.apparent_encoding

soup = BeautifulSoup(response.text, 'html.parser')
text = soup.body.text
text = text.split("\n")

valid = []
for t in text:
for kw in key_words:
if kw in t:
valid.append(t)
break
valid = "\n".join(valid)
return valid
except:
return ""

@CatchException
def 连接百度搜索回答问题(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, user_request):
"""
txt 输入栏用户输入的文本,例如需要翻译的一段话,再例如一个包含了待处理文件的路径
llm_kwargs gpt模型参数,如温度和top_p等,一般原样传递下去就行
plugin_kwargs 插件模型的参数,暂时没有用武之地
chatbot 聊天显示框的句柄,用于显示给用户
history 聊天历史,前情提要
system_prompt 给gpt的静默提醒
user_request 当前用户的请求信息(IP地址等)
"""
    history = []  # clear the history to avoid overflowing the input
chatbot.append((f"请结合互联网信息回答以下问题:{txt}",
"[Local Message] 请注意,您正在调用一个[函数插件]的模板,该模板可以实现ChatGPT联网信息综合。该函数面向希望实现更多有趣功能的开发者,它可以作为创建新功能函数的模板。您若希望分享新的功能模组,请不吝PR!"))
    yield from update_ui(chatbot=chatbot, history=history)  # refresh the UI; requesting the model takes a while, so update the interface right away

    # ------------- < Step 1: scrape the search engine results > -------------
    urls = baidu_search(txt)
history = []
if len(urls) == 0:
chatbot.append((f"结论:{txt}",
"[Local Message] 受到百度限制,无法从百度获取信息!"))
        yield from update_ui(chatbot=chatbot, history=history)  # refresh the UI; requesting the model takes a while, so update the interface right away
return
    # ------------- < Step 2: visit the result pages one by one > -------------
    max_search_result = 8  # maximum number of result pages to include
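    # jieba.lcut_for_search segments the (Chinese) query into search-granularity keywords,
    # which scrape_text uses below to keep only the lines that mention the query terms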
kw = jieba.lcut_for_search(txt)
    for index, url in enumerate(urls[:max_search_result]):
        res = scrape_text(kw, url)
        history.extend([f"第{index+1}份搜索结果:", res])
        #chatbot.append([f"第{index+1}份搜索结果:", res[:500]+"......"])
        chatbot[-1] = [f"第{index+1}份搜索结果:", res[:500]+"......"]
        yield from update_ui(chatbot=chatbot, history=history)  # refresh the UI; requesting the model takes a while, so update the interface right away

    # ------------- < Step 3: let the LLM synthesize an answer > -------------
    i_say = f"从以上搜索结果中抽取信息,然后回答问题:{txt}"
    i_say, history = input_clipping(  # clip the input, trimming the longest entries first, to avoid exceeding the token limit
inputs=i_say,
history=history,
max_token_limit=model_info[llm_kwargs['llm_model']]['max_token']*3//4
)

gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(
inputs=i_say, inputs_show_user=i_say,
llm_kwargs=llm_kwargs, chatbot=chatbot, history=history,
sys_prompt="请从给定的若干条搜索结果中抽取信息,对最相关的两个搜索结果进行总结,然后回答问题。"
)
chatbot[-1] = (i_say, gpt_say)
    history.extend([i_say, gpt_say])
    yield from update_ui(chatbot=chatbot, history=history)  # refresh the UI