Add 2 plugins #1424

Open · wants to merge 1 commit into base: master
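This PR adds two function plugins: 批量总结PDF文档_初步, which batch-summarizes PDF documents section by section into per-paper Markdown files, and 批量总结Markdown文档_进阶, which takes those Markdown summaries and condenses them further into a single report. Both are registered in crazy_functional.py under the 学术 group.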
28 changes: 28 additions & 0 deletions crazy_functional.py
@@ -603,7 +603,35 @@ def get_crazy_functions():
    except:
        print(trimmed_format_exc())
        print('Load function plugin failed')
    try:
        from crazy_functions.批量总结PDF文档_初步 import 批量总结PDF文档_初步
        function_plugins.update({
            "批量总结PDF文档_初步": {
                "Group": "学术",
                "Color": "stop",
                "AsButton": False,
                "Info": "批量总结PDF文档的内容(仅做初步提炼) | 输入参数为路径",
                "Function": HotReload(批量总结PDF文档_初步)
            }
        })
    except:
        print(trimmed_format_exc())
        print('Load function plugin failed')

    try:
        from crazy_functions.批量总结Markdown文档_进阶 import 批量总结Markdown文档_进阶
        function_plugins.update({
            "批量总结Markdown文档_进阶": {
                "Group": "学术",
                "Color": "stop",
                "AsButton": False,
                "Info": "批量总结Markdown文档的内容(在初步提炼的基础上进一步总结) | 输入参数为路径",
                "Function": HotReload(批量总结Markdown文档_进阶)
            }
        })
    except:
        print(trimmed_format_exc())
        print('Load function plugin failed')
    # try:
    #     from crazy_functions.chatglm微调工具 import 微调数据集生成
    #     function_plugins.update({
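For reference, the registration dicts above follow the entry schema used throughout get_crazy_functions() (Group, Color, AsButton, Info, Function). A minimal sketch of a guard that checks a new entry against that schema before merging it into function_plugins; the validate_plugin_entry helper is hypothetical and not part of the repository:

```python
# Hypothetical helper (not in the repository): sanity-check a plugin entry
# against the key/type schema visible in the registrations above.
REQUIRED_KEYS = {"Group": str, "Color": str, "AsButton": bool, "Info": str}

def validate_plugin_entry(name, entry):
    for key, expected_type in REQUIRED_KEYS.items():
        if key not in entry:
            raise KeyError(f"plugin {name!r} is missing key {key!r}")
        if not isinstance(entry[key], expected_type):
            raise TypeError(f"plugin {name!r}: {key!r} should be {expected_type.__name__}")
    # 'Function' holds the HotReload-wrapped callable that the UI invokes
    if not callable(entry.get("Function")):
        raise TypeError(f"plugin {name!r}: 'Function' must be callable")
```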
127 changes: 127 additions & 0 deletions crazy_functions/批量总结Markdown文档_进阶.py
@@ -0,0 +1,127 @@
import logging, os
from toolbox import update_ui, promote_file_to_downloadzone, gen_time_str, get_log_folder
from toolbox import CatchException, report_exception, trimmed_format_exc
from toolbox import write_history_to_file
from .crazy_utils import request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency


def 总结Markdown(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt):
    file_write_buffer = []
    SUMMARY_WORD_LIMIT = 800
    meta_inputs_array = []
    meta_inputs_show_user_array = []
    meta_sys_prompt_array = []
    inputs_array = []
    inputs_show_user_array = []
    sys_prompt_array = []
    file_name_array = []
    for idx, file_name in enumerate(file_manifest):
        print('begin analysis on:', file_name)
        file_name_array.append(f'# {idx}.{os.path.basename(file_name)}')

        with open(file_name, 'r', encoding='utf-8', errors='replace') as f:
            file_content = f.read()

        # The preliminary PDF plugin wraps the paper metadata between two '## metadata'
        # markers; recover the metadata block and the body, falling back to the file
        # name if the markers are absent.
        _ = file_content.split('## metadata')
        if len(_) >= 2:
            file_meta = _[-2]
            file_content = _[-1]
        else:
            file_meta = file_name

        meta_inputs_array.append(
            "我需要你从一段文本中识别并提取出这篇文章的1.标题、2.作者、3.作者单位、4.关键词。"
            "其中,1.标题和4.关键词需要给出中文和英文的双语结果,2.作者和3.作者单位按原文语言给出。"
            "以下是需要你识别的文本: " + file_meta
        )
        meta_inputs_show_user_array.append(
            '开始分析元数据:' + file_name
        )
        meta_sys_prompt_array.append("As an academic professional, you need to extract basic information of the paper from its metadata")

        inputs_array.append(
            "我需要你根据我提供的文本总结一份Markdown文档,分为四个部分:1.研究背景,2.文章主要内容,3.主要创新点,4.结论。"
            + f"各部分的题目采用二级标题前缀(## ),内容可适当地分为若干条,总字数不超过{SUMMARY_WORD_LIMIT}个中文字符."
            + "以下是需要你处理的文本: " + file_content)
        inputs_show_user_array.append('开始总结:' + file_name)
        sys_prompt_array.append(f"As an academic professional, you need to summarize the text with less than {SUMMARY_WORD_LIMIT} Chinese characters")

    gpt_meta_response_collection = yield from request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(
        inputs_array=meta_inputs_array,
        inputs_show_user_array=meta_inputs_show_user_array,
        llm_kwargs=llm_kwargs,
        chatbot=chatbot,
        history_array=[[""] for _ in range(len(meta_inputs_array))],
        sys_prompt_array=meta_sys_prompt_array,
        # max_workers=5,  # maximum parallelism allowed by OpenAI
        scroller_max_len=80
    )

    gpt_response_collection = yield from request_gpt_model_multi_threads_with_very_awesome_ui_and_high_efficiency(
        inputs_array=inputs_array,
        inputs_show_user_array=inputs_show_user_array,
        llm_kwargs=llm_kwargs,
        chatbot=chatbot,
        history_array=[[""] for _ in range(len(inputs_array))],
        sys_prompt_array=sys_prompt_array,
        # max_workers=5,  # maximum parallelism allowed by OpenAI
        scroller_max_len=80
    )
    try:
        # The response collections interleave [prompt, reply, prompt, reply, ...];
        # take every second element to pair each file's metadata reply with its summary reply.
        for idx, (gpt_say_meta, gpt_say) in enumerate(zip(gpt_meta_response_collection[1::2], gpt_response_collection[1::2])):
            file_write_buffer.append(file_name_array[idx])
            file_write_buffer.append("## 元数据\n\n" + gpt_say_meta)
            file_write_buffer.append(gpt_say)
    except:
        logging.error(trimmed_format_exc())

    res = write_history_to_file(file_write_buffer, file_basename="result.md", auto_caption=False)
    promote_file_to_downloadzone(res, chatbot=chatbot)
    yield from update_ui(chatbot=chatbot, history=gpt_response_collection)  # refresh the UI


@CatchException
def 批量总结Markdown文档_进阶(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
    import glob, os

    # Basic info: function description and contributors
    chatbot.append([
        "函数插件功能?",
        "批量总结Markdown文档。函数插件贡献者: ValeriaWong,Eralien,Joshua Reed"])
    yield from update_ui(chatbot=chatbot, history=history)  # refresh the UI

    # Clear the history to avoid input overflow
    history = []

    # Validate the input argument; exit early if no valid path was given
    if os.path.exists(txt):
        project_folder = txt
    else:
        if txt == "": txt = '空空如也的输入栏'
        report_exception(chatbot, history, a=f"解析项目: {txt}", b=f"找不到本地项目或无权访问: {txt}")
        yield from update_ui(chatbot=chatbot, history=history)  # refresh the UI
        return

    # Collect the list of files to process
    file_manifest = [f for f in glob.glob(f'{project_folder}/**/*.md', recursive=True)]

    # Exit if no files were found
    if len(file_manifest) == 0:
        report_exception(chatbot, history, a=f"解析项目: {txt}", b=f"找不到任何.md文件: {txt}")
        yield from update_ui(chatbot=chatbot, history=history)  # refresh the UI
        return

    # Start the actual task
    yield from 总结Markdown(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt)
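The '## metadata' split in 总结Markdown mirrors the delimiter that 批量总结PDF文档_初步 (the file below) writes into its per-paper Markdown output. A minimal standalone sketch of that round trip, using a made-up sample string rather than real plugin output:

```python
# Sketch of the '## metadata' round trip between the two plugins.
# The sample text is made up; real files are produced by 批量总结PDF文档_初步.
sample = "## metadata\n\nTitle, authors, abstract...\n\n## metadata\nSection summaries..."

parts = sample.split('## metadata')
if len(parts) >= 2:
    file_meta = parts[-2]      # text between the two markers: the paper metadata
    file_content = parts[-1]   # everything after the second marker: the summaries
else:
    file_meta = "fallback.md"  # no markers found: fall back to the file name
    file_content = sample

print(repr(file_meta))     # '\n\nTitle, authors, abstract...\n\n'
print(repr(file_content))  # '\nSection summaries...'
```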
131 changes: 131 additions & 0 deletions crazy_functions/批量总结PDF文档_初步.py
@@ -0,0 +1,131 @@
import zipfile
import os
from toolbox import update_ui, promote_file_to_downloadzone, gen_time_str, get_log_folder
from toolbox import CatchException, report_exception
from toolbox import write_history_to_file
from .crazy_utils import request_gpt_model_in_new_thread_with_ui_alive
from .crazy_utils import read_and_clean_pdf_text
from .crazy_utils import input_clipping
pj = os.path.join


def move_file_to_zip(file_path, zip_file):
    # Add the file to the archive, then delete the original
    zip_file.write(file_path, os.path.basename(file_path))
    os.remove(file_path)


def 解析PDF(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt):
    zip_file_path = pj(get_log_folder(), 'result.zip')
    with zipfile.ZipFile(zip_file_path, 'w') as zip_file:
        for file_name in file_manifest:
            file_write_buffer = []
            print('begin analysis on:', file_name)
            ############################## <Step 0: split the PDF> ##################################
            # Recursively split the PDF; each fragment (preferably a complete section such as
            # introduction or experiment, subdivided further only when necessary) must stay
            # below 2500 tokens.
            file_content, page_one = read_and_clean_pdf_text(file_name)  # (try to) split the PDF by section
            file_content = file_content.encode('utf-8', 'ignore').decode()  # avoid reading non-utf8 chars
            page_one = str(page_one).encode('utf-8', 'ignore').decode()  # avoid reading non-utf8 chars

            TOKEN_LIMIT_PER_FRAGMENT = 2500

            from .crazy_utils import breakdown_txt_to_satisfy_token_limit_for_pdf
            from request_llms.bridge_all import model_info
            enc = model_info["gpt-3.5-turbo"]['tokenizer']
            def get_token_num(txt): return len(enc.encode(txt, disallowed_special=()))
            paper_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
                txt=file_content, get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT)
            page_one_fragments = breakdown_txt_to_satisfy_token_limit_for_pdf(
                txt=str(page_one), get_token_fn=get_token_num, limit=TOKEN_LIMIT_PER_FRAGMENT//4)
            # For better results, strip everything after the Introduction heading (if present)
            paper_meta = page_one_fragments[0].split('introduction')[0].split('Introduction')[0].split('INTRODUCTION')[0]

            ############################## <Step 1: extract high-value information from the abstract into history> ##################################
            final_results = []
            final_results.append("## metadata\n\n" + paper_meta + "\n\n## metadata")

            ############################## <Step 2: iterate over the whole paper, extracting condensed information> ##################################
            i_say_show_user = f'首先你在中文语境下通读整篇论文。'; gpt_say = "[Local Message] 收到。"  # user-facing prompt
            chatbot.append([i_say_show_user, gpt_say]); yield from update_ui(chatbot=chatbot, history=[])  # update the UI

            iteration_results = []
            last_iteration_result = paper_meta  # the initial value is the abstract
            MAX_WORD_TOTAL = 4096 * 0.7
            n_fragment = len(paper_fragments)
            if n_fragment >= 20: print('文章极长,不能达到预期效果')
            for i in range(n_fragment):
                NUM_OF_WORD = int(MAX_WORD_TOTAL // n_fragment)
                i_say = f"Read this section, recapitulate the content of this section in Chinese with less than {NUM_OF_WORD} Chinese characters: {paper_fragments[i]}"
                i_say_show_user = f"[{i+1}/{n_fragment}] Read this section, recapitulate the content of this section with less than {NUM_OF_WORD} Chinese characters: {paper_fragments[i][:200]}"
                gpt_say = yield from request_gpt_model_in_new_thread_with_ui_alive(i_say, i_say_show_user,  # i_say = the actual prompt sent to the model, i_say_show_user = the prompt shown to the user
                                                                                   llm_kwargs, chatbot,
                                                                                   history=["The main idea of the previous section is?", last_iteration_result],  # carry over the previous iteration's result
                                                                                   sys_prompt="Extract the main idea of this section in Chinese."  # system prompt
                                                                                   )
                iteration_results.append(gpt_say)
                last_iteration_result = gpt_say

            ############################## <Step 3: assemble the history and extract the summary> ##################################
            final_results.extend(iteration_results)
            file_write_buffer.extend(final_results)

            ############################## <Step 4: enforce a token cap> ##################################
            _, final_results = input_clipping("", final_results, max_token_limit=3200)
            yield from update_ui(chatbot=chatbot, history=final_results)  # note that the history is replaced here

            res = write_history_to_file(
                file_write_buffer,
                file_basename=os.path.splitext(os.path.basename(file_name))[0] + '.md',
                auto_caption=False
            )
            if len(file_manifest) == 1:
                promote_file_to_downloadzone(res, chatbot=chatbot)
                return
            move_file_to_zip(res, zip_file)

    promote_file_to_downloadzone(zip_file_path, chatbot=chatbot)


@CatchException
def 批量总结PDF文档_初步(txt, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt, web_port):
    import glob, os

    # Basic info: function description and contributors
    chatbot.append([
        "函数插件功能?",
        "批量总结PDF文档。函数插件贡献者: ValeriaWong,Eralien,Joshua Reed"])
    yield from update_ui(chatbot=chatbot, history=history)  # refresh the UI

    # Try to import the dependency; if it is missing, suggest how to install it
    try:
        import fitz
    except:
        report_exception(chatbot, history,
                         a=f"解析项目: {txt}",
                         b=f"导入软件依赖失败。使用该模块需要额外依赖,安装方法```pip install --upgrade pymupdf```。")
        yield from update_ui(chatbot=chatbot, history=history)  # refresh the UI
        return

    # Clear the history to avoid input overflow
    history = []

    # Validate the input argument; exit early if no valid path was given
    if os.path.exists(txt):
        project_folder = txt
    else:
        if txt == "": txt = '空空如也的输入栏'
        report_exception(chatbot, history, a=f"解析项目: {txt}", b=f"找不到本地项目或无权访问: {txt}")
        yield from update_ui(chatbot=chatbot, history=history)  # refresh the UI
        return

    # Collect the list of files to process
    file_manifest = [f for f in glob.glob(f'{project_folder}/**/*.pdf', recursive=True)]

    # Exit if no files were found
    if len(file_manifest) == 0:
        report_exception(chatbot, history, a=f"解析项目: {txt}", b=f"找不到任何.pdf文件: {txt}")
        yield from update_ui(chatbot=chatbot, history=history)  # refresh the UI
        return

    # Start the actual task
    yield from 解析PDF(file_manifest, project_folder, llm_kwargs, plugin_kwargs, chatbot, history, system_prompt)
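The fragment budget in 解析PDF depends on counting tokens with the model's tokenizer before splitting. A minimal standalone sketch of the same token-limited chunking idea, using tiktoken directly instead of the repository's model_info tokenizer table, and a naive paragraph packer in place of breakdown_txt_to_satisfy_token_limit_for_pdf:

```python
# Standalone sketch: greedy paragraph packing under a token limit.
# Uses tiktoken directly; the repository instead pulls its tokenizer from
# request_llms.bridge_all.model_info, and its real splitter also honors
# section boundaries. This is an illustrative simplification.
import tiktoken

enc = tiktoken.get_encoding("cl100k_base")

def get_token_num(txt: str) -> int:
    return len(enc.encode(txt, disallowed_special=()))

def naive_breakdown(txt: str, limit: int = 2500) -> list:
    fragments, current = [], ""
    for paragraph in txt.split("\n\n"):
        candidate = (current + "\n\n" + paragraph) if current else paragraph
        if get_token_num(candidate) <= limit:
            current = candidate
        else:
            if current:
                fragments.append(current)
            current = paragraph  # a single oversized paragraph would need further splitting
    if current:
        fragments.append(current)
    return fragments

fragments = naive_breakdown("First section...\n\nSecond section...", limit=2500)
print(len(fragments), [get_token_num(f) for f in fragments])
```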