Skip to content

FEAT: 作业提交V4版本 #97

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions selected_homework/openai-translator_v4/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
### 作业需求
基于 ChatGLM2-6B 实现带图形化界面的 openai-translator

### 作业总结
+ [openai_api_demo](openai_api_demo)

基于 ChatGLM 官方提供的 api_demo 调整为服务端(server)项目,因此没有采用 ChatGLM2-6B,而是使用了 ChatGLM3-6B

运行起来需要:
1. git clone https://www.modelscope.cn/ZhipuAI/chatglm3-6b.git
2. 确保机器有足够的资源【俺没有。。。故暂未实现。。。】

+ [ai_translator](ai_translator)

将原有的
`from langchain_openai import ChatOpenAI`
替换为
`from langchain.llms import ChatGLM`
并针对 ChatGLM 做参数适配
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from .book import Book
from .page import Page
from .content import ContentType, Content, TableContent
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from .page import Page

class Book:
    """A parsed PDF document: the source path plus the pages collected from it."""

    def __init__(self, pdf_file_path):
        """Create an empty book for the PDF located at ``pdf_file_path``."""
        # Pages are appended later through add_page().
        self.pages = []
        self.pdf_file_path = pdf_file_path

    def add_page(self, page: Page):
        """Append ``page`` to the end of this book's page list."""
        self.pages.append(page)
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
import pandas as pd

from enum import Enum, auto
from PIL import Image as PILImage
from utils import LOG
from io import StringIO

class ContentType(Enum):
    """Kinds of content a page can hold."""

    # Values are assigned automatically in declaration order.
    TEXT = auto()
    TABLE = auto()
    IMAGE = auto()

class Content:
    """A single piece of page content together with its translation state.

    Attributes are plain instance fields:
      * ``content_type`` - the ``ContentType`` of this item.
      * ``original``     - the extracted source content.
      * ``translation``  - the translated content, or ``None`` if not set.
      * ``status``       - flag stored alongside the translation; starts ``False``.
    """

    def __init__(self, content_type, original, translation=None):
        """Store the content type, the original content and an optional translation."""
        self.content_type = content_type
        self.original = original
        self.translation = translation
        self.status = False

    def set_translation(self, translation, status):
        """Record ``translation`` and ``status`` after validating the translation type.

        Raises:
            ValueError: If ``translation`` is not of the type expected for
                this content's ``content_type``.
        """
        if not self.check_translation_type(translation):
            raise ValueError(f"Invalid translation type. Expected {self.content_type}, but got {type(translation)}")
        self.translation = translation
        self.status = status

    def check_translation_type(self, translation):
        """Return ``True`` when ``translation`` has the type required by ``content_type``.

        TEXT requires ``str``, TABLE requires ``list`` and IMAGE requires a
        PIL image; any other combination yields ``False``.
        """
        if self.content_type == ContentType.TEXT:
            return isinstance(translation, str)
        if self.content_type == ContentType.TABLE:
            return isinstance(translation, list)
        if self.content_type == ContentType.IMAGE:
            return isinstance(translation, PILImage.Image)
        return False

    def __str__(self):
        # __str__ must return a str; wrapping keeps non-text originals
        # (e.g. images) from raising TypeError when printed or logged.
        return str(self.original)


class TableContent(Content):
    """Table content whose original and translated forms are pandas DataFrames."""

    def __init__(self, data, translation=None):
        """Build the table from ``data``.

        Args:
            data: Row-oriented table data accepted by ``pd.DataFrame``.
            translation: Optional initial translation, stored as given.

        Raises:
            ValueError: If the DataFrame built from ``data`` does not have the
                same number of rows and columns as ``data`` itself.
        """
        df = pd.DataFrame(data)

        # Verify that the DataFrame shape matches the extracted data. Empty
        # data is treated as zero columns instead of indexing data[0].
        expected_rows = len(data)
        expected_cols = len(data[0]) if expected_rows else 0
        if expected_rows != len(df) or expected_cols != len(df.columns):
            raise ValueError("The number of rows and columns in the extracted table data and DataFrame object do not match.")

        # Forward the caller's translation instead of silently dropping it.
        super().__init__(ContentType.TABLE, df, translation=translation)

    def set_translation(self, translation, status):
        """Parse a bracketed table string into a DataFrame and store it.

        ``translation`` is expected to look like
        ``[h1, h2] [a1, a2] [b1, b2]``: the first bracket group is the header
        and each following group is one data row, with cells separated by
        ``', '``.

        Any failure (wrong type, no bracket groups, rows that do not match the
        header width) is logged; ``translation`` is then reset to ``None`` and
        ``status`` to ``False`` instead of raising.
        """
        import re  # local import: only this parser needs it

        try:
            if not isinstance(translation, str):
                raise ValueError(f"Invalid translation type. Expected str, but got {type(translation)}")

            LOG.debug(f"[translation]\n{translation}")
            # Capture the text inside every [...] group. Matching the brackets
            # directly avoids trimming the last character of every row but the
            # final one, and does not depend on the exact separator between
            # groups.
            groups = re.findall(r'\[(.*?)\]', translation)
            if not groups:
                raise ValueError("No bracketed rows found in the table translation.")

            # First group holds the column names, the rest are data rows.
            header = groups[0].split(', ')
            data_rows = [group.split(', ') for group in groups[1:]]

            translated_df = pd.DataFrame(data_rows, columns=header)
            LOG.debug(f"[translated_df]\n{translated_df}")
            self.translation = translated_df
            self.status = status
        except Exception as e:
            LOG.error(f"An error occurred during table translation: {e}")
            self.translation = None
            self.status = False

    def __str__(self):
        return self.get_original_as_str()

    def iter_items(self, translated=False):
        """Yield ``(row_idx, col_idx, item)`` for every cell.

        ``row_idx`` is the DataFrame index label of the row and ``col_idx`` is
        the zero-based position of the cell within that row. Iterates the
        translated frame when ``translated`` is true, otherwise the original.
        """
        target_df = self.translation if translated else self.original
        for row_idx, row in target_df.iterrows():
            for col_idx, item in enumerate(row):
                yield (row_idx, col_idx, item)

    def update_item(self, row_idx, col_idx, new_value, translated=False):
        """Overwrite one cell of the original (or translated) frame.

        ``col_idx`` may be a column label or, as produced by ``iter_items``,
        a zero-based column position.
        """
        target_df = self.translation if translated else self.original
        if col_idx in target_df.columns:
            target_df.at[row_idx, col_idx] = new_value
        else:
            # Translated frames use header strings as column labels, so a
            # positional index from iter_items() has to be applied by position.
            target_df.iat[row_idx, col_idx] = new_value

    def get_original_as_str(self):
        """Return the original table rendered as text without header or index."""
        return self.original.to_string(header=False, index=False)
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
from .content import Content

class Page:
    """One page of a book, holding its content items in insertion order."""

    def __init__(self):
        """Start with an empty content list."""
        self.contents = []

    def add_content(self, content: Content):
        """Append ``content`` to this page."""
        self.contents.append(content)
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
model_name: "chatglm2-6b"
input_file: "tests/test.pdf"
output_file_format: "markdown"
source_language: "English"
target_language: "Chinese"
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
import sys
import os

sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from flask import Flask, request, send_file, jsonify
from translator import PDFTranslator, TranslationConfig
from utils import ArgumentParser, LOG

# Flask application object; the /translation route below is registered on it.
app = Flask(__name__)

# Relative directory prefix under which uploaded files are saved before translation.
TEMP_FILE_DIR = "flask_temps/"

@app.route('/translation', methods=['POST'])
def translation():
    """Translate an uploaded PDF and send the translated file back.

    Reads the ``input_file`` upload and the optional ``source_language`` /
    ``target_language`` form fields (defaulting to ``English`` / ``Chinese``),
    saves the upload under ``TEMP_FILE_DIR``, runs the module-level
    ``Translator`` on it and returns the resulting file as an attachment.

    Returns:
        The translated file on success; otherwise a JSON body
        ``{"status": "error", "message": ...}`` with HTTP status 400.
    """
    try:
        input_file = request.files['input_file']
        source_language = request.form.get('source_language', 'English')
        target_language = request.form.get('target_language', 'Chinese')

        LOG.debug(f"[input_file]\n{input_file}")
        LOG.debug(f"[input_file.filename]\n{input_file.filename}")

        # The client-supplied filename is untrusted: keep only its final path
        # component so it cannot escape TEMP_FILE_DIR (e.g. "../../x").
        raw_name = input_file.filename or ""
        safe_name = os.path.basename(raw_name.replace("\\", "/"))
        if not safe_name or safe_name in (".", ".."):
            # Previously this case fell through and the view returned None.
            raise ValueError("No input file was uploaded.")

        # Save the upload to a temporary location, creating the directory
        # on first use.
        os.makedirs(TEMP_FILE_DIR, exist_ok=True)
        input_file_path = os.path.join(TEMP_FILE_DIR, safe_name)
        LOG.debug(f"[input_file_path]\n{input_file_path}")

        input_file.save(input_file_path)

        # Run the translation.
        output_file_path = Translator.translate_pdf(
            input_file=input_file_path,
            source_language=source_language,
            target_language=target_language)

        # NOTE: the saved upload is left on disk; no cleanup is performed here.

        # send_file is given an absolute path, resolved against the current
        # working directory when the translator returns a relative one.
        output_file_path = os.path.abspath(output_file_path)
        LOG.debug(output_file_path)

        # Return the translated file.
        return send_file(output_file_path, as_attachment=True)
    except Exception as e:
        LOG.error(f"Translation request failed: {e}")
        response = {
            'status': 'error',
            'message': str(e)
        }
        return jsonify(response), 400


def initialize_translator():
    """Create the module-level ``Translator`` from the command-line arguments.

    Parses the arguments, initialises a ``TranslationConfig`` with them and
    binds the global ``Translator`` to a ``PDFTranslator`` built with the
    configured model name.
    """
    global Translator

    # Parse the command line.
    cli_args = ArgumentParser().parse_arguments()

    # Initialise the configuration object from the parsed arguments.
    settings = TranslationConfig()
    settings.initialize(cli_args)

    # Instantiate the PDFTranslator used by the request handler.
    Translator = PDFTranslator(settings.model_name)


if __name__ == "__main__":
    # Build the shared translator before the server starts accepting requests.
    initialize_translator()
    # The server listens on all interfaces, and the Werkzeug debugger allows
    # arbitrary code execution, so debug mode is opt-in (FLASK_DEBUG=1) rather
    # than always on.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").lower() in ("1", "true")
    # Start the Flask web server.
    app.run(host="0.0.0.0", port=5000, debug=debug_enabled)
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import sys
import os
import gradio as gr


sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from utils import ArgumentParser, LOG
from translator import PDFTranslator, TranslationConfig


def translation(input_file, source_language, target_language, translate_style):
    """Translate the uploaded PDF and return the path of the translated file.

    Args:
        input_file: The uploaded file object from the Gradio file input; its
            ``name`` attribute is used as the path of the PDF to translate.
        source_language: Language of the source document.
        target_language: Language to translate into.
        translate_style: Translation style selected in the UI.

    Returns:
        The output file path returned by ``Translator.translate_pdf``.

    Raises:
        gr.Error: If no file was uploaded.
    """
    # Without an upload Gradio passes None; fail with a readable UI error
    # instead of an AttributeError on ``input_file.name``.
    if input_file is None:
        raise gr.Error("请先上传PDF文件")

    LOG.debug(
        f"[翻译任务]\n源文件: {input_file.name}\n源语言: {source_language}\n目标语言: {target_language}\n翻译风格: {translate_style}")

    output_file_path = Translator.translate_pdf(
        input_file.name, source_language=source_language,
        target_language=target_language, translate_style=translate_style
    )

    return output_file_path

def launch_gradio():
    """Build the Gradio interface around ``translation`` and launch it.

    The interface takes a PDF upload, source and target language text boxes
    and a translation-style selector, and offers the translated file for
    download. It is launched with ``share=True`` on ``0.0.0.0``.
    """
    iface = gr.Interface(
        fn=translation,
        title="[Homework]OpenAI-Translator v4(接入GLM3-6b)",
        inputs=[
            gr.File(label="上传PDF文件"),
            gr.Textbox(label="源语言(默认:英文)", placeholder="English", value="English"),
            gr.Textbox(label="目标语言(默认:中文)", placeholder="Chinese", value="Chinese"),
            # Give the selector a label like the other inputs and a default
            # choice, so an untouched form does not submit None as the style.
            gr.Radio(
                ["Normal people", "Children", "Professor"],
                label="翻译风格",
                value="Normal people",
            ),
        ],
        outputs=[
            gr.File(label="下载翻译文件")
        ],
        allow_flagging="never"
    )

    iface.launch(share=True, server_name="0.0.0.0")

def initialize_translator():
    """Create the module-level ``Translator`` from the command-line arguments.

    Parses the arguments, initialises a ``TranslationConfig`` with them and
    binds the global ``Translator`` to a ``PDFTranslator`` built with the
    configured model name.
    """
    global Translator

    # Parse the command line.
    cli_args = ArgumentParser().parse_arguments()

    # Initialise the configuration object from the parsed arguments.
    settings = TranslationConfig()
    settings.initialize(cli_args)

    # Instantiate the PDFTranslator used by the Gradio callback.
    Translator = PDFTranslator(settings.model_name)


if __name__ == "__main__":
    # Script entry point: the translator must exist before the UI can call it.
    initialize_translator()
    # Start the Gradio service.
    launch_gradio()
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
测试数据
这个数据集包含了由OpenAI的AI语言模型ChatGPT提供的两个测试样本。
这些样本包括一个Markdown表格和一个英文文本段落,可以用来测试支持文本和表格格式的英译中翻译软件。
文本测试
快速的棕色狐狸跳过懒狗。这个句子包含了英语字母表中的每个字母至少一次。句子是经常用来测试字体、键盘和其他与文本相关的工具的。除了英语,其他许多语言也有句子。由于语言的独特特点,有些句子更难构造。

| 水果 | 颜色 | 价格(美元) |
| --- | --- | --- |
| 苹果 | 红色 | 1.2 |
| 香蕉 | 黄色 | 0.5 |
| 橙子 | 橙色 | 0.8 |
| 草莓 | 红色 | 2.5 |
| 蓝莓 | 蓝色 | 3.0 |
| 猕猴桃 | 绿色 | 1.0 |
| 芒果 | 橙色 | 1.5 |
| 葡萄 | 紫色 | 2.00 |

---

Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
from .pdf_translator import PDFTranslator
from .translation_config import TranslationConfig
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
class PageOutOfRangeException(Exception):
    """Raised when more pages are requested than the book contains.

    The page count of the book and the requested count are kept on the
    instance as ``book_pages`` and ``requested_pages``.
    """

    def __init__(self, book_pages, requested_pages):
        message = f"Page out of range: Book has {book_pages} pages, but {requested_pages} pages were requested."
        super().__init__(message)
        self.requested_pages = requested_pages
        self.book_pages = book_pages
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import pdfplumber
from typing import Optional
from book import Book, Page, Content, ContentType, TableContent
from translator.exceptions import PageOutOfRangeException
from utils import LOG


class PDFParser:
    """Extracts text and table content from a PDF file into a ``Book``."""

    def __init__(self):
        pass

    def parse_pdf(self, pdf_file_path: str, pages: Optional[int] = None) -> Book:
        """Parse the PDF at ``pdf_file_path`` into a ``Book``.

        For every parsed page the text is extracted, the text of each table
        cell is removed from it once, and the remaining text (if any) is added
        as TEXT content. If the page has tables, they are added as a single
        ``TableContent``.

        Args:
            pdf_file_path: Path of the PDF to open.
            pages: Number of leading pages to parse; ``None`` parses all pages.

        Returns:
            The populated ``Book``.

        Raises:
            PageOutOfRangeException: If ``pages`` is larger than the number of
                pages in the PDF.
        """
        book = Book(pdf_file_path)

        with pdfplumber.open(pdf_file_path) as pdf:
            if pages is not None and pages > len(pdf.pages):
                raise PageOutOfRangeException(len(pdf.pages), pages)

            pages_to_parse = pdf.pages if pages is None else pdf.pages[:pages]

            for pdf_page in pages_to_parse:
                page = Page()

                # Store the original text content. Normalise a missing result
                # to "" so the cell-removal loop below can always call
                # str.replace on it.
                raw_text = pdf_page.extract_text() or ""
                tables = pdf_page.extract_tables()

                # Remove each cell's content from the original text
                for table_data in tables:
                    for row in table_data:
                        for cell in row:
                            # Missing cells can be reported as None (and empty
                            # ones as ""); neither has text to strip, and None
                            # would make str.replace raise TypeError.
                            if cell:
                                raw_text = raw_text.replace(cell, "", 1)

                # Handling text
                if raw_text:
                    # Remove empty lines and leading/trailing whitespaces
                    stripped_lines = (line.strip() for line in raw_text.splitlines())
                    cleaned_raw_text = "\n".join(line for line in stripped_lines if line)

                    text_content = Content(content_type=ContentType.TEXT, original=cleaned_raw_text)
                    page.add_content(text_content)
                    LOG.debug(f"[raw_text]\n {cleaned_raw_text}")

                # Handling tables
                if tables:
                    table = TableContent(tables)
                    page.add_content(table)
                    LOG.debug(f"[table]\n{table}")

                book.add_page(page)

        return book
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
from typing import Optional
from translator.pdf_parser import PDFParser
from translator.writer import Writer
from translator.translation_chain import TranslationChain

class PDFTranslator:
    """Parses a PDF, translates every content item and writes the result."""

    def __init__(self, model_name: str):
        """Create the translation chain for ``model_name`` plus a parser and a writer."""
        self.translate_chain = TranslationChain(model_name)
        self.pdf_parser = PDFParser()
        self.writer = Writer()

    def translate_pdf(self,
                      input_file: str,
                      output_file_format: str = 'markdown',
                      source_language: str = "English",
                      target_language: str = 'Chinese',
                      translate_style: str = "Normal Style",
                      pages: Optional[int] = None):
        """Translate ``input_file`` and save it in ``output_file_format``.

        The parsed book is kept on ``self.book``. Each content item of each
        page is run through the translation chain and updated in place with
        the returned translation and status.

        Returns:
            Whatever ``Writer.save_translated_book`` returns for the
            translated book.
        """
        self.book = self.pdf_parser.parse_pdf(input_file, pages)

        for current_page in self.book.pages:
            for item in current_page.contents:
                # Translate the item, then store the result on that same item.
                result, ok = self.translate_chain.run(
                    item, source_language, target_language, translate_style)
                item.set_translation(result, ok)

        return self.writer.save_translated_book(self.book, output_file_format)
Loading