@@ -1239,7 +1239,6 @@ def extract_text_from_image(
12391239 test_compress : bool = False ,
12401240 enable_visualization : bool = False ,
12411241 save_results : bool = False ,
1242- clean_annotations : bool = False ,
12431242 progress = gr .Progress (),
12441243 ) -> Union [str , Tuple [str , str , str ]]:
12451244 from ...client import RESTfulClient
@@ -1261,10 +1260,12 @@ def extract_text_from_image(
12611260 progress (0.1 , desc = "Processing image for OCR" )
12621261
12631262 # Prepare prompt based on OCR type
1264- if ocr_type == "format" :
1265- prompt = "<image>\n <|grounding|>Convert the document to markdown."
1266- else :
1267- prompt = "<image>\n Free OCR."
1263+ if ocr_type == "markdown" :
1264+ prompt = "<image>\n Convert this document to clean markdown format. Extract the text content and format it properly using markdown syntax. Do not include any coordinate annotations or special formatting markers."
1265+ elif ocr_type == "format" :
1266+ prompt = "<image>\n <|grounding|>Convert the document to markdown with structure annotations. Include coordinate information for text regions and maintain the document structure."
1267+ else : # ocr
1268+ prompt = "<image>\n Free OCR. Extract all text content from the image."
12681269
12691270 try :
12701271 if enable_visualization and hasattr (model , "visualize_ocr" ):
@@ -1285,28 +1286,43 @@ def extract_text_from_image(
12851286 text_result = response .get ("text" , "No text extracted" )
12861287 else :
12871288 error_msg = response .get ("error" , "OCR visualization failed" )
1288- return f"Error: { error_msg } " , "" , ""
1289+ # Return formatted error message for Markdown
1290+ error_md = f"**错误**: { error_msg } "
1291+ return error_md , "" , ""
12891292 elif isinstance (response , str ):
12901293 # Handle string response from original model
12911294 text_result = response
12921295 else :
12931296 text_result = str (response )
12941297
1298+ # Check if the result looks like Markdown and format it properly
1299+ if ocr_type == "markdown" and isinstance (text_result , str ):
1300+ # Already in Markdown format, keep as is
1301+ pass
1302+ elif ocr_type == "format" and isinstance (text_result , str ):
1303+ # For format mode, keep annotations but format as code block
1304+ if "<|ref|>" in text_result :
1305+ text_result = f"```\n { text_result } \n ```"
1306+ elif ocr_type == "ocr" and isinstance (text_result , str ):
1307+ # For plain text, format as a simple block
1308+ text_result = text_result # Keep as plain text, Markdown will render it normally
1309+
12951310 # Add compression info if available
12961311 if isinstance (response , dict ) and test_compress and "compression_ratio" in response :
1297- text_result += f"\n \n --- Compression Info ---\n "
1298- text_result += f"Compression Ratio: { response .get ('compression_ratio' , 'N/A' )} \n "
1299- text_result += f"Valid Image Tokens: { response .get ('valid_image_tokens' , 'N/A' )} \n "
1300- text_result += f"Output Text Tokens: { response .get ('output_text_tokens' , 'N/A' )} \n "
1312+ compression_info = f"\n \n --- 压缩比信息 ---\n "
1313+ compression_info += f"压缩比: { response .get ('compression_ratio' , 'N/A' )} \n "
1314+ compression_info += f"有效图像 Tokens: { response .get ('valid_image_tokens' , 'N/A' )} \n "
1315+ compression_info += f"输出文本 Tokens: { response .get ('output_text_tokens' , 'N/A' )} \n "
1316+ text_result += compression_info
13011317
13021318 # Add visualization info
13031319 viz_info = {}
13041320 if isinstance (response , dict ):
13051321 viz_info = response .get ("visualization" , {})
13061322 if viz_info .get ("has_annotations" ):
1307- viz_text = f"\n \n --- Visualization Info ---\n "
1308- viz_text += f"Bounding Boxes : { viz_info .get ('num_bounding_boxes' , 0 )} \n "
1309- viz_text += f"Extracted Images : { viz_info .get ('num_extracted_images' , 0 )} \n "
1323+ viz_text = f"\n \n --- 可视化信息 ---\n "
1324+ viz_text += f"边界框数量 : { viz_info .get ('num_bounding_boxes' , 0 )} \n "
1325+ viz_text += f"提取图像数量 : { viz_info .get ('num_extracted_images' , 0 )} \n "
13101326 text_result += viz_text
13111327
13121328 saved_files = response .get ("saved_files" , {})
@@ -1324,7 +1340,6 @@ def extract_text_from_image(
13241340 test_compress = test_compress ,
13251341 save_results = save_results ,
13261342 eval_mode = True ,
1327- clean_annotations = clean_annotations ,
13281343 )
13291344
13301345 progress (0.8 , desc = "Extracting text" )
@@ -1335,19 +1350,33 @@ def extract_text_from_image(
13351350 text_result = response .get ("text" , "No text extracted" )
13361351 else :
13371352 error_msg = response .get ("error" , "OCR failed" )
1338- return f"Error: { error_msg } " , "" , ""
1353+ error_md = f"**错误**: { error_msg } "
1354+ return error_md , "" , ""
13391355 elif isinstance (response , str ):
13401356 # Handle string response from original model
13411357 text_result = response
13421358 else :
13431359 text_result = str (response )
13441360
1361+ # Format based on OCR type
1362+ if ocr_type == "markdown" and isinstance (text_result , str ):
1363+ # Markdown mode - keep as is for proper rendering
1364+ pass
1365+ elif ocr_type == "format" and isinstance (text_result , str ):
1366+ # Format mode - show annotations in code block
1367+ if "<|ref|>" in text_result :
1368+ text_result = f"```text\n { text_result } \n ```"
1369+ elif ocr_type == "ocr" and isinstance (text_result , str ):
1370+ # Plain text mode - keep as plain text
1371+ text_result = text_result
1372+
13451373 # Add compression info if available
13461374 if isinstance (response , dict ) and test_compress and "compression_ratio" in response :
1347- text_result += f"\n \n --- Compression Info ---\n "
1348- text_result += f"Compression Ratio: { response .get ('compression_ratio' , 'N/A' )} \n "
1349- text_result += f"Valid Image Tokens: { response .get ('valid_image_tokens' , 'N/A' )} \n "
1350- text_result += f"Output Text Tokens: { response .get ('output_text_tokens' , 'N/A' )} \n "
1375+ compression_info = f"\n \n --- 压缩比信息 ---\n "
1376+ compression_info += f"压缩比: { response .get ('compression_ratio' , 'N/A' )} \n "
1377+ compression_info += f"有效图像 Tokens: { response .get ('valid_image_tokens' , 'N/A' )} \n "
1378+ compression_info += f"输出文本 Tokens: { response .get ('output_text_tokens' , 'N/A' )} \n "
1379+ text_result += compression_info
13511380
13521381 return text_result , "" , ""
13531382
@@ -1381,16 +1410,16 @@ def extract_text_from_image(
13811410 )
13821411
13831412 ocr_type = gr .Dropdown (
1384- choices = ["ocr" , "format" ],
1413+ choices = ["ocr" , "format" , "markdown" ],
13851414 value = "ocr" ,
1386- label = "OCR Type " ,
1387- info = "ocr: Basic text extraction , format: Document formatting " ,
1415+ label = "Output Format " ,
1416+ info = "ocr: 纯文本提取 , format: 结构化文档(含标注), markdown: 标准Markdown格式 " ,
13881417 )
13891418
13901419 enable_visualization = gr .Checkbox (
13911420 label = "Enable Visualization" ,
13921421 value = False ,
1393- info = "Generate bounding boxes and annotations (requires document formatting )" ,
1422+ info = "Generate bounding boxes and annotations (仅适用于format模式 )" ,
13941423 )
13951424
13961425 test_compress = gr .Checkbox (
@@ -1405,22 +1434,19 @@ def extract_text_from_image(
14051434 info = "Save OCR results to files (if supported)" ,
14061435 )
14071436
1408- clean_annotations = gr .Checkbox (
1409- label = "Clean Annotations" ,
1410- value = True ,
1411- info = "Remove annotation tags and return plain text" ,
1412- )
1413-
1437+
14141438 extract_btn = gr .Button ("Extract Text" , variant = "primary" )
14151439
14161440 with gr .Column (scale = 1 ):
1417- text_output = gr .Textbox (
1418- label = "Extracted Text" ,
1419- lines = 20 ,
1420- placeholder = "Extracted text will appear here..." ,
1421- interactive = True ,
1422- show_copy_button = True ,
1423- )
1441+ # Create a bordered container for the output
1442+ with gr .Group (elem_classes = "output-container" ):
1443+ gr .Markdown ("### 📄 提取结果" )
1444+
1445+ text_output = gr .Markdown (
1446+ value = "提取的文本将在这里显示..." ,
1447+ elem_classes = "output-text" ,
1448+ container = False
1449+ )
14241450
14251451 # Additional info outputs (hidden by default)
14261452 viz_info_output = gr .Textbox (
@@ -1463,7 +1489,7 @@ def toggle_additional_outputs(enable_viz):
14631489 # Extract button click event
14641490 extract_btn .click (
14651491 fn = extract_text_from_image ,
1466- inputs = [image_input , ocr_type , model_size , test_compress , enable_visualization , save_results , clean_annotations ],
1492+ inputs = [image_input , ocr_type , model_size , test_compress , enable_visualization , save_results ],
14671493 outputs = [text_output , viz_info_output , file_info_output ],
14681494 )
14691495
@@ -1490,6 +1516,87 @@ def build_main_interface(self) -> "gr.Blocks":
14901516 padding: 0px;
14911517 color: #9ea4b0 !important;
14921518 }
1519+
1520+ .output-container {
1521+ border: 1px solid #e0e0e0;
1522+ border-radius: 8px;
1523+ padding: 16px;
1524+ background-color: #f8f9fa;
1525+ margin: 8px 0;
1526+ }
1527+
1528+ .output-text {
1529+ background-color: white;
1530+ border: 1px solid #dee2e6;
1531+ border-radius: 6px;
1532+ padding: 16px;
1533+ min-height: 200px;
1534+ font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
1535+ line-height: 1.6;
1536+ }
1537+
1538+ .output-text h1, .output-text h2, .output-text h3,
1539+ .output-text h4, .output-text h5, .output-text h6 {
1540+ margin-top: 0.5em !important;
1541+ margin-bottom: 0.5em !important;
1542+ color: #2d3748 !important;
1543+ }
1544+
1545+ .output-text p {
1546+ margin: 0.5em 0 !important;
1547+ }
1548+
1549+ .output-text pre {
1550+ background-color: #f6f8fa !important;
1551+ border: 1px solid #e9ecef !important;
1552+ border-radius: 4px !important;
1553+ padding: 12px !important;
1554+ margin: 8px 0 !important;
1555+ }
1556+
1557+ .output-text code {
1558+ background-color: #e9ecef !important;
1559+ padding: 2px 4px !important;
1560+ border-radius: 3px !important;
1561+ font-family: "SFMono-Regular", Consolas, "Liberation Mono", Menlo, Courier, monospace !important;
1562+ }
1563+
1564+ .output-text ul, .output-text ol {
1565+ margin: 0.5em 0 !important;
1566+ padding-left: 20px !important;
1567+ }
1568+
1569+ .output-text blockquote {
1570+ border-left: 4px solid #6c757d !important;
1571+ padding-left: 16px !important;
1572+ margin: 0.5em 0 !important;
1573+ color: #6c757d !important;
1574+ background-color: #f8f9fa !important;
1575+ }
1576+
1577+ .output-text table {
1578+ border-collapse: collapse !important;
1579+ width: 100% !important;
1580+ margin: 8px 0 !important;
1581+ }
1582+
1583+ .output-text th, .output-text td {
1584+ border: 1px solid #dee2e6 !important;
1585+ padding: 8px 12px !important;
1586+ text-align: left !important;
1587+ }
1588+
1589+ .output-text th {
1590+ background-color: #f8f9fa !important;
1591+ font-weight: bold !important;
1592+ }
1593+
1594+ /* 确保 Markdown 正确显示 */
1595+ .output-text .katex-display {
1596+ display: block !important;
1597+ text-align: center !important;
1598+ margin: 1em 0 !important;
1599+ }
14931600 """ ,
14941601 analytics_enabled = False ,
14951602 ) as app :
0 commit comments