Skip to content

Commit b10bee8

Browse files
committed
DeepSeek-OCR Markdown output
1 parent e0a7521 commit b10bee8

File tree

4 files changed

+147
-61
lines changed

4 files changed

+147
-61
lines changed

xinference/client/restful/async_restful_client.py

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -607,7 +607,6 @@ async def ocr(
607607
save_results: bool = False,
608608
save_dir: Optional[str] = None,
609609
eval_mode: bool = False,
610-
clean_annotations: bool = False,
611610
**kwargs
612611
):
613612
"""
@@ -647,7 +646,6 @@ async def ocr(
647646
"save_results": save_results,
648647
"save_dir": save_dir,
649648
"eval_mode": eval_mode,
650-
"clean_annotations": clean_annotations,
651649
**kwargs
652650
}
653651

xinference/client/restful/restful_client.py

Lines changed: 0 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -544,7 +544,6 @@ def ocr(
544544
save_results: bool = False,
545545
save_dir: Optional[str] = None,
546546
eval_mode: bool = False,
547-
clean_annotations: bool = False,
548547
**kwargs
549548
):
550549
"""
@@ -566,8 +565,6 @@ def ocr(
566565
Directory to save results.
567566
eval_mode: bool, optional
568567
Whether to use evaluation mode.
569-
clean_annotations: bool, optional
570-
Whether to clean annotation tags and return plain text.
571568
**kwargs
572569
Additional parameters.
573570
@@ -586,7 +583,6 @@ def ocr(
586583
"save_results": save_results,
587584
"save_dir": save_dir,
588585
"eval_mode": eval_mode,
589-
"clean_annotations": clean_annotations,
590586
**kwargs
591587
}
592588

xinference/model/image/ocr/deepseek_ocr.py

Lines changed: 3 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -433,7 +433,6 @@ def ocr(
433433
save_results: bool = False,
434434
save_dir: Optional[str] = None,
435435
eval_mode: bool = False,
436-
clean_annotations: bool = False,
437436
**kwargs,
438437
) -> Union[Dict[str, Any], List[Dict[str, Any]]]:
439438
"""
@@ -447,7 +446,6 @@ def ocr(
447446
save_results: Whether to save results to files
448447
save_dir: Directory to save results (required if save_results=True)
449448
eval_mode: Whether to use evaluation mode
450-
clean_annotations: Whether to clean annotation tags and return plain text
451449
**kwargs: Additional parameters
452450
453451
Returns:
@@ -465,13 +463,13 @@ def ocr(
465463
# Handle single image input
466464
if isinstance(image, PIL.Image.Image):
467465
return self._ocr_single(
468-
image, prompt, model_size, test_compress, save_results, save_dir, eval_mode, clean_annotations, **kwargs
466+
image, prompt, model_size, test_compress, save_results, save_dir, eval_mode, **kwargs
469467
)
470468
# Handle batch image input
471469
elif isinstance(image, list):
472470
return [
473471
self._ocr_single(
474-
img, prompt, model_size, test_compress, save_results, save_dir, eval_mode, clean_annotations, **kwargs
472+
img, prompt, model_size, test_compress, save_results, save_dir, eval_mode, **kwargs
475473
) for img in image
476474
]
477475
else:
@@ -656,7 +654,6 @@ def _ocr_single(
656654
save_results: bool = False,
657655
save_dir: Optional[str] = None,
658656
eval_mode: bool = False,
659-
clean_annotations: bool = False,
660657
**kwargs
661658
) -> Dict[str, Any]:
662659
"""Perform OCR on a single image with all enhanced features."""
@@ -699,16 +696,9 @@ def _ocr_single(
699696
eval_mode=eval_mode,
700697
)
701698

702-
# Clean annotations if requested
703-
cleaned_text = result
704-
annotations_cleaned = False
705-
if clean_annotations and isinstance(result, str):
706-
cleaned_text = clean_ocr_annotations(result)
707-
annotations_cleaned = True
708-
709699
# Prepare response
710700
response = {
711-
"text": cleaned_text,
701+
"text": result,
712702
"model": "deepseek-ocr",
713703
"success": True,
714704
"model_size": model_size,
@@ -717,11 +707,6 @@ def _ocr_single(
717707
"crop_mode": model_config.crop_mode,
718708
}
719709

720-
# Add annotation info if cleaned
721-
if annotations_cleaned:
722-
response["annotations_cleaned"] = True
723-
response["raw_text"] = result
724-
725710
# Add compression info if tested
726711
if test_compress:
727712
# Calculate compression ratio (simplified version)

xinference/ui/gradio/media_interface.py

Lines changed: 144 additions & 37 deletions
Original file line number | Diff line number | Diff line change
@@ -1239,7 +1239,6 @@ def extract_text_from_image(
12391239
test_compress: bool = False,
12401240
enable_visualization: bool = False,
12411241
save_results: bool = False,
1242-
clean_annotations: bool = False,
12431242
progress=gr.Progress(),
12441243
) -> Union[str, Tuple[str, str, str]]:
12451244
from ...client import RESTfulClient
@@ -1261,10 +1260,12 @@ def extract_text_from_image(
12611260
progress(0.1, desc="Processing image for OCR")
12621261

12631262
# Prepare prompt based on OCR type
1264-
if ocr_type == "format":
1265-
prompt = "<image>\n<|grounding|>Convert the document to markdown."
1266-
else:
1267-
prompt = "<image>\nFree OCR."
1263+
if ocr_type == "markdown":
1264+
prompt = "<image>\nConvert this document to clean markdown format. Extract the text content and format it properly using markdown syntax. Do not include any coordinate annotations or special formatting markers."
1265+
elif ocr_type == "format":
1266+
prompt = "<image>\n<|grounding|>Convert the document to markdown with structure annotations. Include coordinate information for text regions and maintain the document structure."
1267+
else: # ocr
1268+
prompt = "<image>\nFree OCR. Extract all text content from the image."
12681269

12691270
try:
12701271
if enable_visualization and hasattr(model, "visualize_ocr"):
@@ -1285,28 +1286,43 @@ def extract_text_from_image(
12851286
text_result = response.get("text", "No text extracted")
12861287
else:
12871288
error_msg = response.get("error", "OCR visualization failed")
1288-
return f"Error: {error_msg}", "", ""
1289+
# Return formatted error message for Markdown
1290+
error_md = f"**错误**: {error_msg}"
1291+
return error_md, "", ""
12891292
elif isinstance(response, str):
12901293
# Handle string response from original model
12911294
text_result = response
12921295
else:
12931296
text_result = str(response)
12941297

1298+
# Check if the result looks like Markdown and format it properly
1299+
if ocr_type == "markdown" and isinstance(text_result, str):
1300+
# Already in Markdown format, keep as is
1301+
pass
1302+
elif ocr_type == "format" and isinstance(text_result, str):
1303+
# For format mode, keep annotations but format as code block
1304+
if "<|ref|>" in text_result:
1305+
text_result = f"```\n{text_result}\n```"
1306+
elif ocr_type == "ocr" and isinstance(text_result, str):
1307+
# For plain text, format as a simple block
1308+
text_result = text_result # Keep as plain text, Markdown will render it normally
1309+
12951310
# Add compression info if available
12961311
if isinstance(response, dict) and test_compress and "compression_ratio" in response:
1297-
text_result += f"\n\n--- Compression Info ---\n"
1298-
text_result += f"Compression Ratio: {response.get('compression_ratio', 'N/A')}\n"
1299-
text_result += f"Valid Image Tokens: {response.get('valid_image_tokens', 'N/A')}\n"
1300-
text_result += f"Output Text Tokens: {response.get('output_text_tokens', 'N/A')}\n"
1312+
compression_info = f"\n\n--- 压缩比信息 ---\n"
1313+
compression_info += f"压缩比: {response.get('compression_ratio', 'N/A')}\n"
1314+
compression_info += f"有效图像 Tokens: {response.get('valid_image_tokens', 'N/A')}\n"
1315+
compression_info += f"输出文本 Tokens: {response.get('output_text_tokens', 'N/A')}\n"
1316+
text_result += compression_info
13011317

13021318
# Add visualization info
13031319
viz_info = {}
13041320
if isinstance(response, dict):
13051321
viz_info = response.get("visualization", {})
13061322
if viz_info.get("has_annotations"):
1307-
viz_text = f"\n\n--- Visualization Info ---\n"
1308-
viz_text += f"Bounding Boxes: {viz_info.get('num_bounding_boxes', 0)}\n"
1309-
viz_text += f"Extracted Images: {viz_info.get('num_extracted_images', 0)}\n"
1323+
viz_text = f"\n\n--- 可视化信息 ---\n"
1324+
viz_text += f"边界框数量: {viz_info.get('num_bounding_boxes', 0)}\n"
1325+
viz_text += f"提取图像数量: {viz_info.get('num_extracted_images', 0)}\n"
13101326
text_result += viz_text
13111327

13121328
saved_files = response.get("saved_files", {})
@@ -1324,7 +1340,6 @@ def extract_text_from_image(
13241340
test_compress=test_compress,
13251341
save_results=save_results,
13261342
eval_mode=True,
1327-
clean_annotations=clean_annotations,
13281343
)
13291344

13301345
progress(0.8, desc="Extracting text")
@@ -1335,19 +1350,33 @@ def extract_text_from_image(
13351350
text_result = response.get("text", "No text extracted")
13361351
else:
13371352
error_msg = response.get("error", "OCR failed")
1338-
return f"Error: {error_msg}", "", ""
1353+
error_md = f"**错误**: {error_msg}"
1354+
return error_md, "", ""
13391355
elif isinstance(response, str):
13401356
# Handle string response from original model
13411357
text_result = response
13421358
else:
13431359
text_result = str(response)
13441360

1361+
# Format based on OCR type
1362+
if ocr_type == "markdown" and isinstance(text_result, str):
1363+
# Markdown mode - keep as is for proper rendering
1364+
pass
1365+
elif ocr_type == "format" and isinstance(text_result, str):
1366+
# Format mode - show annotations in code block
1367+
if "<|ref|>" in text_result:
1368+
text_result = f"```text\n{text_result}\n```"
1369+
elif ocr_type == "ocr" and isinstance(text_result, str):
1370+
# Plain text mode - keep as plain text
1371+
text_result = text_result
1372+
13451373
# Add compression info if available
13461374
if isinstance(response, dict) and test_compress and "compression_ratio" in response:
1347-
text_result += f"\n\n--- Compression Info ---\n"
1348-
text_result += f"Compression Ratio: {response.get('compression_ratio', 'N/A')}\n"
1349-
text_result += f"Valid Image Tokens: {response.get('valid_image_tokens', 'N/A')}\n"
1350-
text_result += f"Output Text Tokens: {response.get('output_text_tokens', 'N/A')}\n"
1375+
compression_info = f"\n\n--- 压缩比信息 ---\n"
1376+
compression_info += f"压缩比: {response.get('compression_ratio', 'N/A')}\n"
1377+
compression_info += f"有效图像 Tokens: {response.get('valid_image_tokens', 'N/A')}\n"
1378+
compression_info += f"输出文本 Tokens: {response.get('output_text_tokens', 'N/A')}\n"
1379+
text_result += compression_info
13511380

13521381
return text_result, "", ""
13531382

@@ -1381,16 +1410,16 @@ def extract_text_from_image(
13811410
)
13821411

13831412
ocr_type = gr.Dropdown(
1384-
choices=["ocr", "format"],
1413+
choices=["ocr", "format", "markdown"],
13851414
value="ocr",
1386-
label="OCR Type",
1387-
info="ocr: Basic text extraction, format: Document formatting",
1415+
label="Output Format",
1416+
info="ocr: 纯文本提取, format: 结构化文档(含标注), markdown: 标准Markdown格式",
13881417
)
13891418

13901419
enable_visualization = gr.Checkbox(
13911420
label="Enable Visualization",
13921421
value=False,
1393-
info="Generate bounding boxes and annotations (requires document formatting)",
1422+
info="Generate bounding boxes and annotations (仅适用于format模式)",
13941423
)
13951424

13961425
test_compress = gr.Checkbox(
@@ -1405,22 +1434,19 @@ def extract_text_from_image(
14051434
info="Save OCR results to files (if supported)",
14061435
)
14071436

1408-
clean_annotations = gr.Checkbox(
1409-
label="Clean Annotations",
1410-
value=True,
1411-
info="Remove annotation tags and return plain text",
1412-
)
1413-
1437+
14141438
extract_btn = gr.Button("Extract Text", variant="primary")
14151439

14161440
with gr.Column(scale=1):
1417-
text_output = gr.Textbox(
1418-
label="Extracted Text",
1419-
lines=20,
1420-
placeholder="Extracted text will appear here...",
1421-
interactive=True,
1422-
show_copy_button=True,
1423-
)
1441+
# Create a bordered container for the output
1442+
with gr.Group(elem_classes="output-container"):
1443+
gr.Markdown("### 📄 提取结果")
1444+
1445+
text_output = gr.Markdown(
1446+
value="提取的文本将在这里显示...",
1447+
elem_classes="output-text",
1448+
container=False
1449+
)
14241450

14251451
# Additional info outputs (hidden by default)
14261452
viz_info_output = gr.Textbox(
@@ -1463,7 +1489,7 @@ def toggle_additional_outputs(enable_viz):
14631489
# Extract button click event
14641490
extract_btn.click(
14651491
fn=extract_text_from_image,
1466-
inputs=[image_input, ocr_type, model_size, test_compress, enable_visualization, save_results, clean_annotations],
1492+
inputs=[image_input, ocr_type, model_size, test_compress, enable_visualization, save_results],
14671493
outputs=[text_output, viz_info_output, file_info_output],
14681494
)
14691495

@@ -1490,6 +1516,87 @@ def build_main_interface(self) -> "gr.Blocks":
14901516
padding: 0px;
14911517
color: #9ea4b0 !important;
14921518
}
1519+
1520+
.output-container {
1521+
border: 1px solid #e0e0e0;
1522+
border-radius: 8px;
1523+
padding: 16px;
1524+
background-color: #f8f9fa;
1525+
margin: 8px 0;
1526+
}
1527+
1528+
.output-text {
1529+
background-color: white;
1530+
border: 1px solid #dee2e6;
1531+
border-radius: 6px;
1532+
padding: 16px;
1533+
min-height: 200px;
1534+
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
1535+
line-height: 1.6;
1536+
}
1537+
1538+
.output-text h1, .output-text h2, .output-text h3,
1539+
.output-text h4, .output-text h5, .output-text h6 {
1540+
margin-top: 0.5em !important;
1541+
margin-bottom: 0.5em !important;
1542+
color: #2d3748 !important;
1543+
}
1544+
1545+
.output-text p {
1546+
margin: 0.5em 0 !important;
1547+
}
1548+
1549+
.output-text pre {
1550+
background-color: #f6f8fa !important;
1551+
border: 1px solid #e9ecef !important;
1552+
border-radius: 4px !important;
1553+
padding: 12px !important;
1554+
margin: 8px 0 !important;
1555+
}
1556+
1557+
.output-text code {
1558+
background-color: #e9ecef !important;
1559+
padding: 2px 4px !important;
1560+
border-radius: 3px !important;
1561+
font-family: "SFMono-Regular", Consolas, "Liberation Mono", Menlo, Courier, monospace !important;
1562+
}
1563+
1564+
.output-text ul, .output-text ol {
1565+
margin: 0.5em 0 !important;
1566+
padding-left: 20px !important;
1567+
}
1568+
1569+
.output-text blockquote {
1570+
border-left: 4px solid #6c757d !important;
1571+
padding-left: 16px !important;
1572+
margin: 0.5em 0 !important;
1573+
color: #6c757d !important;
1574+
background-color: #f8f9fa !important;
1575+
}
1576+
1577+
.output-text table {
1578+
border-collapse: collapse !important;
1579+
width: 100% !important;
1580+
margin: 8px 0 !important;
1581+
}
1582+
1583+
.output-text th, .output-text td {
1584+
border: 1px solid #dee2e6 !important;
1585+
padding: 8px 12px !important;
1586+
text-align: left !important;
1587+
}
1588+
1589+
.output-text th {
1590+
background-color: #f8f9fa !important;
1591+
font-weight: bold !important;
1592+
}
1593+
1594+
/* 确保 Markdown 正确显示 */
1595+
.output-text .katex-display {
1596+
display: block !important;
1597+
text-align: center !important;
1598+
margin: 1em 0 !important;
1599+
}
14931600
""",
14941601
analytics_enabled=False,
14951602
) as app:

0 commit comments

Comments
 (0)