Skip to content

Commit e0f10bd

Browse files
committed
feat: add advanced recognition sample
1 parent e2797a7 commit e0f10bd

15 files changed

+543
-44
lines changed

Create-TestFiles.ps1

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
<#
.SYNOPSIS
Creates a standard set of empty HTTP request/response fixture files for testing,
driven by command-line arguments.

.DESCRIPTION
Accepts a name string S and a type P ('nosse' or 'sse'), then creates four empty
files under 'src/Cnblogs.DashScope.Tests.Shared/RawHttpData':
  - S-P.request.header.txt
  - S-P.request.body.json
  - S-P.response.header.txt
  - S-P.response.body.txt
The target directory is created automatically when it does not exist.

.PARAMETER S
File name prefix, e.g. GetChatCompletion.

.PARAMETER P
Request type; must be either 'nosse' or 'sse'.

.EXAMPLE
# Named parameters
.\Create-TestFiles.ps1 -S "GetChatCompletion" -P "nosse"

.EXAMPLE
# Positional parameters (first maps to -S, second to -P)
.\Create-TestFiles.ps1 "StreamChat" "sse"

.EXAMPLE
# Invalid type is rejected by parameter validation
.\Create-TestFiles.ps1 -S "Test" -P "invalid"
# Error: Cannot validate argument on parameter 'P'. The argument "invalid" does not belong to the set "nosse","sse"...
#>

param (
    [Parameter(Mandatory=$true, HelpMessage="请输入字符串 S,例如: GetChatCompletion")]
    [string]$S,

    [Parameter(Mandatory=$true, HelpMessage="请输入类型 P,只能是 'nosse' 或 'sse'")]
    [ValidateSet("nosse", "sse")]
    [string]$P
)

# Fixture root and the shared "<S>-<P>" file name stem.
$basePath = "src/Cnblogs.DashScope.Tests.Shared/RawHttpData"
$baseFileName = "$S-$P"

# Make sure the target directory exists before touching any files.
if (-not (Test-Path -Path $basePath -PathType Container)) {
    Write-Host "目录 '$basePath' 不存在,正在创建..." -ForegroundColor Yellow
    New-Item -Path $basePath -ItemType Directory -Force | Out-Null
}

Write-Host "开始为 '$baseFileName' 创建文件..."

# Create (or overwrite, via -Force) each of the four empty fixture files.
@('request.header.txt', 'request.body.json', 'response.header.txt', 'response.body.txt') |
    ForEach-Object {
        $fullPath = Join-Path -Path $basePath -ChildPath "$baseFileName.$_"
        New-Item -Path $fullPath -ItemType File -Force | Out-Null
        Write-Host "已创建: '$fullPath'" -ForegroundColor Cyan
    }

Write-Host "所有文件创建完成!" -ForegroundColor Green

README.md

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -650,6 +650,91 @@ if (usage != null)
650650
}
651651
```
652652

653+
#### Built-in Tasks
654+
655+
##### Advanced Recognition
656+
657+
When using this task, do not enable streaming. Otherwise, `completion.Output.Choices[0].Message.Content[0].OcrResult.WordsInfo` will be `null`.
658+
659+
In addition to the standard text content, this task also returns the coordinates of the text.
660+
661+
To call this built-in task, set `Parameters.OcrOptions.Task` to `advanced_recognition`. No additional prompt is required.
662+
```csharp
663+
var messages = new List<MultimodalMessage>();
664+
messages.Add(
665+
MultimodalMessage.User(
666+
[
667+
MultimodalMessageContent.ImageContent(ossLink),
668+
]));
669+
var completion = client.GetMultimodalGenerationAsync(
670+
new ModelRequest<MultimodalInput, IMultimodalParameters>()
671+
{
672+
Model = "qwen-vl-ocr-latest",
673+
Input = new MultimodalInput() { Messages = messages },
674+
Parameters = new MultimodalParameters()
675+
{
676+
OcrOptions = new MultimodalOcrOptions()
677+
{
678+
Task = "advanced_recognition"
679+
}
680+
}
681+
});
682+
```
683+
684+
Output usage:
685+
686+
```csharp
687+
Console.WriteLine("Text:");
688+
Console.WriteLine(completion.Output.Choices[0].Message.Content[0].Text);
689+
Console.WriteLine("WordsInfo:");
690+
foreach (var info in completion.Output.Choices[0].Message.Content[0].OcrResult!.WordsInfo!)
691+
{
692+
var location = $"[{string.Join(',', info.Location)}]";
693+
var rect = $"[{string.Join(',', info.RotateRect)}]";
694+
Console.WriteLine(info.Text);
695+
Console.WriteLine($"Location: {location}");
696+
Console.WriteLine($"RotateRect: {rect}");
697+
Console.WriteLine();
698+
}
699+
```
700+
701+
Output:
702+
703+
````csharp
704+
Text:
705+
```json
706+
[
707+
{"rotate_rect": [236, 254, 115, 299, 90], "text": "OpenAI 兼容"},
708+
{"rotate_rect": [646, 254, 115, 269, 90], "text": "DashScope"},
709+
{"rotate_rect": [236, 684, 115, 163, 90], "text": "Python"},
710+
{"rotate_rect": [492, 684, 115, 105, 90], "text": "Java"},
711+
{"rotate_rect": [712, 684, 115, 85, 90], "text": "curl"}
712+
]
713+
```
714+
WordsInfo:
715+
OpenAI 兼容
716+
Location: [46,55,205,55,205,87,46,87]
717+
RotateRect: [125,71,159,32,0]
718+
719+
DashScope
720+
Location: [272,55,415,55,415,87,272,87]
721+
RotateRect: [344,71,32,143,90]
722+
723+
Python
724+
Location: [82,175,169,175,169,207,82,207]
725+
RotateRect: [126,191,32,87,90]
726+
727+
Java
728+
Location: [234,175,289,175,289,207,234,207]
729+
RotateRect: [262,191,55,32,0]
730+
731+
curl
732+
Location: [356,175,401,175,401,207,356,207]
733+
RotateRect: [378,191,32,45,90]
734+
````
735+
736+
737+
653738
## Text-to-Speech
654739

655740
Create a speech synthesis session using `dashScopeClient.CreateSpeechSynthesizerSocketSessionAsync()`.

README.zh-Hans.md

Lines changed: 89 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2331,6 +2331,7 @@ Usage: in(721)/out(7505)/total(0)
23312331
您也可以通过 `UploadTemporaryFileAsync` 方法上传临时文件获取 `oss://` 开头的链接。
23322332
23332333
```csharp
2334+
// 上传本地文件
23342335
await using var lenna = File.OpenRead("Lenna.jpg");
23352336
string ossLink = await client.UploadTemporaryFileAsync("qwen3-vl-plus", lenna, "lenna.jpg");
23362337
Console.WriteLine($"File uploaded: {ossLink}");
@@ -2450,7 +2451,7 @@ messages.Add(
24502451
24512452
### 文字提取
24522453
2453-
使用 `qwen-vl-ocr` 系列模型可以很好的完成文字提取任务,基础用法(使用本地文件)
2454+
使用 `qwen-vl-ocr` 系列模型可以很好的完成文字提取任务,基础用法:
24542455
24552456
```csharp
24562457
// upload file
@@ -2462,6 +2463,7 @@ messages.Add(
24622463
MultimodalMessage.User(
24632464
[
24642465
// 如果你的图片存在偏斜,可尝试将 enableRotate 设置为 true
2466+
// 除了本地上传外,您也可以直接传入公网 URL
24652467
MultimodalMessageContent.ImageContent(ossLink, enableRotate: true),
24662468
]));
24672469
var completion = client.GetMultimodalGenerationStreamAsync(
@@ -2545,6 +2547,92 @@ Usage: in(2434)/out(155)/image(2410)/total(2589)
25452547
*/
25462548
```
25472549
2550+
#### 调用内置任务
2551+
2552+
##### 高精识别
2553+
2554+
使用这个任务时,不要开启流式传输,否则 `completion.Output.Choices[0].Message.Content[0].OcrResult.WordsInfo` 将为 `null`。
2555+
2556+
除了常规的返回文字内容外,该任务还会返回文字的坐标。
2557+
2558+
设置 `Parameters.OcrOptions.Task` 为 `advanced_recognition` 即可调用该内置任务,不需要传入额外的 Prompt。
2559+
2560+
```csharp
2561+
var messages = new List<MultimodalMessage>();
2562+
messages.Add(
2563+
MultimodalMessage.User(
2564+
[
2565+
MultimodalMessageContent.ImageContent(ossLink),
2566+
]));
2567+
var completion = client.GetMultimodalGenerationAsync(
2568+
new ModelRequest<MultimodalInput, IMultimodalParameters>()
2569+
{
2570+
Model = "qwen-vl-ocr-latest",
2571+
Input = new MultimodalInput() { Messages = messages },
2572+
Parameters = new MultimodalParameters()
2573+
{
2574+
OcrOptions = new MultimodalOcrOptions()
2575+
{
2576+
Task = "advanced_recognition"
2577+
}
2578+
}
2579+
});
2580+
```
2581+
2582+
任务返回的文字是一个 JSON 代码块,包含文本坐标和文本内容。您可以使用 `completion.Output.Choices[0].Message.Content[0].OcrResult.WordsInfo` 直接访问结果,不需要手动反序列化模型返回的代码块。
2583+
2584+
示例:
2585+
2586+
```csharp
2587+
Console.WriteLine("Text:");
2588+
Console.WriteLine(completion.Output.Choices[0].Message.Content[0].Text);
2589+
Console.WriteLine("WordsInfo:");
2590+
foreach (var info in completion.Output.Choices[0].Message.Content[0].OcrResult!.WordsInfo!)
2591+
{
2592+
var location = $"[{string.Join(',', info.Location)}]";
2593+
var rect = $"[{string.Join(',', info.RotateRect)}]";
2594+
Console.WriteLine(info.Text);
2595+
Console.WriteLine($"Location: {location}");
2596+
Console.WriteLine($"RotateRect: {rect}");
2597+
Console.WriteLine();
2598+
}
2599+
```
2600+
2601+
输出结果:
2602+
2603+
````csharp
2604+
Text:
2605+
```json
2606+
[
2607+
{"rotate_rect": [236, 254, 115, 299, 90], "text": "OpenAI 兼容"},
2608+
{"rotate_rect": [646, 254, 115, 269, 90], "text": "DashScope"},
2609+
{"rotate_rect": [236, 684, 115, 163, 90], "text": "Python"},
2610+
{"rotate_rect": [492, 684, 115, 105, 90], "text": "Java"},
2611+
{"rotate_rect": [712, 684, 115, 85, 90], "text": "curl"}
2612+
]
2613+
```
2614+
WordsInfo:
2615+
OpenAI 兼容
2616+
Location: [46,55,205,55,205,87,46,87]
2617+
RotateRect: [125,71,159,32,0]
2618+
2619+
DashScope
2620+
Location: [272,55,415,55,415,87,272,87]
2621+
RotateRect: [344,71,32,143,90]
2622+
2623+
Python
2624+
Location: [82,175,169,175,169,207,82,207]
2625+
RotateRect: [126,191,32,87,90]
2626+
2627+
Java
2628+
Location: [234,175,289,175,289,207,234,207]
2629+
RotateRect: [262,191,55,32,0]
2630+
2631+
curl
2632+
Location: [356,175,401,175,401,207,356,207]
2633+
RotateRect: [378,191,32,45,90]
2634+
````
2635+
25482636
25492637
25502638
## 语音合成

sample/Cnblogs.DashScope.Sample/Cnblogs.DashScope.Sample.csproj

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,9 @@
3232
<None Update="tilted.png">
3333
<CopyToOutputDirectory>Always</CopyToOutputDirectory>
3434
</None>
35+
<None Update="webpage.jpg">
36+
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
37+
</None>
3538
</ItemGroup>
3639

3740
<ItemGroup>
Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
using Cnblogs.DashScope.Core;
2+
3+
namespace Cnblogs.DashScope.Sample.Multimodal;
4+
5+
/// <summary>
/// Demonstrates the OCR built-in "advanced_recognition" task, which returns
/// recognized text together with per-word coordinates.
/// </summary>
public class OcrAdvancedRecognitionSample : ISample
{
    /// <inheritdoc />
    public string Description => "OCR Advanced Recognition Task Sample";

    /// <inheritdoc />
    public async Task RunAsync(IDashScopeClient client)
    {
        // Upload the local image to obtain a temporary oss:// link the model can read.
        await using var file = File.OpenRead("webpage.jpg");
        var ossLink = await client.UploadTemporaryFileAsync("qwen-vl-ocr-latest", file, "webpage.jpg");
        Console.WriteLine($"File uploaded: {ossLink}");

        // The built-in task requires no prompt text, only the image content.
        List<MultimodalMessage> messages =
            [MultimodalMessage.User([MultimodalMessageContent.ImageContent(ossLink)])];
        var request = new ModelRequest<MultimodalInput, IMultimodalParameters>
        {
            Model = "qwen-vl-ocr-latest",
            Input = new MultimodalInput { Messages = messages },
            Parameters = new MultimodalParameters
            {
                OcrOptions = new MultimodalOcrOptions { Task = "advanced_recognition" }
            }
        };

        // Non-streaming call on purpose: with streaming enabled, OcrResult.WordsInfo is not populated.
        var completion = await client.GetMultimodalGenerationAsync(request);

        var content = completion.Output.Choices[0].Message.Content[0];
        Console.WriteLine("Text:");
        Console.WriteLine(content.Text);
        Console.WriteLine("WordsInfo:");
        foreach (var info in content.OcrResult!.WordsInfo!)
        {
            Console.WriteLine(info.Text);
            Console.WriteLine($"Location: [{string.Join(',', info.Location)}]");
            Console.WriteLine($"RotateRect: [{string.Join(',', info.RotateRect)}]");
            Console.WriteLine();
        }

        // Usage may be absent; print token accounting only when provided.
        if (completion.Usage is { } usage)
        {
            Console.WriteLine(
                $"Usage: in({usage.InputTokens})/out({usage.OutputTokens})/image({usage.ImageTokens})/total({usage.TotalTokens})");
        }
    }
}
51+
52+
/*
53+
File uploaded: oss://dashscope-instant/52afe077fb4825c6d74411758cb1ab98/2025-11-29/90f86409-6868-4e34-83e1-efce3c72477c/webpage.jpg
54+
Text:
55+
```json
56+
[
57+
{"rotate_rect": [236, 254, 115, 299, 90], "text": "OpenAI 兼容"},
58+
{"rotate_rect": [646, 254, 115, 269, 90], "text": "DashScope"},
59+
{"rotate_rect": [236, 684, 115, 163, 90], "text": "Python"},
60+
{"rotate_rect": [492, 684, 115, 105, 90], "text": "Java"},
61+
{"rotate_rect": [712, 684, 115, 85, 90], "text": "curl"}
62+
]
63+
```
64+
WordsInfo:
65+
OpenAI 兼容
66+
Location: [46,55,205,55,205,87,46,87]
67+
RotateRect: [125,71,159,32,0]
68+
69+
DashScope
70+
Location: [272,55,415,55,415,87,272,87]
71+
RotateRect: [344,71,32,143,90]
72+
73+
Python
74+
Location: [82,175,169,175,169,207,82,207]
75+
RotateRect: [126,191,32,87,90]
76+
77+
Java
78+
Location: [234,175,289,175,289,207,234,207]
79+
RotateRect: [262,191,55,32,0]
80+
81+
curl
82+
Location: [356,175,401,175,401,207,356,207]
83+
RotateRect: [378,191,32,45,90]
84+
85+
Usage: in(175)/out(186)/image(142)/total(361)
86+
*/
14.8 KB
Loading

0 commit comments

Comments
 (0)