Skip to content

Commit a65ceb7

Browse files
authored
Initial version of HtmlToText custom connector (#1509)
1 parent e03509a commit a65ceb7

File tree

4 files changed

+148
-0
lines changed

4 files changed

+148
-0
lines changed
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
{
2+
"swagger": "2.0",
3+
"info": {
4+
"title": "HtmlToText",
5+
"description": "Converts HTML to plain text.",
6+
"version": "1.0"
7+
},
8+
"host": "api.contoso.com",
9+
"basePath": "/",
10+
"schemes": [
11+
"https"
12+
],
13+
"consumes": [],
14+
"produces": [],
15+
"paths": {
16+
"/HtmlToText": {
17+
"post": {
18+
"responses": {
19+
"default": {
20+
"description": "HTML To Text",
21+
"schema": {}
22+
}
23+
},
24+
"summary": "Convert HTML to plain text",
25+
"description": "Converts HTML input into plain text output",
26+
"operationId": "HtmlToText",
27+
"parameters": [
28+
{
29+
"name": "body",
30+
"in": "body",
31+
"required": false,
32+
"schema": {
33+
"type": "object",
34+
"properties": {
35+
"html": {
36+
"type": "string",
37+
"description": "html",
38+
"title": "html",
39+
"default": "<html><body><p>Hello World</p></body></html>"
40+
}
41+
},
42+
"required": [
43+
"html"
44+
]
45+
}
46+
}
47+
]
48+
}
49+
}
50+
},
51+
"definitions": {},
52+
"parameters": {},
53+
"responses": {},
54+
"securityDefinitions": {},
55+
"security": [],
56+
"tags": []
57+
}
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
{
2+
"properties": {
3+
"connectionParameters": {},
4+
"iconBrandColor": "#007ee5",
5+
"capabilities": [],
6+
"policyTemplateInstances": []
7+
}
8+
}
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# HtmlToText
2+
HtmlToText is a simple custom connector to convert HTML to plain text.
3+
4+
## Prerequisites
5+
6+
## Connector Documentation
7+
The HtmlToText custom connector is an alternative to the [Content Conversion (Preview) connector](https://docs.microsoft.com/en-us/connectors/conversionservice/).
8+
It is not meant to be backward compatible with the Content Conversion (Preview) connector.
9+
The HtmlToText custom connector by default will:
10+
* Remove all table tags and content from HTML
11+
* Replace br tags with a newline.
12+
* Replace div, header, p, hr, li, ol, ul blocks with a newline.
13+
* Remove HTML tags.
14+
15+
## Known Issues and Limitations
16+
17+
## Deployment Instructions
18+
Please use [these instructions](https://docs.microsoft.com/en-us/connectors/custom-connectors/paconn-cli) to deploy this connector as a custom connector in Microsoft Power Automate and Power Apps.
Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
/// <summary>
/// Custom connector script that converts an HTML document, supplied in the
/// <c>html</c> property of a JSON request body, into plain text.
/// </summary>
public class Script : ScriptBase
{
    // Tags whose opening/closing markers are rendered as a single line break.
    private const string BlockTags = "div|header|p|hr|li|ol|ul";

    /// <summary>
    /// Dispatches the request based on the operation ID.
    /// </summary>
    /// <returns>
    /// The converted text for the <c>HtmlToText</c> operation, or a
    /// 400 (Bad Request) response for any other operation ID.
    /// </returns>
    public override async Task<HttpResponseMessage> ExecuteAsync()
    {
        // Check if the operation ID matches what is specified in the OpenAPI definition of the connector
        if (this.Context.OperationId == "HtmlToText")
        {
            return await this.HandleHtmlToTextOperation().ConfigureAwait(false);
        }

        // Handle an invalid operation ID
        return this.CreateBadRequest($"Unknown operation ID '{this.Context.OperationId}'");
    }

    /// <summary>
    /// Reads the request body, extracts the <c>html</c> property and returns
    /// its plain-text rendering.
    /// </summary>
    /// <returns>
    /// 200 (OK) with the plain text as content, or 400 (Bad Request) when the
    /// body is missing, is not a JSON object, or has no usable <c>html</c> value.
    /// </returns>
    private async Task<HttpResponseMessage> HandleHtmlToTextOperation()
    {
        // The body of the incoming request is of following format:
        // {
        //   "html": "<some html>"
        // }
        var content = this.Context.Request.Content;
        var contentAsString = content == null
            ? string.Empty
            : await content.ReadAsStringAsync().ConfigureAwait(false);

        // Parse as JSON object; a malformed or empty body is a client error,
        // not something that should surface as an unhandled exception.
        JObject contentAsJson;
        try
        {
            contentAsJson = JObject.Parse(contentAsString);
        }
        catch (Newtonsoft.Json.JsonReaderException ex)
        {
            return this.CreateBadRequest($"Request body is not a valid JSON object: {ex.Message}");
        }

        // Only scalar values can be converted to a string; an object/array,
        // a missing property or an explicit JSON null all yield null here.
        var htmlValue = contentAsJson["html"] as JValue;
        var html = htmlValue == null ? null : (string)htmlValue;
        if (html == null)
        {
            return this.CreateBadRequest("Request body must contain a string property 'html'");
        }

        var response = new HttpResponseMessage(HttpStatusCode.OK);
        response.Content = new StringContent(ConvertHtmlToText(html));
        return response;
    }

    /// <summary>
    /// Builds a 400 (Bad Request) response carrying the given message.
    /// </summary>
    /// <param name="message">Text describing why the request was rejected.</param>
    private HttpResponseMessage CreateBadRequest(string message)
    {
        var response = new HttpResponseMessage(HttpStatusCode.BadRequest);
        response.Content = CreateJsonContent(message);
        return response;
    }

    /// <summary>
    /// Converts HTML markup to plain text: drops tables, scripts and styles,
    /// turns block tags and <c>br</c> into newlines, strips the remaining
    /// tags, decodes HTML entities and trims the result.
    /// </summary>
    /// <param name="html">The HTML markup to convert; must not be null.</param>
    /// <returns>The plain-text rendering of <paramref name="html"/>.</returns>
    private static string ConvertHtmlToText(string html)
    {
        // Remove tables from html. The balancing group keeps nested tables
        // paired up. Singleline is required so that '.' also consumes line
        // breaks; without it a table spanning several lines is never matched.
        string outText = Regex.Replace(
            html,
            @"<(table)[^>]*>(?><(table)[^>]*(?<tableTag>)|</table(?<-tableTag>)|.?)*(?(tableTag)(?!))</table>",
            "",
            RegexOptions.IgnoreCase | RegexOptions.Singleline);

        // Replace blocks with new lines. A run of adjacent block tags
        // (with surrounding whitespace) collapses into one newline.
        string patNestedBlock = $"(\\s*?</?({BlockTags})[^>]*?>)+\\s*";
        outText = Regex.Replace(outText, patNestedBlock, "\n", RegexOptions.IgnoreCase);

        // Replace br tag to newline.
        outText = Regex.Replace(outText, @"<(br)[^>]*>", "\n", RegexOptions.IgnoreCase);

        // Remove styles and scripts, including their content. Tag names are
        // matched case-insensitively like every other tag in this method.
        outText = Regex.Replace(
            outText,
            @"<(script|style)[^>]*?>.*?</\1>",
            "",
            RegexOptions.Singleline | RegexOptions.IgnoreCase);

        // Remove all tags (an unterminated tag is cut at the end of its line).
        outText = Regex.Replace(outText, @"<[^>]*(>|$)", "", RegexOptions.Multiline);

        // Blank out lines that consist solely of a non-breaking space entity.
        outText = Regex.Replace(outText, @"^(&nbsp;)\n", "\n", RegexOptions.Multiline);

        // Decode html specific characters
        outText = HttpUtility.HtmlDecode(outText);

        // Removes leading and trailing whitespaces
        return outText.Trim();
    }
}

0 commit comments

Comments
 (0)