Skip to content

Commit 1756f46

Browse files
committed
Fix implicit sentence detection
Natural language depends on either terminal markers or implicit empty lines to disambiguate sentences. Markdown does not. Previously, when list items did not end in a typical terminal marker (e.g., `!`, `?`, or `.`), sentences that are clearly delimited in markdown were not detected reliably. This update ensures that sentence boundaries can occur between markdown block-level nodes in NLCST.
1 parent 3605be0 commit 1756f46

File tree

10 files changed

+962
-433
lines changed

10 files changed

+962
-433
lines changed

component.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
],
1414
"dependencies": {
1515
"wooorm/remark-range": "^2.0.0",
16+
"jonschlinkert/repeat-string": "^1.5.2",
1617
"wooorm/nlcst-to-string": "^1.0.0"
1718
},
1819
"repository": "wooorm/mdast-util-to-nlcst",

index.js

Lines changed: 29 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
var range = require('remark-range');
1919
var toString = require('nlcst-to-string');
20+
var repeat = require('repeat-string');
2021

2122
/*
2223
* Map of ignored mdast nodes: nodes which have no (simple)
@@ -34,7 +35,7 @@ var IGNORE = {
3435
* Constants.
3536
*/
3637

37-
var NON_NEWLINE = /[^\n]/;
38+
var C_NEWLINE = '\n';
3839

3940
/**
4041
* Create a position object for `offset` in `file`.
@@ -129,13 +130,39 @@ all = function (parent, file, parser) {
129130
var index = -1;
130131
var result = [];
131132
var child;
133+
var node;
134+
var pos;
135+
var prevEndLine;
136+
var prevOffset;
137+
var endLine;
132138

133139
while (++index < length) {
134-
child = one(children[index], index, parent, file, parser);
140+
node = children[index];
141+
pos = node.position;
142+
endLine = pos.start.line;
143+
144+
if (prevEndLine && endLine !== prevEndLine) {
145+
child = parser.tokenizeWhiteSpace(
146+
repeat(C_NEWLINE, endLine - prevEndLine)
147+
);
148+
149+
patch([child], file, prevOffset);
150+
151+
if (child.value.length < 2) {
152+
child.value = repeat(C_NEWLINE, 2);
153+
}
154+
155+
result.push(child);
156+
}
157+
158+
child = one(node, index, parent, file, parser);
135159

136160
if (child) {
137161
result = result.concat(child);
138162
}
163+
164+
prevEndLine = pos.end.line;
165+
prevOffset = pos.end.offset;
139166
}
140167

141168
return result;
@@ -154,17 +181,10 @@ all = function (parent, file, parser) {
154181
*/
155182
one = function (node, index, parent, file, parser) {
156183
var type = node.type;
157-
var siblings = parent && parent.children;
158-
var prev = siblings && siblings[index - 1];
159184
var pos = node.position;
160185
var start = pos.start;
161186
var end = pos.end;
162-
var final = prev && prev.position.end.offset;
163187
var replacement;
164-
var result;
165-
var space;
166-
167-
space = final && file.toString().slice(final, start.offset);
168188

169189
if (type in IGNORE) {
170190
return null;
@@ -194,20 +214,6 @@ one = function (node, index, parent, file, parser) {
194214
)], file, start.offset);
195215
}
196216

197-
/**
198-
* There’s a difference between block-nodes with
199-
* lines between them. NLCST parsers need them to
200-
* differentiate between paragraphs.
201-
*/
202-
203-
if (replacement && space && !NON_NEWLINE.test(space)) {
204-
result = parser.tokenizeWhiteSpace(space);
205-
206-
patch([result], file, final);
207-
208-
replacement.unshift(result);
209-
}
210-
211217
return replacement || null;
212218
};
213219

package.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,9 @@
1212
"language"
1313
],
1414
"dependencies": {
15+
"nlcst-to-string": "^1.0.0",
1516
"remark-range": "^2.0.0",
16-
"nlcst-to-string": "^1.0.0"
17+
"repeat-string": "^1.5.2"
1718
},
1819
"repository": {
1920
"type": "git",

test/fixtures/blocks/output.json

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -345,6 +345,22 @@
345345
}
346346
}
347347
},
348+
{
349+
"type": "WhiteSpaceNode",
350+
"value": "\n\n\n",
351+
"position": {
352+
"start": {
353+
"line": 7,
354+
"column": 24,
355+
"offset": 53
356+
},
357+
"end": {
358+
"line": 10,
359+
"column": 1,
360+
"offset": 56
361+
}
362+
}
363+
},
348364
{
349365
"type": "WhiteSpaceNode",
350366
"value": "\n\n",
@@ -591,6 +607,22 @@
591607
}
592608
}
593609
},
610+
{
611+
"type": "WhiteSpaceNode",
612+
"value": "\n\n",
613+
"position": {
614+
"start": {
615+
"line": 12,
616+
"column": 20,
617+
"offset": 100
618+
},
619+
"end": {
620+
"line": 14,
621+
"column": 1,
622+
"offset": 102
623+
}
624+
}
625+
},
594626
{
595627
"type": "WhiteSpaceNode",
596628
"value": "\n\n",

test/fixtures/ignore/input.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,8 @@ Rules:
88

99
Tables:
1010

11-
| Foo |
12-
| --- |
13-
| Bar |
11+
| Foo | Bar |
12+
| --- | --- |
13+
| Baz | Qux |
1414

1515
...that’s all, for now 😄.

0 commit comments

Comments
 (0)