Description
Whitespace is significant for lossless AST transformers.
When the parser does not expose it,
I need extra code in the semantic stage to look ahead to the next node.
similar #40
In the specific case of HTML parsing, I couldn't tell if or when it's reasonable to treat leading and trailing whitespace as significant.
test.html
<div>
aaa
bbb
</div>
The right-hand source column shows the lookbehind source (the text skipped since the previous node) plus node.text,
which is easy to compute using node.range.end_byte
of the previous node.
Lookahead would be more complex...
tree-sitter-html
node 5 = < : "<" : "\n<"
node 17 = tag_name : "div" : "div"
node 3 = > : ">" : ">"
node 16 = text : "aaa\n\n bbb" : "\n\n aaa\n\n bbb"
node 7 = </ : "</" : "\n\n</"
node 17 = tag_name : "div" : "div"
node 3 = > : ">" : ">"
lezer-parser-html
note how both source columns are identical
so this is a truly "lossless" parser (CST parser)
node 16 = Text : "\n" : "\n"
node 6 = StartTag : "<" : "<"
node 22 = TagName : "div" : "div"
node 4 = EndTag : ">" : ">"
node 16 = Text : "\n\n aaa\n\n bbb\n\n" : "\n\n aaa\n\n bbb\n\n"
node 11 = StartCloseTag : "</" : "</"
node 22 = TagName : "div" : "div"
node 4 = EndTag : ">" : ">"
node 16 = Text : "\n" : "\n"
diff
+ node 16 = Text : "\n" : "\n"
- node 16 = text : "aaa\n\n bbb" : "\n\n aaa\n\n bbb"
+ node 16 = Text : "\n\n aaa\n\n bbb\n\n" : "\n\n aaa\n\n bbb\n\n"
+ node 16 = Text : "\n" : "\n"
repro.py
#!/usr/bin/env python3
# pip install tree-sitter tree-sitter-languages
import json
import tree_sitter
import tree_sitter_languages
# Sample document fed to the parser. It is kept as bytes because the callback
# below slices it with the byte offsets reported by the parser
# (node.range.end_byte). The whitespace inside the literal is part of the
# test input.
input_html_bytes = b"""
<div>
aaa
bbb
</div>
"""
def walk_html_tree(tree, func):
    """Visit every node reachable from ``tree`` in pre-order.

    ``tree`` only needs a ``walk()`` method returning a cursor that offers
    ``node``, ``goto_first_child()``, ``goto_next_sibling()`` and
    ``goto_parent()``. The script in this file passes a root node.

    ``func(node, is_compound)`` is called once per node. ``is_compound`` is
    True when ``node.kind_id`` is one of the container kinds listed below.

    Returns None once the cursor can no longer move to a parent.
    """
    # compound tags
    # these are ignored when serializing the tree
    # NOTE(review): the numeric ids are hard-coded; presumably they match the
    # HTML grammar build in use -- confirm after upgrading the grammar.
    compound_kind_ids = frozenset((
        25,  # fragment
        26,  # doctype
        #1,  # '<!'
        #3,  # '>'
        28,  # element
        29,  # script_element
        30,  # style_element
        31,  # start_tag
        34,  # self_closing_tag
        35,  # end_tag
        37,  # attribute
        38,  # quoted_attribute_value
        #14,  # double quote '"'
        #12,  # single quote "'"
        #10,  # attribute_value
    ))
    cursor = tree.walk()
    while True:
        node = cursor.node
        func(node, node.kind_id in compound_kind_ids)
        # Descend first, then move sideways.
        if cursor.goto_first_child():
            continue
        if cursor.goto_next_sibling():
            continue
        # No child and no sibling: climb until some ancestor has a next
        # sibling, or stop when there is no parent left.
        while True:
            if not cursor.goto_parent():
                return
            if cursor.goto_next_sibling():
                break
# End byte of the most recently printed non-compound node. Everything between
# this offset and the next node's end is that node's "lookbehind + text" span.
last_node_to = 0
# Running count of visited nodes (starts at -1 so the first node is index 0).
node_idx = -1
# Maximum number of characters shown per column before truncation.
max_len = 30
# When True, compound (container) nodes are printed too, prefixed with "# ".
show_compound_nodes = False


def _truncate_for_display(text):
    """Return ``text`` cut to ``max_len`` characters, with "..." marking a cut."""
    if len(text) > max_len:
        return text[0:max_len] + "..."
    return text


def walk_callback_test(node, is_compound):
    """Print one line for ``node``: kind id, type, own text, and text with lookbehind.

    The last column is the slice of ``input_html_bytes`` from ``last_node_to``
    up to ``node.range.end_byte``. ``last_node_to`` advances only for
    non-compound nodes.

    Compound nodes are printed only when ``show_compound_nodes`` is True.
    Previously that path referenced ``space_node_text`` before assignment and
    raised UnboundLocalError; the column is now computed for every printed
    node.
    """
    global node_idx
    global last_node_to
    node_text = _truncate_for_display(json.dumps(node.text.decode("utf8")))
    if show_compound_nodes or not is_compound:
        space_node_text = _truncate_for_display(
            json.dumps(input_html_bytes[last_node_to:node.range.end_byte].decode("utf8"))
        )
        if is_compound:
            line_prefix = "# "
        else:
            line_prefix = " " if show_compound_nodes else ""
        print(line_prefix + f"node {node.kind_id:2d} = {node.type:15s} : {node_text:30s} : {space_node_text}")
    if not is_compound:
        last_node_to = node.range.end_byte
    node_idx += 1
    #if node_idx > 20: raise "todo"
# Driver: parse the sample bytes with the HTML parser obtained from
# tree_sitter_languages, then walk from the root node and print one line per
# node via walk_callback_test.
tree_sitter_html = tree_sitter_languages.get_parser("html")
html_tree = tree_sitter_html.parse(input_html_bytes)
# walk_html_tree receives the root node here; it only calls .walk() on its argument.
walk_html_tree(html_tree.root_node, walk_callback_test)
repro.js
#!/usr/bin/env node
/*
npm init -y
npm install @lezer/html
*/
import { parser as lezerParserHtml } from '@lezer/html';
// Sample document fed to the parser. The whitespace inside the template
// literal is part of the test input.
const inputHtml = `
<div>
aaa
bbb
</div>
`;
// Parser configured to throw instead of recovering from parse errors.
const htmlParser = lezerParserHtml.configure({
strict: true, // throw on parse error
//dialect: "selfClosing",
});
const htmlTree = htmlParser.parse(inputHtml);
// Starting point for the walk below.
const rootNode = htmlTree.topNode;
// based on nix-eval-js/src/lezer-parser-nix/src/nix-format.js

// Node type ids that are skipped when reporting nodes to the callback.
// NOTE(review): the numeric ids are hard-coded; presumably they match the
// @lezer/html build in use -- confirm after upgrading the grammar.
const SKIPPED_NODE_TYPE_IDS = new Set([
  15, // Document
  20, // Element
  23, // Attribute
  21, // OpenTag <script>
  30, // OpenTag <style>
  36, // OpenTag
  32, // CloseTag </style>
  29, // CloseTag </script>
  37, // CloseTag
  38, // SelfClosingTag
  // note: this is inconsistent in the parser
  // InvalidEntity is child node
  // EntityReference is separate node (sibling of other text nodes)
  19, // InvalidEntity: <a href="?a=1&b=2" -> "&" is parsed as InvalidEntity
  //17, // EntityReference: "&" or "—" is parsed as EntityReference
]);

/**
 * Walk `tree` in pre-order (node, then children, then siblings) and call
 * `func(cursor)` for every node whose type id is not in
 * `SKIPPED_NODE_TYPE_IDS`.
 *
 * The walk never leaves the starting node: siblings and ancestors of the
 * node that `tree.cursor()` starts on are not visited.
 *
 * NOTE InvalidEntity breaks the parser
 * <a t="a&b&c">a&b&c</a>
 * -> require valid input, throw on parse error
 *
 * @param {Tree | TreeNode} tree - anything with a `cursor()` method
 * @param {(cursor: object) => void} func - receives the live cursor, which
 *   is reused for the whole walk; read what you need before returning
 * @returns {void}
 */
function walkHtmlTree(tree, func) {
  const cursor = tree.cursor();
  if (!cursor) return;
  // Depth relative to the starting node; 0 means "on the starting node".
  let depth = 0;
  while (true) {
    // Node
    if (!SKIPPED_NODE_TYPE_IDS.has(cursor.type.id)) {
      func(cursor);
    }
    // Left
    if (cursor.firstChild()) {
      depth++;
      continue;
    }
    // Right (never sideways from the starting node itself)
    if (depth > 0 && cursor.nextSibling()) {
      continue;
    }
    // Up: climb until an ancestor below the starting node has a next sibling.
    let movedRight = false;
    while (cursor.parent()) {
      depth--;
      if (depth <= 0) {
        // when tree is a node, stop at the end of node
        // == dont visit sibling or parent nodes
        return;
      }
      if (cursor.nextSibling()) {
        movedRight = true;
        break;
      }
    }
    if (!movedRight) return;
  }
}
// End offset of the previously reported node; the text between it and the
// current node's end is "lookbehind + node text".
let lastNodeTo = 0;
const maxLen = 30;

/** Cut `text` to at most `maxLen` characters (nothing is appended). */
const clipToMaxLen = (text) => (text.length > maxLen ? text.slice(0, maxLen) : text);

// Print one line per reported node: type id, type name, the node's own
// source, and the node's source including everything since the previous node.
walkHtmlTree(rootNode, (node) => {
  const { from, to, type } = node;
  const nodeSource = clipToMaxLen(JSON.stringify(inputHtml.slice(from, to)));
  const spaceNodeSource = clipToMaxLen(JSON.stringify(inputHtml.slice(lastNodeTo, to)));
  const idColumn = String(type.id).padStart(2);
  const nameColumn = type.name.padEnd(15);
  const sourceColumn = nodeSource.padEnd(maxLen);
  console.log(`node ${idColumn} = ${nameColumn} : ${sourceColumn} : ${spaceNodeSource}`);
  lastNodeTo = to;
});
Metadata
Metadata
Assignees
Labels
No labels