Skip to content

leading and trailing whitespace is lost when it should be part of text nodes #87

@milahu

Description

@milahu

Whitespace is significant for lossless AST transformers.

When the parser does not preserve it,
I need extra code in the semantic stage to look ahead to the next node.

similar #40

In the specific case of HTML parsing, I couldn't tell if or when it's reasonable to treat leading and trailing whitespace as significant.

test.html


<div>

  aaa

  bbb

</div>

The right-hand source column shows the lookbehind source (everything since the end of the previous node) plus node.text,
which is easy to compute using node.range.end_byte of the previous node.

Lookahead would be more complex...

tree-sitter-html

node  5 = <               : "<"                            : "\n<"
node 17 = tag_name        : "div"                          : "div"
node  3 = >               : ">"                            : ">"
node 16 = text            : "aaa\n\n  bbb"                 : "\n\n  aaa\n\n  bbb"
node  7 = </              : "</"                           : "\n\n</"
node 17 = tag_name        : "div"                          : "div"
node  3 = >               : ">"                            : ">"

lezer-parser-html

Note how both source columns are identical,
so this is a truly "lossless" parser (CST parser).

node 16 = Text            : "\n"                           : "\n"
node  6 = StartTag        : "<"                            : "<"
node 22 = TagName         : "div"                          : "div"
node  4 = EndTag          : ">"                            : ">"
node 16 = Text            : "\n\n  aaa\n\n  bbb\n\n"       : "\n\n  aaa\n\n  bbb\n\n"
node 11 = StartCloseTag   : "</"                           : "</"
node 22 = TagName         : "div"                          : "div"
node  4 = EndTag          : ">"                            : ">"
node 16 = Text            : "\n"                           : "\n"

diff

+ node 16 = Text            : "\n"                           : "\n"
- node 16 = text            : "aaa\n\n  bbb"                 : "\n\n  aaa\n\n  bbb"
+ node 16 = Text            : "\n\n  aaa\n\n  bbb\n\n"       : "\n\n  aaa\n\n  bbb\n\n"
+ node 16 = Text            : "\n"                           : "\n"
repro.py
#!/usr/bin/env python3

# pip install tree-sitter tree-sitter-languages

import json
import tree_sitter
import tree_sitter_languages

# Sample document for the repro: the leading/trailing newlines and the blank
# lines around "aaa" and "bbb" are the whitespace whose handling is being shown.
input_html_bytes = b"""
<div>

  aaa

  bbb

</div>
"""

def walk_html_tree(tree, func):
    """Walk ``tree`` in pre-order and call ``func(node, is_compound)`` per node.

    ``tree`` must provide ``walk()`` returning a cursor that exposes ``node``,
    ``goto_first_child()``, ``goto_next_sibling()`` and ``goto_parent()``.
    ``is_compound`` is True when ``node.kind_id`` is one of the container
    kinds listed below (the original author notes these are ignored when
    serializing the tree).
    """
    compound_kinds = frozenset((
        25,  # fragment
        26,  # doctype
        28,  # element
        29,  # script_element
        30,  # style_element
        31,  # start_tag
        34,  # self_closing_tag
        35,  # end_tag
        37,  # attribute
        38,  # quoted_attribute_value
        # deliberately not listed: 1 '<!', 3 '>', 14 '"', 12 "'", 10 attribute_value
    ))

    cursor = tree.walk()
    done = False
    while not done:
        current = cursor.node
        func(current, current.kind_id in compound_kinds)

        # Descend first, otherwise step sideways.
        if cursor.goto_first_child() or cursor.goto_next_sibling():
            continue

        # Dead end: climb until some ancestor has a next sibling,
        # or until the cursor cannot climb any further.
        while True:
            if not cursor.goto_parent():
                done = True
                break
            if cursor.goto_next_sibling():
                break

# End byte offset of the last non-compound node printed by walk_callback_test;
# the slice from here to the current node's end gives "lookbehind + node text".
last_node_to = 0
# Running node counter; walk_callback_test increments it once per visited node.
node_idx = -1

# Longest JSON-encoded text shown in a column before it is cut and "..." appended.
max_len = 30

# Switches on the code path in walk_callback_test that also prints compound
# nodes (prefixed with "# ") and indents the other rows by two spaces.
show_compound_nodes = False

def walk_callback_test(node, is_compound):
    """Print one table row for ``node`` and advance the module-level counters.

    Row layout: ``node <kind_id> = <type> : <node text> : <lookbehind + node text>``.
    The last column is the input from ``last_node_to`` up to the node's end
    byte, i.e. any source skipped since the previous printed leaf plus the
    node itself. Both text columns are JSON-encoded and cut to ``max_len``.

    node: object with ``text`` (bytes), ``kind_id`` (int), ``type`` (str) and
        ``range.end_byte`` (int).
    is_compound: True for container nodes; these are printed only when
        ``show_compound_nodes`` is set and never move ``last_node_to``.

    Reads the globals ``input_html_bytes``, ``max_len`` and
    ``show_compound_nodes``; updates ``last_node_to`` and ``node_idx``.
    """
    global node_idx
    global last_node_to

    def clip(text):
        # Keep the table columns readable for long text nodes.
        if len(text) > max_len:
            return text[0:max_len] + "..."
        return text

    def build_row():
        # Built on demand so that both branches below share one definition.
        # Previously the compound branch read space_node_text without ever
        # assigning it, which raised UnboundLocalError.
        node_text = clip(json.dumps(node.text.decode("utf8")))
        space_node_text = clip(
            json.dumps(input_html_bytes[last_node_to:node.range.end_byte].decode("utf8"))
        )
        return f"node {node.kind_id:2d} = {node.type:15s} : {node_text:30s} : {space_node_text}"

    if not is_compound:
        line_prefix = "  " if show_compound_nodes else ""
        print(line_prefix + build_row())
        last_node_to = node.range.end_byte
    elif show_compound_nodes:
        print("# " + build_row())

    node_idx += 1

# Get the HTML parser from tree_sitter_languages.
tree_sitter_html = tree_sitter_languages.get_parser("html")

# Parse the sample document.
html_tree = tree_sitter_html.parse(input_html_bytes)

# Walk from the root node and print one row per non-compound node.
walk_html_tree(html_tree.root_node, walk_callback_test)
repro.js
#!/usr/bin/env node

/*
npm init -y
npm install @lezer/html
*/

import { parser as lezerParserHtml } from '@lezer/html';

// Sample document for the repro: the leading/trailing newlines and the blank
// lines around "aaa" and "bbb" are the whitespace whose handling is being shown.
const inputHtml = `
<div>

  aaa

  bbb

</div>
`;

// Configure the Lezer HTML parser in strict mode.
const htmlParser = lezerParserHtml.configure({
    strict: true, // throw on parse error
    //dialect: "selfClosing",
});

// Parse the sample document.
const htmlTree = htmlParser.parse(inputHtml);

// Node from which the walk below starts.
const rootNode = htmlTree.topNode;

/**
 * Node type ids that walkHtmlTree does not report to its callback.
 * These are the container types; the walk still descends into them.
 *
 * NOTE the parser is inconsistent here: InvalidEntity is a child node, while
 * EntityReference (id 17, e.g. "&amp;" or "&mdash;") is a separate node that
 * is a sibling of other text nodes. EntityReference is therefore NOT skipped.
 */
const SKIPPED_TYPE_IDS = new Set([
    15, // Document
    20, // Element
    23, // Attribute
    21, // OpenTag <script>
    30, // OpenTag <style>
    36, // OpenTag
    32, // CloseTag </style>
    29, // CloseTag </script>
    37, // CloseTag
    38, // SelfClosingTag
    19, // InvalidEntity: <a href="?a=1&b=2" -> "&" is parsed as InvalidEntity
]);

// based on nix-eval-js/src/lezer-parser-nix/src/nix-format.js
/**
 * Walk a tree in pre-order (NLR: Node, Left, Right) and call `func` with the
 * cursor for every node whose type id is not in SKIPPED_TYPE_IDS.
 *
 * The walk never leaves the starting node: when `tree` is an inner node, its
 * siblings and ancestors are not visited.
 *
 * NOTE InvalidEntity breaks the parser, e.g. `<a t="a&amp;b&c">a&amp;b&c</a>`
 * -> require valid input, throw on parse error.
 *
 * @param {Tree | TreeNode} tree object whose `cursor()` returns a tree cursor
 * @param {(cursor: object) => void} func called with the live cursor, which
 *     moves on after the call, so read what you need from it immediately
 * @returns {void}
 */
function walkHtmlTree(tree, func) {
    const cursor = tree.cursor();
    if (!cursor) return;
    let depth = 0;
    while (true) {
        // Node
        if (!SKIPPED_TYPE_IDS.has(cursor.type.id)) {
            func(cursor);
        }
        // Left
        if (cursor.firstChild()) {
            depth++;
            continue;
        }
        // Right (never at depth 0: that would leave the starting node)
        if (depth > 0 && cursor.nextSibling()) {
            continue;
        }
        // Up, until an ancestor below the starting node has a next sibling
        let movedRight = false;
        while (cursor.parent()) {
            depth--;
            if (depth <= 0) {
                // back at the starting node: the walk is complete
                return;
            }
            if (cursor.nextSibling()) {
                movedRight = true;
                break;
            }
        }
        if (!movedRight) return;
    }
}

// End offset of the previously printed node; the slice from here to the
// current node's end is "lookbehind source + node source".
let lastNodeTo = 0;
const maxLen = 30;

/** Cut a display string down to at most maxLen characters. */
const clip = (text) => (text.length > maxLen ? text.slice(0, maxLen) : text);

walkHtmlTree(rootNode, (node) => {
    const { from, to, type } = node;
    const nodeSource = clip(JSON.stringify(inputHtml.slice(from, to)));
    const spaceNodeSource = clip(JSON.stringify(inputHtml.slice(lastNodeTo, to)));
    const idColumn = String(type.id).padStart(2);
    const nameColumn = type.name.padEnd(15);
    console.log(`node ${idColumn} = ${nameColumn} : ${nodeSource.padEnd(maxLen)} : ${spaceNodeSource}`);
    lastNodeTo = to;
});

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions