Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Sync for validator/cpp/{engine,htmlparser} #35044

Merged
merged 2 commits into from
Jun 25, 2021
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
- Add support for recording a node's start/end offsets in the original HTML
src. This is in addition to the line/col already supported in htmlparser.

- Add support for recording the number of words (num_terms) in a text node.

PiperOrigin-RevId: 378533243
  • Loading branch information
amaltas authored and antiphoton committed Jun 25, 2021
commit d8c2f790bb45ea2c4ac9d8b397076b207efdbcee
16 changes: 8 additions & 8 deletions validator/cpp/htmlparser/node.cc
Original file line number Diff line number Diff line change
Expand Up @@ -418,17 +418,17 @@ std::string Node::InnerText() const {

void Node::UpdateChildNodesPositions(Node* relative_node) {
// Cannot proceed if relative node has no positional information.
if (!relative_node->PositionInHtmlSrc().has_value()) return;
if (!relative_node->LineColInHtmlSrc().has_value()) return;

auto [r_line, r_col] = relative_node->PositionInHtmlSrc().value();
auto [r_line, r_col] = relative_node->LineColInHtmlSrc().value();

// Update the positions of this node.
if (position_in_html_src_.has_value()) {
auto [line, col] = position_in_html_src_.value();
if (line_col_in_html_src_.has_value()) {
auto [line, col] = line_col_in_html_src_.value();
int effective_col = line == 1 ?
r_col + col + AtomUtil::ToString(
relative_node->DataAtom()).size() + 1 /* closing > */ : col;
position_in_html_src_ = LineCol({line + r_line - 1, effective_col});
line_col_in_html_src_ = LineCol({line + r_line - 1, effective_col});
}

// Update the positions of this node's children.
Expand All @@ -454,9 +454,9 @@ std::string Node::DebugString() {
break;
}

if (position_in_html_src_.has_value()) {
ost << position_in_html_src_.value().first << ":"
<< position_in_html_src_.value().second;
if (line_col_in_html_src_.has_value()) {
ost << line_col_in_html_src_.value().first << ":"
<< line_col_in_html_src_.value().second;
}
ost << "\n";

Expand Down
17 changes: 14 additions & 3 deletions validator/cpp/htmlparser/node.h
Original file line number Diff line number Diff line change
Expand Up @@ -87,8 +87,14 @@ class Node {
Atom DataAtom() const { return atom_; }
std::string_view NameSpace() const { return name_space_; }
// Returns nullopt if ParseOptions.store_node_offsets is not set.
std::optional<LineCol> PositionInHtmlSrc() const {
return position_in_html_src_;
std::optional<LineCol> LineColInHtmlSrc() const {
return line_col_in_html_src_;
}
std::optional<Offsets> OffsetsInHtmlSrc() const {
return offsets_in_html_src_;
}
int NumTerms() const {
return num_terms_;
}

const std::vector<Attribute>& Attributes() const { return attributes_; }
Expand Down Expand Up @@ -153,7 +159,12 @@ class Node {
std::string data_;
std::string name_space_;
// Position at which this node appears in HTML source.
std::optional<LineCol> position_in_html_src_;
std::optional<LineCol> line_col_in_html_src_;
// Start/End offsets in original html src.
std::optional<LineCol> offsets_in_html_src_;
// Records the number of terms for text contents.
// Populated and meaningful only if node is of type TEXT_NODE.
int num_terms_ = 0;
std::vector<Attribute> attributes_{};
Node* first_child_ = nullptr;
Node* next_sibling_ = nullptr;
Expand Down
107 changes: 81 additions & 26 deletions validator/cpp/htmlparser/parser.cc
Original file line number Diff line number Diff line change
Expand Up @@ -391,7 +391,8 @@ void Parser::FosterParent(Node* node) {

if (prev && prev->node_type_ == NodeType::TEXT_NODE &&
node->node_type_ == NodeType::TEXT_NODE) {
prev->data_ += node->data_;
prev->data_.append(node->data_);
prev->num_terms_ += node->num_terms_;
return;
}

Expand All @@ -405,7 +406,11 @@ void Parser::AddText(const std::string& text) {

if (ShouldFosterParent()) {
text_node->data_.assign(text, 0, text.size());
text_node->position_in_html_src_ = token_.position_in_html_src;
if (record_node_offsets_) {
text_node->line_col_in_html_src_ = token_.line_col_in_html_src;
text_node->offsets_in_html_src_ = token_.offsets_in_html_src;
}
text_node->num_terms_ = token_.num_terms;
FosterParent(text_node);
return;
}
Expand All @@ -414,11 +419,16 @@ void Parser::AddText(const std::string& text) {
if (top_node->LastChild() &&
top_node->LastChild()->node_type_ == NodeType::TEXT_NODE) {
top_node->LastChild()->data_.append(text);
top_node->num_terms_ += token_.num_terms;
return;
}

text_node->data_.assign(text, 0, text.size());
text_node->position_in_html_src_ = token_.position_in_html_src;
if (record_node_offsets_) {
text_node->line_col_in_html_src_ = token_.line_col_in_html_src;
text_node->offsets_in_html_src_ = token_.offsets_in_html_src;
}
text_node->num_terms_ = token_.num_terms;
AddChild(text_node);
} // Parser::AddText.

Expand Down Expand Up @@ -446,7 +456,8 @@ void Parser::AddElement() {
}

if (record_node_offsets_) {
element_node->position_in_html_src_ = token_.position_in_html_src;
element_node->line_col_in_html_src_ = token_.line_col_in_html_src;
element_node->offsets_in_html_src_ = token_.offsets_in_html_src;
}

std::copy(token_.attributes.begin(), token_.attributes.end(),
Expand All @@ -457,7 +468,7 @@ void Parser::AddElement() {
std::transform(
element_node->attributes_.begin(), element_node->attributes_.end(),
element_node->attributes_.begin(), [](Attribute attr) -> Attribute {
attr.position_in_html_src = std::nullopt;
attr.line_col_in_html_src = std::nullopt;
return attr;
});
}
Expand Down Expand Up @@ -662,7 +673,8 @@ bool Parser::InitialIM() {
Node* node = document_->NewNode(NodeType::COMMENT_NODE);
node->data_ = std::move(token_.data);
if (record_node_offsets_) {
node->position_in_html_src_ = token_.position_in_html_src;
node->line_col_in_html_src_ = token_.line_col_in_html_src;
node->offsets_in_html_src_ = token_.offsets_in_html_src;
}
node->SetManufactured(token_.is_manufactured);
document_->root_node_->AppendChild(node);
Expand All @@ -672,7 +684,8 @@ bool Parser::InitialIM() {
auto doctype_node = document_->NewNode(NodeType::DOCTYPE_NODE);
bool quirks_mode = ParseDoctype(token_.data, doctype_node);
if (record_node_offsets_) {
doctype_node->position_in_html_src_ = token_.position_in_html_src;
doctype_node->line_col_in_html_src_ = token_.line_col_in_html_src;
doctype_node->offsets_in_html_src_ = token_.offsets_in_html_src;
}
document_->root_node_->AppendChild(doctype_node);
accounting_.quirks_mode = quirks_mode;
Expand Down Expand Up @@ -735,7 +748,10 @@ bool Parser::BeforeHTMLIM() {
case TokenType::COMMENT_TOKEN: {
Node* node = document_->NewNode(NodeType::COMMENT_NODE);
node->SetManufactured(token_.is_manufactured);
node->position_in_html_src_ = token_.position_in_html_src;
if (record_node_offsets_) {
node->line_col_in_html_src_ = token_.line_col_in_html_src;
node->offsets_in_html_src_ = token_.offsets_in_html_src;
}
node->data_ = std::move(token_.data);
document_->root_node_->AppendChild(node);
return true;
Expand Down Expand Up @@ -792,7 +808,10 @@ bool Parser::BeforeHeadIM() {
case TokenType::COMMENT_TOKEN: {
Node* node = document_->NewNode(NodeType::COMMENT_NODE);
node->SetManufactured(token_.is_manufactured);
node->position_in_html_src_ = token_.position_in_html_src;
if (record_node_offsets_) {
node->line_col_in_html_src_ = token_.line_col_in_html_src;
node->offsets_in_html_src_ = token_.offsets_in_html_src;
}
node->data_ = std::move(token_.data);
AddChild(node);
return true;
Expand Down Expand Up @@ -924,7 +943,10 @@ bool Parser::InHeadIM() {
case TokenType::COMMENT_TOKEN: {
Node* node = document_->NewNode(NodeType::COMMENT_NODE);
node->SetManufactured(token_.is_manufactured);
node->position_in_html_src_ = token_.position_in_html_src;
if (record_node_offsets_) {
node->line_col_in_html_src_ = token_.line_col_in_html_src;
node->offsets_in_html_src_ = token_.offsets_in_html_src;
}
node->data_ = std::move(token_.data);
AddChild(node);
return true;
Expand Down Expand Up @@ -1088,7 +1110,10 @@ bool Parser::AfterHeadIM() {
case TokenType::COMMENT_TOKEN: {
Node* node = document_->NewNode(NodeType::COMMENT_NODE);
node->SetManufactured(token_.is_manufactured);
node->position_in_html_src_ = token_.position_in_html_src;
if (record_node_offsets_) {
node->line_col_in_html_src_ = token_.line_col_in_html_src;
node->offsets_in_html_src_ = token_.offsets_in_html_src;
}
node->data_ = std::move(token_.data);
AddChild(node);
return true;
Expand Down Expand Up @@ -1155,7 +1180,7 @@ bool Parser::InBodyIM() {
if (!accounting_.has_manufactured_html || num_html_tags_ > 1) {
accounting_.duplicate_html_elements = true;
accounting_.duplicate_html_element_location =
token_.position_in_html_src;
token_.line_col_in_html_src;
}
break;
}
Expand Down Expand Up @@ -1185,7 +1210,7 @@ bool Parser::InBodyIM() {
if (!accounting_.has_manufactured_body || num_body_tags_ > 1) {
accounting_.duplicate_body_elements = true;
accounting_.duplicate_body_element_location =
token_.position_in_html_src;
token_.line_col_in_html_src;
}
}
}
Expand Down Expand Up @@ -1692,7 +1717,10 @@ bool Parser::InBodyIM() {
case TokenType::COMMENT_TOKEN: {
Node* node = document_->NewNode(NodeType::COMMENT_NODE);
node->SetManufactured(token_.is_manufactured);
node->position_in_html_src_ = token_.position_in_html_src;
if (record_node_offsets_) {
node->line_col_in_html_src_ = token_.line_col_in_html_src;
node->offsets_in_html_src_ = token_.offsets_in_html_src;
}
node->data_ = token_.data;
AddChild(node);
break;
Expand Down Expand Up @@ -2117,7 +2145,10 @@ bool Parser::InTableIM() {
case TokenType::COMMENT_TOKEN: {
Node* node = document_->NewNode(NodeType::COMMENT_NODE);
node->SetManufactured(token_.is_manufactured);
node->position_in_html_src_ = token_.position_in_html_src;
if (record_node_offsets_) {
node->line_col_in_html_src_ = token_.line_col_in_html_src;
node->offsets_in_html_src_ = token_.offsets_in_html_src;
}
node->data_ = token_.data;
AddChild(node);
return true;
Expand Down Expand Up @@ -2231,7 +2262,10 @@ bool Parser::InColumnGroupIM() {
case TokenType::COMMENT_TOKEN: {
Node* node = document_->NewNode(NodeType::COMMENT_NODE);
node->SetManufactured(token_.is_manufactured);
node->position_in_html_src_ = token_.position_in_html_src;
if (record_node_offsets_) {
node->line_col_in_html_src_ = token_.line_col_in_html_src;
node->offsets_in_html_src_ = token_.offsets_in_html_src;
}
node->data_ = token_.data;
AddChild(node);
return true;
Expand Down Expand Up @@ -2368,7 +2402,10 @@ bool Parser::InTableBodyIM() {
case TokenType::COMMENT_TOKEN: {
Node* node = document_->NewNode(NodeType::COMMENT_NODE);
node->SetManufactured(token_.is_manufactured);
node->position_in_html_src_ = token_.position_in_html_src;
if (record_node_offsets_) {
node->line_col_in_html_src_ = token_.line_col_in_html_src;
node->offsets_in_html_src_ = token_.offsets_in_html_src;
}
node->data_ = token_.data;
AddChild(node);
return true;
Expand Down Expand Up @@ -2659,7 +2696,10 @@ bool Parser::InSelectIM() {
case TokenType::COMMENT_TOKEN: {
Node* node = document_->NewNode(NodeType::COMMENT_NODE);
node->SetManufactured(token_.is_manufactured);
node->position_in_html_src_ = token_.position_in_html_src;
if (record_node_offsets_) {
node->line_col_in_html_src_ = token_.line_col_in_html_src;
node->offsets_in_html_src_ = token_.offsets_in_html_src;
}
node->data_ = token_.data;
AddChild(node);
break;
Expand Down Expand Up @@ -2849,7 +2889,10 @@ bool Parser::AfterBodyIM() {
"after-body insertion mode";
Node* node = document_->NewNode(NodeType::COMMENT_NODE);
node->SetManufactured(token_.is_manufactured);
node->position_in_html_src_ = token_.position_in_html_src;
if (record_node_offsets_) {
node->line_col_in_html_src_ = token_.line_col_in_html_src;
node->offsets_in_html_src_ = token_.offsets_in_html_src;
}
node->data_ = token_.data;
open_elements_stack_.at(0)->AppendChild(node);
return true;
Expand All @@ -2868,7 +2911,10 @@ bool Parser::InFramesetIM() {
case TokenType::COMMENT_TOKEN: {
Node* node = document_->NewNode(NodeType::COMMENT_NODE);
node->SetManufactured(token_.is_manufactured);
node->position_in_html_src_ = token_.position_in_html_src;
if (record_node_offsets_) {
node->line_col_in_html_src_ = token_.line_col_in_html_src;
node->offsets_in_html_src_ = token_.offsets_in_html_src;
}
node->data_ = token_.data;
AddChild(node);
break;
Expand Down Expand Up @@ -2924,7 +2970,10 @@ bool Parser::AfterFramesetIM() {
case TokenType::COMMENT_TOKEN: {
Node* node = document_->NewNode(NodeType::COMMENT_NODE);
node->SetManufactured(token_.is_manufactured);
node->position_in_html_src_ = token_.position_in_html_src;
if (record_node_offsets_) {
node->line_col_in_html_src_ = token_.line_col_in_html_src;
node->offsets_in_html_src_ = token_.offsets_in_html_src;
}
node->data_ = token_.data;
AddChild(node);
break;
Expand Down Expand Up @@ -2981,7 +3030,10 @@ bool Parser::AfterAfterBodyIM() {
case TokenType::COMMENT_TOKEN: {
Node* node = document_->NewNode(NodeType::COMMENT_NODE);
node->SetManufactured(token_.is_manufactured);
node->position_in_html_src_ = token_.position_in_html_src;
if (record_node_offsets_) {
node->line_col_in_html_src_ = token_.line_col_in_html_src;
node->offsets_in_html_src_ = token_.offsets_in_html_src;
}
node->data_ = token_.data;
document_->root_node_->AppendChild(node);
return true;
Expand All @@ -3002,7 +3054,10 @@ bool Parser::AfterAfterFramesetIM() {
case TokenType::COMMENT_TOKEN: {
Node* node = document_->NewNode(NodeType::COMMENT_NODE);
node->SetManufactured(token_.is_manufactured);
node->position_in_html_src_ = token_.position_in_html_src;
if (record_node_offsets_) {
node->line_col_in_html_src_ = token_.line_col_in_html_src;
node->offsets_in_html_src_ = token_.offsets_in_html_src;
}
node->data_ = token_.data;
document_->root_node_->AppendChild(node);
break;
Expand Down Expand Up @@ -3194,7 +3249,7 @@ void Parser::ParseImpliedToken(TokenType token_type, Atom atom,
Token real_token = {.token_type = token_.token_type,
.atom = token_.atom,
.data = token_.data,
.position_in_html_src = token_.position_in_html_src,
.line_col_in_html_src = token_.line_col_in_html_src,
.attributes = token_.attributes};
bool self_closing = has_self_closing_token_;
// Create implied tokens.
Expand All @@ -3203,7 +3258,7 @@ void Parser::ParseImpliedToken(TokenType token_type, Atom atom,
.data = data,
// For reporting purposes implied tokens are assumed to be parsed at
// the current tag location.
.position_in_html_src = token_.position_in_html_src,
.line_col_in_html_src = token_.line_col_in_html_src,
.attributes = {}};
has_self_closing_token_ = false;

Expand All @@ -3229,7 +3284,7 @@ void Parser::ParseImpliedToken(TokenType token_type, Atom atom,
token_ = {.token_type = real_token.token_type,
.atom = real_token.atom,
.data = real_token.data,
.position_in_html_src = token_.position_in_html_src,
.line_col_in_html_src = token_.line_col_in_html_src,
.attributes = real_token.attributes};
has_self_closing_token_ = self_closing;
} // Parser::ParseImpliedToken.
Expand Down
Loading