Skip to content

Commit

Permalink
Sync for validator/cpp/htmlparser (#34999)
Browse files Browse the repository at this point in the history
* No description.

PiperOrigin-RevId: 375955262

* Replace `master` with `main` in htmlparser links.

PiperOrigin-RevId: 376840072

* - Add support for recording a node's start/end offsets in the original HTML source. This is in addition to the line/col information already supported in htmlparser.
- Add support for recording the number of words (num_terms) in a text node.

PiperOrigin-RevId: 378533243

* Fix integer overflow caused by an uninitialized int holding a garbage value.

PiperOrigin-RevId: 378549050

* Add support for counting number of terms in a text node.

- Reverses the previous changelist's num_terms by removing all of its logic from
the tokenizer. I think going forward we will do fewer things in the tokenizer and
move all post-tokenization work into the parser.
- Add a CountTerms method to the strings library.
- Add a note that support for callbacks is only partially implemented and is
intended for use only in unit tests.

PiperOrigin-RevId: 378755403

Co-authored-by: Googler <noreply@google.com>
Co-authored-by: Greg Grothaus <greggrothaus@google.com>
Co-authored-by: Amaltas Bohra <amaltas@google.com>
  • Loading branch information
4 people authored Jun 23, 2021
1 parent 644f33e commit f26e5ec
Show file tree
Hide file tree
Showing 13 changed files with 258 additions and 93 deletions.
1 change: 0 additions & 1 deletion validator/cpp/htmlparser/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -433,7 +433,6 @@ cc_library(
":defer",
":strings",
":token",
"@com_google_absl//absl/flags:flag",
],
copts = ["-std=c++17"],
)
Expand Down
16 changes: 8 additions & 8 deletions validator/cpp/htmlparser/node.cc
Original file line number Diff line number Diff line change
Expand Up @@ -418,17 +418,17 @@ std::string Node::InnerText() const {

void Node::UpdateChildNodesPositions(Node* relative_node) {
// Cannot proceed if relative node has no positional information.
if (!relative_node->PositionInHtmlSrc().has_value()) return;
if (!relative_node->LineColInHtmlSrc().has_value()) return;

auto [r_line, r_col] = relative_node->PositionInHtmlSrc().value();
auto [r_line, r_col] = relative_node->LineColInHtmlSrc().value();

// Update the positions of this node.
if (position_in_html_src_.has_value()) {
auto [line, col] = position_in_html_src_.value();
if (line_col_in_html_src_.has_value()) {
auto [line, col] = line_col_in_html_src_.value();
int effective_col = line == 1 ?
r_col + col + AtomUtil::ToString(
relative_node->DataAtom()).size() + 1 /* closing > */ : col;
position_in_html_src_ = LineCol({line + r_line - 1, effective_col});
line_col_in_html_src_ = LineCol({line + r_line - 1, effective_col});
}

// Update the positions of this node's children.
Expand All @@ -454,9 +454,9 @@ std::string Node::DebugString() {
break;
}

if (position_in_html_src_.has_value()) {
ost << position_in_html_src_.value().first << ":"
<< position_in_html_src_.value().second;
if (line_col_in_html_src_.has_value()) {
ost << line_col_in_html_src_.value().first << ":"
<< line_col_in_html_src_.value().second;
}
ost << "\n";

Expand Down
17 changes: 14 additions & 3 deletions validator/cpp/htmlparser/node.h
Original file line number Diff line number Diff line change
Expand Up @@ -87,8 +87,14 @@ class Node {
Atom DataAtom() const { return atom_; }
std::string_view NameSpace() const { return name_space_; }
// Returns nullopt if ParseOptions.record_node_offsets is not set.
std::optional<LineCol> PositionInHtmlSrc() const {
return position_in_html_src_;
std::optional<LineCol> LineColInHtmlSrc() const {
return line_col_in_html_src_;
}
std::optional<Offsets> OffsetsInHtmlSrc() const {
return offsets_in_html_src_;
}
int NumTerms() const {
return num_terms_;
}

const std::vector<Attribute>& Attributes() const { return attributes_; }
Expand Down Expand Up @@ -153,7 +159,12 @@ class Node {
std::string data_;
std::string name_space_;
// Position at which this node appears in HTML source.
std::optional<LineCol> position_in_html_src_;
std::optional<LineCol> line_col_in_html_src_;
// Start/End offsets in original html src.
std::optional<LineCol> offsets_in_html_src_;
// Records the number of terms for text contents.
// Populated and meaningful only if node is of type TEXT_NODE.
int num_terms_ = -1;
std::vector<Attribute> attributes_{};
Node* first_child_ = nullptr;
Node* next_sibling_ = nullptr;
Expand Down
113 changes: 85 additions & 28 deletions validator/cpp/htmlparser/parser.cc
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,8 @@ std::unique_ptr<Document> Parse(std::string_view html) {
ParseOptions{.scripting = true,
.frameset_ok = true,
.record_node_offsets = true,
.record_attribute_offsets = true});
.record_attribute_offsets = true,
.count_num_terms_in_text_node = true});
return parser->Parse();
}

Expand Down Expand Up @@ -122,7 +123,8 @@ std::unique_ptr<Document> ParseFragment(std::string_view html,
ParseOptions options = {.scripting = true,
.frameset_ok = true,
.record_node_offsets = true,
.record_attribute_offsets = true};
.record_attribute_offsets = true,
.count_num_terms_in_text_node = true};
return ParseFragmentWithOptions(html, options, fragment_parent);
}

Expand All @@ -138,6 +140,7 @@ Parser::Parser(std::string_view html, const ParseOptions& options,
frameset_ok_(options.frameset_ok),
record_node_offsets_(options.record_node_offsets),
record_attribute_offsets_(options.record_attribute_offsets),
count_num_terms_in_text_node_(options.count_num_terms_in_text_node),
fragment_(fragment_parent != nullptr),
context_node_(fragment_parent) {
insertion_mode_ = std::bind(&Parser::InitialIM, this);
Expand Down Expand Up @@ -391,7 +394,7 @@ void Parser::FosterParent(Node* node) {

if (prev && prev->node_type_ == NodeType::TEXT_NODE &&
node->node_type_ == NodeType::TEXT_NODE) {
prev->data_ += node->data_;
prev->data_.append(node->data_);
return;
}

Expand All @@ -405,7 +408,10 @@ void Parser::AddText(const std::string& text) {

if (ShouldFosterParent()) {
text_node->data_.assign(text, 0, text.size());
text_node->position_in_html_src_ = token_.position_in_html_src;
if (record_node_offsets_) {
text_node->line_col_in_html_src_ = token_.line_col_in_html_src;
text_node->offsets_in_html_src_ = token_.offsets_in_html_src;
}
FosterParent(text_node);
return;
}
Expand All @@ -418,7 +424,13 @@ void Parser::AddText(const std::string& text) {
}

text_node->data_.assign(text, 0, text.size());
text_node->position_in_html_src_ = token_.position_in_html_src;
if (record_node_offsets_) {
text_node->line_col_in_html_src_ = token_.line_col_in_html_src;
text_node->offsets_in_html_src_ = token_.offsets_in_html_src;
}
if (count_num_terms_in_text_node_) {
text_node->num_terms_ = Strings::CountTerms(text);
}
AddChild(text_node);
} // Parser::AddText.

Expand Down Expand Up @@ -446,7 +458,8 @@ void Parser::AddElement() {
}

if (record_node_offsets_) {
element_node->position_in_html_src_ = token_.position_in_html_src;
element_node->line_col_in_html_src_ = token_.line_col_in_html_src;
element_node->offsets_in_html_src_ = token_.offsets_in_html_src;
}

std::copy(token_.attributes.begin(), token_.attributes.end(),
Expand All @@ -457,7 +470,7 @@ void Parser::AddElement() {
std::transform(
element_node->attributes_.begin(), element_node->attributes_.end(),
element_node->attributes_.begin(), [](Attribute attr) -> Attribute {
attr.position_in_html_src = std::nullopt;
attr.line_col_in_html_src = std::nullopt;
return attr;
});
}
Expand Down Expand Up @@ -662,7 +675,8 @@ bool Parser::InitialIM() {
Node* node = document_->NewNode(NodeType::COMMENT_NODE);
node->data_ = std::move(token_.data);
if (record_node_offsets_) {
node->position_in_html_src_ = token_.position_in_html_src;
node->line_col_in_html_src_ = token_.line_col_in_html_src;
node->offsets_in_html_src_ = token_.offsets_in_html_src;
}
node->SetManufactured(token_.is_manufactured);
document_->root_node_->AppendChild(node);
Expand All @@ -672,7 +686,8 @@ bool Parser::InitialIM() {
auto doctype_node = document_->NewNode(NodeType::DOCTYPE_NODE);
bool quirks_mode = ParseDoctype(token_.data, doctype_node);
if (record_node_offsets_) {
doctype_node->position_in_html_src_ = token_.position_in_html_src;
doctype_node->line_col_in_html_src_ = token_.line_col_in_html_src;
doctype_node->offsets_in_html_src_ = token_.offsets_in_html_src;
}
document_->root_node_->AppendChild(doctype_node);
accounting_.quirks_mode = quirks_mode;
Expand Down Expand Up @@ -735,7 +750,10 @@ bool Parser::BeforeHTMLIM() {
case TokenType::COMMENT_TOKEN: {
Node* node = document_->NewNode(NodeType::COMMENT_NODE);
node->SetManufactured(token_.is_manufactured);
node->position_in_html_src_ = token_.position_in_html_src;
if (record_node_offsets_) {
node->line_col_in_html_src_ = token_.line_col_in_html_src;
node->offsets_in_html_src_ = token_.offsets_in_html_src;
}
node->data_ = std::move(token_.data);
document_->root_node_->AppendChild(node);
return true;
Expand Down Expand Up @@ -792,7 +810,10 @@ bool Parser::BeforeHeadIM() {
case TokenType::COMMENT_TOKEN: {
Node* node = document_->NewNode(NodeType::COMMENT_NODE);
node->SetManufactured(token_.is_manufactured);
node->position_in_html_src_ = token_.position_in_html_src;
if (record_node_offsets_) {
node->line_col_in_html_src_ = token_.line_col_in_html_src;
node->offsets_in_html_src_ = token_.offsets_in_html_src;
}
node->data_ = std::move(token_.data);
AddChild(node);
return true;
Expand Down Expand Up @@ -924,7 +945,10 @@ bool Parser::InHeadIM() {
case TokenType::COMMENT_TOKEN: {
Node* node = document_->NewNode(NodeType::COMMENT_NODE);
node->SetManufactured(token_.is_manufactured);
node->position_in_html_src_ = token_.position_in_html_src;
if (record_node_offsets_) {
node->line_col_in_html_src_ = token_.line_col_in_html_src;
node->offsets_in_html_src_ = token_.offsets_in_html_src;
}
node->data_ = std::move(token_.data);
AddChild(node);
return true;
Expand Down Expand Up @@ -1088,7 +1112,10 @@ bool Parser::AfterHeadIM() {
case TokenType::COMMENT_TOKEN: {
Node* node = document_->NewNode(NodeType::COMMENT_NODE);
node->SetManufactured(token_.is_manufactured);
node->position_in_html_src_ = token_.position_in_html_src;
if (record_node_offsets_) {
node->line_col_in_html_src_ = token_.line_col_in_html_src;
node->offsets_in_html_src_ = token_.offsets_in_html_src;
}
node->data_ = std::move(token_.data);
AddChild(node);
return true;
Expand Down Expand Up @@ -1155,7 +1182,7 @@ bool Parser::InBodyIM() {
if (!accounting_.has_manufactured_html || num_html_tags_ > 1) {
accounting_.duplicate_html_elements = true;
accounting_.duplicate_html_element_location =
token_.position_in_html_src;
token_.line_col_in_html_src;
}
break;
}
Expand Down Expand Up @@ -1185,7 +1212,7 @@ bool Parser::InBodyIM() {
if (!accounting_.has_manufactured_body || num_body_tags_ > 1) {
accounting_.duplicate_body_elements = true;
accounting_.duplicate_body_element_location =
token_.position_in_html_src;
token_.line_col_in_html_src;
}
}
}
Expand Down Expand Up @@ -1692,7 +1719,10 @@ bool Parser::InBodyIM() {
case TokenType::COMMENT_TOKEN: {
Node* node = document_->NewNode(NodeType::COMMENT_NODE);
node->SetManufactured(token_.is_manufactured);
node->position_in_html_src_ = token_.position_in_html_src;
if (record_node_offsets_) {
node->line_col_in_html_src_ = token_.line_col_in_html_src;
node->offsets_in_html_src_ = token_.offsets_in_html_src;
}
node->data_ = token_.data;
AddChild(node);
break;
Expand Down Expand Up @@ -2117,7 +2147,10 @@ bool Parser::InTableIM() {
case TokenType::COMMENT_TOKEN: {
Node* node = document_->NewNode(NodeType::COMMENT_NODE);
node->SetManufactured(token_.is_manufactured);
node->position_in_html_src_ = token_.position_in_html_src;
if (record_node_offsets_) {
node->line_col_in_html_src_ = token_.line_col_in_html_src;
node->offsets_in_html_src_ = token_.offsets_in_html_src;
}
node->data_ = token_.data;
AddChild(node);
return true;
Expand Down Expand Up @@ -2231,7 +2264,10 @@ bool Parser::InColumnGroupIM() {
case TokenType::COMMENT_TOKEN: {
Node* node = document_->NewNode(NodeType::COMMENT_NODE);
node->SetManufactured(token_.is_manufactured);
node->position_in_html_src_ = token_.position_in_html_src;
if (record_node_offsets_) {
node->line_col_in_html_src_ = token_.line_col_in_html_src;
node->offsets_in_html_src_ = token_.offsets_in_html_src;
}
node->data_ = token_.data;
AddChild(node);
return true;
Expand Down Expand Up @@ -2368,7 +2404,10 @@ bool Parser::InTableBodyIM() {
case TokenType::COMMENT_TOKEN: {
Node* node = document_->NewNode(NodeType::COMMENT_NODE);
node->SetManufactured(token_.is_manufactured);
node->position_in_html_src_ = token_.position_in_html_src;
if (record_node_offsets_) {
node->line_col_in_html_src_ = token_.line_col_in_html_src;
node->offsets_in_html_src_ = token_.offsets_in_html_src;
}
node->data_ = token_.data;
AddChild(node);
return true;
Expand Down Expand Up @@ -2659,7 +2698,10 @@ bool Parser::InSelectIM() {
case TokenType::COMMENT_TOKEN: {
Node* node = document_->NewNode(NodeType::COMMENT_NODE);
node->SetManufactured(token_.is_manufactured);
node->position_in_html_src_ = token_.position_in_html_src;
if (record_node_offsets_) {
node->line_col_in_html_src_ = token_.line_col_in_html_src;
node->offsets_in_html_src_ = token_.offsets_in_html_src;
}
node->data_ = token_.data;
AddChild(node);
break;
Expand Down Expand Up @@ -2849,7 +2891,10 @@ bool Parser::AfterBodyIM() {
"after-body insertion mode";
Node* node = document_->NewNode(NodeType::COMMENT_NODE);
node->SetManufactured(token_.is_manufactured);
node->position_in_html_src_ = token_.position_in_html_src;
if (record_node_offsets_) {
node->line_col_in_html_src_ = token_.line_col_in_html_src;
node->offsets_in_html_src_ = token_.offsets_in_html_src;
}
node->data_ = token_.data;
open_elements_stack_.at(0)->AppendChild(node);
return true;
Expand All @@ -2868,7 +2913,10 @@ bool Parser::InFramesetIM() {
case TokenType::COMMENT_TOKEN: {
Node* node = document_->NewNode(NodeType::COMMENT_NODE);
node->SetManufactured(token_.is_manufactured);
node->position_in_html_src_ = token_.position_in_html_src;
if (record_node_offsets_) {
node->line_col_in_html_src_ = token_.line_col_in_html_src;
node->offsets_in_html_src_ = token_.offsets_in_html_src;
}
node->data_ = token_.data;
AddChild(node);
break;
Expand Down Expand Up @@ -2924,7 +2972,10 @@ bool Parser::AfterFramesetIM() {
case TokenType::COMMENT_TOKEN: {
Node* node = document_->NewNode(NodeType::COMMENT_NODE);
node->SetManufactured(token_.is_manufactured);
node->position_in_html_src_ = token_.position_in_html_src;
if (record_node_offsets_) {
node->line_col_in_html_src_ = token_.line_col_in_html_src;
node->offsets_in_html_src_ = token_.offsets_in_html_src;
}
node->data_ = token_.data;
AddChild(node);
break;
Expand Down Expand Up @@ -2981,7 +3032,10 @@ bool Parser::AfterAfterBodyIM() {
case TokenType::COMMENT_TOKEN: {
Node* node = document_->NewNode(NodeType::COMMENT_NODE);
node->SetManufactured(token_.is_manufactured);
node->position_in_html_src_ = token_.position_in_html_src;
if (record_node_offsets_) {
node->line_col_in_html_src_ = token_.line_col_in_html_src;
node->offsets_in_html_src_ = token_.offsets_in_html_src;
}
node->data_ = token_.data;
document_->root_node_->AppendChild(node);
return true;
Expand All @@ -3002,7 +3056,10 @@ bool Parser::AfterAfterFramesetIM() {
case TokenType::COMMENT_TOKEN: {
Node* node = document_->NewNode(NodeType::COMMENT_NODE);
node->SetManufactured(token_.is_manufactured);
node->position_in_html_src_ = token_.position_in_html_src;
if (record_node_offsets_) {
node->line_col_in_html_src_ = token_.line_col_in_html_src;
node->offsets_in_html_src_ = token_.offsets_in_html_src;
}
node->data_ = token_.data;
document_->root_node_->AppendChild(node);
break;
Expand Down Expand Up @@ -3194,7 +3251,7 @@ void Parser::ParseImpliedToken(TokenType token_type, Atom atom,
Token real_token = {.token_type = token_.token_type,
.atom = token_.atom,
.data = token_.data,
.position_in_html_src = token_.position_in_html_src,
.line_col_in_html_src = token_.line_col_in_html_src,
.attributes = token_.attributes};
bool self_closing = has_self_closing_token_;
// Create implied tokens.
Expand All @@ -3203,7 +3260,7 @@ void Parser::ParseImpliedToken(TokenType token_type, Atom atom,
.data = data,
// For reporting purposes implied tokens are assumed to be parsed at
// the current tag location.
.position_in_html_src = token_.position_in_html_src,
.line_col_in_html_src = token_.line_col_in_html_src,
.attributes = {}};
has_self_closing_token_ = false;

Expand All @@ -3229,7 +3286,7 @@ void Parser::ParseImpliedToken(TokenType token_type, Atom atom,
token_ = {.token_type = real_token.token_type,
.atom = real_token.atom,
.data = real_token.data,
.position_in_html_src = token_.position_in_html_src,
.line_col_in_html_src = token_.line_col_in_html_src,
.attributes = real_token.attributes};
has_self_closing_token_ = self_closing;
} // Parser::ParseImpliedToken.
Expand Down
Loading

0 comments on commit f26e5ec

Please sign in to comment.