Skip to content

Commit

Permalink
Added nwc-toolkit-html-parser.
Browse files Browse the repository at this point in the history
  This tool parses an HTML document and prints text/tag/comment/other-units.
Fixed a bug of HtmlDocument::ParseHtmlSpecialTag().
  • Loading branch information
susumu.yata committed Oct 11, 2010
1 parent 6816747 commit 8249510
Show file tree
Hide file tree
Showing 4 changed files with 209 additions and 15 deletions.
26 changes: 14 additions & 12 deletions lib/html-document.cc
Original file line number Diff line number Diff line change
Expand Up @@ -527,21 +527,23 @@ void HtmlDocument::ParseHtmlOtherUnit(String *tag) {
void HtmlDocument::ParseHtmlSpecialTag(const String &body_left,
const String &tag_name, String *tag) {
for (String avail = body_left; !avail.is_empty(); ) {
tag->Assign(avail.Find("</").begin(), avail.end());
if (tag->length() > 2) {
avail = tag->SubString(2);
if (avail.StartsWith(tag_name, ToLower())) {
avail = avail.SubString(tag_name.length());
if (avail.is_empty() || avail[0] == '>' || IsSpace()(avail[0])) {
String text_content(body_left.begin(), tag->begin());
AppendTextUnit(text_content, text_content, PLAIN_TEXT_FLAG);
ParseHtmlTagUnit(tag);
return;
}
String start_mark = avail.Find("</").begin();
if (start_mark.is_empty()) {
break;
}
tag->Assign(start_mark.begin(), avail.end());
avail = tag->SubString(2);
if (avail.StartsWith(tag_name, ToLower())) {
avail = avail.SubString(tag_name.length());
if (avail.is_empty() || avail[0] == '>' || IsSpace()(avail[0])) {
String text_content(body_left.begin(), tag->begin());
AppendTextUnit(text_content, text_content, PLAIN_TEXT_FLAG);
ParseHtmlTagUnit(tag);
return;
}
}
avail.set_begin(tag->end());
}
AppendTextUnit(body_left, body_left, PLAIN_TEXT_FLAG);
}

void HtmlDocument::AppendTextUnit(const String &src,
Expand Down
4 changes: 4 additions & 0 deletions tools/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,14 @@ noinst_SCRIPTS = \
cgi/text-extractor.py

bin_PROGRAMS = \
nwc-toolkit-html-parser \
nwc-toolkit-text-extractor \
nwc-toolkit-text-filter \
nwc-toolkit-unicode-normalizer

nwc_toolkit_html_parser_SOURCES = nwc-toolkit-html-parser.cc
nwc_toolkit_html_parser_LDADD = ../lib/libnwc-toolkit.a

nwc_toolkit_text_extractor_SOURCES = nwc-toolkit-text-extractor.cc
nwc_toolkit_text_extractor_LDADD = ../lib/libnwc-toolkit.a

Expand Down
20 changes: 17 additions & 3 deletions tools/Makefile.in
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@ POST_INSTALL = :
NORMAL_UNINSTALL = :
PRE_UNINSTALL = :
POST_UNINSTALL = :
bin_PROGRAMS = nwc-toolkit-text-extractor$(EXEEXT) \
bin_PROGRAMS = nwc-toolkit-html-parser$(EXEEXT) \
nwc-toolkit-text-extractor$(EXEEXT) \
nwc-toolkit-text-filter$(EXEEXT) \
nwc-toolkit-unicode-normalizer$(EXEEXT)
subdir = tools
Expand All @@ -47,6 +48,11 @@ CONFIG_CLEAN_FILES =
CONFIG_CLEAN_VPATH_FILES =
am__installdirs = "$(DESTDIR)$(bindir)" "$(DESTDIR)$(bindir)"
PROGRAMS = $(bin_PROGRAMS)
am_nwc_toolkit_html_parser_OBJECTS = \
nwc-toolkit-html-parser.$(OBJEXT)
nwc_toolkit_html_parser_OBJECTS = \
$(am_nwc_toolkit_html_parser_OBJECTS)
nwc_toolkit_html_parser_DEPENDENCIES = ../lib/libnwc-toolkit.a
am_nwc_toolkit_text_extractor_OBJECTS = \
nwc-toolkit-text-extractor.$(OBJEXT)
nwc_toolkit_text_extractor_OBJECTS = \
Expand Down Expand Up @@ -93,10 +99,12 @@ CXXCOMPILE = $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \
CXXLD = $(CXX)
CXXLINK = $(CXXLD) $(AM_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) $(LDFLAGS) \
-o $@
SOURCES = $(nwc_toolkit_text_extractor_SOURCES) \
SOURCES = $(nwc_toolkit_html_parser_SOURCES) \
$(nwc_toolkit_text_extractor_SOURCES) \
$(nwc_toolkit_text_filter_SOURCES) \
$(nwc_toolkit_unicode_normalizer_SOURCES)
DIST_SOURCES = $(nwc_toolkit_text_extractor_SOURCES) \
DIST_SOURCES = $(nwc_toolkit_html_parser_SOURCES) \
$(nwc_toolkit_text_extractor_SOURCES) \
$(nwc_toolkit_text_filter_SOURCES) \
$(nwc_toolkit_unicode_normalizer_SOURCES)
ETAGS = etags
Expand Down Expand Up @@ -201,6 +209,8 @@ bin_SCRIPTS = \
noinst_SCRIPTS = \
cgi/text-extractor.py

nwc_toolkit_html_parser_SOURCES = nwc-toolkit-html-parser.cc
nwc_toolkit_html_parser_LDADD = ../lib/libnwc-toolkit.a
nwc_toolkit_text_extractor_SOURCES = nwc-toolkit-text-extractor.cc
nwc_toolkit_text_extractor_LDADD = ../lib/libnwc-toolkit.a
nwc_toolkit_text_filter_SOURCES = nwc-toolkit-text-filter.cc
Expand Down Expand Up @@ -282,6 +292,9 @@ uninstall-binPROGRAMS:

clean-binPROGRAMS:
-test -z "$(bin_PROGRAMS)" || rm -f $(bin_PROGRAMS)
nwc-toolkit-html-parser$(EXEEXT): $(nwc_toolkit_html_parser_OBJECTS) $(nwc_toolkit_html_parser_DEPENDENCIES)
@rm -f nwc-toolkit-html-parser$(EXEEXT)
$(CXXLINK) $(nwc_toolkit_html_parser_OBJECTS) $(nwc_toolkit_html_parser_LDADD) $(LIBS)
nwc-toolkit-text-extractor$(EXEEXT): $(nwc_toolkit_text_extractor_OBJECTS) $(nwc_toolkit_text_extractor_DEPENDENCIES)
@rm -f nwc-toolkit-text-extractor$(EXEEXT)
$(CXXLINK) $(nwc_toolkit_text_extractor_OBJECTS) $(nwc_toolkit_text_extractor_LDADD) $(LIBS)
Expand Down Expand Up @@ -332,6 +345,7 @@ mostlyclean-compile:
distclean-compile:
-rm -f *.tab.c

@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/nwc-toolkit-html-parser.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/nwc-toolkit-text-extractor.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/nwc-toolkit-text-filter.Po@am__quote@
@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/nwc-toolkit-unicode-normalizer.Po@am__quote@
Expand Down
174 changes: 174 additions & 0 deletions tools/nwc-toolkit-html-parser.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
// Copyright 2010 Susumu Yata <syata@acm.org>

#include <cstdlib>
#include <iostream>

#include <nwc-toolkit/character-reference.h>
#include <nwc-toolkit/html-document.h>

namespace {

// Driver object for the HTML parser tool: it post-processes the command
// line and reports the units of a parsed HTML document.
class HtmlParser {
 public:
  HtmlParser() {}
  ~HtmlParser() {}

  // Handles parser-specific command-line arguments. `argc` and `argv` are
  // updated in place.
  void ParseOptions(int *argc, char **argv);

  // Reads all of `input_file`, parses it as a single HTML document and
  // prints a summary of each document unit. Returns false on failure.
  bool Parse(nwc_toolkit::InputFile *input_file,
             nwc_toolkit::OutputFile *output_file);

 private:
  // Private declarations: disallow copy and assignment.
  HtmlParser(const HtmlParser &other);
  HtmlParser &operator=(const HtmlParser &other);
};

// Processes parser-specific command-line options.
//
// No options are recognized at present: every argument in argv[1 .. *argc)
// is kept in its original position, and *argc is rewritten with the number
// of retained arguments (which equals its incoming value).
//
// argc: in/out argument count.
// argv: in/out argument vector; argv[0] is never touched.
void HtmlParser::ParseOptions(int *argc, char *argv[]) {
  int kept = 1;  // Slot 0 holds the program name and always stays.
  for (int i = 1; i < *argc; ++i) {
    argv[kept++] = argv[i];
  }
  *argc = kept;
}

// Reads every line of `input_file`, parses the concatenation as one HTML
// document and prints a one-line summary per document unit to std::cout:
// the byte length for text, comment and other units, and the reconstructed
// tag (with character references in attribute names/values decoded) for
// tag units.
//
// Returns false, after reporting to std::cerr, if the document cannot be
// parsed or a unit has an unknown type; returns true otherwise.
//
// NOTE(review): `output_file` is never referenced here, so results always go
// to standard output regardless of the file the caller opened -- confirm
// whether they were meant to be written to `output_file` instead.
bool HtmlParser::Parse(nwc_toolkit::InputFile *input_file,
                       nwc_toolkit::OutputFile *output_file) {
  // Slurp the whole input into a single buffer.
  nwc_toolkit::StringBuilder html;
  for (nwc_toolkit::String chunk; input_file->ReadLine(&chunk); ) {
    html.Append(chunk);
  }

  nwc_toolkit::HtmlDocument html_doc;
  if (!html_doc.Parse(html.str())) {
    std::cerr << "error: failed to parse document" << std::endl;
    return false;
  }

  // Scratch buffers for decoded attributes, reused across all tag units.
  nwc_toolkit::StringBuilder decoded_name;
  nwc_toolkit::StringBuilder decoded_value;
  for (std::size_t unit_id = 0; unit_id < html_doc.num_units(); ++unit_id) {
    const nwc_toolkit::HtmlDocumentUnit &unit = html_doc.unit(unit_id);
    switch (unit.type()) {
      case nwc_toolkit::HtmlDocumentUnit::TEXT_UNIT:
        std::cout << "Text: " << unit.text_content().length() << "bytes\n";
        break;

      case nwc_toolkit::HtmlDocumentUnit::TAG_UNIT:
        std::cout << "Tag: <";
        if (unit.is_end_tag()) {
          std::cout << '/';
        }
        std::cout << unit.tag_name();
        for (std::size_t attr_id = 0; attr_id < unit.num_attributes();
             ++attr_id) {
          decoded_name.Clear();
          nwc_toolkit::CharacterReference::Decode(
              unit.attribute(attr_id).name(), &decoded_name);
          decoded_value.Clear();
          nwc_toolkit::CharacterReference::Decode(
              unit.attribute(attr_id).value(), &decoded_value);
          std::cout << ' ' << decoded_name << "=\"" << decoded_value << '"';
        }
        if (unit.is_empty_element_tag()) {
          std::cout << " /";
        }
        std::cout << ">\n";
        break;

      case nwc_toolkit::HtmlDocumentUnit::COMMENT_UNIT:
        std::cout << "Comment: " << unit.comment().length() << "bytes\n";
        break;

      case nwc_toolkit::HtmlDocumentUnit::OTHER_UNIT:
        std::cout << "Other: " << unit.other_content().length() << "bytes\n";
        break;

      default:
        std::cerr << "error: undefined document unit" << std::endl;
        return false;
    }
  }
  return true;
}

// Writes the usage message for this tool to standard error and flushes it.
//
// command: program name shown on the "Usage:" line (main passes argv[0]).
void PrintHelp(const char *command) {
  std::cerr << "Usage: " << command << " [OPTION]... [FILE]...\n\n"
      "Options:\n"
      " --output=[FILE] output parsed texts to this file\n"
      " --help print this help\n"
      << std::flush;
}

} // namespace

// Entry point of the HTML parser tool.
//
// Recognized options (matched with nwc_toolkit::ToLower):
//   --output=FILE, --output FILE   path handed to OutputFile::Open
//   --help                         print usage and exit
// Every other argument is treated as an input file path. With no input
// paths, standard input is parsed.
//
// Exit status:
//    0  success, or --help was requested
//   -1  --output was given as the last argument without a value
//   -2  the output file could not be opened
//   -3  an input (file or standard input) could not be opened
//   -4  parsing an input failed
int main(int argc, char *argv[]) {
  HtmlParser html_parser;
  html_parser.ParseOptions(&argc, argv);

  nwc_toolkit::String output_file_path;

  // Consume the options handled here and compact the remaining arguments
  // (the input paths) to the front of argv.
  int new_argc = 1;
  for (int i = 1; i < argc; ++i) {
    nwc_toolkit::String arg = argv[i];
    if (arg.StartsWith("--output", nwc_toolkit::ToLower())) {
      arg = arg.SubString(8);  // Drop the "--output" prefix (8 characters).
      if (arg.StartsWith("=")) {
        // --output=FILE
        output_file_path = arg.SubString(1);
      } else if (arg.is_empty()) {
        // --output FILE: the value is the next argument, which must exist.
        if ((i + 1) < argc) {
          output_file_path = argv[++i];
        } else {
          PrintHelp(argv[0]);
          return -1;
        }
      } else {
        // Merely shares the "--output" prefix; keep it as a plain argument.
        argv[new_argc++] = argv[i];
      }
    } else if (arg.Compare("--help", nwc_toolkit::ToLower()) == 0) {
      PrintHelp(argv[0]);
      return 0;
    } else {
      argv[new_argc++] = argv[i];
    }
  }
  argc = new_argc;

  // NOTE(review): an empty path is reported as "standard output" below;
  // presumably OutputFile::Open treats it that way -- confirm.
  nwc_toolkit::OutputFile output_file;
  std::cerr << "output: " << (output_file_path.is_empty()
      ? "standard output" : output_file_path) << std::endl;
  if (!output_file.Open(output_file_path)) {
    std::cerr << "error: failed to open output file: "
        << output_file_path << std::endl;
    return -2;
  }

  // No input paths were given: parse standard input.
  if (argc == 1) {
    nwc_toolkit::InputFile input_file;
    std::cerr << "input: standard input" << std::endl;
    if (!input_file.Open(NULL)) {
      std::cerr << "error: failed to open standard input" << std::endl;
      return -3;
    } else if (!html_parser.Parse(&input_file, &output_file)) {
      return -4;
    }
  }

  // Parse each remaining argument as an input file, stopping at the first
  // failure.
  for (int i = 1; i < argc; ++i) {
    nwc_toolkit::String input_file_path = argv[i];
    nwc_toolkit::InputFile input_file;
    std::cerr << "input: " << (input_file_path.is_empty()
        ? "standard input" : input_file_path) << std::endl;
    if (!input_file.Open(input_file_path)) {
      std::cerr << "error: failed to open input file: "
          << input_file_path << std::endl;
      return -3;
    } else if (!html_parser.Parse(&input_file, &output_file)) {
      return -4;
    }
  }

  return 0;
}

0 comments on commit 8249510

Please sign in to comment.