Added nwc-toolkit-html-parser.

This tool parses an HTML document and prints its text, tag, comment, and other units. Also fixed a bug in HtmlDocument::ParseHtmlSpecialTag().
jg1uaa · Oct 11, 2010 · 8249510 · 8249510
1 parent 6816747
commit 8249510
Show file tree

Hide file tree

Showing 4 changed files with 209 additions and 15 deletions.
diff --git a/lib/html-document.cc b/lib/html-document.cc
@@ -527,21 +527,23 @@ void HtmlDocument::ParseHtmlOtherUnit(String *tag) {
 void HtmlDocument::ParseHtmlSpecialTag(const String &body_left,
     const String &tag_name, String *tag) {
   for (String avail = body_left; !avail.is_empty(); ) {
-    tag->Assign(avail.Find("</").begin(), avail.end());
-    if (tag->length() > 2) {
-      avail = tag->SubString(2);
-      if (avail.StartsWith(tag_name, ToLower())) {
-        avail = avail.SubString(tag_name.length());
-        if (avail.is_empty() || avail[0] == '>' || IsSpace()(avail[0])) {
-          String text_content(body_left.begin(), tag->begin());
-          AppendTextUnit(text_content, text_content, PLAIN_TEXT_FLAG);
-          ParseHtmlTagUnit(tag);
-          return;
-        }
+    String start_mark = avail.Find("</").begin();
+    if (start_mark.is_empty()) {
+      break;
+    }
+    tag->Assign(start_mark.begin(), avail.end());
+    avail = tag->SubString(2);
+    if (avail.StartsWith(tag_name, ToLower())) {
+      avail = avail.SubString(tag_name.length());
+      if (avail.is_empty() || avail[0] == '>' || IsSpace()(avail[0])) {
+        String text_content(body_left.begin(), tag->begin());
+        AppendTextUnit(text_content, text_content, PLAIN_TEXT_FLAG);
+        ParseHtmlTagUnit(tag);
+        return;
       }
     }
-    avail.set_begin(tag->end());
   }
+  AppendTextUnit(body_left, body_left, PLAIN_TEXT_FLAG);
 }
 
 void HtmlDocument::AppendTextUnit(const String &src,

diff --git a/tools/Makefile.am b/tools/Makefile.am
@@ -8,10 +8,14 @@ noinst_SCRIPTS = \
   cgi/text-extractor.py
 
 bin_PROGRAMS = \
+  nwc-toolkit-html-parser \
   nwc-toolkit-text-extractor \
   nwc-toolkit-text-filter \
   nwc-toolkit-unicode-normalizer
 
+nwc_toolkit_html_parser_SOURCES = nwc-toolkit-html-parser.cc
+nwc_toolkit_html_parser_LDADD = ../lib/libnwc-toolkit.a
+
 nwc_toolkit_text_extractor_SOURCES = nwc-toolkit-text-extractor.cc
 nwc_toolkit_text_extractor_LDADD = ../lib/libnwc-toolkit.a
 

diff --git a/tools/Makefile.in b/tools/Makefile.in
@@ -33,7 +33,8 @@ POST_INSTALL = :
 NORMAL_UNINSTALL = :
 PRE_UNINSTALL = :
 POST_UNINSTALL = :
-bin_PROGRAMS = nwc-toolkit-text-extractor$(EXEEXT) \
+bin_PROGRAMS = nwc-toolkit-html-parser$(EXEEXT) \
+	nwc-toolkit-text-extractor$(EXEEXT) \
 	nwc-toolkit-text-filter$(EXEEXT) \
 	nwc-toolkit-unicode-normalizer$(EXEEXT)
 subdir = tools
@@ -47,6 +48,11 @@ CONFIG_CLEAN_FILES =
 CONFIG_CLEAN_VPATH_FILES =
 am__installdirs = "$(DESTDIR)$(bindir)" "$(DESTDIR)$(bindir)"
 PROGRAMS = $(bin_PROGRAMS)
+am_nwc_toolkit_html_parser_OBJECTS =  \
+	nwc-toolkit-html-parser.$(OBJEXT)
+nwc_toolkit_html_parser_OBJECTS =  \
+	$(am_nwc_toolkit_html_parser_OBJECTS)
+nwc_toolkit_html_parser_DEPENDENCIES = ../lib/libnwc-toolkit.a
 am_nwc_toolkit_text_extractor_OBJECTS =  \
 	nwc-toolkit-text-extractor.$(OBJEXT)
 nwc_toolkit_text_extractor_OBJECTS =  \
@@ -93,10 +99,12 @@ CXXCOMPILE = $(CXX) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) \
 CXXLD = $(CXX)
 CXXLINK = $(CXXLD) $(AM_CXXFLAGS) $(CXXFLAGS) $(AM_LDFLAGS) $(LDFLAGS) \
 	-o $@
-SOURCES = $(nwc_toolkit_text_extractor_SOURCES) \
+SOURCES = $(nwc_toolkit_html_parser_SOURCES) \
+	$(nwc_toolkit_text_extractor_SOURCES) \
 	$(nwc_toolkit_text_filter_SOURCES) \
 	$(nwc_toolkit_unicode_normalizer_SOURCES)
-DIST_SOURCES = $(nwc_toolkit_text_extractor_SOURCES) \
+DIST_SOURCES = $(nwc_toolkit_html_parser_SOURCES) \
+	$(nwc_toolkit_text_extractor_SOURCES) \
 	$(nwc_toolkit_text_filter_SOURCES) \
 	$(nwc_toolkit_unicode_normalizer_SOURCES)
 ETAGS = etags
@@ -201,6 +209,8 @@ bin_SCRIPTS = \
 noinst_SCRIPTS = \
   cgi/text-extractor.py
 
+nwc_toolkit_html_parser_SOURCES = nwc-toolkit-html-parser.cc
+nwc_toolkit_html_parser_LDADD = ../lib/libnwc-toolkit.a
 nwc_toolkit_text_extractor_SOURCES = nwc-toolkit-text-extractor.cc
 nwc_toolkit_text_extractor_LDADD = ../lib/libnwc-toolkit.a
 nwc_toolkit_text_filter_SOURCES = nwc-toolkit-text-filter.cc
@@ -282,6 +292,9 @@ uninstall-binPROGRAMS:
 
 clean-binPROGRAMS:
 	-test -z "$(bin_PROGRAMS)" || rm -f $(bin_PROGRAMS)
+nwc-toolkit-html-parser$(EXEEXT): $(nwc_toolkit_html_parser_OBJECTS) $(nwc_toolkit_html_parser_DEPENDENCIES) 
+	@rm -f nwc-toolkit-html-parser$(EXEEXT)
+	$(CXXLINK) $(nwc_toolkit_html_parser_OBJECTS) $(nwc_toolkit_html_parser_LDADD) $(LIBS)
 nwc-toolkit-text-extractor$(EXEEXT): $(nwc_toolkit_text_extractor_OBJECTS) $(nwc_toolkit_text_extractor_DEPENDENCIES) 
 	@rm -f nwc-toolkit-text-extractor$(EXEEXT)
 	$(CXXLINK) $(nwc_toolkit_text_extractor_OBJECTS) $(nwc_toolkit_text_extractor_LDADD) $(LIBS)
@@ -332,6 +345,7 @@ mostlyclean-compile:
 distclean-compile:
 	-rm -f *.tab.c
 
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/nwc-toolkit-html-parser.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/nwc-toolkit-text-extractor.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/nwc-toolkit-text-filter.Po@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/nwc-toolkit-unicode-normalizer.Po@am__quote@

diff --git a/tools/nwc-toolkit-html-parser.cc b/tools/nwc-toolkit-html-parser.cc
@@ -0,0 +1,174 @@
+// Copyright 2010 Susumu Yata <syata@acm.org>
+
+#include <cstdlib>
+#include <iostream>
+
+#include <nwc-toolkit/character-reference.h>
+#include <nwc-toolkit/html-document.h>
+
+namespace {
+
+// Front end of this tool: groups the option-handling and document-dumping
+// entry points that main() calls. The class has no data members.
+class HtmlParser {
+ public:
+  HtmlParser() {}
+  ~HtmlParser() {}
+
+  // Processes the argument vector in place and stores the number of
+  // remaining entries (including argv[0]) back into *argc.
+  void ParseOptions(int *argc, char *argv[]);
+
+  // Reads all of input_file, parses it as an HTML document and prints a
+  // summary of its units. Returns false on failure.
+  bool Parse(nwc_toolkit::InputFile *input_file,
+      nwc_toolkit::OutputFile *output_file);
+
+ private:
+  // Disallows copy and assignment.
+  HtmlParser(const HtmlParser &);
+  HtmlParser &operator=(const HtmlParser &);
+};
+
+// Option-handling hook for the parser. No parser-specific options are
+// recognised at present, so every argument is kept: the loop has the shape
+// of an in-place filter over argv[1..], but nothing is filtered out and
+// *argc is written back with its original value.
+//
+//   argc  in/out: number of entries in argv; updated to the number kept.
+//   argv  in/out: argument vector, compacted in place.
+void HtmlParser::ParseOptions(int *argc, char *argv[]) {
+  int kept = 1;  // argv[0] is always retained.
+  for (int i = 1; i < *argc; ++i) {
+    argv[kept++] = argv[i];
+  }
+  *argc = kept;
+}
+
+// Reads input_file line by line into one buffer, parses the buffer as an
+// HTML document and prints one summary line per document unit to std::cout:
+// a byte count for text, comment and other units, and a rebuilt
+// <name attr="value"> form for tags, with attribute names and values passed
+// through CharacterReference::Decode.
+// Returns false, after reporting on std::cerr, if the document cannot be
+// parsed or a unit has an unrecognised type; returns true otherwise.
+// NOTE(review): output_file is never referenced in this function; the
+// summary always goes to std::cout. Confirm whether that is intended.
+bool HtmlParser::Parse(nwc_toolkit::InputFile *input_file,
+    nwc_toolkit::OutputFile *output_file) {
+  typedef nwc_toolkit::HtmlDocumentUnit Unit;
+
+  // Collect the whole input before parsing.
+  nwc_toolkit::String chunk;
+  nwc_toolkit::StringBuilder contents;
+  while (input_file->ReadLine(&chunk)) {
+    contents.Append(chunk);
+  }
+
+  nwc_toolkit::HtmlDocument html;
+  if (!html.Parse(contents.str())) {
+    std::cerr << "error: failed to parse document" << std::endl;
+    return false;
+  }
+
+  // Scratch buffers, cleared and reused for every attribute.
+  nwc_toolkit::StringBuilder decoded_name;
+  nwc_toolkit::StringBuilder decoded_value;
+  for (std::size_t index = 0; index < html.num_units(); ++index) {
+    const Unit &unit = html.unit(index);
+    switch (unit.type()) {
+      case Unit::TEXT_UNIT:
+        std::cout << "Text: " << unit.text_content().length() << "bytes\n";
+        break;
+
+      case Unit::TAG_UNIT:
+        std::cout << "Tag: <" << (unit.is_end_tag() ? "/" : "")
+            << unit.tag_name();
+        for (std::size_t k = 0; k < unit.num_attributes(); ++k) {
+          decoded_name.Clear();
+          decoded_value.Clear();
+          nwc_toolkit::CharacterReference::Decode(
+              unit.attribute(k).name(), &decoded_name);
+          nwc_toolkit::CharacterReference::Decode(
+              unit.attribute(k).value(), &decoded_value);
+          std::cout << ' ' << decoded_name << "=\"" << decoded_value << '"';
+        }
+        std::cout << (unit.is_empty_element_tag() ? " />\n" : ">\n");
+        break;
+
+      case Unit::COMMENT_UNIT:
+        std::cout << "Comment: " << unit.comment().length() << "bytes\n";
+        break;
+
+      case Unit::OTHER_UNIT:
+        std::cout << "Other: " << unit.other_content().length() << "bytes\n";
+        break;
+
+      default:
+        std::cerr << "error: undefined document unit" << std::endl;
+        return false;
+    }
+  }
+  return true;
+}
+
+// Writes the usage message to standard error and flushes the stream.
+// `command` is the program name shown on the "Usage:" line; main() passes
+// argv[0].
+void PrintHelp(const char *command) {
+  std::cerr << "Usage: " << command << " [OPTION]... [FILE]...\n\n"
+      "Options:\n"
+      "  --output=[FILE]  output parsed texts to this file\n"
+      "  --help           print this help\n"
+      << std::flush;
+}
+
+}  // namespace
+
+// Entry point of nwc-toolkit-html-parser.
+//
+// Recognised options (matched with nwc_toolkit::ToLower, presumably making
+// the comparison case-insensitive):
+//   --output=FILE / --output FILE  path handed to OutputFile::Open
+//   --help                         print usage and exit
+// Every other argument is treated as an input file path. With no input
+// paths, a single document is read through InputFile::Open(NULL), which the
+// log message reports as standard input.
+//
+// Exit status: 0 on success or --help, -1 if --output has no value,
+// -2 if the output file cannot be opened, -3 if an input cannot be opened,
+// -4 if parsing an input fails.
+int main(int argc, char *argv[]) {
+  HtmlParser html_parser;
+  html_parser.ParseOptions(&argc, argv);
+
+  nwc_toolkit::String output_file_path;
+
+  // Strip the options handled here out of argv; positional arguments are
+  // compacted to argv[1..new_argc).
+  int new_argc = 1;
+  for (int i = 1; i < argc; ++i) {
+    nwc_toolkit::String arg = argv[i];
+    if (arg.StartsWith("--output", nwc_toolkit::ToLower())) {
+      arg = arg.SubString(8);  // Skip past "--output".
+      if (arg.StartsWith("=")) {
+        output_file_path = arg.SubString(1);
+      } else if (arg.is_empty()) {
+        // "--output FILE": the value is the next argument.
+        if ((i + 1) < argc) {
+          output_file_path = argv[++i];
+        } else {
+          PrintHelp(argv[0]);
+          return -1;
+        }
+      } else {
+        // Only shares the "--output" prefix (e.g. "--outputs"); keep it as
+        // a positional argument.
+        argv[new_argc++] = argv[i];
+      }
+    } else if (arg.Compare("--help", nwc_toolkit::ToLower()) == 0) {
+      PrintHelp(argv[0]);
+      return 0;
+    } else {
+      argv[new_argc++] = argv[i];
+    }
+  }
+  argc = new_argc;
+
+  nwc_toolkit::OutputFile output_file;
+  std::cerr << "output: " << (output_file_path.is_empty()
+      ? "standard output" : output_file_path) << std::endl;
+  if (!output_file.Open(output_file_path)) {
+    std::cerr << "error: failed to open output file: "
+        << output_file_path << std::endl;
+    return -2;
+  }
+
+  // No positional arguments: read one document from standard input.
+  if (argc == 1) {
+    nwc_toolkit::InputFile input_file;
+    std::cerr << "input: standard input" << std::endl;
+    if (!input_file.Open(NULL)) {
+      std::cerr << "error: failed to open standard input" << std::endl;
+      return -3;
+    } else if (!html_parser.Parse(&input_file, &output_file)) {
+      return -4;
+    }
+  }
+
+  // Otherwise parse each named input in order, stopping at the first
+  // failure.
+  for (int i = 1; i < argc; ++i) {
+    nwc_toolkit::String input_file_path = argv[i];
+    nwc_toolkit::InputFile input_file;
+    std::cerr << "input: " << (input_file_path.is_empty()
+        ? "standard input" : input_file_path) << std::endl;
+    if (!input_file.Open(input_file_path)) {
+      std::cerr << "error: failed to open input file: "
+          << input_file_path << std::endl;
+      return -3;
+    } else if (!html_parser.Parse(&input_file, &output_file)) {
+      return -4;
+    }
+  }
+
+  return 0;
+}