From 6816747f707e7c7f333753a0a04544764009dfe0 Mon Sep 17 00:00:00 2001 From: "susumu.yata" Date: Mon, 11 Oct 2010 10:57:26 +0000 Subject: [PATCH] Added tools/cgi/text-extractor.py. --- configure | 4 +- tools/Makefile.am | 6 +- tools/Makefile.in | 8 +- tools/cgi/text-extractor.py | 429 ++++++++++++++++++++++++++++++++++++ 4 files changed, 442 insertions(+), 5 deletions(-) create mode 100755 tools/cgi/text-extractor.py diff --git a/configure b/configure index c8d2ca2..54fa422 100755 --- a/configure +++ b/configure @@ -5011,7 +5011,7 @@ if test "x$ac_cv_header_bzlib_h" = x""yes; then : else as_fn_error "\ -The NWC Toolkit requires zlib. +The NWC Toolkit requires libbz2. Project URL: http://www.bzip.org/" "$LINENO" 5 fi @@ -5239,7 +5239,7 @@ _ACEOF else as_fn_error "\ -The NWC Toolkit requires zlib. +The NWC Toolkit requires libbz2. Project URL: http://www.bzip.org/" "$LINENO" 5 fi diff --git a/tools/Makefile.am b/tools/Makefile.am index 4ed36c1..39af4a7 100644 --- a/tools/Makefile.am +++ b/tools/Makefile.am @@ -4,6 +4,9 @@ AM_LDFLAGS = `icu-config --ldflags` bin_SCRIPTS = \ nwc-toolkit-config +noinst_SCRIPTS = \ + cgi/text-extractor.py + bin_PROGRAMS = \ nwc-toolkit-text-extractor \ nwc-toolkit-text-filter \ @@ -19,4 +22,5 @@ nwc_toolkit_unicode_normalizer_SOURCES = nwc-toolkit-unicode-normalizer.cc nwc_toolkit_unicode_normalizer_LDADD = ../lib/libnwc-toolkit.a EXTRA_DIST = \ - nwc-toolkit-config + $(bin_SCRIPTS) \ + $(noinst_SCRIPTS) diff --git a/tools/Makefile.in b/tools/Makefile.in index afb4d5f..7b0219b 100644 --- a/tools/Makefile.in +++ b/tools/Makefile.in @@ -83,7 +83,7 @@ am__nobase_list = $(am__nobase_strip_setup); \ am__base_list = \ sed '$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;$$!N;s/\n/ /g' | \ sed '$$!N;$$!N;$$!N;$$!N;s/\n/ /g' -SCRIPTS = $(bin_SCRIPTS) +SCRIPTS = $(bin_SCRIPTS) $(noinst_SCRIPTS) DEFAULT_INCLUDES = -I.@am__isrc@ depcomp = $(SHELL) $(top_srcdir)/depcomp am__depfiles_maybe = depfiles @@ -198,6 +198,9 @@ AM_LDFLAGS = `icu-config --ldflags` bin_SCRIPTS 
= \ nwc-toolkit-config +noinst_SCRIPTS = \ + cgi/text-extractor.py + nwc_toolkit_text_extractor_SOURCES = nwc-toolkit-text-extractor.cc nwc_toolkit_text_extractor_LDADD = ../lib/libnwc-toolkit.a nwc_toolkit_text_filter_SOURCES = nwc-toolkit-text-filter.cc @@ -205,7 +208,8 @@ nwc_toolkit_text_filter_LDADD = ../lib/libnwc-toolkit.a nwc_toolkit_unicode_normalizer_SOURCES = nwc-toolkit-unicode-normalizer.cc nwc_toolkit_unicode_normalizer_LDADD = ../lib/libnwc-toolkit.a EXTRA_DIST = \ - nwc-toolkit-config + $(bin_SCRIPTS) \ + $(noinst_SCRIPTS) all: all-am diff --git a/tools/cgi/text-extractor.py b/tools/cgi/text-extractor.py new file mode 100755 index 0000000..7555cbc --- /dev/null +++ b/tools/cgi/text-extractor.py @@ -0,0 +1,429 @@ +#! /usr/bin/python +# -*- coding: utf-8 -*- + +import cgi +import cgitb +import re +import sys + +cgitb.enable() + +USER_AGENT = """Mozilla/5.0 (compatible; nwc-toolkit; http://code.google.com/p/nwc-toolkit/)""" + +AVAILABLE_SCHEMES = frozenset(["http", "https", "ftp"]) + +RESPONSE_HEADER = """Content-Type: text/html; charset=utf-8\n\n""" + +HTML_HEADER = """ + + + + HTML テキスト抽出 + + + + + + + + + + + + +""" + +HTML_FOOTER = """ + +""" + +NAVIGATOR = """ + +""" + +URL_FORM = """
+
+ 入力された URL から HTML をダウンロードしてテキストを抽出します. +
+

+ +

+

+ +

+
+ + +""" + +FILE_FORM = """
+
+ 入力された HTML ファイルからテキストを抽出します. +
+

+ +

+

+ +

+
+ + +""" + +HTML_FORM = """
+
+ 入力された HTML からテキストを抽出します. +
+

+ +

+

+ +

+
+ + +""" + +ERROR_BODY = """
+ テキスト抽出に失敗しました. +
+

+ %s +

+
+""" + +TEXT_BODY = """
+ テキスト抽出に成功しました. +
+ +%s
+
+""" + +def PrintResponseHeader(): + sys.stdout.write(RESPONSE_HEADER) + +def PrintHtmlHeader(): + sys.stdout.write(HTML_HEADER) + +def PrintHtmlFooter(): + sys.stdout.write(HTML_FOOTER) + +def PrintForm(form_value): + if form_value == "url": + sys.stdout.write(NAVIGATOR % ("on", "off", "off")) + sys.stdout.write(URL_FORM) + elif form_value == "file": + sys.stdout.write(NAVIGATOR % ("off", "on", "off")) + sys.stdout.write(FILE_FORM) + elif form_value == "html": + sys.stdout.write(NAVIGATOR % ("off", "off", "on")) + sys.stdout.write(HTML_FORM) + else: + sys.stdout.write(NAVIGATOR % ("on", "off", "off")) + sys.stdout.write(URL_FORM) + +def PrintError(error_message): + sys.stdout.write(ERROR_BODY % (cgi.escape(error_message))) + +def PrintText(text): + import StringIO + string_io = StringIO.StringIO() + lines = text.splitlines() + line_id = 0 + for line in lines: + if not line: + continue + line_id = line_id + 1 + string_io.write(" \n") + string_io.write(" %s\n" % str(line_id)) + string_io.write(" %s\n" % cgi.escape(line)) + string_io.write(" \n") + sys.stdout.write(TEXT_BODY % (string_io.getvalue())) + string_io.close() + +def ExtractTextFromArchive(archive): + import subprocess + sub_process = subprocess.Popen( + "nwc-toolkit-text-extractor --archive --1 --nfkc --filter", + shell = True, stdin = subprocess.PIPE, stdout = subprocess.PIPE) + sub_process.stdin.write(archive) + sub_process.stdin.close() + text = sub_process.stdout.read() + sub_process.stdout.close() + return_code = sub_process.wait() + if return_code != 0: + return None + else: + return text + +def ExtractTextFromDocument(document): + import subprocess + sub_process = subprocess.Popen( + "nwc-toolkit-text-extractor --document --nfkc --filter", + shell = True, stdin = subprocess.PIPE, stdout = subprocess.PIPE) + sub_process.stdin.write(document) + sub_process.stdin.close() + text = sub_process.stdout.read() + sub_process.stdout.close() + return_code = sub_process.wait() + if return_code != 0: + return 
None

def ExtractTextFromDocument(document):
    """Extract text from a single HTML document via nwc-toolkit-text-extractor.

    Runs the external tool in --document mode with NFKC normalization and
    filtering.  Returns the extracted text on success, or None if the tool
    exits with a non-zero status.

    NOTE(review): this supersedes the truncated definition above; defined in
    full so the success path (returning the extracted text) is intact.
    """
    import subprocess
    sub_process = subprocess.Popen(
        "nwc-toolkit-text-extractor --document --nfkc --filter",
        shell = True, stdin = subprocess.PIPE, stdout = subprocess.PIPE)
    # communicate() feeds stdin and drains stdout concurrently, avoiding the
    # pipe-buffer deadlock that sequential write()/read() calls risk when the
    # child produces a lot of output before consuming all of its input.
    text, _ = sub_process.communicate(document)
    if sub_process.returncode != 0:
        return None
    return text

def ExtractTextFromUrl(url_value):
    """Download HTML from url_value and print extracted text (or an error).

    Only schemes listed in AVAILABLE_SCHEMES are accepted.  The downloaded
    response is serialized into the archive format understood by
    nwc-toolkit-text-extractor --archive and handed to ExtractTextFromArchive.
    """
    sys.stdout.write(NAVIGATOR % ("on", "off", "off"))

    import urlparse
    url_parts = urlparse.urlparse(url_value)
    url_scheme = url_parts.scheme.lower()
    if url_scheme not in AVAILABLE_SCHEMES:
        PrintError("指定された scheme(\"%s\")には対応していません." % (
            url_scheme))
        return

    import urllib
    class SimpleUrlOpener(urllib.FancyURLopener):
        # Identify ourselves with the toolkit's user agent, and never prompt
        # for credentials -- a CGI process has no terminal to prompt on.
        version = USER_AGENT
        def __init__(self):
            urllib.FancyURLopener.__init__(self)
        def prompt_user_passwd(self, host, realm):
            return (None, None)
    url_opener = SimpleUrlOpener()
    try:
        response_handle = url_opener.open(url_value)
    # Narrowed from a bare "except:" so SystemExit/KeyboardInterrupt are not
    # swallowed; any network/protocol failure is still reported to the user.
    except Exception:
        PrintError("指定された URL(\"%s\")からのダウンロードに失敗しました." %(
            url_value))
        return
    response_url = response_handle.geturl()
    response_code = response_handle.getcode()
    response_header = str(response_handle.info())
    response_body = response_handle.read()
    # Archive record layout expected by --archive mode:
    # URL, status code, header length, header, body length, body.
    archive = "%s\n%d\n%d\n%s%d\n%s" % (response_url, response_code,
        len(response_header), response_header, len(response_body),
        response_body)

    text = ExtractTextFromArchive(archive)
    if text is None:
        PrintError("HTML の解析に失敗しました.")
    else:
        PrintText(text)

def ExtractTextFromFile(file_value):
    """Extract and print text from an uploaded HTML file's contents."""
    sys.stdout.write(NAVIGATOR % ("off", "on", "off"))
    text = ExtractTextFromDocument(file_value)
    if text is None:
        PrintError("HTML の解析に失敗しました.")
    else:
        PrintText(text)

def ExtractTextFromHtml(html_value):
    """Extract and print text from HTML pasted into the form."""
    sys.stdout.write(NAVIGATOR % ("off", "off", "on"))
    text = ExtractTextFromDocument(html_value)
    if text is None:
        PrintError("HTML の解析に失敗しました.")
    else:
        PrintText(text)

def main(argv):
    """CGI entry point: dispatch on whichever form field was submitted.

    Precedence is url > file > html; with none present, the requested input
    form (or the default URL form) is shown.
    """
    PrintResponseHeader()
    PrintHtmlHeader()
    field_storage = cgi.FieldStorage()
    # has_key() is deprecated (and gone in Python 3); FieldStorage supports
    # the "in" operator, which is the backward-compatible spelling.
    if "url" in field_storage:
        url_value = field_storage.getfirst("url")
        ExtractTextFromUrl(url_value)
    elif "file" in field_storage:
        file_value = field_storage.getfirst("file")
        ExtractTextFromFile(file_value)
    elif "html" in field_storage:
        html_value = field_storage.getfirst("html")
        ExtractTextFromHtml(html_value)
    else:
        form_value = field_storage.getvalue("form", "")
        PrintForm(form_value)
    PrintHtmlFooter()

if __name__ == "__main__":
    main(sys.argv)