hxlquickimporttab (#7): v0.1 started (based on hxlquickimport, #6)

EticaAI · Feb 6, 2021 · bc59ace · bc59ace
1 parent 6dbdb9f
commit bc59ace
Show file tree

Hide file tree

Showing 2 changed files with 389 additions and 2 deletions.
diff --git a/bin/hxlquickimport b/bin/hxlquickimport
@@ -29,8 +29,7 @@
 #
 #  REQUIREMENTS:  - python3
 #                     - libhxl (@see https://pypi.org/project/libhxl/)
-#          BUGS:  - The v0.2-draft is just an draft. It's not implemented even
-#                   minimal functionality.
+#          BUGS:  ---
 #         NOTES:  ---
 #        AUTHOR:  Emerson Rocha <rocha[at]ieee.org>
 #       COMPANY:  EticaAI

diff --git a/bin/hxlquickimporttab b/bin/hxlquickimporttab
@@ -0,0 +1,388 @@
+#!/usr/bin/env python3
+# ==============================================================================
+#
+#          FILE:  hxlquickimporttab
+#
+#         USAGE:  hxlquickimporttab orange-file.tab hxlated-file.hxl.csv
+#                 cat orange-file.tab | hxlquickimporttab > hxlated-file.hxl.csv
+#
+#   DESCRIPTION:  ---
+#
+#       OPTIONS:  ---
+#
+#  REQUIREMENTS:  - python3
+#                     - libhxl (@see https://pypi.org/project/libhxl/)
+#          BUGS:  ---
+#         NOTES:  ---
+#        AUTHOR:  Emerson Rocha <rocha[at]ieee.org>
+#       COMPANY:  EticaAI
+#       LICENSE:  Public Domain dedication
+#                 SPDX-License-Identifier: Unlicense
+#       VERSION:  v0.1
+#       CREATED: 2021-02-06 21:21 UTC v0.1 started (based on hxlquickimport)
+#      REVISION: ---
+# ==============================================================================
+
+import sys
+import os
+import logging
+import argparse
+
+# Do not import hxl, to avoid circular imports
+import hxl.converters
+import hxl.filters
+import hxl.io
+
+import csv
+from slugify import slugify
+
+# In Python2, sys.stdin is a byte stream; in Python3, it's a text stream
+STDIN = sys.stdin.buffer
+
+
+class HXLQuickImportTab:
+    """
+    HXLQuickImportTab reads a dataset whose first row holds plain headers and
+    writes it as CSV with that row replaced by generated HXL hashtags.
+
+    Usage: call make_args_hxlquickimporttab(), then execute() with its result.
+    """
+
+    def __init__(self):
+        """
+        Constructs all the necessary attributes for the HXLQuickImportTab object.
+        """
+        # Both are filled in by make_args_hxlquickimporttab()
+        self.hxlhelper = None
+        self.args = None
+
+        # Posix exit codes
+        self.EXIT_OK = 0
+        self.EXIT_ERROR = 1
+        self.EXIT_SYNTAX = 2
+
+    def make_args_hxlquickimporttab(self):
+        """Create the HXLUtils helper, parse the command line, return args."""
+        self.hxlhelper = HXLUtils()
+        parser = self.hxlhelper.make_args(
+            description=("""
+hxlquickimporttab is a quick (and wrong) way to import
+non-HXL dataset (like an .csv or .xlsx, but requires headers already on the
+first row) without human intervention. It will try to slugify the original
+header and add as +attributefor a base hashtag like #meta.
+The result may be an HXL with valid syntax (that can be used for automated
+testing) but most HXL powered tools would still be human review.
+
+How does it work?
+"[Max Power] Kids: there's three ways to do things; the right way,
+the wrong way and the Max Power way!
+[Bart Simpson] Isn't that the wrong way?
+[Max Power] Yeah, but faster!"
+(via https://www.youtube.com/watch?v=7P0JM3h7IQk)
+
+How to do it the right way?
+Read the documentation on https://hxlstandard.org/.
+(Tip: both HXL Postcards and the hxl-hashtag-chooser are very helpful!)
+            """))
+
+        self.args = parser.parse_args()
+        return self.args
+
+    def execute(self, args,
+                stdin=STDIN, stdout=sys.stdout, stderr=sys.stderr):
+        """
+        Main entrypoint of HXLQuickImportTab: open the input selected by args
+        and convert it with hxlquickimporttab().
+
+        Needs self.hxlhelper, so call make_args_hxlquickimporttab() first.
+        stdout and stderr are accepted but not used here.
+
+        @param args: parsed arguments (from make_args_hxlquickimporttab)
+        @param stdin: stream read when args.infile is empty
+        @returns: self.EXIT_OK
+        """
+        with self.hxlhelper.make_source(args, stdin) as source:
+            self.hxlquickimporttab(source, args, True)
+
+        return self.EXIT_OK
+
+    def hxlquickimporttab(self, hxlated_input, tab_output, is_stdout):
+        """
+        Write hxlated_input as CSV, with its first row replaced by the
+        hashtags from hxlquickimporttab_header(). Output is comma-separated.
+
+        Rows are copied until _get_row() raises or returns a falsy row.
+
+        @param hxlated_input: row source; rows are pulled with _get_row()
+        @param tab_output: parsed arguments; only 'outfile' is read. If it
+                           is empty the CSV goes to sys.stdout, otherwise
+                           the named file is overwritten.
+        @param is_stdout: accepted for compatibility, not used
+        """
+        # Take the destination from the parameter. Reading a module-level
+        # 'args' only works while this file runs as a script.
+        outfile = getattr(tab_output, 'outfile', None)
+
+        header_original = hxlated_input._get_row()
+        header_new = self.hxlquickimporttab_header(header_original)
+
+        if not outfile:
+            self._write_rows(csv.writer(sys.stdout), header_new, hxlated_input)
+            return
+
+        # 'w' truncates on open; newline='' is what the csv module asks for
+        with open(outfile, 'w', newline='') as new_txt:
+            self._write_rows(csv.writer(new_txt), header_new, hxlated_input)
+
+    def _write_rows(self, txt_writer, header_new, hxlated_input):
+        """Write header_new, then every row left in hxlated_input."""
+        txt_writer.writerow(header_new)
+        while True:
+            try:
+                line = hxlated_input._get_row()
+            except Exception:
+                # Best effort: any read error ends the copy (presumably
+                # StopIteration at end of input - TODO confirm).
+                break
+            if not line:
+                # A falsy row also ends the copy, as in the previous loops
+                break
+            txt_writer.writerow(line)
+
+    def hxlquickimporttab_header(self, hxlated_header, basehashtag="#item"):
+        """
+        hxlquickimporttab_header is a hackish way to HXLate a CSV-like dataset
+        without human intervention.
+
+        How does it work? Each original header is replaced by the base
+        hashtag plus the slugified original header as attribute, so
+
+        ID_REGISTRO        -> #item+id_registro
+        NACIONALIDAD       -> #item+nacionalidad
+
+        The current version will not avoid 'conflicts' with HXL data types like
+
+        BOOL               -> #item+bool
+        number             -> #item+number
+        phone              -> #item+phone
+
+        @param hxlated_header: list of header strings; modified in place
+        @param basehashtag: hashtag put in front of every attribute
+        @returns: the same list object, now holding the hashtags
+        """
+        for idx, label in enumerate(hxlated_header):
+            hxlated_header[idx] = basehashtag + '+' \
+                + slugify(label, separator="_")
+
+        return hxlated_header
+
+
+class HXLUtils:
+    """
+    HXLUtils contains functions from the Console scripts of libhxl-python
+    (HXLStandard/libhxl-python/blob/master/hxl/scripts.py) with a few changes
+    so they can be used as a class (and have one single place to change).
+    Last update on this class was 2021-01-25.
+
+    Author: David Megginson
+    License: Public Domain
+    """
+
+    def __init__(self):
+        """Set up the logger for this module and the exit code constants."""
+        self.logger = logging.getLogger(__name__)
+
+        # Posix exit codes
+        self.EXIT_OK = 0
+        self.EXIT_ERROR = 1
+        self.EXIT_SYNTAX = 2
+
+    def make_args(self, description, hxl_output=True):
+        """Set up parser with default arguments.
+        @param description: usage description to show
+        @param hxl_output: if True (default), include options for HXL output.
+        @returns: an argument parser, partly set up.
+        """
+        parser = argparse.ArgumentParser(description=description)
+        parser.add_argument(
+            'infile',
+            help='HXL file to read (if omitted, use standard input).',
+            nargs='?'
+        )
+        if hxl_output:
+            parser.add_argument(
+                'outfile',
+                help='HXL file to write (if omitted, use standard output).',
+                nargs='?'
+            )
+        parser.add_argument(
+            '--sheet',
+            help='Select sheet from a workbook (1 is first sheet)',
+            metavar='number',
+            type=int,
+            nargs='?'
+        )
+        parser.add_argument(
+            '--selector',
+            help='JSONPath expression for starting point in JSON input',
+            metavar='path',
+            nargs='?'
+        )
+        parser.add_argument(
+            '--http-header',
+            help='Custom HTTP header to send with request',
+            metavar='header',
+            action='append'
+        )
+        if hxl_output:
+            parser.add_argument(
+                '--remove-headers',
+                help='Strip text headers from the CSV output',
+                action='store_const',
+                const=True,
+                default=False
+            )
+            parser.add_argument(
+                '--strip-tags',
+                help='Strip HXL tags from the CSV output',
+                action='store_const',
+                const=True,
+                default=False
+            )
+        parser.add_argument(
+            "--ignore-certs",
+            help="Don't verify SSL connections (useful for self-signed)",
+            action='store_const',
+            const=True,
+            default=False
+        )
+        parser.add_argument(
+            '--log',
+            help='Set minimum logging level',
+            metavar='debug|info|warning|error|critical|none',
+            choices=['debug', 'info', 'warning', 'error', 'critical'],  # no 'none'
+            default='error'
+        )
+        return parser
+
+    def add_queries_arg(
+        self,
+        parser,
+        help='Apply only to rows matching at least one query.'
+    ):  # adds a repeatable -q/--query option to parser and returns parser
+        parser.add_argument(
+            '-q',
+            '--query',
+            help=help,
+            metavar='<tagspec><op><value>',
+            action='append'
+        )
+        return parser
+
+    def do_common_args(self, args):
+        """Process standard args: configure logging at the args.log level."""
+        logging.basicConfig(
+            format='%(levelname)s (%(name)s): %(message)s',
+            level=args.log.upper())
+
+    def make_source(self, args, stdin=STDIN):
+        """Create a HXL input source: make_input() passed to hxl.io.data()."""
+
+        # construct the input object
+        input = self.make_input(args, stdin)
+        return hxl.io.data(input)
+
+    def make_input(self, args, stdin=sys.stdin, url_or_filename=None):
+        """Create an input object for url_or_filename, args.infile or stdin."""
+
+        if url_or_filename is None:
+            url_or_filename = args.infile
+
+        # sheet index (--sheet counts from 1; made zero-based here)
+        sheet_index = args.sheet
+        if sheet_index is not None:
+            sheet_index -= 1
+
+        # JSONPath selector
+        selector = args.selector
+
+        http_headers = self.make_headers(args)
+
+        return hxl.io.make_input(
+            url_or_filename or stdin,
+            sheet_index=sheet_index,
+            selector=selector,
+            allow_local=True,
+            http_headers=http_headers,
+            verify_ssl=(not args.ignore_certs)
+        )
+
+    def make_output(self, args, stdout=sys.stdout):
+        """Create an output: FileOutput for args.outfile, else StreamOutput."""
+        if args.outfile:
+            return FileOutput(args.outfile)
+        else:
+            return StreamOutput(stdout)
+
+    def make_headers(self, args):
+        """Collect custom HTTP headers (env HXL_HTTP_HEADER, --http-header)."""
+        header_strings = []
+        header = os.environ.get("HXL_HTTP_HEADER")
+        if header is not None:
+            header_strings.append(header)
+        if args.http_header is not None:
+            header_strings += args.http_header
+        http_headers = {}
+        for header in header_strings:
+            parts = header.partition(':')
+            http_headers[parts[0].strip()] = parts[2].strip()
+        return http_headers
+
+
+class FileOutput(object):
+    """
+    Context manager that opens filename for writing ('w') and closes it on
+    exit. Based on libhxl-python with no changes (last update 2021-01-25).
+
+    Author: David Megginson
+    License: Public Domain
+    """
+
+    def __init__(self, filename):
+        self.output = open(filename, 'w')
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, value, type, traceback):
+        self.output.close()
+
+
+class StreamOutput(object):
+    """
+    Context manager around an already open stream: write() forwards to it and
+    exiting does not close it. From libhxl-python, unchanged (2021-01-25).
+
+    Author: David Megginson
+    License: Public Domain
+    """
+
+    def __init__(self, output):
+        self.output = output
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, value, type, traceback):
+        pass
+
+    def write(self, s):
+        self.output.write(s)
+
+
+if __name__ == "__main__":
+    # Script entry point: parse the command line, run the conversion and hand
+    # the value returned by execute() to the shell as the exit status.
+    hxlquickimporttab = HXLQuickImportTab()
+    args = hxlquickimporttab.make_args_hxlquickimporttab()
+    sys.exit(hxlquickimporttab.execute(args))