Skip to content

Commit

Permalink
hxlquickimporttab (#7): v0.1 started (based on hxlquickimport, #6)
Browse files Browse the repository at this point in the history
  • Loading branch information
fititnt committed Feb 6, 2021
1 parent 6dbdb9f commit bc59ace
Show file tree
Hide file tree
Showing 2 changed files with 389 additions and 2 deletions.
3 changes: 1 addition & 2 deletions bin/hxlquickimport
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,7 @@
#
# REQUIREMENTS: - python3
# - libhxl (@see https://pypi.org/project/libhxl/)
# BUGS: - The v0.2-draft is just an draft. It's not implemented even
# minimal functionality.
# BUGS: ---
# NOTES: ---
# AUTHOR: Emerson Rocha <rocha[at]ieee.org>
# COMPANY: EticaAI
Expand Down
388 changes: 388 additions & 0 deletions bin/hxlquickimporttab
Original file line number Diff line number Diff line change
@@ -0,0 +1,388 @@
#!/usr/bin/env python3
# ==============================================================================
#
# FILE: hxlquickimporttab
#
# USAGE: hxlquickimporttab orange-file.tab hxlated-file.hxl.csv
# cat orange-file.tab | hxlquickimporttab > hxlated-file.hxl.csv
#
# DESCRIPTION: ---
#
# OPTIONS: ---
#
# REQUIREMENTS: - python3
# - libhxl (@see https://pypi.org/project/libhxl/)
# BUGS: ---
# NOTES: ---
# AUTHOR: Emerson Rocha <rocha[at]ieee.org>
# COMPANY: EticaAI
# LICENSE: Public Domain dedication
# SPDX-License-Identifier: Unlicense
# VERSION: v0.1
# CREATED: 2021-02-06 21:21 UTC v0.1 started (based on hxlquickimport)
# REVISION: ---
# ==============================================================================

import sys
import os
import logging
import argparse

# Do not import hxl, to avoid circular imports
import hxl.converters
import hxl.filters
import hxl.io

import csv
from slugify import slugify

# In Python2, sys.stdin is a byte stream; in Python3, it's a text stream
STDIN = sys.stdin.buffer


class HXLQuickImportTab:
"""
HXLQuickImportTab is a classe to export already HXLated data in the format
example.
"""

def __init__(self):
"""
Constructs all the necessary attributes for the HXLQuickImportTab object.
"""
self.hxlhelper = None
self.args = None

# Posix exit codes
self.EXIT_OK = 0
self.EXIT_ERROR = 1
self.EXIT_SYNTAX = 2

def make_args_hxlquickimporttab(self):

self.hxlhelper = HXLUtils()
parser = self.hxlhelper.make_args(
description=("""
hxlquickimporttab is a quick (and wrong) way to import
non-HXL dataset (like an .csv or .xlsx, but requires headers already on the
first row) without human intervention. It will try to slugify the original
header and add as +attributefor a base hashtag like #meta.
The result may be an HXL with valid syntax (that can be used for automated
testing) but most HXL powered tools would still be human review.
How does it work?
"[Max Power] Kids: there's three ways to do things; the right way,
the wrong way and the Max Power way!
[Bart Simpson] Isn't that the wrong way?
[Max Power] Yeah, but faster!"
(via https://www.youtube.com/watch?v=7P0JM3h7IQk)
How to do it the right way?
Read the documentation on https://hxlstandard.org/.
(Tip: both HXL Postcards and the hxl-hashtag-chooser are very helpful!)
"""))

self.args = parser.parse_args()
return self.args

def execute(self, args,
stdin=STDIN, stdout=sys.stdout, stderr=sys.stderr):
"""
The do_example_output is the main entrypoint of HXLQuickImportTab. When
called will convert the HXL source to example format.
"""

# NOTE: the next lines, in fact, only generate an csv outut. So you
# can use as starting point.
with self.hxlhelper.make_source(args, stdin) as source:
self.hxlquickimporttab(source, args, True)

return self.EXIT_OK

def hxlquickimporttab(self, hxlated_input, tab_output, is_stdout):
"""
hic sunt dracones
(__) )
(..) /|\\
(o_o) / | \\
___) \\/,-|,-\\|
//,-/_\\ ) ' '
(//,-'\\|
( ( . \\_
gnv `._\\(___`.
'---' _)/
`-'
"""

header_original = hxlated_input._get_row()

header_new = self.hxlquickimporttab_header(header_original)

if not args.outfile:
# txt_writer = csv.writer(sys.stdout, delimiter='\t')
txt_writer = csv.writer(sys.stdout)
txt_writer.writerow(header_new)
# for line in hxlated_input:
line = hxlated_input._get_row()

while line:
txt_writer.writerow(line)
try:
line = hxlated_input._get_row()
except:
line = False

else:

tab_output_cleanup = open(args.outfile, 'w')
tab_output_cleanup.truncate()
tab_output_cleanup.close()

with open(args.outfile, 'a') as new_txt:
# txt_writer = csv.writer(new_txt, delimiter='\t')
txt_writer = csv.writer(new_txt)
txt_writer.writerow(header_new)

line = hxlated_input._get_row()

while line:
txt_writer.writerow(line)
try:
line = hxlated_input._get_row()
except:
line = False

def hxlquickimporttab_header(self, hxlated_header, basehashtag="#item"):
"""
hhxlquickimporttab_header is a hackish to HXLate an CSV-like dataset
without human intervention.
How it works? It replaces the original header with the base
hashtag and then slugify the original header as attribute, so
ID_REGISTRO -> #item+id_registro
NACIONALIDAD -> #item+nacionalidad
The current version will not avoid 'conflicts' with HXL data types like
BOOL -> #item+bool
number -> #item+number
phone -> #item+phone
"""

for idx, a in enumerate(hxlated_header):
hxlated_header[idx] = basehashtag + '+' \
+ slugify(hxlated_header[idx], separator="_")

return hxlated_header


class HXLUtils:
"""
HXLUtils contains functions from the Console scripts of libhxl-python
(HXLStandard/libhxl-python/blob/master/hxl/scripts.py) with few changes
to be used as class (and have one single place to change).
Last update on this class was 2021-01-25.
Author: David Megginson
License: Public Domain
"""

def __init__(self):

self.logger = logging.getLogger(__name__)

# Posix exit codes
self.EXIT_OK = 0
self.EXIT_ERROR = 1
self.EXIT_SYNTAX = 2

def make_args(self, description, hxl_output=True):
"""Set up parser with default arguments.
@param description: usage description to show
@param hxl_output: if True (default), include options for HXL output.
@returns: an argument parser, partly set up.
"""
parser = argparse.ArgumentParser(description=description)
parser.add_argument(
'infile',
help='HXL file to read (if omitted, use standard input).',
nargs='?'
)
if hxl_output:
parser.add_argument(
'outfile',
help='HXL file to write (if omitted, use standard output).',
nargs='?'
)
parser.add_argument(
'--sheet',
help='Select sheet from a workbook (1 is first sheet)',
metavar='number',
type=int,
nargs='?'
)
parser.add_argument(
'--selector',
help='JSONPath expression for starting point in JSON input',
metavar='path',
nargs='?'
)
parser.add_argument(
'--http-header',
help='Custom HTTP header to send with request',
metavar='header',
action='append'
)
if hxl_output:
parser.add_argument(
'--remove-headers',
help='Strip text headers from the CSV output',
action='store_const',
const=True,
default=False
)
parser.add_argument(
'--strip-tags',
help='Strip HXL tags from the CSV output',
action='store_const',
const=True,
default=False
)
parser.add_argument(
"--ignore-certs",
help="Don't verify SSL connections (useful for self-signed)",
action='store_const',
const=True,
default=False
)
parser.add_argument(
'--log',
help='Set minimum logging level',
metavar='debug|info|warning|error|critical|none',
choices=['debug', 'info', 'warning', 'error', 'critical'],
default='error'
)
return parser

def add_queries_arg(
self,
parser,
help='Apply only to rows matching at least one query.'
):
parser.add_argument(
'-q',
'--query',
help=help,
metavar='<tagspec><op><value>',
action='append'
)
return parser

def do_common_args(self, args):
"""Process standard args"""
logging.basicConfig(
format='%(levelname)s (%(name)s): %(message)s',
level=args.log.upper())

def make_source(self, args, stdin=STDIN):
"""Create a HXL input source."""

# construct the input object
input = self.make_input(args, stdin)
return hxl.io.data(input)

def make_input(self, args, stdin=sys.stdin, url_or_filename=None):
"""Create an input object"""

if url_or_filename is None:
url_or_filename = args.infile

# sheet index
sheet_index = args.sheet
if sheet_index is not None:
sheet_index -= 1

# JSONPath selector
selector = args.selector

http_headers = self.make_headers(args)

return hxl.io.make_input(
url_or_filename or stdin,
sheet_index=sheet_index,
selector=selector,
allow_local=True,
http_headers=http_headers,
verify_ssl=(not args.ignore_certs)
)

def make_output(self, args, stdout=sys.stdout):
"""Create an output stream."""
if args.outfile:
return FileOutput(args.outfile)
else:
return StreamOutput(stdout)

def make_headers(self, args):
# get custom headers
header_strings = []
header = os.environ.get("HXL_HTTP_HEADER")
if header is not None:
header_strings.append(header)
if args.http_header is not None:
header_strings += args.http_header
http_headers = {}
for header in header_strings:
parts = header.partition(':')
http_headers[parts[0].strip()] = parts[2].strip()
return http_headers


class FileOutput(object):
"""
FileOutput contains is based on libhxl-python with no changes..
Last update on this class was 2021-01-25.
Author: David Megginson
License: Public Domain
"""

def __init__(self, filename):
self.output = open(filename, 'w')

def __enter__(self):
return self

def __exit__(self, value, type, traceback):
self.output.close()


class StreamOutput(object):
"""
StreamOutput contains is based on libhxl-python with no changes..
Last update on this class was 2021-01-25.
Author: David Megginson
License: Public Domain
"""

def __init__(self, output):
self.output = output

def __enter__(self):
return self

def __exit__(self, value, type, traceback):
pass

def write(self, s):
self.output.write(s)


if __name__ == "__main__":

hxlquickimporttab = HXLQuickImportTab()
args = hxlquickimporttab.make_args_hxlquickimporttab()

hxlquickimporttab.execute(args)

0 comments on commit bc59ace

Please sign in to comment.