-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Browse files
Browse the repository at this point in the history
- Loading branch information
Showing
2 changed files
with
389 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,388 @@ | ||
#!/usr/bin/env python3 | ||
# ============================================================================== | ||
# | ||
# FILE: hxlquickimporttab | ||
# | ||
# USAGE: hxlquickimporttab orange-file.tab hxlated-file.hxl.csv | ||
# cat orange-file.tab | hxlquickimporttab > hxlated-file.hxl.csv | ||
# | ||
# DESCRIPTION: --- | ||
# | ||
# OPTIONS: --- | ||
# | ||
# REQUIREMENTS: - python3 | ||
# - libhxl (@see https://pypi.org/project/libhxl/) | ||
# BUGS: --- | ||
# NOTES: --- | ||
# AUTHOR: Emerson Rocha <rocha[at]ieee.org> | ||
# COMPANY: EticaAI | ||
# LICENSE: Public Domain dedication | ||
# SPDX-License-Identifier: Unlicense | ||
# VERSION: v0.1 | ||
# CREATED: 2021-02-06 21:21 UTC v0.1 started (based on hxlquickimport) | ||
# REVISION: --- | ||
# ============================================================================== | ||
|
||
import sys | ||
import os | ||
import logging | ||
import argparse | ||
|
||
# Do not import hxl, to avoid circular imports | ||
import hxl.converters | ||
import hxl.filters | ||
import hxl.io | ||
|
||
import csv | ||
from slugify import slugify | ||
|
||
# In Python2, sys.stdin is a byte stream; in Python3, it's a text stream | ||
STDIN = sys.stdin.buffer | ||
|
||
|
||
class HXLQuickImportTab: | ||
""" | ||
HXLQuickImportTab is a classe to export already HXLated data in the format | ||
example. | ||
""" | ||
|
||
def __init__(self): | ||
""" | ||
Constructs all the necessary attributes for the HXLQuickImportTab object. | ||
""" | ||
self.hxlhelper = None | ||
self.args = None | ||
|
||
# Posix exit codes | ||
self.EXIT_OK = 0 | ||
self.EXIT_ERROR = 1 | ||
self.EXIT_SYNTAX = 2 | ||
|
||
def make_args_hxlquickimporttab(self): | ||
|
||
self.hxlhelper = HXLUtils() | ||
parser = self.hxlhelper.make_args( | ||
description=(""" | ||
hxlquickimporttab is a quick (and wrong) way to import | ||
non-HXL dataset (like an .csv or .xlsx, but requires headers already on the | ||
first row) without human intervention. It will try to slugify the original | ||
header and add as +attributefor a base hashtag like #meta. | ||
The result may be an HXL with valid syntax (that can be used for automated | ||
testing) but most HXL powered tools would still be human review. | ||
How does it work? | ||
"[Max Power] Kids: there's three ways to do things; the right way, | ||
the wrong way and the Max Power way! | ||
[Bart Simpson] Isn't that the wrong way? | ||
[Max Power] Yeah, but faster!" | ||
(via https://www.youtube.com/watch?v=7P0JM3h7IQk) | ||
How to do it the right way? | ||
Read the documentation on https://hxlstandard.org/. | ||
(Tip: both HXL Postcards and the hxl-hashtag-chooser are very helpful!) | ||
""")) | ||
|
||
self.args = parser.parse_args() | ||
return self.args | ||
|
||
def execute(self, args, | ||
stdin=STDIN, stdout=sys.stdout, stderr=sys.stderr): | ||
""" | ||
The do_example_output is the main entrypoint of HXLQuickImportTab. When | ||
called will convert the HXL source to example format. | ||
""" | ||
|
||
# NOTE: the next lines, in fact, only generate an csv outut. So you | ||
# can use as starting point. | ||
with self.hxlhelper.make_source(args, stdin) as source: | ||
self.hxlquickimporttab(source, args, True) | ||
|
||
return self.EXIT_OK | ||
|
||
def hxlquickimporttab(self, hxlated_input, tab_output, is_stdout): | ||
""" | ||
hic sunt dracones | ||
(__) ) | ||
(..) /|\\ | ||
(o_o) / | \\ | ||
___) \\/,-|,-\\| | ||
//,-/_\\ ) ' ' | ||
(//,-'\\| | ||
( ( . \\_ | ||
gnv `._\\(___`. | ||
'---' _)/ | ||
`-' | ||
""" | ||
|
||
header_original = hxlated_input._get_row() | ||
|
||
header_new = self.hxlquickimporttab_header(header_original) | ||
|
||
if not args.outfile: | ||
# txt_writer = csv.writer(sys.stdout, delimiter='\t') | ||
txt_writer = csv.writer(sys.stdout) | ||
txt_writer.writerow(header_new) | ||
# for line in hxlated_input: | ||
line = hxlated_input._get_row() | ||
|
||
while line: | ||
txt_writer.writerow(line) | ||
try: | ||
line = hxlated_input._get_row() | ||
except: | ||
line = False | ||
|
||
else: | ||
|
||
tab_output_cleanup = open(args.outfile, 'w') | ||
tab_output_cleanup.truncate() | ||
tab_output_cleanup.close() | ||
|
||
with open(args.outfile, 'a') as new_txt: | ||
# txt_writer = csv.writer(new_txt, delimiter='\t') | ||
txt_writer = csv.writer(new_txt) | ||
txt_writer.writerow(header_new) | ||
|
||
line = hxlated_input._get_row() | ||
|
||
while line: | ||
txt_writer.writerow(line) | ||
try: | ||
line = hxlated_input._get_row() | ||
except: | ||
line = False | ||
|
||
def hxlquickimporttab_header(self, hxlated_header, basehashtag="#item"): | ||
""" | ||
hhxlquickimporttab_header is a hackish to HXLate an CSV-like dataset | ||
without human intervention. | ||
How it works? It replaces the original header with the base | ||
hashtag and then slugify the original header as attribute, so | ||
ID_REGISTRO -> #item+id_registro | ||
NACIONALIDAD -> #item+nacionalidad | ||
The current version will not avoid 'conflicts' with HXL data types like | ||
BOOL -> #item+bool | ||
number -> #item+number | ||
phone -> #item+phone | ||
""" | ||
|
||
for idx, a in enumerate(hxlated_header): | ||
hxlated_header[idx] = basehashtag + '+' \ | ||
+ slugify(hxlated_header[idx], separator="_") | ||
|
||
return hxlated_header | ||
|
||
|
||
class HXLUtils: | ||
""" | ||
HXLUtils contains functions from the Console scripts of libhxl-python | ||
(HXLStandard/libhxl-python/blob/master/hxl/scripts.py) with few changes | ||
to be used as class (and have one single place to change). | ||
Last update on this class was 2021-01-25. | ||
Author: David Megginson | ||
License: Public Domain | ||
""" | ||
|
||
def __init__(self): | ||
|
||
self.logger = logging.getLogger(__name__) | ||
|
||
# Posix exit codes | ||
self.EXIT_OK = 0 | ||
self.EXIT_ERROR = 1 | ||
self.EXIT_SYNTAX = 2 | ||
|
||
def make_args(self, description, hxl_output=True): | ||
"""Set up parser with default arguments. | ||
@param description: usage description to show | ||
@param hxl_output: if True (default), include options for HXL output. | ||
@returns: an argument parser, partly set up. | ||
""" | ||
parser = argparse.ArgumentParser(description=description) | ||
parser.add_argument( | ||
'infile', | ||
help='HXL file to read (if omitted, use standard input).', | ||
nargs='?' | ||
) | ||
if hxl_output: | ||
parser.add_argument( | ||
'outfile', | ||
help='HXL file to write (if omitted, use standard output).', | ||
nargs='?' | ||
) | ||
parser.add_argument( | ||
'--sheet', | ||
help='Select sheet from a workbook (1 is first sheet)', | ||
metavar='number', | ||
type=int, | ||
nargs='?' | ||
) | ||
parser.add_argument( | ||
'--selector', | ||
help='JSONPath expression for starting point in JSON input', | ||
metavar='path', | ||
nargs='?' | ||
) | ||
parser.add_argument( | ||
'--http-header', | ||
help='Custom HTTP header to send with request', | ||
metavar='header', | ||
action='append' | ||
) | ||
if hxl_output: | ||
parser.add_argument( | ||
'--remove-headers', | ||
help='Strip text headers from the CSV output', | ||
action='store_const', | ||
const=True, | ||
default=False | ||
) | ||
parser.add_argument( | ||
'--strip-tags', | ||
help='Strip HXL tags from the CSV output', | ||
action='store_const', | ||
const=True, | ||
default=False | ||
) | ||
parser.add_argument( | ||
"--ignore-certs", | ||
help="Don't verify SSL connections (useful for self-signed)", | ||
action='store_const', | ||
const=True, | ||
default=False | ||
) | ||
parser.add_argument( | ||
'--log', | ||
help='Set minimum logging level', | ||
metavar='debug|info|warning|error|critical|none', | ||
choices=['debug', 'info', 'warning', 'error', 'critical'], | ||
default='error' | ||
) | ||
return parser | ||
|
||
def add_queries_arg( | ||
self, | ||
parser, | ||
help='Apply only to rows matching at least one query.' | ||
): | ||
parser.add_argument( | ||
'-q', | ||
'--query', | ||
help=help, | ||
metavar='<tagspec><op><value>', | ||
action='append' | ||
) | ||
return parser | ||
|
||
def do_common_args(self, args): | ||
"""Process standard args""" | ||
logging.basicConfig( | ||
format='%(levelname)s (%(name)s): %(message)s', | ||
level=args.log.upper()) | ||
|
||
def make_source(self, args, stdin=STDIN): | ||
"""Create a HXL input source.""" | ||
|
||
# construct the input object | ||
input = self.make_input(args, stdin) | ||
return hxl.io.data(input) | ||
|
||
def make_input(self, args, stdin=sys.stdin, url_or_filename=None): | ||
"""Create an input object""" | ||
|
||
if url_or_filename is None: | ||
url_or_filename = args.infile | ||
|
||
# sheet index | ||
sheet_index = args.sheet | ||
if sheet_index is not None: | ||
sheet_index -= 1 | ||
|
||
# JSONPath selector | ||
selector = args.selector | ||
|
||
http_headers = self.make_headers(args) | ||
|
||
return hxl.io.make_input( | ||
url_or_filename or stdin, | ||
sheet_index=sheet_index, | ||
selector=selector, | ||
allow_local=True, | ||
http_headers=http_headers, | ||
verify_ssl=(not args.ignore_certs) | ||
) | ||
|
||
def make_output(self, args, stdout=sys.stdout): | ||
"""Create an output stream.""" | ||
if args.outfile: | ||
return FileOutput(args.outfile) | ||
else: | ||
return StreamOutput(stdout) | ||
|
||
def make_headers(self, args): | ||
# get custom headers | ||
header_strings = [] | ||
header = os.environ.get("HXL_HTTP_HEADER") | ||
if header is not None: | ||
header_strings.append(header) | ||
if args.http_header is not None: | ||
header_strings += args.http_header | ||
http_headers = {} | ||
for header in header_strings: | ||
parts = header.partition(':') | ||
http_headers[parts[0].strip()] = parts[2].strip() | ||
return http_headers | ||
|
||
|
||
class FileOutput(object): | ||
""" | ||
FileOutput contains is based on libhxl-python with no changes.. | ||
Last update on this class was 2021-01-25. | ||
Author: David Megginson | ||
License: Public Domain | ||
""" | ||
|
||
def __init__(self, filename): | ||
self.output = open(filename, 'w') | ||
|
||
def __enter__(self): | ||
return self | ||
|
||
def __exit__(self, value, type, traceback): | ||
self.output.close() | ||
|
||
|
||
class StreamOutput(object): | ||
""" | ||
StreamOutput contains is based on libhxl-python with no changes.. | ||
Last update on this class was 2021-01-25. | ||
Author: David Megginson | ||
License: Public Domain | ||
""" | ||
|
||
def __init__(self, output): | ||
self.output = output | ||
|
||
def __enter__(self): | ||
return self | ||
|
||
def __exit__(self, value, type, traceback): | ||
pass | ||
|
||
def write(self, s): | ||
self.output.write(s) | ||
|
||
|
||
if __name__ == "__main__": | ||
|
||
hxlquickimporttab = HXLQuickImportTab() | ||
args = hxlquickimporttab.make_args_hxlquickimporttab() | ||
|
||
hxlquickimporttab.execute(args) |