Skip to content

Commit

Permalink
improve and standardize CSV output (#496)
Browse files Browse the repository at this point in the history
* improve and standardize CSV output

* add xmltocsv function and process like other formats

* scrap txttocsv
  • Loading branch information
adbar authored Feb 6, 2024
1 parent 67ff3a7 commit 337267a
Show file tree
Hide file tree
Showing 4 changed files with 75 additions and 40 deletions.
47 changes: 34 additions & 13 deletions tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,25 +159,44 @@ def test_input():
assert testresult != 'A\u0308ffin' and testresult == 'Äffin'


def test_txttocsv():
    """Check the tab-separated line produced by utils.txttocsv."""
    doc = Document()
    # blank document: unset metadata fields show up as the string "None",
    # while empty text and comments stay empty
    blank_line = 'None\tNone\tNone\tNone\tNone\tNone\t\t\tNone\tNone\n'
    assert utils.txttocsv('', '', doc) == blank_line

    # fill in some metadata on the same object
    metadata = {
        'title': 'Test title',
        'url': 'https://example.org',
        'hostname': 'example.org',
        'id': '1',
        'license': 'CC BY-SA',
        'image': 'https://example.org/image.jpg',
        'pagetype': 'article',
    }
    for attribute, value in metadata.items():
        setattr(doc, attribute, value)

    # once an id is set, it is expected as an additional leading column
    filled_line = '1\thttps://example.org\tNone\texample.org\tTest title\thttps://example.org/image.jpg\tNone\tTest text\tTest comment\tCC BY-SA\tarticle\n'
    assert utils.txttocsv('Test text', 'Test comment', doc) == filled_line
def test_xmltocsv():
    """Check the CSV (tab-separated) serialization of extracted documents.

    Covers xml.xmltocsv() on an empty and on a populated Document, then the
    end-to-end path through extract(..., output_format='csv').
    """
    doc = Document()
    doc.body = etree.fromstring('<xml/>')
    doc.commentsbody = etree.fromstring('<xml/>')
    # an empty document is expected to yield "null" in each of the 11 columns
    assert xml.xmltocsv(doc, False) == 'null\tnull\tnull\tnull\tnull\tnull\tnull\tnull\tnull\tnull\tnull\r\n'

    doc.title = 'Test title'
    doc.url = 'https://example.org'
    doc.hostname = 'example.org'
    doc.id = '1'
    doc.license = 'CC BY-SA'
    doc.image = 'https://example.org/image.jpg'
    doc.pagetype = 'article'
    text = 'Test text'
    comments = 'Test comment'
    doc.body = etree.fromstring(f'<p>{text}</p>')
    doc.commentsbody = etree.fromstring(f'<p>{comments}</p>')

    # expected column order: url, id, fingerprint, hostname, title, image,
    # date, text, comments, license, pagetype
    target = 'https://example.org\t1\tnull\texample.org\tTest title\thttps://example.org/image.jpg\tnull\tTest text\tTest comment\tCC BY-SA\tarticle\r\n'

    assert xml.xmltocsv(doc, False) == target

    mystring = '<html><body><p>ÄÄÄÄÄÄÄÄÄÄÄÄÄÄ</p></body></html>'
    assert extract(mystring, output_format='csv', config=ZERO_CONFIG) is not None
    # Only the "null" + CRLF ending is checked: a second assertion expecting
    # '\tNone\n' on the same output contradicted it (a string cannot end
    # with both) and has been removed.
    result = extract(mystring, output_format='csv', include_comments=False, config=ZERO_CONFIG)
    assert result.endswith('\tnull\r\n')


def test_tojson():
    """Check that JSON output is produced with and without comments."""
    html = '<html><body><p>ÄÄÄÄÄÄÄÄÄÄÄÄÄÄ</p></body></html>'

    # default settings: closing brace plus fingerprint and language keys
    with_comments = extract(html, output_format='json', config=ZERO_CONFIG)
    assert with_comments.endswith('}')
    assert '"fingerprint":' in with_comments
    assert '"language":' in with_comments

    # comments disabled: the output must still end with a closing brace
    without_comments = extract(
        html, output_format='json', include_comments=False, config=ZERO_CONFIG
    )
    assert without_comments.endswith('}')


def test_python_output():
    """Check the dictionary returned by bare_extraction(as_dict=True)."""
    html = '<html><body><p>ÄÄÄÄÄÄÄÄÄÄÄÄÄÄ</p></body></html>'
    extracted = bare_extraction(html, config=ZERO_CONFIG, as_dict=True)
    # the result has to be a plain dict holding exactly 20 entries
    assert isinstance(extracted, dict)
    assert len(extracted) == 20

Expand Down Expand Up @@ -1206,7 +1225,9 @@ def test_lang_detection():
test_extraction_options()
test_precision_recall()
test_baseline()
test_txttocsv()
test_xmltocsv()
test_tojson()
test_python_output()
test_external()
test_tei()
test_table_processing()
Expand Down
15 changes: 5 additions & 10 deletions trafilatura/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,10 @@
process_node, prune_unwanted_nodes, tree_cleaning)
from .metadata import Document, extract_metadata
from .settings import BASIC_CLEAN_XPATH, DEFAULT_CONFIG, TAG_CATALOG, use_config
from .utils import is_image_file, load_html, normalize_unicode, trim, txttocsv, FORMATTING_PROTECTED
from .xml import (build_json_output, build_tei_output, build_xml_output,
control_xml_output, remove_empty_elements, strip_double_tags,
xmltotxt)
from .utils import (is_image_file, load_html, normalize_unicode, trim,
FORMATTING_PROTECTED)
from .xml import (build_json_output, build_tei_output, build_xml_output, control_xml_output,
remove_empty_elements, strip_double_tags, xmltotxt, xmltocsv)
from .xpaths import (BODY_XPATH, COMMENTS_DISCARD_XPATH, COMMENTS_XPATH,
DISCARD_IMAGE_ELEMENTS, OVERALL_DISCARD_XPATH,
PAYWALL_DISCARD_XPATH, PRECISION_DISCARD_XPATH,
Expand Down Expand Up @@ -824,12 +824,7 @@ def determine_returnstring(document, output_format, include_formatting, tei_vali
returnstring = control_xml_output(output, output_format, tei_validation, document)
# CSV
elif output_format == 'csv':
posttext = xmltotxt(document.body, include_formatting)
if document.commentsbody is not None:
commentstext = xmltotxt(document.commentsbody, include_formatting)
else:
commentstext = ''
returnstring = txttocsv(posttext, commentstext, document)
returnstring = xmltocsv(document, include_formatting)
# JSON
elif output_format == 'json':
returnstring = build_json_output(document)
Expand Down
18 changes: 1 addition & 17 deletions trafilatura/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
## This file is available from https://github.com/adbar/trafilatura
## under GNU GPL v3 license

# import csv
import logging
import re
import warnings
Expand Down Expand Up @@ -34,6 +33,7 @@
# response types
from urllib3.response import HTTPResponse


LOGGER = logging.getLogger(__name__)

UNICODE_ALIASES = {'utf-8', 'utf_8'}
Expand Down Expand Up @@ -240,22 +240,6 @@ def load_html(htmlobject):
return tree


def txttocsv(text, comments, docmeta):
    """Build one tab-separated line from text, comments and metadata.

    Line breaks in text and comments are replaced by spaces and the result
    is passed through trim(). Columns are url, fingerprint, hostname, title,
    image, date, text, comments, license and pagetype; the document id is
    prepended as an extra column when it is not None. Values are formatted
    as-is, so a None value is written as the string "None".
    """
    def _single_line(value):
        # collapse the value onto one line before trimming
        return trim(' '.join(value.splitlines()))

    text = _single_line(text)
    if comments is not None:
        comments = _single_line(comments)

    columns = (
        docmeta.url,
        docmeta.fingerprint,
        docmeta.hostname,
        docmeta.title,
        docmeta.image,
        docmeta.date,
        text,
        comments,
        docmeta.license,
        docmeta.pagetype,
    )
    line = '\t'.join(f'{column}' for column in columns) + '\n'

    # without an id the line is complete
    if docmeta.id is None:
        return line
    # otherwise the id comes first
    return docmeta.id + '\t' + line


@lru_cache(maxsize=2**14) # sys.maxunicode = 1114111
def return_printables_and_spaces(char):
'Return a character if it belongs to certain classes'
Expand Down
35 changes: 35 additions & 0 deletions trafilatura/xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,12 @@
## This file is available from https://github.com/adbar/trafilatura
## under GNU GPL v3 license

import csv
import logging
import lzma

from html import unescape
from io import StringIO
from json import dumps as json_dumps
from pathlib import Path
from pickle import load as load_pickle
Expand Down Expand Up @@ -294,6 +296,39 @@ def xmltotxt(xmloutput, include_formatting):
return unescape(sanitize(''.join(returnlist)))


def xmltocsv(document, include_formatting, *, delim="\t", null="null"):
    """Serialize a document as a single CSV row and return it as a string.

    The body and, if present, the comments are converted to text with
    xmltotxt(). Columns are written in the order url, id, fingerprint,
    hostname, title, image, date, text, comments, license, pagetype.
    Any falsy value (None, empty string) is replaced by `null`.

    `delim` is the field delimiter handed to csv.writer; quoting is
    csv.QUOTE_MINIMAL.
    """
    # text versions of main content and comments
    body_text = xmltotxt(document.body, include_formatting)
    comments_text = (
        ""
        if document.commentsbody is None
        else xmltotxt(document.commentsbody, include_formatting)
    )

    # in-memory target for the CSV writer
    buffer = StringIO()
    writer = csv.writer(buffer, delimiter=delim, quoting=csv.QUOTE_MINIMAL)

    # column values in output order
    columns = (
        document.url,
        document.id,
        document.fingerprint,
        document.hostname,
        document.title,
        document.image,
        document.date,
        body_text,
        comments_text,
        document.license,
        document.pagetype,
    )
    row = [value if value else null for value in columns]

    writer.writerow(row)
    return buffer.getvalue()


def write_teitree(docmeta):
'''Bundle the extracted post and comments into a TEI tree'''
teidoc = Element('TEI', xmlns='http://www.tei-c.org/ns/1.0')
Expand Down

0 comments on commit 337267a

Please sign in to comment.