Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

improve and standardize CSV output #496

Merged
merged 3 commits into from
Feb 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 34 additions & 13 deletions tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -159,25 +159,44 @@ def test_input():
assert testresult != 'A\u0308ffin' and testresult == 'Äffin'


def test_txttocsv():
    """Check the tab-separated row built by utils.txttocsv."""
    metadata = Document()
    # expected row for a Document on which no field has been set
    blank_row = utils.txttocsv('', '', metadata)
    assert blank_row == 'None\tNone\tNone\tNone\tNone\tNone\t\t\tNone\tNone\n'

    # fill in a handful of fields, the ID is expected as first column
    field_values = {
        'title': 'Test title',
        'url': 'https://example.org',
        'hostname': 'example.org',
        'id': '1',
        'license': 'CC BY-SA',
        'image': 'https://example.org/image.jpg',
        'pagetype': 'article',
    }
    for field_name, field_value in field_values.items():
        setattr(metadata, field_name, field_value)

    expected_row = '1\thttps://example.org\tNone\texample.org\tTest title\thttps://example.org/image.jpg\tNone\tTest text\tTest comment\tCC BY-SA\tarticle\n'
    filled_row = utils.txttocsv('Test text', 'Test comment', metadata)
    assert filled_row == expected_row
def test_xmltocsv():
    """Check the CSV row produced by xml.xmltocsv and by extract() in CSV mode."""
    doc = Document()
    doc.body = etree.fromstring('<xml/>')
    doc.commentsbody = etree.fromstring('<xml/>')
    # nothing set and empty bodies: all eleven columns carry the placeholder
    assert xml.xmltocsv(doc, False) == 'null\tnull\tnull\tnull\tnull\tnull\tnull\tnull\tnull\tnull\tnull\r\n'

    doc.title = 'Test title'
    doc.url = 'https://example.org'
    doc.hostname = 'example.org'
    doc.id = '1'
    doc.license = 'CC BY-SA'
    doc.image = 'https://example.org/image.jpg'
    doc.pagetype = 'article'
    text = 'Test text'
    comments = 'Test comment'
    doc.body = etree.fromstring(f'<p>{text}</p>')
    doc.commentsbody = etree.fromstring(f'<p>{comments}</p>')

    # column order: url, id, fingerprint, hostname, title, image, date,
    # text, comments, license, pagetype
    target = 'https://example.org\t1\tnull\texample.org\tTest title\thttps://example.org/image.jpg\tnull\tTest text\tTest comment\tCC BY-SA\tarticle\r\n'

    assert xml.xmltocsv(doc, False) == target

    mystring = '<html><body><p>ÄÄÄÄÄÄÄÄÄÄÄÄÄÄ</p></body></html>'
    assert extract(mystring, output_format='csv', config=ZERO_CONFIG) is not None
    # the row ends with the "null" placeholder and the CRLF written by
    # csv.writer; the former '\tNone\n' expectation contradicted this
    # assertion for the very same call and has been dropped
    assert extract(mystring, output_format='csv', include_comments=False, config=ZERO_CONFIG).endswith('\tnull\r\n')


def test_tojson():
    """Check that extract() yields a JSON string carrying the expected keys."""
    html_input = '<html><body><p>ÄÄÄÄÄÄÄÄÄÄÄÄÄÄ</p></body></html>'

    # default options
    json_output = extract(html_input, output_format='json', config=ZERO_CONFIG)
    assert json_output.endswith('}')
    assert '"fingerprint":' in json_output
    assert '"language":' in json_output

    # comments switched off
    without_comments = extract(
        html_input,
        output_format='json',
        include_comments=False,
        config=ZERO_CONFIG,
    )
    assert without_comments.endswith('}')


def test_python_output():
    """Check the dictionary returned by bare_extraction() with as_dict=True."""
    html_input = '<html><body><p>ÄÄÄÄÄÄÄÄÄÄÄÄÄÄ</p></body></html>'
    extracted = bare_extraction(html_input, config=ZERO_CONFIG, as_dict=True)
    # a plain dict with exactly 20 entries is expected
    assert isinstance(extracted, dict)
    assert len(extracted) == 20

Expand Down Expand Up @@ -1206,7 +1225,9 @@ def test_lang_detection():
test_extraction_options()
test_precision_recall()
test_baseline()
test_txttocsv()
test_xmltocsv()
test_tojson()
test_python_output()
test_external()
test_tei()
test_table_processing()
Expand Down
15 changes: 5 additions & 10 deletions trafilatura/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,10 @@
process_node, prune_unwanted_nodes, tree_cleaning)
from .metadata import Document, extract_metadata
from .settings import BASIC_CLEAN_XPATH, DEFAULT_CONFIG, TAG_CATALOG, use_config
from .utils import is_image_file, load_html, normalize_unicode, trim, txttocsv, FORMATTING_PROTECTED
from .xml import (build_json_output, build_tei_output, build_xml_output,
control_xml_output, remove_empty_elements, strip_double_tags,
xmltotxt)
from .utils import (is_image_file, load_html, normalize_unicode, trim,
FORMATTING_PROTECTED)
from .xml import (build_json_output, build_tei_output, build_xml_output, control_xml_output,
remove_empty_elements, strip_double_tags, xmltotxt, xmltocsv)
from .xpaths import (BODY_XPATH, COMMENTS_DISCARD_XPATH, COMMENTS_XPATH,
DISCARD_IMAGE_ELEMENTS, OVERALL_DISCARD_XPATH,
PAYWALL_DISCARD_XPATH, PRECISION_DISCARD_XPATH,
Expand Down Expand Up @@ -824,12 +824,7 @@ def determine_returnstring(document, output_format, include_formatting, tei_vali
returnstring = control_xml_output(output, output_format, tei_validation, document)
# CSV
elif output_format == 'csv':
posttext = xmltotxt(document.body, include_formatting)
if document.commentsbody is not None:
commentstext = xmltotxt(document.commentsbody, include_formatting)
else:
commentstext = ''
returnstring = txttocsv(posttext, commentstext, document)
returnstring = xmltocsv(document, include_formatting)
# JSON
elif output_format == 'json':
returnstring = build_json_output(document)
Expand Down
18 changes: 1 addition & 17 deletions trafilatura/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
## This file is available from https://github.com/adbar/trafilatura
## under GNU GPL v3 license

# import csv
import logging
import re
import warnings
Expand Down Expand Up @@ -34,6 +33,7 @@
# response types
from urllib3.response import HTTPResponse


LOGGER = logging.getLogger(__name__)

UNICODE_ALIASES = {'utf-8', 'utf_8'}
Expand Down Expand Up @@ -240,22 +240,6 @@ def load_html(htmlobject):
return tree


def txttocsv(text, comments, docmeta):
    '''Build one tab-separated row from the texts and the document metadata.

    Line breaks in text and comments are replaced by spaces and the result
    is passed through trim(). The columns are url, fingerprint, hostname,
    title, image, date, text, comments, license and pagetype; the document
    ID is prepended as an extra column if it is not None.'''
    flat_text = trim(' '.join(text.splitlines()))
    # comments may be None, in which case they are left untouched
    if comments is None:
        flat_comments = comments
    else:
        flat_comments = trim(' '.join(comments.splitlines()))
    columns = (
        docmeta.url,
        docmeta.fingerprint,
        docmeta.hostname,
        docmeta.title,
        docmeta.image,
        docmeta.date,
        flat_text,
        flat_comments,
        docmeta.license,
        docmeta.pagetype,
    )
    row = '\t'.join(f'{column}' for column in columns) + '\n'
    # without ID the row is complete
    if docmeta.id is None:
        return row
    return docmeta.id + '\t' + row


@lru_cache(maxsize=2**14) # sys.maxunicode = 1114111
def return_printables_and_spaces(char):
'Return a character if it belongs to certain classes'
Expand Down
35 changes: 35 additions & 0 deletions trafilatura/xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,12 @@
## This file is available from https://github.com/adbar/trafilatura
## under GNU GPL v3 license

import csv
import logging
import lzma

from html import unescape
from io import StringIO
from json import dumps as json_dumps
from pathlib import Path
from pickle import load as load_pickle
Expand Down Expand Up @@ -294,6 +296,39 @@ def xmltotxt(xmloutput, include_formatting):
return unescape(sanitize(''.join(returnlist)))


def xmltocsv(document, include_formatting, *, delim="\t", null="null"):
    """Serialize the document as a single delimited row and return it as string.

    The main text and the comments are converted with xmltotxt(); a missing
    comments body counts as empty text. The columns are url, id, fingerprint,
    hostname, title, image, date, text, comments, license and pagetype.
    Every falsy value (None, empty string) is replaced by the `null`
    placeholder. The row is written by csv.writer using `delim` as delimiter
    and minimal quoting.
    """
    # text conversion: body first, comments only if present
    body_text = xmltotxt(document.body, include_formatting)
    comments_text = ""
    if document.commentsbody is not None:
        comments_text = xmltotxt(document.commentsbody, include_formatting)

    # in-memory target for the CSV writer
    buffer = StringIO()
    writer = csv.writer(buffer, delimiter=delim, quoting=csv.QUOTE_MINIMAL)

    # fixed column order
    fields = (
        document.url,
        document.id,
        document.fingerprint,
        document.hostname,
        document.title,
        document.image,
        document.date,
        body_text,
        comments_text,
        document.license,
        document.pagetype,
    )
    row = [field if field else null for field in fields]

    writer.writerow(row)
    return buffer.getvalue()


def write_teitree(docmeta):
'''Bundle the extracted post and comments into a TEI tree'''
teidoc = Element('TEI', xmlns='http://www.tei-c.org/ns/1.0')
Expand Down
Loading