improve and standardize CSV output (#496)

* improve and standardize CSV output
* add xmltocsv function and process like other formats
* scrap txttocsv
adbar · Feb 6, 2024 · 337267a · 337267a
1 parent 67ff3a7
commit 337267a
Show file tree

Hide file tree

Showing 4 changed files with 75 additions and 40 deletions.
diff --git a/tests/unit_tests.py b/tests/unit_tests.py
@@ -159,25 +159,44 @@ def test_input():
     assert testresult != 'A\u0308ffin' and testresult == 'Äffin'
 
 
-def test_txttocsv():
-    mymeta = Document()
-    assert utils.txttocsv('', '', mymeta) == 'None\tNone\tNone\tNone\tNone\tNone\t\t\tNone\tNone\n'
-    mymeta.title = 'Test title'
-    mymeta.url = 'https://example.org'
-    mymeta.hostname = 'example.org'
-    mymeta.id = '1'
-    mymeta.license = 'CC BY-SA'
-    mymeta.image = 'https://example.org/image.jpg'
-    mymeta.pagetype = 'article'
-    assert utils.txttocsv('Test text', 'Test comment', mymeta) == '1\thttps://example.org\tNone\texample.org\tTest title\thttps://example.org/image.jpg\tNone\tTest text\tTest comment\tCC BY-SA\tarticle\n'
+def test_xmltocsv():
+    doc = Document()  # start from a document without any metadata set
+    doc.body = etree.fromstring('<xml/>')
+    doc.commentsbody = etree.fromstring('<xml/>')
+    assert xml.xmltocsv(doc, False) == 'null\tnull\tnull\tnull\tnull\tnull\tnull\tnull\tnull\tnull\tnull\r\n'
+    # second case: populate metadata and give body and comments some actual text
+    doc.title = 'Test title'
+    doc.url = 'https://example.org'
+    doc.hostname = 'example.org'
+    doc.id = '1'
+    doc.license = 'CC BY-SA'
+    doc.image = 'https://example.org/image.jpg'
+    doc.pagetype = 'article'
+    text = 'Test text'
+    comments = 'Test comment'
+    doc.body = etree.fromstring(f'<p>{text}</p>')
+    doc.commentsbody = etree.fromstring(f'<p>{comments}</p>')
+    # columns: url, id, fingerprint, hostname, title, image, date, text, comments, license, pagetype
+    target = 'https://example.org\t1\tnull\texample.org\tTest title\thttps://example.org/image.jpg\tnull\tTest text\tTest comment\tCC BY-SA\tarticle\r\n'
+    # fingerprint and date are left unset in this test, hence the two null fields in target
+    assert xml.xmltocsv(doc, False) == target
+    # the public extract() entry point is exercised with output_format='csv' as well
     mystring = '<html><body><p>ÄÄÄÄÄÄÄÄÄÄÄÄÄÄ</p></body></html>'
     assert extract(mystring, output_format='csv', config=ZERO_CONFIG) is not None
-    assert extract(mystring, output_format='csv', include_comments=False, config=ZERO_CONFIG).endswith('\tNone\n')
+    assert extract(mystring, output_format='csv', include_comments=False, config=ZERO_CONFIG).endswith('\tnull\r\n')
+
+
+def test_tojson():
     # test json
+    mystring = '<html><body><p>ÄÄÄÄÄÄÄÄÄÄÄÄÄÄ</p></body></html>'  # minimal HTML input: one paragraph of non-ASCII characters
     result = extract(mystring, output_format='json', config=ZERO_CONFIG)
     assert result.endswith('}') and '"fingerprint":' in result and '"language":' in result
     assert extract(mystring, output_format='json', include_comments=False, config=ZERO_CONFIG).endswith('}')
+
+
+def test_python_output():
     # bare extraction for python
+    mystring = '<html><body><p>ÄÄÄÄÄÄÄÄÄÄÄÄÄÄ</p></body></html>'  # minimal HTML input: one paragraph of non-ASCII characters
     result = bare_extraction(mystring, config=ZERO_CONFIG, as_dict=True)
     assert isinstance(result, dict) and len(result) == 20
 
@@ -1206,7 +1225,9 @@ def test_lang_detection():
     test_extraction_options()
     test_precision_recall()
     test_baseline()
-    test_txttocsv()
+    test_xmltocsv()
+    test_tojson()
+    test_python_output()
     test_external()
     test_tei()
     test_table_processing()

diff --git a/trafilatura/core.py b/trafilatura/core.py
@@ -27,10 +27,10 @@
                              process_node, prune_unwanted_nodes, tree_cleaning)
 from .metadata import Document, extract_metadata
 from .settings import BASIC_CLEAN_XPATH, DEFAULT_CONFIG, TAG_CATALOG, use_config
-from .utils import is_image_file, load_html, normalize_unicode, trim, txttocsv, FORMATTING_PROTECTED
-from .xml import (build_json_output, build_tei_output, build_xml_output,
-                  control_xml_output, remove_empty_elements, strip_double_tags,
-                  xmltotxt)
+from .utils import (is_image_file, load_html, normalize_unicode, trim,
+                    FORMATTING_PROTECTED)
+from .xml import (build_json_output, build_tei_output, build_xml_output, control_xml_output,
+                  remove_empty_elements, strip_double_tags, xmltotxt, xmltocsv)
 from .xpaths import (BODY_XPATH, COMMENTS_DISCARD_XPATH, COMMENTS_XPATH,
                      DISCARD_IMAGE_ELEMENTS, OVERALL_DISCARD_XPATH,
                      PAYWALL_DISCARD_XPATH, PRECISION_DISCARD_XPATH,
@@ -824,12 +824,7 @@ def determine_returnstring(document, output_format, include_formatting, tei_vali
         returnstring = control_xml_output(output, output_format, tei_validation, document)
     # CSV
     elif output_format == 'csv':
-        posttext = xmltotxt(document.body, include_formatting)
-        if document.commentsbody is not None:
-            commentstext = xmltotxt(document.commentsbody, include_formatting)
-        else:
-            commentstext = ''
-        returnstring = txttocsv(posttext, commentstext, document)
+        returnstring = xmltocsv(document, include_formatting)
     # JSON
     elif output_format == 'json':
         returnstring = build_json_output(document)

diff --git a/trafilatura/utils.py b/trafilatura/utils.py
@@ -6,7 +6,6 @@
 ## This file is available from https://github.com/adbar/trafilatura
 ## under GNU GPL v3 license
 
-# import csv
 import logging
 import re
 import warnings
@@ -34,6 +33,7 @@
 # response types
 from urllib3.response import HTTPResponse
 
+
 LOGGER = logging.getLogger(__name__)
 
 UNICODE_ALIASES = {'utf-8', 'utf_8'}
@@ -240,22 +240,6 @@ def load_html(htmlobject):
     return tree
 
 
-def txttocsv(text, comments, docmeta):
-    '''Output the result in CSV format (tab-separated values)'''
-    # outputwriter = csv.writer(sys.stdout, delimiter='\t', quoting=csv.QUOTE_NONE)
-    # outputwriter.writerow()
-    # with newlines: '\\n'.join()
-    text = trim(' '.join(text.splitlines()))
-    if comments is not None:
-        comments = trim(' '.join(comments.splitlines()))
-    tsv_output = \
-        f'{docmeta.url}\t{docmeta.fingerprint}\t{docmeta.hostname}\t{docmeta.title}\t{docmeta.image}\t{docmeta.date}\t{text}\t{comments}\t{docmeta.license}\t{docmeta.pagetype}\n'
-    # add id up front if provided
-    if docmeta.id is not None:
-        tsv_output = docmeta.id + '\t' + tsv_output
-    return tsv_output
-
-
 @lru_cache(maxsize=2**14)  # sys.maxunicode = 1114111
 def return_printables_and_spaces(char):
     'Return a character if it belongs to certain classes'

diff --git a/trafilatura/xml.py b/trafilatura/xml.py
@@ -6,10 +6,12 @@
 ## This file is available from https://github.com/adbar/trafilatura
 ## under GNU GPL v3 license
 
+import csv
 import logging
 import lzma
 
 from html import unescape
+from io import StringIO
 from json import dumps as json_dumps
 from pathlib import Path
 from pickle import load as load_pickle
@@ -294,6 +296,39 @@ def xmltotxt(xmloutput, include_formatting):
     return unescape(sanitize(''.join(returnlist)))
 
 
+def xmltocsv(document, include_formatting, *, delim="\t", null="null"):
+    "Convert the internal XML document representation to a CSV string."
+    # flatten the body tree, and the comments tree if there is one, to plain text
+    posttext = xmltotxt(document.body, include_formatting)
+    if document.commentsbody is not None:
+        commentstext = xmltotxt(document.commentsbody, include_formatting)
+    else:
+        commentstext = ""  # falsy, so it is emitted as the null marker below
+
+    # write the row into an in-memory buffer so it can be returned as a string
+    output = StringIO()
+    outputwriter = csv.writer(output, delimiter=delim, quoting=csv.QUOTE_MINIMAL)
+
+    # the tuple order is the column order; any falsy value (None, "") becomes `null`
+    data = [d or null for d in (
+                document.url,
+                document.id,
+                document.fingerprint,
+                document.hostname,
+                document.title,
+                document.image,
+                document.date,
+                posttext,
+                commentstext,
+                document.license,
+                document.pagetype,
+                )
+            ]
+
+    outputwriter.writerow(data)  # single row, no header; ends with the csv module's default "\r\n"
+    return output.getvalue()
+
+
 def write_teitree(docmeta):
     '''Bundle the extracted post and comments into a TEI tree'''
     teidoc = Element('TEI', xmlns='http://www.tei-c.org/ns/1.0')