Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,5 @@ dbs/*.db
*.pyc
build/
dist/
reports/
.pytest_cache/
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
## Current (in progress)

- Upgrade to Quart-0.9.1 :warning: requires python-3.7 [#21](https://github.com/opendatateam/csvapi/pull/21)
- Parse hours, SIREN and SIRET as text [#42](https://github.com/opendatateam/csvapi/pull/42)

## 0.0.9 (2019-01-18)

Expand Down
7 changes: 4 additions & 3 deletions csvapi/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import cchardet as chardet

from csvapi.utils import get_db_info
from csvapi.type_tester import agate_tester

SNIFF_LIMIT = 4096

Expand All @@ -22,14 +23,14 @@ def detect_encoding(filepath):
def from_csv(filepath, encoding='utf-8', sniff_limit=SNIFF_LIMIT):
"""Try first w/ sniffing and then w/o sniffing if it fails"""
try:
return agate.Table.from_csv(filepath, sniff_limit=sniff_limit, encoding=encoding)
return agate.Table.from_csv(filepath, sniff_limit=sniff_limit, encoding=encoding, column_types=agate_tester())
except ValueError:
return agate.Table.from_csv(filepath, encoding=encoding)
return agate.Table.from_csv(filepath, encoding=encoding, column_types=agate_tester())


def from_excel(filepath):
import agateexcel # noqa
return agate.Table.from_xls(filepath)
return agate.Table.from_xls(filepath, column_types=agate_tester())


def to_sql(table, urlhash, storage):
Expand Down
70 changes: 70 additions & 0 deletions csvapi/type_tester.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import re

from agate.data_types.base import DataType
from agate.data_types.boolean import Boolean
from agate.data_types.date import Date
from agate.data_types.date_time import DateTime
from agate.data_types.number import Number
from agate.data_types.text import Text
from agate.data_types.time_delta import TimeDelta
from agate.exceptions import CastError
from agate.type_tester import TypeTester

from agatesql import table as agatesqltable

from sqlalchemy.types import VARCHAR

from stdnum.fr.siren import is_valid as is_valid_siren
from stdnum.fr.siret import is_valid as is_valid_siret


class Time(DataType):
    """Detect an hour:minute string and store it as text.

    Matches ``H:MM`` and ``HH:MM`` with hours 0-23 and minutes 00-59
    (examples: ``12:20``, ``9:50``, ``23:30``).  Values are deliberately
    cast through :class:`Text` so SQL storage keeps the original string.
    """

    # Compiled once at class-definition time instead of re-parsing the
    # pattern for every cell agate probes.
    _HOUR_MINUTE = re.compile(r"^(?:[01]\d|2[0-3]|\d):[0-5]\d$")

    def cast(self, d):
        """Return *d* as text if it looks like an hour, else raise CastError.

        ``None`` is treated as a null value (returns ``None``) so that a
        missing cell does not abort type inference: the previous code
        raised ``TypeError`` from ``re.match(None)``, which TypeTester
        does not catch.
        """
        if d is None:
            return None
        if isinstance(d, str) and self._HOUR_MINUTE.match(d):
            return Text().cast(d)
        raise CastError('Can not parse value "%s" as time.' % d)


class SirenSiret(DataType):
    """Detect a French SIREN (9 digits) or SIRET (14 digits) number.

    Validation is delegated to ``python-stdnum`` (checksum-aware), and
    valid values are cast through :class:`Text` so leading zeros are
    preserved when stored in SQL.
    """

    def __init__(self, **kwargs):
        # Forward kwargs (e.g. null_values) to DataType, matching the
        # sibling Time type instead of silently rejecting them.
        super(SirenSiret, self).__init__(**kwargs)

    def cast(self, d):
        """Return *d* as text if it is a valid SIREN/SIRET, else raise CastError.

        ``None`` is treated as a null value: stdnum's ``is_valid`` raises
        ``AttributeError`` on ``None`` (it only swallows ValidationError),
        which would previously crash the whole type-inference pass on a
        column containing a missing cell.
        """
        if d is None:
            return None
        if isinstance(d, str) and (is_valid_siret(d) or is_valid_siren(d)):
            return Text().cast(d)
        raise CastError('Can not parse value "%s" as a SIREN or SIRET.' % d)


# agatesql needs to know the SQL equivalent of a type.
# Tell agatesql how our custom types should be converted in SQL.
# Both custom types cast through agate Text, so VARCHAR is the natural
# column type; without these entries agatesql would raise a KeyError
# when exporting a table that inferred one of them.
#
# Reference:
# https://github.com/wireservice/agate-sql/blob/7466073d81289323851c21817ea33170e36ce2a5/agatesql/table.py#L21-L28
agatesqltable.SQL_TYPE_MAP[Time] = VARCHAR
agatesqltable.SQL_TYPE_MAP[SirenSiret] = VARCHAR


def agate_tester():
    """Build a TypeTester whose checker list includes our custom types.

    Overrides agate's default ordering so that SIREN/SIRET strings are
    claimed before Number (otherwise they would be parsed as integers and
    lose leading zeros) and hour strings before the date/time checkers.

    Original list here:
    https://github.com/wireservice/agate/blob/e3078dca8b3566e8408e65981f79918c2f36f9fe/agate/type_tester.py#L64-L71
    """
    checkers = [
        Boolean(),
        SirenSiret(),
        Number(),
        Time(),
        TimeDelta(),
        Date(),
        DateTime(),
        Text(),
    ]
    return TypeTester(types=checkers)
3 changes: 2 additions & 1 deletion requirements/install.pip
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,5 @@ validators==0.13.0
agate-excel==0.2.3
Quart==0.9.1
raven==6.10.0
cchardet==2.1.4
cchardet==2.1.4
python-stdnum==1.11
57 changes: 57 additions & 0 deletions tests/test_api.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
import uuid
from pathlib import Path

import pytest
Expand Down Expand Up @@ -49,6 +50,27 @@ def csv_col_mismatch():
'''


@pytest.fixture
def csv_hour():
return '''id<sep>hour
a<sep>12:30
b<sep>9:15
c<sep>09:45
'''


@pytest.fixture
def csv_siren_siret():
return """id<sep>siren<sep>siret
a<sep>130025265<sep>13002526500013
b<sep>522816651<sep>52281665100056
"""


def random_url():
return f"https://example.com/{uuid.uuid4()}.csv"


@pytest.fixture
@pytest.mark.asyncio
async def uploaded_csv(rmock, csv, client):
Expand Down Expand Up @@ -106,6 +128,41 @@ async def test_apify_col_mismatch(rmock, csv_col_mismatch, client):
assert jsonres['ok']


@pytest.mark.asyncio
async def test_apify_hour_format(rmock, csv_hour, client):
content = csv_hour.replace('<sep>', ';').encode('utf-8')
url = random_url()
rmock.get(url, content=content)
await client.get('/apify?url={}'.format(url))
res = await client.get('/api/{}'.format(get_hash(url)))
assert res.status_code == 200
jsonres = await res.json
assert jsonres['columns'] == ['rowid', 'id', 'hour']
assert jsonres['total'] == 3
assert jsonres['rows'] == [
[1, 'a', '12:30'],
[2, 'b', '9:15'],
[3, 'c', '09:45'],
]


@pytest.mark.asyncio
async def test_apify_siren_siret_format(rmock, csv_siren_siret, client):
content = csv_siren_siret.replace('<sep>', ';').encode('utf-8')
url = random_url()
rmock.get(url, content=content)
await client.get('/apify?url={}'.format(url))
res = await client.get('/api/{}'.format(get_hash(url)))
assert res.status_code == 200
jsonres = await res.json
assert jsonres['columns'] == ['rowid', 'id', 'siren', 'siret']
assert jsonres['total'] == 2
assert jsonres['rows'] == [
[1, 'a', '130025265', '13002526500013'],
[2, 'b', '522816651', '52281665100056'],
]


@pytest.mark.asyncio
@pytest.mark.parametrize('separator', [';', ',', '\t'])
@pytest.mark.parametrize('encoding', ['utf-8', 'iso-8859-15', 'iso-8859-1'])
Expand Down