Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,5 @@ dbs/*.db
*.pyc
build/
dist/
reports/
.pytest_cache/
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
## Current (in progress)

- Upgrade to Quart-0.9.1 :warning: requires python-3.7 [#21](https://github.com/opendatateam/csvapi/pull/21)
- Parse hours, SIREN and SIRET as text [#42](https://github.com/opendatateam/csvapi/pull/42)

## 0.0.9 (2019-01-18)

Expand Down
7 changes: 4 additions & 3 deletions csvapi/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import cchardet as chardet

from csvapi.utils import get_db_info
from csvapi.type_tester import agate_tester

SNIFF_LIMIT = 4096

Expand All @@ -22,14 +23,14 @@ def detect_encoding(filepath):
def from_csv(filepath, encoding='utf-8', sniff_limit=SNIFF_LIMIT):
"""Try first w/ sniffing and then w/o sniffing if it fails"""
try:
return agate.Table.from_csv(filepath, sniff_limit=sniff_limit, encoding=encoding)
return agate.Table.from_csv(filepath, sniff_limit=sniff_limit, encoding=encoding, column_types=agate_tester())
except ValueError:
return agate.Table.from_csv(filepath, encoding=encoding)
return agate.Table.from_csv(filepath, encoding=encoding, column_types=agate_tester())


def from_excel(filepath):
import agateexcel # noqa
return agate.Table.from_xls(filepath)
return agate.Table.from_xls(filepath, column_types=agate_tester())


def to_sql(table, urlhash, storage):
Expand Down
70 changes: 70 additions & 0 deletions csvapi/type_tester.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import re

from agate.data_types.base import DataType
from agate.data_types.boolean import Boolean
from agate.data_types.date import Date
from agate.data_types.date_time import DateTime
from agate.data_types.number import Number
from agate.data_types.text import Text
from agate.data_types.time_delta import TimeDelta
from agate.exceptions import CastError
from agate.type_tester import TypeTester

from agatesql import table as agatesqltable

from sqlalchemy.types import VARCHAR

from stdnum.fr.siren import is_valid as is_valid_siren
from stdnum.fr.siret import is_valid as is_valid_siret


class Time(DataType):
    """Detect an hour:minute string and store it as text.

    Matches ``H:MM`` and ``HH:MM`` with hours 0-23 and minutes 00-59
    (examples: ``12:20``, ``9:50``, ``23:30``).  Values are deliberately
    cast through :class:`Text` so SQL storage keeps the original string.
    """

    # Compiled once at class-definition time instead of re-parsing the
    # pattern for every cell agate probes.
    _HOUR_MINUTE = re.compile(r"^(?:[01]\d|2[0-3]|\d):[0-5]\d$")

    def cast(self, d):
        """Return *d* as text if it looks like an hour, else raise CastError.

        ``None`` is treated as a null value (returns ``None``) so that a
        missing cell does not abort type inference: the previous code
        raised ``TypeError`` from ``re.match(None)``, which TypeTester
        does not catch.
        """
        if d is None:
            return None
        if isinstance(d, str) and self._HOUR_MINUTE.match(d):
            return Text().cast(d)
        raise CastError('Can not parse value "%s" as time.' % d)


class SirenSiret(DataType):
    """Detect a French SIREN (9 digits) or SIRET (14 digits) number.

    Validation is delegated to ``python-stdnum`` (checksum-aware), and
    valid values are cast through :class:`Text` so leading zeros are
    preserved when stored in SQL.
    """

    def __init__(self, **kwargs):
        # Forward kwargs (e.g. null_values) to DataType, matching the
        # sibling Time type instead of silently rejecting them.
        super(SirenSiret, self).__init__(**kwargs)

    def cast(self, d):
        """Return *d* as text if it is a valid SIREN/SIRET, else raise CastError.

        ``None`` is treated as a null value: stdnum's ``is_valid`` raises
        ``AttributeError`` on ``None`` (it only swallows ValidationError),
        which would previously crash the whole type-inference pass on a
        column containing a missing cell.
        """
        if d is None:
            return None
        if isinstance(d, str) and (is_valid_siret(d) or is_valid_siren(d)):
            return Text().cast(d)
        raise CastError('Can not parse value "%s" as a SIREN or SIRET.' % d)


# agatesql needs to know the SQL equivalent of a type.
# Tell agatesql how our custom types should be converted in SQL.
# Both custom types cast through agate Text, so VARCHAR is the natural
# column type; without these entries agatesql would raise a KeyError
# when exporting a table that inferred one of them.
#
# Reference:
# https://github.com/wireservice/agate-sql/blob/7466073d81289323851c21817ea33170e36ce2a5/agatesql/table.py#L21-L28
agatesqltable.SQL_TYPE_MAP[Time] = VARCHAR
agatesqltable.SQL_TYPE_MAP[SirenSiret] = VARCHAR


def agate_tester():
    """Build a TypeTester whose checker list includes our custom types.

    Overrides agate's default ordering so that SIREN/SIRET strings are
    claimed before Number (otherwise they would be parsed as integers and
    lose leading zeros) and hour strings before the date/time checkers.

    Original list here:
    https://github.com/wireservice/agate/blob/e3078dca8b3566e8408e65981f79918c2f36f9fe/agate/type_tester.py#L64-L71
    """
    checkers = [
        Boolean(),
        SirenSiret(),
        Number(),
        Time(),
        TimeDelta(),
        Date(),
        DateTime(),
        Text(),
    ]
    return TypeTester(types=checkers)
3 changes: 2 additions & 1 deletion requirements/install.pip
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,5 @@ validators==0.13.0
agate-excel==0.2.3
Quart==0.9.1
raven==6.10.0
cchardet==2.1.4
cchardet==2.1.4
python-stdnum==1.11
57 changes: 57 additions & 0 deletions tests/test_api.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
import uuid
from pathlib import Path

import pytest
Expand Down Expand Up @@ -49,6 +50,27 @@ def csv_col_mismatch():
'''


@pytest.fixture
def csv_hour():
return '''id<sep>hour
a<sep>12:30
b<sep>9:15
c<sep>09:45
'''


@pytest.fixture
def csv_siren_siret():
return """id<sep>siren<sep>siret
a<sep>130025265<sep>13002526500013
b<sep>522816651<sep>52281665100056
"""


def random_url():
return f"https://example.com/{uuid.uuid4()}.csv"


@pytest.fixture
@pytest.mark.asyncio
async def uploaded_csv(rmock, csv, client):
Expand Down Expand Up @@ -106,6 +128,41 @@ async def test_apify_col_mismatch(rmock, csv_col_mismatch, client):
assert jsonres['ok']


@pytest.mark.asyncio
async def test_apify_hour_format(rmock, csv_hour, client):
content = csv_hour.replace('<sep>', ';').encode('utf-8')
url = random_url()
rmock.get(url, content=content)
await client.get('/apify?url={}'.format(url))
res = await client.get('/api/{}'.format(get_hash(url)))
assert res.status_code == 200
jsonres = await res.json
assert jsonres['columns'] == ['rowid', 'id', 'hour']
assert jsonres['total'] == 3
assert jsonres['rows'] == [
[1, 'a', '12:30'],
[2, 'b', '9:15'],
[3, 'c', '09:45'],
]


@pytest.mark.asyncio
async def test_apify_siren_siret_format(rmock, csv_siren_siret, client):
content = csv_siren_siret.replace('<sep>', ';').encode('utf-8')
url = random_url()
rmock.get(url, content=content)
await client.get('/apify?url={}'.format(url))
res = await client.get('/api/{}'.format(get_hash(url)))
assert res.status_code == 200
jsonres = await res.json
assert jsonres['columns'] == ['rowid', 'id', 'siren', 'siret']
assert jsonres['total'] == 2
assert jsonres['rows'] == [
[1, 'a', '130025265', '13002526500013'],
[2, 'b', '522816651', '52281665100056'],
]


@pytest.mark.asyncio
@pytest.mark.parametrize('separator', [';', ',', '\t'])
@pytest.mark.parametrize('encoding', ['utf-8', 'iso-8859-15', 'iso-8859-1'])
Expand Down