Allow to specify the quotechar in CSVFeedSpider
ahlen committed Sep 13, 2014
1 parent 5bcabfe commit 47b6dff
Showing 5 changed files with 47 additions and 8 deletions.
6 changes: 6 additions & 0 deletions docs/topics/spiders.rst
@@ -523,6 +523,11 @@ CSVFeedSpider
A string with the separator character for each field in the CSV file.
Defaults to ``','`` (comma).

.. attribute:: quotechar

A string with the enclosure character for each field in the CSV file.
Defaults to ``'"'`` (quotation mark).

.. attribute:: headers

A list of the rows contained in the CSV feed file which will be used to
@@ -550,6 +555,7 @@ Let's see an example similar to the previous one, but using a
allowed_domains = ['example.com']
start_urls = ['http://www.example.com/feed.csv']
delimiter = ';'
quotechar = "'"
headers = ['id', 'name', 'description']

def parse_row(self, response, row):
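The quotechar option documented above is handed straight to Python's csv module, which strips the enclosure character and keeps enclosed delimiters intact. A minimal standalone sketch of that behaviour (Python 3; not part of this commit, and the feed line below is made up):

import csv
from io import StringIO

# One illustrative line in the style of the docs example above:
# semicolon-delimited fields enclosed in single quotes,
# with one field containing a literal ';'.
line = "'1';'foo';'a description; with a semicolon'\n"

reader = csv.reader(StringIO(line), delimiter=';', quotechar="'")
print(next(reader))
# ['1', 'foo', 'a description; with a semicolon']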
5 changes: 3 additions & 2 deletions scrapy/contrib/spiders/feed.py
@@ -97,11 +97,12 @@ class CSVFeedSpider(Spider):
It receives a CSV file in a response; iterates through each of its rows,
and calls parse_row with a dict containing each field's data.
-You can set some options regarding the CSV file, such as the delimiter
+You can set some options regarding the CSV file, such as the delimiter, quotechar
and the file's headers.
"""

delimiter = None # When this is None, python's csv module's default delimiter is used
quotechar = None # When this is None, python's csv module's default quotechar is used
headers = None

def process_results(self, response, results):
@@ -123,7 +124,7 @@ def parse_rows(self, response):
process_results methods for pre and post-processing purposes.
"""

-for row in csviter(response, self.delimiter, self.headers):
+for row in csviter(response, self.delimiter, self.headers, self.quotechar):
ret = self.parse_row(response, row)
if isinstance(ret, (BaseItem, Request)):
ret = [ret]
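Since delimiter and quotechar default to None on the spider, a subclass that sets neither keeps the csv module defaults (',' and '"'). A hedged sketch of such a subclass against the scrapy.contrib API of this era (spider name, URL and item fields are illustrative, not from this commit):

from scrapy.item import Item, Field
from scrapy.contrib.spiders import CSVFeedSpider

class ProductItem(Item):
    id = Field()
    name = Field()
    price = Field()

class ProductsSpider(CSVFeedSpider):
    name = 'products'
    allowed_domains = ['example.com']
    start_urls = ['http://www.example.com/products.csv']
    # delimiter and quotechar left unset (None): csv module defaults apply.
    headers = ['id', 'name', 'price']

    def parse_row(self, response, row):
        # row is a dict keyed by the headers above, e.g. {'id': '1', ...}
        item = ProductItem()
        item['id'] = row['id']
        item['name'] = row['name']
        item['price'] = row['price']
        return item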
16 changes: 10 additions & 6 deletions scrapy/utils/iterators.py
@@ -35,28 +35,32 @@ def xmliter(obj, nodename):
yield Selector(text=nodetext, type='xml').xpath('//' + nodename)[0]


-def csviter(obj, delimiter=None, headers=None, encoding=None):
+def csviter(obj, delimiter=None, headers=None, encoding=None, quotechar=None):
""" Returns an iterator of dictionaries from the given csv object
obj can be:
- a Response object
- a unicode string
- a string encoded as utf-8
-delimiter is the character used to separate field on the given obj.
+delimiter is the character used to separate fields on the given obj.
headers is an iterable that when provided offers the keys
for the returned dictionaries, if not the first row is used.
quotechar is the character used to enclose fields on the given obj.
"""

encoding = obj.encoding if isinstance(obj, TextResponse) else encoding or 'utf-8'
def _getrow(csv_r):
return [str_to_unicode(field, encoding) for field in next(csv_r)]

lines = BytesIO(_body_or_str(obj, unicode=False))
-if delimiter:
-    csv_r = csv.reader(lines, delimiter=delimiter)
-else:
-    csv_r = csv.reader(lines)
+kwargs = {}
+if delimiter: kwargs["delimiter"] = delimiter
+if quotechar: kwargs["quotechar"] = quotechar
+csv_r = csv.reader(lines, **kwargs)

if not headers:
headers = _getrow(csv_r)
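Building the kwargs dict conditionally means an omitted option simply falls back to the csv module default instead of being passed as None. Used on its own, the extended csviter behaves roughly as below (a sketch assuming a checkout that includes this commit; the URL and body are made up):

from scrapy.http import TextResponse
from scrapy.utils.iterators import csviter

body = b"'id','name'\n'1','alpha'\n'2','beta'\n"
response = TextResponse(url='http://www.example.com/feed.csv', body=body)

# No headers argument, so the first row supplies the keys.
for row in csviter(response, quotechar="'"):
    print(row)
# prints one dict per data row (values are unicode), e.g.:
# {'id': '1', 'name': 'alpha'}
# {'id': '2', 'name': 'beta'}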
6 changes: 6 additions & 0 deletions tests/sample_data/feeds/feed-sample6.csv
@@ -0,0 +1,6 @@
'id','name','value'
1,'alpha','foobar'
2,'unicode','únícódé‽'
'3','multi','foo
bar'
4,'empty',
22 changes: 22 additions & 0 deletions tests/test_utils_iterators.py
@@ -159,6 +159,28 @@ def test_csviter_delimiter(self):
{u'id': u'3', u'name': u'multi', u'value': FOOBAR_NL},
{u'id': u'4', u'name': u'empty', u'value': u''}])

def test_csviter_quotechar(self):
body1 = get_testdata('feeds', 'feed-sample6.csv')
body2 = get_testdata('feeds', 'feed-sample6.csv').replace(",", '|')

response1 = TextResponse(url="http://example.com/", body=body1)
csv1 = csviter(response1, quotechar="'")

self.assertEqual([row for row in csv1],
[{u'id': u'1', u'name': u'alpha', u'value': u'foobar'},
{u'id': u'2', u'name': u'unicode', u'value': u'\xfan\xedc\xf3d\xe9\u203d'},
{u'id': u'3', u'name': u'multi', u'value': FOOBAR_NL},
{u'id': u'4', u'name': u'empty', u'value': u''}])

response2 = TextResponse(url="http://example.com/", body=body2)
csv2 = csviter(response2, delimiter="|", quotechar="'")

self.assertEqual([row for row in csv2],
[{u'id': u'1', u'name': u'alpha', u'value': u'foobar'},
{u'id': u'2', u'name': u'unicode', u'value': u'\xfan\xedc\xf3d\xe9\u203d'},
{u'id': u'3', u'name': u'multi', u'value': FOOBAR_NL},
{u'id': u'4', u'name': u'empty', u'value': u''}])

def test_csviter_delimiter_binary_response_assume_utf8_encoding(self):
body = get_testdata('feeds', 'feed-sample3.csv').replace(',', '\t')
response = Response(url="http://example.com/", body=body)
