diff --git a/docs/topics/spiders.rst b/docs/topics/spiders.rst index 188b51836b2..cb3f6caebd5 100644 --- a/docs/topics/spiders.rst +++ b/docs/topics/spiders.rst @@ -523,6 +523,11 @@ CSVFeedSpider A string with the separator character for each field in the CSV file Defaults to ``','`` (comma). + .. attribute:: quotechar + + A string with the enclosure character for each field in the CSV file + Defaults to ``'"'`` (quotation mark). + .. attribute:: headers A list of the rows contained in the file CSV feed which will be used to @@ -550,6 +555,7 @@ Let's see an example similar to the previous one, but using a allowed_domains = ['example.com'] start_urls = ['http://www.example.com/feed.csv'] delimiter = ';' + quotechar = "'" headers = ['id', 'name', 'description'] def parse_row(self, response, row): diff --git a/scrapy/contrib/spiders/feed.py b/scrapy/contrib/spiders/feed.py index fa538f4739d..1a95c5c3505 100644 --- a/scrapy/contrib/spiders/feed.py +++ b/scrapy/contrib/spiders/feed.py @@ -97,11 +97,12 @@ class CSVFeedSpider(Spider): It receives a CSV file in a response; iterates through each of its rows, and calls parse_row with a dict containing each field's data. - You can set some options regarding the CSV file, such as the delimiter + You can set some options regarding the CSV file, such as the delimiter, quotechar and the file's headers. """ delimiter = None # When this is None, python's csv module's default delimiter is used + quotechar = None # When this is None, python's csv module's default quotechar is used headers = None def process_results(self, response, results): @@ -123,7 +124,7 @@ def parse_rows(self, response): process_results methods for pre and post-processing purposes. """ - for row in csviter(response, self.delimiter, self.headers): + for row in csviter(response, self.delimiter, self.headers, self.quotechar): ret = self.parse_row(response, row) if isinstance(ret, (BaseItem, Request)): ret = [ret] diff --git a/scrapy/utils/iterators.py b/scrapy/utils/iterators.py index 150b077aef3..78ea7114ead 100644 --- a/scrapy/utils/iterators.py +++ b/scrapy/utils/iterators.py @@ -35,7 +35,7 @@ def xmliter(obj, nodename): yield Selector(text=nodetext, type='xml').xpath('//' + nodename)[0] -def csviter(obj, delimiter=None, headers=None, encoding=None): +def csviter(obj, delimiter=None, headers=None, encoding=None, quotechar=None): """ Returns an iterator of dictionaries from the given csv object obj can be: @@ -43,20 +43,24 @@ def csviter(obj, delimiter=None, headers=None, encoding=None): - a unicode string - a string encoded as utf-8 - delimiter is the character used to separate field on the given obj. + delimiter is the character used to separate fields on the given obj. headers is an iterable that when provided offers the keys for the returned dictionaries, if not the first row is used. + + quotechar is the character used to enclosure fields on the given obj. """ + encoding = obj.encoding if isinstance(obj, TextResponse) else encoding or 'utf-8' def _getrow(csv_r): return [str_to_unicode(field, encoding) for field in next(csv_r)] lines = BytesIO(_body_or_str(obj, unicode=False)) - if delimiter: - csv_r = csv.reader(lines, delimiter=delimiter) - else: - csv_r = csv.reader(lines) + + kwargs = {} + if delimiter: kwargs["delimiter"] = delimiter + if quotechar: kwargs["quotechar"] = quotechar + csv_r = csv.reader(lines, **kwargs) if not headers: headers = _getrow(csv_r) diff --git a/tests/sample_data/feeds/feed-sample6.csv b/tests/sample_data/feeds/feed-sample6.csv new file mode 100644 index 00000000000..a2604653e4e --- /dev/null +++ b/tests/sample_data/feeds/feed-sample6.csv @@ -0,0 +1,6 @@ +'id','name','value' +1,'alpha','foobar' +2,'unicode','únícódé‽' +'3','multi','foo +bar' +4,'empty', diff --git a/tests/test_utils_iterators.py b/tests/test_utils_iterators.py index fe53f831f33..544941de102 100644 --- a/tests/test_utils_iterators.py +++ b/tests/test_utils_iterators.py @@ -159,6 +159,28 @@ def test_csviter_delimiter(self): {u'id': u'3', u'name': u'multi', u'value': FOOBAR_NL}, {u'id': u'4', u'name': u'empty', u'value': u''}]) + def test_csviter_quotechar(self): + body1 = get_testdata('feeds', 'feed-sample6.csv') + body2 = get_testdata('feeds', 'feed-sample6.csv').replace(",", '|') + + response1 = TextResponse(url="http://example.com/", body=body1) + csv1 = csviter(response1, quotechar="'") + + self.assertEqual([row for row in csv1], + [{u'id': u'1', u'name': u'alpha', u'value': u'foobar'}, + {u'id': u'2', u'name': u'unicode', u'value': u'\xfan\xedc\xf3d\xe9\u203d'}, + {u'id': u'3', u'name': u'multi', u'value': FOOBAR_NL}, + {u'id': u'4', u'name': u'empty', u'value': u''}]) + + response2 = TextResponse(url="http://example.com/", body=body2) + csv2 = csviter(response2, delimiter="|", quotechar="'") + + self.assertEqual([row for row in csv2], + [{u'id': u'1', u'name': u'alpha', u'value': u'foobar'}, + {u'id': u'2', u'name': u'unicode', u'value': u'\xfan\xedc\xf3d\xe9\u203d'}, + {u'id': u'3', u'name': u'multi', u'value': FOOBAR_NL}, + {u'id': u'4', u'name': u'empty', u'value': u''}]) + def test_csviter_delimiter_binary_response_assume_utf8_encoding(self): body = get_testdata('feeds', 'feed-sample3.csv').replace(',', '\t') response = Response(url="http://example.com/", body=body)