Allow to specify the quotechar in CSVFeedSpider
ahlen committed Sep 13, 2014
1 parent 5bcabfe commit 47b6dff
Showing 5 changed files with 47 additions and 8 deletions.
6 changes: 6 additions & 0 deletions docs/topics/spiders.rst
@@ -523,6 +523,11 @@ CSVFeedSpider
A string with the separator character for each field in the CSV file.
Defaults to ``','`` (comma).

.. attribute:: quotechar

A string with the enclosure character for each field in the CSV file.
Defaults to ``'"'`` (quotation mark).

.. attribute:: headers

A list of the rows contained in the CSV feed file which will be used to
@@ -550,6 +555,7 @@ Let's see an example similar to the previous one, but using a
allowed_domains = ['example.com']
start_urls = ['http://www.example.com/feed.csv']
delimiter = ';'
quotechar = "'"
headers = ['id', 'name', 'description']

def parse_row(self, response, row):
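The quotechar option documented above is handed straight to Python's csv module, which strips the enclosure character and keeps enclosed delimiters intact. A minimal standalone sketch of that behaviour (Python 3; not part of this commit, and the feed line below is made up):

import csv
from io import StringIO

# One illustrative line in the style of the docs example above:
# semicolon-delimited fields enclosed in single quotes,
# with one field containing a literal ';'.
line = "'1';'foo';'a description; with a semicolon'\n"

reader = csv.reader(StringIO(line), delimiter=';', quotechar="'")
print(next(reader))
# ['1', 'foo', 'a description; with a semicolon']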
5 changes: 3 additions & 2 deletions scrapy/contrib/spiders/feed.py
@@ -97,11 +97,12 @@ class CSVFeedSpider(Spider):
It receives a CSV file in a response; iterates through each of its rows,
and calls parse_row with a dict containing each field's data.
-You can set some options regarding the CSV file, such as the delimiter
+You can set some options regarding the CSV file, such as the delimiter, quotechar
and the file's headers.
"""

delimiter = None # When this is None, python's csv module's default delimiter is used
quotechar = None # When this is None, python's csv module's default quotechar is used
headers = None

def process_results(self, response, results):
@@ -123,7 +124,7 @@ def parse_rows(self, response):
process_results methods for pre and post-processing purposes.
"""

-for row in csviter(response, self.delimiter, self.headers):
+for row in csviter(response, self.delimiter, self.headers, self.quotechar):
ret = self.parse_row(response, row)
if isinstance(ret, (BaseItem, Request)):
ret = [ret]
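Since delimiter and quotechar default to None on the spider, a subclass that sets neither keeps the csv module defaults (',' and '"'). A hedged sketch of such a subclass against the scrapy.contrib API of this era (spider name, URL and item fields are illustrative, not from this commit):

from scrapy.item import Item, Field
from scrapy.contrib.spiders import CSVFeedSpider

class ProductItem(Item):
    id = Field()
    name = Field()
    price = Field()

class ProductsSpider(CSVFeedSpider):
    name = 'products'
    allowed_domains = ['example.com']
    start_urls = ['http://www.example.com/products.csv']
    # delimiter and quotechar left unset (None): csv module defaults apply.
    headers = ['id', 'name', 'price']

    def parse_row(self, response, row):
        # row is a dict keyed by the headers above, e.g. {'id': '1', ...}
        item = ProductItem()
        item['id'] = row['id']
        item['name'] = row['name']
        item['price'] = row['price']
        return item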
16 changes: 10 additions & 6 deletions scrapy/utils/iterators.py
@@ -35,28 +35,32 @@ def xmliter(obj, nodename):
yield Selector(text=nodetext, type='xml').xpath('//' + nodename)[0]


-def csviter(obj, delimiter=None, headers=None, encoding=None):
+def csviter(obj, delimiter=None, headers=None, encoding=None, quotechar=None):
""" Returns an iterator of dictionaries from the given csv object
obj can be:
- a Response object
- a unicode string
- a string encoded as utf-8
-delimiter is the character used to separate field on the given obj.
+delimiter is the character used to separate fields on the given obj.
headers is an iterable that when provided offers the keys
for the returned dictionaries, if not the first row is used.
quotechar is the character used to enclose fields on the given obj.
"""

encoding = obj.encoding if isinstance(obj, TextResponse) else encoding or 'utf-8'
def _getrow(csv_r):
return [str_to_unicode(field, encoding) for field in next(csv_r)]

lines = BytesIO(_body_or_str(obj, unicode=False))
-if delimiter:
-    csv_r = csv.reader(lines, delimiter=delimiter)
-else:
-    csv_r = csv.reader(lines)
+kwargs = {}
+if delimiter: kwargs["delimiter"] = delimiter
+if quotechar: kwargs["quotechar"] = quotechar
+csv_r = csv.reader(lines, **kwargs)

if not headers:
headers = _getrow(csv_r)
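Building the kwargs dict conditionally means an omitted option simply falls back to the csv module default instead of being passed as None. Used on its own, the extended csviter behaves roughly as below (a sketch assuming a checkout that includes this commit; the URL and body are made up):

from scrapy.http import TextResponse
from scrapy.utils.iterators import csviter

body = b"'id','name'\n'1','alpha'\n'2','beta'\n"
response = TextResponse(url='http://www.example.com/feed.csv', body=body)

# No headers argument, so the first row supplies the keys.
for row in csviter(response, quotechar="'"):
    print(row)
# prints one dict per data row (values are unicode), e.g.:
# {'id': '1', 'name': 'alpha'}
# {'id': '2', 'name': 'beta'}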
6 changes: 6 additions & 0 deletions tests/sample_data/feeds/feed-sample6.csv
@@ -0,0 +1,6 @@
'id','name','value'
1,'alpha','foobar'
2,'unicode','únícódé‽'
'3','multi','foo
bar'
4,'empty',
22 changes: 22 additions & 0 deletions tests/test_utils_iterators.py
@@ -159,6 +159,28 @@ def test_csviter_delimiter(self):
{u'id': u'3', u'name': u'multi', u'value': FOOBAR_NL},
{u'id': u'4', u'name': u'empty', u'value': u''}])

def test_csviter_quotechar(self):
body1 = get_testdata('feeds', 'feed-sample6.csv')
body2 = get_testdata('feeds', 'feed-sample6.csv').replace(",", '|')

response1 = TextResponse(url="http://example.com/", body=body1)
csv1 = csviter(response1, quotechar="'")

self.assertEqual([row for row in csv1],
[{u'id': u'1', u'name': u'alpha', u'value': u'foobar'},
{u'id': u'2', u'name': u'unicode', u'value': u'\xfan\xedc\xf3d\xe9\u203d'},
{u'id': u'3', u'name': u'multi', u'value': FOOBAR_NL},
{u'id': u'4', u'name': u'empty', u'value': u''}])

response2 = TextResponse(url="http://example.com/", body=body2)
csv2 = csviter(response2, delimiter="|", quotechar="'")

self.assertEqual([row for row in csv2],
[{u'id': u'1', u'name': u'alpha', u'value': u'foobar'},
{u'id': u'2', u'name': u'unicode', u'value': u'\xfan\xedc\xf3d\xe9\u203d'},
{u'id': u'3', u'name': u'multi', u'value': FOOBAR_NL},
{u'id': u'4', u'name': u'empty', u'value': u''}])

def test_csviter_delimiter_binary_response_assume_utf8_encoding(self):
body = get_testdata('feeds', 'feed-sample3.csv').replace(',', '\t')
response = Response(url="http://example.com/", body=body)
