CLN: merge with master

pandas-dev · nateGeorge · Jul 5, 2016 · Jul 6, 2016 · Jul 6, 2016 · Jul 6, 2016
commit 3c30cd084a82a05e4ff8a38a8a7202d8fd97154f
diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt
@@ -233,11 +233,52 @@ New behaviour:
    In [2]: pd.read_csv(StringIO(data), names=names)
 
 
+
 New behaviour:
 
 .. ipython :: python
 
    In [2]: pd.read_csv(StringIO(data), names=names)
+
+.. _whatsnew_0190.enhancements.read_csv_categorical:
+
+:func:`read_csv` supports parsing ``Categorical`` directly
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The :func:`read_csv` function now supports parsing a ``Categorical`` column when
+specified as a dtype (:issue:`10153`).  Depending on the structure of the data,
+this can result in a faster parse time and lower memory usage compared to
+converting to ``Categorical`` after parsing.  See the io :ref:`docs here <io.categorical>`.
+
+.. ipython:: python
+
+   data = 'col1,col2,col3\na,b,1\na,b,2\nc,d,3'
+
+   pd.read_csv(StringIO(data))
+   pd.read_csv(StringIO(data)).dtypes
+   pd.read_csv(StringIO(data), dtype='category').dtypes
+
+Individual columns can be parsed as a ``Categorical`` using a dict specification:
+
+.. ipython:: python
+
+   pd.read_csv(StringIO(data), dtype={'col1': 'category'}).dtypes
+
+.. note::
+
+   The resulting categories will always be parsed as strings (object dtype).
+   If the categories are numeric they can be converted using the
+   :func:`to_numeric` function, or as appropriate, another converter
+   such as :func:`to_datetime`.
+
+   .. ipython:: python
+
+      df = pd.read_csv(StringIO(data), dtype='category')
+      df.dtypes
+      df['col3']
+      df['col3'].cat.categories = pd.to_numeric(df['col3'].cat.categories)
+      df['col3']
+
 
 .. _whatsnew_0190.enhancements.semi_month_offsets:
 
@@ -968,5 +1009,20 @@ Bug Fixes
 
 - Bug in the CSS classes assigned to ``DataFrame.style`` for index names. Previously they were assigned ``"col_heading level<n> col<c>"`` where ``n`` was the number of levels + 1. Now they are assigned ``"index_name level<n>"``, where ``n`` is the correct level for that MultiIndex.
 - Bug where ``pd.read_gbq()`` could throw ``ImportError: No module named discovery`` as a result of a naming conflict with another python package called apiclient  (:issue:`13454`)
+- Bug in ``Index.union`` returns an incorrect result with a named empty index (:issue:`13432`)
+- Bugs in ``Index.difference`` and ``DataFrame.join`` raise in Python3 when using mixed-integer indexes (:issue:`13432`, :issue:`12814`)
+- Bug in ``.to_excel()`` when DataFrame contains a MultiIndex which contains a label with a NaN value (:issue:`13511`)
+- Bug where invalid frequency offset strings like "D1", "-2-3H" may not raise ``ValueError`` (:issue:`13930`)
+- Bug in ``concat`` and ``groupby`` for hierarchical frames with ``RangeIndex`` levels (:issue:`13542`).
+
+- Bug in ``agg()`` function on groupby dataframe changes dtype of ``datetime64[ns]`` column to ``float64`` (:issue:`12821`)
+
+- Bug in operations on ``NaT`` returning ``float`` instead of ``datetime64[ns]`` (:issue:`12941`)
+
+- Bug in ``pd.read_csv`` in Python 2.x with non-UTF8 encoded, multi-character separated data (:issue:`3404`)
 
+- Bug where ``Index`` raises a ``KeyError`` displaying the incorrect column when the column is not in the DataFrame and the columns contain duplicate values (:issue:`13822`)
+- Bug in ``Period`` and ``PeriodIndex`` creating wrong dates when frequency has combined offset aliases (:issue:`13874`)
+- Bug in ``pd.to_datetime()`` did not cast floats correctly when ``unit`` was specified, resulting in truncated datetime (:issue:`13845`)
+- Bug in ``.to_string()`` when called with an integer ``line_width`` and ``index=False`` raises an ``UnboundLocalError`` exception because ``idx`` is referenced before assignment.
 - Bug in ``pd.read_csv()``, where aliases for utf-xx (e.g. UTF-xx, UTF_xx, utf_xx) raised ``UnicodeDecodeError`` (:issue:`13549`)
diff --git a/pandas/io/tests/parser/common.py b/pandas/io/tests/parser/common.py
@@ -1513,3 +1513,85 @@ def test_read_csv_utf_aliases(self):
                 data = 'mb_num,multibyte\n4.8,test'.encode(encoding)
                 result = self.read_csv(BytesIO(data), encoding=encoding)
                 tm.assert_frame_equal(result, expected)
+
+    def test_null_byte_char(self):
+        # see gh-2741
+        data = '\x00,foo'
+        cols = ['a', 'b']
+
+        expected = DataFrame([[np.nan, 'foo']],
+                             columns=cols)
+
+        if self.engine == 'c':
+            out = self.read_csv(StringIO(data), names=cols)
+            tm.assert_frame_equal(out, expected)
+        else:
+            msg = "NULL byte detected"
+            with tm.assertRaisesRegexp(csv.Error, msg):
+                self.read_csv(StringIO(data), names=cols)
+
+    def test_utf8_bom(self):
+        # see gh-4793
+        bom = u('\ufeff')
+        utf8 = 'utf-8'
+
+        def _encode_data_with_bom(_data):
+            bom_data = (bom + _data).encode(utf8)
+            return BytesIO(bom_data)
+
+        # basic test
+        data = 'a\n1'
+        expected = DataFrame({'a': [1]})
+
+        out = self.read_csv(_encode_data_with_bom(data),
+                            encoding=utf8)
+        tm.assert_frame_equal(out, expected)
+
+        # test with "regular" quoting
+        data = '"a"\n1'
+        expected = DataFrame({'a': [1]})
+
+        out = self.read_csv(_encode_data_with_bom(data),
+                            encoding=utf8, quotechar='"')
+        tm.assert_frame_equal(out, expected)
+
+        # test in a data row instead of header
+        data = 'b\n1'
+        expected = DataFrame({'a': ['b', '1']})
+
+        out = self.read_csv(_encode_data_with_bom(data),
+                            encoding=utf8, names=['a'])
+        tm.assert_frame_equal(out, expected)
+
+        # test in empty data row with skipping
+        data = '\n1'
+        expected = DataFrame({'a': [1]})
+
+        out = self.read_csv(_encode_data_with_bom(data),
+                            encoding=utf8, names=['a'],
+                            skip_blank_lines=True)
+        tm.assert_frame_equal(out, expected)
+
+        # test in empty data row without skipping
+        data = '\n1'
+        expected = DataFrame({'a': [np.nan, 1.0]})
+
+        out = self.read_csv(_encode_data_with_bom(data),
+                            encoding=utf8, names=['a'],
+                            skip_blank_lines=False)
+        tm.assert_frame_equal(out, expected)
+
+    def test_temporary_file(self):
+        # see gh-13398
+        data1 = "0 0"
+
+        from tempfile import TemporaryFile
+        new_file = TemporaryFile("w+")
+        new_file.write(data1)
+        new_file.flush()
+        new_file.seek(0)
+
+        result = self.read_csv(new_file, sep='\s+', header=None)
+        new_file.close()
+        expected = DataFrame([[0, 0]])
+        tm.assert_frame_equal(result, expected)