pandas-dev
diff --git a/‎RELEASE.rst‎
Lines changed: 1 addition & 0 deletions b/‎RELEASE.rst‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎pandas/core/frame.py‎
Lines changed: 17 additions & 102 deletions b/‎pandas/core/frame.py‎
Lines changed: 17 additions & 102 deletions
diff --git a/‎pandas/core/index.py‎
Lines changed: 73 additions & 11 deletions b/‎pandas/core/index.py‎
Lines changed: 73 additions & 11 deletions
diff --git a/‎pandas/core/panel.py‎
Lines changed: 8 additions & 41 deletions b/‎pandas/core/panel.py‎
Lines changed: 8 additions & 41 deletions
diff --git a/‎pandas/io/parsers.py‎
Lines changed: 1 addition & 1 deletion b/‎pandas/io/parsers.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎pandas/sparse/panel.py‎
Lines changed: 5 additions & 3 deletions b/‎pandas/sparse/panel.py‎
Lines changed: 5 additions & 3 deletions
@@ -89,6 +89,7 @@ pandas 0.7.0
     5-10x in most typical use cases (GH #374)
   - Some performance enhancements in constructing a Panel from a dict of
     DataFrame objects
+  - Made ``Index.get_duplicates`` public (``_get_duplicates`` kept as an alias)
 
 **Bug fixes**
 
 
@@ -23,7 +23,7 @@
 import numpy.ma as ma
 
 from pandas.core.common import (isnull, notnull, PandasError, _try_sort,
-                                _default_index, _stringify, _maybe_upcast)
+                                _default_index, _stringify)
 from pandas.core.daterange import DateRange
 from pandas.core.generic import NDFrame
 from pandas.core.index import Index, MultiIndex, NULL_INDEX, _ensure_index
@@ -1638,7 +1638,8 @@ def reindex_like(self, other, method=None, copy=True):
 
     truncate = generic.truncate
 
-    def set_index(self, col_or_cols, drop=True, inplace=False):
+    def set_index(self, col_or_cols, drop=True, inplace=False,
+                  verify_integrity=True):
         """
         Set the DataFrame index (row labels) using one or more existing
         columns. By default yields a new object.
@@ -1650,6 +1651,10 @@ def set_index(self, col_or_cols, drop=True, inplace=False):
             Delete columns to be used as the new index
         inplace : boolean, default False
             Modify the DataFrame in place (do not create a new object)
+        verify_integrity : boolean, default True
+            Check the new index for duplicates. Otherwise defer the check until
+            necessary. Setting to False will improve the performance of this
+            method
 
         Returns
         -------
@@ -1674,8 +1679,8 @@ def set_index(self, col_or_cols, drop=True, inplace=False):
 
         index = MultiIndex.from_arrays(arrays, names=cols)
 
-        if not index._verify_integrity():
-            duplicates = index._get_duplicates()
+        if verify_integrity and not index._verify_integrity():
+            duplicates = index.get_duplicates()
             raise Exception('Index has duplicate keys: %s' % duplicates)
 
         # clear up memory usage
@@ -2738,60 +2743,13 @@ def append(self, other, ignore_index=False):
         if not self:
             return other.copy()
 
-        if ignore_index:
-            new_index = None
+        from pandas.tools.merge import concat
+        if isinstance(other, list):
+            to_concat = [self] + other
         else:
-            new_index = self.index.append(other.index)
-            assert(new_index._verify_integrity())
-
-        if self.columns.equals(other.columns):
-            return self._append_same_columns(other, new_index)
-        else:
-            return self._append_different_columns(other, new_index)
-
-    def _append_different_columns(self, other, new_index):
-        indexer = self.columns.get_indexer(other.columns)
-
-        if not (indexer == -1).any():
-            new_columns = self.columns
-        else:
-            new_columns = self.columns.union(other.columns)
-
-        new_data = self._append_column_by_column(other)
-        return self._constructor(data=new_data, index=new_index,
-                                 columns=new_columns)
-
-    def _append_same_columns(self, other, new_index):
-        if self._is_mixed_type:
-            new_data = self._append_column_by_column(other)
-        else:
-            new_data = np.concatenate((self.values, other.values), axis=0)
-        return self._constructor(new_data, index=new_index,
-                                 columns=self.columns)
-
-    def _append_column_by_column(self, other):
-        def _concat_missing(values, n):
-            values = _maybe_upcast(values)
-            missing_values = np.empty(n, dtype=values.dtype)
-            missing_values.fill(np.nan)
-            return values, missing_values
-
-        new_data = {}
-        for col in self:
-            values = self._get_raw_column(col)
-            if col in other:
-                other_values = other._get_raw_column(col)
-            else:
-                values, other_values = _concat_missing(values, len(other))
-            new_data[col] = np.concatenate((values, other_values))
-
-        for col in other:
-            values = other._get_raw_column(col)
-            if col not in self:
-                values, missing_values = _concat_missing(values, len(self))
-                new_data[col] = np.concatenate((missing_values, values))
-
-        return new_data
+            to_concat = [self, other]
+        return concat(to_concat, ignore_index=ignore_index,
+                      verify_integrity=True)
 
     def _get_raw_column(self, col):
         return self._data.get(col)
@@ -3618,6 +3576,8 @@ def factor_agg(factor, vec, func):
 
 
 def extract_index(data):
+    from pandas.core.index import _union_indexes
+
     index = None
     if len(data) == 0:
         index = NULL_INDEX
@@ -3663,51 +3623,6 @@ def extract_index(data):
     return _ensure_index(index)
 
 
-def _union_indexes(indexes):
-    if len(indexes) == 0:
-        return Index([])
-
-    if len(indexes) == 1:
-        result = indexes[0]
-        if isinstance(result, list):
-            result = Index(sorted(result))
-        return result
-
-    indexes, kind = _sanitize_and_check(indexes)
-
-    if kind == 'special':
-        result = indexes[0]
-        for other in indexes[1:]:
-            result = result.union(other)
-        return result
-    elif kind == 'array':
-        index = indexes[0]
-        for other in indexes[1:]:
-            if not index.equals(other):
-                return Index(lib.fast_unique_multiple(indexes))
-
-        return index
-    else:
-        return Index(lib.fast_unique_multiple_list(indexes))
-
-
-def _sanitize_and_check(indexes):
-    kinds = list(set([type(index) for index in indexes]))
-
-    if list in kinds:
-        if len(kinds) > 1:
-            indexes = [Index(_try_sort(x)) if not isinstance(x, Index) else x
-                       for x in indexes]
-            kinds.remove(list)
-        else:
-            return indexes, 'list'
-
-
-    if len(kinds) > 1 or Index not in kinds:
-        return indexes, 'special'
-    else:
-        return indexes, 'array'
-
 
 def _check_data_types(data):
     have_raw_arrays = False
 
@@ -5,7 +5,7 @@
 
 import numpy as np
 
-from pandas.core.common import (adjoin as _adjoin, _stringify,
+from pandas.core.common import (adjoin as _adjoin, _stringify, _try_sort,
                                 _is_bool_indexer, _asarray_tuplesafe)
 from pandas.util.decorators import cache_readonly
 import pandas.core.common as com
@@ -119,6 +119,15 @@ def is_monotonic(self):
         except TypeError:
             return False
 
+    def get_duplicates(self):
+        """
+        Return the labels that occur more than once in this index
+
+        Returns
+        -------
+        duplicates : list
+            Sorted list of the distinct values in ``self.values`` that
+            appear at least twice; empty if every value is unique
+        """
+        from collections import defaultdict
+        # tally occurrences of each value; unseen keys start at 0
+        counter = defaultdict(lambda: 0)
+        for k in self.values:
+            counter[k] += 1
+        return sorted(k for k, v in counter.iteritems() if v > 1)
+
+    _get_duplicates = get_duplicates
+
     @property
     def indexMap(self):
         "{label -> location}"
@@ -143,13 +152,6 @@ def _get_level_number(self, level):
     def _verify_integrity(self):
         return self._engine.has_integrity
 
-    def _get_duplicates(self):
-        from collections import defaultdict
-        counter = defaultdict(lambda: 0)
-        for k in self.values:
-            counter[k] += 1
-        return sorted(k for k, v in counter.iteritems() if v > 1)
-
     _allDates = None
     def is_all_dates(self):
         """
@@ -1261,9 +1263,6 @@ def append(self, other):
         appended : Index
         """
         if isinstance(other, (list, tuple)):
-            for k in other:
-                assert(isinstance(k, MultiIndex))
-
             to_concat = (self.values,) + tuple(k.values for k in other)
         else:
             to_concat = self.values, other.values
@@ -1871,3 +1870,66 @@ def _ensure_index(index_like):
 def _validate_join_method(method):
     if method not in ['left', 'right', 'inner', 'outer']:
         raise Exception('do not recognize join method %s' % method)
+
+# TODO: handle index names!
+
+def _get_combined_index(indexes, intersect=False):
+    """
+    Combine several index objects into one by union or intersection
+
+    Parameters
+    ----------
+    indexes : list of index objects
+    intersect : boolean, default False
+        If True, intersect all of the indexes; otherwise take their union
+
+    Returns
+    -------
+    index : Index
+        If only one distinct object is passed it is returned as-is
+    """
+    # collapse repeated references to the same object before combining
+    # NOTE(review): the distinct indexes come back from a dict keyed on
+    # id(), so which one ends up as indexes[0] is not tied to input order
+    indexes = _get_distinct_indexes(indexes)
+    if len(indexes) == 1:
+        return indexes[0]
+    if intersect:
+        # fold pairwise intersection across all of the indexes
+        index = indexes[0]
+        for other in indexes[1:]:
+            index = index.intersection(other)
+        return index
+    union =  _union_indexes(indexes)
+    return Index(union)
+
+def _get_distinct_indexes(indexes):
+    """
+    Drop repeated references to the same object from ``indexes``
+
+    Distinctness is by identity (``id``), not equality: two equal but
+    separate objects are both kept. The result is the values of a dict
+    keyed on ``id``, so its order follows dict ordering
+    """
+    return dict((id(x), x) for x in indexes).values()
+
+
+def _union_indexes(indexes):
+    """
+    Return the union of a list of indexes (or plain lists of labels)
+
+    An empty input gives an empty Index. A single input is returned
+    unchanged, except that a plain list is sorted and wrapped in an Index.
+    Otherwise the strategy depends on the kind reported by
+    ``_sanitize_and_check``:
+
+    - 'special' : fold the inputs together with their own ``union`` method
+    - 'array' : return the first index if all are equal, else build a new
+      Index from ``lib.fast_unique_multiple``
+    - anything else (all plain lists) : build a new Index from
+      ``lib.fast_unique_multiple_list``
+    """
+    if len(indexes) == 0:
+        return Index([])
+
+    if len(indexes) == 1:
+        result = indexes[0]
+        if isinstance(result, list):
+            result = Index(sorted(result))
+        return result
+
+    indexes, kind = _sanitize_and_check(indexes)
+
+    if kind == 'special':
+        result = indexes[0]
+        for other in indexes[1:]:
+            result = result.union(other)
+        return result
+    elif kind == 'array':
+        index = indexes[0]
+        for other in indexes[1:]:
+            if not index.equals(other):
+                # first mismatch: fall back to combining all of the inputs
+                # (presumably the unique labels across them -- TODO confirm
+                # against lib.fast_unique_multiple)
+                return Index(lib.fast_unique_multiple(indexes))
+
+        # every index equals the first, so reuse it without copying
+        return index
+    else:
+        return Index(lib.fast_unique_multiple_list(indexes))
+
+
+def _sanitize_and_check(indexes):
+    """
+    Classify the inputs to a union, converting mixed input to Index objects
+
+    Returns
+    -------
+    (indexes, kind) : tuple
+        kind is 'list' when every input is a plain list (inputs returned
+        unchanged), 'array' when every input is exactly of type Index, and
+        'special' otherwise. When lists are mixed with other types, every
+        non-Index input is first converted with ``Index(_try_sort(x))``
+    """
+    # exact types are compared, so a subclass of Index is a separate kind
+    kinds = list(set([type(index) for index in indexes]))
+
+    if list in kinds:
+        if len(kinds) > 1:
+            indexes = [Index(_try_sort(x)) if not isinstance(x, Index) else x
+                       for x in indexes]
+            # the lists have been converted, so stop counting them as a kind
+            kinds.remove(list)
+        else:
+            return indexes, 'list'
+
+
+    if len(kinds) > 1 or Index not in kinds:
+        return indexes, 'special'
+    else:
+        return indexes, 'array'
@@ -9,10 +9,11 @@
 
 from pandas.core.common import (PandasError, _mut_exclusive,
                                 _try_sort, _default_index, _infer_dtype)
-from pandas.core.index import Factor, Index, MultiIndex, _ensure_index
+from pandas.core.index import (Factor, Index, MultiIndex, _ensure_index,
+                               _get_combined_index, _union_indexes)
 from pandas.core.indexing import _NDFrameIndexer
 from pandas.core.internals import BlockManager, make_block, form_blocks
-from pandas.core.frame import DataFrame, _union_indexes
+from pandas.core.frame import DataFrame
 from pandas.core.generic import NDFrame
 from pandas.util import py3compat
 from pandas.util.decorators import deprecate
@@ -1152,52 +1153,18 @@ def _homogenize_dict(frames, intersect=True, dtype=None):
         else:
             adj_frames[k] = v
 
-    index = _get_combined_index(adj_frames, intersect=intersect)
-    columns = _get_combined_columns(adj_frames, intersect=intersect)
+    all_indexes = [df.index for df in adj_frames.values()]
+    all_columns = [df.columns for df in adj_frames.values()]
+
+    index = _get_combined_index(all_indexes, intersect=intersect)
+    columns = _get_combined_index(all_columns, intersect=intersect)
 
     for key, frame in adj_frames.iteritems():
         result[key] = frame.reindex(index=index, columns=columns,
                                     copy=False)
 
     return result, index, columns
 
-def _get_combined_columns(frames, intersect=False):
-    columns = None
-
-    if intersect:
-        combine = set.intersection
-    else:
-        combine = set.union
-
-    for _, frame in frames.iteritems():
-        this_cols = set(frame.columns)
-
-        if columns is None:
-            columns = this_cols
-        else:
-            columns = combine(columns, this_cols)
-
-    return Index(sorted(columns))
-
-def _get_combined_index(frames, intersect=False):
-    from pandas.core.frame import _union_indexes
-
-    indexes = _get_distinct_indexes([df.index for df in frames.values()])
-    if len(indexes) == 1:
-        return indexes[0]
-    if intersect:
-        index = indexes[0]
-        for other in indexes[1:]:
-            index = index.intersection(other)
-        return index
-    union =  _union_indexes(indexes)
-    return Index(union)
-
-def _get_distinct_indexes(indexes):
-    from itertools import groupby
-    indexes = sorted(indexes, key=id)
-    return [gp.next() for _, gp in groupby(indexes, id)]
-
 def _monotonic(arr):
     return not (arr[1:] < arr[:-1]).any()
 
 
@@ -413,7 +413,7 @@ def get_chunk(self, rows=None):
             index = Index(np.arange(len(content)))
 
         if not index._verify_integrity():
-            dups = index._get_duplicates()
+            dups = index.get_duplicates()
             raise Exception('Index has duplicates: %s' % str(dups))
 
         if len(self.columns) != len(zipped_content):
 
@@ -426,7 +426,7 @@ def minor_xs(self, key):
 SparseWidePanel = SparsePanel
 
 def _convert_frames(frames, index, columns, fill_value=np.nan, kind='block'):
-    from pandas.core.panel import _get_combined_index, _get_combined_columns
+    from pandas.core.panel import _get_combined_index
     output = {}
     for item, df in frames.iteritems():
         if not isinstance(df, SparseDataFrame):
@@ -436,9 +436,11 @@ def _convert_frames(frames, index, columns, fill_value=np.nan, kind='block'):
         output[item] = df
 
     if index is None:
-        index = _get_combined_index(output)
+        all_indexes = [df.index for df in output.values()]
+        index = _get_combined_index(all_indexes)
     if columns is None:
-        columns = _get_combined_columns(output)
+        all_columns = [df.columns for df in output.values()]
+        columns = _get_combined_index(all_columns)
 
     index = _ensure_index(index)
     columns = _ensure_index(columns)