Skip to content

Commit f82d931

Browse files
committed
BUG/ENH: treat NA as additional group in merge operations. close pandas-dev#1990
1 parent eb48812 commit f82d931

File tree

4 files changed

+40
-4
lines changed

4 files changed

+40
-4
lines changed

RELEASE.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ pandas 0.9.0
3939

4040
**Improvements to existing features**
4141

42+
- Proper handling of NA values in merge operations: NA keys are treated as an additional group, so NA keys on both sides match each other (#1990)
4243
- Add ``flags`` option for ``re.compile`` in some Series.str methods (#1659)
4344
- Parsing of UTC date strings in read_* functions (#1693)
4445
- Handle generator input to Series (#1679)

pandas/tools/merge.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,6 @@ def _merger(x, y):
142142

143143

144144

145-
# TODO: NA group handling
146145
# TODO: transformations??
147146
# TODO: only copy DataFrames when modification necessary
148147

@@ -572,7 +571,16 @@ def _factorize_keys(lk, rk, sort=True):
572571
if sort:
573572
llab, rlab = _sort_labels(rizer.uniques, llab, rlab)
574573

575-
# TODO: na handling
574+
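# NA group: keys labeled -1 (missing) on either side are relabeled to one new shared group id (count), then count is bumped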
575+
lmask = llab == -1; lany = lmask.any()
576+
rmask = rlab == -1; rany = rmask.any()
577+
578+
if lany or rany:
579+
if lany:
580+
np.putmask(llab, lmask, count)
581+
if rany:
582+
np.putmask(rlab, rmask, count)
583+
count += 1
576584

577585
return llab, rlab, count
578586

pandas/tools/tests/test_merge.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -777,6 +777,34 @@ def test_left_merge_na_buglet(self):
777777
expected = left.join(rdf)
778778
tm.assert_frame_equal(merged, expected)
779779

780+
def test_merge_na_keys(self):
781+
data = [[1950, "A", 1.5],
782+
[1950, "B", 1.5],
783+
[1955, "B", 1.5],
784+
[1960, "B", np.nan],
785+
[1970, "B", 4.],
786+
[1950, "C", 4.],
787+
[1960, "C", np.nan],
788+
[1965, "C", 3.],
789+
[1970, "C", 4.]]
790+
791+
frame = DataFrame(data, columns=["year", "panel", "data"])
792+
793+
other_data = [[1960, 'A', np.nan],
794+
[1970, 'A', np.nan],
795+
[1955, 'A', np.nan],
796+
[1965, 'A', np.nan],
797+
[1965, 'B', np.nan],
798+
[1955, 'C', np.nan]]
799+
other = DataFrame(other_data, columns=['year', 'panel', 'data'])
800+
801+
result = frame.merge(other, how='outer')
802+
803+
expected = frame.fillna(-999).merge(other.fillna(-999), how='outer')
804+
expected = expected.replace(-999, np.nan)
805+
806+
tm.assert_frame_equal(result, expected)
807+
780808

781809
def _check_join(left, right, result, join_col, how='left',
782810
lsuffix='_x', rsuffix='_y'):

setup.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -372,8 +372,7 @@ def srcpath(name=None, suffix='.pyx', subdir='src'):
372372

373373
algos_ext = Extension('pandas._algos',
374374
sources=[srcpath('generated', suffix=suffix)],
375-
include_dirs=[np.get_include()],
376-
)
375+
include_dirs=[np.get_include()])
377376

378377
lib_depends = tseries_depends + ['pandas/src/numpy_helper.h',
379378
'pandas/src/datetime/np_datetime.h',

0 commit comments

Comments
 (0)