JuliaData · bkamins · Jun 3, 2021 · Apr 17, 2021 · May 30, 2021 · May 30, 2021
diff --git a/src/join/composer.jl b/src/join/composer.jl
@@ -46,37 +46,36 @@ struct DataFrameJoiner
                                     "Symbol or Pair{Symbol, Symbol}."))
             end
         end
-        dfl_on = dfl[!, left_on]
-        dfr_on = dfr[!, right_on]
-
-        if matchmissing === :error
-            for df in (dfl_on, dfr_on), col in eachcol(df)
-                if any(ismissing, col)
-                    throw(ArgumentError("missing values in key columns are not allowed " *
-                                        "when matchmissing == :error"))
-                end
-            end
-        elseif matchmissing === :notequal
+
+        if matchmissing === :notequal
             if kind in (:left, :semi, :anti)
                 dfr = dropmissing(dfr, right_on, view=true)
-                dfr_on = select(dfr, right_on)
             elseif kind === :right
                 dfl = dropmissing(dfl, left_on, view=true)
-                dfl_on = select(dfl, left_on)
             elseif kind === :inner
+                # it is possible to drop missings from only the left or right df
+                # to gain some performance, but this needs more testing, see #2724
                 dfl = dropmissing(dfl, left_on, view=true)
-                dfl_on = select(dfl, left_on)
                 dfr = dropmissing(dfr, right_on, view=true)
-                dfr_on = select(dfr, right_on)
             elseif kind === :outer
                 throw(ArgumentError("matchmissing == :notequal for `outerjoin` is not allowed"))
             else
-                throw(ArgumentError("matchmissing == :notequal not implemented for kind == $kind"))
+                throw(ArgumentError("matchmissing == :notequal not implemented for kind == :$kind"))
             end
-        elseif matchmissing !== :equal
+        end
+        dfl_on = select(dfl, left_on, copycols=false)
+        dfr_on = select(dfr, right_on, copycols=false)
+        if matchmissing === :error
+            for df in (dfl_on, dfr_on), col in eachcol(df)
+                if any(ismissing, col)
+                    throw(ArgumentError("missing values in key columns are not allowed " *
+                                        "when matchmissing == :error"))
+                end
+            end
+        elseif !(matchmissing in (:equal, :notequal))
             throw(ArgumentError("matchmissing allows only :error, :equal, or :notequal"))
         end
-
+    
         for df in (dfl_on, dfr_on), col in eachcol(df)
             if any(x -> (x isa Union{Complex, Real}) &&
                         (isnan(x) || isequal(real(x), -0.0) || isequal(imag(x), -0.0)), col)
@@ -503,8 +502,8 @@ change in future releases.
   data frame and left unchanged.
 - `matchmissing` : if equal to `:error` throw an error if `missing` is present
   in `on` columns; if equal to `:equal` then `missing` is allowed and missings are
-  matched; if equal to `:notequal` then missings are dropped in `df1` and `df2` 
-  `on` columns (`isequal` is used for comparisons of rows for equality)
+  matched; if equal to `:notequal` then missings are dropped in `df1` and `df2`
+  `on` columns; `isequal` is used for comparisons of rows for equality
 
 It is not allowed to join on columns that contain `NaN` or `-0.0` in real or
 imaginary part of the number. If you need to perform a join on such values use
@@ -645,8 +644,8 @@ change in future releases.
   data frame and left unchanged.
 - `matchmissing` : if equal to `:error` throw an error if `missing` is present
   in `on` columns; if equal to `:equal` then `missing` is allowed and missings are
-  matched; if equal to `:notequal` then missings are dropped in `df2` `on` columns
-  (`isequal` is used for comparisons of rows for equality)
+  matched; if equal to `:notequal` then missings are dropped in `df2` `on` columns;
+  `isequal` is used for comparisons of rows for equality
 
 All columns of the returned data table will support missing values.
 
@@ -792,8 +791,8 @@ change in future releases.
   data frame and left unchanged.
 - `matchmissing` : if equal to `:error` throw an error if `missing` is present
   in `on` columns; if equal to `:equal` then `missing` is allowed and missings are
-  matched; if equal to `:notequal` then missings are dropped in `df1` `on` columns
-  (`isequal` is used for comparisons of rows for equality)
+  matched; if equal to `:notequal` then missings are dropped in `df1` `on` columns;
+  `isequal` is used for comparisons of rows for equality
 
 All columns of the returned data table will support missing values.
 
@@ -944,7 +943,7 @@ This behavior may change in future releases.
   data frame and left unchanged.
 - `matchmissing` : if equal to `:error` throw an error if `missing` is present
   in `on` columns; if equal to `:equal` then `missing` is allowed and missings are
-  matched (`isequal` is used for comparisons of rows for equality)
+  matched; `isequal` is used for comparisons of rows for equality
 
 All columns of the returned data table will support missing values.
 
@@ -1092,8 +1091,8 @@ The order of rows in the result is undefined and may change in the future releas
    By default no check is performed.
 - `matchmissing` : if equal to `:error` throw an error if `missing` is present
   in `on` columns; if equal to `:equal` then `missing` is allowed and missings are
-  matched; if equal to `:notequal` then missings are dropped in `df2` `on` columns
-  (`isequal` is used for comparisons of rows for equality)
+  matched; if equal to `:notequal` then missings are dropped in `df2` `on` columns;
+  `isequal` is used for comparisons of rows for equality
 
 It is not allowed to join on columns that contain `NaN` or `-0.0` in real or
 imaginary part of the number. If you need to perform a join on such values use
@@ -1198,8 +1197,8 @@ The order of rows in the result is undefined and may change in the future releas
    By default no check is performed.
 - `matchmissing` : if equal to `:error` throw an error if `missing` is present
   in `on` columns; if equal to `:equal` then `missing` is allowed and missings are
-  matched; if equal to `:notequal` then missings are dropped in `df2` `on` columns
-  (`isequal` is used for comparisons of rows for equality)
+  matched; if equal to `:notequal` then missings are dropped in `df2` `on` columns;
+  `isequal` is used for comparisons of rows for equality
 
 It is not allowed to join on columns that contain `NaN` or `-0.0` in real or
 imaginary part of the number. If you need to perform a join on such values use

diff --git a/test/join.jl b/test/join.jl
@@ -1562,47 +1562,48 @@ end
 @testset "matchmissing :notequal correctness" begin
     Random.seed!(1337)
     names = [
-        DataFrame(ID = Union{Int, Missing}[1, 2, missing],
-            Name = Union{String, Missing}["John Doe", "Jane Doe", "Joe Blogs"]),
-        DataFrame(ID = Union{Int, Missing}[], Name = String[]),
-        DataFrame(ID = Union{Int, Missing}[missing, missing, missing],
-            Name = String["John Doe", "Jane Doe", "Joe Blogs"]),
-        DataFrame(ID = Union{Int, Missing}[1, 2, 3],
-            Name = Union{String, Missing}[missing, "Jane Doe", missing]),
-        DataFrame(ID = Union{Int, Missing}[1:100; missings(100)],
-            Name = Union{String, Missing}[repeat(["Jane Doe"], 200)...]),
-        DataFrame(ID = Union{Int, Missing}[missings(100); 1:100],
-            Name = Union{String, Missing}[repeat(["Jane Doe"], 200)...]),
-        DataFrame(ID = Union{Int, Missing}[1:50; missings(100); 51:100],
-            Name = Union{String, Missing}[repeat(["Jane Doe"], 200)...]),
-        DataFrame(ID = Union{Int, Missing}[1:64; missings(64); 129:200],
-            Name = Union{String, Missing}[repeat(["Jane Doe"], 200)...]),
-        DataFrame(ID = Union{Int, Missing}[1:63; missings(65); 129:200],
-            Name = Union{String, Missing}[repeat(["Jane Doe"], 200)...]),
-        DataFrame(ID = Union{Int, Missing}[rand([1:1000; missing], 10000)...],
-            Name = Union{String, Missing}[rand(["John Doe", "Jane Doe", "Joe Blogs", missing], 10000)...]),
+        DataFrame(ID=[1, 2, missing],
+                  Name=["John Doe", "Jane Doe", "Joe Blogs"]),
+        DataFrame(ID=Int[],
+                  Name=String[]),
+        DataFrame(ID=Union{Int, Missing}[missings(3)...],
+                  Name=String["John Doe", "Jane Doe", "Joe Blogs"]),
+        DataFrame(ID=[1, 2, 3],
+                  Name=[missing, "Jane Doe", missing]),
+        DataFrame(ID=[1:100; missings(100)],
+                  Name=repeat(["Jane Doe"], 200)),
+        DataFrame(ID=[missings(100); 1:100],
+                  Name=repeat(["Jane Doe"], 200)),
+        DataFrame(ID=[1:50; missings(100); 51:100],
+                  Name=repeat(["Jane Doe"], 200)),
+        DataFrame(ID=[1:64; missings(64); 129:200],
+                  Name=repeat(["Jane Doe"], 200)),
+        DataFrame(ID=[1:63; missings(65); 129:200],
+                  Name=repeat(["Jane Doe"], 200)),
+        DataFrame(ID=rand([1:1000; missing], 10000),
+                  Name=rand(["John Doe", "Jane Doe", "Joe Blogs", missing], 10000)),
     ]
     jobs = [
-        DataFrame(ID = Union{Int, Missing}[1, 2, 2, 4],
-            Job = Union{String, Missing}["Lawyer", "Doctor", "Florist", "Farmer"]),
-        DataFrame(ID = Union{Int, Missing}[missing, 2, 2, 4],
-            Job = Union{String, Missing}["Lawyer", "Doctor", "Florist", "Farmer"]),
-        DataFrame(ID = Union{Int, Missing}[missing, 2, 2, 4],
-            Job = Union{String, Missing}["Lawyer", "Doctor", missing, "Farmer"]),
-        DataFrame(ID = Union{Int, Missing}[],
-            Job = Union{String, Missing}[]),
-        DataFrame(ID = Union{Int, Missing}[1:100; missings(100)],
-            Job = Union{String, Missing}[repeat(["Lawyer"], 200)...]),
-        DataFrame(ID = Union{Int, Missing}[missings(100); 1:100],
-            Job = Union{String, Missing}[repeat(["Lawyer"], 200)...]),
-        DataFrame(ID = Union{Int, Missing}[1:50; missings(100); 51:100],
-            Job = Union{String, Missing}[repeat(["Lawyer"], 200)...]),
-        DataFrame(ID = Union{Int, Missing}[1:64; missings(64); 129:200],
-            Job = Union{String, Missing}[repeat(["Lawyer"], 200)...]),
-        DataFrame(ID = Union{Int, Missing}[1:63; missings(65); 129:200],
-            Job = Union{String, Missing}[repeat(["Lawyer"], 200)...]),
-        DataFrame(ID = Union{Int, Missing}[rand([1:1000; missing], 10000)...],
-            Job = Union{String, Missing}[rand(["Lawyer", "Doctor", "Florist", missing], 10000)...]),
+        DataFrame(ID=[1, 2, 2, 4],
+                  Job=["Lawyer", "Doctor", "Florist", "Farmer"]),
+        DataFrame(ID=[missing, 2, 2, 4],
+                  Job=["Lawyer", "Doctor", "Florist", "Farmer"]),
+        DataFrame(ID=[missing, 2, 2, 4],
+                  Job=["Lawyer", "Doctor", missing, "Farmer"]),
+        DataFrame(ID=Union{Int, Missing}[],
+                  Job=Union{String, Missing}[]),
+        DataFrame(ID=[1:100; missings(100)],
+                  Job=repeat(["Lawyer"], 200)),
+        DataFrame(ID=[missings(100); 1:100],
+                  Job=repeat(["Lawyer"], 200)),
+        DataFrame(ID=[1:50; missings(100); 51:100],
+                  Job=repeat(["Lawyer"], 200)),
+        DataFrame(ID=[1:64; missings(64); 129:200],
+                  Job=repeat(["Lawyer"], 200)),
+        DataFrame(ID=[1:63; missings(65); 129:200],
+                  Job=repeat(["Lawyer"], 200)),
+        DataFrame(ID=rand([1:1000; missing], 10000),
+                  Job=rand(["Lawyer", "Doctor", "Florist", missing], 10000)),
     ]
     for name in names, job in jobs
         @test leftjoin(name, dropmissing(job, :ID), on=:ID, matchmissing=:equal) ≅
@@ -1619,90 +1620,69 @@ end
 
     rl(n) = rand(["a", "b", "c"], n)
     names2 = [
-        DataFrame(
-            ID1 = Union{Int, Missing}[1, 1, 2],
-            ID2 = Union{String, Missing}["a", "b", "a"],
-            Name = Union{String, Missing}["John Doe", "Jane Doe", "Joe Blogs"]),
-        DataFrame(
-            ID1 = Union{Int, Missing}[1, 1, 2, missing],
-            ID2 = Union{String, Missing}["a", "b", "a", missing],
-            Name = Union{String, Missing}["John Doe", "Jane Doe", "Joe Blogs", missing]),
-        DataFrame(
-            ID1 = Union{Int, Missing}[missing, 1, 2, missing],
-            ID2 = Union{String, Missing}["a", "b", missing, missing],
-            Name = Union{String, Missing}[missing, "Jane Doe", "Joe Blogs", missing]),
-        DataFrame(
-            ID1 = Union{Int, Missing}[missing, 1, 2, missing],
-            ID2 = Union{String, Missing}["a", "b", missing, missing],
-            Name = Union{String, Missing}[missings(4)...]),
-        DataFrame(
-            ID1 = Union{Int, Missing}[missing, 1, 2, missing],
-            ID2 = Union{String, Missing}[missings(4)...],
-            Name = Union{String, Missing}["John Doe", "Jane Doe", "Joe Blogs", missing]),
-        DataFrame(
-            ID1 = Union{Int, Missing}[1:100; missings(100)],
-            ID2 = Union{String, Missing}[rl(100); missings(100)],
-            Name = Union{String, Missing}[rand(["Jane Doe", "Jane Doe"], 200)...]),
-        DataFrame(
-            ID1 = Union{Int, Missing}[missings(100); 1:100],
-            ID2 = Union{String, Missing}[missings(100); rl(100)],
-            Name = Union{String, Missing}[rand(["Jane Doe", "Jane Doe"], 200)...]),
-        DataFrame(
-            ID1 = Union{Int, Missing}[1:50; missings(100); 51:100],
-            ID2 = Union{String, Missing}[rl(50); missings(100); rl(50)],
-            Name = Union{String, Missing}[rand(["Jane Doe", "Jane Doe"], 200)...]),
-        DataFrame(
-            ID1 = Union{Int, Missing}[1:64; missings(64); 129:200],
-            ID2 = Union{String, Missing}[rl(64); missings(64); rl(200 - 128)],
-            Name = Union{String, Missing}[rand(["Jane Doe", "Jane Doe"], 200)...]),
-        DataFrame(
-            ID1 = Union{Int, Missing}[1:63; missings(65); 129:200],
-            ID2 = Union{String, Missing}[rl(64); missings(65); rl(200 - 129)],
-            Name = Union{String, Missing}[rand(["Jane Doe", "Jane Doe"], 200)...]),
-        DataFrame(
-            ID1 = Union{Int, Missing}[rand([1:100; missing], 10000)...],
-            ID2 = Union{String, Missing}[rand(["a", "b", "c", missing], 10000)...],
-            Name = Union{String, Missing}[rand(["John Doe", "Jane Doe", "Joe Blogs", missing], 10000)...]),
+        DataFrame(ID1=[1, 1, 2],
+                  ID2=["a", "b", "a"],
+                  Name=["John Doe", "Jane Doe", "Joe Blogs"]),
+        DataFrame(ID1=[1, 1, 2, missing],
+                  ID2=["a", "b", "a", missing],
+                  Name=["John Doe", "Jane Doe", "Joe Blogs", missing]),
+        DataFrame(ID1=[missing, 1, 2, missing],
+                  ID2=["a", "b", missing, missing],
+                  Name=[missing, "Jane Doe", "Joe Blogs", missing]),
+        DataFrame(ID1=[missing, 1, 2, missing],
+                  ID2=["a", "b", missing, missing],
+                  Name=Union{String, Missing}[missings(4)...]),
+        DataFrame(ID1=[missing, 1, 2, missing],
+                  ID2=Union{String, Missing}[missings(4)...],
+                  Name=["John Doe", "Jane Doe", "Joe Blogs", missing]),
+        DataFrame(ID1=[1:100; missings(100)],
+                  ID2=[rl(100); missings(100)],
+                  Name=rand(["Jane Doe", "Jane Doe"], 200)),
+        DataFrame(ID1=[missings(100); 1:100],
+                  ID2=[missings(100); rl(100)],
+                  Name=rand(["Jane Doe", "Jane Doe"], 200)),
+        DataFrame(ID1=[1:50; missings(100); 51:100],
+                  ID2=[rl(50); missings(100); rl(50)],
+                  Name=rand(["Jane Doe", "Jane Doe"], 200)),
+        DataFrame(ID1=[1:64; missings(64); 129:200],
+                  ID2=[rl(64); missings(64); rl(200 - 128)],
+                  Name=rand(["Jane Doe", "Jane Doe"], 200)),
+        DataFrame(ID1=[1:63; missings(65); 129:200],
+                  ID2=[rl(64); missings(65); rl(200 - 129)],
+                  Name=rand(["Jane Doe", "Jane Doe"], 200)),
+        DataFrame(ID1=rand([1:100; missing], 10000),
+                  ID2=rand(["a", "b", "c", missing], 10000),
+                  Name=rand(["John Doe", "Jane Doe", "Joe Blogs", missing], 10000)),
     ]
     jobs2 = [
-        DataFrame(
-            ID1 = Union{Int, Missing}[1, 2, 2, 4],
-            ID2 = Union{String, Missing}["a", "b", "b", "c"],
-            Job = Union{String, Missing}["Lawyer", "Doctor", "Florist", "Farmer"]),
-        DataFrame(
-            ID1 = Union{Int, Missing}[1, 2, 2, 4, missing],
-            ID2 = Union{String, Missing}["a", "b", "b", "c", missing],
-            Job = Union{String, Missing}["Lawyer", "Doctor", "Florist", "Farmer", missing]),
-        DataFrame(
-            ID1 = Union{Int, Missing}[1, 2, missing, 4, missing],
-            ID2 = Union{String, Missing}["a", "b", missing, "c", missing],
-            Job = Union{String, Missing}[missing, "Doctor", "Florist", "Farmer", missing]),
-        DataFrame(
-            ID1 = Union{Int, Missing}[1:100; missings(100)],
-            ID2 = Union{String, Missing}[rl(100); missings(100)],
-            Job = Union{String, Missing}[rand(["Doctor", "Florist"], 200)...]),
-        DataFrame(
-            ID1 = Union{Int, Missing}[missings(100); 1:100],
-            ID2 = Union{String, Missing}[missings(100); rl(100)],
-            Job = Union{String, Missing}[rand(["Doctor", "Florist"], 200)...]),
-        DataFrame(
-            ID1 = Union{Int, Missing}[1:50; missings(100); 51:100],
-            ID2 = Union{String, Missing}[rl(50); missings(100); rl(50)],
-            Job = Union{String, Missing}[rand(["Doctor", "Florist"], 200)...]),
-        DataFrame(
-            ID1 = Union{Int, Missing}[1:64; missings(64); 129:200],
-            ID2 = Union{String, Missing}[rl(64); missings(64); rl(200 - 128)],
-            Job = Union{String, Missing}[rand(["Doctor", "Florist"], 200)...]),
-        DataFrame(
-            ID1 = Union{Int, Missing}[1:63; missings(65); 129:200],
-            ID2 = Union{String, Missing}[rl(64); missings(65); rl(200 - 129)],
-            Job = Union{String, Missing}[rand(["Doctor", "Florist"], 200)...]),
-        DataFrame(
-            ID1 = Union{Int, Missing}[rand([1:100; missing], 10000)...],
-            ID2 = Union{String, Missing}[rand(["a", "b", "c", missing], 10000)...],
-            Job = Union{String, Missing}[rand(["Doctor", "Florist", "Farmer", missing], 10000)...]),
+        DataFrame(ID1=[1, 2, 2, 4],
+                  ID2=["a", "b", "b", "c"],
+                  Job=["Lawyer", "Doctor", "Florist", "Farmer"]),
+        DataFrame(ID1=[1, 2, 2, 4, missing],
+                  ID2=["a", "b", "b", "c", missing],
+                  Job=["Lawyer", "Doctor", "Florist", "Farmer", missing]),
+        DataFrame(ID1=[1, 2, missing, 4, missing],
+                  ID2=["a", "b", missing, "c", missing],
+                  Job=[missing, "Doctor", "Florist", "Farmer", missing]),
+        DataFrame(ID1=[1:100; missings(100)],
+                  ID2=[rl(100); missings(100)],
+                  Job=rand(["Doctor", "Florist"], 200)),
+        DataFrame(ID1=[missings(100); 1:100],
+                  ID2=[missings(100); rl(100)],
+                  Job=rand(["Doctor", "Florist"], 200)),
+        DataFrame(ID1=[1:50; missings(100); 51:100],
+                  ID2=[rl(50); missings(100); rl(50)],
+                  Job=rand(["Doctor", "Florist"], 200)),
+        DataFrame(ID1=[1:64; missings(64); 129:200],
+                  ID2=[rl(64); missings(64); rl(200 - 128)],
+                  Job=rand(["Doctor", "Florist"], 200)),
+        DataFrame(ID1=[1:63; missings(65); 129:200],
+                  ID2=[rl(64); missings(65); rl(200 - 129)],
+                  Job=rand(["Doctor", "Florist"], 200)),
+        DataFrame(ID1=rand([1:100; missing], 10000),
+                  ID2=rand(["a", "b", "c", missing], 10000),
+                  Job=rand(["Doctor", "Florist", "Farmer", missing], 10000)),
     ]
-
     k = [:ID1, :ID2]
     for name in names2, job in jobs2
         @test leftjoin(name, dropmissing(job, k), on=k, matchmissing=:equal) ≅