diff --git a/src/DataFrames.jl b/src/DataFrames.jl
index 2e261d3f10..49a9f7f631 100644
--- a/src/DataFrames.jl
+++ b/src/DataFrames.jl
@@ -70,6 +70,7 @@ export @~,
        readtable,
        rename!,
        rename,
+       sample,
        showcols,
        stack,
        stackdf,
@@ -111,7 +112,9 @@ for (dir, filename) in [
         ("statsmodels", "statsmodel.jl"),
 
         ("", "RDA.jl"),
-        ("", "deprecated.jl")
+        ("", "deprecated.jl"),
+
+        ("other", "sample.jl"),
     ]
 
     include(joinpath(dir, filename))
diff --git a/src/other/sample.jl b/src/other/sample.jl
new file mode 100644
index 0000000000..45b81ab65c
--- /dev/null
+++ b/src/other/sample.jl
@@ -0,0 +1,35 @@
+import StatsBase: sample
+
+"""
+    sample(df[, n]; replace=true, ordered=false)
+
+Draw a random sample of `n` rows from a data frame `df` and return the result as a
+data frame.
+
+# Arguments
+
+- `df::AbstractDataFrame`: The data frame to draw rows from.
+- `n::Integer=1`: The number of rows to draw.
+- `replace::Bool=true`: Should sampling be performed with replacement?
+- `ordered::Bool=false`: Should an ordered sample be taken?
+
+# Example
+```
+julia> using RDatasets
+julia> iris = dataset("datasets", "iris")
+julia> srand(1)
+julia> sample(iris, 5)
+5×5 DataFrames.DataFrame
+│ Row │ SepalLength │ SepalWidth │ PetalLength │ PetalWidth │ Species │
+│ 1 │ 5.0 │ 2.0 │ 3.5 │ 1.0 │ "versicolor" │
+│ 2 │ 6.2 │ 2.9 │ 4.3 │ 1.3 │ "versicolor" │
+│ 3 │ 6.7 │ 3.1 │ 4.7 │ 1.5 │ "versicolor" │
+│ 4 │ 5.5 │ 2.3 │ 4.0 │ 1.3 │ "versicolor" │
+│ 5 │ 5.8 │ 2.7 │ 5.1 │ 1.9 │ "virginica" │
+```
+"""
+function sample(df::AbstractDataFrame, n::Integer=1;
+                replace::Bool=true, ordered::Bool=false)
+    rows = sample(1:size(df, 1), n, replace=replace, ordered=ordered)
+    return df[rows, :]
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index bfc53ca433..255444eb33 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -27,7 +27,8 @@ my_tests = ["utils.jl",
             "iteration.jl",
             "duplicates.jl",
             "show.jl",
-            "statsmodel.jl"]
+            "statsmodel.jl",
+            "sample.jl"]
 
 println("Running tests:")
 
diff --git a/test/sample.jl b/test/sample.jl
new file mode 100644
index 0000000000..68a6e2ce8d
--- /dev/null
+++ b/test/sample.jl
@@ -0,0 +1,16 @@
+module TestSample
+    using Base.Test
+    using DataFrames
+
+    df = DataFrame(A=1:10,B=11:20)
+    srand(1)
+    df_sample = sample(df, 5)
+    @test size(df_sample) == (5, 2)
+    @test df_sample[:A] == [1,8,7,4,2]
+    @test df_sample[:B] == [11,18,17,14,12]
+
+    # Without replacement, drawing every row must yield each row exactly once.
+    df_all = sample(df, 10, replace=false)
+    @test sort(df_all[:A]) == collect(1:10)
+    @test df_all[:B] == df_all[:A] .+ 10
+end