-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #77 from TidierOrg/regex_support_for_str_detect
Regex support for str detect and much more
- Loading branch information
Showing
27 changed files
with
638 additions
and
365 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,81 @@ | ||
# In this example, we will reproduce a DuckDB and duckplyr blog post example to demonstrate TidierDB's v0.5.0 capability. | ||
|
||
# The [example by Hannes](https://duckdb.org/2024/10/09/analyzing-open-government-data-with-duckplyr.html) that is being reproduced explores Open Data from the New Zealand government that is ~1 GB. | ||
|
||
# ## Set up | ||
# First we will set up the local duckdb database and pull in the metadata for the files. Notice we are not reading this data into memory, only the paths and the column and table names. | ||
# To follow along, copy the set up code below after downloading the data, but add the directory to the local data. | ||
# ```julia | ||
# import TidierDB as DB | ||
# db = DB.connect(DB.duckdb()) | ||
|
||
# dir = "/Downloads/nzcensus/" | ||
# data = dir * "Data8277.csv" | ||
# age = dir * "DimenLookupAge8277.csv" | ||
# area = dir * "DimenLookupArea8277.csv" | ||
# ethnic = dir * "DimenLookupEthnic8277.csv" | ||
# sex = dir * "DimenLookupSex8277.csv" | ||
# year = dir * "DimenLookupYear8277.csv" | ||
|
||
# data = DB.db_table(db, data); | ||
# age = DB.db_table(db, age); | ||
# area = DB.db_table(db, area); | ||
# ethnic = DB.db_table(db, ethnic); | ||
# sex = DB.db_table(db, sex); | ||
# year = DB.db_table(db, year); | ||
# ``` | ||
# ## Exploration | ||
# While this long chain could be broken up into multiple smaller chains, let's reproduce the duckplyr code from the example and demonstrate how TidierDB also supports multiple joins after filtering, mutating, etc. the joining tables. 6 different tables are being joined together through sequential inner joins. | ||
# ```julia | ||
# @chain DB.t(data) begin | ||
# DB.@filter(str_detect(count, r"^\d+$")) | ||
# DB.@mutate(count_ = "TRY_CAST(count AS INT)") | ||
# DB.@filter(count_ > 0) | ||
# DB.@inner_join( | ||
# (@chain DB.t(age) begin | ||
# DB.@filter(str_detect(Description, r"^\d+ years$")) | ||
# DB.@mutate(age_ = as_integer(str_remove(Code, "years"))) end), | ||
# Age = Code | ||
# ) | ||
# DB.@inner_join((@chain DB.t(year) DB.@mutate(year_ = Description)), year = Code) | ||
# DB.@inner_join((@chain DB.t(area) begin | ||
# DB.@mutate(area_ = Description) | ||
# DB.@filter(!str_detect(area_, r"^Total")) | ||
# end) | ||
# , Area = Code) | ||
# DB.@inner_join((@chain DB.t(ethnic) begin | ||
# DB.@mutate(ethnic_ = Description) | ||
# DB.@filter(!str_detect( ethnic_, r"^Total",)) end), Ethnic = Code) | ||
# DB.@inner_join((@chain DB.t(sex) begin | ||
# DB.@mutate(sex_ = Description) | ||
# DB.@filter(!str_detect( sex_, r"^Total")) | ||
# end) | ||
# , Sex = Code) | ||
# DB.@inner_join((@chain DB.t(year) DB.@mutate(year_ = Description)), Year = Code) | ||
# @aside DB.@show_query _ | ||
# DB.@create_view(joined_up) | ||
# end; | ||
|
||
# @chain DB.db_table(db, "joined_up") begin | ||
# DB.@filter begin | ||
# age_ >= 20 | ||
# age_ <= 40 | ||
# str_detect(area_, r"^Auckland") | ||
# year_ == "2018" | ||
# ethnic_ != "European" | ||
# end | ||
# DB.@group_by sex_ | ||
# DB.@summarise(group_count = sum(count_)) | ||
# DB.@collect | ||
# end | ||
# ``` | ||
# ## Results | ||
# When we collect this to a local dataframe, we can see that the results match the duckplyr/DuckDB example. | ||
# ``` | ||
# 2×2 DataFrame | ||
# Row │ sex_ group_count | ||
# │ String Int128 | ||
# ─────┼───────────────────── | ||
# 1 │ Female 398556 | ||
# 2 │ Male 397326 | ||
# ``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,78 +1,87 @@ | ||
# How can functions pass arguments to a TidierDB chain? | ||
|
||
# In short, you have to use a macro instead, in conjunction with `@interpolate` | ||
|
||
# ## Setting up the macro | ||
# To write a macro that will take arguments and pass them to a TidierDB chain, there are 3 steps: | ||
# 1. Write macro with the desired argument(s), and, after the quote, add the chain. Arguments to be changed/interpolated must be prefixed with `!!` | ||
# 2. Use `@interpolate` to make these arguments accessible to the chain. `@interpolate` takes tuples as arguments (one element for the `!!` name, and one for the actual content you want the chain to use) | ||
# 3. Run `@interpolate` and then the chain macro sequentially | ||
|
||
# ``` | ||
# using TidierDB | ||
# db = connect(duckdb()) | ||
# path = "https://gist.githubusercontent.com/seankross/a412dfbd88b3db70b74b/raw/5f23f993cd87c283ce766e7ac6b329ee7cc2e1d1/mtcars.csv" | ||
# copy_to(db, path, "mtcars"); | ||
# | ||
# # STEP 1 | ||
# macro f1(conditions, columns) # The argument names will be names of the `!!` values | ||
# return quote | ||
# # add chain here | ||
# @chain db_table(db, :mtcars) begin | ||
# @filter(!!conditions > 3) | ||
# @select(!!columns) | ||
# @aside @show_query _ | ||
# @collect | ||
# end # ends the chain | ||
# end # ends the quote. | ||
# end # ends the macro | ||
# ``` | ||
# ```julia | ||
# # STEP 2 | ||
# variable = :gear; | ||
# cols = [:model, :mpg, :gear, :wt]; | ||
# @interpolate((conditions, variable), (columns, cols)); | ||
# @f1(variable, cols) | ||
# ``` | ||
# ``` | ||
# 17×4 DataFrame | ||
# Row │ model mpg gear wt | ||
# │ String? Float64? Int32? Float64? | ||
# ─────┼──────────────────────────────────────────── | ||
# 1 │ Mazda RX4 21.0 4 2.62 | ||
# 2 │ Mazda RX4 Wag 21.0 4 2.875 | ||
# 3 │ Datsun 710 22.8 4 2.32 | ||
# ⋮ │ ⋮ ⋮ ⋮ ⋮ | ||
# 15 │ Ferrari Dino 19.7 5 2.77 | ||
# 16 │ Maserati Bora 15.0 5 3.57 | ||
# 17 │ Volvo 142E 21.4 4 2.78 | ||
# 11 rows omitted | ||
# ``` | ||
|
||
# Let's say you wanted to filter on a new variable with a different name and select new columns, | ||
# ```julia | ||
# new_condition = :wt; | ||
# new_cols = [:model, :drat] | ||
# @interpolate((conditions, new_condition), (columns, new_cols)); | ||
# @f1(new_condition, new_cols) | ||
# ``` | ||
# ``` | ||
# 20×2 DataFrame | ||
# Row │ model drat | ||
# │ String? Float64? | ||
# ─────┼───────────────────────────── | ||
# 1 │ Hornet 4 Drive 3.08 | ||
# 2 │ Hornet Sportabout 3.15 | ||
# 3 │ Valiant 2.76 | ||
# ⋮ │ ⋮ ⋮ | ||
# 18 │ Pontiac Firebird 3.08 | ||
# 19 │ Ford Pantera L 4.22 | ||
# 20 │ Maserati Bora 3.54 | ||
# 14 rows omitted | ||
# ``` | ||
|
||
# You can also interpolate vectors of strings into a `@filter(col in (values))` as well by using the following syntax `@filter(col in [!!values])` | ||
|
||
# In short, the first argument in `@interpolate` must be the name of the macro argument it refers to, and the second argument is what you would like to replace it. | ||
|
||
# We recognize this adds friction and that it is not ideal, but given the TidierDB macro expressions/string interplay, this is currently the most graceful and functional option available and hopefully a temporary solution to better interpolation that mirrors TidierData.jl. | ||
# On this page, we'll briefly explore how to use TidierDB macros and `$` with `@eval` to build a function | ||
|
||
# For a more in-depth explanation, please check out the TidierData page on interpolation | ||
|
||
using TidierDB, DataFrames; | ||
|
||
## Open a DuckDB connection through TidierDB. | ||
db = connect(duckdb()); | ||
## Build a 10-row demo DataFrame: `id` is "AA" through "AJ", `groups` alternates | ||
## "bb"/"aa" (odd/even row), `value` cycles 1:5 twice, `percent` runs 0.1 to 1.0. | ||
df = DataFrame(id = [string('A' + i ÷ 26, 'A' + i % 26) for i in 0:9], | ||
groups = [i % 2 == 0 ? "aa" : "bb" for i in 1:10], | ||
value = repeat(1:5, 2), | ||
percent = 0.1:0.1:1.0); | ||
## Copy the DataFrame into the database under the name "dfm", then reference | ||
## that table through `db_table`; `df_mem` is what the queries below start from. | ||
copy_to(db, df, "dfm"); | ||
df_mem = db_table(db, "dfm"); | ||
|
||
# ## Interpolation | ||
# Variables are interpolated using `@eval` and `$`. Place `@eval` before you begin the chain or call a TidierDB macro. | ||
# Why Use @eval? In Julia, macros like @filter are expanded at parse time, before runtime variables like vals are available. By using @eval, we force the expression to be evaluated at runtime, allowing us to interpolate the variable into the macro. | ||
|
||
## Values to interpolate: the `value`s to keep and the single column to return. | ||
num = [3]; | ||
column = :id; | ||
## `@eval` splices `$num` and `$column` into the chain before the TidierDB | ||
## macros see it: keep rows whose `value` is in `num`, select `column`, collect. | ||
@eval @chain t(df_mem) begin | ||
@filter(value in $num) | ||
@select($column) | ||
@collect | ||
end | ||
|
||
# ## Function set up | ||
# Begin by defining your function as you normally would, but before `@chain` you need to use `@eval`. The variables to be interpolated need to be prefixed with `$`. | ||
## test(vals, cols) | ||
## Query the `df_mem` table: keep rows whose `value` is in `vals`, select the | ||
## column(s) given by `cols`, and collect the result. | ||
## - `vals`: collection of values matched against the `value` column. | ||
## - `cols`: column symbol(s) handed to `@select` (the calls below pass a vector of symbols). | ||
## NOTE: `df_mem` is not a parameter; it refers to the table defined at the | ||
## top level of this script. | ||
function test(vals, cols) | ||
## `@eval` + `$` interpolate the argument values into the macro calls. | ||
@eval @chain t(df_mem) begin | ||
@filter(value in $vals) | ||
@select($cols) | ||
@collect | ||
end | ||
end; | ||
|
||
## Keep rows whose `value` is 1, 2, or 3 and return groups, value, and percent. | ||
vals = [1, 2, 3, 3]; | ||
test(vals, [:groups, :value, :percent]) | ||
|
||
# Now with a new variable | ||
## Same function, different filter values and a narrower column selection. | ||
other_vals = [1]; | ||
cols = [:value, :percent]; | ||
test(other_vals, cols) | ||
|
||
|
||
# Defining a new function | ||
## gs(groups, aggs, new_name, threshold) | ||
## Grouped summary over `df_mem`: group by `groups`, compute the mean of `aggs` | ||
## into a column named `new_name`, keep only groups whose mean exceeds | ||
## `threshold`, and collect the result. | ||
## - `groups`: grouping column (symbol in the calls below). | ||
## - `aggs`: column to average. | ||
## - `new_name`: name given to the summarised mean column. | ||
## - `threshold`: groups are kept only when `new_name > threshold`. | ||
## NOTE: `df_mem` is not a parameter; it refers to the table defined at the | ||
## top level of this script. | ||
function gs(groups, aggs, new_name, threshold) | ||
@eval @chain t(df_mem) begin | ||
@group_by($groups) | ||
@summarize($new_name = mean($aggs)) | ||
## The filter runs after the summarise, so it tests the aggregated value. | ||
@filter($new_name > $threshold) | ||
@collect | ||
end | ||
end; | ||
|
||
## Mean of `percent` per group, keeping groups whose mean is above 0.5. | ||
gs(:groups, :percent, :mean_percent, .5) | ||
|
||
# Change the column and threshold | ||
## Mean of `value` per group, keeping groups whose mean is above 2. | ||
gs(:groups, :value, :mean_value, 2) | ||
|
||
|
||
# ## Write pipeline function to use inside of chains | ||
# Let's say there is a particular sequence of macros that you want to use repeatedly. Wrap this series into a function that accepts a `t(query)` as its first argument and returns a `SQLquery`, and you can easily reuse it. | ||
## moving_aggs(table, start, stop, group, order, col) | ||
## Reusable pipeline step: extends the query `table` with windowed | ||
## minimum/maximum/mean columns for `col` and returns the resulting query. | ||
## Nothing is collected here, so the caller can keep chaining on the result. | ||
## - `table`: the query to extend (supplied by `@chain` when used mid-chain). | ||
## - `start`, `stop`: bounds passed to `@window_frame` (the examples use -2, 1 and -1, 1). | ||
## - `group`: column passed to `@group_by`. | ||
## - `order`: column passed to `@window_order`. | ||
## - `col`: column the three aggregates are applied to via `across`. | ||
## NOTE(review): the usage below filters on `value_mean`, so the new columns are | ||
## presumably named `<col>_<function>`; confirm against the TidierDB `across` docs. | ||
function moving_aggs(table, start, stop, group, order, col) | ||
qry = @eval @chain $table begin | ||
@group_by $group | ||
@window_frame $start $stop | ||
@window_order $order | ||
@mutate(across($col, (minimum, maximum, mean))) | ||
end | ||
return qry | ||
end; | ||
|
||
## `@chain` feeds the running query in as `moving_aggs`'s first argument | ||
## (`table`), so only the remaining five arguments are written out here. | ||
@chain t(df_mem) begin | ||
moving_aggs(-2, 1, :groups, :percent, :value) | ||
## Filter on one of the columns added by `moving_aggs`. | ||
@filter value_mean > 2.75 | ||
@aside @show_query _ | ||
@collect | ||
end | ||
|
||
# Filtering before the window functions | ||
@chain t(df_mem) begin | ||
@filter(value >=2 ) | ||
moving_aggs(-1, 1, :groups, :percent, :value) | ||
@aside @show_query _ | ||
@collect | ||
end |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.