Merge pull request #53 from TidierOrg/resolve-conflicts
drizk1 authored Aug 11, 2024
2 parents 3798d67 + fb1df47 commit 7823eff
Showing 9 changed files with 169 additions and 37 deletions.
13 changes: 10 additions & 3 deletions NEWS.md
@@ -1,8 +1,15 @@
# TidierDB.jl updates

## v0.3.2 - 2024-08-15
- adds `@head` for limiting the number of collected rows
- adds support for reading URLs in `db_table` with ClickHouse
- adds support for reading from multiple files at once as a vector of URLs in `db_table` when using ClickHouse
    - i.e. `db_table(db, ["url1", "url2"])`
- adds docs around using `*` for reading in multiple files from a folder
- adds docs for `db_table`
- adds docs for previewing or saving intermediate tables in an ongoing `@chain`
- Bugfix: `@count` updates metadata

## v0.3.1 - 2024-07-28
- adds support for reading from multiple files at once as a vector of paths in `db_table` when using DuckDB
47 changes: 47 additions & 0 deletions docs/examples/UserGuide/from_queryex.jl
@@ -59,4 +59,51 @@
# 1 │ Pontiac Firebird 8 19.2
# 2 │ Toyota Corolla 4 33.9
# 3 │ Hornet 4 Drive 6 21.4
# ```

# ## Preview or save an intermediate table
# While querying a dataset, you may wish to preview an intermediate table, or even save it. You can use `@aside` and `from_query(_)`, illustrated below, to do just that.
# We opted to print the intermediate results in this simple example, but we could have saved them instead by assigning the inner chain to a name (`name = DB.@chain ...`).

# ```julia
# import ClickHouse;
# conn = DB.connect(DB.clickhouse(); host="localhost", port=19000, database="default", user="default", password="")
# path = "https://huggingface.co/datasets/maharshipandya/spotify-tracks-dataset/resolve/refs%2Fconvert%2Fparquet/default/train/0000.parquet"
# DB.@chain DB.db_table(conn, path) begin
# DB.@count(artists)
# @aside println(DB.@chain DB.from_query(_) DB.@head(5) DB.@collect)
# DB.@arrange(desc(count))
# DB.@collect
# end
# ```
# ```
# 5×2 DataFrame
# Row │ artists count
# │ String? UInt64
# ─────┼─────────────────
# 1 │ missing 1
# 2 │ Wizo 3
# 3 │ MAGIC! 3
# 4 │ Macaco 1
# 5 │ SOYOU 1
# 31438×2 DataFrame
# Row │ artists count
# │ String? UInt64
# ───────┼─────────────────────────
# 1 │ The Beatles 279
# 2 │ George Jones 271
# 3 │ Stevie Wonder 236
# 4 │ Linkin Park 224
# 5 │ Ella Fitzgerald 222
# 6 │ Prateek Kuhad 217
# 7 │ Feid 202
# ⋮ │ ⋮ ⋮
# 31432 │ Leonard 1
# 31433 │ marcos g 1
# 31434 │ BLVKSHP 1
# 31435 │ Memtrix 1
# 31436 │ SOYOU 1
# 31437 │ Macaco 1
# 31438 │ missing 1
# 31424 rows omitted
# ```
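
# As mentioned above, assigning the inner chain to a name saves the intermediate
# table rather than printing it. A minimal sketch of that variant (`top5` is a
# name invented for this illustration):

# ```julia
# DB.@chain DB.db_table(conn, path) begin
#    DB.@count(artists)
#    @aside top5 = DB.@chain DB.from_query(_) DB.@head(5) DB.@collect
#    DB.@arrange(desc(count))
#    DB.@collect
# end
# ```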
1 change: 1 addition & 0 deletions docs/examples/UserGuide/functions_pass_to_DB.jl
@@ -10,6 +10,7 @@

# ```
# using TidierDB
# db = connect(duckdb())
# path = "https://gist.githubusercontent.com/seankross/a412dfbd88b3db70b74b/raw/5f23f993cd87c283ce766e7ac6b329ee7cc2e1d1/mtcars.csv"
# copy_to(db, path, "mtcars");
#
20 changes: 17 additions & 3 deletions docs/examples/UserGuide/getting_started.jl
@@ -9,22 +9,25 @@

# Alternatively, `using Tidier` will import TidierDB in the above manner for you, where TidierDB functions and macros will be available as `DB.@mutate()` and so on, and the TidierData equivalent would be `@mutate()`.

# ## Connecting
# To connect to a database, you can use the `connect` function as shown below, or establish your own connection through the respective library.

# For example, connecting to MySQL
# ```julia
# conn = connect(mysql(); host="localhost", user="root", password="password", db="mydb")
# conn = DB.connect(DB.mysql(); host="localhost", user="root", password="password", db="mydb")
# ```
# versus connecting to DuckDB
# ```julia
# conn = connect(duckdb())
# conn = DB.connect(DB.duckdb())
# ```

# You can also establish a connection through an alternative method you prefer, and use that as your connection instead.

# ## Package Extensions
# The following backends utilize package extensions. To use one of the backends listed below, you will need to load the corresponding library first (see the sketch after this list).

# - ClickHouse: `using ClickHouse`
# - ClickHouse: `import ClickHouse`
# - MySQL and MariaDB: `using MySQL`
# - MSSQL: `using ODBC`
# - Postgres: `using LibPQ`
Expand All @@ -33,3 +36,14 @@
# - Oracle: `using ODBC`
# - Google BigQuery: `using GoogleCloud`
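
# For instance, a minimal sketch of loading the ClickHouse extension before
# connecting (the host, port, and credentials below are assumptions for a
# local server):

# ```julia
# import ClickHouse
# import TidierDB as DB
# conn = DB.connect(DB.clickhouse(); host="localhost", port=19000, database="default", user="default", password="")
# ```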

# ## `db_table`
# What does `db_table` do?
# `db_table` starts the underlying SQL query struct, in addition to pulling the table metadata and storing it there. Storing metadata is what enables a lazy interface that also supports tidy selection.
# `db_table` has two required arguments: `connection` and `table`.
# `table` can be the name of a table on the database or a path/URL to a file to read. When passing `db_table` a path or URL, the table is not copied into memory.
# With DuckDB and ClickHouse, if you have a folder of multiple files to read, you can use `*` to read in all files matching the pattern.
# For example, the below would read all files that end in `.csv` in the given folder.
# ```julia
# db_table(db, "folder/path/*.csv")
# ```
# `db_table` also supports iceberg, delta, and S3 file paths via DuckDB.
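
# A minimal sketch of the kinds of sources `db_table` accepts (the paths below
# are made up, and `delta`/`iceberg` are the keyword arguments described in the
# `db_table` docstring):

# ```julia
# db = connect(duckdb())
# t1 = db_table(db, "table_name")                             # table already on the database
# t2 = db_table(db, "local/path/file.csv")                    # file read lazily, not copied
# t3 = db_table(db, "s3://bucket/delta_folder", delta = true) # Delta table via DuckDB
# ```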
8 changes: 7 additions & 1 deletion docs/examples/UserGuide/outofmemex.jl
@@ -12,6 +12,9 @@
# ```

# Here, we pass the vector of URLs to `db_table`, which will not copy them into memory. Since these datasets are so large, we will also set `stream = true` in `@collect` to stream the results.
# If we wanted to read all the files in the folder, we could have replaced the `0000` with `*` (a wildcard):
# `db_table(db, "Path/to/folder/*.parquet")`
# Note that reading these files from URLs is not as fast as reading them from local files.
# ```julia
# @chain db_table(db, urls) begin
# @group_by(horoscope)
@@ -43,4 +46,7 @@
# 10 │ Pisces 53812 1011.75
# 11 │ Virgo 64629 996.684
# 12 │ Aries 69134 918.081
# ```

# To learn more about memory-efficient queries on larger-than-RAM files, this [blog from DuckDB](https://duckdb.org/2024/07/09/memory-management.html#:~:text=DuckDB%20deals%20with%20these%20scenarios,tries%20to%20minimize%20disk%20spilling.)
# can help you get the most out of your local `db`.
26 changes: 26 additions & 0 deletions src/TBD_macros.jl
@@ -455,15 +455,26 @@ macro count(sqlquery, group_by_columns...)
group_clause = join(group_by_cols_str, ", ")

return quote

sq = $(esc(sqlquery))
if isa(sq, SQLQuery)
# Interpolate `group_clause` directly into the quoted code to avoid scope issues
if !isempty($group_clause)
for col in $group_by_cols_str
$(esc(sqlquery)).metadata.current_selxn .= 0
matching_indices = findall($(esc(sqlquery)).metadata.name .== col)
$(esc(sqlquery)).metadata.current_selxn[matching_indices] .= 1
end
sq.select = "SELECT " * $group_clause * ", COUNT(*) AS count"
sq.groupBy = "GROUP BY " * $group_clause
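# register the new `count` column in the query metadata so it remains selectable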
push!(sq.metadata, Dict("name" => "count", "type" => "UNKNOWN", "current_selxn" => 1, "table_name" => sq.from))

else
# If no grouping columns are specified, just count all records
$(esc(sqlquery)).metadata.current_selxn .= 0
sq.select = "SELECT COUNT(*) AS count"
push!(sq.metadata, Dict("name" => "count", "type" => "UNKNOWN", "current_selxn" => 1, "table_name" => sq.from))

end

# Adjustments for previously set GROUP BY or ORDER BY clauses might be needed here
@@ -745,3 +756,18 @@ macro collect(sqlquery, stream = false)
end
end
end


"""
$docstring_head
"""
macro head(sqlquery, value)
    value = string(value)  # capture the limit expression as a string at macro-expansion time
    return quote
        sq = $(esc(sqlquery))
        if $value != ""
            sq.limit = $value  # stored on the query; finalize_query renders it as a LIMIT clause
        end
        sq
    end
end
4 changes: 3 additions & 1 deletion src/TidierDB.jl
@@ -17,7 +17,7 @@ using GZip
export db_table, set_sql_mode, @arrange, @group_by, @filter, @select, @mutate, @summarize, @summarise,
@distinct, @left_join, @right_join, @inner_join, @count, @window_order, @window_frame, @show_query, @collect, @slice_max,
@slice_min, @slice_sample, @rename, copy_to, duckdb_open, duckdb_connect, @semi_join, @full_join,
@anti_join, connect, from_query, @interpolate, add_interp_parameter!, update_con,
@anti_join, connect, from_query, @interpolate, add_interp_parameter!, update_con, @head,
clickhouse, duckdb, sqlite, mysql, mssql, postgres, athena, snowflake, gbq, oracle, databricks, SQLQuery

abstract type SQLBackend end
@@ -143,6 +143,8 @@ function finalize_query(sqlquery::SQLQuery)
if !isempty(sqlquery.groupBy) push!(query_parts, "" * sqlquery.groupBy) end
if !isempty(sqlquery.having) push!(query_parts, " " * sqlquery.having) end
if !isempty(sqlquery.orderBy) push!(query_parts, " " * sqlquery.orderBy) end
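# `sqlquery.limit` is set by `@head`; when nonempty it becomes a LIMIT clause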
if !isempty(sqlquery.limit) push!(query_parts, " LIMIT " * sqlquery.limit) end

complete_query = join(filter(!isempty, query_parts), " ")

if !isempty(sqlquery.ch_settings) && current_sql_mode[] == clickhouse()
72 changes: 48 additions & 24 deletions src/docstrings.jl
@@ -1052,30 +1052,20 @@ const docstring_db_table =
`db_table` starts the underlying SQL query struct, adding the metadata and table. If a path is passed directly to `db_table` instead of a
table name, it will not copy the data into memory, but rather read directly from the file.
# Arguments
`database`: The Database or connection object
`table_name`: tablename as a string. Table name can be a name of a table on the database or paths to the following types
--CSV
--Parquet
--Json
--Iceberg
--Delta
--S3 tables from AWS or Google Cloud
--vector of CSV or Parquet paths to read multiple at once
DuckDB and ClickHouse support vectors of paths and URLs.
DuckDB also supports use of `*` wildcards to read all files of a type in a location such as:
`db_table(db, "Path/to/testing_files/*.parquet")`
`delta`: must be true to read delta files
`iceberg`: must be true to read iceberg files
- `database`: The Database or connection object
- `table_name`: tablename as a string (name, local path, or URL).
- CSV/TSV
- Parquet
- JSON
- Iceberg
- Delta
- S3 tables from AWS or Google Cloud
- DuckDB and ClickHouse support vectors of paths and URLs.
- DuckDB and ClickHouse also support use of `*` wildcards to read all files of a type in a location such as:
- `db_table(db, "Path/to/testing_files/*.parquet")`
- `delta`: must be true to read delta files
- `iceberg`: must be true to read iceberg files
# Example
```julia
@@ -1137,4 +1127,38 @@ julia> @collect db_table(db, "df_mem")
9 │ AI bb 4 0.9
10 │ AJ aa 5 1.0
```
"""
"""

const docstring_head =
"""
@head(sql_query, value)
Limit the number of rows returned by a query to the specified value; equivalent to `LIMIT` in SQL.
# Arguments
- `sql_query`: The SQL query to operate on.
- `value`: Number to limit how many rows are returned.
# Examples
```jldoctest
julia> db = connect(duckdb());
julia> df = DataFrame(id = [string('A' + i ÷ 26, 'A' + i % 26) for i in 0:9],
groups = [i % 2 == 0 ? "aa" : "bb" for i in 1:10],
value = repeat(1:5, 2),
percent = 0.1:0.1:1.0);
julia> copy_to(db, df, "df_mem");
julia> @chain db_table(db, :df_mem) begin
@head(1) ## supports expressions, i.e. `3 - 2` would return the same DataFrame below
@collect
end
1×4 DataFrame
Row │ id groups value percent
│ String? String? Int64? Float64?
─────┼────────────────────────────────────
1 │ AA bb 1 0.1
```
"""
15 changes: 10 additions & 5 deletions src/structs.jl
@@ -27,12 +27,15 @@ mutable struct SQLQuery
ctes::Vector{CTE}
cte_count::Int
athena_params::Any
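# value for the LIMIT clause; set by `@head`, empty string means no limit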
limit::String
ch_settings::String

function SQLQuery(;select::String="", from::String="", where::String="", groupBy::String="", orderBy::String="", having::String="",
window_order::String="", windowFrame::String="", is_aggregated::Bool=false, post_aggregation::Bool=false, metadata::DataFrame=DataFrame(),
distinct::Bool=false, db::Any=nothing, ctes::Vector{CTE}=Vector{CTE}(), cte_count::Int=0, athena_params::Any=nothing, ch_settings::String="")
new(select, from, where, groupBy, orderBy, having, window_order, windowFrame, is_aggregated, post_aggregation,
metadata, distinct, db, ctes, cte_count, athena_params, ch_settings)
window_order::String="", windowFrame::String="", is_aggregated::Bool=false, post_aggregation::Bool=false, metadata::DataFrame=DataFrame(),
distinct::Bool=false, db::Any=nothing, ctes::Vector{CTE}=Vector{CTE}(), cte_count::Int=0, athena_params::Any=nothing, limit::String="",
ch_settings::String="")
new(select, from, where, groupBy, orderBy, having, window_order, windowFrame, is_aggregated,
post_aggregation, metadata, distinct, db, ctes, cte_count, athena_params, limit, ch_settings)
end
end

@@ -93,7 +96,9 @@ function from_query(query::TidierDB.SQLQuery)
db=query.db,
ctes=[copy(cte) for cte in query.ctes],
cte_count=query.cte_count,
athena_params = query.athena_params
athena_params = query.athena_params,
limit = query.limit,
ch_settings = query.ch_settings
)
return new_query
end
