Commit
Merge pull request #70 from TidierOrg/catch-a-bug-then-write-a-test
Catch a bug then write a test
drizk1 authored Oct 11, 2024
2 parents 02fc484 + f76bb86 commit 08db6df
Showing 23 changed files with 394 additions and 145 deletions.
8 changes: 8 additions & 0 deletions NEWS.md
@@ -1,4 +1,12 @@
# TidierDB.jl updates
## v0.4.2 - 2024-10-08
- add support for more than two joins in a single TidierDB chain, plus additional tests
- add `dmy`, `mdy`, `ymd` support for DuckDB, Postgres, GBQ, ClickHouse, MySQL, MsSQL, and Athena
- add date-related tests
- improve Google BigQuery type mapping when collecting to a DataFrame
- change `gbq()`'s `connect()` to accept `location` as the second argument (see the sketch after this list)
- add `copy_to` for MsSQL to write a DataFrame to the database
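
A minimal sketch of the updated `gbq()` connection call (the key path below is hypothetical; the project id is now read from the service-account JSON key rather than passed in):

```julia
import TidierDB as DB

# `location` ("US" here) replaces `project_id` as the second positional argument
con = DB.connect(DB.gbq(), "~/keys/service_account.json", "US")
```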

## v0.4.1 - 2024-10-02
- Adds 50 tests comparing TidierDB to TidierData to ensure accuracy across complex chains of operations, including combinations of `@mutate`, `@summarize`, `@filter`, `@select`, `@group_by` and `@join` operations.

3 changes: 2 additions & 1 deletion Project.toml
@@ -1,7 +1,7 @@
name = "TidierDB"
uuid = "86993f9b-bbba-4084-97c5-ee15961ad48b"
authors = ["Daniel Rizk <rizk.daniel.12@gmail.com> and contributors"]
version = "0.4.1"
version = "0.4.2"

[deps]
Arrow = "69666777-d1a9-59fb-9406-91d4454c9d45"
@@ -22,6 +22,7 @@ AWS = "fbe9abb3-538b-5e4e-ba9e-bc94f4f92ebc"
MySQL = "39abe10b-433b-5dbd-92d4-e302a9df00cd"
ClickHouse = "82f2e89e-b495-11e9-1d9d-fb40d7cf2130"
ODBC = "be6f12e9-ca4f-5eb2-a339-a4f995cc0291"
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"


[extensions]
4 changes: 2 additions & 2 deletions README.md
@@ -43,8 +43,8 @@ TidierDB.jl currently supports the following top-level macros:
| **Utility** | `@show_query`, `@collect`, `@head`, `@count`, `show_tables`, `@create_view` , `drop_view` |
| **Helper Functions** | `across`, `desc`, `if_else`, `case_when`, `n`, `starts_with`, `ends_with`, `contains`, `as_float`, `as_integer`, `as_string`, `is_missing`, `missing_if`, `replace_missing` |
| **TidierStrings.jl Functions** | `str_detect`, `str_replace`, `str_replace_all`, `str_remove_all`, `str_remove` |
| **TidierDates.jl Functions** | `year`, `month`, `day`, `hour`, `min`, `second`, `floor_date`, `difftime` |
| **Aggregate Functions** | `mean`, `minimum`, `maximum`, `std`, `sum`, `cumsum`, `cor`, `cov`, `var`, all SQL aggregate
| **TidierDates.jl Functions** | `year`, `month`, `day`, `hour`, `min`, `second`, `floor_date`, `difftime`, `mdy`, `ymd`, `dmy` |
| **Aggregate Functions** | `mean`, `minimum`, `maximum`, `std`, `sum`, `cumsum`, `cor`, `cov`, `var`, plus all SQL aggregate functions |

`@summarize` supports any SQL aggregate function in addition to the list above; simply write the function as it appears in SQL and it will work.
`@mutate` supports all built-in SQL functions as well.
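
A minimal sketch of that aggregate pass-through (assuming a DuckDB connection and an existing `mtcars` table; the setup names here are illustrative, not part of this commit):

```julia
import TidierDB as DB

con = DB.connect(DB.duckdb())          # in-memory DuckDB (assumed setup)
mtcars = DB.db_table(con, "mtcars")    # assumes the table already exists

DB.@chain DB.t(mtcars) begin
    DB.@group_by cyl
    # `median` is not in the table above; it passes straight through as SQL
    DB.@summarize(mpg_med = median(mpg))
    DB.@collect
end
```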
4 changes: 2 additions & 2 deletions docs/examples/UserGuide/from_queryex.jl
@@ -12,7 +12,7 @@

# Start a query to analyze fuel efficiency by number of cylinders. However, to further build on this query later, end the chain without using `@show_query` or `@collect`
# ```julia
# query = DB.@chain DB.t(mtcars) begin
# query = DB.@chain DB.t(query) begin
# DB.@group_by cyl
# DB.@summarize begin
# across(mpg, (mean, minimum, maximum))
@@ -30,7 +30,7 @@
# ## `from_query()` or `t(query)`
# Now `from_query`, or its convenience wrapper `t()`, lets you reuse the query to calculate the average horsepower for each efficiency category
# ```julia
# DB.@chain DB.t(mtcars) begin
# DB.@chain DB.t(query) begin
# DB.@left_join("mtcars2", cyl, cyl)
# DB.@group_by(efficiency)
# DB.@summarize(avg_hp = mean(hp))
6 changes: 3 additions & 3 deletions docs/src/index.md
@@ -35,11 +35,11 @@ TidierDB.jl currently supports:
| **Data Manipulation** | `@arrange`, `@group_by`, `@filter`, `@select`, `@mutate` (supports `across`), `@summarize`/`@summarise` (supports `across`), `@distinct` |
| **Joining** | `@left_join`, `@right_join`, `@inner_join`, `@anti_join`, `@full_join`, `@semi_join`, `@union` |
| **Slice and Order** | `@slice_min`, `@slice_max`, `@slice_sample`, `@order`, `@window_order`, `@window_frame` |
| **Utility** | `@show_query`, `@collect`, `@head`, `@count`, `show_tables`, `@create_view`, `drop_view` |
| **Utility** | `@show_query`, `@collect`, `@head`, `@count`, `show_tables`, `@create_view` , `drop_view` |
| **Helper Functions** | `across`, `desc`, `if_else`, `case_when`, `n`, `starts_with`, `ends_with`, `contains`, `as_float`, `as_integer`, `as_string`, `is_missing`, `missing_if`, `replace_missing` |
| **TidierStrings.jl Functions** | `str_detect`, `str_replace`, `str_replace_all`, `str_remove_all`, `str_remove` |
| **TidierDates.jl Functions** | `year`, `month`, `day`, `hour`, `min`, `second`, `floor_date`, `difftime` |
| **Aggregate Functions** | `mean`, `minimum`, `maximum`, `std`, `sum`, `cumsum`, `cor`, `cov`, `var`
| **TidierDates.jl Functions** | `year`, `month`, `day`, `hour`, `min`, `second`, `floor_date`, `difftime`, `mdy`, `ymd`, `dmy` |
| **Aggregate Functions** | `mean`, `minimum`, `maximum`, `std`, `sum`, `cumsum`, `cor`, `cov`, `var`, plus all SQL aggregate functions |

`@summarize` supports any SQL aggregate function in addition to the list above; simply write the function as it appears in SQL and it will work.
`@mutate` supports all built-in SQL functions as well. The newly added date parsers are sketched below.
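
A short sketch of the new `mdy`/`ymd`/`dmy` date parsers inside a query (table and column names are illustrative; assumes a DuckDB source):

```julia
# `mdy` parses strings like "10-08-2024" into a date on the SQL side
DB.@chain DB.t(orders) begin
    DB.@mutate(order_date = mdy(date_str))
    DB.@show_query
end
```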
29 changes: 19 additions & 10 deletions ext/GBQExt.jl
@@ -2,22 +2,26 @@ module GBQExt

using TidierDB
using DataFrames
using GoogleCloud, HTTP, JSON3
using GoogleCloud, HTTP, JSON3, Dates
__init__() = println("Extension was loaded!")

include("GBQ_to_DF.jl")

mutable struct GBQ
projectname::String
session::GoogleSession
bigquery_resource
bigquery_method
location::String
end

function TidierDB.connect(::gbq, json_key_path::String, project_id::String)
function TidierDB.connect(::gbq, json_key_path::String, location::String)
# Expand the user's path to the JSON key
creds_path = expanduser(json_key_path)
set_sql_mode(gbq())
# Create credentials and session for Google Cloud
creds = JSONCredentials(creds_path)
project_id = JSONCredentials(creds_path).project_id
session = GoogleSession(creds, ["https://www.googleapis.com/auth/bigquery"])

# Define the API method for BigQuery
@@ -36,7 +40,7 @@ function TidierDB.connect(::gbq, json_key_path::String, project_id::String)
)

# Store all data in a global GBQ instance
global gbq_instance = GBQ(project_id, session, bigquery_resource, bigquery_method)
global gbq_instance = GBQ(project_id, session, bigquery_resource, bigquery_method, location)

# Return only the session
return session
@@ -47,7 +51,7 @@ function collect_gbq(conn, query)
query_data = Dict(
"query" => query,
"useLegacySql" => false,
"location" => "US")
"location" => gbq_instance.location)

response = GoogleCloud.api.execute(
conn,
@@ -63,30 +67,34 @@
# First, extract column names from the schema
column_names = [field["name"] for field in response_data["schema"]["fields"]]
column_types = [field["type"] for field in response_data["schema"]["fields"]]
# Then, convert each row's data (currently nested inside dicts with key "v") into arrays of dicts

if !isempty(rows)
# Convert each row's data (nested inside dicts with key "v") into a matrix of values
data = [get(row["f"][i], "v", missing) for row in rows, i in 1:length(column_names)]
df = DataFrame(data, Symbol.(column_names))
df = TidierDB.parse_gbq_df(df, column_types)
convert_df_types!(df, column_names, column_types)

return df
else
# Return an empty DataFrame with the correct columns but 0 rows
df =DataFrame([Vector{Union{Missing, Any}}(undef, 0) for _ in column_names], Symbol.(column_names))
df = TidierDB.parse_gbq_df(df, column_types)
df = DataFrame([Vector{Union{Missing, Any}}(undef, 0) for _ in column_names], Symbol.(column_names))
# df = TidierDB.parse_gbq_df(df, column_types)
convert_df_types!(df, column_names, column_types)
return df
end

return df
end

function TidierDB.get_table_metadata(conn::GoogleSession{JSONCredentials}, table_name::String)
set_sql_mode(gbq());
query = " SELECT * FROM
$table_name LIMIT 0
;"
query_data = Dict(
"query" => query,
"useLegacySql" => false,
"location" => "US")
"location" => gbq_instance.location)
# Define the API resource

response = GoogleCloud.api.execute(
Expand All @@ -112,7 +120,8 @@ function TidierDB.final_collect(sqlquery::SQLQuery, ::Type{<:gbq})
return collect_gbq(sqlquery.db, final_query)
end

function TidierDB.show_tables(con::GoogleSession{JSONCredentials}, project_id, datasetname)
function TidierDB.show_tables(con::GoogleSession{JSONCredentials}, datasetname)
project_id = gbq_instance.projectname
query = """
SELECT table_name
FROM `$project_id.$datasetname.INFORMATION_SCHEMA.TABLES`
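
A sketch of the slimmed-down `show_tables` call (dataset name hypothetical); the project id now comes from the stored `gbq_instance` rather than an argument:

```julia
# previously: show_tables(con, "my-project", "my_dataset")
tables = show_tables(con, "my_dataset")
```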
87 changes: 87 additions & 0 deletions ext/GBQ_to_DF.jl
@@ -0,0 +1,87 @@

type_map = Dict(
"STRING" => String,
"FLOAT" => Float64,
"INTEGER" => Int64,
"DATE" => Date,
"DATETIME" => DateTime,
"BOOLEAN" => Bool,
"JSON" => Any # Map JSON to Any
)

# Function to get Julia type from BigQuery type string
function get_julia_type(type_str::String)
if startswith(type_str, "ARRAY<") && endswith(type_str, ">")
element_type_str = type_str[7:end-1]
element_type = get(type_map, element_type_str, Any)
return Vector{element_type}
else
return get(type_map, type_str, Any)
end
end

# Helper function to parse scalar values
function parse_scalar_value(x, target_type; type_str="")
if target_type == Date
return Date(x)
elseif target_type == DateTime
return DateTime(x)
elseif target_type == String
return String(x)
elseif target_type <: Number
return parse(target_type, x)
elseif target_type == Bool
return x in ("true", "1", 1, true)
elseif type_str == "JSON"
try
# Ensure x is a String or Vector{UInt8}
if isa(x, AbstractString) || isa(x, Vector{UInt8})
return JSON3.read(x)
else
# Convert x to String if possible
x_str = String(x)
return JSON3.read(x_str)
end
catch e
println("Failed to parse JSON value '$x' of type $(typeof(x)): ", e)
return missing
end
else
return convert(target_type, x)
end
end


# Helper function to parse array elements
function parse_array_elements(x::JSON3.Array, target_type)
element_type = eltype(target_type)
return [parse_scalar_value(v["v"], element_type) for v in x]
end

function convert_df_types!(df::DataFrame, new_names::Vector{String}, new_types::Vector{String})
for (name, type_str) in zip(new_names, new_types)
# Get the corresponding Julia type
target_type = get_julia_type(type_str)

# Check if the DataFrame has the column
if hasproperty(df, name)
# Get the column data
column_data = df[!, name]

# Replace `nothing` with `missing`
column_data = replace(column_data, nothing => missing)

# Check if the data is an array of values
if !isempty(column_data) && isa(column_data[1], JSON3.Array)
# Handle arrays
df[!, name] = [ismissing(x) ? missing : parse_array_elements(x, target_type) for x in column_data]
else
# Handle scalar values
df[!, name] = [ismissing(x) ? missing : parse_scalar_value(x, target_type; type_str=type_str) for x in column_data]
end
else
println("Warning: Column $name not found in DataFrame.")
end
end
return df
end
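
A minimal usage sketch for the new `convert_df_types!` helper, with hand-built string columns standing in for what the BigQuery REST response delivers (the data here is made up):

```julia
using DataFrames, Dates

df = DataFrame(id = ["1", "2"], d = ["2024-10-08", "2024-10-09"])
convert_df_types!(df, ["id", "d"], ["INTEGER", "DATE"])

df.id  # parsed via parse(Int64, x): [1, 2]
df.d   # parsed via Date(x): [Date("2024-10-08"), Date("2024-10-09")]
```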
2 changes: 2 additions & 0 deletions src/TBD_macros.jl
@@ -424,6 +424,8 @@ macro summarize(sqlquery, expressions...)
# Check if there's already a SELECT clause and append, otherwise create new
if startswith(existing_select, "SELECT")
sq.select = existing_select * ", " * summary_clause
elseif isempty(summary_clause)
sq.select = "SUMMARIZE "
else
sq.select = "SELECT " * summary_clause
end
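
The new `elseif` branch makes an argument-free `@summarize` emit DuckDB's `SUMMARIZE` command, which profiles every column of the source. A hedged sketch of the intended call:

```julia
# sketch: with no expressions, @summarize should now render a SUMMARIZE statement
DB.@chain DB.t(mtcars) begin
    DB.@summarize()
    DB.@show_query   # -> SUMMARIZE <table or subquery>
end
```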
6 changes: 4 additions & 2 deletions src/TidierDB.jl
@@ -162,13 +162,15 @@ function finalize_query(sqlquery::SQLQuery)
"FROM )" => ")" , "SELECT SELECT " => "SELECT ", "SELECT SELECT " => "SELECT ", "DISTINCT SELECT " => "DISTINCT ",
"SELECT SELECT SELECT " => "SELECT ", "PARTITION BY GROUP BY" => "PARTITION BY", "GROUP BY GROUP BY" => "GROUP BY", "HAVING HAVING" => "HAVING",
r"var\"(.*?)\"" => s"\1", r"\"\\\$" => "\"\$", "WHERE \"" => "WHERE ", "WHERE \"NOT" => "WHERE NOT", "%')\"" =>"%\")", "NULL)\"" => "NULL)",
"NULL))\"" => "NULL))"
"NULL))\"" => "NULL))", r"(?i)INTERVAL(\d+)([a-zA-Z]+)" => s"INTERVAL \1 \2", "SELECT SUMMARIZE " => "SUMMARIZE "
)
complete_query = replace(complete_query, ", AS " => " AS ", "OR \"" => "OR ")
if current_sql_mode[] == postgres() || current_sql_mode[] == duckdb() || current_sql_mode[] == mysql() || current_sql_mode[] == mssql() || current_sql_mode[] == clickhouse() || current_sql_mode[] == athena() || current_sql_mode[] == gbq() || current_sql_mode[] == oracle() || current_sql_mode[] == snowflake() || current_sql_mode[] == databricks()
complete_query = replace(complete_query, "\"" => "'", "==" => "=")
end

if current_sql_mode[] == postgres()
complete_query = replace(complete_query, r"INTERVAL (\d+) ([a-zA-Z]+)" => s"INTERVAL '\1 \2'")
end

return complete_query
end
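
To see what the two new `INTERVAL` rewrites do, here is a standalone sketch applying the same regexes to an illustrative query string:

```julia
s = "SELECT * FROM t WHERE d > now() - INTERVAL2days"

# first pass (all backends): split the fused token into `INTERVAL 2 days`
s = replace(s, r"(?i)INTERVAL(\d+)([a-zA-Z]+)" => s"INTERVAL \1 \2")

# Postgres pass: additionally quote the literal, giving `INTERVAL '2 days'`
replace(s, r"INTERVAL (\d+) ([a-zA-Z]+)" => s"INTERVAL '\1 \2'")
```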
5 changes: 2 additions & 3 deletions src/docstrings.jl
@@ -1068,7 +1068,7 @@ This function establishes a database connection based on the specified backend a
# Connect to SQLite
# conn = connect(sqlite())
# Connect to Google Big Query
# conn = connect(gbq(), "json_user_key_path", "project_id")
# conn = connect(gbq(), "json_user_key_path", "location")
# Connect to Snowflake
# conn = connect(snowflake(), "ac_id", "token", "Database_name", "Schema_name", "warehouse_name")
# Connect to Microsoft SQL Server
@@ -1255,13 +1255,12 @@ julia> @chain db_table(db, :df_mem) begin

const docstring_show_tables =
"""
show_tables(con; GBQ_project_id, GBQ_datasetname)
show_tables(con; GBQ_datasetname)
Shows tables available in the database. Currently supports DuckDB, Databricks, Snowflake, GBQ, SQLite, and LibPQ.
# Arguments
- `con` : connection to backend
- `GBQ_project_id` : string of project id
- `GBQ_datasetname` : string of dataset name
# Examples
```jldoctest