From 4542b2b4266cc18d283f7820e503c21820bd10ba Mon Sep 17 00:00:00 2001
From: Stefan Bringuier <stefanbringuier@gmail.com>
Date: Wed, 13 Apr 2022 22:09:37 -0700
Subject: [PATCH] Address Gotchas and performance to address #4 and other code
 clean-up.

---
 docs/src/examples.md | 12 +++++++
 src/CBFV.jl          |  1 +
 src/Databases.jl     | 12 +++++--
 src/Featurization.jl | 83 +++++++++++++++++++++++++++++++-------------
 src/Types.jl         |  3 +-
 test/runtests.jl     |  9 ++++-
 6 files changed, 92 insertions(+), 28 deletions(-)

diff --git a/docs/src/examples.md b/docs/src/examples.md
index 78c8a18..f9902d2 100644
--- a/docs/src/examples.md
+++ b/docs/src/examples.md
@@ -26,4 +26,16 @@ using CBFV
 data = DataFrame("name"=>["Rb2Te","CdCl2","LaN"],"bandgap_eV"=>[1.88,3.51,1.12])
 rename!(data,Dict("name"=>"formula","bandgap_eV"=>"target"))
 features = generatefeatures(data)
+```
+
+Here is an example with an existing feature combined with the generated features:
+
+```@example
+using DataFrames
+using CBFV
+data = DataFrame(:formula=>["B2O3","Be1I2","Be1F3Li1"],
+                 :temperature=>[1400.00,1200.0,1100.00],
+                 :heat_capacity=>[89.115,134.306,192.464])
+rename!(data,Dict(:heat_capacity=>:target))
+features = generatefeatures(data,combine=true)
 ```
\ No newline at end of file
diff --git a/src/CBFV.jl b/src/CBFV.jl
index 3751471..550d27e 100644
--- a/src/CBFV.jl
+++ b/src/CBFV.jl
@@ -6,6 +6,7 @@ using CSV
 using DataFrames
 
 export FileName
+export readdatabasefile
 export processelementdatabase
 export processinputdata
 export generatefeatures
diff --git a/src/Databases.jl b/src/Databases.jl
index 263b145..0225d5d 100644
--- a/src/Databases.jl
+++ b/src/Databases.jl
@@ -27,13 +27,21 @@ Returns DataFrame of an elemental database file in [databases/](databases/)
 
 # Arguments
 - `pathtofile::String`: path to the CSV formatted file to read
+- `stringtype::Type{Union{String,InlineString}}=String` : `CSV.jl` string storage type
+- `pool::Bool=false` : whether `CSV.File` pools `String` column values.
 
 # Returns
 - `data::DataFrame`: the dataframe representation of the csv file.
+
+!!! note
+    Some of the behaviors of `CSV.jl` create data types that are inconsistent with
+    several function argument types in `CBFV`. If you use this function to read the
+    data files, the data frame constructed via CSV will work properly.
 """
 function readdatabasefile(pathtofile::AbstractString;
-                          stringtype::Type{T}=String) where T<:Union{String,InlineString}
+                          stringtype::Type{T}=String,
+                          pool=false) where T<:Union{String,InlineString}
     # Use CSV and dataframes
-    data = CSV.File(pathtofile,stringtype=stringtype) |> DataFrame
+    data = CSV.File(pathtofile,stringtype=stringtype,pool=pool) |> DataFrame
     return data
 end  # function readdatabasefile
diff --git a/src/Featurization.jl b/src/Featurization.jl
index 97b4721..29afabe 100644
--- a/src/Featurization.jl
+++ b/src/Featurization.jl
@@ -47,11 +47,22 @@ end
 This is the primary function that assigns the features based on the CBFV approach. For more
 details its best to see the original python CBFV and references in README file.
 
+# Arguments
+- `processeddata::Vector{Dict{Symbol,Any}}` : the formulas processed against elemental database
+- `formulae::AbstractArray` : the formula string values; this should be an array of strings, e.g. `Array{String,1}`
+- `sumfeatures::Bool=false` : whether to create a `sum_` feature vector
+
+# Returns
+- `featuresarry::Vector{Matrix{Float64}}` : feature vectors for each row in original data set.
+- `skippedformula::Vector{String}` : skipped formulas
+
+!!! note
+    The `generatefeatures` call does not do anything with (i.e. does not return) the skipped formulas.
 
 """
 function assignfeatures(processeddata::Vector{Dict{Symbol,Any}},
-    formulae::Array{String,1},
-    sumfeatures::Bool=false)
+                        formulae::AbstractArray,
+                        sumfeatures::Bool=false)
 
     iterformulae = ProgressBar(1:length(formulae))
     skippedformula = Array{String,1}()
@@ -69,11 +80,11 @@ function assignfeatures(processeddata::Vector{Dict{Symbol,Any}},
         fmax = maximum(properties, dims=1)
         fmin = minimum(properties, dims=1)
         _, fraccomp = fractionalcomposition(formula)
-        favg = sum(fraccomp .* properties, dims=1)
+        favg = sum(fraccomp .* properties, dims=1) #FIX: Not sure whats going on here
         fdev = sum(fraccomp .* abs.(properties .- favg), dims=1)
+
         prominant = isapprox.(fraccomp, maximum(fraccomp))
         fmode = minimum(properties[prominant, :], dims=1)
-
         fweight = sumfeatures ? sum(amount .* properties, dims=1) : amount .* properties
 
         if sumfeatures
@@ -110,9 +121,9 @@ moments from the element features in the formula.
 
 """
 function constructfeaturedataframe(featcolnames::Vector{String},
-    features::Array{Float64,2},
-    extrafeatures::Tuple{Bool,DataFrame},
-    sumfeatures::Bool)
+                                   features::Array{Float64,2},
+                                   extrafeatures::Tuple{Bool,DataFrame},
+                                   sumfeatures::Bool)
 
     if sumfeatures
         colprefixes = ["sum_", "avg_", "dev_", "range_", "max_", "min_", "mode_"]
@@ -143,7 +154,6 @@ end  # function constructfeaturedataframe
 
 """
     generatefeatures(data; elementdata,dropduplicate,combine,sumfeatures,returndataframe)
-    generatefeatures(data, elementdata; kwargs...)
     generatefeatures(dataname; kwargs...)
 
 This is the primary function for generating the CBFV features for a dataset of formulas with or without
@@ -157,7 +167,7 @@ assigning of features is then executed based on the CBFV approach. If the `retur
 
 # Arguments
 - `data::DataFrame`: This is the data set that you want to be featurized for example.
-- `elementdata::Union{String,FileName}`: The name of the internal database or the file path and
+- `elementdata::Union{String,FileName} or Union{String,DataFrame}`: The name of the internal database or the file path and
 name to an external database.
 - `dropduplicate::Bool=true`: Option to drop duplicate entries.
 - `combine::Bool=false`: Option to combine existing features in `data` with the generated feature set.
@@ -184,18 +194,25 @@ d = DataFrame(:formula=>["Tc1V1","Cu1Dy1","Cd3N2"],:target=>[248.539,66.8444,91.
 generatefeatures(d)
 ```
 """
-function generatefeatures(data::DataFrame;
-    elementdata::String="oliynyk",
-    dropduplicate=true,
-    combine=false,
-    sumfeatures=false,
-    returndataframe=true)
+function generatefeatures(data::DataFrame,
+                          elementdata::Union{String,DataFrame}="oliynyk";
+                          dropduplicate=true,
+                          combine=false,
+                          sumfeatures=false,
+                          returndataframe=true)
+
+    # Remove duplicate entries
+    if dropduplicate
+        moddata = unique(data)
+    else 
+        moddata = data
 
+    end
 
     # Process input data
     checkdataframe(data)
-    formulae = data[!, :formula]
-    featcolnames, processeddata = processinputdata(dropduplicate ? unique(data) : data, elementdata)
+    formulae = moddata[!, :formula]
+    featcolnames, processeddata = processinputdata(moddata, elementdata)
 
     targets = [row[:target] for row in processeddata]
 
@@ -205,8 +222,7 @@ function generatefeatures(data::DataFrame;
         sumfeatures)
 
     # Extra features from original data
-    extra_df = dropduplicate ? unique(data) : data
-    extrafeatures = extra_df[!, Not([:formula, :target])]
+    extrafeatures = moddata[!, Not([:formula, :target])]
     if combine checkifempty(extrafeatures) end
 
     if returndataframe
@@ -223,11 +239,30 @@ function generatefeatures(data::DataFrame;
 
 end  # function generatefeaturesdata 
 
-# Issue #4 TODO: Work in support for custom element data sets. Requires
-# working back through `generatefeatures`-> `processinputdata` -> ....
-# generatefeatures(data::DataFrame, elementdata::FileName; kwargs...) = begin
-#    generatefeatures(data, elementdata=elementdata, kwargs...)
-# end
+
+generatefeatures(data::DataFrame;
+                 elementdata::Union{FileName,String}="oliynyk",
+                 dropduplicate=true,
+                 combine=false,
+                 sumfeatures=false,
+                returndataframe=true) = begin
+    if typeof(elementdata) == FileName
+        elementdataframe = readdatabasefile(elementdata.fullpath)
+        generatefeatures(data,elementdataframe,
+                        dropduplicate=dropduplicate,
+                        combine=combine,
+                        sumfeatures=sumfeatures,
+                        returndataframe=returndataframe)
+    else
+        generatefeatures(data,elementdata,
+                        dropduplicate=dropduplicate,
+                        combine=combine,
+                        sumfeatures=sumfeatures,
+                        returndataframe=returndataframe)
+    end
+
+end
+
 
 generatefeatures(dataname::String; kwargs...) = begin
     # Digest data file before processing
diff --git a/src/Types.jl b/src/Types.jl
index 7000330..0d0a673 100644
--- a/src/Types.jl
+++ b/src/Types.jl
@@ -1,7 +1,8 @@
 # see LICENSE
 
 """
-generatefeatures Datatype for multiple dispatch
+generatefeatures Datatype for multiple dispatch. Allows for passing
+an external database.
 """ struct FileName
     fullpath::String
 end
\ No newline at end of file
diff --git a/test/runtests.jl b/test/runtests.jl
index ba40793..d4faef5 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -108,12 +108,19 @@ using CSV, DataFrames
         tmpfile = tempname()
         CSV.write(tmpfile,d)
         @test featdb == CBFV.generatefeatures(tmpfile,returndataframe=true)
-     
+        
+        @test CBFV.generatefeatures(d,combine=true)[!,:property] == d[!,:property]
+
         testdb = CSV.File("pycbfv_test_data.csv") |> DataFrame
         @test length(names(featdb[!,Not([:target,:formula])])) == length(names(testdb))
         @testset "Column $n" for n in names(testdb)
             @test testdb[!,n] ≈ featdb[!,n]
         end
 
+        featdb_ext = CBFV.generatefeatures(d,
+                                           elementdata=CBFV.FileName((@__DIR__)*"/../databases/oliynyk.csv"))
+        @test featdb_ext == featdb
+
+
     end # Featurization.jl testset
 end