From 4542b2b4266cc18d283f7820e503c21820bd10ba Mon Sep 17 00:00:00 2001 From: Stefan Bringuier Date: Wed, 13 Apr 2022 22:09:37 -0700 Subject: [PATCH] Address Gotchas and peformance to address #4 and other code clean-up. --- docs/src/examples.md | 12 +++++++ src/CBFV.jl | 1 + src/Databases.jl | 12 +++++-- src/Featurization.jl | 83 +++++++++++++++++++++++++++++++------------- src/Types.jl | 3 +- test/runtests.jl | 9 ++++- 6 files changed, 92 insertions(+), 28 deletions(-) diff --git a/docs/src/examples.md b/docs/src/examples.md index 78c8a18..f9902d2 100644 --- a/docs/src/examples.md +++ b/docs/src/examples.md @@ -26,4 +26,16 @@ using CBFV data = DataFrame("name"=>["Rb2Te","CdCl2","LaN"],"bandgap_eV"=>[1.88,3.51,1.12]) rename!(data,Dict("name"=>"formula","bandgap_eV"=>"target")) features = generatefeatures(data) +``` + +Here is an example with an existing feature combined with the generated features: + +```@example +using DataFrames +using CBFV +data = DataFrame(:formula=>["B2O3","Be1I2","Be1F3Li1"], + :temperature=>[1400.00,1200.0,1100.00], + :heat_capacity=>[89.115,134.306,192.464]) +rename!(data,Dict(:heat_capacity=>:target)) +features = generatefeatures(data,combine=true) ``` \ No newline at end of file diff --git a/src/CBFV.jl b/src/CBFV.jl index 3751471..550d27e 100644 --- a/src/CBFV.jl +++ b/src/CBFV.jl @@ -6,6 +6,7 @@ using CSV using DataFrames export FileName +export readdatabasefile export processelementdatabase export processinputdata export generatefeatures diff --git a/src/Databases.jl b/src/Databases.jl index 263b145..0225d5d 100644 --- a/src/Databases.jl +++ b/src/Databases.jl @@ -27,13 +27,21 @@ Returns DataFrame of an elemental database file in [databases/](databases/) # Arguments - `pathtofile::String`: path to the CSV formatted file to read +- `stringtype::Type{Union{String,InlineString}}=String` : `CSV.jl` string storage type +- `pool::Bool=false` : `CSV.File` will pool `String` column values. 
# Returns - `data::DataFrame`: the dataframe representation of the csv file. + +!!! note + Some of the behaviors of `CSV.jl` will create data types that are inconsistent with + several function argument types in `CBFV`. If you use this function to read the + data files, the data frame constructed via CSV will work properly. """ function readdatabasefile(pathtofile::AbstractString; - stringtype::Type{T}=String) where T<:Union{String,InlineString} + stringtype::Type{T}=String, + pool=false) where T<:Union{String,InlineString} # Use CSV and dataframes - data = CSV.File(pathtofile,stringtype=stringtype) |> DataFrame + data = CSV.File(pathtofile,stringtype=stringtype,pool=pool) |> DataFrame return data end # function readdatabasefile diff --git a/src/Featurization.jl b/src/Featurization.jl index 97b4721..29afabe 100644 --- a/src/Featurization.jl +++ b/src/Featurization.jl @@ -47,11 +47,22 @@ end This is the primary function that assigns the features based on the CBFV approach. For more details its best to see the original python CBFV and references in README file. +# Arguments +- `processeddata::Vector{Dict{Symbol,Any}}` : the formulas processed against elemental database +- `formulae::AbstractArray` : the formula string values; this should be some subtype of `Array{String,1}` +- `sumfeatures::Bool=false` : whether to create a `sum_` feature vector + +# Returns +- `featuresarry::Vector{Matrix{Float64}}` : feature vectors for each row in original data set. +- `skippedformula::Vector{String}` : skipped formulas + +!!! note + The `generatefeatures` call does not do anything with (i.e. does not return) the skipped formulas. 
""" function assignfeatures(processeddata::Vector{Dict{Symbol,Any}}, - formulae::Array{String,1}, - sumfeatures::Bool=false) + formulae::AbstractArray, + sumfeatures::Bool=false) iterformulae = ProgressBar(1:length(formulae)) skippedformula = Array{String,1}() @@ -69,11 +80,11 @@ function assignfeatures(processeddata::Vector{Dict{Symbol,Any}}, fmax = maximum(properties, dims=1) fmin = minimum(properties, dims=1) _, fraccomp = fractionalcomposition(formula) - favg = sum(fraccomp .* properties, dims=1) + favg = sum(fraccomp .* properties, dims=1) #FIX: Not sure whats going on here fdev = sum(fraccomp .* abs.(properties .- favg), dims=1) + prominant = isapprox.(fraccomp, maximum(fraccomp)) fmode = minimum(properties[prominant, :], dims=1) - fweight = sumfeatures ? sum(amount .* properties, dims=1) : amount .* properties if sumfeatures @@ -110,9 +121,9 @@ moments from the element features in the formula. """ function constructfeaturedataframe(featcolnames::Vector{String}, - features::Array{Float64,2}, - extrafeatures::Tuple{Bool,DataFrame}, - sumfeatures::Bool) + features::Array{Float64,2}, + extrafeatures::Tuple{Bool,DataFrame}, + sumfeatures::Bool) if sumfeatures colprefixes = ["sum_", "avg_", "dev_", "range_", "max_", "min_", "mode_"] @@ -143,7 +154,6 @@ end # function constructfeaturedataframe """ generatefeatures(data; elementdata,dropduplicate,combine,sumfeatures,returndataframe) - generatefeatures(data, elementdata; kwargs...) generatefeatures(dataname; kwargs...) This is the primary function for generating the CBFV features for a dataset of formulas with or without @@ -157,7 +167,7 @@ assigning of features is then executed based on the CBFV approach. If the `retur # Arguments - `data::DataFrame`: This is the data set that you want to be featurized for example. 
-- `elementdata::Union{String,FileName}`: The name of the internal database or the file path and +- `elementdata::Union{String,FileName} or Union{String,DataFrame}`: The name of the internal database or the file path and name to an external database. - `dropduplicate::Bool=true`: Option to drop duplicate entries. - `combine::Bool=false`: Option to combine existing features in `data` with the generated feature set. @@ -184,18 +194,25 @@ d = DataFrame(:formula=>["Tc1V1","Cu1Dy1","Cd3N2"],:target=>[248.539,66.8444,91. generatefeatures(d) ``` """ -function generatefeatures(data::DataFrame; - elementdata::String="oliynyk", - dropduplicate=true, - combine=false, - sumfeatures=false, - returndataframe=true) +function generatefeatures(data::DataFrame, + elementdata::Union{String,DataFrame}="oliynyk"; + dropduplicate=true, + combine=false, + sumfeatures=false, + returndataframe=true) + + # Remove duplicate entries + if dropduplicate + moddata = unique(data) + else + moddata = data + end # Process input data checkdataframe(data) - formulae = data[!, :formula] - featcolnames, processeddata = processinputdata(dropduplicate ? unique(data) : data, elementdata) + formulae = moddata[!, :formula] + featcolnames, processeddata = processinputdata(moddata, elementdata) targets = [row[:target] for row in processeddata] @@ -205,8 +222,7 @@ function generatefeatures(data::DataFrame; sumfeatures) # Extra features from original data - extra_df = dropduplicate ? unique(data) : data - extrafeatures = extra_df[!, Not([:formula, :target])] + extrafeatures = moddata[!, Not([:formula, :target])] if combine checkifempty(extrafeatures) end if returndataframe @@ -223,11 +239,30 @@ function generatefeatures(data::DataFrame; end # function generatefeaturesdata -# Issue #4 TODO: Work in support for custom element data sets. Requires -# working back through `generatefeatures`-> `processinputdata` -> .... -# generatefeatures(data::DataFrame, elementdata::FileName; kwargs...) 
= begin -# generatefeatures(data, elementdata=elementdata, kwargs...) -# end + +generatefeatures(data::DataFrame; + elementdata::Union{FileName,String}="oliynyk", + dropduplicate=true, + combine=false, + sumfeatures=false, + returndataframe=true) = begin + if typeof(elementdata) == FileName + elementdataframe = readdatabasefile(elementdata.fullpath) + generatefeatures(data,elementdataframe, + dropduplicate=dropduplicate, + combine=combine, + sumfeatures=sumfeatures, + returndataframe=returndataframe) + else + generatefeatures(data,elementdata, + dropduplicate=dropduplicate, + combine=combine, + sumfeatures=sumfeatures, + returndataframe=returndataframe) + end + +end + generatefeatures(dataname::String; kwargs...) = begin # Digest data file before processing diff --git a/src/Types.jl b/src/Types.jl index 7000330..0d0a673 100644 --- a/src/Types.jl +++ b/src/Types.jl @@ -1,7 +1,8 @@ # see LICENSE """ -generatefeatures Datatype for multiple dispatch +generatefeatures Datatype for multiple dispatch. Allows for passing +external database. """ struct FileName fullpath::String end \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index ba40793..d4faef5 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -108,12 +108,19 @@ using CSV, DataFrames tmpfile = tempname() CSV.write(tmpfile,d) @test featdb == CBFV.generatefeatures(tmpfile,returndataframe=true) - + + @test CBFV.generatefeatures(d,combine=true)[!,:property] == d[!,:property] + testdb = CSV.File("pycbfv_test_data.csv") |> DataFrame @test length(names(featdb[!,Not([:target,:formula])])) == length(names(testdb)) @testset "Column $n" for n in names(testdb) @test testdb[!,n] ≈ featdb[!,n] end + featdb_ext = CBFV.generatefeatures(d, + elementdata=CBFV.FileName((@__DIR__)*"/../databases/oliynyk.csv")) + @test featdb_ext == featdb + + end # Featurization.jl testset end