From 78b9ebf163885813d81ff86ad68ef19aad81d1b7 Mon Sep 17 00:00:00 2001 From: Stefan Bringuier Date: Tue, 12 Apr 2022 22:57:57 -0700 Subject: [PATCH] Polishing code base, testing for scenarios. --- Artifacts.toml | 0 README.md | 41 +++++++++++++++++++++++---- TODO.md | 6 ---- docs/make.jl | 6 ++-- docs/src/examples.md | 10 +++++++ docs/src/index.md | 16 +++++++++-- src/Errors.jl | 4 +-- src/Featurization.jl | 66 ++++++++++++++++++++++++-------------------- src/ProcessData.jl | 48 ++++++++++++++++++++++++-------- test/playground.jl | 19 ------------- test/runtests.jl | 6 +++- 11 files changed, 142 insertions(+), 80 deletions(-) create mode 100644 Artifacts.toml delete mode 100644 TODO.md delete mode 100644 test/playground.jl diff --git a/Artifacts.toml b/Artifacts.toml new file mode 100644 index 0000000..e69de29 diff --git a/README.md b/README.md index 8cec56d..6f7f81d 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# CBFV.jl : A simple composition-based feature vectorization Julia utility +# CBFV.jl : A simple composition-based feature vectorization utility in Julia [![Stable](https://img.shields.io/badge/docs-stable-blue.svg)](https://juliamatsci.github.io/CBFV.jl/stable) [![Dev](https://img.shields.io/badge/docs-dev-blue.svg)](https://juliamatsci.github.io/CBFV.jl/dev) [![Build Status](https://github.com/juliamatsci/CBFV.jl/workflows/CI/badge.svg)](https://github.com/JuliaMatSci/CBFV.jl/actions) [![Build Status](https://travis-ci.com/JuliaMatSci/CBFV.jl.svg?branch=master)](https://travis-ci.com/JuliaMatSci/CBFV.jl) [![Coverage](https://codecov.io/gh/JuliaMatSci/CBFV.jl/branch/master/graph/badge.svg)](https://codecov.io/gh/JuliaMatSci/CBFV.jl) This is a Julia rewrite of the [python tool](https://github.com/kaaiian/CBFV) to create a composition-based feature vector representation for machine learning with materials science data. 
The ideas and methodology are discussed in the recent article: @@ -8,11 +8,42 @@ This is a Julia rewrite of the [python tool](https://github.com/kaaiian/CBFV) to and the original python source code(s) can be found here: -> https://github.com/anthony-wang/BestPractices/tree/master/notebooks/CBFV +- [https://github.com/anthony-wang/BestPractices/tree/master/notebooks/CBFV](https://github.com/anthony-wang/BestPractices/tree/master/notebooks/CBFV) +- [https://github.com/kaaiian/CBFV](https://github.com/kaaiian/CBFV) -> https://github.com/kaaiian/CBFV +## Example Use -## Citation +The input data set should have a least two columns with the header/names `formula` and `target`. + +```@example +using DataFrames +using CBFV +data = DataFrame("name"=>["Rb2Te","CdCl2","LaN"],"bandgap_eV"=>[1.88,3.51,1.12]) +rename!(data,Dict("name"=>"formula","bandgap_eV"=>"target")) +features = generatefeatures(data) +``` + +## Supported Featurization Schemes + +As with the orignal CBFV python package the following element databases are available: + +- `oliynyk` (default): Database from A. Oliynyk. +- `magpie`: [Materials Agnostic Platform for Informatics and Exploration](https://bitbucket.org/wolverton/magpie/src/master/) +- `mat2vec`: [Word embeddings capture latent knowledge from materials science](https://github.com/materialsintelligence/mat2vec) +- `jarvis`: [Joint Automated Repository for Various Integrated Simulations provided by U.S. National Institutes of Standards and Technologies.](https://jarvis.nist.gov/) +- `onehot`: Simple one hot encoding scheme, i.e., diagonal elemental matrix. +- `random_200`: 200 random elemental properties (I'm assuming). + +However, `CBFV.jl` will allow you to provide your own element database to featurize with. Also, the current implementation reads the saved `.csv` file in [`databases`](@ref), however, this is prone to potential issues (ex. out of date files). 
To alleviate this I will change the implementation to utilize `Pkg.Artifacts` with an `Artifacts.toml` file that enables grabbing the datafiles needed from a server if they don't exist locally already. + +### Julia Dependencies +This is a relatively small package so there aren't a lot of dependencies. The required packages are: + +- CSV +- DataFrames +- ProgressBars + +## Citations Pleae cite the following when and if you use this package in your work: ```bibtex @@ -46,4 +77,4 @@ In addition, please also consider citing the original python implementation and journal = {Chemistry of Materials}, doi = {10.1021/acs.chemmater.0c01907} } -``` +``` \ No newline at end of file diff --git a/TODO.md b/TODO.md deleted file mode 100644 index e7cb3b9..0000000 --- a/TODO.md +++ /dev/null @@ -1,6 +0,0 @@ -# Remaining TODOs - - -* [ ] In `processinput` handle clean up array if a element in compound was not found in element database. -* [ ] If a element database does not contain a feature/property set its value to the mean value of the dataset -* [] Finish `assignfeatures` function based on original implementation. 
\ No newline at end of file diff --git a/docs/make.jl b/docs/make.jl index 89fb79a..c8b9f3a 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -4,11 +4,11 @@ using Documenter makedocs(; modules=[CBFV], authors="Stefan Bringuier and contributors", - repo="https://github.com/stefanbringuier/CBFV.jl/blob/{commit}{path}#L{line}", + repo="https://github.com/JuliaMatSci/CBFV.jl/blob/{commit}{path}#L{line}", sitename="CBFV.jl", format=Documenter.HTML(; prettyurls=get(ENV, "CI", "false") == "true", - canonical="https://stefanbringuier.github.io/CBFV.jl", + canonical="https://JuliaMatSci.github.io/CBFV.jl", assets=String[], ), pages=[ @@ -20,5 +20,5 @@ makedocs(; Modules = [CBFV] deploydocs(; - repo="github.com/stefanbringuier/CBFV.jl", + repo="github.com/JuliaMatSci/CBFV.jl", ) diff --git a/docs/src/examples.md b/docs/src/examples.md index 086bf7d..78c8a18 100644 --- a/docs/src/examples.md +++ b/docs/src/examples.md @@ -16,4 +16,14 @@ using DataFrames #hide using CBFV #hide d = DataFrame(:formula=>["Tc1V1","Cu1Dy1","Cd3N2"],:target=>[248.539,66.8444,91.5034]) #hide generatefeatures(d,elementdata="jarvis") +``` + +Another example: + +```@example +using DataFrames +using CBFV +data = DataFrame("name"=>["Rb2Te","CdCl2","LaN"],"bandgap_eV"=>[1.88,3.51,1.12]) +rename!(data,Dict("name"=>"formula","bandgap_eV"=>"target")) +features = generatefeatures(data) ``` \ No newline at end of file diff --git a/docs/src/index.md b/docs/src/index.md index f239659..6f7f81d 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -1,5 +1,5 @@ # CBFV.jl : A simple composition-based feature vectorization utility in Julia -[![Stable](https://img.shields.io/badge/docs-stable-blue.svg)](https://stefanbringuier.github.io/CBFV.jl/stable) [![Dev](https://img.shields.io/badge/docs-dev-blue.svg)](https://stefanbringuier.github.io/CBFV.jl/dev) [![Build Status](https://github.com/stefanbringuier/CBFV.jl/workflows/CI/badge.svg)](https://github.com/stefanbringuier/CBFV.jl/actions) [![Build 
Status](https://travis-ci.com/stefanbringuier/CBFV.jl.svg?branch=master)](https://travis-ci.com/stefanbringuier/CBFV.jl) [![Coverage](https://codecov.io/gh/stefanbringuier/CBFV.jl/branch/master/graph/badge.svg)](https://codecov.io/gh/stefanbringuier/CBFV.jl) +[![Stable](https://img.shields.io/badge/docs-stable-blue.svg)](https://juliamatsci.github.io/CBFV.jl/stable) [![Dev](https://img.shields.io/badge/docs-dev-blue.svg)](https://juliamatsci.github.io/CBFV.jl/dev) [![Build Status](https://github.com/juliamatsci/CBFV.jl/workflows/CI/badge.svg)](https://github.com/JuliaMatSci/CBFV.jl/actions) [![Build Status](https://travis-ci.com/JuliaMatSci/CBFV.jl.svg?branch=master)](https://travis-ci.com/JuliaMatSci/CBFV.jl) [![Coverage](https://codecov.io/gh/JuliaMatSci/CBFV.jl/branch/master/graph/badge.svg)](https://codecov.io/gh/JuliaMatSci/CBFV.jl) This is a Julia rewrite of the [python tool](https://github.com/kaaiian/CBFV) to create a composition-based feature vector representation for machine learning with materials science data. The ideas and methodology are discussed in the recent article: @@ -11,6 +11,18 @@ and the original python source code(s) can be found here: - [https://github.com/anthony-wang/BestPractices/tree/master/notebooks/CBFV](https://github.com/anthony-wang/BestPractices/tree/master/notebooks/CBFV) - [https://github.com/kaaiian/CBFV](https://github.com/kaaiian/CBFV) +## Example Use + +The input data set should have a least two columns with the header/names `formula` and `target`. 
+ +```@example +using DataFrames +using CBFV +data = DataFrame("name"=>["Rb2Te","CdCl2","LaN"],"bandgap_eV"=>[1.88,3.51,1.12]) +rename!(data,Dict("name"=>"formula","bandgap_eV"=>"target")) +features = generatefeatures(data) +``` + ## Supported Featurization Schemes As with the orignal CBFV python package the following element databases are available: @@ -24,7 +36,7 @@ As with the orignal CBFV python package the following element databases are avai However, `CBFV.jl` will allow you to provide your own element database to featurize with. Also, the current implementation reads the saved `.csv` file in [`databases`](@ref), however, this is prone to potential issues (ex. out of date files). To alleviate this I will change the implementation to utilize `Pkg.Artificats` with a `Artificats.toml` file that enables grabbing the datafiles needed from a server if they don't exist locally already. -## Julia Dependencies +### Julia Dependencies This is a relatively small package so there aren't a lot of dependencies. The required packages are: - CSV diff --git a/src/Errors.jl b/src/Errors.jl index 8a00459..cfb13e5 100644 --- a/src/Errors.jl +++ b/src/Errors.jl @@ -8,8 +8,8 @@ function checkdataframe(dataframe::DataFrame) end function checkcombineallowed(dataframe::DataFrame) - extrprops = dataframe[!,Not([:formula,:target])] - if !isempty(extrprops) + #extrprops = dataframe[!,Not([:formula,:target])] + if !isempty(dataframe) return true else @warn "Combining of features requested but none exist, skipping!" diff --git a/src/Featurization.jl b/src/Featurization.jl index cf1ee28..20a93dd 100644 --- a/src/Featurization.jl +++ b/src/Featurization.jl @@ -1,7 +1,7 @@ # see LICENSE -""" combinefeatures!(features,extras) - combinefeatures!(features,featnames,extras) +""" combinefeatures(features,extras) + combinefeatures(features,featnames,extras) Combines existing features in data with the prepared features. Returns additional @@ -12,27 +12,32 @@ vector of column names for a database. 
- `extras::DataFrame`: The data frame representation of the orignial data. - `featnames::Vector`: The column names of the generated features. -# Returns (Optional) +# Returns +- `newfeatures::AbstractArray`: Combined features - `combfeatnames::Vector{String}`: Combined names of feature columns. """ -function combinefeatures!(features::AbstractArray, extras::DataFrame) +function combinefeatures(features::AbstractArray, extras::DataFrame) if checkcombineallowed(extras) extrasarry = Tables.matrix(extras) - features = hcat(features, extrasarry) + newfeatures = hcat(features, extrasarry) + else + newfeatures = features end + return newfeatures end # function combinefeatures -combinefeatures!(features::AbstractArray, featnames::Vector, extras::DataFrame) = begin +combinefeatures(features::AbstractArray, featnames::Vector, extras::DataFrame) = begin if checkcombineallowed(extras) extrasarry = Tables.matrix(extras) - features = hcat(features, extrasarry) + newfeatures = hcat(features, extrasarry) combfeatnames = vcat(featnames, names(extras)) else + newfeatures = features combfeatnames = featnames end - return combfeatnames + return newfeatures,combfeatnames end """ @@ -51,12 +56,12 @@ function assignfeatures(processeddata::Vector{Dict{Symbol,Any}}, iterformulae = ProgressBar(1:length(formulae)) skippedformula = Array{String,1}() - features = Vector{Matrix{Number}}(undef, length(formulae)) + features = Vector{Matrix{Float64}}(undef, length(formulae)) - for i in iterformulae + Threads.@threads for i in iterformulae formula = formulae[i] - amount = processeddata[i][:amount] - properties = processeddata[i][:eleprops] + amount = processeddata[i][:amount]::Vector{Float64} + properties = processeddata[i][:eleprops]::Matrix{Float64} # Each formula has a n-element by m-feature matrix representation. 
# Construct all the feature vectors @@ -95,15 +100,20 @@ column name prefixes are fixed based on the CBFV approach which is to use the fo moments from the element features in the formula. # Arguments +- `featcolnames::Vector{String}` : The name of the columns for the feature vectors +- `features::Array{Float64,2}` : The feature vectors +- `extrafeatures::Tuple{Bool,DataFrame}` : These are the features carried from the input data +- `sumfeatures::Bool` : wheter or not to add sum statistics feature vector # Returns - +- `DataFrame` : the dataframe for the features """ function constructfeaturedataframe(featcolnames::Vector{String}, - features::Array{Number,2}, + features::Array{Float64,2}, extrafeatures::Tuple{Bool,DataFrame}, sumfeatures::Bool) + if sumfeatures colprefixes = ["sum_", "avg_", "dev_", "range_", "max_", "min_", "mode_"] else @@ -119,10 +129,11 @@ function constructfeaturedataframe(featcolnames::Vector{String}, dictfeatnames = Dict{String,Vector}() if extrafeatures[1] - combinedfeatnames = combinefeatures!(features, featnames, extrafeatures[2]) - for (i, n) in enumerate(combinedfeatnames) - dictfeatnames[n] = features[i, :] - end + #combfeatures,combinedfeatnames = combinefeatures(features, featnames, extrafeatures[2]) + #for (i, n) in enumerate(combinedfeatnames) + # dictfeatnames[n] = combfeatures[:,i] + #end + @info "The combine feature is not correctly implemented and is being skipped!" 
else for (i, n) in enumerate(featnames) dictfeatnames[n] = features[:, i] @@ -173,14 +184,6 @@ using CBFV d = DataFrame(:formula=>["Tc1V1","Cu1Dy1","Cd3N2"],:target=>[248.539,66.8444,91.5034]) generatefeatures(d) ``` - - - - -# TODOs -- Add dropduplicate Optional -- Decide what to do with `skippedformulas` -- Process elementa data features with `NaN` """ function generatefeatures(data::DataFrame; elementdata::String="oliynyk", @@ -189,10 +192,11 @@ function generatefeatures(data::DataFrame; sumfeatures=false, returndataframe=true) + # Process input data checkdataframe(data) formulae = data[!, :formula] - featcolnames, processeddata = processinputdata(data, elementdata) + featcolnames, processeddata = processinputdata(dropduplicate ? unique(data) : data, elementdata) targets = [row[:target] for row in processeddata] @@ -200,9 +204,10 @@ function generatefeatures(data::DataFrame; features, skippedformulas = assignfeatures(processeddata, formulae, sumfeatures) - extrafeatures = data[!, Not([:formula, :target])] - #TODO: need to fill features that are NaN with median values. + # Extra features from original data + extra_df = data[!, Not([:formula, :target])] + extrafeatures = dropduplicate && !isempty(extra_df) ? unique(extra_df) : extra_df if returndataframe generatedataframe = constructfeaturedataframe(featcolnames, features, (combine, extrafeatures), sumfeatures) @@ -211,7 +216,8 @@ function generatefeatures(data::DataFrame; return generatedataframe else if combine - combinefeatures!(features, extrafeatures) + #combinefeatures(features, extrafeatures) + @info "The combine feature is not correctly implemented and is being skipped!" end return formulae, features, targets end diff --git a/src/ProcessData.jl b/src/ProcessData.jl index 03b6924..1bf1db0 100644 --- a/src/ProcessData.jl +++ b/src/ProcessData.jl @@ -1,5 +1,24 @@ # see LICENSE +""" + replacemissing!(data) + +replace missing values in columns of a dataframe with average value of that column. 
+ +# Arguments +- `data::DataFrame`: data frame with missing values. + +# Modifies +- `data::DataFrame`: changes `missing` values to mane values of column. + +""" function replacemissing!(data::DataFrame) + + columnnames = names(data[!, Not(:element)]) + statdata = describe(data); + for (i,n) in enumerate(columnnames) + data[!,n] = coalesce.(data[!,n],statdata[i,:mean]) + end +end """ removeunsupported!(datainput,elementproperties) @@ -17,7 +36,7 @@ Handle cases where compound can't be processed because it isn't an allowed eleme function removeunsupported!(datainput::DataFrame) formulas = copy(datainput[!, :formula]) - rows = range(1, nrow(datainput)) + rows = range(1, nrow(datainput),step=1) splitformulas = keys.(getrepresentation.(formulas)) for i = 1:length(formulas) @@ -47,12 +66,13 @@ returns an array of properties for elements that are in a formula. """ function extractproperties(elements::Vector{String}, - properties::AbstractArray, - formulaelements::Array{String,1}, - formula::String) + properties::Array{T,2}, + formulaelements::Array{String,1}, + formula::String) where T<:Number + _, m = size(properties) l = length(formulaelements) - extractedproperties = Array{Number,2}(undef, l, m) + extractedproperties = Array{Float64,2}(undef, l, m) for (i, e) in enumerate(formulaelements) if stripamt(e) ∉ elements @@ -106,10 +126,12 @@ of the entire database. 
- `arrayrepresentation::Array{Any,2}`: representation of the dataframe """ -function processelementdatabase(data::DataFrame) - - columnnames = names(data[!, Not(:element)]) +function processelementdatabase(data::DataFrame;replacemissing=true) + if replacemissing + replacemissing!(data) + end + elementsymbols = convert(Vector{String}, data[!, :element]) elementindex = collect(1:nrow(data)) elementmissing = collect(setdiff( @@ -122,17 +144,19 @@ function processelementdatabase(data::DataFrame) arrayrepresentation = Tables.matrix(data[!, Not(:element)]) + columnnames = names(data[!, Not(:element)]) + return elementinfo, columnnames, arrayrepresentation end # function processelementdatabase -processelementdatabase(databasename::String) = begin +processelementdatabase(databasename::String;kwargs...) = begin data = getelementpropertydatabase(databasename) - processelementdatabase(data) + processelementdatabase(data,kwargs...) end -processelementdatabase(databasepath::FileName) = begin +processelementdatabase(databasepath::FileName;kwargs...) = begin data = readdatabasefile(databasepath.fullpath) - processelementdatabase(data) + processelementdatabase(data,kwargs...) 
end diff --git a/test/playground.jl b/test/playground.jl deleted file mode 100644 index fb2b932..0000000 --- a/test/playground.jl +++ /dev/null @@ -1,19 +0,0 @@ -include("../src/CBFV.jl") -using DataFrames - -d = DataFrame(:formula=>["Tc1V1","Cu1Dy1","Cd3N2"],:target=>[248.539,66.8444,91.5034]) -dfele = CBFV.readdatabasefile("databases/jarvis.csv") -e1,e2 = CBFV.processelementdatabase(dfele) -en,pd = CBFV.processinputdata(d,dfele) # ✓ - -feat = CBFV.generatefeatures(d,returndataframe=true) - -d = DataFrame(:formula=>["TiX2","SiO2","SiC"],:target=>[1000.0,1200.0,2400.0]) -ei,en,pd = CBFV.processinputdata(d,dfele) - - - # features,targets,formulae,skipped = assignfeatures(processeddata, - # elementinfo, - # formulae, - # extendfeatures, - # combinefeatures)de \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index a1f1a2e..57d8876 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -100,12 +100,16 @@ using CSV, DataFrames end # ProcessData.jl testset @testset "Featurization.jl functions" begin - d = DataFrame(:formula=>["Tc1V1","Cu1Dy1","Cd3N2"],:target=>[248.539,66.8444,91.5034]) + d = DataFrame(:formula=>["Tc1V1","Cu1Dy1","Cd3N2"], + :property=>[1.0,0.5,1.0], + :target=>[248.539,66.8444,91.5034]) featdb = CBFV.generatefeatures(d,returndataframe=true) testdb = CSV.File("pycbfv_test_data.csv") |> DataFrame @test length(names(featdb[!,Not([:target,:formula])])) == length(names(testdb)) @testset "Column $n" for n in names(testdb) @test testdb[!,n] ≈ featdb[!,n] end + # @test CBFV.generatefeatures(d,combine=true)[!,:property] .== d[!,:property] + end # Featurization.jl testset end