Skip to content

[7.4][ML] Handle unseen categories in encoding #603

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Aug 21, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions include/core/CDataFrame.h
Original file line number Diff line number Diff line change
Expand Up @@ -433,8 +433,8 @@ class CORE_EXPORT CDataFrame final {
std::size_t numberRows,
std::size_t numberColumns);

// TODO We may want an architecture agnostic check pointing mechanism for long
// running tasks.
//! Get the value to use for a missing element in a data frame.
static double valueOfMissing();

private:
using TSizeSizePr = std::pair<std::size_t, std::size_t>;
Expand Down
4 changes: 3 additions & 1 deletion include/maths/CDataFrameCategoryEncoder.h
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,9 @@ class MATHS_EXPORT CDataFrameCategoryEncoder final {
TSizeVecVec m_OneHotEncodedCategories;
TSizeUSetVec m_RareCategories;
TDoubleVecVec m_CategoryFrequencies;
TDoubleVecVec m_TargetMeanValues;
TDoubleVec m_MeanCategoryFrequencies;
TDoubleVecVec m_CategoryTargetMeanValues;
TDoubleVec m_MeanCategoryTargetMeanValues;
TDoubleVec m_FeatureVectorMics;
TSizeVec m_FeatureVectorColumnMap;
TSizeVec m_FeatureVectorEncodingMap;
Expand Down
4 changes: 2 additions & 2 deletions lib/api/CDataFrameAnalyzer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -319,10 +319,10 @@ void CDataFrameAnalyzer::addRowToDataFrame(const TStrVec& fieldValues) {
double value;
if (fieldValue.empty()) {
++m_MissingValueCount;
return core::CFloatStorage{std::numeric_limits<float>::quiet_NaN()};
return core::CFloatStorage{core::CDataFrame::valueOfMissing()};
} else if (core::CStringUtils::stringToTypeSilent(fieldValue, value) == false) {
++m_BadValueCount;
return core::CFloatStorage{std::numeric_limits<float>::quiet_NaN()};
return core::CFloatStorage{core::CDataFrame::valueOfMissing()};
}

// Truncation is very unlikely since the values will typically be
Expand Down
5 changes: 5 additions & 0 deletions lib/core/CDataFrame.cc
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

#include <algorithm>
#include <future>
#include <limits>
#include <memory>

namespace ml {
Expand Down Expand Up @@ -261,6 +262,10 @@ std::size_t CDataFrame::estimateMemoryUsage(bool inMainMemory,
return inMainMemory ? numberRows * numberColumns * sizeof(float) : 0;
}

//! Get the value to use for a missing element in a data frame.
//!
//! \return A quiet (non-signalling) double precision NaN.
double CDataFrame::valueOfMissing() {
    using TDoubleLimits = std::numeric_limits<double>;
    return TDoubleLimits::quiet_NaN();
}

CDataFrame::TRowFuncVecBoolPr
CDataFrame::parallelApplyToAllRows(std::size_t numberThreads,
std::size_t beginRows,
Expand Down
105 changes: 84 additions & 21 deletions lib/maths/CDataFrameCategoryEncoder.cc
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,9 @@ const std::string COLUMN_USES_FREQUENCY_ENCODING_TAG{"uses_frequency_encoding"};
const std::string ONE_HOT_ENCODED_CATEGORIES_TAG{"one_hot_encoded_categories"};
const std::string RARE_CATEGORIES_TAG{"rare_categories"};
const std::string CATEGORY_FREQUENCIES_TAG{"category_frequencies"};
const std::string TARGET_MEAN_VALUES_TAG{"target_mean_values"};
const std::string MEAN_CATEGORY_FREQUENCIES_TAG{"mean_category_frequencies"};
const std::string CATEGORY_TARGET_MEAN_VALUES_TAG{"category_target_mean_values"};
const std::string MEAN_CATEGORY_TARGET_MEAN_VALUES_TAG{"mean_category_target_mean_values"};
const std::string FEATURE_VECTOR_MICS_TAG{"feature_vector_mics"};
const std::string FEATURE_VECTOR_COLUMN_MAP_TAG{"feature_vector_column_map"};
const std::string FEATURE_VECTOR_ENCODING_MAP_TAG{"feature_vector_encoding_map"};
Expand Down Expand Up @@ -376,9 +378,9 @@ bool CDataFrameCategoryEncoder::usesFrequencyEncoding(std::size_t feature) const
}

double CDataFrameCategoryEncoder::frequency(std::size_t feature, std::size_t category) const {
return this->usesOneHotEncoding(feature, category)
? 0.0
: m_CategoryFrequencies[feature][category];
const auto& frequencies = m_CategoryFrequencies[feature];
return category < frequencies.size() ? frequencies[category]
: m_MeanCategoryFrequencies[feature];
}

bool CDataFrameCategoryEncoder::isRareCategory(std::size_t feature, std::size_t category) const {
Expand All @@ -387,11 +389,9 @@ bool CDataFrameCategoryEncoder::isRareCategory(std::size_t feature, std::size_t

double CDataFrameCategoryEncoder::targetMeanValue(std::size_t feature,
std::size_t category) const {
// TODO combine rare categories and use one mapping for collections.
return this->usesOneHotEncoding(feature, category) ||
this->isRareCategory(feature, category)
? 0.0
: m_TargetMeanValues[feature][category];
const auto& targetMeanValues = m_CategoryTargetMeanValues[feature];
return category < targetMeanValues.size() ? targetMeanValues[category]
: m_MeanCategoryTargetMeanValues[feature];
}

std::uint64_t CDataFrameCategoryEncoder::checksum(std::uint64_t seed) const {
Expand All @@ -403,7 +403,9 @@ std::uint64_t CDataFrameCategoryEncoder::checksum(std::uint64_t seed) const {
seed = CChecksum::calculate(seed, m_OneHotEncodedCategories);
seed = CChecksum::calculate(seed, m_RareCategories);
seed = CChecksum::calculate(seed, m_CategoryFrequencies);
seed = CChecksum::calculate(seed, m_TargetMeanValues);
seed = CChecksum::calculate(seed, m_MeanCategoryFrequencies);
seed = CChecksum::calculate(seed, m_CategoryTargetMeanValues);
seed = CChecksum::calculate(seed, m_MeanCategoryTargetMeanValues);
seed = CChecksum::calculate(seed, m_FeatureVectorMics);
seed = CChecksum::calculate(seed, m_FeatureVectorColumnMap);
return CChecksum::calculate(seed, m_FeatureVectorEncodingMap);
Expand All @@ -422,7 +424,12 @@ void CDataFrameCategoryEncoder::acceptPersistInserter(core::CStatePersistInserte
m_OneHotEncodedCategories, inserter);
core::CPersistUtils::persist(RARE_CATEGORIES_TAG, m_RareCategories, inserter);
core::CPersistUtils::persist(CATEGORY_FREQUENCIES_TAG, m_CategoryFrequencies, inserter);
core::CPersistUtils::persist(TARGET_MEAN_VALUES_TAG, m_TargetMeanValues, inserter);
core::CPersistUtils::persist(MEAN_CATEGORY_FREQUENCIES_TAG,
m_MeanCategoryFrequencies, inserter);
core::CPersistUtils::persist(CATEGORY_TARGET_MEAN_VALUES_TAG,
m_CategoryTargetMeanValues, inserter);
core::CPersistUtils::persist(MEAN_CATEGORY_TARGET_MEAN_VALUES_TAG,
m_MeanCategoryTargetMeanValues, inserter);
core::CPersistUtils::persist(FEATURE_VECTOR_MICS_TAG, m_FeatureVectorMics, inserter);
core::CPersistUtils::persist(FEATURE_VECTOR_COLUMN_MAP_TAG,
m_FeatureVectorColumnMap, inserter);
Expand Down Expand Up @@ -450,8 +457,15 @@ bool CDataFrameCategoryEncoder::acceptRestoreTraverser(core::CStateRestoreTraver
RESTORE(CATEGORY_FREQUENCIES_TAG,
core::CPersistUtils::restore(CATEGORY_FREQUENCIES_TAG,
m_CategoryFrequencies, traverser))
RESTORE(TARGET_MEAN_VALUES_TAG,
core::CPersistUtils::restore(TARGET_MEAN_VALUES_TAG, m_TargetMeanValues, traverser))
RESTORE(MEAN_CATEGORY_FREQUENCIES_TAG,
core::CPersistUtils::restore(MEAN_CATEGORY_FREQUENCIES_TAG,
m_MeanCategoryFrequencies, traverser))
RESTORE(CATEGORY_TARGET_MEAN_VALUES_TAG,
core::CPersistUtils::restore(CATEGORY_TARGET_MEAN_VALUES_TAG,
m_CategoryTargetMeanValues, traverser))
RESTORE(MEAN_CATEGORY_TARGET_MEAN_VALUES_TAG,
core::CPersistUtils::restore(MEAN_CATEGORY_TARGET_MEAN_VALUES_TAG,
m_MeanCategoryTargetMeanValues, traverser))
RESTORE(FEATURE_VECTOR_MICS_TAG,
core::CPersistUtils::restore(FEATURE_VECTOR_MICS_TAG,
m_FeatureVectorMics, traverser))
Expand Down Expand Up @@ -483,7 +497,7 @@ CDataFrameCategoryEncoder::mics(std::size_t numberThreads,
encoderFactories[E_TargetMean] = std::make_pair(
[this](std::size_t column, std::size_t sampleColumn, std::size_t) {
return std::make_unique<CDataFrameUtils::CTargetMeanCategoricalColumnValue>(
sampleColumn, m_RareCategories[column], m_TargetMeanValues[column]);
sampleColumn, m_RareCategories[column], m_CategoryTargetMeanValues[column]);
},
0.0);
encoderFactories[E_Frequency] = std::make_pair(
Expand Down Expand Up @@ -531,8 +545,13 @@ void CDataFrameCategoryEncoder::setupFrequencyEncoding(std::size_t numberThreads
LOG_TRACE(<< "category frequencies = "
<< core::CContainerPrinter::print(m_CategoryFrequencies));

m_RareCategories.resize(frame.numberColumns());
m_MeanCategoryFrequencies.resize(m_CategoryFrequencies.size());
m_RareCategories.resize(m_CategoryFrequencies.size());
for (std::size_t i = 0; i < m_CategoryFrequencies.size(); ++i) {
m_MeanCategoryFrequencies[i] =
m_CategoryFrequencies[i].empty()
? 1.0
: 1.0 / static_cast<double>(m_CategoryFrequencies[i].size());
for (std::size_t j = 0; j < m_CategoryFrequencies[i].size(); ++j) {
std::size_t count{static_cast<std::size_t>(
m_CategoryFrequencies[i][j] * static_cast<double>(frame.numberRows()) + 0.5)};
Expand All @@ -541,6 +560,8 @@ void CDataFrameCategoryEncoder::setupFrequencyEncoding(std::size_t numberThreads
}
}
}
LOG_TRACE(<< "mean category frequencies = "
<< core::CContainerPrinter::print(m_MeanCategoryFrequencies));
LOG_TRACE(<< "rare categories = " << core::CContainerPrinter::print(m_RareCategories));
}

Expand All @@ -550,11 +571,21 @@ void CDataFrameCategoryEncoder::setupTargetMeanValueEncoding(std::size_t numberT
const TSizeVec& categoricalColumnMask,
std::size_t targetColumn) {

m_TargetMeanValues = CDataFrameUtils::meanValueOfTargetForCategories(
m_CategoryTargetMeanValues = CDataFrameUtils::meanValueOfTargetForCategories(
CDataFrameUtils::CMetricColumnValue{targetColumn}, numberThreads, frame,
rowMask, categoricalColumnMask);
LOG_TRACE(<< "target mean values = "
<< core::CContainerPrinter::print(m_TargetMeanValues));
LOG_TRACE(<< "category target mean values = "
<< core::CContainerPrinter::print(m_CategoryTargetMeanValues));

m_MeanCategoryTargetMeanValues.resize(m_CategoryTargetMeanValues.size());
for (std::size_t i = 0; i < m_CategoryTargetMeanValues.size(); ++i) {
m_MeanCategoryTargetMeanValues[i] =
m_CategoryTargetMeanValues[i].empty()
? 0.0
: CBasicStatistics::mean(m_CategoryTargetMeanValues[i]);
}
LOG_TRACE(<< "mean category target mean values = "
<< core::CContainerPrinter::print(m_MeanCategoryTargetMeanValues));
}

CDataFrameCategoryEncoder::TSizeSizePrDoubleMap
Expand Down Expand Up @@ -654,9 +685,9 @@ CDataFrameCategoryEncoder::selectFeatures(std::size_t numberThreads,
metricColumnMask.end(), feature));
} // else if (selected.isTargetMean()) { nothing to do }

auto columnValue = selected.columnValue(m_RareCategories[feature],
m_CategoryFrequencies[feature],
m_TargetMeanValues[feature]);
auto columnValue = selected.columnValue(
m_RareCategories[feature], m_CategoryFrequencies[feature],
m_CategoryTargetMeanValues[feature]);
mics = this->mics(numberThreads, frame, *columnValue, rowMask,
metricColumnMask, categoricalColumnMask);
search.update(mics);
Expand All @@ -679,6 +710,38 @@ CDataFrameCategoryEncoder::selectFeatures(std::size_t numberThreads,
void CDataFrameCategoryEncoder::finishEncoding(std::size_t targetColumn,
TSizeSizePrDoubleMap selectedFeatureMics) {

using TMeanAccumulator = CBasicStatistics::SSampleMean<double>::TAccumulator;

// Update the frequency and target mean encoding for one-hot and rare categories.

for (std::size_t i = 0; i < m_OneHotEncodedCategories.size(); ++i) {
TMeanAccumulator meanCategoryFrequency;
TMeanAccumulator meanCategoryTargetMeanValue;
for (auto category : m_OneHotEncodedCategories[i]) {
double frequency{m_CategoryFrequencies[i][category]};
double mean{m_CategoryTargetMeanValues[i][category]};
meanCategoryFrequency.add(frequency, frequency);
meanCategoryTargetMeanValue.add(mean, frequency);
}
for (auto category : m_OneHotEncodedCategories[i]) {
m_CategoryFrequencies[i][category] = CBasicStatistics::mean(meanCategoryFrequency);
m_CategoryTargetMeanValues[i][category] =
CBasicStatistics::mean(meanCategoryTargetMeanValue);
}
}
for (std::size_t i = 0; i < m_RareCategories.size(); ++i) {
TMeanAccumulator meanCategoryTargetMeanValue;
for (auto category : m_RareCategories[i]) {
double frequency{m_CategoryFrequencies[i][category]};
double mean{m_CategoryTargetMeanValues[i][category]};
meanCategoryTargetMeanValue.add(mean, frequency);
}
for (auto category : m_RareCategories[i]) {
m_CategoryTargetMeanValues[i][category] =
CBasicStatistics::mean(meanCategoryTargetMeanValue);
}
}

// Fill in a mapping from encoded column indices to raw column indices.

selectedFeatureMics[{targetColumn, CATEGORY_FOR_DEPENDENT_VARIABLE}] = 0.0;
Expand Down
8 changes: 4 additions & 4 deletions lib/maths/unittest/CBoostedTreeTest.cc
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ auto predictAndComputeEvaluationMetrics(const F& generateFunction,
for (auto row = beginRows; row != endRows; ++row) {
double targetValue{row->index() < trainRows
? target(*row) + noise[row->index()]
: std::numeric_limits<double>::quiet_NaN()};
: core::CDataFrame::valueOfMissing()};
row->writeColumn(cols - 1, targetValue);
}
});
Expand Down Expand Up @@ -582,7 +582,7 @@ void CBoostedTreeTest::testCategoricalRegressors() {
for (auto row = beginRows; row != endRows; ++row) {
double targetValue{row->index() < trainRows
? target(*row)
: std::numeric_limits<double>::quiet_NaN()};
: core::CDataFrame::valueOfMissing()};
row->writeColumn(cols - 1, targetValue);
}
});
Expand All @@ -602,8 +602,8 @@ void CBoostedTreeTest::testCategoricalRegressors() {

LOG_DEBUG(<< "bias = " << modelBias);
LOG_DEBUG(<< " R^2 = " << modelRSquared);
CPPUNIT_ASSERT_DOUBLES_EQUAL(0.0, modelBias, 0.06);
CPPUNIT_ASSERT(modelRSquared > 0.97);
CPPUNIT_ASSERT_DOUBLES_EQUAL(0.0, modelBias, 0.1);
CPPUNIT_ASSERT(modelRSquared > 0.9);
}

void CBoostedTreeTest::testProgressMonitoring() {
Expand Down
Loading