@@ -198,7 +198,9 @@ const std::string COLUMN_USES_FREQUENCY_ENCODING_TAG{"uses_frequency_encoding"};
198
198
// Tag strings naming individual pieces of encoder state. In the hunks visible
// in this diff they are passed to core::CPersistUtils::persist together with
// the matching member, and used again in the RESTORE branches to read that
// member back.
// NOTE(review): the leading space inside each literal looks like an artifact
// of how this diff was rendered (the hunk header shows "uses_frequency_encoding"
// without one) -- confirm against the real source before relying on the values.
const std::string ONE_HOT_ENCODED_CATEGORIES_TAG{" one_hot_encoded_categories" };
199
199
const std::string RARE_CATEGORIES_TAG{" rare_categories" };
200
200
const std::string CATEGORY_FREQUENCIES_TAG{" category_frequencies" };
201
// Removed by this change: the single target-mean tag is replaced by the three
// tags added below.
// NOTE(review): the RESTORE branch for this tag is removed as well, so state
// written under the old tag presumably no longer restores the target mean
// values -- confirm that backward compatibility is not required.
- const std::string TARGET_MEAN_VALUES_TAG{" target_mean_values" };
201
// Added by this change: tags for the per-feature mean category frequency, the
// per-category target mean values, and the per-feature mean of those values.
+ const std::string MEAN_CATEGORY_FREQUENCIES_TAG{" mean_category_frequencies" };
202
+ const std::string CATEGORY_TARGET_MEAN_VALUES_TAG{" category_target_mean_values" };
203
+ const std::string MEAN_CATEGORY_TARGET_MEAN_VALUES_TAG{" mean_category_target_mean_values" };
202
204
const std::string FEATURE_VECTOR_MICS_TAG{" feature_vector_mics" };
203
205
const std::string FEATURE_VECTOR_COLUMN_MAP_TAG{" feature_vector_column_map" };
204
206
const std::string FEATURE_VECTOR_ENCODING_MAP_TAG{" feature_vector_encoding_map" };
@@ -376,9 +378,9 @@ bool CDataFrameCategoryEncoder::usesFrequencyEncoding(std::size_t feature) const
376
378
}
377
379
378
380
// Return the frequency encoding value for `category` of column `feature`.
//
// This span is a diff view: lines prefixed "- " are the implementation being
// removed, lines prefixed "+ " are its replacement, and the bare numbers are
// old/new line numbers.
//
// `feature` is used directly as an index into m_CategoryFrequencies (and, on
// the fallback path, m_MeanCategoryFrequencies); no range check on it is
// visible here.
double CDataFrameCategoryEncoder::frequency (std::size_t feature, std::size_t category) const {
379
// Old behaviour: one-hot encoded categories were reported as 0.0 and every
// other category was read straight from the table with no bound on `category`.
- return this -> usesOneHotEncoding (feature, category)
380
- ? 0.0
381
- : m_CategoryFrequencies [feature][category ];
381
// New behaviour: a category index inside the table returns its stored
// frequency; an index past the end of the table falls back to the per-feature
// value in m_MeanCategoryFrequencies instead of reading out of range.
// NOTE(review): presumably the out-of-range case is a category not seen when
// the frequencies were computed -- confirm with the caller.
+ const auto & frequencies = m_CategoryFrequencies[feature];
382
+ return category < frequencies. size () ? frequencies[category]
383
+ : m_MeanCategoryFrequencies [feature];
382
384
}
383
385
384
386
bool CDataFrameCategoryEncoder::isRareCategory (std::size_t feature, std::size_t category) const {
@@ -387,11 +389,9 @@ bool CDataFrameCategoryEncoder::isRareCategory(std::size_t feature, std::size_t
387
389
388
390
// Return the target mean encoding value for `category` of column `feature`.
//
// This span is a diff view: lines prefixed "- " are the implementation being
// removed, lines prefixed "+ " are its replacement, and the bare numbers are
// old/new line numbers.
//
// `feature` is used directly as an index into m_CategoryTargetMeanValues (and,
// on the fallback path, m_MeanCategoryTargetMeanValues); no range check on it
// is visible here.
double CDataFrameCategoryEncoder::targetMeanValue (std::size_t feature,
389
391
std::size_t category) const {
390
// Old behaviour: one-hot encoded and rare categories were reported as 0.0 and
// every other category was read from m_TargetMeanValues with no bound on
// `category`.
- // TODO combine rare categories and use one mapping for collections.
391
- return this ->usesOneHotEncoding (feature, category) ||
392
- this ->isRareCategory (feature, category)
393
- ? 0.0
394
- : m_TargetMeanValues[feature][category];
392
// New behaviour: a category index inside the table returns its stored value;
// an index past the end of the table falls back to the per-feature value in
// m_MeanCategoryTargetMeanValues.
// The one-hot/rare special case is dropped here. The finishEncoding hunk in
// this same diff overwrites the stored entries for one-hot and rare categories
// with a frequency-weighted mean, which appears to be what replaces it --
// NOTE(review): confirm finishEncoding always runs before this is called.
+ const auto & targetMeanValues = m_CategoryTargetMeanValues[feature];
393
+ return category < targetMeanValues.size () ? targetMeanValues[category]
394
+ : m_MeanCategoryTargetMeanValues[feature];
395
395
}
396
396
397
397
std::uint64_t CDataFrameCategoryEncoder::checksum (std::uint64_t seed) const {
@@ -403,7 +403,9 @@ std::uint64_t CDataFrameCategoryEncoder::checksum(std::uint64_t seed) const {
403
403
seed = CChecksum::calculate (seed, m_OneHotEncodedCategories);
404
404
seed = CChecksum::calculate (seed, m_RareCategories);
405
405
seed = CChecksum::calculate (seed, m_CategoryFrequencies);
406
- seed = CChecksum::calculate (seed, m_TargetMeanValues);
406
+ seed = CChecksum::calculate (seed, m_MeanCategoryFrequencies);
407
+ seed = CChecksum::calculate (seed, m_CategoryTargetMeanValues);
408
+ seed = CChecksum::calculate (seed, m_MeanCategoryTargetMeanValues);
407
409
seed = CChecksum::calculate (seed, m_FeatureVectorMics);
408
410
seed = CChecksum::calculate (seed, m_FeatureVectorColumnMap);
409
411
return CChecksum::calculate (seed, m_FeatureVectorEncodingMap);
@@ -422,7 +424,12 @@ void CDataFrameCategoryEncoder::acceptPersistInserter(core::CStatePersistInserte
422
424
m_OneHotEncodedCategories, inserter);
423
425
core::CPersistUtils::persist (RARE_CATEGORIES_TAG, m_RareCategories, inserter);
424
426
core::CPersistUtils::persist (CATEGORY_FREQUENCIES_TAG, m_CategoryFrequencies, inserter);
425
- core::CPersistUtils::persist (TARGET_MEAN_VALUES_TAG, m_TargetMeanValues, inserter);
427
+ core::CPersistUtils::persist (MEAN_CATEGORY_FREQUENCIES_TAG,
428
+ m_MeanCategoryFrequencies, inserter);
429
+ core::CPersistUtils::persist (CATEGORY_TARGET_MEAN_VALUES_TAG,
430
+ m_CategoryTargetMeanValues, inserter);
431
+ core::CPersistUtils::persist (MEAN_CATEGORY_TARGET_MEAN_VALUES_TAG,
432
+ m_MeanCategoryTargetMeanValues, inserter);
426
433
core::CPersistUtils::persist (FEATURE_VECTOR_MICS_TAG, m_FeatureVectorMics, inserter);
427
434
core::CPersistUtils::persist (FEATURE_VECTOR_COLUMN_MAP_TAG,
428
435
m_FeatureVectorColumnMap, inserter);
@@ -450,8 +457,15 @@ bool CDataFrameCategoryEncoder::acceptRestoreTraverser(core::CStateRestoreTraver
450
457
RESTORE (CATEGORY_FREQUENCIES_TAG,
451
458
core::CPersistUtils::restore (CATEGORY_FREQUENCIES_TAG,
452
459
m_CategoryFrequencies, traverser))
453
- RESTORE (TARGET_MEAN_VALUES_TAG,
454
- core::CPersistUtils::restore (TARGET_MEAN_VALUES_TAG, m_TargetMeanValues, traverser))
460
+ RESTORE (MEAN_CATEGORY_FREQUENCIES_TAG,
461
+ core::CPersistUtils::restore (MEAN_CATEGORY_FREQUENCIES_TAG,
462
+ m_MeanCategoryFrequencies, traverser))
463
+ RESTORE (CATEGORY_TARGET_MEAN_VALUES_TAG,
464
+ core::CPersistUtils::restore (CATEGORY_TARGET_MEAN_VALUES_TAG,
465
+ m_CategoryTargetMeanValues, traverser))
466
+ RESTORE (MEAN_CATEGORY_TARGET_MEAN_VALUES_TAG,
467
+ core::CPersistUtils::restore (MEAN_CATEGORY_TARGET_MEAN_VALUES_TAG,
468
+ m_MeanCategoryTargetMeanValues, traverser))
455
469
RESTORE (FEATURE_VECTOR_MICS_TAG,
456
470
core::CPersistUtils::restore (FEATURE_VECTOR_MICS_TAG,
457
471
m_FeatureVectorMics, traverser))
@@ -483,7 +497,7 @@ CDataFrameCategoryEncoder::mics(std::size_t numberThreads,
483
497
encoderFactories[E_TargetMean] = std::make_pair (
484
498
[this ](std::size_t column, std::size_t sampleColumn, std::size_t ) {
485
499
return std::make_unique<CDataFrameUtils::CTargetMeanCategoricalColumnValue>(
486
- sampleColumn, m_RareCategories[column], m_TargetMeanValues [column]);
500
+ sampleColumn, m_RareCategories[column], m_CategoryTargetMeanValues [column]);
487
501
},
488
502
0.0 );
489
503
encoderFactories[E_Frequency] = std::make_pair (
@@ -531,8 +545,13 @@ void CDataFrameCategoryEncoder::setupFrequencyEncoding(std::size_t numberThreads
531
545
LOG_TRACE (<< " category frequencies = "
532
546
<< core::CContainerPrinter::print (m_CategoryFrequencies));
533
547
534
- m_RareCategories.resize (frame.numberColumns ());
548
+ m_MeanCategoryFrequencies.resize (m_CategoryFrequencies.size ());
549
+ m_RareCategories.resize (m_CategoryFrequencies.size ());
535
550
for (std::size_t i = 0 ; i < m_CategoryFrequencies.size (); ++i) {
551
+ m_MeanCategoryFrequencies[i] =
552
+ m_CategoryFrequencies[i].empty ()
553
+ ? 1.0
554
+ : 1.0 / static_cast <double >(m_CategoryFrequencies[i].size ());
536
555
for (std::size_t j = 0 ; j < m_CategoryFrequencies[i].size (); ++j) {
537
556
std::size_t count{static_cast <std::size_t >(
538
557
m_CategoryFrequencies[i][j] * static_cast <double >(frame.numberRows ()) + 0.5 )};
@@ -541,6 +560,8 @@ void CDataFrameCategoryEncoder::setupFrequencyEncoding(std::size_t numberThreads
541
560
}
542
561
}
543
562
}
563
+ LOG_TRACE (<< " mean category frequencies = "
564
+ << core::CContainerPrinter::print (m_MeanCategoryFrequencies));
544
565
LOG_TRACE (<< " rare categories = " << core::CContainerPrinter::print (m_RareCategories));
545
566
}
546
567
@@ -550,11 +571,21 @@ void CDataFrameCategoryEncoder::setupTargetMeanValueEncoding(std::size_t numberT
550
571
const TSizeVec& categoricalColumnMask,
551
572
std::size_t targetColumn) {
552
573
553
- m_TargetMeanValues = CDataFrameUtils::meanValueOfTargetForCategories (
574
+ m_CategoryTargetMeanValues = CDataFrameUtils::meanValueOfTargetForCategories (
554
575
CDataFrameUtils::CMetricColumnValue{targetColumn}, numberThreads, frame,
555
576
rowMask, categoricalColumnMask);
556
- LOG_TRACE (<< " target mean values = "
557
- << core::CContainerPrinter::print (m_TargetMeanValues));
577
+ LOG_TRACE (<< " category target mean values = "
578
+ << core::CContainerPrinter::print (m_CategoryTargetMeanValues));
579
+
580
+ m_MeanCategoryTargetMeanValues.resize (m_CategoryTargetMeanValues.size ());
581
+ for (std::size_t i = 0 ; i < m_CategoryTargetMeanValues.size (); ++i) {
582
+ m_MeanCategoryTargetMeanValues[i] =
583
+ m_CategoryTargetMeanValues[i].empty ()
584
+ ? 0.0
585
+ : CBasicStatistics::mean (m_CategoryTargetMeanValues[i]);
586
+ }
587
+ LOG_TRACE (<< " mean category target mean values = "
588
+ << core::CContainerPrinter::print (m_MeanCategoryTargetMeanValues));
558
589
}
559
590
560
591
CDataFrameCategoryEncoder::TSizeSizePrDoubleMap
@@ -654,9 +685,9 @@ CDataFrameCategoryEncoder::selectFeatures(std::size_t numberThreads,
654
685
metricColumnMask.end (), feature));
655
686
} // else if (selected.isTargetMean()) { nothing to do }
656
687
657
- auto columnValue = selected.columnValue (m_RareCategories[feature],
658
- m_CategoryFrequencies[feature],
659
- m_TargetMeanValues [feature]);
688
+ auto columnValue = selected.columnValue (
689
+ m_RareCategories[feature], m_CategoryFrequencies[feature],
690
+ m_CategoryTargetMeanValues [feature]);
660
691
mics = this ->mics (numberThreads, frame, *columnValue, rowMask,
661
692
metricColumnMask, categoricalColumnMask);
662
693
search.update (mics);
@@ -679,6 +710,38 @@ CDataFrameCategoryEncoder::selectFeatures(std::size_t numberThreads,
679
710
void CDataFrameCategoryEncoder::finishEncoding (std::size_t targetColumn,
680
711
TSizeSizePrDoubleMap selectedFeatureMics) {
681
712
713
+ using TMeanAccumulator = CBasicStatistics::SSampleMean<double >::TAccumulator;
714
+
715
+ // Update the frequency and target mean encoding for one-hot and rare categories.
716
+
717
+ for (std::size_t i = 0 ; i < m_OneHotEncodedCategories.size (); ++i) {
718
+ TMeanAccumulator meanCategoryFrequency;
719
+ TMeanAccumulator meanCategoryTargetMeanValue;
720
+ for (auto category : m_OneHotEncodedCategories[i]) {
721
+ double frequency{m_CategoryFrequencies[i][category]};
722
+ double mean{m_CategoryTargetMeanValues[i][category]};
723
+ meanCategoryFrequency.add (frequency, frequency);
724
+ meanCategoryTargetMeanValue.add (mean, frequency);
725
+ }
726
+ for (auto category : m_OneHotEncodedCategories[i]) {
727
+ m_CategoryFrequencies[i][category] = CBasicStatistics::mean (meanCategoryFrequency);
728
+ m_CategoryTargetMeanValues[i][category] =
729
+ CBasicStatistics::mean (meanCategoryTargetMeanValue);
730
+ }
731
+ }
732
+ for (std::size_t i = 0 ; i < m_RareCategories.size (); ++i) {
733
+ TMeanAccumulator meanCategoryTargetMeanValue;
734
+ for (auto category : m_RareCategories[i]) {
735
+ double frequency{m_CategoryFrequencies[i][category]};
736
+ double mean{m_CategoryTargetMeanValues[i][category]};
737
+ meanCategoryTargetMeanValue.add (mean, frequency);
738
+ }
739
+ for (auto category : m_RareCategories[i]) {
740
+ m_CategoryTargetMeanValues[i][category] =
741
+ CBasicStatistics::mean (meanCategoryTargetMeanValue);
742
+ }
743
+ }
744
+
682
745
// Fill in a mapping from encoded column indices to raw column indices.
683
746
684
747
selectedFeatureMics[{targetColumn, CATEGORY_FOR_DEPENDENT_VARIABLE}] = 0.0 ;
0 commit comments