4 changes: 4 additions & 0 deletions docs/CHANGELOG.asciidoc
@@ -55,6 +55,10 @@
 * Make ML native processes work with glibc 2.35 (required for Ubuntu 22.04). (See
 {ml-pull}2272[#2272].)
 
+=== Bug Fixes
+
+* Adjacency weighting fixes in categorization. (See {ml-pull}2277[#2277].)
+
 == {es} version 8.2.1
 
 === Bug Fixes
39 changes: 33 additions & 6 deletions include/core/CWordDictionary.h
@@ -11,11 +11,11 @@
 #ifndef INCLUDED_ml_core_CWordDictionary_h
 #define INCLUDED_ml_core_CWordDictionary_h
 
-#include <core/CNonCopyable.h>
 #include <core/ImportExport.h>
 
 #include <boost/unordered_map.hpp>
 
+#include <algorithm>
 #include <string>
 
 namespace ml {
@@ -50,7 +50,7 @@ namespace core {
 //! too to avoid repeated locking in the instance() method (see
 //! Modern C++ Design by Andrei Alexandrescu for details).
 //!
-class CORE_EXPORT CWordDictionary : private CNonCopyable {
+class CORE_EXPORT CWordDictionary {
 public:
     //! Types of words.
     //! The values used are deliberately powers of two so that in the
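For readers unfamiliar with the pattern the comment above refers to, here is a generic sketch of a function-local-static singleton (an illustration of the idiom only, not the actual CWordDictionary implementation):

#include <string>
#include <unordered_set>

// Generic sketch of the singleton-on-first-use pattern described above: the
// function-local static is constructed exactly once, on first call, and since
// C++11 that initialisation is thread-safe without locking on every access.
class CDictionarySketch {
public:
    static const CDictionarySketch& instance() {
        static const CDictionarySketch dictionary; // constructed on first use
        return dictionary;
    }

    bool isInDictionary(const std::string& word) const {
        return m_Words.count(word) > 0;
    }

    CDictionarySketch(const CDictionarySketch&) = delete;
    CDictionarySketch& operator=(const CDictionarySketch&) = delete;

private:
    CDictionarySketch() : m_Words{"hello", "world"} {} // word list would load here
    std::unordered_set<std::string> m_Words;
};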
@@ -84,6 +84,10 @@ class CORE_EXPORT CWordDictionary : private CNonCopyable {
         void reset() {
             // NO-OP
         }
+
+        std::size_t minMatchingWeight(std::size_t weight) { return weight; }
+
+        std::size_t maxMatchingWeight(std::size_t weight) { return weight; }
     };
 
     using TWeightAll2 = CWeightAll<2>;
@@ -103,6 +107,10 @@ class CORE_EXPORT CWordDictionary : private CNonCopyable {
         void reset() {
             // NO-OP
         }
+
+        std::size_t minMatchingWeight(std::size_t weight) { return weight; }
+
+        std::size_t maxMatchingWeight(std::size_t weight) { return weight; }
     };
 
     using TWeightVerbs5Other2 = CWeightOnePart<E_Verb, 5, 2>;
@@ -120,17 +128,28 @@
             }
 
             std::size_t weight = (partOfSpeech == SPECIAL_PART1) ? EXTRA_WEIGHT1 : DEFAULT_EXTRA_WEIGHT;
-            std::size_t boost =
-                (m_NumOfAdjacentDictionaryWords > 1 ? ADJACENT_PARTS_BOOST : 1);
+            std::size_t boost = (++m_NumOfAdjacentDictionaryWords > 2) ? ADJACENT_PARTS_BOOST
+                                                                       : 1;
             weight *= boost;
 
-            ++m_NumOfAdjacentDictionaryWords;
-
             return weight;
         }
 
         void reset() { m_NumOfAdjacentDictionaryWords = 0; }
 
+        std::size_t minMatchingWeight(std::size_t weight) {
+            return (weight <= ADJACENT_PARTS_BOOST)
+                       ? weight
+                       : (1 + (weight - 1) / ADJACENT_PARTS_BOOST);
+        }
+
+        std::size_t maxMatchingWeight(std::size_t weight) {
+            return (weight <= std::min(EXTRA_WEIGHT1, DEFAULT_EXTRA_WEIGHT) ||
+                    weight > std::max(EXTRA_WEIGHT1 + 1, DEFAULT_EXTRA_WEIGHT + 1))
+                       ? weight
+                       : (1 + (weight - 1) * ADJACENT_PARTS_BOOST);
+        }
+
     private:
         std::size_t m_NumOfAdjacentDictionaryWords = 0;
     };
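To make the intent of the new minMatchingWeight/maxMatchingWeight methods concrete, here is a self-contained sketch (an illustration, not library code) using the constants implied by the TWeightVerbs5Other2AdjacentBoost6 typedef: verbs weigh 5, other dictionary words 2, runs of three or more adjacent dictionary words are boosted 6x, and each token carries a base weight of 1 on top of its dictionary weight, so the token weights that actually occur are 1, 3, 6, 13 and 31 (matching the unit test expectations later in this diff):

#include <algorithm>
#include <cassert>
#include <cstddef>

// Constants assumed from the TWeightVerbs5Other2AdjacentBoost6 typedef.
constexpr std::size_t EXTRA_WEIGHT1 = 5;        // verbs
constexpr std::size_t DEFAULT_EXTRA_WEIGHT = 2; // other dictionary words
constexpr std::size_t ADJACENT_PARTS_BOOST = 6; // multiplier for runs of 3+

// Smallest weight the same token could have scored in a message where the
// adjacency boost did NOT apply: invert w = 1 + base * 6 back to 1 + base.
std::size_t minMatchingWeight(std::size_t weight) {
    return (weight <= ADJACENT_PARTS_BOOST)
               ? weight
               : (1 + (weight - 1) / ADJACENT_PARTS_BOOST);
}

// Largest weight the same token could have scored in a message where the
// boost DID apply: map w = 1 + base to 1 + base * 6, but only for weights
// that an unboosted dictionary word can actually take.
std::size_t maxMatchingWeight(std::size_t weight) {
    return (weight <= std::min(EXTRA_WEIGHT1, DEFAULT_EXTRA_WEIGHT) ||
            weight > std::max(EXTRA_WEIGHT1 + 1, DEFAULT_EXTRA_WEIGHT + 1))
               ? weight
               : (1 + (weight - 1) * ADJACENT_PARTS_BOOST);
}

int main() {
    // A plain dictionary word (1 + 2 = 3) could appear boosted as 1 + 12 = 13;
    // a verb (1 + 5 = 6) could appear boosted as 1 + 30 = 31, and vice versa.
    assert(maxMatchingWeight(3) == 13 && minMatchingWeight(13) == 3);
    assert(maxMatchingWeight(6) == 31 && minMatchingWeight(31) == 6);
    // Weight 1 (non-dictionary tokens) is unaffected in both directions.
    assert(minMatchingWeight(1) == 1 && maxMatchingWeight(1) == 1);
    return 0;
}

In effect these bounds answer the question: could this token's weight in another message plausibly match, given that adjacency boosting may or may not have applied there?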
@@ -155,6 +174,10 @@
         void reset() {
             // NO-OP
         }
+
+        std::size_t minMatchingWeight(std::size_t weight) { return weight; }
+
+        std::size_t maxMatchingWeight(std::size_t weight) { return weight; }
     };
 
     // Similar templates with more arguments can be added as required...
@@ -176,6 +199,10 @@
     //! aren't in the dictionary.
     EPartOfSpeech partOfSpeech(const std::string& str) const;
 
+    //! No copying
+    CWordDictionary(const CWordDictionary&) = delete;
+    CWordDictionary& operator=(const CWordDictionary&) = delete;
+
 private:
     //! Constructor for a singleton is private
     CWordDictionary();
6 changes: 0 additions & 6 deletions include/core/WindowsSafe.h
@@ -19,12 +19,6 @@
 
 #include <Windows.h>
 
-#ifdef min
-#undef min
-#endif
-#ifdef max
-#undef max
-#endif
 #ifdef TEXT
 #undef TEXT
 #endif
3 changes: 1 addition & 2 deletions include/model/CTokenListCategory.h
@@ -119,8 +119,7 @@ class MODEL_EXPORT CTokenListCategory {
                                          return testItem.first >= commonItem.first;
                                      });
         if (testIter == uniqueTokenIds.end() ||
-            testIter->first != commonItem.first ||
-            testIter->second != commonItem.second) {
+            testIter->first != commonItem.first) {
             return false;
         }
         ++testIter;
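A plausible reading of this relaxation: adjacency boosting can now legitimately give the same token different weights in different messages, so this ordered-subset test compares token IDs only and ignores per-token weights. A self-contained sketch of the resulting check (my reconstruction, not a verbatim extract of the surrounding method):

#include <algorithm>
#include <cstddef>
#include <utility>
#include <vector>

using TSizeSizePr = std::pair<std::size_t, std::size_t>; // (token ID, weight)
using TSizeSizePrVec = std::vector<TSizeSizePr>;

// True if every token ID in commonItems also occurs in uniqueTokenIds.
// Both vectors must be sorted by token ID; weights (.second) are ignored,
// since they may legitimately differ between messages.
bool containsAllIds(const TSizeSizePrVec& uniqueTokenIds,
                    const TSizeSizePrVec& commonItems) {
    auto testIter = uniqueTokenIds.begin();
    for (const auto& commonItem : commonItems) {
        testIter = std::find_if(testIter, uniqueTokenIds.end(),
                                [&commonItem](const TSizeSizePr& testItem) {
                                    return testItem.first >= commonItem.first;
                                });
        if (testIter == uniqueTokenIds.end() || testIter->first != commonItem.first) {
            return false;
        }
        ++testIter;
    }
    return true;
}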
27 changes: 20 additions & 7 deletions include/model/CTokenListDataCategorizer.h
@@ -96,7 +96,9 @@ class CTokenListDataCategorizer : public CTokenListDataCategorizerBase {
                         const std::string& str,
                         TSizeSizePrVec& tokenIds,
                         TSizeSizeMap& tokenUniqueIds,
-                        std::size_t& totalWeight) override {
+                        std::size_t& totalWeight,
+                        std::size_t& minReweightedTotalWeight,
+                        std::size_t& maxReweightedTotalWeight) override {
         tokenIds.clear();
         tokenUniqueIds.clear();
         totalWeight = 0;
@@ -128,8 +130,9 @@
                 }
             } else {
                 if (!temp.empty()) {
-                    this->considerToken(fields, nonHexPos, temp, tokenIds,
-                                        tokenUniqueIds, totalWeight);
+                    this->considerToken(fields, nonHexPos, temp, tokenIds, tokenUniqueIds,
+                                        totalWeight, minReweightedTotalWeight,
+                                        maxReweightedTotalWeight);
                     temp.clear();
                 }
 
@@ -140,7 +143,8 @@
         }
 
         if (!temp.empty()) {
-            this->considerToken(fields, nonHexPos, temp, tokenIds, tokenUniqueIds, totalWeight);
+            this->considerToken(fields, nonHexPos, temp, tokenIds, tokenUniqueIds, totalWeight,
+                                minReweightedTotalWeight, maxReweightedTotalWeight);
         }
 
         LOG_TRACE(<< str << " tokenised to " << tokenIds.size() << " tokens with total weight "
@@ -154,7 +158,9 @@
     void tokenToIdAndWeight(const std::string& token,
                             TSizeSizePrVec& tokenIds,
                             TSizeSizeMap& tokenUniqueIds,
-                            std::size_t& totalWeight) override {
+                            std::size_t& totalWeight,
+                            std::size_t& minReweightedTotalWeight,
+                            std::size_t& maxReweightedTotalWeight) override {
         TSizeSizePr idWithWeight(this->idForToken(token), 1);
 
         if (token.length() >= MIN_DICTIONARY_LENGTH) {
@@ -165,6 +171,10 @@
         tokenIds.push_back(idWithWeight);
         tokenUniqueIds[idWithWeight.first] += idWithWeight.second;
         totalWeight += idWithWeight.second;
+        minReweightedTotalWeight +=
+            m_DictionaryWeightFunc.minMatchingWeight(idWithWeight.second);
+        maxReweightedTotalWeight +=
+            m_DictionaryWeightFunc.maxMatchingWeight(idWithWeight.second);
     }
 
     void reset() override { m_DictionaryWeightFunc.reset(); }
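The new out-parameters accumulate alongside the existing total. A minimal sketch of that bookkeeping (hypothetical names; the adjacency-boost bound functions from the CWordDictionary.h sketch earlier are inlined here to keep it self-contained):

#include <cstddef>
#include <vector>

// Weight bounds as sketched for TWeightVerbs5Other2AdjacentBoost6 above.
std::size_t minMatchingWeight(std::size_t w) {
    return w <= 6 ? w : 1 + (w - 1) / 6;
}
std::size_t maxMatchingWeight(std::size_t w) {
    return (w <= 2 || w > 6) ? w : 1 + (w - 1) * 6;
}

// Sketch of the bookkeeping tokenToIdAndWeight() now performs: alongside the
// actual total, track the lowest and highest totals the same token sequence
// could have produced under different adjacency boosting.
void accumulateWeights(const std::vector<std::size_t>& tokenWeights,
                       std::size_t& totalWeight,
                       std::size_t& minReweightedTotalWeight,
                       std::size_t& maxReweightedTotalWeight) {
    for (std::size_t weight : tokenWeights) {
        totalWeight += weight;
        minReweightedTotalWeight += minMatchingWeight(weight);
        maxReweightedTotalWeight += maxMatchingWeight(weight);
    }
}
// For example, token weights {1, 3, 6} give totalWeight == 10 with a
// matching range of [10, 10 + (13 - 3) + (31 - 6)] == [10, 45].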
@@ -225,7 +235,9 @@
                        std::string& token,
                        TSizeSizePrVec& tokenIds,
                        TSizeSizeMap& tokenUniqueIds,
-                       std::size_t& totalWeight) {
+                       std::size_t& totalWeight,
+                       std::size_t& minReweightedTotalWeight,
+                       std::size_t& maxReweightedTotalWeight) {
         if (IGNORE_LEADING_DIGIT && std::isdigit(static_cast<unsigned char>(token[0]))) {
             return;
         }
@@ -262,7 +274,8 @@
             return;
         }
 
-        this->tokenToIdAndWeight(token, tokenIds, tokenUniqueIds, totalWeight);
+        this->tokenToIdAndWeight(token, tokenIds, tokenUniqueIds, totalWeight,
+                                 minReweightedTotalWeight, maxReweightedTotalWeight);
     }
 
 private:
12 changes: 9 additions & 3 deletions include/model/CTokenListDataCategorizerBase.h
@@ -219,14 +219,18 @@ class MODEL_EXPORT CTokenListDataCategorizerBase : public CDataCategorizer {
                                 const std::string& str,
                                 TSizeSizePrVec& tokenIds,
                                 TSizeSizeMap& tokenUniqueIds,
-                                std::size_t& totalWeight) = 0;
+                                std::size_t& totalWeight,
+                                std::size_t& minReweightedTotalWeight,
+                                std::size_t& maxReweightedTotalWeight) = 0;
 
     //! Take a string token, convert it to a numeric ID and a weighting and
     //! add these to the provided data structures.
     virtual void tokenToIdAndWeight(const std::string& token,
                                     TSizeSizePrVec& tokenIds,
                                     TSizeSizeMap& tokenUniqueIds,
-                                    std::size_t& totalWeight) = 0;
+                                    std::size_t& totalWeight,
+                                    std::size_t& minReweightedTotalWeight,
+                                    std::size_t& maxReweightedTotalWeight) = 0;
 
     virtual void reset() = 0;
 
@@ -339,7 +343,9 @@
     bool addPretokenisedTokens(const std::string& tokensCsv,
                                TSizeSizePrVec& tokenIds,
                                TSizeSizeMap& tokenUniqueIds,
-                               std::size_t& totalWeight);
+                               std::size_t& totalWeight,
+                               std::size_t& minReweightedTotalWeight,
+                               std::size_t& maxReweightedTotalWeight);
 
     //! Get the categories that will never be detected again because the
     //! specified category will always be returned instead. This overload
108 changes: 79 additions & 29 deletions lib/core/unittest/CWordDictionaryTest.cc
@@ -58,43 +58,93 @@ BOOST_AUTO_TEST_CASE(testPartOfSpeech) {
                         dict.partOfSpeech("a"));
 }
 
-BOOST_AUTO_TEST_CASE(testWeightingFunctors) {
+BOOST_AUTO_TEST_CASE(testSimpleWeightingFunctors) {
     {
         ml::core::CWordDictionary::TWeightAll2 weighter;
 
-        BOOST_REQUIRE_EQUAL(size_t(0), weighter(ml::core::CWordDictionary::E_NotInDictionary));
-        BOOST_REQUIRE_EQUAL(size_t(2), weighter(ml::core::CWordDictionary::E_UnknownPart));
-        BOOST_REQUIRE_EQUAL(size_t(2), weighter(ml::core::CWordDictionary::E_Noun));
-        BOOST_REQUIRE_EQUAL(size_t(2), weighter(ml::core::CWordDictionary::E_Plural));
-        BOOST_REQUIRE_EQUAL(size_t(2), weighter(ml::core::CWordDictionary::E_Verb));
-        BOOST_REQUIRE_EQUAL(size_t(2), weighter(ml::core::CWordDictionary::E_Adjective));
-        BOOST_REQUIRE_EQUAL(size_t(2), weighter(ml::core::CWordDictionary::E_Adverb));
-        BOOST_REQUIRE_EQUAL(size_t(2), weighter(ml::core::CWordDictionary::E_Conjunction));
-        BOOST_REQUIRE_EQUAL(size_t(2), weighter(ml::core::CWordDictionary::E_Preposition));
-        BOOST_REQUIRE_EQUAL(size_t(2), weighter(ml::core::CWordDictionary::E_Interjection));
-        BOOST_REQUIRE_EQUAL(size_t(2), weighter(ml::core::CWordDictionary::E_Pronoun));
-        BOOST_REQUIRE_EQUAL(size_t(2), weighter(ml::core::CWordDictionary::E_DefiniteArticle));
-        BOOST_REQUIRE_EQUAL(size_t(2), weighter(ml::core::CWordDictionary::E_IndefiniteArticle));
+        BOOST_REQUIRE_EQUAL(0, weighter(ml::core::CWordDictionary::E_NotInDictionary));
+        BOOST_REQUIRE_EQUAL(2, weighter(ml::core::CWordDictionary::E_UnknownPart));
+        BOOST_REQUIRE_EQUAL(2, weighter(ml::core::CWordDictionary::E_Noun));
+        BOOST_REQUIRE_EQUAL(2, weighter(ml::core::CWordDictionary::E_Plural));
+        BOOST_REQUIRE_EQUAL(2, weighter(ml::core::CWordDictionary::E_Verb));
+        BOOST_REQUIRE_EQUAL(2, weighter(ml::core::CWordDictionary::E_Adjective));
+        weighter.reset(); // should make no difference
+        BOOST_REQUIRE_EQUAL(2, weighter(ml::core::CWordDictionary::E_Adverb));
+        BOOST_REQUIRE_EQUAL(2, weighter(ml::core::CWordDictionary::E_Conjunction));
+        BOOST_REQUIRE_EQUAL(2, weighter(ml::core::CWordDictionary::E_Preposition));
+        BOOST_REQUIRE_EQUAL(2, weighter(ml::core::CWordDictionary::E_Interjection));
+        BOOST_REQUIRE_EQUAL(2, weighter(ml::core::CWordDictionary::E_Pronoun));
+        BOOST_REQUIRE_EQUAL(2, weighter(ml::core::CWordDictionary::E_DefiniteArticle));
+        BOOST_REQUIRE_EQUAL(2, weighter(ml::core::CWordDictionary::E_IndefiniteArticle));
+        // Any given token always gives the same weight, so min/max matching
+        // should always be the same as the original
+        for (std::size_t weight = 1; weight < 10; ++weight) {
+            BOOST_REQUIRE_EQUAL(weight, weighter.minMatchingWeight(weight));
+            BOOST_REQUIRE_EQUAL(weight, weighter.maxMatchingWeight(weight));
+        }
     }
     {
         ml::core::CWordDictionary::TWeightVerbs5Other2 weighter;
 
-        BOOST_REQUIRE_EQUAL(size_t(0), weighter(ml::core::CWordDictionary::E_NotInDictionary));
-        BOOST_REQUIRE_EQUAL(size_t(2), weighter(ml::core::CWordDictionary::E_UnknownPart));
-        BOOST_REQUIRE_EQUAL(size_t(2), weighter(ml::core::CWordDictionary::E_Noun));
-        BOOST_REQUIRE_EQUAL(size_t(2), weighter(ml::core::CWordDictionary::E_Plural));
-        BOOST_REQUIRE_EQUAL(size_t(5), weighter(ml::core::CWordDictionary::E_Verb));
-        BOOST_REQUIRE_EQUAL(size_t(2), weighter(ml::core::CWordDictionary::E_Adjective));
-        BOOST_REQUIRE_EQUAL(size_t(2), weighter(ml::core::CWordDictionary::E_Adverb));
-        BOOST_REQUIRE_EQUAL(size_t(2), weighter(ml::core::CWordDictionary::E_Conjunction));
-        BOOST_REQUIRE_EQUAL(size_t(2), weighter(ml::core::CWordDictionary::E_Preposition));
-        BOOST_REQUIRE_EQUAL(size_t(2), weighter(ml::core::CWordDictionary::E_Interjection));
-        BOOST_REQUIRE_EQUAL(size_t(2), weighter(ml::core::CWordDictionary::E_Pronoun));
-        BOOST_REQUIRE_EQUAL(size_t(2), weighter(ml::core::CWordDictionary::E_DefiniteArticle));
-        BOOST_REQUIRE_EQUAL(size_t(2), weighter(ml::core::CWordDictionary::E_IndefiniteArticle));
+        BOOST_REQUIRE_EQUAL(0, weighter(ml::core::CWordDictionary::E_NotInDictionary));
+        BOOST_REQUIRE_EQUAL(2, weighter(ml::core::CWordDictionary::E_UnknownPart));
+        BOOST_REQUIRE_EQUAL(2, weighter(ml::core::CWordDictionary::E_Noun));
+        BOOST_REQUIRE_EQUAL(2, weighter(ml::core::CWordDictionary::E_Plural));
+        weighter.reset(); // should make no difference
+        BOOST_REQUIRE_EQUAL(5, weighter(ml::core::CWordDictionary::E_Verb));
+        BOOST_REQUIRE_EQUAL(2, weighter(ml::core::CWordDictionary::E_Adjective));
+        BOOST_REQUIRE_EQUAL(2, weighter(ml::core::CWordDictionary::E_Adverb));
+        BOOST_REQUIRE_EQUAL(2, weighter(ml::core::CWordDictionary::E_Conjunction));
+        BOOST_REQUIRE_EQUAL(2, weighter(ml::core::CWordDictionary::E_Preposition));
+        BOOST_REQUIRE_EQUAL(2, weighter(ml::core::CWordDictionary::E_Interjection));
+        BOOST_REQUIRE_EQUAL(2, weighter(ml::core::CWordDictionary::E_Pronoun));
+        BOOST_REQUIRE_EQUAL(2, weighter(ml::core::CWordDictionary::E_DefiniteArticle));
+        BOOST_REQUIRE_EQUAL(2, weighter(ml::core::CWordDictionary::E_IndefiniteArticle));
+        // Any given token always gives the same weight, so min/max matching
+        // should always be the same as the original
+        for (std::size_t weight = 1; weight < 10; ++weight) {
+            BOOST_REQUIRE_EQUAL(weight, weighter.minMatchingWeight(weight));
+            BOOST_REQUIRE_EQUAL(weight, weighter.maxMatchingWeight(weight));
+        }
     }
 }
 
+BOOST_AUTO_TEST_CASE(testAdjacencyDependentWeightingFunctor) {
+    ml::core::CWordDictionary::TWeightVerbs5Other2AdjacentBoost6 weighter;
+
+    BOOST_REQUIRE_EQUAL(0, weighter(ml::core::CWordDictionary::E_NotInDictionary));
+    BOOST_REQUIRE_EQUAL(2, weighter(ml::core::CWordDictionary::E_UnknownPart));
+    BOOST_REQUIRE_EQUAL(2, weighter(ml::core::CWordDictionary::E_Noun));
+    BOOST_REQUIRE_EQUAL(12, weighter(ml::core::CWordDictionary::E_Plural));
+    BOOST_REQUIRE_EQUAL(30, weighter(ml::core::CWordDictionary::E_Verb));
+    weighter.reset();
+    // Explicit reset stops adjacency multiplier
+    BOOST_REQUIRE_EQUAL(5, weighter(ml::core::CWordDictionary::E_Verb));
+    BOOST_REQUIRE_EQUAL(2, weighter(ml::core::CWordDictionary::E_Adjective));
+    BOOST_REQUIRE_EQUAL(12, weighter(ml::core::CWordDictionary::E_Adverb));
+    BOOST_REQUIRE_EQUAL(12, weighter(ml::core::CWordDictionary::E_Conjunction));
+    BOOST_REQUIRE_EQUAL(0, weighter(ml::core::CWordDictionary::E_NotInDictionary));
+    // Non-dictionary word stops adjacency multiplier
+    BOOST_REQUIRE_EQUAL(2, weighter(ml::core::CWordDictionary::E_Noun));
+    BOOST_REQUIRE_EQUAL(5, weighter(ml::core::CWordDictionary::E_Verb));
+    weighter.reset();
+    // Explicit reset stops adjacency multiplier
+    BOOST_REQUIRE_EQUAL(2, weighter(ml::core::CWordDictionary::E_Adjective));
+
+    // Of the possible weights, 3 could map to 13 and 6 to 31 depending on
+    // whether adjacency weighting takes place
+    BOOST_REQUIRE_EQUAL(1, weighter.minMatchingWeight(1));
+    BOOST_REQUIRE_EQUAL(1, weighter.maxMatchingWeight(1));
+    BOOST_REQUIRE_EQUAL(3, weighter.minMatchingWeight(3));
+    BOOST_REQUIRE_EQUAL(13, weighter.maxMatchingWeight(3));
+    BOOST_REQUIRE_EQUAL(6, weighter.minMatchingWeight(6));
+    BOOST_REQUIRE_EQUAL(31, weighter.maxMatchingWeight(6));
+    BOOST_REQUIRE_EQUAL(3, weighter.minMatchingWeight(13));
+    BOOST_REQUIRE_EQUAL(13, weighter.maxMatchingWeight(13));
+    BOOST_REQUIRE_EQUAL(6, weighter.minMatchingWeight(31));
+    BOOST_REQUIRE_EQUAL(31, weighter.maxMatchingWeight(31));
+}
+
 // Disabled because it doesn't assert anything
 // Can be run on an ad hoc basis if performance is of interest
 BOOST_AUTO_TEST_CASE(testPerformance, *boost::unit_test::disabled()) {
@@ -104,8 +154,8 @@
     LOG_INFO(<< "Starting word dictionary throughput test at "
              << ml::core::CTimeUtils::toTimeString(start));
 
-    static const size_t TEST_SIZE(100000);
-    for (size_t count = 0; count < TEST_SIZE; ++count) {
+    static const std::size_t TEST_SIZE(100000);
+    for (std::size_t count = 0; count < TEST_SIZE; ++count) {
         dict.isInDictionary("hello");
         dict.isInDictionary("Hello");
         dict.isInDictionary("HELLO");