4 changes: 4 additions & 0 deletions docs/CHANGELOG.asciidoc
@@ -55,6 +55,10 @@
 * Make ML native processes work with glibc 2.35 (required for Ubuntu 22.04). (See
 {ml-pull}2272[#2272].)
 
+=== Bug Fixes
+
+* Adjacency weighting fixes in categorization. (See {ml-pull}2277[#2277].)
+
 == {es} version 8.2.1
 
 === Bug Fixes
39 changes: 33 additions & 6 deletions include/core/CWordDictionary.h
@@ -11,11 +11,11 @@
 #ifndef INCLUDED_ml_core_CWordDictionary_h
 #define INCLUDED_ml_core_CWordDictionary_h
 
-#include <core/CNonCopyable.h>
 #include <core/ImportExport.h>
 
 #include <boost/unordered_map.hpp>
 
+#include <algorithm>
 #include <string>
 
 namespace ml {
@@ -50,7 +50,7 @@ namespace core {
 //! too to avoid repeated locking in the instance() method (see
 //! Modern C++ Design by Andrei Alexandrescu for details).
 //!
-class CORE_EXPORT CWordDictionary : private CNonCopyable {
+class CORE_EXPORT CWordDictionary {
 public:
     //! Types of words.
     //! The values used are deliberately powers of two so that in the
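For readers unfamiliar with the pattern the comment above refers to, here is a generic sketch of a function-local-static singleton (an illustration of the idiom only, not the actual CWordDictionary implementation):

#include <string>
#include <unordered_set>

// Generic sketch of the singleton-on-first-use pattern described above: the
// function-local static is constructed exactly once, on first call, and since
// C++11 that initialisation is thread-safe without locking on every access.
class CDictionarySketch {
public:
    static const CDictionarySketch& instance() {
        static const CDictionarySketch dictionary; // constructed on first use
        return dictionary;
    }

    bool isInDictionary(const std::string& word) const {
        return m_Words.count(word) > 0;
    }

    CDictionarySketch(const CDictionarySketch&) = delete;
    CDictionarySketch& operator=(const CDictionarySketch&) = delete;

private:
    CDictionarySketch() : m_Words{"hello", "world"} {} // word list would load here
    std::unordered_set<std::string> m_Words;
};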
@@ -84,6 +84,10 @@ class CORE_EXPORT CWordDictionary : private CNonCopyable {
         void reset() {
             // NO-OP
         }
+
+        std::size_t minMatchingWeight(std::size_t weight) { return weight; }
+
+        std::size_t maxMatchingWeight(std::size_t weight) { return weight; }
     };
 
     using TWeightAll2 = CWeightAll<2>;
@@ -103,6 +107,10 @@ class CORE_EXPORT CWordDictionary : private CNonCopyable {
         void reset() {
             // NO-OP
         }
+
+        std::size_t minMatchingWeight(std::size_t weight) { return weight; }
+
+        std::size_t maxMatchingWeight(std::size_t weight) { return weight; }
     };
 
     using TWeightVerbs5Other2 = CWeightOnePart<E_Verb, 5, 2>;
@@ -120,17 +128,28 @@
             }
 
             std::size_t weight = (partOfSpeech == SPECIAL_PART1) ? EXTRA_WEIGHT1 : DEFAULT_EXTRA_WEIGHT;
-            std::size_t boost =
-                (m_NumOfAdjacentDictionaryWords > 1 ? ADJACENT_PARTS_BOOST : 1);
+            std::size_t boost = (++m_NumOfAdjacentDictionaryWords > 2) ? ADJACENT_PARTS_BOOST
+                                                                       : 1;
             weight *= boost;
 
-            ++m_NumOfAdjacentDictionaryWords;
-
             return weight;
         }
 
         void reset() { m_NumOfAdjacentDictionaryWords = 0; }
 
+        std::size_t minMatchingWeight(std::size_t weight) {
+            return (weight <= ADJACENT_PARTS_BOOST)
+                       ? weight
+                       : (1 + (weight - 1) / ADJACENT_PARTS_BOOST);
+        }
+
+        std::size_t maxMatchingWeight(std::size_t weight) {
+            return (weight <= std::min(EXTRA_WEIGHT1, DEFAULT_EXTRA_WEIGHT) ||
+                    weight > std::max(EXTRA_WEIGHT1 + 1, DEFAULT_EXTRA_WEIGHT + 1))
+                       ? weight
+                       : (1 + (weight - 1) * ADJACENT_PARTS_BOOST);
+        }
+
     private:
         std::size_t m_NumOfAdjacentDictionaryWords = 0;
     };
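To make the intent of the new minMatchingWeight/maxMatchingWeight methods concrete, here is a self-contained sketch (an illustration, not library code) using the constants implied by the TWeightVerbs5Other2AdjacentBoost6 typedef: verbs weigh 5, other dictionary words 2, runs of three or more adjacent dictionary words are boosted 6x, and each token carries a base weight of 1 on top of its dictionary weight, so the token weights that actually occur are 1, 3, 6, 13 and 31 (matching the unit test expectations later in this diff):

#include <algorithm>
#include <cassert>
#include <cstddef>

// Constants assumed from the TWeightVerbs5Other2AdjacentBoost6 typedef.
constexpr std::size_t EXTRA_WEIGHT1 = 5;        // verbs
constexpr std::size_t DEFAULT_EXTRA_WEIGHT = 2; // other dictionary words
constexpr std::size_t ADJACENT_PARTS_BOOST = 6; // multiplier for runs of 3+

// Smallest weight the same token could have scored in a message where the
// adjacency boost did NOT apply: invert w = 1 + base * 6 back to 1 + base.
std::size_t minMatchingWeight(std::size_t weight) {
    return (weight <= ADJACENT_PARTS_BOOST)
               ? weight
               : (1 + (weight - 1) / ADJACENT_PARTS_BOOST);
}

// Largest weight the same token could have scored in a message where the
// boost DID apply: map w = 1 + base to 1 + base * 6, but only for weights
// that an unboosted dictionary word can actually take.
std::size_t maxMatchingWeight(std::size_t weight) {
    return (weight <= std::min(EXTRA_WEIGHT1, DEFAULT_EXTRA_WEIGHT) ||
            weight > std::max(EXTRA_WEIGHT1 + 1, DEFAULT_EXTRA_WEIGHT + 1))
               ? weight
               : (1 + (weight - 1) * ADJACENT_PARTS_BOOST);
}

int main() {
    // A plain dictionary word (1 + 2 = 3) could appear boosted as 1 + 12 = 13;
    // a verb (1 + 5 = 6) could appear boosted as 1 + 30 = 31, and vice versa.
    assert(maxMatchingWeight(3) == 13 && minMatchingWeight(13) == 3);
    assert(maxMatchingWeight(6) == 31 && minMatchingWeight(31) == 6);
    // Weight 1 (non-dictionary tokens) is unaffected in both directions.
    assert(minMatchingWeight(1) == 1 && maxMatchingWeight(1) == 1);
    return 0;
}

In effect these bounds answer the question: could this token's weight in another message plausibly match, given that adjacency boosting may or may not have applied there?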
@@ -155,6 +174,10 @@
         void reset() {
             // NO-OP
         }
+
+        std::size_t minMatchingWeight(std::size_t weight) { return weight; }
+
+        std::size_t maxMatchingWeight(std::size_t weight) { return weight; }
     };
 
     // Similar templates with more arguments can be added as required...
@@ -176,6 +199,10 @@
     //! aren't in the dictionary.
     EPartOfSpeech partOfSpeech(const std::string& str) const;
 
+    //! No copying
+    CWordDictionary(const CWordDictionary&) = delete;
+    CWordDictionary& operator=(const CWordDictionary&) = delete;
+
 private:
     //! Constructor for a singleton is private
     CWordDictionary();
6 changes: 0 additions & 6 deletions include/core/WindowsSafe.h
@@ -19,12 +19,6 @@
 
 #include <Windows.h>
 
-#ifdef min
-#undef min
-#endif
-#ifdef max
-#undef max
-#endif
 #ifdef TEXT
 #undef TEXT
 #endif
3 changes: 1 addition & 2 deletions include/model/CTokenListCategory.h
@@ -119,8 +119,7 @@ class MODEL_EXPORT CTokenListCategory {
                                          return testItem.first >= commonItem.first;
                                      });
         if (testIter == uniqueTokenIds.end() ||
-            testIter->first != commonItem.first ||
-            testIter->second != commonItem.second) {
+            testIter->first != commonItem.first) {
             return false;
         }
         ++testIter;
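A plausible reading of this relaxation: adjacency boosting can now legitimately give the same token different weights in different messages, so this ordered-subset test compares token IDs only and ignores per-token weights. A self-contained sketch of the resulting check (my reconstruction, not a verbatim extract of the surrounding method):

#include <algorithm>
#include <cstddef>
#include <utility>
#include <vector>

using TSizeSizePr = std::pair<std::size_t, std::size_t>; // (token ID, weight)
using TSizeSizePrVec = std::vector<TSizeSizePr>;

// True if every token ID in commonItems also occurs in uniqueTokenIds.
// Both vectors must be sorted by token ID; weights (.second) are ignored,
// since they may legitimately differ between messages.
bool containsAllIds(const TSizeSizePrVec& uniqueTokenIds,
                    const TSizeSizePrVec& commonItems) {
    auto testIter = uniqueTokenIds.begin();
    for (const auto& commonItem : commonItems) {
        testIter = std::find_if(testIter, uniqueTokenIds.end(),
                                [&commonItem](const TSizeSizePr& testItem) {
                                    return testItem.first >= commonItem.first;
                                });
        if (testIter == uniqueTokenIds.end() || testIter->first != commonItem.first) {
            return false;
        }
        ++testIter;
    }
    return true;
}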
27 changes: 20 additions & 7 deletions include/model/CTokenListDataCategorizer.h
@@ -96,7 +96,9 @@ class CTokenListDataCategorizer : public CTokenListDataCategorizerBase {
                         const std::string& str,
                         TSizeSizePrVec& tokenIds,
                         TSizeSizeMap& tokenUniqueIds,
-                        std::size_t& totalWeight) override {
+                        std::size_t& totalWeight,
+                        std::size_t& minReweightedTotalWeight,
+                        std::size_t& maxReweightedTotalWeight) override {
         tokenIds.clear();
         tokenUniqueIds.clear();
         totalWeight = 0;
@@ -128,8 +130,9 @@
                 }
             } else {
                 if (!temp.empty()) {
-                    this->considerToken(fields, nonHexPos, temp, tokenIds,
-                                        tokenUniqueIds, totalWeight);
+                    this->considerToken(fields, nonHexPos, temp, tokenIds, tokenUniqueIds,
+                                        totalWeight, minReweightedTotalWeight,
+                                        maxReweightedTotalWeight);
                     temp.clear();
                 }
 
@@ -140,7 +143,8 @@
         }
 
         if (!temp.empty()) {
-            this->considerToken(fields, nonHexPos, temp, tokenIds, tokenUniqueIds, totalWeight);
+            this->considerToken(fields, nonHexPos, temp, tokenIds, tokenUniqueIds, totalWeight,
+                                minReweightedTotalWeight, maxReweightedTotalWeight);
         }
 
         LOG_TRACE(<< str << " tokenised to " << tokenIds.size() << " tokens with total weight "
@@ -154,7 +158,9 @@
     void tokenToIdAndWeight(const std::string& token,
                             TSizeSizePrVec& tokenIds,
                             TSizeSizeMap& tokenUniqueIds,
-                            std::size_t& totalWeight) override {
+                            std::size_t& totalWeight,
+                            std::size_t& minReweightedTotalWeight,
+                            std::size_t& maxReweightedTotalWeight) override {
         TSizeSizePr idWithWeight(this->idForToken(token), 1);
 
         if (token.length() >= MIN_DICTIONARY_LENGTH) {
@@ -165,6 +171,10 @@
         tokenIds.push_back(idWithWeight);
         tokenUniqueIds[idWithWeight.first] += idWithWeight.second;
         totalWeight += idWithWeight.second;
+        minReweightedTotalWeight +=
+            m_DictionaryWeightFunc.minMatchingWeight(idWithWeight.second);
+        maxReweightedTotalWeight +=
+            m_DictionaryWeightFunc.maxMatchingWeight(idWithWeight.second);
     }
 
     void reset() override { m_DictionaryWeightFunc.reset(); }
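The new out-parameters accumulate alongside the existing total. A minimal sketch of that bookkeeping (hypothetical names; the adjacency-boost bound functions from the CWordDictionary.h sketch earlier are inlined here to keep it self-contained):

#include <cstddef>
#include <vector>

// Weight bounds as sketched for TWeightVerbs5Other2AdjacentBoost6 above.
std::size_t minMatchingWeight(std::size_t w) {
    return w <= 6 ? w : 1 + (w - 1) / 6;
}
std::size_t maxMatchingWeight(std::size_t w) {
    return (w <= 2 || w > 6) ? w : 1 + (w - 1) * 6;
}

// Sketch of the bookkeeping tokenToIdAndWeight() now performs: alongside the
// actual total, track the lowest and highest totals the same token sequence
// could have produced under different adjacency boosting.
void accumulateWeights(const std::vector<std::size_t>& tokenWeights,
                       std::size_t& totalWeight,
                       std::size_t& minReweightedTotalWeight,
                       std::size_t& maxReweightedTotalWeight) {
    for (std::size_t weight : tokenWeights) {
        totalWeight += weight;
        minReweightedTotalWeight += minMatchingWeight(weight);
        maxReweightedTotalWeight += maxMatchingWeight(weight);
    }
}
// For example, token weights {1, 3, 6} give totalWeight == 10 with a
// matching range of [10, 10 + (13 - 3) + (31 - 6)] == [10, 45].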
@@ -225,7 +235,9 @@
                        std::string& token,
                        TSizeSizePrVec& tokenIds,
                        TSizeSizeMap& tokenUniqueIds,
-                       std::size_t& totalWeight) {
+                       std::size_t& totalWeight,
+                       std::size_t& minReweightedTotalWeight,
+                       std::size_t& maxReweightedTotalWeight) {
         if (IGNORE_LEADING_DIGIT && std::isdigit(static_cast<unsigned char>(token[0]))) {
             return;
         }
@@ -262,7 +274,8 @@
             return;
         }
 
-        this->tokenToIdAndWeight(token, tokenIds, tokenUniqueIds, totalWeight);
+        this->tokenToIdAndWeight(token, tokenIds, tokenUniqueIds, totalWeight,
+                                 minReweightedTotalWeight, maxReweightedTotalWeight);
     }
 
 private:
12 changes: 9 additions & 3 deletions include/model/CTokenListDataCategorizerBase.h
@@ -219,14 +219,18 @@ class MODEL_EXPORT CTokenListDataCategorizerBase : public CDataCategorizer {
                                 const std::string& str,
                                 TSizeSizePrVec& tokenIds,
                                 TSizeSizeMap& tokenUniqueIds,
-                                std::size_t& totalWeight) = 0;
+                                std::size_t& totalWeight,
+                                std::size_t& minReweightedTotalWeight,
+                                std::size_t& maxReweightedTotalWeight) = 0;
 
     //! Take a string token, convert it to a numeric ID and a weighting and
     //! add these to the provided data structures.
     virtual void tokenToIdAndWeight(const std::string& token,
                                     TSizeSizePrVec& tokenIds,
                                     TSizeSizeMap& tokenUniqueIds,
-                                    std::size_t& totalWeight) = 0;
+                                    std::size_t& totalWeight,
+                                    std::size_t& minReweightedTotalWeight,
+                                    std::size_t& maxReweightedTotalWeight) = 0;
 
     virtual void reset() = 0;
 
@@ -339,7 +343,9 @@
     bool addPretokenisedTokens(const std::string& tokensCsv,
                                TSizeSizePrVec& tokenIds,
                                TSizeSizeMap& tokenUniqueIds,
-                               std::size_t& totalWeight);
+                               std::size_t& totalWeight,
+                               std::size_t& minReweightedTotalWeight,
+                               std::size_t& maxReweightedTotalWeight);
 
     //! Get the categories that will never be detected again because the
     //! specified category will always be returned instead. This overload
108 changes: 79 additions & 29 deletions lib/core/unittest/CWordDictionaryTest.cc
@@ -58,43 +58,93 @@ BOOST_AUTO_TEST_CASE(testPartOfSpeech) {
                         dict.partOfSpeech("a"));
 }
 
-BOOST_AUTO_TEST_CASE(testWeightingFunctors) {
+BOOST_AUTO_TEST_CASE(testSimpleWeightingFunctors) {
     {
         ml::core::CWordDictionary::TWeightAll2 weighter;
 
-        BOOST_REQUIRE_EQUAL(size_t(0), weighter(ml::core::CWordDictionary::E_NotInDictionary));
-        BOOST_REQUIRE_EQUAL(size_t(2), weighter(ml::core::CWordDictionary::E_UnknownPart));
-        BOOST_REQUIRE_EQUAL(size_t(2), weighter(ml::core::CWordDictionary::E_Noun));
-        BOOST_REQUIRE_EQUAL(size_t(2), weighter(ml::core::CWordDictionary::E_Plural));
-        BOOST_REQUIRE_EQUAL(size_t(2), weighter(ml::core::CWordDictionary::E_Verb));
-        BOOST_REQUIRE_EQUAL(size_t(2), weighter(ml::core::CWordDictionary::E_Adjective));
-        BOOST_REQUIRE_EQUAL(size_t(2), weighter(ml::core::CWordDictionary::E_Adverb));
-        BOOST_REQUIRE_EQUAL(size_t(2), weighter(ml::core::CWordDictionary::E_Conjunction));
-        BOOST_REQUIRE_EQUAL(size_t(2), weighter(ml::core::CWordDictionary::E_Preposition));
-        BOOST_REQUIRE_EQUAL(size_t(2), weighter(ml::core::CWordDictionary::E_Interjection));
-        BOOST_REQUIRE_EQUAL(size_t(2), weighter(ml::core::CWordDictionary::E_Pronoun));
-        BOOST_REQUIRE_EQUAL(size_t(2), weighter(ml::core::CWordDictionary::E_DefiniteArticle));
-        BOOST_REQUIRE_EQUAL(size_t(2), weighter(ml::core::CWordDictionary::E_IndefiniteArticle));
+        BOOST_REQUIRE_EQUAL(0, weighter(ml::core::CWordDictionary::E_NotInDictionary));
+        BOOST_REQUIRE_EQUAL(2, weighter(ml::core::CWordDictionary::E_UnknownPart));
+        BOOST_REQUIRE_EQUAL(2, weighter(ml::core::CWordDictionary::E_Noun));
+        BOOST_REQUIRE_EQUAL(2, weighter(ml::core::CWordDictionary::E_Plural));
+        BOOST_REQUIRE_EQUAL(2, weighter(ml::core::CWordDictionary::E_Verb));
+        BOOST_REQUIRE_EQUAL(2, weighter(ml::core::CWordDictionary::E_Adjective));
+        weighter.reset(); // should make no difference
+        BOOST_REQUIRE_EQUAL(2, weighter(ml::core::CWordDictionary::E_Adverb));
+        BOOST_REQUIRE_EQUAL(2, weighter(ml::core::CWordDictionary::E_Conjunction));
+        BOOST_REQUIRE_EQUAL(2, weighter(ml::core::CWordDictionary::E_Preposition));
+        BOOST_REQUIRE_EQUAL(2, weighter(ml::core::CWordDictionary::E_Interjection));
+        BOOST_REQUIRE_EQUAL(2, weighter(ml::core::CWordDictionary::E_Pronoun));
+        BOOST_REQUIRE_EQUAL(2, weighter(ml::core::CWordDictionary::E_DefiniteArticle));
+        BOOST_REQUIRE_EQUAL(2, weighter(ml::core::CWordDictionary::E_IndefiniteArticle));
+        // Any given token always gives the same weight, so min/max matching
+        // should always be the same as the original
+        for (std::size_t weight = 1; weight < 10; ++weight) {
+            BOOST_REQUIRE_EQUAL(weight, weighter.minMatchingWeight(weight));
+            BOOST_REQUIRE_EQUAL(weight, weighter.maxMatchingWeight(weight));
+        }
     }
     {
         ml::core::CWordDictionary::TWeightVerbs5Other2 weighter;
 
-        BOOST_REQUIRE_EQUAL(size_t(0), weighter(ml::core::CWordDictionary::E_NotInDictionary));
-        BOOST_REQUIRE_EQUAL(size_t(2), weighter(ml::core::CWordDictionary::E_UnknownPart));
-        BOOST_REQUIRE_EQUAL(size_t(2), weighter(ml::core::CWordDictionary::E_Noun));
-        BOOST_REQUIRE_EQUAL(size_t(2), weighter(ml::core::CWordDictionary::E_Plural));
-        BOOST_REQUIRE_EQUAL(size_t(5), weighter(ml::core::CWordDictionary::E_Verb));
-        BOOST_REQUIRE_EQUAL(size_t(2), weighter(ml::core::CWordDictionary::E_Adjective));
-        BOOST_REQUIRE_EQUAL(size_t(2), weighter(ml::core::CWordDictionary::E_Adverb));
-        BOOST_REQUIRE_EQUAL(size_t(2), weighter(ml::core::CWordDictionary::E_Conjunction));
-        BOOST_REQUIRE_EQUAL(size_t(2), weighter(ml::core::CWordDictionary::E_Preposition));
-        BOOST_REQUIRE_EQUAL(size_t(2), weighter(ml::core::CWordDictionary::E_Interjection));
-        BOOST_REQUIRE_EQUAL(size_t(2), weighter(ml::core::CWordDictionary::E_Pronoun));
-        BOOST_REQUIRE_EQUAL(size_t(2), weighter(ml::core::CWordDictionary::E_DefiniteArticle));
-        BOOST_REQUIRE_EQUAL(size_t(2), weighter(ml::core::CWordDictionary::E_IndefiniteArticle));
+        BOOST_REQUIRE_EQUAL(0, weighter(ml::core::CWordDictionary::E_NotInDictionary));
+        BOOST_REQUIRE_EQUAL(2, weighter(ml::core::CWordDictionary::E_UnknownPart));
+        BOOST_REQUIRE_EQUAL(2, weighter(ml::core::CWordDictionary::E_Noun));
+        BOOST_REQUIRE_EQUAL(2, weighter(ml::core::CWordDictionary::E_Plural));
+        weighter.reset(); // should make no difference
+        BOOST_REQUIRE_EQUAL(5, weighter(ml::core::CWordDictionary::E_Verb));
+        BOOST_REQUIRE_EQUAL(2, weighter(ml::core::CWordDictionary::E_Adjective));
+        BOOST_REQUIRE_EQUAL(2, weighter(ml::core::CWordDictionary::E_Adverb));
+        BOOST_REQUIRE_EQUAL(2, weighter(ml::core::CWordDictionary::E_Conjunction));
+        BOOST_REQUIRE_EQUAL(2, weighter(ml::core::CWordDictionary::E_Preposition));
+        BOOST_REQUIRE_EQUAL(2, weighter(ml::core::CWordDictionary::E_Interjection));
+        BOOST_REQUIRE_EQUAL(2, weighter(ml::core::CWordDictionary::E_Pronoun));
+        BOOST_REQUIRE_EQUAL(2, weighter(ml::core::CWordDictionary::E_DefiniteArticle));
+        BOOST_REQUIRE_EQUAL(2, weighter(ml::core::CWordDictionary::E_IndefiniteArticle));
+        // Any given token always gives the same weight, so min/max matching
+        // should always be the same as the original
+        for (std::size_t weight = 1; weight < 10; ++weight) {
+            BOOST_REQUIRE_EQUAL(weight, weighter.minMatchingWeight(weight));
+            BOOST_REQUIRE_EQUAL(weight, weighter.maxMatchingWeight(weight));
+        }
     }
 }
 
+BOOST_AUTO_TEST_CASE(testAdjacencyDependentWeightingFunctor) {
+    ml::core::CWordDictionary::TWeightVerbs5Other2AdjacentBoost6 weighter;
+
+    BOOST_REQUIRE_EQUAL(0, weighter(ml::core::CWordDictionary::E_NotInDictionary));
+    BOOST_REQUIRE_EQUAL(2, weighter(ml::core::CWordDictionary::E_UnknownPart));
+    BOOST_REQUIRE_EQUAL(2, weighter(ml::core::CWordDictionary::E_Noun));
+    BOOST_REQUIRE_EQUAL(12, weighter(ml::core::CWordDictionary::E_Plural));
+    BOOST_REQUIRE_EQUAL(30, weighter(ml::core::CWordDictionary::E_Verb));
+    weighter.reset();
+    // Explicit reset stops adjacency multiplier
+    BOOST_REQUIRE_EQUAL(5, weighter(ml::core::CWordDictionary::E_Verb));
+    BOOST_REQUIRE_EQUAL(2, weighter(ml::core::CWordDictionary::E_Adjective));
+    BOOST_REQUIRE_EQUAL(12, weighter(ml::core::CWordDictionary::E_Adverb));
+    BOOST_REQUIRE_EQUAL(12, weighter(ml::core::CWordDictionary::E_Conjunction));
+    BOOST_REQUIRE_EQUAL(0, weighter(ml::core::CWordDictionary::E_NotInDictionary));
+    // Non-dictionary word stops adjacency multiplier
+    BOOST_REQUIRE_EQUAL(2, weighter(ml::core::CWordDictionary::E_Noun));
+    BOOST_REQUIRE_EQUAL(5, weighter(ml::core::CWordDictionary::E_Verb));
+    weighter.reset();
+    // Explicit reset stops adjacency multiplier
+    BOOST_REQUIRE_EQUAL(2, weighter(ml::core::CWordDictionary::E_Adjective));
+
+    // Of the possible weights, 3 could map to 13 and 6 to 31 depending on
+    // whether adjacency weighting takes place
+    BOOST_REQUIRE_EQUAL(1, weighter.minMatchingWeight(1));
+    BOOST_REQUIRE_EQUAL(1, weighter.maxMatchingWeight(1));
+    BOOST_REQUIRE_EQUAL(3, weighter.minMatchingWeight(3));
+    BOOST_REQUIRE_EQUAL(13, weighter.maxMatchingWeight(3));
+    BOOST_REQUIRE_EQUAL(6, weighter.minMatchingWeight(6));
+    BOOST_REQUIRE_EQUAL(31, weighter.maxMatchingWeight(6));
+    BOOST_REQUIRE_EQUAL(3, weighter.minMatchingWeight(13));
+    BOOST_REQUIRE_EQUAL(13, weighter.maxMatchingWeight(13));
+    BOOST_REQUIRE_EQUAL(6, weighter.minMatchingWeight(31));
+    BOOST_REQUIRE_EQUAL(31, weighter.maxMatchingWeight(31));
+}
+
 // Disabled because it doesn't assert anything
 // Can be run on an ad hoc basis if performance is of interest
 BOOST_AUTO_TEST_CASE(testPerformance, *boost::unit_test::disabled()) {
@@ -104,8 +154,8 @@
     LOG_INFO(<< "Starting word dictionary throughput test at "
              << ml::core::CTimeUtils::toTimeString(start));
 
-    static const size_t TEST_SIZE(100000);
-    for (size_t count = 0; count < TEST_SIZE; ++count) {
+    static const std::size_t TEST_SIZE(100000);
+    for (std::size_t count = 0; count < TEST_SIZE; ++count) {
         dict.isInDictionary("hello");
         dict.isInDictionary("Hello");
         dict.isInDictionary("HELLO");