Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

tfidf transformer #172

Merged
merged 3 commits into from
Oct 25, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 19 additions & 3 deletions include/meta/learn/dataset.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ class dataset
public:
using instance_type = instance;
using const_iterator = std::vector<instance_type>::const_iterator;
using iterator = const_iterator;
using iterator = std::vector<instance_type>::iterator;
using size_type = std::vector<instance_type>::size_type;

/**
Expand Down Expand Up @@ -120,15 +120,31 @@ class dataset
/**
* @return an iterator to the first instance
*/
iterator begin() const
const_iterator begin() const
{
return instances_.begin();
}

/**
* @return an iterator to the first instance
*/
iterator begin()
{
return instances_.begin();
}

/**
* @return an iterator to one past the end of the dataset
*/
const_iterator end() const
{
return instances_.end();
}

/**
* @return an iterator to one past the end of the dataset
*/
iterator end() const
iterator end()
{
return instances_.end();
}
Expand Down
2 changes: 1 addition & 1 deletion include/meta/learn/instance.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ struct instance
/// the id within the dataset that contains this instance
instance_id id;
/// the weights of the features in this instance
const feature_vector weights;
feature_vector weights;
};
}
}
Expand Down
161 changes: 161 additions & 0 deletions include/meta/learn/transform.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
/**
* @file transform.h
* @author Chase Geigle
*
* All files in META are released under the MIT license. For more details,
* consult the file LICENSE in the root of the project.
*/

#ifndef META_LEARN_TRANSFORM_H_
#define META_LEARN_TRANSFORM_H_

#include "meta/index/ranker/ranker.h"
#include "meta/index/score_data.h"
#include "meta/learn/dataset.h"

namespace meta
{
namespace learn
{

/**
 * Transformer for converting term frequency vectors into tf-idf weight
 * vectors. This transformation is performed with respect to a specific
 * index::inverted_index that defines the term statistics, and with respect
 * to an index::ranker that defines the "tf-idf" weight (via its
 * score_one() function).
 *
 * For example, one can construct a tfidf_transformer with an
 * inverted index and an okapi_bm25 ranker to get tf-idf vectors using
 * Okapi BM25's definitions of tf and idf.
 *
 * Some caveats to be aware of:
 *
 * 1. if your ranker uses extra information that isn't present in score_data
 *    (e.g. by using score_data.d_id and querying something), this will only
 *    work if your instance ids directly correspond to doc ids in the
 *    inverted index
 *
 * 2. tf-idf values are computed using statistics from the inverted_index.
 *    If this index contains your test set, the statistics are going to be
 *    computed including documents in your test set. If this is
 *    undesirable, create an inverted_index on just your training data and
 *    use that instead of one created on both the training and testing
 *    data.
 *
 * 3. This transformation only makes sense if your instances' weight
 *    vectors are actually term frequency vectors. If they aren't, the
 *    assumption here that every entry in every weight vector can be
 *    safely converted to an integral value without rounding is violated.
 */
class tfidf_transformer
{
  public:
    /**
     * @param idx The index to use for term statistics
     * @param r The ranker to use for defining the weights
     */
    tfidf_transformer(index::inverted_index& idx, index::ranker& r)
        : idx_(idx),
          rnk_(r),
          sdata_(idx, idx.avg_doc_length(), idx.num_docs(),
                 idx.total_corpus_terms(), 1)
    {
        // each term is scored as if it were a one-term query of weight 1
        sdata_.query_term_weight = 1.0f;
    }

    /**
     * Replaces each raw term frequency in inst's weight vector with the
     * ranker's score_one() value for that (document, term) pair, in place.
     *
     * @param inst The instance to transform
     */
    void operator()(learn::instance& inst)
    {
        // NOTE(review): assumes inst.id maps directly onto a doc_id in the
        // index (caveat 1 above) -- confirm for datasets whose ids differ
        sdata_.d_id = doc_id{inst.id};
        // document length = sum of all term frequencies in the vector
        sdata_.doc_size = static_cast<uint64_t>(std::accumulate(
            inst.weights.begin(), inst.weights.end(), 0.0,
            [](double accum, const std::pair<feature_id, double>& val) {
                return accum + val.second;
            }));
        sdata_.doc_unique_terms = inst.weights.size();
        for (auto& pr : inst.weights)
        {
            // fill in the per-term statistics the ranker needs...
            sdata_.t_id = term_id{pr.first};
            sdata_.doc_count = idx_.doc_freq(sdata_.t_id);
            sdata_.corpus_term_count = idx_.total_num_occurences(sdata_.t_id);
            // truncation is only lossless for integral tfs (caveat 3)
            sdata_.doc_term_count = static_cast<uint64_t>(pr.second);

            // ...then overwrite the tf with the ranker-defined weight
            pr.second = rnk_.score_one(sdata_);
        }
    }

  private:
    index::inverted_index& idx_;
    index::ranker& rnk_;
    index::score_data sdata_;
};

/**
 * Transformer to normalize all weight vectors to unit length (L2 norm).
 */
class l2norm_transformer
{
  public:
    /**
     * Scales the feature weights of inst in place so that the vector has
     * unit L2 norm. Vectors with zero norm (e.g. all-zero weights) are
     * left unchanged rather than being filled with NaN/inf.
     *
     * @param inst The instance whose weight vector is normalized
     */
    void operator()(learn::instance& inst) const
    {
        auto norm = std::sqrt(std::accumulate(
            inst.weights.begin(), inst.weights.end(), 0.0,
            [](double accum, const std::pair<feature_id, double>& val) {
                return accum + val.second * val.second;
            }));
        // guard against division by zero for degenerate (all-zero) vectors
        if (norm == 0.0)
            return;
        for (auto& pr : inst.weights)
            pr.second /= norm;
    }
};

/**
 * Transforms the feature vectors of a dataset **in place** using the given
 * transformation function. TransformFunction must have an operator() that
 * takes a learn::instance by mutable reference and changes its
 * feature values in-place. For example, a simple TransformFunction might
 * be one that normalizes all of the feature vectors to be unit length.
 *
 * @param dset The dataset to be transformed
 * @param trans The transformation function to be applied to all
 * feature_vectors in dset
 */
template <class TransformFunction>
void transform(dataset& dset, TransformFunction&& trans)
{
    // apply the functor to every instance; mutation happens in place
    for (auto it = dset.begin(); it != dset.end(); ++it)
        trans(*it);
}

/**
 * Transforms the feature vectors of a dataset **in place** to be tf-idf
 * features using the given index for term statistics and ranker for
 * tf-idf weight definitions.
 *
 * @param dset The dataset to be transformed
 * @param idx The inverted_index to use for term statistics like df
 * @param rnk The ranker to use to define tf-idf weights (via its
 * score_one())
 */
// inline: non-template function defined in a header must be inline to
// avoid ODR violations when this header is included in multiple TUs
inline void tfidf_transform(dataset& dset, index::inverted_index& idx,
                            index::ranker& rnk)
{
    tfidf_transformer transformer{idx, rnk};
    transform(dset, transformer);
}

/**
 * Transforms the feature vectors of a dataset **in place** to be unit
 * length according to their L2 norm.
 *
 * @param dset The dataset to be transformed
 */
// inline: non-template function defined in a header must be inline to
// avoid ODR violations when this header is included in multiple TUs
inline void l2norm_transform(dataset& dset)
{
    transform(dset, l2norm_transformer{});
}
}
}
#endif
10 changes: 2 additions & 8 deletions src/index/inverted_index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -312,13 +312,7 @@ uint64_t inverted_index::total_corpus_terms()

uint64_t inverted_index::total_num_occurences(term_id t_id) const
{
auto pdata = search_primary(t_id);

double sum = 0;
for (auto& c : pdata->counts())
sum += c.second;

return static_cast<uint64_t>(sum);
return stream_for(t_id)->total_counts();
}

float inverted_index::avg_doc_length()
Expand All @@ -334,7 +328,7 @@ inverted_index::tokenize(const corpus::document& doc)

uint64_t inverted_index::doc_freq(term_id t_id) const
{
return search_primary(t_id)->counts().size();
return stream_for(t_id)->size();
}

auto inverted_index::search_primary(term_id t_id) const
Expand Down
76 changes: 76 additions & 0 deletions tests/dataset_transform_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
/**
* @file dataset_transform_test.cpp
* @author Chase Geigle
*/

#include "bandit/bandit.h"
#include "create_config.h"
#include "meta/classify/multiclass_dataset.h"
#include "meta/index/ranker/okapi_bm25.h"
#include "meta/learn/transform.h"

using namespace bandit;
using namespace meta;

go_bandit([]() {
    describe("[learn] dataset l2 transformer", []() {
        it("should normalize feature vectors to unit length", []() {

            // build two sparse feature vectors with hand-picked weights
            std::vector<learn::feature_vector> vectors(2);

            vectors[0].emplace_back(0_tid, 12);
            vectors[0].emplace_back(1_tid, 10);
            vectors[0].emplace_back(2_tid, 5);

            vectors[1].emplace_back(1_tid, 1);
            vectors[1].emplace_back(3_tid, 4);
            vectors[1].emplace_back(5_tid, 9);

            learn::dataset dset{vectors.begin(), vectors.end(), 6};
            learn::l2norm_transform(dset);

            // after the transform, every vector's L2 norm should be 1
            for (const auto& inst : dset)
            {
                auto norm = std::sqrt(std::accumulate(
                    inst.weights.begin(), inst.weights.end(), 0.0,
                    [](double accum, const std::pair<term_id, double>& val) {
                        return accum + val.second * val.second;
                    }));
                AssertThat(norm, EqualsWithDelta(1, 1e-12));
            }
        });
    });

    describe("[learn] dataset tf-idf transformer", []() {
        it("should produce tf-idf vectors", []() {
            auto config = tests::create_config("line");
            // presumably keeps forward-index doc ids aligned with the
            // inverted index by uninverting it -- TODO confirm
            config->insert("uninvert", true);
            // remove any stale on-disk index so both are rebuilt fresh
            filesystem::remove_all("ceeaus");

            // make both indexes
            auto inv = index::make_index<index::inverted_index>(*config);
            auto fwd = index::make_index<index::forward_index>(*config);

            // convert the data into a dataset
            classify::multiclass_dataset dset{fwd};

            // make tf-idf vectors
            index::okapi_bm25 ranker;
            learn::tfidf_transform(dset, *inv, ranker);

            // check that we get the same scores for a particular word:
            // scoring a one-word query with the same ranker should match
            // the transformed weight of that term in each ranked document
            std::vector<std::pair<std::string, double>> query
                = {{"charact", 1.0}};

            auto ranking = ranker.score(*inv, query.begin(), query.end());

            auto tid = inv->get_term_id("charact");
            for (const auto& result : ranking)
            {
                const auto& weights = dset(result.d_id).weights;
                AssertThat(weights.at(tid),
                           EqualsWithDelta(result.score, 1e-10));
            }
        });
    });
});