Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

tfidf transformer #172

Merged
merged 3 commits into from
Oct 25, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 19 additions & 3 deletions include/meta/learn/dataset.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ class dataset
public:
using instance_type = instance;
using const_iterator = std::vector<instance_type>::const_iterator;
using iterator = const_iterator;
using iterator = std::vector<instance_type>::iterator;
using size_type = std::vector<instance_type>::size_type;

/**
Expand Down Expand Up @@ -120,15 +120,31 @@ class dataset
/**
* @return an iterator to the first instance
*/
iterator begin() const
const_iterator begin() const
{
return instances_.begin();
}

/**
* @return an iterator to the first instance
*/
iterator begin()
{
return instances_.begin();
}

/**
* @return an iterator to one past the end of the dataset
*/
const_iterator end() const
{
return instances_.end();
}

/**
* @return an iterator to one past the end of the dataset
*/
iterator end() const
iterator end()
{
return instances_.end();
}
Expand Down
2 changes: 1 addition & 1 deletion include/meta/learn/instance.h
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ struct instance
/// the id within the dataset that contains this instance
instance_id id;
/// the weights of the features in this instance
const feature_vector weights;
feature_vector weights;
};
}
}
Expand Down
161 changes: 161 additions & 0 deletions include/meta/learn/transform.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
/**
* @file transform.h
* @author Chase Geigle
*
* All files in META are released under the MIT license. For more details,
* consult the file LICENSE in the root of the project.
*/

#ifndef META_LEARN_TRANSFORM_H_
#define META_LEARN_TRANSFORM_H_

#include "meta/index/ranker/ranker.h"
#include "meta/index/score_data.h"
#include "meta/learn/dataset.h"

namespace meta
{
namespace learn
{

/**
 * Transformer for converting term frequency vectors into tf-idf weight
 * vectors. This transformation is performed with respect to a specific
 * index::inverted_index that defines the term statistics, and with respect
 * to an index::ranker that defines the "tf-idf" weight (via its
 * score_one() function).
 *
 * For example, one can construct a tfidf_transformer with an
 * inverted index and an okapi_bm25 ranker to get tf-idf vectors using
 * Okapi BM25's definitions of tf and idf.
 *
 * Some caveats to be aware of:
 *
 * 1. if your ranker uses extra information that isn't present in score_data
 *    (e.g. by using score_data.d_id and querying something), this will only
 *    work if your instance ids directly correspond to doc ids in the
 *    inverted index
 *
 * 2. tf-idf values are computed using statistics from the inverted_index.
 *    If this index contains your test set, the statistics are going to be
 *    computed including documents in your test set. If this is
 *    undesirable, create an inverted_index on just your training data and
 *    use that instead of one created on both the training and testing
 *    data.
 *
 * 3. This transformation only makes sense if your instances' weight
 *    vectors are actually term frequency vectors. If they aren't, the
 *    assumption here that every entry in every weight vector can be
 *    safely converted to an integral value without rounding is violated.
 */
class tfidf_transformer
{
  public:
    /**
     * @param idx The index to use for term statistics
     * @param r The ranker to use for defining the weights
     */
    tfidf_transformer(index::inverted_index& idx, index::ranker& r)
        : idx_(idx),
          rnk_(r),
          sdata_(idx, idx.avg_doc_length(), idx.num_docs(),
                 idx.total_corpus_terms(), 1)
    {
        // each term is scored as if it were a one-term query of weight 1
        sdata_.query_term_weight = 1.0f;
    }

    /**
     * Replaces each raw term frequency in inst's weight vector with the
     * ranker's score_one() value for that (document, term) pair, in place.
     *
     * @param inst The instance to transform
     */
    void operator()(learn::instance& inst)
    {
        // NOTE(review): assumes inst.id maps directly onto a doc_id in the
        // index (caveat 1 above) -- confirm for datasets whose ids differ
        sdata_.d_id = doc_id{inst.id};
        // document length = sum of all term frequencies in the vector
        sdata_.doc_size = static_cast<uint64_t>(std::accumulate(
            inst.weights.begin(), inst.weights.end(), 0.0,
            [](double accum, const std::pair<feature_id, double>& val) {
                return accum + val.second;
            }));
        sdata_.doc_unique_terms = inst.weights.size();
        for (auto& pr : inst.weights)
        {
            // fill in the per-term statistics the ranker needs...
            sdata_.t_id = term_id{pr.first};
            sdata_.doc_count = idx_.doc_freq(sdata_.t_id);
            sdata_.corpus_term_count = idx_.total_num_occurences(sdata_.t_id);
            // truncation is only lossless for integral tfs (caveat 3)
            sdata_.doc_term_count = static_cast<uint64_t>(pr.second);

            // ...then overwrite the tf with the ranker-defined weight
            pr.second = rnk_.score_one(sdata_);
        }
    }

  private:
    index::inverted_index& idx_;
    index::ranker& rnk_;
    index::score_data sdata_;
};

/**
 * Transformer to normalize all weight vectors to unit length (L2 norm).
 */
class l2norm_transformer
{
  public:
    /**
     * Scales the feature weights of inst in place so that the vector has
     * unit L2 norm. Vectors with zero norm (e.g. all-zero weights) are
     * left unchanged rather than being filled with NaN/inf.
     *
     * @param inst The instance whose weight vector is normalized
     */
    void operator()(learn::instance& inst) const
    {
        auto norm = std::sqrt(std::accumulate(
            inst.weights.begin(), inst.weights.end(), 0.0,
            [](double accum, const std::pair<feature_id, double>& val) {
                return accum + val.second * val.second;
            }));
        // guard against division by zero for degenerate (all-zero) vectors
        if (norm == 0.0)
            return;
        for (auto& pr : inst.weights)
            pr.second /= norm;
    }
};

/**
 * Transforms the feature vectors of a dataset **in place** using the given
 * transformation function. TransformFunction must have an operator() that
 * takes a learn::instance by mutable reference and changes its
 * feature values in-place. For example, a simple TransformFunction might
 * be one that normalizes all of the feature vectors to be unit length.
 *
 * @param dset The dataset to be transformed
 * @param trans The transformation function to be applied to all
 * feature_vectors in dset
 */
template <class TransformFunction>
void transform(dataset& dset, TransformFunction&& trans)
{
    // apply the functor to every instance; mutation happens in place
    for (auto it = dset.begin(); it != dset.end(); ++it)
        trans(*it);
}

/**
 * Transforms the feature vectors of a dataset **in place** to be tf-idf
 * features using the given index for term statistics and ranker for
 * tf-idf weight definitions.
 *
 * @param dset The dataset to be transformed
 * @param idx The inverted_index to use for term statistics like df
 * @param rnk The ranker to use to define tf-idf weights (via its
 * score_one())
 */
// inline: non-template function defined in a header must be inline to
// avoid ODR violations when this header is included in multiple TUs
inline void tfidf_transform(dataset& dset, index::inverted_index& idx,
                            index::ranker& rnk)
{
    tfidf_transformer transformer{idx, rnk};
    transform(dset, transformer);
}

/**
 * Transforms the feature vectors of a dataset **in place** to be unit
 * length according to their L2 norm.
 *
 * @param dset The dataset to be transformed
 */
// inline: non-template function defined in a header must be inline to
// avoid ODR violations when this header is included in multiple TUs
inline void l2norm_transform(dataset& dset)
{
    transform(dset, l2norm_transformer{});
}
}
}
#endif
10 changes: 2 additions & 8 deletions src/index/inverted_index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -312,13 +312,7 @@ uint64_t inverted_index::total_corpus_terms()

uint64_t inverted_index::total_num_occurences(term_id t_id) const
{
auto pdata = search_primary(t_id);

double sum = 0;
for (auto& c : pdata->counts())
sum += c.second;

return static_cast<uint64_t>(sum);
return stream_for(t_id)->total_counts();
}

float inverted_index::avg_doc_length()
Expand All @@ -334,7 +328,7 @@ inverted_index::tokenize(const corpus::document& doc)

uint64_t inverted_index::doc_freq(term_id t_id) const
{
return search_primary(t_id)->counts().size();
return stream_for(t_id)->size();
}

auto inverted_index::search_primary(term_id t_id) const
Expand Down
76 changes: 76 additions & 0 deletions tests/dataset_transform_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
/**
* @file dataset_transform_test.cpp
* @author Chase Geigle
*/

#include "bandit/bandit.h"
#include "create_config.h"
#include "meta/classify/multiclass_dataset.h"
#include "meta/index/ranker/okapi_bm25.h"
#include "meta/learn/transform.h"

using namespace bandit;
using namespace meta;

go_bandit([]() {
    describe("[learn] dataset l2 transformer", []() {
        it("should normalize feature vectors to unit length", []() {

            // build two sparse feature vectors with hand-picked weights
            std::vector<learn::feature_vector> vectors(2);

            vectors[0].emplace_back(0_tid, 12);
            vectors[0].emplace_back(1_tid, 10);
            vectors[0].emplace_back(2_tid, 5);

            vectors[1].emplace_back(1_tid, 1);
            vectors[1].emplace_back(3_tid, 4);
            vectors[1].emplace_back(5_tid, 9);

            learn::dataset dset{vectors.begin(), vectors.end(), 6};
            learn::l2norm_transform(dset);

            // after the transform, every vector's L2 norm should be 1
            for (const auto& inst : dset)
            {
                auto norm = std::sqrt(std::accumulate(
                    inst.weights.begin(), inst.weights.end(), 0.0,
                    [](double accum, const std::pair<term_id, double>& val) {
                        return accum + val.second * val.second;
                    }));
                AssertThat(norm, EqualsWithDelta(1, 1e-12));
            }
        });
    });

    describe("[learn] dataset tf-idf transformer", []() {
        it("should produce tf-idf vectors", []() {
            auto config = tests::create_config("line");
            // presumably keeps forward-index doc ids aligned with the
            // inverted index by uninverting it -- TODO confirm
            config->insert("uninvert", true);
            // remove any stale on-disk index so both are rebuilt fresh
            filesystem::remove_all("ceeaus");

            // make both indexes
            auto inv = index::make_index<index::inverted_index>(*config);
            auto fwd = index::make_index<index::forward_index>(*config);

            // convert the data into a dataset
            classify::multiclass_dataset dset{fwd};

            // make tf-idf vectors
            index::okapi_bm25 ranker;
            learn::tfidf_transform(dset, *inv, ranker);

            // check that we get the same scores for a particular word:
            // scoring a one-word query with the same ranker should match
            // the transformed weight of that term in each ranked document
            std::vector<std::pair<std::string, double>> query
                = {{"charact", 1.0}};

            auto ranking = ranker.score(*inv, query.begin(), query.end());

            auto tid = inv->get_term_id("charact");
            for (const auto& result : ranking)
            {
                const auto& weights = dset(result.d_id).weights;
                AssertThat(weights.at(tid),
                           EqualsWithDelta(result.score, 1e-10));
            }
        });
    });
});