From fbc146a2dd58fa2aaf9051cf196e75861e090774 Mon Sep 17 00:00:00 2001 From: Nimesh Ghelani Date: Thu, 17 Oct 2019 01:54:50 +0530 Subject: [PATCH 01/14] Add basic design and tfidf featurizer --- CALEngine-painless/CMakeLists.txt | 13 + .../include/dataset/dataset-memory.h | 22 ++ CALEngine-painless/include/dataset/dataset.h | 23 ++ .../include/featurizer/featurizer.h | 29 ++ CALEngine-painless/include/featurizer/tfidf.h | 31 ++ CALEngine-painless/include/utils/logging.h | 20 ++ .../include/utils/sf-sparse-vector.h | 99 ++++++ CALEngine-painless/include/utils/text-utils.h | 67 ++++ CALEngine-painless/src/CMakeLists.txt | 21 ++ .../src/dataset/dataset-memory.cc | 29 ++ .../src/featurizer/featurizer.h | 29 ++ CALEngine-painless/src/featurizer/tfidf.cc | 65 ++++ CALEngine-painless/src/main.cc | 11 + CALEngine-painless/src/utils/porter.c | 323 ++++++++++++++++++ .../src/utils/sf-sparse-vector.cc | 66 ++++ CALEngine-painless/src/utils/text-utils.cc | 66 ++++ 16 files changed, 914 insertions(+) create mode 100644 CALEngine-painless/CMakeLists.txt create mode 100644 CALEngine-painless/include/dataset/dataset-memory.h create mode 100644 CALEngine-painless/include/dataset/dataset.h create mode 100644 CALEngine-painless/include/featurizer/featurizer.h create mode 100644 CALEngine-painless/include/featurizer/tfidf.h create mode 100644 CALEngine-painless/include/utils/logging.h create mode 100644 CALEngine-painless/include/utils/sf-sparse-vector.h create mode 100644 CALEngine-painless/include/utils/text-utils.h create mode 100644 CALEngine-painless/src/CMakeLists.txt create mode 100644 CALEngine-painless/src/dataset/dataset-memory.cc create mode 100644 CALEngine-painless/src/featurizer/featurizer.h create mode 100644 CALEngine-painless/src/featurizer/tfidf.cc create mode 100644 CALEngine-painless/src/main.cc create mode 100644 CALEngine-painless/src/utils/porter.c create mode 100644 CALEngine-painless/src/utils/sf-sparse-vector.cc create mode 100644 
CALEngine-painless/src/utils/text-utils.cc
diff --git a/CALEngine-painless/CMakeLists.txt b/CALEngine-painless/CMakeLists.txt
new file mode 100644
index 0000000..6f85c48
--- /dev/null
+++ b/CALEngine-painless/CMakeLists.txt
@@ -0,0 +1,14 @@
+cmake_minimum_required(VERSION 3.0)
+project(calengine)
+
+add_compile_options(-W -Wall -Werror --std=c++14)
+
+add_subdirectory(src)
+
+# Fall back to a Debug build when no build type was requested.
+if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
+  message("Setting build type to 'Debug' as none was specified.")
+  set(CMAKE_BUILD_TYPE Debug CACHE STRING "Choose the type of build." FORCE)
+  # Set the possible values of build type for cmake-gui
+  set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release")
+endif()
diff --git a/CALEngine-painless/include/dataset/dataset-memory.h b/CALEngine-painless/include/dataset/dataset-memory.h
new file mode 100644
index 0000000..002eb65
--- /dev/null
+++ b/CALEngine-painless/include/dataset/dataset-memory.h
@@ -0,0 +1,27 @@
+#ifndef DATASET_MEMORY_H
+#define DATASET_MEMORY_H
+
+#include "dataset/dataset.h"
+
+// Dataset built from a featurizer and a file name.
+// NOTE(review): presumably keeps its data in memory, as the name suggests --
+// confirm against dataset-memory.cc.
+class DatasetMemory : public Dataset {
+
+public:
+
+    // Takes ownership of _featurizer.
+    explicit DatasetMemory(std::unique_ptr<Featurizer> _featurizer,
+                           std::string filename);
+
+    // Returns a DatasetMemory by value for the given file name.
+    static DatasetMemory create(const std::string &filename);
+
+    size_t size() override;
+
+    SfSparseVector get_features(const std::string &id) override;
+
+    SfSparseVector get_features(int id) override;
+};
+
+#endif // DATASET_MEMORY_H
diff --git a/CALEngine-painless/include/dataset/dataset.h b/CALEngine-painless/include/dataset/dataset.h
new file mode 100644
index 0000000..1482988
--- /dev/null
+++ b/CALEngine-painless/include/dataset/dataset.h
@@ -0,0 +1,34 @@
+#ifndef DATASET_H
+#define DATASET_H
+
+#include <memory>
+#include <string>
+#include <utility>
+#include "utils/sf-sparse-vector.h"
+#include "featurizer/featurizer.h"
+
+// Abstract base for datasets: reports a size and returns the SfSparseVector
+// of an item addressed by string id or by integer id.
+class Dataset {
+    std::unique_ptr<Featurizer> featurizer;
+
+public:
+    // Takes ownership of _featurizer.
+    Dataset(std::unique_ptr<Featurizer> _featurizer): featurizer(std::move(_featurizer)) {}
+
+    // Virtual so a derived dataset is destroyed correctly through a Dataset pointer.
+    virtual ~Dataset() = default;
+
+    // Declaring the destructor suppresses the implicit move operations, so
+    // default them explicitly; DatasetMemory::create returns by value.
+    Dataset(Dataset &&) = default;
+    Dataset &operator=(Dataset &&) = default;
+
+    virtual size_t size() = 0;
+
+    virtual SfSparseVector get_features(const std::string &id) = 0;
+
+    virtual SfSparseVector get_features(int id) = 0;
+};
+
+#endif // DATASET_H
diff --git a/CALEngine-painless/include/featurizer/featurizer.h b/CALEngine-painless/include/featurizer/featurizer.h
new file mode 100644
index 0000000..5ec65c0
--- /dev/null
+++ b/CALEngine-painless/include/featurizer/featurizer.h
@@ -0,0 +1,31 @@
+#ifndef FEATURIZER_H
+#define FEATURIZER_H
+
+#include <string>
+#include "utils/sf-sparse-vector.h"
+#include "utils/logging.h"
+
+// Abstract interface that maps a text string to an SfSparseVector.
+class Featurizer {
+    bool finalized_ = false;
+
+public:
+    virtual void fit(const std::string &text) = 0;
+
+    // Marks the featurizer as finalized; calling it a second time is fatal.
+    virtual void finalize() {
+        if(finalized_) {
+            FATAL("Cannot finalize a featurizer twice!");
+        } else {
+            finalized_ = true;
+        }
+    }
+
+    virtual void write(const std::string &filename) = 0;
+
+    virtual SfSparseVector get_features(const std::string &text) = 0;
+
+    virtual ~Featurizer() {}
+};
+
+#endif // FEATURIZER_H
diff --git a/CALEngine-painless/include/featurizer/tfidf.h b/CALEngine-painless/include/featurizer/tfidf.h
new file mode 100644
index 0000000..9d6e673
--- /dev/null
+++ b/CALEngine-painless/include/featurizer/tfidf.h
@@ -0,0 +1,34 @@
+#ifndef TFIDF_H
+#define TFIDF_H
+
+#include "featurizer/featurizer.h"
+#include "utils/text-utils.h"
+#include <cstdint>
+#include <unordered_map>
+
+// Per-term entry stored in the featurizer's dictionary.
+struct TermInfo {
+    uint32_t id;  // presumably the term's feature id -- confirm in tfidf.cc
+    uint64_t df;  // presumably the term's document frequency -- confirm in tfidf.cc
+};
+
+// Featurizer that keeps a term dictionary, a tokenizer and a document count.
+class TFIDFFeaturizer : public Featurizer {
+    std::unordered_map<std::string, TermInfo> dictionary_;
+    BMITokenizer tokenizer_ = BMITokenizer();
+    size_t total_docs_ = 0;
+    double max_norm_ = 20;
+
+public:
+    TFIDFFeaturizer();
+
+    TFIDFFeaturizer(const std::string &filename);
+
+    void fit(const std::string &text) override;
+
+    void write(const std::string &filename) override;
+
+    SfSparseVector get_features(const std::string &text) override;
+};
+
+#endif // TFIDF_H
diff --git a/CALEngine-painless/include/utils/logging.h b/CALEngine-painless/include/utils/logging.h
new file mode 100644
index 0000000..56ad28d
--- /dev/null
+++ b/CALEngine-painless/include/utils/logging.h
@@ -0,0 +1,20 @@
+#ifndef LOGGING_H
+#define LOGGING_H
+
+#include <cstdio>
+#include <cstdlib>
+// Prints msg to stderr with a [FATAL] tag, then exits with status 1.
+[[noreturn]] inline void FATAL(const char *msg) {
+    std::fprintf(stderr, "[FATAL] %s\n", msg);
+    std::exit(1);
+}
+// Prints msg to stderr with an [INFO] tag.
+inline void INFO(const char *msg) {
+    std::fprintf(stderr, "[INFO] %s\n", msg);
+}
+// Prints msg to stderr with a [WARN] tag.
+inline void WARN(const char *msg) {
+    std::fprintf(stderr, "[WARN] %s\n", msg);
+}
+
+#endif // LOGGING_H
diff --git a/CALEngine-painless/include/utils/sf-sparse-vector.h b/CALEngine-painless/include/utils/sf-sparse-vector.h
new file mode 100644
index 0000000..ccd185b
--- /dev/null
+++ b/CALEngine-painless/include/utils/sf-sparse-vector.h
@@ -0,0 +1,99 @@
+//================================================================================//
+// Copyright 2009 Google Inc. //
+// //
+// Licensed under the Apache License, Version 2.0 (the "License"); //
+// you may not use this file except in compliance with the License. //
+// You may obtain a copy of the License at //
+// //
+// http://www.apache.org/licenses/LICENSE-2.0 //
+// //
+// Unless required by applicable law or agreed to in writing, software //
+// distributed under the License is distributed on an "AS IS" BASIS, //
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. //
+// See the License for the specific language governing permissions and //
+// limitations under the License. //
+//================================================================================//
+//
+// sf-sparse-vector.h
+//
+// Author: D. Sculley, December 2008
+// dsculley@google.com or dsculley@cs.tufts.edu
+//
+// A sparse vector for use with sofia-ml. Vector elements are contained
+// in an stl vector, stored in a struct containing (feature id, feature value)
+// pairs. Feature id's are assumed to be unique, sorted, and strictly positive.
+// Sparse vector is assumed to be in svm-light format, with the distinction
+// that the class label may be a float rather than an integer, and
+// that there is an optional group id value.
+//
+//