
SimHash Document Encoder in C++ with Python bindings #603

Merged 59 commits on Aug 22, 2019
Changes from all commits
Commits (59)
bf6ac05
simhash document encoder, 1st draft skeleton with test
brev Jul 15, 2019
7e79ba1
Pull in digest lib.
brev Jul 16, 2019
572ce43
Merge branch 'master' into simhash-document-encoder
brev Jul 16, 2019
4b4d804
more progress
brev Jul 19, 2019
5443420
Finish main coding stretch.
brev Jul 24, 2019
199807f
Finishing up. Py bindings 80%.
brev Jul 27, 2019
6340b9e
Merge branch 'master' into simhash-document-encoder
brev Jul 27, 2019
2c5328a
tweak
brev Jul 27, 2019
a2b705f
retrigger cloud build
brev Jul 27, 2019
5e9b7c4
retrigger cloud build
brev Jul 27, 2019
da28df0
fix bug found on cloud build tests for this feature branch
brev Jul 27, 2019
8f8cf05
Finish up py bindings and the rest.
brev Jul 28, 2019
c3f6a53
fix some test constants for small diffs on varying cloud build archit…
brev Jul 28, 2019
fefd54c
more
brev Jul 30, 2019
6fa04f8
Add SDR statistics python test.
brev Jul 30, 2019
8229a82
Merge branch 'master' into simhash-document-encoder
brev Jul 30, 2019
a7d07d9
Finish python example script with stats and chart generation.
brev Jul 31, 2019
3832346
Change the tokenSimilarity algo slightly to get better bit distributi…
brev Jul 31, 2019
ba8de60
Merge branch 'master' into simhash-document-encoder
brev Aug 2, 2019
992f0f8
Merge branch 'master' into simhash-document-encoder
brev Aug 2, 2019
b0d18e0
add python docs on alt calling style
brev Aug 2, 2019
d63339e
Putting original performance test assertions back in place.
brev Aug 2, 2019
5829cd5
Trying to play better with SDRMetrics callbacks to no avail.
brev Aug 3, 2019
067c5b5
Possibly borked the Travis build with 1 change, testing revert.
brev Aug 3, 2019
56f885a
Merge branch 'master' into simhash-document-encoder
brev Aug 8, 2019
c6256c4
"htm.cpp" => "htm.core"
brev Aug 8, 2019
a994004
Add "caseSensitivity" bool flag, functionality, tests, bindings.
brev Aug 9, 2019
647a8bf
Lock down digestpp to specific commit (no versions avail, better than…
brev Aug 9, 2019
0c65611
Move pyplot import to code section, only run plot code if the import …
brev Aug 9, 2019
3751b8e
Better docs around SHA3 SHAKE XOF
brev Aug 9, 2019
c376950
Add basic real-world use-case example in test form.
brev Aug 9, 2019
e40ee2f
Better docs on document length (# of tokens allowed, any > 0).
brev Aug 9, 2019
70fd2c2
Add dox that token order is ignored and doesn't influence output enco…
brev Aug 9, 2019
bda2452
Remove "@since" C++ documentation tag, as we don't really have versio…
brev Aug 9, 2019
6ca3ca9
Move code dox from .cpp to .hpp as per @breznak
brev Aug 9, 2019
ac41dcc
Merge branch 'master' into simhash-document-encoder
brev Aug 9, 2019
3435d51
Add in Python de/serialization in bindings and tests. String works, P…
brev Aug 10, 2019
94315a4
Add definition of terms to dox.
brev Aug 12, 2019
1678651
Merge branch 'master' into simhash-document-encoder
brev Aug 12, 2019
7580708
Add another simple calling signature to `encode()` - you can now call…
brev Aug 12, 2019
b9d391a
Make sure all positive/negative test cases are combined under the sam…
brev Aug 12, 2019
beec09c
- Make new subdir for encoder examples.
brev Aug 13, 2019
e515ee7
Merge branch 'master' into simhash-document-encoder
brev Aug 13, 2019
977b58d
- Add encoder-specific README.
brev Aug 13, 2019
393571d
Add Determinism test for new SimHash Document Encoder.
brev Aug 13, 2019
136706d
Merge branch 'master' into simhash-document-encoder
brev Aug 13, 2019
0f9360e
Improvements to encoder README
brev Aug 13, 2019
53850e5
Fix param check (thx @breznak) with tests.
brev Aug 13, 2019
59487a2
Merge branch 'master' into simhash-document-encoder
brev Aug 15, 2019
1030b34
C++ and Tests for char/token frequency floor/ceilings. Tests are not
brev Aug 16, 2019
ea54ae5
Ok, a bit of a re-working:
brev Aug 17, 2019
3a912d1
Merge branch 'simhash-document-encoder' into simhash-frequency
brev Aug 17, 2019
00c4a89
Token/Char Frequency Floors/Ceilings.
brev Aug 19, 2019
3e0eee3
Merge pull request #1 from brev/simhash-frequency
brev Aug 19, 2019
1d17611
- Merged 4 params down into 2 params (gone are Char/TokenFreqCeil/Flo…
brev Aug 20, 2019
9cf513b
Merge branch 'master' into simhash-document-encoder
brev Aug 20, 2019
de089cf
Fix off-by-1 index error. Eigen quietly takes the blow in non-debug m…
brev Aug 21, 2019
24299b3
All serialization and tests are now working for SimHash Doc encoder.
brev Aug 21, 2019
6d4794e
Merge branch 'master' into simhash-document-encoder
brev Aug 21, 2019
2 changes: 2 additions & 0 deletions README.md
@@ -244,6 +244,7 @@ The installation scripts will automatically download and build the dependencies
* mnist test data
* numpy
* pytest
* [digestpp](https://github.com/kerukuro/digestpp) (for SimHash encoders)

Once these third party components have been downloaded and built they will not be
re-visited again on subsequent builds. So to refresh the third party components
@@ -263,6 +264,7 @@ distribution packages as listed and rename them as indicated. Copy these to
| mnist.zip (*note3) | https://github.com/wichtounet/mnist/archive/master.zip |
| pybind11.tar.gz | https://github.com/pybind/pybind11/archive/v2.2.4.tar.gz |
| cereal.tar.gz | https://github.com/USCiLab/cereal/archive/v1.2.2.tar.gz |
| digestpp.zip | https://github.com/kerukuro/digestpp/archive/36fa6ca2b85808bd171b13b65a345130dbe1d774.zip |

* note1: Version 0.6.2 of yaml-cpp is broken so use the master from the repository.
* note2: Boost is not required for Windows (MSVC 2017) or any compiler that supports C++17 with std::filesystem.
1 change: 1 addition & 0 deletions bindings/py/cpp_src/CMakeLists.txt
@@ -55,6 +55,7 @@ set(src_py_encoders_files
bindings/encoders/encoders_module.cpp
bindings/encoders/py_ScalarEncoder.cpp
bindings/encoders/py_RDSE.cpp
bindings/encoders/py_SimHashDocumentEncoder.cpp
)

set(src_py_engine_files
2 changes: 2 additions & 0 deletions bindings/py/cpp_src/bindings/encoders/encoders_module.cpp
@@ -29,6 +29,7 @@ namespace htm_ext
{
void init_ScalarEncoder(py::module&);
void init_RDSE(py::module&);
void init_SimHashDocumentEncoder(py::module&);
}

using namespace htm_ext;
@@ -60,4 +61,5 @@ categories into integers before encoding them. )";

init_ScalarEncoder(m);
init_RDSE(m);
init_SimHashDocumentEncoder(m);
}
2 changes: 1 addition & 1 deletion bindings/py/cpp_src/bindings/encoders/py_RDSE.cpp
@@ -93,7 +93,7 @@ of SDRs to prevent conflicts between different encodings. This method does
not allow for decoding SDRs into the inputs which likely created it.

To inspect this run:
$ python -m htm.encoders.rdse --help)");
$ python -m htm.examples.encoders.rdse --help)");
py_RDSE.def(py::init<RDSE_Parameters>());

py_RDSE.def_property_readonly("parameters",
2 changes: 1 addition & 1 deletion bindings/py/cpp_src/bindings/encoders/py_ScalarEncoder.cpp
@@ -102,7 +102,7 @@ bits. The output is 0's except for a contiguous block of 1's. The location of
this contiguous block varies continuously with the input value.

To inspect this run:
$ python -m htm.encoders.scalar_encoder --help)");
$ python -m htm.examples.encoders.scalar_encoder --help)");

py_ScalarEnc.def(py::init<ScalarEncoderParameters&>(), R"()");
py_ScalarEnc.def_property_readonly("parameters",
337 changes: 337 additions & 0 deletions bindings/py/cpp_src/bindings/encoders/py_SimHashDocumentEncoder.cpp
@@ -0,0 +1,337 @@
/* -----------------------------------------------------------------------------
* HTM Community Edition of NuPIC
* Copyright (C) 2016, Numenta, Inc. https://numenta.com
* 2019, David McDougall
* 2019, Brev Patterson, Lux Rota LLC, https://luxrota.com
*
* This program is free software: you can redistribute it and/or modify it
* under the terms of the GNU Affero Public License version 3 as published by
* the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero Public License for
* more details.
*
* You should have received a copy of the GNU Affero Public License along with
* this program. If not, see http://www.gnu.org/licenses.
* -------------------------------------------------------------------------- */

/** @file
* py_SimHashDocumentEncoder.cpp
*/

#include <bindings/suppress_register.hpp> //include before pybind11.h
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <pybind11/iostream.h>

#include <htm/encoders/SimHashDocumentEncoder.hpp>

namespace py = pybind11;

using namespace htm;
using namespace std;


namespace htm_ext {

using namespace htm;

void init_SimHashDocumentEncoder(py::module& m)
{
/**
* Parameters
*/
py::class_<SimHashDocumentEncoderParameters>
py_SimHashDocumentEncoderParameters(m,
"SimHashDocumentEncoderParameters",
R"(
Parameters for the SimHashDocumentEncoder.
)");

py_SimHashDocumentEncoderParameters.def(py::init<>());

py_SimHashDocumentEncoderParameters.def_readwrite("activeBits",
&SimHashDocumentEncoderParameters::activeBits,
R"(
This is the number of true bits in the encoded output SDR. The output encoding
will have a distribution of this many 1's. Specify only one of: activeBits
or sparsity.
)");

py_SimHashDocumentEncoderParameters.def_readwrite("caseSensitivity",
&SimHashDocumentEncoderParameters::caseSensitivity,
R"(
Should capitalized English letters (A-Z) have different influence on our output
than their lower-cased (a-z) counterparts? Or the same influence on output?
If TRUE: "DOGS" and "dogs" will have completely different encodings.
If FALSE: "DOGS" and "dogs" will share the same encoding (Default).
)");

py_SimHashDocumentEncoderParameters.def_readwrite("encodeOrphans",
&SimHashDocumentEncoderParameters::encodeOrphans,
R"(
If param `vocabulary` is set, should we `encode()` tokens not in our
`vocabulary` ("orphan" tokens)?
If True: Unrecognized tokens will be added to our encoding
with weight=1. Our `vocabulary` is useful as a simple weight map.
If False (default): Unrecognized tokens will be discarded. Our `vocabulary`
now serves more like a whitelist (also with weights).
Any tokens in the `exclude` list will be discarded.
)");

py_SimHashDocumentEncoderParameters.def_readwrite("excludes",
&SimHashDocumentEncoderParameters::excludes,
R"(
List of tokens to discard when passed in to `encode()`. Terms in the
`vocabulary`, and orphan terms, will be ignored if excluded here. If
`tokenSimilarity` is enabled, you can also pass in single character (letter)
strings to discard.
)");

py_SimHashDocumentEncoderParameters.def_readwrite("frequencyCeiling",
&SimHashDocumentEncoderParameters::frequencyCeiling,
R"(
The maximum number of times a token can be repeated in a document. Occurrences of
the token beyond this number will be discarded. A setting of 1 will act as
token de-duplication, guaranteeing each token in a document is unique. Inverse
to param `frequencyFloor`.
If param `tokenSimilarity` is on, this will be the max number of times a
char/letter can be repeated in a token. Occurrences of the character beyond
this number will be discarded. A setting of 1 will act as character
de-duplication, guaranteeing each character in a token is unique.
)");

py_SimHashDocumentEncoderParameters.def_readwrite("frequencyFloor",
&SimHashDocumentEncoderParameters::frequencyFloor,
R"(
If this option is set, a token will be ignored until it occurs this many times
in the document. Occurrences of the token before this number will be discarded.
Inverse to param `frequencyCeiling`.
)");

py_SimHashDocumentEncoderParameters.def_readwrite("size",
&SimHashDocumentEncoderParameters::size,
R"(
This is the total number of bits in the encoded output SDR.
)");

py_SimHashDocumentEncoderParameters.def_readwrite("sparsity",
&SimHashDocumentEncoderParameters::sparsity,
R"(
This is an alternate way (as a percentage) to specify the number of active bits.
Specify only one of: activeBits or sparsity.
)");

py_SimHashDocumentEncoderParameters.def_readwrite("tokenSimilarity",
&SimHashDocumentEncoderParameters::tokenSimilarity,
R"(
This allows similar tokens ("cat", "cats") to also be represented similarly,
at the cost of document similarity accuracy. Default is FALSE (providing better
document-level similarity, at the expense of token-level similarity). This
could be used to meaningfully encode plurals and misspellings as similar. It
may also be hacked to create a complex dimensional category encoder. Results
are heavily dependent on the content of your input data.
If TRUE: Similar tokens ("cat", "cats") will have similar influence on the
output simhash. This benefit comes with the cost of a reduction in
document-level similarity accuracy. Param `frequencyCeiling` is also
available for use with this.
If FALSE: Similar tokens ("cat", "cats") will have individually unique and
unrelated influence on the output simhash encoding, thus losing token-level
similarity and increasing document-level similarity.
)");

py_SimHashDocumentEncoderParameters.def_readwrite("vocabulary",
&SimHashDocumentEncoderParameters::vocabulary,
R"(
Map of possible document tokens with weights.
ex: {{ "what", 3 }, { "is", 1 }, { "up", 2 }}.
If `encodeOrphans` is True, this will be useful like a simple weight
map. If `encodeOrphans` is False, this will be more useful as a
whitelist (still with weights).
If `tokenSimilarity` is enabled, you can also pass in single
character (letter) strings to weight.
Any tokens in the `exclude` list will be discarded.
)");

/**
* Class
*/
py::class_<SimHashDocumentEncoder> py_SimHashDocumentEncoder(m,
"SimHashDocumentEncoder",
R"(
Encodes a document text into a distributed spray of 1's.

The SimHashDocumentEncoder encodes a document (array of strings) value into an
array of bits. The output is 0's except for a sparse distribution spray of 1's.
Similar document encodings will share similar representations, and vice versa.
Unicode is supported. No lookup tables are used.

"Similarity" here refers to bitwise similarity (small hamming distance,
high overlap), not semantic similarity (encodings for "apple" and
"computer" will have no relation here.) For document encodings which are
also semantic, please try Cortical.io and their Semantic Folding tech.

Definition of Terms:
- A "corpus" is a collection of "documents".
- A "document" is made up of "tokens" (or "words").
- A "token" is made up of "characters" (or "letters").

For details on the SimHash Algorithm itself, please see source code file:
- SimHashDocumentEncoder.README.md

To inspect this run:
$ python -m htm.examples.encoders.simhash_document_encoder --help

Python Code Example:
from htm.bindings.encoders import SimHashDocumentEncoder
from htm.bindings.encoders import SimHashDocumentEncoderParameters
from htm.bindings.sdr import SDR

params = SimHashDocumentEncoderParameters()
params.size = 400
params.activeBits = 21

output = SDR(params.size)
encoder = SimHashDocumentEncoder(params)

# call style: output is reference
encoder.encode([ "bravo", "delta", "echo" ], output)
encoder.encode("bravo delta echo", output)

# call style: output is returned
other = encoder.encode([ "bravo", "delta", "echo" ])
other = encoder.encode("bravo delta echo")
)");

py_SimHashDocumentEncoder.def(
py::init<SimHashDocumentEncoderParameters&>());

py_SimHashDocumentEncoder.def_property_readonly("dimensions",
[](SimHashDocumentEncoder &self) { return self.dimensions; },
R"(
This is the total number of bits in the encoded output SDR.
)");

py_SimHashDocumentEncoder.def_property_readonly("parameters",
[](SimHashDocumentEncoder &self) { return self.parameters; },
R"(
Contains the parameter structure which this encoder uses internally. All fields
are filled in automatically.
)");

py_SimHashDocumentEncoder.def_property_readonly("size",
[](SimHashDocumentEncoder &self) { return self.size; },
R"(
This is the total number of bits in the encoded output SDR.
)");

// Handle case of class method overload + class method override
// http://pybind11.readthedocs.io/en/master/classes.html#overloaded-methods
// Alternate calling patterns seem to have to come before the main ones.
// 1. Explain
py_SimHashDocumentEncoder.def("encode", // alt: simple string. Define 1st!
(void (SimHashDocumentEncoder::*)(std::string, htm::SDR &))
&SimHashDocumentEncoder::encode);
py_SimHashDocumentEncoder.def("encode", // main: list.
(void (SimHashDocumentEncoder::*)(std::vector<std::string>, htm::SDR &))
&SimHashDocumentEncoder::encode);
// 2. Details
py_SimHashDocumentEncoder.def("encode", // alt: simple string. Define 1st!
[](SimHashDocumentEncoder &self, std::string value) {
auto output = new SDR({ self.size });
self.encode( value, *output );
return output;
},
R"(
Encode (Alternate calling style: Simple string method).
Simple alternate calling pattern using only a single longer string. Takes input
as a long python string, which will automatically be tokenized (split on
whitespace). Ex: "alpha bravo delta echo".
)");
py_SimHashDocumentEncoder.def("encode", // main: list.
[](SimHashDocumentEncoder &self, std::vector<std::string> value) {
auto output = new SDR({ self.size });
self.encode( value, *output );
return output;
},
R"(
Encode (Main calling style).
Each token will be hashed with SHA3+SHAKE256 to get a binary digest output of
desired `size`. These vectors will be stored in a matrix for the next step of
processing. Weights from the `vocabulary` are added in during hashing and
simhashing. After the loop, we SimHash the matrix of hashes, resulting in an
output SDR. If param "tokenSimilarity" is set, we'll also loop and hash through
all the letters in the tokens. Takes input in a python list of
strings (tokens).
Ex: [ "alpha", "bravo", "delta", "echo" ].
Documents can contain any number of tokens > 0. Token order in the document is
ignored and does not affect the output encoding. Tokens in the `vocabulary`
will be weighted, while others may be encoded depending on the
`encodeOrphans` param. Tokens in the `exclude` list will always be discarded.
)");

/**
* Serialization
*/
// file out
py_SimHashDocumentEncoder.def("saveToFile",
[](SimHashDocumentEncoder &self, const std::string& filename) {
self.saveToFile(filename, SerializableFormat::BINARY);
},
R"(
Serialize current encoder instance out to a file.
)");
// file in
py_SimHashDocumentEncoder.def("loadFromFile",
[](SimHashDocumentEncoder &self, const std::string& filename) {
return self.loadFromFile(filename, SerializableFormat::BINARY);
},
R"(
Deserialize file contents into current object.
)");
// string out
py_SimHashDocumentEncoder.def("writeToString",
[](const SimHashDocumentEncoder& self) {
std::ostringstream outStream;
outStream.flags(ios::scientific);
outStream.precision(numeric_limits<double>::digits10 + 1);
self.save(outStream, JSON);
return py::bytes( outStream.str() );
},
R"(
Serialize current encoder instance out to a bytestring.
)");
// string in
py_SimHashDocumentEncoder.def("loadFromString",
[](SimHashDocumentEncoder& self, const py::bytes& inString) {
std::stringstream inStream(inString.cast<std::string>());
self.load(inStream, JSON);
},
R"(
Deserialize bytestring into current object.
)");
// pickle
py_SimHashDocumentEncoder.def(py::pickle(
// pickle out
[](const SimHashDocumentEncoder& self) {
std::stringstream ss;
self.save(ss);
return py::bytes( ss.str() );
},
// pickle in
[](py::bytes &s) {
std::stringstream ss( s.cast<std::string>() );
std::unique_ptr<SimHashDocumentEncoder>
self(new SimHashDocumentEncoder());
self->load(ss);
return self;
}
),
R"(
De/Serialize with Python Pickle.
)");

}
}