
Commit 1309668

Celebio authored and facebook-github-bot committed

WebAssembly

Summary: This commit introduces a WebAssembly module for fastText.

Reviewed By: EdouardGrave

Differential Revision: D19021740

fbshipit-source-id: e378f0bb70c0e1f4d6382e1e45af03d1e6ddb4f1

1 parent 316b4c9 · commit 1309668

16 files changed: +1260 −55 lines

.gitignore

Lines changed: 4 additions & 1 deletion

@@ -2,8 +2,11 @@
 *.o
 *.bin
 *.vec
+*.bc
+.DS_Store
 data
 fasttext
 result
 website/node_modules/
-package-lock.json
+package-lock.json
+node_modules/
Makefile

Lines changed: 60 additions & 1 deletion

@@ -20,6 +20,12 @@ coverage: fasttext
 debug: CXXFLAGS += -g -O0 -fno-inline
 debug: fasttext
 
+wasm: webassembly/fasttext_wasm.js
+
+wasmdebug: export EMCC_DEBUG=1
+wasmdebug: webassembly/fasttext_wasm.js
+
+
 args.o: src/args.cc src/args.h
 	$(CXX) $(CXXFLAGS) -c src/args.cc
 
@@ -63,4 +69,57 @@ fasttext: $(OBJS) src/fasttext.cc
 	$(CXX) $(CXXFLAGS) $(OBJS) src/main.cc -o fasttext
 
 clean:
-	rm -rf *.o *.gcno *.gcda fasttext
+	rm -rf *.o *.gcno *.gcda fasttext *.bc webassembly/fasttext_wasm.js webassembly/fasttext_wasm.wasm
+
+
+EMCXX = em++
+EMCXXFLAGS = --bind --std=c++11 -s WASM=1 -s ALLOW_MEMORY_GROWTH=1 -s "EXTRA_EXPORTED_RUNTIME_METHODS=['addOnPostRun', 'FS']" -s "DISABLE_EXCEPTION_CATCHING=0" -s "EXCEPTION_DEBUG=1" -s "FORCE_FILESYSTEM=1" -s "MODULARIZE=1" -s "EXPORT_ES6=1" -s 'EXPORT_NAME="FastTextModule"' -Isrc/
+EMOBJS = args.bc autotune.bc matrix.bc dictionary.bc loss.bc productquantizer.bc densematrix.bc quantmatrix.bc vector.bc model.bc utils.bc meter.bc fasttext.bc main.bc
+
+
+main.bc: webassembly/fasttext_wasm.cc
+	$(EMCXX) $(EMCXXFLAGS) webassembly/fasttext_wasm.cc -o main.bc
+
+args.bc: src/args.cc src/args.h
+	$(EMCXX) $(EMCXXFLAGS) src/args.cc -o args.bc
+
+autotune.bc: src/autotune.cc src/autotune.h
+	$(EMCXX) $(EMCXXFLAGS) src/autotune.cc -o autotune.bc
+
+matrix.bc: src/matrix.cc src/matrix.h
+	$(EMCXX) $(EMCXXFLAGS) src/matrix.cc -o matrix.bc
+
+dictionary.bc: src/dictionary.cc src/dictionary.h src/args.h
+	$(EMCXX) $(EMCXXFLAGS) src/dictionary.cc -o dictionary.bc
+
+loss.bc: src/loss.cc src/loss.h src/matrix.h src/real.h
+	$(EMCXX) $(EMCXXFLAGS) src/loss.cc -o loss.bc
+
+productquantizer.bc: src/productquantizer.cc src/productquantizer.h src/utils.h
+	$(EMCXX) $(EMCXXFLAGS) src/productquantizer.cc -o productquantizer.bc
+
+densematrix.bc: src/densematrix.cc src/densematrix.h src/utils.h src/matrix.h
+	$(EMCXX) $(EMCXXFLAGS) src/densematrix.cc -o densematrix.bc
+
+quantmatrix.bc: src/quantmatrix.cc src/quantmatrix.h src/utils.h src/matrix.h
+	$(EMCXX) $(EMCXXFLAGS) src/quantmatrix.cc -o quantmatrix.bc
+
+vector.bc: src/vector.cc src/vector.h src/utils.h
+	$(EMCXX) $(EMCXXFLAGS) src/vector.cc -o vector.bc
+
+model.bc: src/model.cc src/model.h src/args.h
+	$(EMCXX) $(EMCXXFLAGS) src/model.cc -o model.bc
+
+utils.bc: src/utils.cc src/utils.h
+	$(EMCXX) $(EMCXXFLAGS) src/utils.cc -o utils.bc
+
+meter.bc: src/meter.cc src/meter.h
+	$(EMCXX) $(EMCXXFLAGS) src/meter.cc -o meter.bc
+
+fasttext.bc: src/fasttext.cc src/*.h
+	$(EMCXX) $(EMCXXFLAGS) src/fasttext.cc -o fasttext.bc
+
+webassembly/fasttext_wasm.js: $(EMOBJS) webassembly/fasttext_wasm.cc Makefile
+	$(EMCXX) $(EMCXXFLAGS) $(EMOBJS) -o webassembly/fasttext_wasm.js
+
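Note on the new build flags: --bind enables Emscripten's embind, which is how webassembly/fasttext_wasm.cc (added by this commit but not shown in this excerpt) exposes C++ classes to JavaScript, while MODULARIZE and EXPORT_ES6 wrap the output in an ES6 module whose factory is named FastTextModule. A rough, hypothetical sketch of what such a binding file can look like — the wrapper and names below are illustrative, not the commit's actual bindings:

// Hypothetical embind sketch; the real webassembly/fasttext_wasm.cc
// in this commit binds far more of the API than shown here.
#include <emscripten/bind.h>
#include <string>

#include "fasttext.h"

namespace {
// Thin wrapper so we bind a single, non-overloaded loadModel signature.
struct FastTextWasm {
  fasttext::FastText ft;
  void loadModel(const std::string& path) { ft.loadModel(path); }
};
} // namespace

EMSCRIPTEN_BINDINGS(fasttext_wasm) {
  emscripten::class_<FastTextWasm>("FastText")
      .constructor<>()
      .function("loadModel", &FastTextWasm::loadModel);
}

On the JavaScript side, calling the exported FastTextModule() factory instantiates the module, and the FS runtime method exported via EXTRA_EXPORTED_RUNTIME_METHODS lets callers write model files into the virtual filesystem that FORCE_FILESYSTEM compiles in.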

src/args.cc

Lines changed: 21 additions & 15 deletions

@@ -262,43 +262,49 @@ void Args::printTrainingHelp() {
   std::cerr
       << "\nThe following arguments for training are optional:\n"
       << " -lr learning rate [" << lr << "]\n"
-      << " -lrUpdateRate change the rate of updates for the learning rate ["
+      << " -lrUpdateRate change the rate of updates for the learning "
+         "rate ["
       << lrUpdateRate << "]\n"
       << " -dim size of word vectors [" << dim << "]\n"
       << " -ws size of the context window [" << ws << "]\n"
       << " -epoch number of epochs [" << epoch << "]\n"
       << " -neg number of negatives sampled [" << neg << "]\n"
       << " -loss loss function {ns, hs, softmax, one-vs-all} ["
       << lossToString(loss) << "]\n"
-      << " -thread number of threads (set to 1 to ensure reproducible results) ["
+      << " -thread number of threads (set to 1 to ensure "
+         "reproducible results) ["
       << thread << "]\n"
-      << " -pretrainedVectors pretrained word vectors for supervised learning ["
+      << " -pretrainedVectors pretrained word vectors for supervised "
+         "learning ["
       << pretrainedVectors << "]\n"
       << " -saveOutput whether output params should be saved ["
       << boolToString(saveOutput) << "]\n"
       << " -seed random generator seed [" << seed << "]\n";
 }
 
 void Args::printAutotuneHelp() {
-  std::cerr
-      << "\nThe following arguments are for autotune:\n"
-      << " -autotune-validation validation file to be used for evaluation\n"
-      << " -autotune-metric metric objective {f1, f1:labelname} ["
-      << autotuneMetric << "]\n"
-      << " -autotune-predictions number of predictions used for evaluation ["
-      << autotunePredictions << "]\n"
-      << " -autotune-duration maximum duration in seconds ["
-      << autotuneDuration << "]\n"
-      << " -autotune-modelsize constraint model file size ["
-      << autotuneModelSize << "] (empty = do not quantize)\n";
+  std::cerr << "\nThe following arguments are for autotune:\n"
+            << " -autotune-validation validation file to be used "
+               "for evaluation\n"
+            << " -autotune-metric metric objective {f1, "
+               "f1:labelname} ["
+            << autotuneMetric << "]\n"
+            << " -autotune-predictions number of predictions used "
+               "for evaluation ["
+            << autotunePredictions << "]\n"
+            << " -autotune-duration maximum duration in seconds ["
+            << autotuneDuration << "]\n"
+            << " -autotune-modelsize constraint model file size ["
+            << autotuneModelSize << "] (empty = do not quantize)\n";
 }
 
 void Args::printQuantizationHelp() {
   std::cerr
       << "\nThe following arguments for quantization are optional:\n"
       << " -cutoff number of words and ngrams to retain ["
       << cutoff << "]\n"
-      << " -retrain whether embeddings are finetuned if a cutoff is applied ["
+      << " -retrain whether embeddings are finetuned if a cutoff "
+         "is applied ["
       << boolToString(retrain) << "]\n"
       << " -qnorm whether the norm is quantized separately ["
       << boolToString(qnorm) << "]\n"
src/autotune.cc

Lines changed: 8 additions & 6 deletions

@@ -416,10 +416,10 @@ void Autotune::train(const Args& autotuneArgs) {
       if (!sizeConstraintWarning && trials_ > 10 &&
           sizeConstraintFailed_ > (trials_ / 2)) {
         sizeConstraintWarning = true;
-        std::cerr
-            << std::endl
-            << "Warning : requested model size is probably too small. You may want to increase `autotune-modelsize`."
-            << std::endl;
+        std::cerr << std::endl
+                  << "Warning : requested model size is probably too small. "
+                     "You may want to increase `autotune-modelsize`."
+                  << std::endl;
       }
     }
   } catch (DenseMatrix::EncounteredNaNError&) {

@@ -442,10 +442,12 @@ void Autotune::train(const Args& autotuneArgs) {
     std::string errorMessage;
     if (sizeConstraintWarning) {
       errorMessage =
-          "Couldn't fulfil model size constraint: please increase `autotune-modelsize`.";
+          "Couldn't fulfil model size constraint: please increase "
+          "`autotune-modelsize`.";
     } else {
       errorMessage =
-          "Didn't have enough time to train once: please increase `autotune-duration`.";
+          "Didn't have enough time to train once: please increase "
+          "`autotune-duration`.";
     }
     throw std::runtime_error(errorMessage);
   } else {
src/densematrix.cc

Lines changed: 11 additions & 6 deletions

@@ -43,12 +43,17 @@ void DenseMatrix::uniformThread(real a, int block, int32_t seed) {
 }
 
 void DenseMatrix::uniform(real a, unsigned int thread, int32_t seed) {
-  std::vector<std::thread> threads;
-  for (int i = 0; i < thread; i++) {
-    threads.push_back(std::thread([=]() { uniformThread(a, i, seed); }));
-  }
-  for (int32_t i = 0; i < threads.size(); i++) {
-    threads[i].join();
+  if (thread > 1) {
+    std::vector<std::thread> threads;
+    for (int i = 0; i < thread; i++) {
+      threads.push_back(std::thread([=]() { uniformThread(a, i, seed); }));
+    }
+    for (int32_t i = 0; i < threads.size(); i++) {
+      threads[i].join();
+    }
+  } else {
+    // webassembly can't instantiate `std::thread`
+    uniformThread(a, 0, seed);
   }
 }
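Context for the change above: this commit's wasm build is single-threaded, and constructing a std::thread in an Emscripten build without pthreads support fails at runtime, so uniform() now runs the one-thread case inline on the calling thread; startThreads() in src/fasttext.cc below gets the same guard. The pattern, extracted as a standalone sketch (an illustration, not code from the commit):

#include <thread>
#include <vector>

// Run work(block) over n blocks; spawn threads only when n > 1, since a
// wasm build without pthreads cannot instantiate std::thread.
template <typename Fn>
void runBlocks(unsigned n, Fn work) {
  if (n > 1) {
    std::vector<std::thread> threads;
    for (unsigned i = 0; i < n; i++) {
      threads.emplace_back([=]() { work(i); });
    }
    for (auto& t : threads) {
      t.join();
    }
  } else {
    work(0); // single-threaded fallback, as used by the wasm build
  }
}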

src/fasttext.cc

Lines changed: 35 additions & 15 deletions

@@ -263,22 +263,30 @@ void FastText::loadModel(std::istream& in) {
   buildModel();
 }
 
-void FastText::printInfo(real progress, real loss, std::ostream& log_stream) {
+std::tuple<int64_t, double, double> FastText::progressInfo(real progress) {
   double t = utils::getDuration(start_, std::chrono::steady_clock::now());
   double lr = args_->lr * (1.0 - progress);
   double wst = 0;
 
   int64_t eta = 2592000; // Default to one month in seconds (720 * 3600)
 
   if (progress > 0 && t >= 0) {
-    progress = progress * 100;
-    eta = t * (100 - progress) / progress;
+    eta = t * (1 - progress) / progress;
     wst = double(tokenCount_) / t / args_->thread;
   }
 
+  return std::tuple<double, double, int64_t>(wst, lr, eta);
+}
+
+void FastText::printInfo(real progress, real loss, std::ostream& log_stream) {
+  double wst;
+  double lr;
+  int64_t eta;
+  std::tie<double, double, int64_t>(wst, lr, eta) = progressInfo(progress);
+
   log_stream << std::fixed;
   log_stream << "Progress: ";
-  log_stream << std::setprecision(1) << std::setw(5) << progress << "%";
+  log_stream << std::setprecision(1) << std::setw(5) << (progress * 100) << "%";
   log_stream << " words/sec/thread: " << std::setw(7) << int64_t(wst);
   log_stream << " lr: " << std::setw(9) << std::setprecision(6) << lr;
   log_stream << " avg.loss: " << std::setw(9) << std::setprecision(6) << loss;
@@ -304,7 +312,7 @@ std::vector<int32_t> FastText::selectEmbeddings(int32_t cutoff) const {
   return idx;
 }
 
-void FastText::quantize(const Args& qargs) {
+void FastText::quantize(const Args& qargs, const TrainCallback& callback) {
   if (args_->model != model_name::sup) {
     throw std::invalid_argument(
         "For now we only support quantization of supervised models");

@@ -336,18 +344,16 @@ void FastText::quantize(const Args& qargs) {
       args_->verbose = qargs.verbose;
       auto loss = createLoss(output_);
       model_ = std::make_shared<Model>(input, output, loss, normalizeGradient);
-      startThreads();
+      startThreads(callback);
     }
   }
-
   input_ = std::make_shared<QuantMatrix>(
       std::move(*(input.get())), qargs.dsub, qargs.qnorm);
 
   if (args_->qout) {
     output_ = std::make_shared<QuantMatrix>(
         std::move(*(output.get())), 2, qargs.qnorm);
   }
-
   quant_ = true;
   auto loss = createLoss(output_);
   model_ = std::make_shared<Model>(input_, output_, loss, normalizeGradient);

@@ -615,7 +621,7 @@ bool FastText::keepTraining(const int64_t ntokens) const {
   return tokenCount_ < args_->epoch * ntokens && !trainException_;
 }
 
-void FastText::trainThread(int32_t threadId) {
+void FastText::trainThread(int32_t threadId, const TrainCallback& callback) {
   std::ifstream ifs(args_->input);
   utils::seek(ifs, threadId * utils::size(ifs) / args_->thread);

@@ -624,9 +630,18 @@
   const int64_t ntokens = dict_->ntokens();
   int64_t localTokenCount = 0;
   std::vector<int32_t> line, labels;
+  uint64_t callbackCounter = 0;
   try {
     while (keepTraining(ntokens)) {
       real progress = real(tokenCount_) / (args_->epoch * ntokens);
+      if (callback && ((callbackCounter++ % 64) == 0)) {
+        double wst;
+        double lr;
+        int64_t eta;
+        std::tie<double, double, int64_t>(wst, lr, eta) =
+            progressInfo(progress);
+        callback(progress, loss_, wst, lr, eta);
+      }
       real lr = args_->lr * (1.0 - progress);
       if (args_->model == model_name::sup) {
         localTokenCount += dict_->getLine(ifs, line, labels);

@@ -717,7 +732,7 @@ std::shared_ptr<Matrix> FastText::createTrainOutputMatrix() const {
   return output;
 }
 
-void FastText::train(const Args& args) {
+void FastText::train(const Args& args, const TrainCallback& callback) {
   args_ = std::make_shared<Args>(args);
   dict_ = std::make_shared<Dictionary>(args_);
   if (args_->input == "-") {

@@ -742,7 +757,7 @@ void FastText::train(const Args& args) {
   auto loss = createLoss(output_);
   bool normalizeGradient = (args_->model == model_name::sup);
   model_ = std::make_shared<Model>(input_, output_, loss, normalizeGradient);
-  startThreads();
+  startThreads(callback);
 }
 
 void FastText::abort() {

@@ -753,14 +768,19 @@ void FastText::abort() {
   }
 }
 
-void FastText::startThreads() {
+void FastText::startThreads(const TrainCallback& callback) {
   start_ = std::chrono::steady_clock::now();
   tokenCount_ = 0;
   loss_ = -1;
   trainException_ = nullptr;
   std::vector<std::thread> threads;
-  for (int32_t i = 0; i < args_->thread; i++) {
-    threads.push_back(std::thread([=]() { trainThread(i); }));
+  if (args_->thread > 1) {
+    for (int32_t i = 0; i < args_->thread; i++) {
+      threads.push_back(std::thread([=]() { trainThread(i, callback); }));
+    }
+  } else {
+    // webassembly can't instantiate `std::thread`
+    trainThread(0, callback);
   }
   const int64_t ntokens = dict_->ntokens();
   // Same condition as trainThread

@@ -772,7 +792,7 @@
       printInfo(progress, loss_, std::cerr);
     }
   }
-  for (int32_t i = 0; i < args_->thread; i++) {
+  for (int32_t i = 0; i < threads.size(); i++) {
     threads[i].join();
  }
   if (trainException_) {