From 3b25b8744209f4d640f6ee16ed12be8de19d156f Mon Sep 17 00:00:00 2001 From: DeepLearning VM Date: Wed, 25 Mar 2020 05:30:57 -0700 Subject: [PATCH] Fix getNN in python bindings to avoid 'utf-8' codec can't decode error. (#967) Summary: This [earlier commit](https://github.com/facebookresearch/fastText/commit/e13484bcb261cda51d33c4940ab5e207aba3ee79) fixed issue https://github.com/facebookresearch/fastText/issues/715 by casting all strings to Python strings. However, this functionality was not added to getNN and I was seeing the same error when querying nearest neighbors for the Japanese language. This commit simply applies castToPythonString in the getNN function. Pull Request resolved: https://github.com/facebookresearch/fastText/pull/967 Reviewed By: EdouardGrave Differential Revision: D19287807 Pulled By: Celebio fbshipit-source-id: 31fb8b4d643848f3f22381ac06f2443eb70c0009 --- .../fasttext/pybind/fasttext_pybind.cc | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/python/fasttext_module/fasttext/pybind/fasttext_pybind.cc b/python/fasttext_module/fasttext/pybind/fasttext_pybind.cc index f386a8c1b..d2a3253e9 100644 --- a/python/fasttext_module/fasttext/pybind/fasttext_pybind.cc +++ b/python/fasttext_module/fasttext/pybind/fasttext_pybind.cc @@ -427,8 +427,20 @@ PYBIND11_MODULE(fasttext_pybind, m) { const std::string word) { m.getWordVector(vec, word); }) .def( "getNN", - [](fasttext::FastText& m, const std::string& word, int32_t k) { - return m.getNN(word, k); + [](fasttext::FastText& m, const std::string& word, int32_t k, + const char* onUnicodeError) { + std::vector> score_words = m.getNN( + word, k); + std::vector> output_list; + for (uint32_t i = 0; i < score_words.size(); i++) { + float score = score_words[i].first; + py::str word = castToPythonString( + score_words[i].second, onUnicodeError); + std::pair sw_pair = std::make_pair(score, word); + output_list.push_back(sw_pair); + } + + return output_list; }) .def( "getAnalogies",