[FastTokenizer] Update fast_tokenizer doc #3787

Merged
merged 15 commits on Nov 17, 2022
add ernie python demo
joey12300 committed Nov 17, 2022
commit 623abc54a323f39a1c6cf2d3bc7cf5cd9ef62942
@@ -24,5 +24,5 @@ endif()
include(${FAST_TOKENIZER_INSTALL_DIR}/FastTokenizer.cmake)
include_directories(${FAST_TOKENIZER_INCS})

-add_executable(clip_fast_tokenizer_demo ${PROJECT_SOURCE_DIR}/clip_fast_tokenizer_demo.cc)
-target_link_libraries(clip_fast_tokenizer_demo ${FAST_TOKENIZER_LIBS})
+add_executable(demo ${PROJECT_SOURCE_DIR}/demo.cc)
+target_link_libraries(demo ${FAST_TOKENIZER_LIBS})
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

-#include "fast_tokenizer/tokenizers/clip_fast_tokenizer.h"
#include <iostream>
#include <vector>
+#include "fast_tokenizer/tokenizers/clip_fast_tokenizer.h"
using namespace paddlenlp;

template <typename T>
@@ -51,17 +51,19 @@ fast_tokenizer::tokenizers_impl::ClipFastTokenizer CreateClipFastTokenizer(

int main() {
  // 1. Define a clip fast tokenizer
-  auto tokenizer =
-      CreateClipFastTokenizer("clip_vocab.json", "clip_merges.txt", /*max_length = */77, /* pad_to_max_length = */true);
+  auto tokenizer = CreateClipFastTokenizer("clip_vocab.json",
+                                           "clip_merges.txt",
+                                           /*max_length = */ 77,
+                                           /* pad_to_max_length = */ true);
  // 2. Tokenize the input strings
  std::vector<fast_tokenizer::core::Encoding> encodings;
  std::vector<std::string> texts = {
      "a photo of an astronaut riding a horse on mars"};
  tokenizer.EncodeBatchStrings(texts, &encodings);

  for (int i = 0; i < texts.size(); ++i) {
-    std::cout << "text = \"" << texts[i] << "\"" << std::endl;
-    std::cout << "ids = " << encodings[i].GetIds() << std::endl;
+    std::cout << "text = \"" << texts[i] << "\"" << std::endl;
+    std::cout << "ids = " << encodings[i].GetIds() << std::endl;
  }

  return 0;
13 changes: 13 additions & 0 deletions fast_tokenizer/examples/clip/python/demo.py
@@ -0,0 +1,13 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
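The Python demo file for CLIP added in this commit contains only the license header so far. For orientation, here is a minimal sketch of what a Python counterpart to the C++ demo above might look like; the ClipFastTokenizer import, its constructor arguments, and the keyword names are assumptions about the Python binding, not something this commit shows.

# Hypothetical sketch, not part of this commit: the class name and arguments are
# assumed to mirror the C++ CreateClipFastTokenizer helper used in demo.cc.
from fast_tokenizer import ClipFastTokenizer  # assumed binding name

tokenizer = ClipFastTokenizer("clip_vocab.json",       # vocab file from the C++ demo
                              "clip_merges.txt",       # merges file from the C++ demo
                              max_length=77,           # CLIP context length used above
                              pad_to_max_length=True)  # assumed keyword, mirroring the C++ flag
output = tokenizer.encode("a photo of an astronaut riding a horse on mars")
print("ids: ", output.ids)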
@@ -17,5 +17,5 @@ include(${FAST_TOKENIZER_INSTALL_DIR}/FastTokenizer.cmake)

include_directories(${FAST_TOKENIZER_INCS})

-add_executable(ernie_fast_tokenizer_demo ${PROJECT_SOURCE_DIR}/ernie_fast_tokenizer_demo.cc)
-target_link_libraries(ernie_fast_tokenizer_demo ${FAST_TOKENIZER_LIBS})
+add_executable(demo ${PROJECT_SOURCE_DIR}/demo.cc)
+target_link_libraries(demo ${FAST_TOKENIZER_LIBS})
Empty file.
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

-#include "fast_tokenizer/tokenizers/ernie_fast_tokenizer.h"
#include <iostream>
#include <vector>
+#include "fast_tokenizer/tokenizers/ernie_fast_tokenizer.h"
using namespace paddlenlp;

int main() {
26 changes: 26 additions & 0 deletions fast_tokenizer/examples/ernie-3.0/python/demo.py
@@ -0,0 +1,26 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import fast_tokenizer
from fast_tokenizer import ErnieFastTokenizer, models

# Use a single tokenization thread.
fast_tokenizer.set_thread_num(1)
# Load the ERNIE vocabulary as a WordPiece model.
vocab = models.WordPiece.read_file("ernie_vocab.txt")
# Build the ERNIE fast tokenizer (note: this rebinds the name fast_tokenizer
# from the imported module to the tokenizer instance).
fast_tokenizer = ErnieFastTokenizer(vocab)
# Encode a single sentence ("I love China") and inspect the result.
output = fast_tokenizer.encode("我爱中国")
print("ids: ", output.ids)
print("type_ids: ", output.type_ids)
print("tokens: ", output.tokens)
print("offsets: ", output.offsets)
print("attention_mask: ", output.attention_mask)