-
Notifications
You must be signed in to change notification settings - Fork 34
add emotion_detection and update senta #62
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,23 @@ | ||
| model_type: "bow_net" | ||
| num_labels: 3 | ||
| vocab_size: 240465 | ||
| vocab_path: "./data/vocab.txt" | ||
| data_dir: "./data" | ||
| inference_model_dir: "./inference_model" | ||
| save_checkpoint_dir: "" | ||
| init_checkpoint: "" | ||
| checkpoints: "./checkpoints/" | ||
| lr: 0.02 | ||
| epoch: 10 | ||
| batch_size: 24 | ||
| do_train: True | ||
| do_val: True | ||
| do_infer: False | ||
| do_save_inference_model: False | ||
| max_seq_len: 20 | ||
| skip_steps: 10 | ||
| save_freq: 1 | ||
| eval_freq: 1 | ||
| random_seed: 0 | ||
| output_dir: "./output" | ||
| use_cuda: True |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,123 @@ | ||
| # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
|
|
||
| """ | ||
| Download script, download dataset and pretrain models. | ||
| """ | ||
|
|
||
| from __future__ import absolute_import | ||
| from __future__ import division | ||
| from __future__ import print_function | ||
|
|
||
| import io | ||
| import os | ||
| import sys | ||
| import time | ||
| import hashlib | ||
| import tarfile | ||
| import requests | ||
|
|
||
|
|
||
def usage():
    """Print a short help message describing how to invoke this script."""
    lines = [
        "",
        "Download datasets and pretrained models for EmotionDetection task.",
        "Usage:",
        " python download.py dataset",
        "",
    ]
    print("\n".join(lines))
|
|
||
|
|
||
def md5file(fname):
    """Return the hex MD5 digest of the file at *fname*, read in 4 KB chunks."""
    digest = hashlib.md5()
    with io.open(fname, "rb") as handle:
        chunk = handle.read(4096)
        while chunk:
            digest.update(chunk)
            chunk = handle.read(4096)
    return digest.hexdigest()
|
|
||
|
|
||
def extract(fname, dir_path):
    """
    Extract a .tar.gz archive into *dir_path*, printing each member name.

    Raises:
        tarfile.TarError / OSError: propagated unchanged on failure.
    """
    # The original wrapped this in `try/except Exception as e: raise e`, which
    # only truncated the traceback and left the tar handle open on error; a
    # context manager guarantees the archive is closed either way.
    with tarfile.open(fname, "r:gz") as tar:
        for member_name in tar.getnames():
            tar.extract(member_name, dir_path)
            print(member_name)
|
|
||
|
|
||
def download(url, filename, md5sum):
    """
    Download *url* to *filename*, streaming by chunk, and verify its MD5.

    Keeps retrying (up to 3 attempts) until the file exists with the expected
    checksum; raises RuntimeError when the retry budget is exhausted.
    """
    retry = 0
    retry_limit = 3
    chunk_size = 4096
    while not (os.path.exists(filename) and md5file(filename) == md5sum):
        if retry < retry_limit:
            retry += 1
        else:
            raise RuntimeError("Cannot download dataset ({0}) with retry {1} times.".
                               format(url, retry_limit))
        try:
            start = time.time()
            size = 0
            res = requests.get(url, stream=True)
            if res.status_code != 200:
                # Previously a non-200 response was skipped silently, leaving
                # the user with only the final RuntimeError; report it here.
                print("[Error   ]: HTTP %d for %s" % (res.status_code, url))
                continue
            # content-length may be absent; fall back to 0 and skip the
            # progress bar instead of raising KeyError / ZeroDivisionError.
            filesize = int(res.headers.get('content-length', 0))
            if filesize:
                print("[Filesize]: %0.2f MB" % (filesize / 1024 / 1024))
            # save by chunk
            with io.open(filename, "wb") as fout:
                for chunk in res.iter_content(chunk_size=chunk_size):
                    if chunk:
                        fout.write(chunk)
                        size += len(chunk)
                        if filesize:
                            pr = '>' * int(size * 50 / filesize)
                            print('\r[Process ]: %s%.2f%%' % (pr, float(size / filesize * 100)), end='')
            end = time.time()
            print("\n[CostTime]: %.2f s" % (end - start))
        except (requests.RequestException, IOError) as e:
            # Narrowed from `except Exception`: transient network/file errors
            # are logged and retried; programming errors now propagate.
            print(e)
|
|
||
|
|
||
def download_dataset(dir_path):
    """Fetch, verify, and unpack the EmotionDetection dataset under *dir_path*."""
    archive_name = "emotion_detection-dataset-1.0.0.tar.gz"
    archive_md5 = "512d256add5f9ebae2c101b74ab053e9"
    url = "https://baidu-nlp.bj.bcebos.com/" + archive_name
    archive_path = os.path.join(dir_path, archive_name)

    if not os.path.exists(dir_path):
        os.makedirs(dir_path)
    # Pull the archive down; download() checks the MD5 and retries on failure.
    print("Downloading dataset: %s" % url)
    download(url, archive_path, archive_md5)
    # Unpack in place, then remove the archive to save disk space.
    print("Extracting dataset: %s" % archive_path)
    extract(archive_path, dir_path)
    os.remove(archive_path)
|
|
||
if __name__ == '__main__':
    # Exactly one positional argument is expected.
    if len(sys.argv) != 2:
        usage()
        sys.exit(1)

    command = sys.argv[1]
    if command != "dataset":
        usage()
    else:
        # Download into the directory containing this script.
        target_dir = os.path.join(os.path.dirname(__file__), './')
        download_dataset(target_dir)
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,8 @@ | ||
#!/bin/bash

# Fetch the EmotionDetection dataset archive, unpack it here, then drop the
# archive. (--no-check-certificate mirrors the original script; the host's
# TLS certificate is not verified.)
DATA_URL=https://baidu-nlp.bj.bcebos.com/emotion_detection-dataset-1.0.0.tar.gz
wget --no-check-certificate ${DATA_URL}

tar xvf emotion_detection-dataset-1.0.0.tar.gz
/bin/rm emotion_detection-dataset-1.0.0.tar.gz
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,179 @@ | ||
| # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. | ||
| # | ||
| # Licensed under the Apache License, Version 2.0 (the "License"); | ||
| # you may not use this file except in compliance with the License. | ||
| # You may obtain a copy of the License at | ||
| # | ||
| # http://www.apache.org/licenses/LICENSE-2.0 | ||
| # | ||
| # Unless required by applicable law or agreed to in writing, software | ||
| # distributed under the License is distributed on an "AS IS" BASIS, | ||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
| import paddle.fluid as fluid | ||
| from paddle.fluid.dygraph.nn import Linear, Embedding | ||
| # from paddle.fluid.dygraph.base import to_variable | ||
| import numpy as np | ||
| from hapi.model import Model | ||
| from hapi.text.text import GRUEncoderLayer as BiGRUEncoder | ||
| from hapi.text.text import BOWEncoder, CNNEncoder, GRUEncoder, LSTMEncoder | ||
|
|
||
class CNN(Model):
    """Text-CNN classifier: CNNEncoder followed by two fully-connected layers."""

    def __init__(self, dict_dim, seq_len):
        super(CNN, self).__init__()
        self.dict_dim = dict_dim
        self.emb_dim = 128
        self.hid_dim = 128
        self.fc_hid_dim = 96
        self.class_dim = 3
        self.channels = 1
        self.win_size = [3, self.hid_dim]
        self.seq_len = seq_len
        self._encoder = CNNEncoder(
            dict_size=self.dict_dim + 1,
            emb_dim=self.emb_dim,
            seq_len=self.seq_len,
            filter_size=self.win_size,
            num_filters=self.hid_dim,
            hidden_dim=self.hid_dim,
            padding_idx=None,
            act='tanh')
        # NOTE(review): softmax on this hidden layer is unusual (the sibling
        # models use tanh here) — kept as-is to preserve behavior; confirm
        # with the authors whether tanh was intended.
        self._fc1 = Linear(
            input_dim=self.hid_dim * self.seq_len,
            output_dim=self.fc_hid_dim,
            act="softmax")
        self._fc_prediction = Linear(
            input_dim=self.fc_hid_dim,
            output_dim=self.class_dim,
            act="softmax")

    def forward(self, inputs):
        """Encode token ids with the CNN and return per-class probabilities."""
        encoded = self._encoder(inputs)
        hidden = self._fc1(encoded)
        return self._fc_prediction(hidden)
|
|
||
|
|
||
class BOW(Model):
    """Bag-of-words classifier: BOWEncoder, tanh squash, then two FC layers."""

    def __init__(self, dict_dim, seq_len):
        super(BOW, self).__init__()
        self.dict_dim = dict_dim
        self.emb_dim = 128
        self.hid_dim = 128
        self.fc_hid_dim = 96
        self.class_dim = 3
        self.seq_len = seq_len
        self._encoder = BOWEncoder(
            dict_size=self.dict_dim + 1,
            emb_dim=self.emb_dim,
            padding_idx=None,
            bow_dim=self.hid_dim,
            seq_len=self.seq_len)
        self._fc1 = Linear(input_dim=self.hid_dim, output_dim=self.hid_dim, act="tanh")
        self._fc2 = Linear(input_dim=self.hid_dim, output_dim=self.fc_hid_dim, act="tanh")
        self._fc_prediction = Linear(
            input_dim=self.fc_hid_dim,
            output_dim=self.class_dim,
            act="softmax")

    def forward(self, inputs):
        """Pool the embeddings, squash with tanh, and classify."""
        pooled = fluid.layers.tanh(self._encoder(inputs))
        hidden = self._fc2(self._fc1(pooled))
        return self._fc_prediction(hidden)
|
|
||
|
|
||
class GRU(Model):
    """Unidirectional GRU classifier over token embeddings."""

    def __init__(self, dict_dim, seq_len):
        super(GRU, self).__init__()
        self.dict_dim = dict_dim
        self.emb_dim = 128
        self.hid_dim = 128
        self.fc_hid_dim = 96
        self.class_dim = 3
        self.seq_len = seq_len
        self._fc1 = Linear(input_dim=self.hid_dim, output_dim=self.fc_hid_dim, act="tanh")
        self._fc_prediction = Linear(
            input_dim=self.fc_hid_dim,
            output_dim=self.class_dim,
            act="softmax")
        self._encoder = GRUEncoder(
            dict_size=self.dict_dim + 1,
            emb_dim=self.emb_dim,
            gru_dim=self.hid_dim,
            hidden_dim=self.hid_dim,
            padding_idx=None,
            seq_len=self.seq_len)

    def forward(self, inputs):
        """Encode the sequence with the GRU and map to class probabilities."""
        encoded = self._encoder(inputs)
        hidden = self._fc1(encoded)
        return self._fc_prediction(hidden)
|
|
||
|
|
||
class BiGRU(Model):
    """Bidirectional GRU classifier with max-pooling over the time axis."""

    def __init__(self, dict_dim, batch_size, seq_len):
        super(BiGRU, self).__init__()
        self.dict_dim = dict_dim
        self.emb_dim = 128
        self.hid_dim = 128
        self.fc_hid_dim = 96
        self.class_dim = 3
        self.batch_size = batch_size
        self.seq_len = seq_len
        self.embedding = Embedding(
            size=[self.dict_dim + 1, self.emb_dim],
            dtype='float32',
            param_attr=fluid.ParamAttr(learning_rate=30),
            is_sparse=False)
        # _fc1 projects to hid_dim * 3, matching the encoder's input_dim below.
        self._fc1 = Linear(input_dim=self.hid_dim, output_dim=self.hid_dim * 3)
        self._fc2 = Linear(input_dim=self.hid_dim * 2, output_dim=self.fc_hid_dim, act="tanh")
        self._fc_prediction = Linear(
            input_dim=self.fc_hid_dim,
            output_dim=self.class_dim,
            act="softmax")
        self._encoder = BiGRUEncoder(
            grnn_hidden_dim=self.hid_dim,
            input_dim=self.hid_dim * 3,
            init_bound=0.1,
            is_bidirection=True)

    def forward(self, inputs):
        """Embed, run the bi-GRU, max-pool over time, then classify."""
        emb = self.embedding(inputs)
        # NOTE(review): the fixed self.batch_size in this reshape prevents
        # variable batch sizes (e.g. a final partial batch) — kept as-is to
        # preserve behavior; see the PR discussion about making it dynamic.
        emb = fluid.layers.reshape(emb, shape=[self.batch_size, -1, self.hid_dim])
        projected = self._fc1(emb)
        encoded = fluid.layers.tanh(self._encoder(projected))
        pooled = fluid.layers.reduce_max(encoded, dim=1)
        return self._fc_prediction(self._fc2(pooled))
|
|
||
class LSTM(Model):
    """Unidirectional LSTM classifier over token embeddings."""

    def __init__(self, dict_dim, seq_len):
        super(LSTM, self).__init__()
        # BUG FIX: the original assignments all ended with trailing commas,
        # turning every attribute into a 1-tuple (e.g. self.emb_dim == (128,)),
        # which breaks the LSTMEncoder/Linear constructors below.
        self.seq_len = seq_len
        self.dict_dim = dict_dim
        self.emb_dim = 128
        self.hid_dim = 128
        self.fc_hid_dim = 96
        self.class_dim = 3
        self.emb_lr = 30.0
        self._encoder = LSTMEncoder(
            dict_size=self.dict_dim + 1,
            emb_dim=self.emb_dim,
            lstm_dim=self.hid_dim,
            hidden_dim=self.hid_dim,
            seq_len=self.seq_len,
            padding_idx=None,
            is_reverse=False)

        self._fc1 = Linear(input_dim=self.hid_dim, output_dim=self.fc_hid_dim, act="tanh")
        self._fc_prediction = Linear(
            input_dim=self.fc_hid_dim,
            output_dim=self.class_dim,
            act="softmax")

    def forward(self, inputs):
        """Encode the sequence with the LSTM and map to class probabilities."""
        emb = self._encoder(inputs)
        fc_1 = self._fc1(emb)
        prediction = self._fc_prediction(fc_1)
        return prediction
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
这里是否可以使用现有 download.py 中的
get_path_from_url呢