This repository was archived by the owner on Jan 24, 2024. It is now read-only.
23 changes: 23 additions & 0 deletions examples/emotion_detection/config.yaml
@@ -0,0 +1,23 @@
model_type: "bow_net"
num_labels: 3
vocab_size: 240465
vocab_path: "./data/vocab.txt"
data_dir: "./data"
inference_model_dir: "./inference_model"
save_checkpoint_dir: ""
init_checkpoint: ""
checkpoints: "./checkpoints/"
lr: 0.02
epoch: 10
batch_size: 24
do_train: True
do_val: True
do_infer: False
do_save_inference_model: False
max_seq_len: 20
skip_steps: 10
save_freq: 1
eval_freq: 1
random_seed: 0
output_dir: "./output"
use_cuda: True
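
For reference, a minimal sketch of how a flat YAML config like this could be read into an attribute-style object. PyYAML and the load_config helper below are assumptions for illustration; the training script that actually consumes config.yaml is not part of this diff.

from types import SimpleNamespace

import yaml  # PyYAML, assumed to be available


def load_config(path="./config.yaml"):
    """Read a flat YAML file and expose its keys as attributes."""
    with open(path, "r", encoding="utf-8") as f:
        cfg = yaml.safe_load(f)
    return SimpleNamespace(**cfg)


args = load_config()
print(args.model_type, args.vocab_size, args.max_seq_len)  # bow_net 240465 20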
123 changes: 123 additions & 0 deletions examples/emotion_detection/download.py
@@ -0,0 +1,123 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Download script: downloads the dataset and pretrained models.
"""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import io
import os
import sys
import time
import hashlib
import tarfile
import requests


def usage():
desc = ("\nDownload datasets and pretrained models for EmotionDetection task.\n"
"Usage:\n"
" python download.py dataset\n")
print(desc)


def md5file(fname):
hash_md5 = hashlib.md5()
with io.open(fname, "rb") as fin:
for chunk in iter(lambda: fin.read(4096), b""):
hash_md5.update(chunk)
return hash_md5.hexdigest()


def extract(fname, dir_path):
"""
Extract tar.gz file
"""
try:
tar = tarfile.open(fname, "r:gz")
file_names = tar.getnames()
for file_name in file_names:
tar.extract(file_name, dir_path)
print(file_name)
tar.close()
except Exception as e:
raise e


def download(url, filename, md5sum):
"""
Download file and check md5
"""

Collaborator: Could the existing get_path_from_url in download.py be used here?
retry = 0
retry_limit = 3
chunk_size = 4096
while not (os.path.exists(filename) and md5file(filename) == md5sum):
if retry < retry_limit:
retry += 1
else:
raise RuntimeError("Cannot download dataset ({0}) with retry {1} times.".
format(url, retry_limit))
try:
start = time.time()
size = 0
res = requests.get(url, stream=True)
filesize = int(res.headers['content-length'])
if res.status_code == 200:
print("[Filesize]: %0.2f MB" % (filesize / 1024 / 1024))
# save by chunk
with io.open(filename, "wb") as fout:
for chunk in res.iter_content(chunk_size=chunk_size):
if chunk:
fout.write(chunk)
size += len(chunk)
pr = '>' * int(size * 50 / filesize)
print('\r[Progress]: %s%.2f%%' % (pr, float(size / filesize * 100)), end='')
end = time.time()
print("\n[CostTime]: %.2f s" % (end - start))
except Exception as e:
print(e)


def download_dataset(dir_path):
BASE_URL = "https://baidu-nlp.bj.bcebos.com/"
DATASET_NAME = "emotion_detection-dataset-1.0.0.tar.gz"
DATASET_MD5 = "512d256add5f9ebae2c101b74ab053e9"
file_path = os.path.join(dir_path, DATASET_NAME)
url = BASE_URL + DATASET_NAME

if not os.path.exists(dir_path):
os.makedirs(dir_path)
# download dataset
print("Downloading dataset: %s" % url)
download(url, file_path, DATASET_MD5)
# extract dataset
print("Extracting dataset: %s" % file_path)
extract(file_path, dir_path)
os.remove(file_path)

if __name__ == '__main__':
if len(sys.argv) != 2:
usage()
sys.exit(1)

if sys.argv[1] == "dataset":
pwd = os.path.join(os.path.dirname(__file__), './')
download_dataset(pwd)
else:
usage()
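
To illustrate the reviewer's suggestion above, here is a rough sketch of delegating the download to the shared helper. The import path and exact behavior of get_path_from_url (download, md5 check, extraction) are assumptions based on the review comment, not something this diff confirms.

import os

from hapi.download import get_path_from_url  # assumed import path for the shared helper


def download_dataset_with_helper(dir_path):
    # Assumed signature: get_path_from_url(url, root_dir, md5sum) downloads the
    # archive, verifies its md5 and extracts it under root_dir.
    url = "https://baidu-nlp.bj.bcebos.com/emotion_detection-dataset-1.0.0.tar.gz"
    md5 = "512d256add5f9ebae2c101b74ab053e9"
    return get_path_from_url(url, os.path.abspath(dir_path), md5)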

8 changes: 8 additions & 0 deletions examples/emotion_detection/download_data.sh
@@ -0,0 +1,8 @@
#!/bin/bash

# download dataset file to ./data/
DATA_URL=https://baidu-nlp.bj.bcebos.com/emotion_detection-dataset-1.0.0.tar.gz
wget --no-check-certificate ${DATA_URL}

tar xvf emotion_detection-dataset-1.0.0.tar.gz
/bin/rm emotion_detection-dataset-1.0.0.tar.gz
179 changes: 179 additions & 0 deletions examples/emotion_detection/models.py
@@ -0,0 +1,179 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.fluid as fluid
from paddle.fluid.dygraph.nn import Linear, Embedding
# from paddle.fluid.dygraph.base import to_variable
import numpy as np
from hapi.model import Model
from hapi.text.text import GRUEncoderLayer as BiGRUEncoder
from hapi.text.text import BOWEncoder, CNNEncoder, GRUEncoder, LSTMEncoder

class CNN(Model):
def __init__(self, dict_dim, seq_len):
super(CNN, self).__init__()
self.dict_dim = dict_dim
self.emb_dim = 128
self.hid_dim = 128
self.fc_hid_dim = 96
self.class_dim = 3
self.channels = 1
self.win_size = [3, self.hid_dim]
self.seq_len = seq_len
self._encoder = CNNEncoder(
dict_size=self.dict_dim + 1,
emb_dim=self.emb_dim,
seq_len=self.seq_len,
Collaborator: I see that seq_len has to be set here; a fixed seq_len may cause some problems. This comes from the CNNEncoder API, and @jinyuKING is already updating that interface in #60, so this may need to be adjusted again later.
filter_size=self.win_size,
num_filters=self.hid_dim,
hidden_dim=self.hid_dim,
padding_idx=None,
act='tanh')
self._fc1 = Linear(input_dim=self.hid_dim * self.seq_len, output_dim=self.fc_hid_dim, act="tanh")
self._fc_prediction = Linear(input_dim=self.fc_hid_dim,
output_dim=self.class_dim,
act="softmax")

def forward(self, inputs):
conv_3 = self._encoder(inputs)
fc_1 = self._fc1(conv_3)
prediction = self._fc_prediction(fc_1)
return prediction


class BOW(Model):
def __init__(self, dict_dim, seq_len):
super(BOW, self).__init__()
self.dict_dim = dict_dim
self.emb_dim = 128
self.hid_dim = 128
self.fc_hid_dim = 96
self.class_dim = 3
self.seq_len = seq_len
self._encoder = BOWEncoder(
dict_size=self.dict_dim + 1,
emb_dim=self.emb_dim,
padding_idx=None,
bow_dim=self.hid_dim,
seq_len=self.seq_len)
Collaborator: Same as above; @jinyuKING is already updating that interface in #60, so this may need to be adjusted again later.
self._fc1 = Linear(input_dim=self.hid_dim, output_dim=self.hid_dim, act="tanh")
self._fc2 = Linear(input_dim=self.hid_dim, output_dim=self.fc_hid_dim, act="tanh")
self._fc_prediction = Linear(input_dim = self.fc_hid_dim,
output_dim = self.class_dim,
act="softmax")

def forward(self, inputs):
bow_1 = self._encoder(inputs)
bow_1 = fluid.layers.tanh(bow_1)
fc_1 = self._fc1(bow_1)
fc_2 = self._fc2(fc_1)
prediction = self._fc_prediction(fc_2)
return prediction


class GRU(Model):
def __init__(self, dict_dim, seq_len):
super(GRU, self).__init__()
self.dict_dim = dict_dim
self.emb_dim = 128
Collaborator: It would be better to make these parameters configurable; this can be considered later.
self.hid_dim = 128
self.fc_hid_dim = 96
self.class_dim = 3
self.seq_len = seq_len
self._fc1 = Linear(input_dim=self.hid_dim, output_dim=self.fc_hid_dim, act="tanh")
self._fc_prediction = Linear(input_dim=self.fc_hid_dim,
Collaborator: CNN, BOW, and the LSTM below all contain _fc1 and _fc_prediction as well; consider factoring the different feature-extraction parts out later.
output_dim=self.class_dim,
act="softmax")
self._encoder = GRUEncoder(
Collaborator: The interface added in #60 will be removed later; unified interfaces will be provided for GRU and LSTM. For GRU, the same interface as in BiGRU below can be used.
dict_size=self.dict_dim + 1,
emb_dim=self.emb_dim,
gru_dim=self.hid_dim,
hidden_dim=self.hid_dim,
padding_idx=None,
seq_len=self.seq_len)

def forward(self, inputs):
emb = self._encoder(inputs)
fc_1 = self._fc1(emb)
prediction = self._fc_prediction(fc_1)
return prediction


class BiGRU(Model):
def __init__(self, dict_dim, batch_size, seq_len):
super(BiGRU, self).__init__()
self.dict_dim = dict_dim
self.emb_dim = 128
self.hid_dim = 128
self.fc_hid_dim = 96
self.class_dim = 3
self.batch_size = batch_size
self.seq_len = seq_len
self.embedding = Embedding(
size=[self.dict_dim + 1, self.emb_dim],
dtype='float32',
param_attr=fluid.ParamAttr(learning_rate=30),
is_sparse=False)
# h_0 = np.zeros((self.batch_size, self.hid_dim), dtype="float32")
# h_0 = to_variable(h_0)
self._fc1 = Linear(input_dim=self.hid_dim, output_dim=self.hid_dim * 3)
self._fc2 = Linear(input_dim=self.hid_dim * 2, output_dim=self.fc_hid_dim, act="tanh")
self._fc_prediction = Linear(input_dim=self.fc_hid_dim,
output_dim=self.class_dim,
act="softmax")
self._encoder = BiGRUEncoder(
grnn_hidden_dim=self.hid_dim,
input_dim=self.hid_dim * 3,
# h_0=h_0,
init_bound=0.1,
is_bidirection=True)

def forward(self, inputs):
emb = self.embedding(inputs)
emb = fluid.layers.reshape(emb, shape=[self.batch_size, -1, self.hid_dim])
Collaborator: Like seq_len, batch_size should eventually be allowed to vary between batches rather than being fixed. If inputs have shape [batch_size, seq_len], this reshape and the batch_size parameter should not be needed.
fc_1 = self._fc1(emb)
encoded_vector = self._encoder(fc_1)
encoded_vector = fluid.layers.tanh(encoded_vector)
encoded_vector = fluid.layers.reduce_max(encoded_vector, dim=1)
fc_2 = self._fc2(encoded_vector)
prediction = self._fc_prediction(fc_2)
return prediction

class LSTM(Model):
def __init__(self, dict_dim, seq_len):
super(LSTM, self).__init__()
self.seq_len = seq_len
self.dict_dim = dict_dim
self.emb_dim = 128
self.hid_dim = 128
self.fc_hid_dim = 96
self.class_dim = 3
self.emb_lr = 30.0
self._encoder = LSTMEncoder(
dict_size=dict_dim + 1,
emb_dim=self.emb_dim,
lstm_dim=self.hid_dim,
hidden_dim=self.hid_dim,
seq_len=self.seq_len,
padding_idx=None,
is_reverse=False)

self._fc1 = Linear(input_dim=self.hid_dim, output_dim=self.fc_hid_dim, act="tanh")
self._fc_prediction = Linear(input_dim=self.fc_hid_dim,
output_dim=self.class_dim,
act="softmax")
def forward(self, inputs):
emb = self._encoder(inputs)
fc_1 = self._fc1(emb)
prediction = self._fc_prediction(fc_1)
return prediction
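
As a quick usage sketch, the models above can be constructed from the values in config.yaml. The surrounding hapi training loop is not part of this diff, so only construction is shown; the variable names are illustrative.

from models import BOW, CNN, BiGRU  # this file, examples/emotion_detection/models.py

vocab_size = 240465   # vocab_size from config.yaml
max_seq_len = 20      # max_seq_len from config.yaml
batch_size = 24       # batch_size from config.yaml

bow_model = BOW(dict_dim=vocab_size, seq_len=max_seq_len)
cnn_model = CNN(dict_dim=vocab_size, seq_len=max_seq_len)
bigru_model = BiGRU(dict_dim=vocab_size, batch_size=batch_size, seq_len=max_seq_len)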