
Commit

new_tagspace
yinhaofeng committed Jan 19, 2021
1 parent 923f599 commit 388beb7
Showing 15 changed files with 230 additions and 723 deletions.
13 changes: 0 additions & 13 deletions models/contentunderstanding/tagspace/__init__.py

This file was deleted.

66 changes: 12 additions & 54 deletions models/contentunderstanding/tagspace/config.yaml
@@ -12,39 +12,30 @@
# See the License for the specific language governing permissions and
# limitations under the License.

workspace: "models/contentunderstanding/tagspace"

dygraph:
runner:
train_data_dir: "data/train_data"
train_reader_path: "reader" # importlib format
train_batch_size: 10
model_save_path: "increment"

use_gpu: False
batch_size_train: 10
batch_size_infer: 10
epochs: 1
print_interval: 1
# model_init_path: "output_model/0" # init model
model_save_path: "increment"

test_data_dir: "data/test_data"
infer_reader_path: "reader" # importlib format
infer_batch_size: 10
infer_load_path: "increment"
infer_start_epoch: -1
infer_start_epoch: 0
infer_end_epoch: 1


dataset:
- name: sample_1
type: DataLoader
batch_size: 10
data_path: "{workspace}/data/train_data"
data_converter: "{workspace}/reader.py"
- name: inferdata
type: DataLoader
batch_size: 10
data_path: "{workspace}/data/test_data"
data_converter: "{workspace}/reader.py"

# hyper parameters of user-defined network
hyper_parameters:
# optimizer config
optimizer:
class: Adagrad
learning_rate: 0.001
# user-defined <key, value> pairs
vocab_text_size: 75378
vocab_tag_size: 4
emb_dim: 10
@@ -54,36 +45,3 @@ hyper_parameters:
neg_size: 3
num_devices: 1
text_len: 45

mode: [runner1,infer_runner]

runner:
- name: runner1
class: train
epochs: 1
device: cpu
save_checkpoint_interval: 1
save_inference_interval: 1
save_checkpoint_path: "increment"
save_inference_path: "inference"
save_inference_feed_varnames: []
save_inference_fetch_varnames: []
print_interval: 1
phases: phase1
- name: infer_runner
class: infer
# device to run training or infer
device: cpu
print_interval: 1
init_model_path: "increment/0" # load model path
phases: phase_infer

phase:
- name: phase1
model: "{workspace}/model.py"
dataset_name: sample_1
thread_num: 1
- name: phase_infer
model: "{workspace}/model.py"
dataset_name: inferdata
thread_num: 1
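
Note: the new-style runner and hyper_parameters keys above are read by the dygraph model added later in this commit through dotted paths such as config.get("runner.train_batch_size"). As a rough illustration only (PaddleRec ships its own config utilities; this helper and the hard-coded path are assumptions, not part of the commit), the lookup could be served like this:

import yaml

def load_flat_config(path):
    # flatten nested YAML keys into dotted form, e.g. {"runner": {"epochs": 1}}
    # becomes {"runner.epochs": 1}, matching the config.get(...) calls used below
    with open(path) as f:
        raw = yaml.safe_load(f)
    flat = {}
    def walk(node, prefix=""):
        for key, value in node.items():
            name = prefix + key
            if isinstance(value, dict):
                walk(value, name + ".")
            else:
                flat[name] = value
    walk(raw)
    return flat

config = load_flat_config("models/contentunderstanding/tagspace/config.yaml")
print(config.get("runner.train_batch_size"),      # 10
      config.get("hyper_parameters.neg_size"))    # 3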
66 changes: 12 additions & 54 deletions models/contentunderstanding/tagspace/config_bigdata.yaml
@@ -12,39 +12,30 @@
# See the License for the specific language governing permissions and
# limitations under the License.

workspace: "models/contentunderstanding/tagspace"

dygraph:
runner:
train_data_dir: "data/train_big_data"
train_reader_path: "reader" # importlib format
train_batch_size: 128
model_save_path: "increment"

use_gpu: False
batch_size_train: 128
batch_size_infer: 500
epochs: 1
print_interval: 1
# model_init_path: "output_model/0" # init model
model_save_path: "increment"

test_data_dir: "data/test_big_data"
infer_reader_path: "reader" # importlib format
infer_batch_size: 500
infer_load_path: "increment"
infer_start_epoch: -1
infer_start_epoch: 0
infer_end_epoch: 1


dataset:
- name: sample_1
type: DataLoader
batch_size: 128
data_path: "{workspace}/data/train_big_data"
data_converter: "{workspace}/reader.py"
- name: inferdata
type: DataLoader
batch_size: 500
data_path: "{workspace}/data/test_big_data"
data_converter: "{workspace}/reader.py"

# hyper parameters of user-defined network
hyper_parameters:
# optimizer config
optimizer:
class: Adagrad
learning_rate: 0.001
# user-defined <key, value> pairs
vocab_text_size: 75378
vocab_tag_size: 4
emb_dim: 10
@@ -54,36 +45,3 @@ hyper_parameters:
neg_size: 3
num_devices: 1
text_len: 45

mode: [runner1, infer_runner]

runner:
- name: runner1
class: train
epochs: 1
device: cpu
save_checkpoint_interval: 1
save_inference_interval: 1
save_checkpoint_path: "increment"
save_inference_path: "inference"
save_inference_feed_varnames: []
save_inference_fetch_varnames: []
print_interval: 1
phases: phase1
- name: infer_runner
class: infer
# device to run training or infer
device: cpu
print_interval: 1
init_model_path: "increment/0" # load model path
phases: phase_infer

phase:
- name: phase1
model: "{workspace}/model.py"
dataset_name: sample_1
thread_num: 1
- name: phase_infer
model: "{workspace}/model.py"
dataset_name: inferdata
thread_num: 1
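
Note: config_bigdata.yaml mirrors config.yaml and differs only in the data directories and batch sizes. A quick, illustrative way to confirm that (reusing the flattening idea from the previous sketch; again not part of the commit) is:

import yaml

def flatten(node, prefix=""):
    # flatten nested YAML keys into dotted form for easy comparison
    flat = {}
    for key, value in node.items():
        name = prefix + key
        if isinstance(value, dict):
            flat.update(flatten(value, name + "."))
        else:
            flat[name] = value
    return flat

with open("models/contentunderstanding/tagspace/config.yaml") as f:
    small = flatten(yaml.safe_load(f))
with open("models/contentunderstanding/tagspace/config_bigdata.yaml") as f:
    big = flatten(yaml.safe_load(f))

for key in sorted(set(small) | set(big)):
    if small.get(key) != big.get(key):
        # expected output: the train/test data dirs and train/infer batch sizes
        print(key, small.get(key), "->", big.get(key))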
113 changes: 113 additions & 0 deletions models/contentunderstanding/tagspace/dygraph_model.py
@@ -0,0 +1,113 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle
import paddle.nn as nn
import paddle.nn.functional as F
import math

import net


class DygraphModel():
# define model
def create_model(self, config):
vocab_text_size = config.get("hyper_parameters.vocab_text_size")
vocab_tag_size = config.get("hyper_parameters.vocab_tag_size")
emb_dim = config.get("hyper_parameters.emb_dim")
hid_dim = config.get("hyper_parameters.hid_dim")
win_size = config.get("hyper_parameters.win_size")
margin = config.get("hyper_parameters.margin")
neg_size = config.get("hyper_parameters.neg_size")
text_len = config.get("hyper_parameters.text_len")

tagspace_model = net.TagspaceLayer(vocab_text_size, vocab_tag_size,
emb_dim, hid_dim, win_size, margin,
neg_size, text_len)
return tagspace_model

# define feeds that convert the numpy batch data into paddle tensors
def create_feeds(self, batch_data, text_len, neg_size):
text = paddle.to_tensor(batch_data[0].numpy().astype('int64').reshape(
-1, text_len))
pos_tag = paddle.to_tensor(batch_data[1].numpy().astype('int64')
.reshape(-1, 1))
neg_tag = paddle.to_tensor(batch_data[2].numpy().astype('int64')
.reshape(-1, neg_size))
return [text, pos_tag, neg_tag]

# define the loss function from predictions and labels
def create_loss(self, batch_size, margin, cos_pos, cos_neg):
loss_part1 = paddle.subtract(
paddle.full(
shape=[batch_size, 1], fill_value=margin, dtype='float32'),
cos_pos)
loss_part2 = paddle.add(loss_part1, cos_neg)
loss_part3 = paddle.maximum(
paddle.full(
shape=[batch_size, 1], fill_value=0.0, dtype='float32'),
loss_part2)
avg_cost = paddle.mean(loss_part3)
return avg_cost

# define optimizer
def create_optimizer(self, dy_model, config):
lr = config.get("hyper_parameters.optimizer.learning_rate", 0.001)
optimizer = paddle.optimizer.Adagrad(
learning_rate=lr, parameters=dy_model.parameters())
return optimizer

# define metrics such as auc/acc
# multi-task models need to define multiple metrics

def get_acc(self, x, y, batch_size):
less = paddle.cast(paddle.less_than(x, y), dtype='float32')
label_ones = paddle.full(
dtype='float32', shape=[batch_size, 1], fill_value=1.0)
correct = paddle.sum(less)
total = paddle.sum(label_ones)
acc = paddle.divide(correct, total)
return acc

def create_metrics(self):
metrics_list_name = []
metrics_list = []
return metrics_list, metrics_list_name

# construct train forward phase
def train_forward(self, dy_model, metrics_list, batch_data, config):
neg_size = config.get("hyper_parameters.neg_size")
text_len = config.get("hyper_parameters.text_len")
margin = config.get("hyper_parameters.margin")
batch_size = config.get("runner.train_batch_size", 128)
inputs = self.create_feeds(batch_data, text_len, neg_size)

cos_pos, cos_neg = dy_model.forward(inputs)
loss = self.create_loss(batch_size, margin, cos_pos, cos_neg)
# update metrics
acc = self.get_acc(cos_neg, cos_pos, batch_size)
print_dict = {"loss": loss, "ACC": acc}
return loss, metrics_list, print_dict

def infer_forward(self, dy_model, metrics_list, batch_data, config):
neg_size = config.get("hyper_parameters.neg_size")
text_len = config.get("hyper_parameters.text_len")
batch_size = config.get("runner.infer_batch_size", 128)
inputs = self.create_feeds(batch_data, text_len, neg_size)

cos_pos, cos_neg = dy_model.forward(inputs)
# update metrics
acc = self.get_acc(cos_neg, cos_pos, batch_size)
print_dict = {"ACC": acc}
return metrics_list, print_dict
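
Note: create_loss above implements the margin ranking loss mean(max(0, margin - cos_pos + cos_neg)), and get_acc counts how often the negative tag scores below the positive one. A toy run of the same ops (a sketch assuming PaddlePaddle 2.x; the similarity values are made up for illustration):

import paddle

batch_size, margin = 4, 0.1
cos_pos = paddle.to_tensor([[0.9], [0.2], [0.7], [0.4]], dtype='float32')
cos_neg = paddle.to_tensor([[0.1], [0.5], [0.3], [0.6]], dtype='float32')

# margin ranking loss: mean(max(0, margin - cos_pos + cos_neg))
loss_part1 = paddle.subtract(
    paddle.full(shape=[batch_size, 1], fill_value=margin, dtype='float32'),
    cos_pos)
loss_part2 = paddle.add(loss_part1, cos_neg)
loss = paddle.mean(paddle.maximum(
    paddle.full(shape=[batch_size, 1], fill_value=0.0, dtype='float32'),
    loss_part2))

# accuracy: fraction of rows where the negative score is below the positive one
less = paddle.cast(paddle.less_than(cos_neg, cos_pos), dtype='float32')
acc = paddle.divide(paddle.sum(less), paddle.to_tensor(float(batch_size)))
print(float(loss), float(acc))  # roughly 0.175 and 0.5 for the values above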
