Skip to content

Commit

Permalink
merge opts.
Browse files: browse the repository at this point in the history
  • Loading branch information
zhezhaoa committed Dec 5, 2020
1 parent d33d9b2 commit 93a95c0
Show file tree
Hide file tree
Showing 11 changed files with 144 additions and 434 deletions.
34 changes: 4 additions & 30 deletions inference/run_c3_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,45 +18,19 @@
from uer.utils.tokenizer import *
from uer.utils.config import load_hyperparam
from uer.model_loader import load_model
from uer.opts import infer_opts
from run_classifier import batch_loader
from run_c3 import MultipleChoice, read_dataset


def main():
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

# Path options.
parser.add_argument("--load_model_path", default=None, type=str,
help="Path of the multiple choice model.")
parser.add_argument("--vocab_path", type=str, required=True,
help="Path of the vocabulary file.")
parser.add_argument("--spm_model_path", default=None, type=str,
help="Path of the sentence piece model.")
parser.add_argument("--test_path", type=str,
help="Path of the testset.")
parser.add_argument("--prediction_path", default=None, type=str,
help="Path of the prediction file.")
parser.add_argument("--config_path", default="./models/bert_base_config.json", type=str,
help="Path of the config file.")

# Model options.
parser.add_argument("--batch_size", type=int, default=32,
help="Batch size.")
parser.add_argument("--seq_length", type=int, default=512,
help="Sequence length.")
infer_opts(parser)

parser.add_argument("--max_choices_num", default=4, type=int,
help="The maximum number of cadicate answer, shorter than this will be padded.")
parser.add_argument("--embedding", choices=["bert", "word"], default="bert",
help="Emebdding type.")
parser.add_argument("--encoder", choices=["bert", "lstm", "gru", \
"cnn", "gatedcnn", "attn", "synt", \
"rcnn", "crnn", "gpt", "bilstm"], \
default="bert", help="Encoder type.")
parser.add_argument("--bidirectional", action="store_true", help="Specific to recurrent model.")
parser.add_argument("--factorized_embedding_parameterization", action="store_true", help="Factorized embedding parameterization.")
parser.add_argument("--parameter_sharing", action="store_true", help="Parameter sharing.")

# Tokenizer options.

parser.add_argument("--tokenizer", choices=["bert", "char", "space"], default="bert",
help="Specify the tokenizer."
"Original Google BERT uses bert tokenizer on Chinese corpus."
Expand Down
41 changes: 4 additions & 37 deletions inference/run_chid_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from uer.utils.tokenizer import *
from uer.utils.config import load_hyperparam
from uer.model_loader import load_model
from uer.opts import infer_opts
from run_classifier import batch_loader
from run_c3 import MultipleChoice
from run_chid import read_dataset
Expand Down Expand Up @@ -51,52 +52,18 @@ def postprocess_chid_predictions(results):
def main():
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

# Path options.
parser.add_argument("--load_model_path", default=None, type=str,
help="Path of the multiple choice model.")
parser.add_argument("--vocab_path", type=str, required=True,
help="Path of the vocabulary file.")
parser.add_argument("--spm_model_path", default=None, type=str,
help="Path of the sentence piece model.")
parser.add_argument("--test_path", type=str,
help="Path of the testset.")
parser.add_argument("--prediction_path", default=None, type=str,
help="Path of the prediction file.")
parser.add_argument("--config_path", default="./models/bert_base_config.json", type=str,
help="Path of the config file.")

# Model options.
parser.add_argument("--batch_size", type=int, default=32,
help="Batch size.")
parser.add_argument("--seq_length", type=int, default=64,
help="Sequence length.")
infer_opts(parser)

parser.add_argument("--max_choices_num", default=10, type=int,
help="The maximum number of cadicate answer, shorter than this will be padded.")
parser.add_argument("--embedding", choices=["bert", "word"], default="bert",
help="Emebdding type.")
parser.add_argument("--encoder", choices=["bert", "lstm", "gru", \
"cnn", "gatedcnn", "attn", "synt", \
"rcnn", "crnn", "gpt", "bilstm"], \
default="bert", help="Encoder type.")
parser.add_argument("--bidirectional", action="store_true", help="Specific to recurrent model.")
parser.add_argument("--factorized_embedding_parameterization", action="store_true", help="Factorized embedding parameterization.")
parser.add_argument("--parameter_sharing", action="store_true", help="Parameter sharing.")

# Tokenizer options.
parser.add_argument("--tokenizer", choices=["bert", "char", "space"], default="char",
help="Specify the tokenizer."
"Original Google BERT uses bert tokenizer on Chinese corpus."
"Char tokenizer segments sentences into characters."
"Space tokenizer segments sentences into words according to space."
)

args = parser.parse_args()

# Load the hyperparameters from the config file.
args = load_hyperparam(args)

# Build tokenizer.
args.tokenizer = globals()[args.tokenizer.capitalize() + "Tokenizer"](args)
args.tokenizer = CharTokenizer(args)

# Build classification model and load parameters.
model = MultipleChoice(args)
Expand Down
38 changes: 6 additions & 32 deletions inference/run_classifier_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
from uer.utils.config import load_hyperparam
from uer.utils.seed import set_seed
from uer.model_loader import load_model
from uer.opts import infer_opts
from run_classifier import Classifier


Expand Down Expand Up @@ -69,48 +70,21 @@ def read_dataset(args, path):
def main():
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

# Path options.
parser.add_argument("--load_model_path", default=None, type=str,
help="Path of the classfier model.")
parser.add_argument("--vocab_path", default=None, type=str,
help="Path of the vocabulary file.")
parser.add_argument("--spm_model_path", default=None, type=str,
help="Path of the sentence piece model.")
parser.add_argument("--test_path", type=str,
help="Path of the testset.")
parser.add_argument("--prediction_path", default=None, type=str,
help="Path of the prediction file.")
parser.add_argument("--config_path", default="./models/bert_base_config.json", type=str,
help="Path of the config file.")

# Model options.
parser.add_argument("--batch_size", type=int, default=128,
help="Batch size.")
parser.add_argument("--seq_length", type=int, default=128,
help="Sequence length.")
parser.add_argument("--labels_num", type=int, required=True,
help="Number of prediction labels.")
parser.add_argument("--embedding", choices=["bert", "word"], default="bert",
help="Emebdding type.")
parser.add_argument("--encoder", choices=["bert", "lstm", "gru", \
"cnn", "gatedcnn", "attn", "synt", \
"rcnn", "crnn", "gpt", "bilstm"], \
default="bert", help="Encoder type.")
parser.add_argument("--bidirectional", action="store_true", help="Specific to recurrent model.")
infer_opts(parser)

parser.add_argument("--pooling", choices=["mean", "max", "first", "last"], default="first",
help="Pooling type.")
parser.add_argument("--factorized_embedding_parameterization", action="store_true", help="Factorized embedding parameterization.")
parser.add_argument("--parameter_sharing", action="store_true", help="Parameter sharing.")

# Tokenizer options.
parser.add_argument("--labels_num", type=int, required=True,
help="Number of prediction labels.")

parser.add_argument("--tokenizer", choices=["bert", "char", "space"], default="bert",
help="Specify the tokenizer."
"Original Google BERT uses bert tokenizer on Chinese corpus."
"Char tokenizer segments sentences into characters."
"Space tokenizer segments sentences into words according to space."
)

# Output options.
parser.add_argument("--output_logits", action="store_true", help="Write logits to output file.")
parser.add_argument("--output_prob", action="store_true", help="Write probabilities to output file.")

Expand Down
32 changes: 3 additions & 29 deletions inference/run_cmrc_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,43 +17,16 @@
from uer.utils.tokenizer import *
from uer.utils.vocab import Vocab
from uer.model_loader import load_model
from uer.opts import infer_opts
from run_cmrc import *


def main():
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

# Path options.
parser.add_argument("--load_model_path", default=None, type=str,
help="Path of the classfier model.")
parser.add_argument("--vocab_path", default=None, type=str,
help="Path of the vocabulary file.")
parser.add_argument("--spm_model_path", default=None, type=str,
help="Path of the sentence piece model.")
parser.add_argument("--test_path", type=str,
help="Path of the testset.")
parser.add_argument("--prediction_path", default=None, type=str,
help="Path of the prediction file.")
parser.add_argument("--config_path", default="./models/bert_base_config.json", type=str,
help="Path of the config file.")

# Model options.
parser.add_argument("--batch_size", type=int, default=64,
help="Batch size.")
parser.add_argument("--seq_length", type=int, default=512,
help="Sequence length.")
infer_opts(parser)
parser.add_argument("--doc_stride", default=128, type=int,
help="When splitting up a long document into chunks, how much stride to take between chunks.")
parser.add_argument("--embedding", choices=["bert", "word"], default="bert",
help="Emebdding type.")
parser.add_argument("--encoder", choices=["bert", "lstm", "gru", \
"cnn", "gatedcnn", "attn", "synt", \
"rcnn", "crnn", "gpt", "bilstm"], \
default="bert", help="Encoder type.")
parser.add_argument("--bidirectional", action="store_true", help="Specific to recurrent model.")
parser.add_argument("--factorized_embedding_parameterization", action="store_true", help="Factorized embedding parameterization.")
parser.add_argument("--parameter_sharing", action="store_true", help="Parameter sharing.")


args = parser.parse_args()

Expand Down Expand Up @@ -121,5 +94,6 @@ def main():

f.write(json.dumps(output, indent=4, ensure_ascii=False) + "\n")


if __name__ == "__main__":
main()
33 changes: 4 additions & 29 deletions inference/run_ner_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from uer.utils.tokenizer import *
from uer.utils.vocab import Vocab
from uer.model_loader import load_model
from uer.opts import infer_opts
from run_ner import NerTagger


Expand Down Expand Up @@ -59,37 +60,11 @@ def batch_loader(batch_size, src, seg):
def main():
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

# Path options.
parser.add_argument("--load_model_path", default=None, type=str,
help="Path of the NER model.")
parser.add_argument("--vocab_path", default=None, type=str,
help="Path of the vocabulary file.")
parser.add_argument("--spm_model_path", default=None, type=str,
help="Path of the sentence piece model.")
parser.add_argument("--test_path", type=str,
help="Path of the testset.")
parser.add_argument("--prediction_path", default=None, type=str,
help="Path of the prediction file.")
parser.add_argument("--config_path", default="./models/bert_base_config.json", type=str,
help="Path of the config file.")
infer_opts(parser)

parser.add_argument("--label2id_path", type=str, required=True,
help="Path of the label2id file.")

# Model options.
parser.add_argument("--batch_size", type=int, default=128,
help="Batch_size.")
parser.add_argument("--seq_length", default=128, type=int,
help="Sequence length.")
parser.add_argument("--embedding", choices=["bert", "word"], default="bert",
help="Emebdding type.")
parser.add_argument("--encoder", choices=["bert", "lstm", "gru", \
"cnn", "gatedcnn", "attn", "synt", \
"rcnn", "crnn", "gpt", "bilstm"], \
default="bert", help="Encoder type.")
parser.add_argument("--bidirectional", action="store_true", help="Specific to recurrent model.")
parser.add_argument("--factorized_embedding_parameterization", action="store_true", help="Factorized embedding parameterization.")
parser.add_argument("--parameter_sharing", action="store_true", help="Parameter sharing.")

args = parser.parse_args()

# Load the hyperparameters of the config file.
Expand Down Expand Up @@ -153,7 +128,7 @@ def main():
for label_id in pred[j: j+seq_length_batch[j//args.seq_length]]:
f.write(i2l[label_id] + " ")
f.write("\n")


if __name__ == "__main__":
main()
68 changes: 8 additions & 60 deletions run_chid.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from uer.utils.config import load_hyperparam
from uer.utils.seed import set_seed
from uer.model_saver import save_model
from uer.opts import finetune_opts
from run_c3 import MultipleChoice
from run_classifier import build_optimizer, load_or_initialize_parameters, train_model, batch_loader, evaluate

Expand Down Expand Up @@ -128,82 +129,29 @@ def read_dataset(args, data_path, answer_path):
def main():
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

# Path options.
parser.add_argument("--pretrained_model_path", default=None, type=str,
help="Path of the pretrained model.")
parser.add_argument("--output_model_path", default="./models/multichoice_model.bin", type=str,
help="Path of the output model.")
parser.add_argument("--vocab_path", default=None, type=str,
help="Path of the vocabulary file.")
parser.add_argument("--spm_model_path", default=None, type=str,
help="Path of the sentence piece model.")
parser.add_argument("--train_path", type=str, required=True,
help="Path of the trainset.")
finetune_opts(parser)

parser.add_argument("--train_answer_path", type=str, required=True,
help="Path of the answers for trainset.")
parser.add_argument("--dev_path", type=str, required=True,
help="Path of the devset.")
parser.add_argument("--dev_answer_path", type=str, required=True,
help="Path of the answers for devset.")
parser.add_argument("--config_path", default="./models/bert_base_config.json", type=str,
help="Path of the config file.")

# Model options.
parser.add_argument("--batch_size", type=int, default=8,
help="Batch size.")
parser.add_argument("--seq_length", type=int, default=64,
help="Sequence length.")
parser.add_argument("--embedding", choices=["bert", "word"], default="bert",
help="Emebdding type.")
parser.add_argument("--encoder", choices=["bert", "lstm", "gru", \
"cnn", "gatedcnn", "attn", "synt", \
"rcnn", "crnn", "gpt", "bilstm"], \
default="bert", help="Encoder type.")
parser.add_argument("--bidirectional", action="store_true", help="Specific to recurrent model.")
parser.add_argument("--factorized_embedding_parameterization", action="store_true", help="Factorized embedding parameterization.")
parser.add_argument("--parameter_sharing", action="store_true", help="Parameter sharing.")

parser.add_argument("--max_choices_num", default=10, type=int,
help="The maximum number of cadicate answer, shorter than this will be padded.")

# Tokenizer options.
parser.add_argument("--tokenizer", choices=["bert", "char", "space"], default="char",
help="Specify the tokenizer."
"Original Google BERT uses bert tokenizer on Chinese corpus."
"Char tokenizer segments sentences into characters."
"Space tokenizer segments sentences into words according to space."
)

# Optimizer options.
parser.add_argument("--learning_rate", type=float, default=2e-5,
help="Learning rate.")
parser.add_argument("--warmup", type=float, default=0.1,
help="Warm up value.")
parser.add_argument("--fp16", action='store_true',
help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit.")
parser.add_argument("--fp16_opt_level", choices=["O0", "O1", "O2", "O3" ], default='O1',
help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
"See details at https://nvidia.github.io/apex/amp.html")

# Training options.
parser.add_argument("--dropout", type=float, default=0.2,
help="Dropout.")
parser.add_argument("--epochs_num", type=int, default=8,
help="Number of epochs.")
parser.add_argument("--report_steps", type=int, default=100,
help="Specific steps to print prompt.")
parser.add_argument("--seed", type=int, default=7,
help="Random seed.")

args = parser.parse_args()

args.labels_num = args.max_choices_num
if args.output_model_path == None:
args.output_model_path = "./models/chid_model.bin"

# Load the hyperparameters from the config file.
args = load_hyperparam(args)

set_seed(args.seed)

# Build tokenizer.
args.tokenizer = globals()[args.tokenizer.capitalize() + "Tokenizer"](args)
args.tokenizer = CharTokenizer(args)

# Build multiple choice model.
model = MultipleChoice(args)
Expand Down
Loading

0 comments on commit 93a95c0

Please sign in to comment.