Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix typos #7687

Merged
merged 1 commit into from
Dec 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,9 @@
data_g.add_arg("test_set", str, None, "Path to test data.")
data_g.add_arg("dev_set", str, None, "Path to validation data.")
data_g.add_arg("vocab_path", str, None, "Vocabulary path.")
-data_g.add_arg("max_seq_len", int, 512, "Number of words of the longest seqence.")
-data_g.add_arg("q_max_seq_len", int, 32, "Number of words of the longest seqence.")
-data_g.add_arg("p_max_seq_len", int, 256, "Number of words of the longest seqence.")
+data_g.add_arg("max_seq_len", int, 512, "Number of words of the longest sequence.")
+data_g.add_arg("q_max_seq_len", int, 32, "Number of words of the longest sequence.")
+data_g.add_arg("p_max_seq_len", int, 256, "Number of words of the longest sequence.")
data_g.add_arg("train_data_size", int, 0, "Number of training data's total examples. Set for distribute.")
data_g.add_arg("batch_size", int, 32, "Total examples' number in batch for training. see also --in_tokens.")
data_g.add_arg("predict_batch_size", int, None, "Total examples' number in batch for predict. see also --in_tokens.")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def __init__(self, pretrained_model, dropout=None, margin=0.3, scale=30, output_

self.margin = margin
# Used scaling cosine similarity to ease converge
-self.sacle = scale
+self.scale = scale

def forward(
self,
Expand All @@ -48,15 +48,15 @@ def forward(

cosine_sim = paddle.matmul(query_cls_embedding, title_cls_embedding, transpose_y=True)

-# Substract margin from all positive samples cosine_sim()
+# Subtract margin from all positive samples cosine_sim()
margin_diag = paddle.full(
shape=[query_cls_embedding.shape[0]], fill_value=self.margin, dtype=paddle.get_default_dtype()
)

cosine_sim = cosine_sim - paddle.diag(margin_diag)

# Scale cosine to ease training converge
-cosine_sim *= self.sacle
+cosine_sim *= self.scale

labels = paddle.arange(0, query_cls_embedding.shape[0], dtype="int64")
labels = paddle.reshape(labels, shape=[-1, 1])
Expand All @@ -71,7 +71,7 @@ def __init__(self, pretrained_model, dropout=None, margin=0.3, scale=30, output_
super().__init__(pretrained_model, dropout, output_emb_size)
self.margin = margin
# Used scaling cosine similarity to ease converge
-self.sacle = scale
+self.scale = scale

def forward(
self,
Expand All @@ -95,13 +95,13 @@ def forward(

cosine_sim = paddle.matmul(query_cls_embedding, title_cls_embedding, transpose_y=True)

-# Substract margin from all positive samples cosine_sim()
+# Subtract margin from all positive samples cosine_sim()
margin_diag = paddle.full(shape=[query_cls_embedding.shape[0]], fill_value=self.margin, dtype=cosine_sim.dtype)

cosine_sim = cosine_sim - paddle.diag(margin_diag)

# Scale cosine to ease training converge
-cosine_sim *= self.sacle
+cosine_sim *= self.scale

labels = paddle.arange(0, query_cls_embedding.shape[0], dtype="int64")
labels = paddle.reshape(labels, shape=[-1, 1])
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,7 @@ def predict(self, data, tokenizer):
args.enable_mkldnn,
)

-# ErnieTinyTokenizer is special for ernie-tiny pretained model.
+# ErnieTinyTokenizer is special for ernie-tiny pretrained model.
output_emb_size = 256
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
id2corpus = {0: "国有企业引入非国有资本对创新绩效的影响——基于制造业国有上市公司的经验证据"}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@

id2corpus = gen_id2corpus(args.corpus_file)

-# conver_example function's input must be dict
+# convert_example function's input must be dict
corpus_list = [{idx: text} for idx, text in id2corpus.items()]
corpus_ds = MapDataset(corpus_list)

Expand Down
2 changes: 1 addition & 1 deletion applications/neural_search/recall/simcse/export_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
# yapf: enable

if __name__ == "__main__":
-# If you want to use ernie1.0 model, plesace uncomment the following code
+# If you want to use ernie1.0 model, please uncomment the following code
output_emb_size = 256

pretrained_model = AutoModel.from_pretrained(args.model_name_or_path)
Expand Down
2 changes: 1 addition & 1 deletion applications/neural_search/recall/simcse/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ def convert_example(example, tokenizer, max_seq_length=512, do_evalute=False):
else:
raise ValueError("Please set --params_path with correct pretrained model file")

-# conver_example function's input must be dict
+# convert_example function's input must be dict
corpus_list = [{idx: text} for idx, text in id2corpus.items()]
corpus_ds = MapDataset(corpus_list)

Expand Down
4 changes: 2 additions & 2 deletions applications/neural_search/recall/simcse/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def __init__(self, pretrained_model, dropout=None, margin=0.0, scale=20, output_

self.margin = margin
# Used scaling cosine similarity to ease converge
-self.sacle = scale
+self.scale = scale

@paddle.jit.to_static(
input_spec=[
Expand Down Expand Up @@ -130,7 +130,7 @@ def forward(
cosine_sim = cosine_sim - paddle.diag(margin_diag)

# scale cosine to ease training converge
-cosine_sim *= self.sacle
+cosine_sim *= self.scale

labels = paddle.arange(0, query_cls_embedding.shape[0], dtype="int64")
labels = paddle.reshape(labels, shape=[-1, 1])
Expand Down
2 changes: 1 addition & 1 deletion applications/neural_search/recall/simcse/recall.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@

id2corpus = gen_id2corpus(args.corpus_file)

-# conver_example function's input must be dict
+# convert_example function's input must be dict
corpus_list = [{idx: text} for idx, text in id2corpus.items()]
corpus_ds = MapDataset(corpus_list)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def __init__(self, pretrained_model, dropout=None, margin=0.0, scale=20, output_

self.margin = margin
# Used scaling cosine similarity to ease converge
-self.sacle = scale
+self.scale = scale
self.classifier = nn.Linear(output_emb_size, 2)
self.rdrop_loss = paddlenlp.losses.RDropLoss()

Expand Down Expand Up @@ -133,7 +133,7 @@ def forward(
cosine_sim = cosine_sim - paddle.diag(margin_diag)

# scale cosine to ease training converge
-cosine_sim *= self.sacle
+cosine_sim *= self.scale

labels = paddle.arange(0, query_cls_embedding.shape[0], dtype="int64")
labels = paddle.reshape(labels, shape=[-1, 1])
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ def batchify_fn(

id2corpus = gen_id2corpus(args.corpus_file)

-# conver_example function's input must be dict
+# convert_example function's input must be dict
corpus_list = [{idx: text} for idx, text in id2corpus.items()]
corpus_ds = MapDataset(corpus_list)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
# yapf: enable

if __name__ == "__main__":
-# If you want to use ernie1.0 model, plesace uncomment the following code
+# If you want to use ernie1.0 model, please uncomment the following code
output_emb_size = 256

pretrained_model = AutoModel.from_pretrained("ernie-3.0-medium-zh")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def __init__(self, pretrained_model, dropout=None, margin=0.0, scale=20, output_

self.margin = margin
# Used scaling cosine similarity to ease converge
-self.sacle = scale
+self.scale = scale

@paddle.jit.to_static(
input_spec=[
Expand Down Expand Up @@ -125,7 +125,7 @@ def forward(
cosine_sim = cosine_sim - paddle.diag(margin_diag)

# scale cosine to ease training converge
-cosine_sim *= self.sacle
+cosine_sim *= self.scale

labels = paddle.arange(0, query_cls_embedding.shape[0], dtype="int64")
labels = paddle.reshape(labels, shape=[-1, 1])
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ def batchify_fn(

id2corpus = gen_id2corpus(args.corpus_file)

-# conver_example function's input must be dict
+# convert_example function's input must be dict
corpus_list = [{idx: text} for idx, text in id2corpus.items()]
corpus_ds = MapDataset(corpus_list)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
# fmt: on

if __name__ == "__main__":
-# If you want to use ernie1.0 model, plesace uncomment the following code
+# If you want to use ernie1.0 model, please uncomment the following code
pretrained_model = AutoModel.from_pretrained(args.model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
model = SemanticIndexBaseStatic(pretrained_model, output_emb_size=args.output_emb_size)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def __init__(self, pretrained_model, dropout=None, margin=0.3, scale=30, output_

self.margin = margin
# Used scaling cosine similarity to ease converge
-self.sacle = scale
+self.scale = scale

def forward(
self,
Expand Down Expand Up @@ -55,7 +55,7 @@ def forward(
cosine_sim = cosine_sim - paddle.diag(margin_diag)

# scale cosine to ease training converge
-cosine_sim *= self.sacle
+cosine_sim *= self.scale

labels = paddle.arange(0, query_cls_embedding.shape[0], dtype="int64")
labels = paddle.reshape(labels, shape=[-1, 1])
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@
else:
raise ValueError("Please set --params_path with correct pretrained model file")
id2corpus = gen_id2corpus(args.corpus_file)
-# conver_example function's input must be dict
+# convert_example function's input must be dict
corpus_list = [{idx: text} for idx, text in id2corpus.items()]
corpus_ds = MapDataset(corpus_list)
corpus_data_loader = create_dataloader(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -203,7 +203,7 @@ def do_train():
if args.evaluate:
eval_func = partial(convert_example, tokenizer=tokenizer, max_seq_length=args.max_seq_length)
id2corpus = gen_id2corpus(args.corpus_file)
-# conver_example function's input must be dict
+# convert_example function's input must be dict
corpus_list = [{idx: text} for idx, text in id2corpus.items()]
corpus_ds = MapDataset(corpus_list)
corpus_data_loader = create_dataloader(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ def __init__(self, pretrained_model, dropout=None, margin=0.3, scale=30, output_

self.margin = margin
# Used scaling cosine similarity to ease converge
-self.sacle = scale
+self.scale = scale

def forward(
self,
Expand Down Expand Up @@ -55,7 +55,7 @@ def forward(
cosine_sim = cosine_sim - paddle.diag(margin_diag)

# scale cosine to ease training converge
-cosine_sim *= self.sacle
+cosine_sim *= self.scale

labels = paddle.arange(0, query_cls_embedding.shape[0], dtype="int64")
labels = paddle.reshape(labels, shape=[-1, 1])
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@
else:
raise ValueError("Please set --params_path with correct pretrained model file")
id2corpus = gen_id2corpus(args.corpus_file)
-# conver_example function's input must be dict
+# convert_example function's input must be dict
corpus_list = [{idx: text} for idx, text in id2corpus.items()]
corpus_ds = MapDataset(corpus_list)
corpus_data_loader = create_dataloader(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ def do_train():
if args.evaluate:
eval_func = partial(convert_example, tokenizer=tokenizer, max_seq_length=args.max_seq_length)
id2corpus = gen_id2corpus(args.corpus_file)
-# conver_example function's input must be dict
+# convert_example function's input must be dict
corpus_list = [{idx: text} for idx, text in id2corpus.items()]
corpus_ds = MapDataset(corpus_list)
corpus_data_loader = create_dataloader(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def __init__(self, pretrained_model, dropout=None, margin=0.3, scale=30, output_

self.margin = margin
# Used scaling cosine similarity to ease converge
-self.sacle = scale
+self.scale = scale

def forward(
self,
Expand All @@ -48,15 +48,15 @@ def forward(

cosine_sim = paddle.matmul(query_cls_embedding, title_cls_embedding, transpose_y=True)

-# Substract margin from all positive samples cosine_sim()
+# Subtract margin from all positive samples cosine_sim()
margin_diag = paddle.full(
shape=[query_cls_embedding.shape[0]], fill_value=self.margin, dtype=paddle.get_default_dtype()
)

cosine_sim = cosine_sim - paddle.diag(margin_diag)

# Scale cosine to ease training converge
-cosine_sim *= self.sacle
+cosine_sim *= self.scale

labels = paddle.arange(0, query_cls_embedding.shape[0], dtype="int64")
labels = paddle.reshape(labels, shape=[-1, 1])
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@
else:
raise ValueError("Please set --params_path with correct pretrained model file")
id2corpus = gen_id2corpus(args.corpus_file)
-# conver_example function's input must be dict
+# convert_example function's input must be dict
corpus_list = [{idx: text} for idx, text in id2corpus.items()]
corpus_ds = MapDataset(corpus_list)
corpus_data_loader = create_dataloader(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,7 @@ def do_train():
eval_func = partial(convert_example, tokenizer=tokenizer, max_seq_length=args.max_seq_length)
id2corpus = gen_id2corpus(args.corpus_file)
label2id = label2ids(args.corpus_file)
-# conver_example function's input must be dict
+# convert_example function's input must be dict
corpus_list = [{idx: text} for idx, text in id2corpus.items()]
corpus_ds = MapDataset(corpus_list)
corpus_data_loader = create_dataloader(
Expand Down
2 changes: 1 addition & 1 deletion examples/semantic_indexing/recall.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,7 @@

id2corpus = gen_id2corpus(args.corpus_file)

-# conver_example function's input must be dict
+# convert_example function's input must be dict
corpus_list = [{idx: text} for idx, text in id2corpus.items()]
corpus_ds = MapDataset(corpus_list)

Expand Down
2 changes: 1 addition & 1 deletion examples/semantic_indexing/run_ann_data_gen.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ def build_data_loader(args, tokenizer):

id2corpus = gen_id2corpus(args.corpus_file)

-# conver_example function's input must be dict
+# convert_example function's input must be dict
corpus_list = [{idx: text} for idx, text in id2corpus.items()]
corpus_ds = MapDataset(corpus_list)

Expand Down