Description
If a candidate sentence is the empty string "", it should be given a score of 0, but instead an error is raised.
I ran this statement:
sol = score([""], ["Hello World."], model_type=None, num_layers=None, verbose=True,
idf=True, device=None, batch_size=64, nthreads=4, all_layers=False,
lang="en", return_hash=False, rescale_with_baseline=True)
This is the output I got:
preparing IDF dict...
done in 0.64 seconds
calculating scores...
computing bert embedding.
0%|          | 0/1 [00:00<?, ?it/s]
IndexError Traceback (most recent call last)
<ipython-input> in <module>()
3 sol = score([""], ["Hello World."], model_type=None, num_layers=None, verbose=True,
4 idf=True, device=None, batch_size=64, nthreads=4, all_layers=False,
----> 5 lang="en", return_hash=False, rescale_with_baseline=True)
10 frames
/usr/local/lib/python3.6/dist-packages/bert_score/score.py in score(cands, refs, model_type, num_layers, verbose, idf, device, batch_size, nthreads, all_layers, lang, return_hash, rescale_with_baseline)
110 all_preds = bert_cos_score_idf(model, refs, cands, tokenizer, idf_dict,
111 verbose=verbose, device=device,
--> 112 batch_size=batch_size, all_layers=all_layers).cpu()
113
114 if ref_group_boundaries is not None:
/usr/local/lib/python3.6/dist-packages/bert_score/utils.py in bert_cos_score_idf(model, refs, hyps, tokenizer, idf_dict, verbose, batch_size, device, all_layers)
365 sen_batch = sentences[batch_start:batch_start+batch_size]
366 embs, masks, padded_idf = get_bert_embedding(sen_batch, model, tokenizer, idf_dict,
--> 367 device=device, all_layers=all_layers)
368 embs = embs.cpu()
369 masks = masks.cpu()
/usr/local/lib/python3.6/dist-packages/bert_score/utils.py in get_bert_embedding(all_sens, model, tokenizer, idf_dict, batch_size, device, all_layers)
235 tokenizer,
236 idf_dict,
--> 237 device=device)
238
239 if batch_size == -1: batch_size = len(all_sens)
/usr/local/lib/python3.6/dist-packages/bert_score/utils.py in collate_idf(arr, tokenizer, idf_dict, device)
202         - :param: device (str): device to use, e.g. 'cpu' or 'cuda'
203 """
--> 204 arr = [sent_encode(tokenizer, a) for a in arr]
205
206 idf_weights = [[idf_dict[i] for i in a] for a in arr]
/usr/local/lib/python3.6/dist-packages/bert_score/utils.py in <listcomp>(.0)
202         - :param: device (str): device to use, e.g. 'cpu' or 'cuda'
203 """
--> 204 arr = [sent_encode(tokenizer, a) for a in arr]
205
206 idf_weights = [[idf_dict[i] for i in a] for a in arr]
/usr/local/lib/python3.6/dist-packages/bert_score/utils.py in sent_encode(tokenizer, sent)
81 return tokenizer.encode(sent.strip(), add_special_tokens=True,
82 add_prefix_space=True,
---> 83 max_length=tokenizer.max_len)
84 else:
85 return tokenizer.encode(sent.strip(), add_special_tokens=True,
/usr/local/lib/python3.6/dist-packages/transformers/tokenization_utils.py in encode(self, text, text_pair, add_special_tokens, max_length, stride, truncation_strategy, pad_to_max_length, return_tensors, **kwargs)
1421 pad_to_max_length=pad_to_max_length,
1422 return_tensors=return_tensors,
-> 1423 **kwargs,
1424 )
1425
/usr/local/lib/python3.6/dist-packages/transformers/tokenization_utils.py in encode_plus(self, text, text_pair, add_special_tokens, max_length, stride, truncation_strategy, pad_to_max_length, is_pretokenized, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, **kwargs)
1563 )
1564
-> 1565 first_ids = get_input_ids(text)
1566 second_ids = get_input_ids(text_pair) if text_pair is not None else None
1567
/usr/local/lib/python3.6/dist-packages/transformers/tokenization_utils.py in get_input_ids(text)
1535 def get_input_ids(text):
1536 if isinstance(text, str):
-> 1537 tokens = self.tokenize(text, add_special_tokens=add_special_tokens, **kwargs)
1538 return self.convert_tokens_to_ids(tokens)
1539 elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
/usr/local/lib/python3.6/dist-packages/transformers/tokenization_utils.py in tokenize(self, text, **kwargs)
1259 """
1260 all_special_tokens = self.all_special_tokens
-> 1261 text = self.prepare_for_tokenization(text, **kwargs)
1262
1263 # TODO: should this be in the base class?
/usr/local/lib/python3.6/dist-packages/transformers/tokenization_roberta.py in prepare_for_tokenization(self, text, add_special_tokens, **kwargs)
237 else:
238 add_prefix_space = add_special_tokens
--> 239 if add_prefix_space and not text[0].isspace():
240 text = " " + text
241 return text
IndexError: string index out of range
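The traceback bottoms out in transformers' RoBERTa tokenizer: prepare_for_tokenization evaluates text[0].isspace(), which raises IndexError because the candidate string is empty. Until empty candidates are handled inside bert_score itself, one possible caller-side workaround is to filter them out and assign them a score of 0 directly, which is the behavior described above. A minimal sketch (the wrapper name and the zero-fill convention are my own assumptions, not part of the library):

import torch
from bert_score import score

def score_allowing_empty(cands, refs, **kwargs):
    # Workaround sketch: drop empty candidates before calling bert_score.score()
    # and give them P = R = F1 = 0, as the issue expects.
    # Assumes return_hash=False, so score() returns (P, R, F1) tensors.
    keep = [i for i, c in enumerate(cands) if c.strip()]
    P = torch.zeros(len(cands))
    R = torch.zeros(len(cands))
    F1 = torch.zeros(len(cands))
    if keep:
        p, r, f1 = score([cands[i] for i in keep],
                         [refs[i] for i in keep], **kwargs)
        idx = torch.tensor(keep)
        P[idx] = p
        R[idx] = r
        F1[idx] = f1
    return P, R, F1

Note that with idf=True the IDF statistics are then computed only from the kept references, so the remaining scores may differ slightly from a run over the full list.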