Skip to content

If the candidate sentence string is empty, I get an error. #47

Closed
@codewithkaranjeswani

Description

@codewithkaranjeswani

If the candidate sentence string is "", it should give a score of 0, but instead it raises an error.

I ran this statement:
sol = score([""], ["Hello World."], model_type=None, num_layers=None, verbose=True,
idf=True, device=None, batch_size=64, nthreads=4, all_layers=False,
lang="en", return_hash=False, rescale_with_baseline=True)

This is the output I got:
preparing IDF dict...
done in 0.64 seconds
calculating scores...
computing bert embedding.
0%
0/1 [00:00<?, ?it/s]
IndexError Traceback (most recent call last)
in ()
3 sol = score([""], ["Hello World."], model_type=None, num_layers=None, verbose=True,
4 idf=True, device=None, batch_size=64, nthreads=4, all_layers=False,
----> 5 lang="en", return_hash=False, rescale_with_baseline=True)

10 frames
/usr/local/lib/python3.6/dist-packages/bert_score/score.py in score(cands, refs, model_type, num_layers, verbose, idf, device, batch_size, nthreads, all_layers, lang, return_hash, rescale_with_baseline)
110 all_preds = bert_cos_score_idf(model, refs, cands, tokenizer, idf_dict,
111 verbose=verbose, device=device,
--> 112 batch_size=batch_size, all_layers=all_layers).cpu()
113
114 if ref_group_boundaries is not None:

/usr/local/lib/python3.6/dist-packages/bert_score/utils.py in bert_cos_score_idf(model, refs, hyps, tokenizer, idf_dict, verbose, batch_size, device, all_layers)
365 sen_batch = sentences[batch_start:batch_start+batch_size]
366 embs, masks, padded_idf = get_bert_embedding(sen_batch, model, tokenizer, idf_dict,
--> 367 device=device, all_layers=all_layers)
368 embs = embs.cpu()
369 masks = masks.cpu()

/usr/local/lib/python3.6/dist-packages/bert_score/utils.py in get_bert_embedding(all_sens, model, tokenizer, idf_dict, batch_size, device, all_layers)
235 tokenizer,
236 idf_dict,
--> 237 device=device)
238
239 if batch_size == -1: batch_size = len(all_sens)

/usr/local/lib/python3.6/dist-packages/bert_score/utils.py in collate_idf(arr, tokenizer, idf_dict, device)
202 - :param: device (str): device to use, e.g. 'cpu' or 'cuda'
203 """
--> 204 arr = [sent_encode(tokenizer, a) for a in arr]
205
206 idf_weights = [[idf_dict[i] for i in a] for a in arr]

/usr/local/lib/python3.6/dist-packages/bert_score/utils.py in (.0)
202 - :param: device (str): device to use, e.g. 'cpu' or 'cuda'
203 """
--> 204 arr = [sent_encode(tokenizer, a) for a in arr]
205
206 idf_weights = [[idf_dict[i] for i in a] for a in arr]

/usr/local/lib/python3.6/dist-packages/bert_score/utils.py in sent_encode(tokenizer, sent)
81 return tokenizer.encode(sent.strip(), add_special_tokens=True,
82 add_prefix_space=True,
---> 83 max_length=tokenizer.max_len)
84 else:
85 return tokenizer.encode(sent.strip(), add_special_tokens=True,

/usr/local/lib/python3.6/dist-packages/transformers/tokenization_utils.py in encode(self, text, text_pair, add_special_tokens, max_length, stride, truncation_strategy, pad_to_max_length, return_tensors, **kwargs)
1421 pad_to_max_length=pad_to_max_length,
1422 return_tensors=return_tensors,
-> 1423 **kwargs,
1424 )
1425

/usr/local/lib/python3.6/dist-packages/transformers/tokenization_utils.py in encode_plus(self, text, text_pair, add_special_tokens, max_length, stride, truncation_strategy, pad_to_max_length, is_pretokenized, return_tensors, return_token_type_ids, return_attention_mask, return_overflowing_tokens, return_special_tokens_mask, return_offsets_mapping, **kwargs)
1563 )
1564
-> 1565 first_ids = get_input_ids(text)
1566 second_ids = get_input_ids(text_pair) if text_pair is not None else None
1567

/usr/local/lib/python3.6/dist-packages/transformers/tokenization_utils.py in get_input_ids(text)
1535 def get_input_ids(text):
1536 if isinstance(text, str):
-> 1537 tokens = self.tokenize(text, add_special_tokens=add_special_tokens, **kwargs)
1538 return self.convert_tokens_to_ids(tokens)
1539 elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):

/usr/local/lib/python3.6/dist-packages/transformers/tokenization_utils.py in tokenize(self, text, **kwargs)
1259 """
1260 all_special_tokens = self.all_special_tokens
-> 1261 text = self.prepare_for_tokenization(text, **kwargs)
1262
1263 # TODO: should this be in the base class?

/usr/local/lib/python3.6/dist-packages/transformers/tokenization_roberta.py in prepare_for_tokenization(self, text, add_special_tokens, **kwargs)
237 else:
238 add_prefix_space = add_special_tokens
--> 239 if add_prefix_space and not text[0].isspace():
240 text = " " + text
241 return text

IndexError: string index out of range

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions