This repository has been archived by the owner on Nov 16, 2023. It is now read-only.
[ASK] Error while running extractive_summarization_cnndm_transformer.ipynb #624
Open
Description
opened on Jul 24, 2021
When I run the code below:
summarizer.fit( ext_sum_train, num_gpus=NUM_GPUS, batch_size=BATCH_SIZE, gradient_accumulation_steps=2, max_steps=MAX_STEPS, learning_rate=LEARNING_RATE, warmup_steps=WARMUP_STEPS, verbose=True, report_every=REPORT_EVERY, clip_grad_norm=False, use_preprocessed_data=USE_PREPROCSSED_DATA )
It gives me an error like this:
Iteration: 0%| | 0/199 [00:00<?, ?it/s]
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-40-343cf59f0aa4> in <module>()
12 report_every=REPORT_EVERY,
13 clip_grad_norm=False,
---> 14 use_preprocessed_data=USE_PREPROCSSED_DATA
15 )
16
11 frames
/content/drive/My Drive/nlp-recipes/utils_nlp/models/transformers/extractive_summarization.py in fit(self, train_dataset, num_gpus, gpu_ids, batch_size, local_rank, max_steps, warmup_steps, learning_rate, optimization_method, max_grad_norm, beta1, beta2, decay_method, gradient_accumulation_steps, report_every, verbose, seed, save_every, world_size, rank, use_preprocessed_data, **kwargs)
775 report_every=report_every,
776 clip_grad_norm=False,
--> 777 save_every=save_every,
778 )
779
/content/drive/My Drive/nlp-recipes/utils_nlp/models/transformers/common.py in fine_tune(self, train_dataloader, get_inputs, device, num_gpus, max_steps, global_step, max_grad_norm, gradient_accumulation_steps, optimizer, scheduler, fp16, amp, local_rank, verbose, seed, report_every, save_every, clip_grad_norm, validation_function)
191 disable=local_rank not in [-1, 0] or not verbose,
192 )
--> 193 for step, batch in enumerate(epoch_iterator):
194 inputs = get_inputs(batch, device, self.model_name)
195 outputs = self.model(**inputs)
/usr/local/lib/python3.7/dist-packages/tqdm/std.py in __iter__(self)
1102 fp_write=getattr(self.fp, 'write', sys.stderr.write))
1103
-> 1104 for obj in iterable:
1105 yield obj
1106 # Update and possibly print the progressbar.
/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py in __next__(self)
519 if self._sampler_iter is None:
520 self._reset()
--> 521 data = self._next_data()
522 self._num_yielded += 1
523 if self._dataset_kind == _DatasetKind.Iterable and \
/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py in _next_data(self)
559 def _next_data(self):
560 index = self._next_index() # may raise StopIteration
--> 561 data = self._dataset_fetcher.fetch(index) # may raise StopIteration
562 if self._pin_memory:
563 data = _utils.pin_memory.pin_memory(data)
/usr/local/lib/python3.7/dist-packages/torch/utils/data/_utils/fetch.py in fetch(self, possibly_batched_index)
45 else:
46 data = self.dataset[possibly_batched_index]
---> 47 return self.collate_fn(data)
/content/drive/My Drive/nlp-recipes/utils_nlp/models/transformers/extractive_summarization.py in collate_fn(data)
744 def collate_fn(data):
745 return self.processor.collate(
--> 746 data, block_size=self.max_pos_length, device=device
747 )
748
/content/drive/My Drive/nlp-recipes/utils_nlp/models/transformers/extractive_summarization.py in collate(self, data, block_size, device, train_mode)
470 else:
471 if train_mode is True and "tgt" in data[0] and "oracle_ids" in data[0]:
--> 472 encoded_text = [self.encode_single(d, block_size) for d in data]
473 batch = Batch(list(filter(None, encoded_text)), True)
474 else:
/content/drive/My Drive/nlp-recipes/utils_nlp/models/transformers/extractive_summarization.py in <listcomp>(.0)
470 else:
471 if train_mode is True and "tgt" in data[0] and "oracle_ids" in data[0]:
--> 472 encoded_text = [self.encode_single(d, block_size) for d in data]
473 batch = Batch(list(filter(None, encoded_text)), True)
474 else:
/content/drive/My Drive/nlp-recipes/utils_nlp/models/transformers/extractive_summarization.py in encode_single(self, d, block_size, train_mode)
539 + ["[SEP]"]
540 )
--> 541 src_subtoken_idxs = self.tokenizer.convert_tokens_to_ids(src_subtokens)
542 _segs = [-1] + [i for i, t in enumerate(src_subtoken_idxs) if t == self.sep_vid]
543 segs = [_segs[i] - _segs[i - 1] for i in range(1, len(_segs))]
/usr/local/lib/python3.7/dist-packages/transformers/tokenization_utils_fast.py in convert_tokens_to_ids(self, tokens)
/usr/local/lib/python3.7/dist-packages/transformers/tokenization_utils_fast.py in _convert_token_to_id_with_added_voc(self, token)
TypeError: Can't convert 0 to PyString
P.S. I am trying to run this code using the Google Colab free GPU.
Any help is welcome :)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Metadata
Assignees
Labels
No labels