Skip to content
This repository has been archived by the owner on Nov 16, 2023. It is now read-only.

[ASK] Error while running extractive_summarization_cnndm_transformer.ipynb #624

Open

Description

When I run the code below:
summarizer.fit( ext_sum_train, num_gpus=NUM_GPUS, batch_size=BATCH_SIZE, gradient_accumulation_steps=2, max_steps=MAX_STEPS, learning_rate=LEARNING_RATE, warmup_steps=WARMUP_STEPS, verbose=True, report_every=REPORT_EVERY, clip_grad_norm=False, use_preprocessed_data=USE_PREPROCSSED_DATA )

It gives me an error like this:

Iteration:   0%|          | 0/199 [00:00<?, ?it/s]

---------------------------------------------------------------------------

TypeError                                 Traceback (most recent call last)

<ipython-input-40-343cf59f0aa4> in <module>()
     12             report_every=REPORT_EVERY,
     13             clip_grad_norm=False,
---> 14             use_preprocessed_data=USE_PREPROCSSED_DATA
     15         )
     16 

11 frames

/content/drive/My Drive/nlp-recipes/utils_nlp/models/transformers/extractive_summarization.py in fit(self, train_dataset, num_gpus, gpu_ids, batch_size, local_rank, max_steps, warmup_steps, learning_rate, optimization_method, max_grad_norm, beta1, beta2, decay_method, gradient_accumulation_steps, report_every, verbose, seed, save_every, world_size, rank, use_preprocessed_data, **kwargs)
    775             report_every=report_every,
    776             clip_grad_norm=False,
--> 777             save_every=save_every,
    778         )
    779 

/content/drive/My Drive/nlp-recipes/utils_nlp/models/transformers/common.py in fine_tune(self, train_dataloader, get_inputs, device, num_gpus, max_steps, global_step, max_grad_norm, gradient_accumulation_steps, optimizer, scheduler, fp16, amp, local_rank, verbose, seed, report_every, save_every, clip_grad_norm, validation_function)
    191                 disable=local_rank not in [-1, 0] or not verbose,
    192             )
--> 193             for step, batch in enumerate(epoch_iterator):
    194                 inputs = get_inputs(batch, device, self.model_name)
    195                 outputs = self.model(**inputs)

/usr/local/lib/python3.7/dist-packages/tqdm/std.py in __iter__(self)
   1102                 fp_write=getattr(self.fp, 'write', sys.stderr.write))
   1103 
-> 1104         for obj in iterable:
   1105             yield obj
   1106             # Update and possibly print the progressbar.

/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py in __next__(self)
    519             if self._sampler_iter is None:
    520                 self._reset()
--> 521             data = self._next_data()
    522             self._num_yielded += 1
    523             if self._dataset_kind == _DatasetKind.Iterable and \

/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py in _next_data(self)
    559     def _next_data(self):
    560         index = self._next_index()  # may raise StopIteration
--> 561         data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
    562         if self._pin_memory:
    563             data = _utils.pin_memory.pin_memory(data)

/usr/local/lib/python3.7/dist-packages/torch/utils/data/_utils/fetch.py in fetch(self, possibly_batched_index)
     45         else:
     46             data = self.dataset[possibly_batched_index]
---> 47         return self.collate_fn(data)

/content/drive/My Drive/nlp-recipes/utils_nlp/models/transformers/extractive_summarization.py in collate_fn(data)
    744             def collate_fn(data):
    745                 return self.processor.collate(
--> 746                     data, block_size=self.max_pos_length, device=device
    747                 )
    748 

/content/drive/My Drive/nlp-recipes/utils_nlp/models/transformers/extractive_summarization.py in collate(self, data, block_size, device, train_mode)
    470         else:
    471             if train_mode is True and "tgt" in data[0] and "oracle_ids" in data[0]:
--> 472                 encoded_text = [self.encode_single(d, block_size) for d in data]
    473                 batch = Batch(list(filter(None, encoded_text)), True)
    474             else:

/content/drive/My Drive/nlp-recipes/utils_nlp/models/transformers/extractive_summarization.py in <listcomp>(.0)
    470         else:
    471             if train_mode is True and "tgt" in data[0] and "oracle_ids" in data[0]:
--> 472                 encoded_text = [self.encode_single(d, block_size) for d in data]
    473                 batch = Batch(list(filter(None, encoded_text)), True)
    474             else:

/content/drive/My Drive/nlp-recipes/utils_nlp/models/transformers/extractive_summarization.py in encode_single(self, d, block_size, train_mode)
    539             + ["[SEP]"]
    540         )
--> 541         src_subtoken_idxs = self.tokenizer.convert_tokens_to_ids(src_subtokens)
    542         _segs = [-1] + [i for i, t in enumerate(src_subtoken_idxs) if t == self.sep_vid]
    543         segs = [_segs[i] - _segs[i - 1] for i in range(1, len(_segs))]

/usr/local/lib/python3.7/dist-packages/transformers/tokenization_utils_fast.py in convert_tokens_to_ids(self, tokens)

/usr/local/lib/python3.7/dist-packages/transformers/tokenization_utils_fast.py in _convert_token_to_id_with_added_voc(self, token)

TypeError: Can't convert 0 to PyString

P.S. I am trying to run this code using Google Colab's free GPU.

Any help is welcome :)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Metadata

Assignees

No one assigned

    Labels

    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions