Skip to content
This repository has been archived by the owner on Nov 16, 2023. It is now read-only.

[ASK] Error while running extractive_summarization_cnndm_transformer.ipynb #624

Open

Description

When I run the code below:
summarizer.fit( ext_sum_train, num_gpus=NUM_GPUS, batch_size=BATCH_SIZE, gradient_accumulation_steps=2, max_steps=MAX_STEPS, learning_rate=LEARNING_RATE, warmup_steps=WARMUP_STEPS, verbose=True, report_every=REPORT_EVERY, clip_grad_norm=False, use_preprocessed_data=USE_PREPROCSSED_DATA )

It gives me an error like this:

Iteration:   0%|          | 0/199 [00:00<?, ?it/s]

---------------------------------------------------------------------------

TypeError                                 Traceback (most recent call last)

<ipython-input-40-343cf59f0aa4> in <module>()
     12             report_every=REPORT_EVERY,
     13             clip_grad_norm=False,
---> 14             use_preprocessed_data=USE_PREPROCSSED_DATA
     15         )
     16 

11 frames

/content/drive/My Drive/nlp-recipes/utils_nlp/models/transformers/extractive_summarization.py in fit(self, train_dataset, num_gpus, gpu_ids, batch_size, local_rank, max_steps, warmup_steps, learning_rate, optimization_method, max_grad_norm, beta1, beta2, decay_method, gradient_accumulation_steps, report_every, verbose, seed, save_every, world_size, rank, use_preprocessed_data, **kwargs)
    775             report_every=report_every,
    776             clip_grad_norm=False,
--> 777             save_every=save_every,
    778         )
    779 

/content/drive/My Drive/nlp-recipes/utils_nlp/models/transformers/common.py in fine_tune(self, train_dataloader, get_inputs, device, num_gpus, max_steps, global_step, max_grad_norm, gradient_accumulation_steps, optimizer, scheduler, fp16, amp, local_rank, verbose, seed, report_every, save_every, clip_grad_norm, validation_function)
    191                 disable=local_rank not in [-1, 0] or not verbose,
    192             )
--> 193             for step, batch in enumerate(epoch_iterator):
    194                 inputs = get_inputs(batch, device, self.model_name)
    195                 outputs = self.model(**inputs)

/usr/local/lib/python3.7/dist-packages/tqdm/std.py in __iter__(self)
   1102                 fp_write=getattr(self.fp, 'write', sys.stderr.write))
   1103 
-> 1104         for obj in iterable:
   1105             yield obj
   1106             # Update and possibly print the progressbar.

/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py in __next__(self)
    519             if self._sampler_iter is None:
    520                 self._reset()
--> 521             data = self._next_data()
    522             self._num_yielded += 1
    523             if self._dataset_kind == _DatasetKind.Iterable and \

/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py in _next_data(self)
    559     def _next_data(self):
    560         index = self._next_index()  # may raise StopIteration
--> 561         data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
    562         if self._pin_memory:
    563             data = _utils.pin_memory.pin_memory(data)

/usr/local/lib/python3.7/dist-packages/torch/utils/data/_utils/fetch.py in fetch(self, possibly_batched_index)
     45         else:
     46             data = self.dataset[possibly_batched_index]
---> 47         return self.collate_fn(data)

/content/drive/My Drive/nlp-recipes/utils_nlp/models/transformers/extractive_summarization.py in collate_fn(data)
    744             def collate_fn(data):
    745                 return self.processor.collate(
--> 746                     data, block_size=self.max_pos_length, device=device
    747                 )
    748 

/content/drive/My Drive/nlp-recipes/utils_nlp/models/transformers/extractive_summarization.py in collate(self, data, block_size, device, train_mode)
    470         else:
    471             if train_mode is True and "tgt" in data[0] and "oracle_ids" in data[0]:
--> 472                 encoded_text = [self.encode_single(d, block_size) for d in data]
    473                 batch = Batch(list(filter(None, encoded_text)), True)
    474             else:

/content/drive/My Drive/nlp-recipes/utils_nlp/models/transformers/extractive_summarization.py in <listcomp>(.0)
    470         else:
    471             if train_mode is True and "tgt" in data[0] and "oracle_ids" in data[0]:
--> 472                 encoded_text = [self.encode_single(d, block_size) for d in data]
    473                 batch = Batch(list(filter(None, encoded_text)), True)
    474             else:

/content/drive/My Drive/nlp-recipes/utils_nlp/models/transformers/extractive_summarization.py in encode_single(self, d, block_size, train_mode)
    539             + ["[SEP]"]
    540         )
--> 541         src_subtoken_idxs = self.tokenizer.convert_tokens_to_ids(src_subtokens)
    542         _segs = [-1] + [i for i, t in enumerate(src_subtoken_idxs) if t == self.sep_vid]
    543         segs = [_segs[i] - _segs[i - 1] for i in range(1, len(_segs))]

/usr/local/lib/python3.7/dist-packages/transformers/tokenization_utils_fast.py in convert_tokens_to_ids(self, tokens)

/usr/local/lib/python3.7/dist-packages/transformers/tokenization_utils_fast.py in _convert_token_to_id_with_added_voc(self, token)

TypeError: Can't convert 0 to PyString

P.S. I am trying to run this code using Google Colab's free GPU.

Any help is welcome :)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Metadata

Assignees

No one assigned

    Labels

    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions