from torchtext.vocab import Vectors, GloVe

def load_dataset(test_sen=None):

    """
    tokenizer : Breaks sentences into a list of words. If sequential=False, no tokenization is applied.
    Field : A class that stores information about the way of preprocessing.
    fix_length : An important property of TorchText is that we can let the input be of variable length, and TorchText will
                 dynamically pad each sequence to the longest sequence in that "batch". But here we are using fix_length, which
                 will pad each sequence to a fixed length of 200.

    build_vocab : It first builds a vocabulary, i.e. a dictionary mapping every unique word present in the train_data to an
                  index, and then uses the GloVe word embeddings to map each index to the corresponding word embedding.

    vocab.vectors : Returns a torch tensor of shape (vocab_size x embedding_dim) containing the pre-trained word embeddings.
    BucketIterator : Defines an iterator that batches examples of similar lengths together to minimize the amount of padding needed.
    """

    tokenize = lambda x: x.split()
    TEXT = data.Field(sequential=True, tokenize=tokenize, lower=True, include_lengths=True, batch_first=True, fix_length=200)
    # Note: tensor_type is the keyword used by older torchtext releases; newer (legacy) releases expect dtype=torch.float instead.
    LABEL = data.LabelField(tensor_type=torch.FloatTensor)
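
    # --- Illustrative sketch (not part of the original file) -------------------
    # The snippet above stops after defining TEXT and LABEL, but the docstring
    # also describes build_vocab, vocab.vectors and BucketIterator. The lines
    # below are a minimal sketch of how those calls typically fit together in
    # torchtext's legacy API. The IMDB dataset, the GloVe "6B"/300-dim vectors,
    # the batch size and the return signature are assumptions made purely for
    # illustration; they are not taken from the visible diff. The sketch also
    # assumes `import torch` and `from torchtext import data, datasets` appear
    # earlier in the file.

    train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)

    # Build the vocabulary from the training split and attach pre-trained GloVe
    # vectors; TEXT.vocab.vectors is then a (vocab_size x embedding_dim) tensor.
    TEXT.build_vocab(train_data, vectors=GloVe(name='6B', dim=300))
    LABEL.build_vocab(train_data)

    word_embeddings = TEXT.vocab.vectors
    vocab_size = len(TEXT.vocab)

    # BucketIterator batches examples of similar length together so that each
    # batch needs as little padding as possible.
    train_iter, test_iter = data.BucketIterator.splits(
        (train_data, test_data),
        batch_size=32,
        sort_key=lambda x: len(x.text),
        repeat=False)

    # Return signature is illustrative only.
    return TEXT, vocab_size, word_embeddings, train_iter, test_iter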