
Commit a38deff

[Transformer-XL/PyT] Large model support; multi-node training; inference with TorchScript
1 parent: 1def26d

30 files changed: +2777 -989 lines
Lines changed: 4 additions & 0 deletions
@@ -1,3 +1,7 @@
 **/.DS_Store
 __pycache__/
 data/
+results/
+*.out
+*.log
+*.json

PyTorch/LanguageModeling/Transformer-XL/README.md

Lines changed: 775 additions & 216 deletions
Large diffs are not rendered by default.
Lines changed: 3 additions & 0 deletions
@@ -1,2 +1,5 @@
 LM-TFM*
 internal/result*
+*.out
+*.log
+*.json

PyTorch/LanguageModeling/Transformer-XL/pytorch/Dockerfile

Lines changed: 2 additions & 1 deletion
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:19.09-py3
+ARG FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:19.11-py3
 FROM ${FROM_IMAGE_NAME}
 
 ENV LANG C.UTF-8
@@ -26,5 +26,6 @@ WORKDIR /workspace/transformer-xl/pytorch
 
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
+RUN pip install --no-cache-dir git+https://github.com/NVIDIA/dllogger.git#egg=dllogger
 
 ADD . /workspace/transformer-xl/pytorch
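
The new RUN line installs NVIDIA's dllogger package inside the container; the commit itself does not show how the training scripts call it, though the *.json entries added to the ignore files suggest JSON log output. As a purely illustrative sketch (the log file name and logged fields are assumptions, not taken from this commit), typical dllogger usage looks like:

    import dllogger
    from dllogger import JSONStreamBackend, StdOutBackend, Verbosity

    # Illustrative only: 'train_log.json' and the logged keys are assumptions.
    dllogger.init(backends=[
        JSONStreamBackend(Verbosity.VERBOSE, 'train_log.json'),  # machine-readable JSON lines
        StdOutBackend(Verbosity.DEFAULT),                        # human-readable console output
    ])
    dllogger.log(step=(1, 100), data={'train_loss': 3.21, 'lr': 0.01})
    dllogger.flush()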

PyTorch/LanguageModeling/Transformer-XL/pytorch/data_utils.py

Lines changed: 22 additions & 8 deletions
@@ -27,32 +27,41 @@
 
 
 class LMOrderedIterator(object):
-    def __init__(self, data, bsz, bptt, device='cpu', ext_len=None):
+    def __init__(self, data, bsz, bptt, device='cpu', mem_len=None, ext_len=None, warmup=True):
         """
             data -- LongTensor -- the LongTensor is strictly ordered
         """
         self.bsz = bsz
         self.bptt = bptt
         self.ext_len = ext_len if ext_len is not None else 0
+        self.mem_len = mem_len
+        self.warmup = warmup
 
         self.device = device
 
         # Work out how cleanly we can divide the dataset into bsz parts.
-        self.n_step = data.size(0) // bsz
+        n_step = data.size(0) // bsz
 
         # Trim off any extra elements that wouldn't cleanly fit (remainders).
-        data = data.narrow(0, 0, self.n_step * bsz)
+        data = data[:n_step * bsz]
 
         # Evenly divide the data across the bsz batches.
         self.data = data.view(bsz, -1).t().contiguous()
 
+        if mem_len and warmup:
+            self.warmup_batches = (mem_len + bptt - 1) // bptt
+            self.warmup_elems = self.warmup_batches * bptt
+
+            warmup_data = self.data.roll((self.warmup_elems, 1), (0, 1))[:self.warmup_elems]
+            self.data = torch.cat((warmup_data, self.data))
+
         # Partition data for DistributedDataParallel
         world_size = utils.distributed.get_world_size()
         rank = utils.distributed.get_rank()
-        self.data = self.data.chunk(world_size, dim=1)[rank].to(device)
+        self.data = self.data.chunk(world_size, dim=1)[rank]
 
         # Number of mini-batches
-        self.n_batch = (self.n_step + self.bptt - 1) // self.bptt
+        self.n_batch = (self.data.size(0) + self.bptt - 1) // self.bptt
 
     def roll(self):
         for i in range(self.data.size(1)):
@@ -70,10 +79,15 @@ def get_batch(self, i, bptt=None):
         end_idx = i + seq_len
         beg_idx = max(0, i - self.ext_len)
 
-        data = self.data[beg_idx:end_idx]
-        target = self.data[i+1:i+1+seq_len]
+        data = self.data[beg_idx:end_idx].to(self.device)
+        target = self.data[i+1:i+1+seq_len].to(self.device)
+
+        if self.mem_len and self.warmup:
+            warm = i >= self.warmup_elems
+        else:
+            warm = True
 
-        return data, target, seq_len
+        return data, target, seq_len, warm
 
     def get_fixlen_iter(self, start=0):
         for i in range(start, self.data.size(0) - 1, self.bptt):
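
To make the new mem_len/warmup logic easier to follow, here is a minimal standalone sketch (not part of the commit; the toy sizes and the arange stand-in corpus are assumptions for illustration). It reproduces the prefix construction from __init__ and the warm flag from get_batch:

    import torch

    # Toy sizes, for illustration only.
    bsz, bptt, mem_len = 2, 4, 6
    stream = torch.arange(40)                # stand-in for an ordered token stream

    # Shape the stream into bsz parallel columns, time along dim 0.
    n_step = stream.size(0) // bsz
    data = stream[:n_step * bsz].view(bsz, -1).t().contiguous()   # (n_step, bsz)

    # Prepend ceil(mem_len / bptt) extra batches so the recurrence memory is
    # already full when the "real" data starts. After the roll, the prefix of
    # each column is the tail of the previous column, i.e. the tokens that
    # immediately precede that column's data in the ordered corpus (the first
    # column wraps around to the end of the corpus).
    warmup_batches = (mem_len + bptt - 1) // bptt
    warmup_elems = warmup_batches * bptt
    warmup_data = data.roll((warmup_elems, 1), (0, 1))[:warmup_elems]
    data = torch.cat((warmup_data, data))

    # Batches that start inside the prefix report warm=False, so a consumer
    # (e.g. an evaluation loop) can exclude them from the reported loss.
    for i in range(0, data.size(0) - 1, bptt):
        warm = i >= warmup_elems
        print(i, warm)

Note also that after this change the per-rank shard produced by chunk(world_size, dim=1)[rank] stays on the host, and each slice is moved to self.device only inside get_batch, so the whole corpus no longer has to fit in GPU memory.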
