
Anyone encounter NaN loss? #11

@kuanzi

Training works fine for small-scale models such as an MLP or LeNet-5. However, for VGG16/ResNet18 the loss always becomes NaN.
The model structure configuration is below:

```python
import numpy as np
import torch.nn as nn

import BayesianLayer  # module providing LinearGroupNJ / Conv2dGroupNJ

# VGG feature-extractor configurations; 'M' denotes a 2x2 max-pooling layer.
cfg = {
    'VGG11': [64, 'M', 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
    'VGG13': [64, 64, 'M', 128, 128, 'M', 256, 256, 'M', 512, 512, 'M', 512, 512, 'M'],
    'VGG16': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'M', 512, 512, 512, 'M', 512, 512, 512, 'M'],
    'VGG19': [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 256, 'M', 512, 512, 512, 512, 'M', 512, 512, 512, 512, 'M'],
}


class VGG_CIFAR10_BAY(nn.Module):
    kl_list = []

    def __init__(self, vgg_name):
        super(VGG_CIFAR10_BAY, self).__init__()
        self.features = self._make_layers(cfg[vgg_name])
        linear_index = BayesianLayer.LinearGroupNJ(512, 10, clip_var=0.04, cuda=True)
        self.classifier = linear_index
        self.kl_list.append(linear_index)

    def forward(self, x):
        out = self.features(x)
        out = out.view(out.size(0), -1)
        out = self.classifier(out)
        return out

    def _make_layers(self, cfg):
        layers = []
        in_channels = 3
        for x in cfg:
            if x == 'M':
                layers += [nn.MaxPool2d(kernel_size=2, stride=2)]
            else:
                conv_index = BayesianLayer.Conv2dGroupNJ(in_channels, x, kernel_size=3, padding=1,
                                                         clip_var=0.04, cuda=True)
                layers += [conv_index,
                           nn.BatchNorm2d(x),
                           nn.ReLU(inplace=True)]
                self.kl_list.append(conv_index)
                in_channels = x
        layers += [nn.AvgPool2d(kernel_size=1, stride=1)]
        return nn.Sequential(*layers)

    def get_masks(self, thresholds):
        weight_masks = []
        mask = None
        layers = self.kl_list
        for i, (layer, threshold) in enumerate(zip(layers, thresholds)):
            # compute dropout mask
            if len(layer.weight_mu.shape) > 2:  # convolutional layer
                if mask is None:
                    mask = [True] * layer.in_channels
                else:
                    mask = np.copy(next_mask)

                log_alpha = layers[i].get_log_dropout_rates().cpu().data.numpy()
                next_mask = log_alpha <= thresholds[i]

                weight_mask = np.expand_dims(mask, axis=0) * np.expand_dims(next_mask, axis=1)
                weight_mask = weight_mask[:, :, None, None]
            else:  # fully connected layer
                if mask is None:
                    log_alpha = layer.get_log_dropout_rates().cpu().data.numpy()
                    mask = log_alpha <= threshold
                elif len(weight_mask.shape) > 2:
                    temp = next_mask.repeat(layer.in_features // next_mask.shape[0])
                    log_alpha = layer.get_log_dropout_rates().cpu().data.numpy()
                    mask = log_alpha <= threshold
                    # mask = mask | temp  # upper bound on weights at the first fully connected layer
                    mask = mask & temp    # lower bound on weights at the fully connected layer
                else:
                    mask = np.copy(next_mask)

                try:
                    log_alpha = layers[i + 1].get_log_dropout_rates().cpu().data.numpy()
                    next_mask = log_alpha <= thresholds[i + 1]
                except IndexError:
                    # must be the last mask
                    next_mask = np.ones(10)

                weight_mask = np.expand_dims(mask, axis=0) * np.expand_dims(next_mask, axis=1)
            weight_masks.append(weight_mask.astype(float))
        return weight_masks

    def model_kl_div(self):
        KLD = 0
        for layer in self.kl_list:
            KLD += layer.layer_kl_div()
        return KLD
```
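
For reference, the KL term from `model_kl_div()` enters my objective roughly like this (a minimal sketch of a training step; the cross-entropy + KL/N weighting and the `train_step` helper are illustrative, not code from the repo):

```python
import torch.nn.functional as F

def train_step(model, data, target, optimizer, N):
    """One optimization step: data loss plus KL complexity cost scaled by dataset size N."""
    optimizer.zero_grad()
    output = model(data)
    loss = F.cross_entropy(output, target) + model.model_kl_div() / N
    loss.backward()
    optimizer.step()
    return loss.item()
```

If the KL term dominates early in training, the resulting large gradient steps can destabilize deeper models, so that weighting is one place worth checking.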

Is this caused by high variance? I have tried clipping the variance, but it doesn't help...
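
One way to narrow this down is to find which layer first produces a non-finite activation. A minimal sketch assuming a standard PyTorch setup (`add_nan_hooks` is an illustrative helper, not part of the repo):

```python
import torch

def add_nan_hooks(model):
    """Register forward hooks that report the first module emitting a non-finite output."""
    def make_hook(name):
        def hook(module, inputs, output):
            out = output[0] if isinstance(output, tuple) else output
            if torch.is_tensor(out) and not torch.isfinite(out).all():
                print(f"non-finite output in layer: {name}")
        return hook
    for name, module in model.named_modules():
        module.register_forward_hook(make_hook(name))

# Have autograd report the op that produced a NaN/Inf gradient during backward.
torch.autograd.set_detect_anomaly(True)
```

Running the forward pass with these hooks in place, plus `set_detect_anomaly(True)` for the backward pass, usually points at the first offending layer, which helps distinguish an exploding-variance problem from, say, a log of a non-positive value inside the KL term.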
