
Attention layer model fails with 'Could not find valid device for node.' #70

Open

mg64ve opened this issue Feb 12, 2020 · 0 comments
mg64ve commented Feb 12, 2020

Hello, I am trying to reproduce the example from https://blogs.rstudio.com/tensorflow/posts/2018-07-30-attention-layer/.

The following is my code:


reticulate::use_condaenv("tf-gpu", required = TRUE)

library(keras)
use_implementation("tensorflow")

library(tensorflow)
tfe_enable_eager_execution()

library(tfdatasets)

library(purrr)
library(stringr)
library(reshape2)
library(viridis)
library(ggplot2)
library(tibble)

filepath <- file.path("data", "nld.txt")

lines <- readLines(filepath, n = 10000)
sentences <- str_split(lines, "\t")
str(sentences)

space_before_punct <- function(sentence) {
  str_replace_all(sentence, "([?.!])", " \\1")
}

replace_special_chars <- function(sentence) {
  str_replace_all(sentence, "[^a-zA-Z?.!,¿]+", " ")
}

add_tokens <- function(sentence) {
  paste0("<start> ", sentence, " <stop>")
}

add_tokens <- Vectorize(add_tokens, USE.NAMES = FALSE)

preprocess_sentence <- compose(add_tokens,
                               str_squish,
                               replace_special_chars,
                               space_before_punct)

word_pairs <- map(sentences, preprocess_sentence)

create_index <- function(sentences) {
  unique_words <- sentences %>% unlist() %>% paste(collapse = " ") %>%
    str_split(pattern = " ") %>% .[[1]] %>% unique() %>% sort()
  index <- data.frame(
    word = unique_words,
    index = 1:length(unique_words),
    stringsAsFactors = FALSE
  ) %>%
    add_row(word = "<pad>",
            index = 0,
            .before = 1)
  index
}

word2index <- function(word, index_df) {
  index_df[index_df$word == word, "index"]
}
index2word <- function(index, index_df) {
  index_df[index_df$index == index, "word"]
}

src_index <- create_index(map(word_pairs, ~ .[[1]]))
target_index <- create_index(map(word_pairs, ~ .[[2]]))

sentence2digits <- function(sentence, index_df) {
  map((sentence %>% str_split(pattern = " "))[[1]], function(word)
    word2index(word, index_df))
}

sentlist2diglist <- function(sentence_list, index_df) {
  map(sentence_list, function(sentence)
    sentence2digits(sentence, index_df))
}

src_diglist <- sentlist2diglist(map(word_pairs, ~ .[[1]]), src_index)
src_maxlen <- map(src_diglist, length) %>% unlist() %>% max()
src_matrix <- pad_sequences(src_diglist, maxlen = src_maxlen,  padding = "post")

target_diglist <- sentlist2diglist(map(word_pairs, ~ .[[2]]), target_index)
target_maxlen <- map(target_diglist, length) %>% unlist() %>% max()
target_matrix <- pad_sequences(target_diglist, maxlen = target_maxlen, padding = "post")

train_indices <-
  sample(nrow(src_matrix), size = nrow(src_matrix) * 0.8)

validation_indices <- setdiff(1:nrow(src_matrix), train_indices)

x_train <- src_matrix[train_indices, ]
y_train <- target_matrix[train_indices, ]

str(x_train)
str(y_train)

x_valid <- src_matrix[validation_indices, ]
y_valid <- target_matrix[validation_indices, ]

str(x_valid)
str(y_valid)

buffer_size <- nrow(x_train)

# just for convenience, so we may get a glimpse at translation 
# performance during training
train_sentences <- sentences[train_indices]
validation_sentences <- sentences[validation_indices]
validation_sample <- sample(validation_sentences, 5)

str(train_sentences)

batch_size <- 32
embedding_dim <- 64
gru_units <- 256

src_vocab_size <- nrow(src_index)
target_vocab_size <- nrow(target_index)

train_dataset <- 
  tensor_slices_dataset(keras_array(list(x_train, y_train)))  %>%
  dataset_shuffle(buffer_size = buffer_size) %>%
  dataset_batch(batch_size, drop_remainder = TRUE)

str(train_dataset)

validation_dataset <-
  tensor_slices_dataset(keras_array(list(x_valid, y_valid))) %>%
  dataset_shuffle(buffer_size = buffer_size) %>%
  dataset_batch(batch_size, drop_remainder = TRUE)

str(validation_dataset)


attention_encoder <-
  function(gru_units,
           embedding_dim,
           src_vocab_size,
           name = NULL) {
    
    keras_model_custom(name = name, function(self) {
      
      self$embedding <-
        layer_embedding(
          input_dim = src_vocab_size,
          output_dim = embedding_dim
        )
      
      self$gru <-
        layer_gru(
          units = gru_units,
          return_sequences = TRUE,
          return_state = TRUE
        )
      
      function(inputs, mask = NULL) {
        
        x <- inputs[[1]]
        hidden <- inputs[[2]]
        
        x <- self$embedding(x)
        c(output, state) %<-% self$gru(x, initial_state = hidden)
        
        list(output, state)
      }
    })
  }


attention_decoder <-
  function(object,
           gru_units,
           embedding_dim,
           target_vocab_size,
           name = NULL) {
    
    keras_model_custom(name = name, function(self) {
      
      self$gru <-
        layer_gru(
          units = gru_units,
          return_sequences = TRUE,
          return_state = TRUE
        )
      
      self$embedding <-
        layer_embedding(input_dim = target_vocab_size, 
                        output_dim = embedding_dim)
      
      gru_units <- gru_units
      self$fc <- layer_dense(units = target_vocab_size)
      self$W1 <- layer_dense(units = gru_units)
      self$W2 <- layer_dense(units = gru_units)
      self$V <- layer_dense(units = 1L)
      
      function(inputs, mask = NULL) {
        
        x <- inputs[[1]]
        hidden <- inputs[[2]]
        encoder_output <- inputs[[3]]
        
        hidden_with_time_axis <- k_expand_dims(hidden, 2)
        
        score <- self$V(k_tanh(self$W1(encoder_output) + 
                                 self$W2(hidden_with_time_axis)))
        
        attention_weights <- k_softmax(score, axis = 2)
        
        context_vector <- attention_weights * encoder_output
        context_vector <- k_sum(context_vector, axis = 2)
        
        x <- self$embedding(x)
        
        x <- k_concatenate(list(k_expand_dims(context_vector, 2), x), axis = 3)
        
        c(output, state) %<-% self$gru(x)
        
        output <- k_reshape(output, c(-1, gru_units))
        
        x <- self$fc(output)
        
        list(x, state, attention_weights)
        
      }
      
    })
  }

encoder <- attention_encoder(
  gru_units = gru_units,
  embedding_dim = embedding_dim,
  src_vocab_size = src_vocab_size
)

decoder <- attention_decoder(
  gru_units = gru_units,
  embedding_dim = embedding_dim,
  target_vocab_size = target_vocab_size
)


optimizer <- tf$compat$v1$train$AdamOptimizer()

cx_loss <- function(y_true, y_pred) {
  mask <- ifelse(y_true == 0L, 0, 1)
  loss <-
    tf$nn$sparse_softmax_cross_entropy_with_logits(labels = y_true,
                                                   logits = y_pred) * mask
  tf$reduce_mean(loss)
}


n_epochs <- 50

encoder_init_hidden <- k_zeros(c(batch_size, gru_units))

for (epoch in seq_len(n_epochs)) {
  
  total_loss <- 0
  iteration <- 0
  
  iter <- make_iterator_one_shot(train_dataset)
  
  until_out_of_range({
    
    batch <- iterator_get_next(iter)
    loss <- 0
    x <- batch[[1]]
    y <- batch[[2]]
    iteration <- iteration + 1
    
    with(tf$GradientTape() %as% tape, {
      c(enc_output, enc_hidden) %<-% encoder(list(x, encoder_init_hidden))
      
      dec_hidden <- enc_hidden
      dec_input <-
        k_expand_dims(rep(list(
          word2index("<start>", target_index)
        ), batch_size))
      
      
      for (t in seq_len(target_maxlen - 1)) {
        c(preds, dec_hidden, weights) %<-%
          decoder(list(dec_input, dec_hidden, enc_output))
        loss <- loss + cx_loss(y[, t], preds)
        
        dec_input <- k_expand_dims(y[, t])
      }
      
    })
    
    total_loss <-
      total_loss + loss / k_cast_to_floatx(dim(y)[2])
    
    paste0(
      "Batch loss (epoch/batch): ",
      epoch,
      "/",
      iter,
      ": ",
      (loss / k_cast_to_floatx(dim(y)[2])) %>% 
        as.double() %>% round(4),
      "\n"
    )
    
    variables <- c(encoder$variables, decoder$variables)
    gradients <- tape$gradient(loss, variables)
    
    optimizer$apply_gradients(
      purrr::transpose(list(gradients, variables)),
      global_step = tf$train$get_or_create_global_step()
    )
    
  })

  paste0(
    "Total loss (epoch): ",
    epoch,
    ": ",
    (total_loss / k_cast_to_floatx(buffer_size)) %>% 
      as.double() %>% round(4),
    "\n"
  )
}


This code fails with the following error:

2020-02-12 12:48:30.175011: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library cublas64_100.dll
Error: NotFoundError: Could not find valid device for node.
Node:{{node SparseSoftmaxCrossEntropyWithLogits}}
All kernels registered for op SparseSoftmaxCrossEntropyWithLogits :
  device='CPU'; T in [DT_FLOAT]; Tlabels in [DT_INT32]
  device='CPU'; T in [DT_FLOAT]; Tlabels in [DT_INT64]
  device='CPU'; T in [DT_DOUBLE]; Tlabels in [DT_INT32]
  device='CPU'; T in [DT_DOUBLE]; Tlabels in [DT_INT64]
  device='CPU'; T in [DT_HALF]; Tlabels in [DT_INT32]
  device='CPU'; T in [DT_HALF]; Tlabels in [DT_INT64]
  device='GPU'; T in [DT_FLOAT]; Tlabels in [DT_INT32]
  device='GPU'; T in [DT_FLOAT]; Tlabels in [DT_INT64]
  device='GPU'; T in [DT_HALF]; Tlabels in [DT_INT32]
  device='GPU'; T in [DT_HALF]; Tlabels in [DT_INT64]
 [Op:SparseSoftmaxCrossEntropyWithLogits]

It is not clear to me what the reason for this failure is.
Do you get the same result?
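
In case it helps with the diagnosis: the kernel listing above only registers integer label types (Tlabels in DT_INT32/DT_INT64), and my guess is that y reaches cx_loss as a float tensor (it comes from an R double matrix via keras_array / tensor_slices_dataset), so no kernel matches. Below is a variant of cx_loss that casts the labels first; this is just a sketch of what I would try, not a confirmed fix:

cx_loss <- function(y_true, y_pred) {
  # Cast labels to int32: SparseSoftmaxCrossEntropyWithLogits only registers
  # kernels with int32/int64 labels (see the listing above).
  y_true_int <- tf$cast(y_true, tf$int32)
  # Mask out <pad> positions (index 0) so they do not contribute to the loss.
  mask <- tf$cast(tf$not_equal(y_true_int, 0L), y_pred$dtype)
  loss <-
    tf$nn$sparse_softmax_cross_entropy_with_logits(labels = y_true_int,
                                                   logits = y_pred) * mask
  tf$reduce_mean(loss)
}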
