Using pretrained models in step_tokenize_sentencepiece

I'm trying to get BPE tokenization working in `step_tokenize_sentencepiece`.

Could I use an already trained model? Here are some examples trying different approaches:

```
library(tidymodels)
library(textrecipes)
library(sentencepiece)

# Toy training set: one sentiment label per text.
# Both columns must have the same length (four rows here); a surplus label
# makes data.frame() fail with "arguments imply differing number of rows".
dataf <- data.frame(
  text = c("positive sentiment", "super neg", "bad outcome", "good results"),
  label_col = c("pos", "neg", "neg", "pos")
)

# Held-out rows used to compare the predictions of each approach.
test <- data.frame(
  text = c("negative results", "neg"),
  label_col = c("neg", "neg")
)

# Baseline: sentencepiece tokenization with default options, then tf-idf.
rec0 <- recipe(label_col ~ text, data = dataf) |>
  step_tokenize_sentencepiece(text, vocabulary_size = 25) |>
  step_tfidf(text)

# Observed: the text is split into single characters, not BPE subwords.
print(juice(prep(rec0)))

# Observed: .pred_neg is 0.853 for the first test row.
augment(
  fit(workflow(preprocessor = rec0, spec = logistic_reg()), data = dataf),
  new_data = test,
  type = c("prob", "class")
)

# Requesting BPE through `options` does not work in
# step_tokenize_sentencepiece. The attempt is wrapped in tryCatch() so the
# error is reported and the rest of the script still runs.
tryCatch(
  {
    rec1 <- dataf |>
      recipe(label_col ~ text) |>
      step_tokenize_sentencepiece(
        text,
        vocabulary_size = 25,
        options = list(type = "bpe")
      ) |>
      step_tfidf(text)

    rec1 |>
      prep() |>
      juice() |>
      print()
  },
  # Report the error text. Passing the condition object itself to message()
  # re-signals it and prints it without a trailing newline.
  error = \(e) message(conditionMessage(e))
)

# The dedicated BPE step works: step_tokenize_bpe followed by tf-idf.
rec2 <- recipe(label_col ~ text, data = dataf) |>
  step_tokenize_bpe(text) |>
  step_tfidf(text)

print(juice(prep(rec2)))

# Observed: .pred_neg is 1.00 for the first test row.
augment(
  fit(workflow(preprocessor = rec2, spec = logistic_reg()), data = dataf),
  new_data = test,
  type = c("prob", "class")
)


# Tokenize outside the recipe with a previously trained sentencepiece model,
# then apply tf-idf to the resulting tokens.
all <- bind_rows(
  dataf |> mutate(data = "train"),
  test |> mutate(data = "test")
)

# mode = "wb": the model is a binary file, and download.file()'s default
# text mode can corrupt it on Windows.
download.file(
  "https://bpemb.h-its.org/en/en.wiki.bpe.vs1000.model",
  "en.wiki.bpe.vs1000.model",
  mode = "wb"
)
model <- sentencepiece_load_model("en.wiki.bpe.vs1000.model")

# Replace the raw text with a tokenlist of subwords so that step_tfidf()
# can consume the column directly.
all$text <- sentencepiece_encode(model, all$text, type = "subwords") |>
  tokenlist()

rec3 <- all |>
  filter(data == "train") |>
  recipe(label_col ~ text) |>
  step_tfidf(text)

rec3 |>
  prep() |>
  juice() |>
  print()

rec3 |>
  workflow(logistic_reg()) |>
  fit(all |> filter(data == "train")) |>
  augment(all |> filter(data == "test"), type = c("prob", "class"))



# Same pretrained sentencepiece model as above, but using word embeddings
# instead of step_tfidf.
all <- bind_rows(
  dataf |> mutate(data = "train"),
  test |> mutate(data = "test")
)

# mode = "wb": the model is a binary file, and download.file()'s default
# text mode can corrupt it on Windows.
download.file(
  "https://bpemb.h-its.org/en/en.wiki.bpe.vs1000.model",
  "en.wiki.bpe.vs1000.model",
  mode = "wb"
)
model <- sentencepiece_load_model("en.wiki.bpe.vs1000.model")

# Encode once, before `all$text` is overwritten, and reuse the result for
# both the embedding table and the token column.
subwords <- sentencepiece_encode(model, all$text, type = "subwords")

# Lookup table passed to step_word_embeddings(): one row per distinct
# subword, with the model's token id as the only numeric column.
embeddings <- tibble(
  tokens = subwords |> unlist() |> unique(),
  ids = sentencepiece_encode(model, all$text, type = "ids") |>
    unlist() |>
    unique()
)
all$text <- tokenlist(subwords)

# Aggregate the per-token values three ways (sum, mean, max). The token
# column is kept until the last aggregation and then removed.
rec4 <- all |>
  filter(data == "train") |>
  recipe(label_col ~ text) |>
  step_word_embeddings(
    text,
    embeddings = embeddings,
    aggregation = "sum",
    keep_original_cols = TRUE,
    prefix = "sum"
  ) |>
  step_word_embeddings(
    text,
    embeddings = embeddings,
    aggregation = "mean",
    keep_original_cols = TRUE,
    prefix = "mean"
  ) |>
  step_word_embeddings(
    text,
    embeddings = embeddings,
    aggregation = "max",
    keep_original_cols = TRUE,
    prefix = "max"
  ) |>
  step_rm(text)

rec4 |>
  prep() |>
  juice() |>
  print()

rec4 |>
  workflow(logistic_reg()) |>
  fit(all |> filter(data == "train")) |>
  augment(all |> filter(data == "test"), type = c("prob", "class"))
```

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Using pretrained models in step_tokenize_sentencepiece #278

Metadata

Assignees

Labels

Type

Fields

Projects

Milestone

Relationships

Development

Using pretrained models in step_tokenize_sentencepiece #278

Description

Metadata

Metadata

Assignees

Labels

Type

Fields

Projects

Milestone

Relationships

Development

Issue actions