I'm trying to get BPE working in step_tokenize_sentencepiece.
Could I use an already trained model? Here are some examples trying different approaches:
library(tidymodels)
library(textrecipes)
library(sentencepiece)
# Toy training set: one short text and one sentiment label per row.
# Both columns must have the same length (four rows); data.frame() stops
# with "arguments imply differing number of rows" otherwise.
dataf <- data.frame(
  text = c("positive sentiment", "super neg", "bad outcome", "good results"),
  label_col = c("pos", "neg", "neg", "pos")
)

# Toy hold-out set used to inspect the predictions of each approach.
test <- data.frame(
  text = c("negative results", "neg"),
  label_col = c("neg", "neg")
)
# Approach 0: step_tokenize_sentencepiece() with a small vocabulary and no
# extra options, followed by tf-idf features.
rec0 <- recipe(dataf, formula = label_col ~ text) |>
  step_tokenize_sentencepiece(text, vocabulary_size = 25) |>
  step_tfidf(text)

# Author's observation: the text is split by character, not by "bpe".
print(juice(prep(rec0)))

# Author's observation: .pred_neg is 0.853 for the first test row.
workflow(preprocessor = rec0, spec = logistic_reg()) |>
  fit(data = dataf) |>
  augment(new_data = test, type = c("prob", "class"))
# Approach 1: ask step_tokenize_sentencepiece() for BPE explicitly through
# `options`. The author reports that this does not work, so the attempt is
# wrapped in tryCatch() to report the error without aborting the script.
tryCatch(
  {
    rec1 <- dataf |>
      recipe(label_col ~ text) |>
      step_tokenize_sentencepiece(
        text,
        vocabulary_size = 25,
        options = list(type = "bpe")
      ) |>
      step_tfidf(text)
    rec1 |>
      prep() |>
      juice() |>
      print()
  },
  # Emit only the error text. Handing the condition object itself to
  # message() re-signals it to any outer handler and prints it without a
  # trailing newline.
  error = \(e) message(conditionMessage(e))
)
# Approach 2: tokenize with step_tokenize_bpe() instead, then tf-idf.
# Author's observation: BPE splitting works with this step.
rec2 <- recipe(dataf, formula = label_col ~ text) |>
  step_tokenize_bpe(text) |>
  step_tfidf(text)

print(juice(prep(rec2)))

# Author's observation: .pred_neg is 1.00 for the first test row.
workflow(preprocessor = rec2, spec = logistic_reg()) |>
  fit(data = dataf) |>
  augment(new_data = test, type = c("prob", "class"))
# Approach 3: tokenize up front with an already trained sentencepiece model,
# then feed the ready-made tokens to step_tfidf().

# Stack train and test rows, tagged by origin, so both are encoded with the
# same model in one pass.
all <- bind_rows(
  dataf |> mutate(data = "train"),
  test |> mutate(data = "test")
)

# The model is a binary file: mode = "wb" stops download.file() from
# translating line endings on Windows, which would corrupt it.
download.file(
  "https://bpemb.h-its.org/en/en.wiki.bpe.vs1000.model",
  "en.wiki.bpe.vs1000.model",
  mode = "wb"
)
model <- sentencepiece_load_model("en.wiki.bpe.vs1000.model")

# Replace the raw text by a tokenlist of subword pieces.
all$text <- sentencepiece_encode(model, all$text, type = "subwords") |>
  tokenlist()

rec3 <- all |>
  filter(data == "train") |>
  recipe(label_col ~ text) |>
  step_tfidf(text)

rec3 |>
  prep() |>
  juice() |>
  print()

rec3 |>
  workflow(logistic_reg()) |>
  fit(all |> filter(data == "train")) |>
  augment(all |> filter(data == "test"), type = c("prob", "class"))
# Approach 4: tokenize up front with an already trained sentencepiece model,
# using word embeddings instead of step_tfidf().

# Stack train and test rows, tagged by origin, so both are encoded with the
# same model in one pass.
all <- bind_rows(
  dataf |> mutate(data = "train"),
  test |> mutate(data = "test")
)

# The model is a binary file: mode = "wb" stops download.file() from
# translating line endings on Windows, which would corrupt it.
download.file(
  "https://bpemb.h-its.org/en/en.wiki.bpe.vs1000.model",
  "en.wiki.bpe.vs1000.model",
  mode = "wb"
)
model <- sentencepiece_load_model("en.wiki.bpe.vs1000.model")

# Lookup table with one row per distinct subword and its id. Tokens and ids
# are paired position by position BEFORE de-duplicating, so each token keeps
# its own id. De-duplicating the two columns independently yields columns of
# different lengths whenever several subwords share one id.
# NOTE(review): presumably that happens for out-of-vocabulary pieces -- confirm.
embeddings <- tibble(
  tokens = sentencepiece_encode(model, all$text, type = "subwords") |>
    unlist(),
  ids = sentencepiece_encode(model, all$text, type = "ids") |>
    unlist()
) |>
  distinct(tokens, .keep_all = TRUE)

# Replace the raw text by a tokenlist of subword pieces (must happen after
# the lookup table is built, which reads the raw text).
all$text <- sentencepiece_encode(model, all$text, type = "subwords") |>
  tokenlist()

# Aggregate the per-token values three ways (sum, mean, max), keeping the
# token column between steps and dropping it at the end.
rec4 <- all |>
  filter(data == "train") |>
  recipe(label_col ~ text) |>
  step_word_embeddings(
    text,
    embeddings = embeddings,
    aggregation = "sum",
    keep_original_cols = TRUE,
    prefix = "sum"
  ) |>
  step_word_embeddings(
    text,
    embeddings = embeddings,
    aggregation = "mean",
    keep_original_cols = TRUE,
    prefix = "mean"
  ) |>
  step_word_embeddings(
    text,
    embeddings = embeddings,
    aggregation = "max",
    keep_original_cols = TRUE,
    prefix = "max"
  ) |>
  step_rm(text)

rec4 |>
  prep() |>
  juice() |>
  print()

rec4 |>
  workflow(logistic_reg()) |>
  fit(all |> filter(data == "train")) |>
  augment(all |> filter(data == "test"), type = c("prob", "class"))
I'm trying to get BPE working in step_tokenize_sentencepiece.
Could I use an already trained model? Here are some examples trying different approaches: