@@ -652,13 +652,22 @@ Let's add a column that indicates how many types a pokemon has.
```{r mutate-add-variables}
poke <- read_csv("https://github.com/srvanderplas/datasets/raw/main/clean/pokemon_gen_1-9.csv")

# Split the combined `type` column ("type_1,type_2") into two variables.
# Don't worry about the string processing (gsub) just now;
# focus on how the new variables are defined.
poke$type_1 <- gsub(",.*$", "", poke$type) # Drop the comma and everything after it
poke$type_2 <- gsub("^.*,", "", poke$type) # Drop everything up to the last comma
# When `type` has no comma, neither pattern matches, so type_1 and type_2
# both equal `type`. In that case there is no second type.
poke$type_2[poke$type_1 == poke$type_2] <- NA

poke$no_types <- 1 # set a default value
poke$no_types[grepl(",", poke$type)] <- 2 # set the value if there is a comma in type

# This is a bit faster
poke$no_types <- ifelse(grepl(",", poke$type), 2, 1)

# Sanity check
# This checks number of types vs. value of type_2
# If type 2 is NA, then number of types should be 1
t(table(poke$type_2, poke$no_types, useNA = "ifany"))
```
664673
@@ -671,6 +680,13 @@ I could reduce that to 2x with the `ifelse` function, but it's still a lot of ty
poke <- read_csv("https://github.com/srvanderplas/datasets/raw/main/clean/pokemon_gen_1-9.csv")

poke <- poke %>%
  # This splits type into type_1, type_2: two separate variables.
  # Don't worry about the string processing (str_extract) just now.
  # Focus on how variables are defined:
  # we use a function on the type column
  # within the mutate statement.
  mutate(
    # Everything before the first comma; the whole string when there is no
    # comma, so single-type pokemon still get a type_1.
    type_1 = str_extract(type, "^[^,]+"),
    # The text after the comma; NA when there is no comma (no second type).
    type_2 = str_extract(type, "(.*),(.*)", group = 2)
  ) %>%
  mutate(no_types = if_else(is.na(type_2), 1, 2))

# Sanity check: no_types should be 1 exactly when type_2 is NA
select(poke, type_2, no_types) %>% table(useNA = "ifany") %>% t()
@@ -686,8 +702,17 @@ In python, this type of variable operation (replacing one value with another) ca
import pandas as pd
poke = pd.read_csv("https://github.com/srvanderplas/datasets/raw/main/clean/pokemon_gen_1-9.csv")

# This splits type into two columns, type_1 and type_2, at the first ","
# The vectorized .str.split leaves type_2 missing when there is no comma
poke[["type_1", "type_2"]] = poke["type"].str.split(",", n=1, expand=True)

# This defines number of types
poke["no_types"] = 1  # default value
poke.loc[poke["type_2"].notna(), "no_types"] = 2  # change those with a defined type 2

# Sanity check: count rows for each (no_types, type_2) combination
poke.groupby(["no_types", "type_2"], dropna=False).size()
# When type_2 is NaN, no_types is 1
# When type_2 is defined, no_types is 2
```
692717
Another function that may be useful is the `assign` function, which can be used to create new variables if you don't want to use the `["new_col"]` notation.
0 commit comments