-
Notifications
You must be signed in to change notification settings - Fork 13
Expand file tree
/
Copy pathweek_5.R
More file actions
120 lines (84 loc) · 4.72 KB
/
week_5.R
File metadata and controls
120 lines (84 loc) · 4.72 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# Annie and Jhen's Tidy Tuesday
# Week 5
# September 8, 2020
#### What is tidy data? ####
# Load packages.
library(tidyverse)
library(ggpmisc)
# Data formatting is very important for using the tidyverse - functions within this library were designed specifically with this format of data in mind.
# Tidy data:
# (1) Each variable in its own column.
# (2) Each observation in its own row.
# (3) Each value in its own cell.
# An example of a pre-tidied dataset is "iris" in base R.
View(iris)
# An example of a not-quite tidied dataset is "mtcars" in base R.
mtcars_og <- mtcars
View(mtcars_og) # Notice rownames are car models.
mtcars_tidy <- mtcars_og %>% # Using the original dataset.
mutate(carmodel = rownames(mtcars_og)) %>% # Creating a new column with rownames from original dataset.
select(carmodel, mpg, wt, cyl) %>% # Select only the named columns.
pivot_longer(cols = c("mpg", "wt", "cyl"), names_to = "attribute", values_to = "value") # Pivoting only allows you to pivot similar data formats - numeric in this case. The "carmodel" column is characters, so we will keep that as is.
mtcars_wider <- mtcars_tidy %>% # Using the tidied dataset from above.
pivot_wider(names_from = "attribute", values_from = "value") # Pivoting back into the format we had previously, going from mtcars_tidy back to a format similar to mtcars_og.
# When you're dealing with issues relating to unique observations in each cell, the unite() and separate() functions can also be very helpful.
# Now, let's use the dataset that was sent out to perform some more tidying.
growth_data_og <- read_csv("growth_rates.csv") # Load in this week's dataset.
# Extracted dataset from a manuscript about duckweed growth in different environmental conditions.
# This is a pre-tidied dataset.
# Additional pivoting practice.
growth_longer <- growth_data_og %>% # Using the new dataset.
pivot_longer(cols = c("flow.cfs", "depth.cm", "water.temp.C", "turbidity.NTU", "DO.mg.L", "pH", "sp.cond.uS.cm"),
names_to = "analyte_name", values_to = "value") # Pivot numeric values into the new "value" column and name its corresponding column "analyte_name" and put the column names in as values instead.
# Bottom line: Data types matter.
#### Plot away!! ####
View(growth_data_og) # Let's again examine the dataset loaded in for today's tutorial.
fig1 <- ggplot(growth_data_og, aes(x = pH, y = growth.rate.per.day)) +
labs(y = "Growth rate per day") +
geom_point() # straightforward basic plot.
fig1
fig2 <- ggplot(growth_data_og, aes(x = pH, y = growth.rate.per.day, color = Site)) +
labs(y = "Growth rate per day") +
geom_point() # straightforward basic plot colored by Site.
fig2
# Running a linear model to examine the relationship in the data.
modelpH <- lm(growth.rate.per.day ~ pH, data = growth_data_og) # Create a linear model and assign it the name "modelpH". The basic structure is y ~ x in the lm() function.
modelpH # Display the model output.
fig3 <- ggplot(growth_data_og, aes(x = pH, y = growth.rate.per.day, color = Site)) +
labs(y = "Growth rate per day") +
geom_point() +
geom_smooth(method = lm, se = FALSE) # plot with linear model added. Remove 95% confidence intervals with the "se = FALSE" call.
fig3
# Now let's add the regression statistics to the plot itself using the stat_poly_eq() from the ggmisc package (loaded above).
myformula <- y ~ x # Create standard formula to use below.
fig4 <- ggplot(growth_data_og, aes(x = pH, y = growth.rate.per.day, color = Site)) +
labs(y = "Growth rate per day") +
geom_point() +
geom_smooth(method = lm, se = FALSE) +
stat_poly_eq(formula = myformula,
aes(label = paste(..eq.label.., ..rr.label.., sep = "~~~~")),
parse = TRUE) # plot with linear models and equations added.
fig4
# Now let's panel out the data by site using facets.
fig5 <- ggplot(growth_data_og, aes(x = pH, y = growth.rate.per.day)) +
labs(y = "Growth rate per day") +
geom_point() +
geom_smooth(method = lm, se = FALSE) +
stat_poly_eq(formula = myformula,
aes(label = paste(..eq.label.., ..rr.label.., sep = "~~~~")),
parse = TRUE) +
facet_wrap(~Site) # plot with linear models and equations added and facet!
fig5
# Now, let's make some additional plots.
View(growth_longer) # Examine pivoted dataset.
fig6 <- ggplot(growth_longer, aes(x = value, y = growth.rate.per.day, color = Site)) +
labs(y = "Growth rate per day",
title = "Growth rates of duckweed in response to environmental conditions") +
geom_point() +
geom_smooth(method = lm, se = FALSE) +
stat_poly_eq(formula = myformula,
aes(label = paste(..eq.label.., ..rr.label.., sep = "~~~~")),
parse = TRUE) +
facet_wrap(~analyte_name, scales = "free") # Scales here are letting each axes (x & y) be whatever range it needs to be.
fig6
# End of script.