-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathindex.qmd
146 lines (120 loc) · 6.06 KB
/
index.qmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
---
output: html_document
editor_options:
chunk_output_type: console
---
# Introduction {.unnumbered}
This technical appendix is a companion to a data descriptor article, submitted to Nature Scientific Data, that focuses on data collected by the Rural Observatory System (ROS) between 1995 and 2015. It provides the source code for all figures and visualizations presented in the paper. It also offers a tutorial on how to georeference this data, which can serve as guidance for various types of analysis beyond this specific application. We use computational notebooks in Quarto format with the R programming language, combining code, results, explanations, and multimedia in an interactive way. The source code can be accessed by expanding code blocks like the following one, which produced the figure below.
```{r}
#| label: fig-hist-ros
#| fig-cap: "Coarse location of rural observatory and survey years"
# Load required libraries
library(tidyverse) # A series of packages for data manipulation
library(haven) # Required for reading STATA files (.dta)
library(labelled) # To work with labelled data from STATA
library(sf) # for spatial data handling
library(tmap) # for mapping
library(readxl) # Read data frames to Excel format
library(cowplot) # to combine plots
# Select appropriate folder as data source
data_path <- "data/dta_format/"
# Define a function to load and count surveys per observatory for a given year
load_and_count <- function(year, factorize = FALSE) {
# Define file path
file_path <- paste0(data_path, year, "/res_deb.dta")
# Load data
data <- read_dta(file_path)
# Extract label and convert to factors if option
if (factorize) {
data <- data %>%
mutate(across(everything(), as.character),
across(where(is.labelled), ~ as.character(as_factor(.))))
}
# Count surveys per observatory
count_data <- data %>%
group_by(j0) %>%
summarise(survey_count = n()) %>%
ungroup() %>%
mutate(year = year) # Add year column
return(count_data)
}
# Generate a list of years
years <- 1995:2015
# Use purrr::map_df to loop through each year and bind results
obs_count <- map_df(years, load_and_count) %>%
# Remove rows with observatory "7 " and "NA", which are errors
filter(j0 != 7 & !is.na(j0) & survey_count > 1) %>%
rename(observatory = j0)
# Read observatory names
observatory_names <- readxl::read_xlsx("references/observatory_names.xlsx") %>%
select(code, observatory_name = name)
# PAss it to wide.
obs_count <- obs_count %>%
left_join(observatory_names, by = c("observatory" = "code")) %>%
group_by(observatory_name, year) %>%
summarise(survey_count = sum(survey_count))
obs_count_wide <- obs_count %>%
pivot_wider(names_from = year, values_from = survey_count)
# Add observatory approximate location
locations <- tibble(
code = c(1, 2, 3, 4, 12, 13, 15, 16, 21, 22, 23, 24, 31, 25, 41, 42, 43, 51,
44, 45, 61, 17, 18, 19, 71, 52),
name = c("Antalaha", "Antsirabe", "Marovoay", "Toliara coastal", "Antsohihy",
"Tsiroanomandidy", "Farafangana", "Ambovombe",
"Alaotra", "Manjakandriana", "Toliara North",
"Fenerive East", "Bekily", "Mahanoro", "Itasy",
"Menabe-Belo", "Fianarantsoa", "Tsivory", "Morondava", "Manandriana",
"Tanandava", "Ihosy", "Ambohimahasoa", "Manakara", "Tolanaro",
"Menabe North-East"),
latitude = c(-14.8833, -19.8659, -16.1000, -23.7574, -14.8796, -18.7713,
-22.8167, -25.1667, -17.8319, -18.9167, -23.2941, -17.3500,
-24.6900, -19.9000, -19.1686, -19.6975, -21.4527, -24.4667,
-20.2833, -20.2333, -22.5711, -22.4000, -20.7145, -22.1333,
-25.0381, -20.5486),
longitude = c(50.2833, 47.0333, 46.6333, 43.6770, 47.9875, 46.0546, 47.8333,
46.0833, 48.4167, 47.8000, 43.7761, 49.4167, 45.1700, 48.8000,
46.7354, 44.5419, 47.0857, 45.4667, 44.2833, 47.3833, 45.0439,
46.1167, 47.0389, 48.0167, 46.9562, 47.1597))
obs_count <- left_join(obs_count, locations, by = c("observatory_name" = "name"))
madagascar <- st_read(paste0("data/Spatial_data/OCHA_BNGRC admin boundaries/",
"mdg_admbnda_adm0_BNGRC_OCHA_20181031.shp"),
quiet = TRUE)
# Sort locations by latitude to generate sequence numbers
locations <- locations %>%
arrange(desc(latitude)) %>%
mutate(seq_num = 1:n())
# Create map plot with labels
map_plot <- ggplot(data = madagascar) +
geom_sf(fill = "lightgray", colour = "dimgrey") +
geom_point(data = locations, aes(x = longitude, y = latitude, color = name),
size = 3) +
geom_text(data = locations, aes(x = longitude, y = latitude, label = seq_num),
vjust = -1, hjust = 1, size = 3, # check_overlap = TRUE,
fontface = "bold") +
theme_void() +
theme(legend.position = "none")
# Add sequence numbers to observatory names in obs_count dataframe
obs_count <- obs_count %>%
left_join(locations %>%
select(name, seq_num),
by = c("observatory_name" = "name")) %>%
mutate(observatory_with_num = paste0(seq_num, ". ", observatory_name))
# Create timeline plot using modified obs_count with observatory_with_num
timeline_plot <- ggplot(obs_count,
aes(x = year,
y = fct_reorder(observatory_with_num, latitude),
color = observatory_name)) +
geom_point(aes(size = 1), show.legend = F) +
theme_minimal() +
labs(y = NULL, x = NULL) +
theme(axis.text.y = element_text(size = 8, face = "bold"),
legend.position = "none")
# Stitch the plots together
combined_plot <- plot_grid(map_plot, timeline_plot, rel_widths = c(1.3, 2))
ggsave("output/figure_1.pdf", plot = combined_plot,
width = 10, height = 7, dpi = 300)
ggsave("output/figure_1.png", plot = combined_plot,
width = 10, height = 7, dpi = 300)
print(combined_plot)
```
@fig-hist-ros shows the coarse location of the 26 observatories composing the ROS, as well as the years in which data was collected in each one.