-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path02-ros-data-attrition.qmd
145 lines (123 loc) · 6.77 KB
/
02-ros-data-attrition.qmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
# Panel attrition
To assess the integrity of the panel data, we assess the attrition that characterizes it. That is, for each survey reiteration, we compute the percentage of households identified in the previous round that are still present in the subsequent round. Some adjustment must be made as households' identification numbering system between 1995 and 1996. We also have to remove 2005 survey in Marovoay from the analysis, as it was a specific tracking survey aimed at identifying individuals from households that could not be re-interviewed in previous years in one of the observatory sites [@vaillant2013]. This produces the following result:
```{r}
#| label: fig-attrition-ros
#| fig-cap: "Attrition rate of ROS panels per observatory and survey round"
library(tidyverse) # A series of packages for data manipulation
library(haven) # Required for reading STATA files (.dta)
library(labelled) # To work with labelled data from STATA
library(readxl) # Read data frames to Excel format
# Obtain all years from the directory structure
ros_data_loc <- "data/dta_format/"
years <- list.dirs(ros_data_loc, recursive = FALSE, full.names = FALSE)
# Add observatory approximate location
locations <- tibble(
code = c(1, 2, 3, 4, 12, 13, 15, 16, 21, 22, 23, 24, 31, 25, 41, 42, 43, 51,
44, 45, 61, 17, 18, 19, 71, 52),
name = c("Antalaha", "Antsirabe", "Marovoay", "Toliara coastal", "Antsohihy",
"Tsiroanomandidy", "Farafangana", "Ambovombe",
"Alaotra", "Manjakandriana", "Toliara North",
"Fenerive East", "Bekily", "Mahanoro", "Itasy",
"Menabe-Belo", "Fianarantsoa", "Tsivory", "Morondava", "Manandriana",
"Tanandava", "Ihosy", "Ambohimahasoa", "Manakara", "Tolanaro",
"Menabe North-East"),
latitude = c(-14.8833, -19.8659, -16.1000, -23.7574, -14.8796, -18.7713,
-22.8167, -25.1667, -17.8319, -18.9167, -23.2941, -17.3500,
-24.6900, -19.9000, -19.1686, -19.6975, -21.4527, -24.4667,
-20.2833, -20.2333, -22.5711, -22.4000, -20.7145, -22.1333,
-25.0381, -20.5486),
longitude = c(50.2833, 47.0333, 46.6333, 43.6770, 47.9875, 46.0546, 47.8333,
46.0833, 48.4167, 47.8000, 43.7761, 49.4167, 45.1700, 48.8000,
46.7354, 44.5419, 47.0857, 45.4667, 44.2833, 47.3833, 45.0439,
46.1167, 47.0389, 48.0167, 46.9562, 47.1597))
# Sort locations by latitude to generate sequence numbers
locations <- locations %>%
arrange(desc(latitude)) %>%
mutate(seq_num = 1:n())
# Function to read and process each file
read_and_process <- function(year) {
file_path <- file.path(ros_data_loc, as.character(year), "res_deb.dta")
data <- read_dta(file_path) %>%
select(j0, j5) %>%
mutate(year = year)
return(data)
}
# Use map to read and process files, then combine with bind_rows
consolidated_data <- map_dfr(years, read_and_process) %>%
mutate(year = as.numeric(year))
# NB : j5 codes have been modified in 1996
# so we need to replace the ones from 1995
hh_96 <- read_dta(paste0(ros_data_loc, "1996/res_deb.dta")) %>%
select(j0, year, j5_96 = j5, j_1995, j12b) %>%
filter(j_1995 == 1) %>%
select(-j_1995) %>%
mutate(year = 1995) %>%
distinct(j12b, .keep_all = TRUE)
consolidated_data <- consolidated_data %>%
left_join(hh_96, by = c("j0", "year", "j5" = "j12b")) %>%
mutate(j5 = ifelse(year == 1995 & !is.na(j5_96), j5_96, j5)) %>%
select(j0, j5, year)
# We need also to discard the 2004 survey in Marovoay that is very particular
# cf. Vaillant 2013.
consolidated_data <- consolidated_data %>%
filter(!(j0 == 3 & year == 2005))
# Remove duplicates and create the hh_all table
hh_all <- consolidated_data %>%
distinct(j0, j5, year, .keep_all = TRUE) %>%
arrange(j0, j5)
hh_grouped <- hh_all %>%
group_by(j0, year) %>%
summarise(j5_list = list(j5), .groups = 'drop') %>%
# Count the number of j5 in j5_list
mutate(j5_count = map_int(j5_list, length)) %>%
# Create a column to identify the most recent previous year with data for the same observatory
group_by(j0) %>%
mutate(previous_year = lag(year)) %>%
ungroup()
# Self-join to create previous_year_j5_list
attrition_rates_detail <- hh_grouped %>%
left_join(hh_grouped %>% select(j0, year,
previous_year_j5_list = j5_list,
j5_count_previous_year = j5_count),
by = c("j0", "previous_year" = "year")) %>%
mutate(repeated_j5 = map_int(
seq_along(j5_list),
~length(intersect(j5_list[[.]], previous_year_j5_list[[.]]))),
attrition_rate = (j5_count_previous_year - repeated_j5) /
j5_count_previous_year * 100)
# Pivot the data to have years as columns and observatory numbers as rows
attrition_rates <- attrition_rates_detail %>%
select(j0, year, attrition_rate) %>%
left_join(locations %>%
mutate(observatory_with_num = paste0(seq_num, ". ", name),
observatory_with_num = fct_reorder(observatory_with_num,
latitude)) %>%
select(code, name, observatory_with_num),
by = c("j0" = "code")) %>%
drop_na(name)
average_wo_outliers <- attrition_rates %>%
filter(attrition_rate < 75) %>%
summarise(mean = mean(attrition_rate))
average_wo_outliers <- round(average_wo_outliers$mean, 1)
compound_avg_10y <- round((1-(1-(average_wo_outliers/100))^10)*100)
attrition_plot <- ggplot(attrition_rates,
aes(x = year, y = observatory_with_num,
fill = attrition_rate)) +
geom_tile() + # Create the heatmap tiles
geom_text(aes(label = ifelse(is.na(attrition_rate), "",
round(attrition_rate))),
color = "black", size = 2.5) +
scale_fill_gradient2(low = "darkgreen", mid = "yellow", high = "red",
midpoint = 30, na.value = "grey", name = "Attrition Rate (%)") +
labs(x = "Year",
y = "Observatory (j0)") +
theme_minimal() +
labs(y = NULL, x = NULL) +
theme(axis.text.y = element_text(size = 8))
ggsave("output/figure_3.pdf", plot = attrition_plot,
width = 7, height = 4, dpi = 300)
ggsave("output/figure_3.png", plot = attrition_plot,
width = 7, height = 4, dpi = 300)
attrition_plot
```
Annual attrition rates superior to 75% for a specific observatory are likely to be induced by new reshuffles of the household identification codes and we hope to be able to solve such issue later on. If we discard these outliers (attrition rates over 75%), we have an average attrition year of `{r} print(average_wo_outliers)`%, which is very high, leading to a compound attrition rate of `{r} print(compound_avg_10y)`% over 10 years. Attrition on ROS data has been further studied in focused publications [@gubert2008; @vaillant2013].