-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy path00_data-prep.R
127 lines (95 loc) · 3.83 KB
/
00_data-prep.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
# Crosstalk: Shiny-like without Shiny (EARL 18, London, Sep 2018)
# Blurb: https://earlconf.com/2018/london/#matt-dray
# Matt Dray
# July 2018
# This file: get and prepare data for use in a series of outputs thst show the
# progression of solutions to a policy need in my department.
# Packages ----------------------------------------------------------------
library(data.table) # used for fast file read
library(dplyr) # tidy data manipulation
library(janitor) # additional tidy functions
library(sf) # tidy spatial dataframes
library(skimr) # nice df summaries
# Get data ----------------------------------------------------------------
# Data will be stored in the workspace and not saved to this repo
# Fetch latest cut of Get Information About Schools, a published source of basic
# schools data: https://get-information-schools.service.gov.uk/
# Warning: approx 54 MB
gias <- data.table::fread( # fast read with progress bar
paste0(
"http://ea-edubase-api-prod.azurewebsites.net/edubase/edubasealldata",
stringr::str_replace_all(Sys.Date(), "-", ""),
".csv"
),
na.strings = c("", " ") # strings that should be considered NA
) # 46,992 rows, 133 columns at 31 July 2018
# Prepare data ------------------------------------------------------------
# Simplify col names, retain cols of interest, ensure all rows are complete
# These selections are arbitrary and not related to any policies
gias_cut <- gias %>%
janitor::clean_names() %>%
dplyr::filter(
establishmentstatus_name == "Open",
phaseofeducation_name %in% c("Primary", "Secondary")
) %>%
select(
# school
sch_urn = urn,
sch_name = establishmentname,
sch_type = typeofestablishment_name,
sch_type_group = establishmenttypegroup_name,
sch_phase = phaseofeducation_name,
# ofsted
ofsted_rating = ofstedrating_name,
ofsted_date = ofstedlastinsp,
# pupils
pupil_count = numberofpupils,
pupil_gender = gender_name,
pupil_percent_fsm = percentagefsm,
# geography
geo_town = town,
geo_postcode = postcode,
geo_la = la_name,
geo_easting = easting,
geo_northing = northing,
geo_urban_rural = urbanrural_name,
geo_rsc_region = rscregion_name
) %>%
mutate_at(
vars(pupil_count,pupil_percent_fsm,geo_easting,geo_northing),
as.numeric # make all these columns numeric
) %>%
mutate(
sch_urn = as.character(sch_urn),
ofsted_date = lubridate::dmy(ofsted_date),
ofsted_rating = case_when(
ofsted_rating == "Outstanding" ~ "1 Outstanding",
ofsted_rating == "Good" ~ "2 Good",
ofsted_rating == "Requires improvement" ~ "3 Requires improvement",
ofsted_rating == "Serious Weaknesses" ~ "4 Serious weakness",
ofsted_rating == "Special Measures" ~ "5 Special measures"
)
) %>%
filter(complete.cases(.)) # only schools with complete info for all columns
# Convert to sf -----------------------------------------------------------
# Convert the dataframe to a spatial dataframe with the sf package
# Create a listcol of latlongs and add spatial metadata
gias_sf <- sf::st_as_sf(
x = gias_cut,
coords = c("geo_easting", "geo_northing"), # columns with coordinates
crs = 27700 # coordinate reference system code for eastings/northings
) %>%
sf::st_transform(crs = 4326) # the coord ref system code for latlong
# Random sample -----------------------------------------------------------
# Randomly sample 100 schools, half primary, half secondary
set.seed(1337) # for reproducibility of random selection
gias_sample <- gias_sf %>%
group_by(sch_phase) %>% # half primary, half secondary
sample_n(50) %>% # 50 per phase
ungroup()
# Check the output
glimpse(gias_sample)
View(gias_sample)
skimr::skim(gias_sample)
# Save --------------------------------------------------------------------
saveRDS(gias_sample, "data/gias_sample.RDS")