-
Notifications
You must be signed in to change notification settings - Fork 0
/
prep_annon_2017_2019.R
145 lines (120 loc) · 4.14 KB
/
prep_annon_2017_2019.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
# merge datasets
# Read every CSV inside one year's anonymised MOT test-result archive.
# Result: `res`, a list holding one character vector of raw lines per file,
# with the first line of each file removed (presumably the CSV header -
# TODO confirm against the archive contents).
path <- "D:/OneDrive - University of Leeds/Data/CREDS Data/"
year <- 2019

# Extract into a fresh, uniquely named scratch folder inside the working
# directory. A unique name means leftovers from an earlier run are never read
# by mistake and the clean-up below can never delete an unrelated folder.
tmp_dir <- tempfile("tmp_", tmpdir = ".")
dir.create(tmp_dir)
unzip(paste0(path, "MOT anoymised/dft_test_result_", year, ".zip"),
      exdir = tmp_dir)

# Anchor the pattern so only names ending in ".csv" are picked up.
files <- list.files(tmp_dir, pattern = "\\.csv$", full.names = TRUE)
if (length(files) == 0) {
  unlink(tmp_dir, recursive = TRUE)
  stop("No .csv files were extracted from the ", year, " archive",
       call. = FALSE)
}

# `[-1]` drops the first line and stays correct for files with fewer than
# two lines (it yields an empty vector rather than NA entries).
res <- lapply(files, function(f) readLines(f)[-1])
unlink(tmp_dir, recursive = TRUE)
# Flatten the per-file line vectors into one vector and split every line on
# commas. Lines that produce exactly 14 fields are treated as well formed;
# everything else goes to `data_bad` for repair further down.
data <- unlist(res, recursive = FALSE)
data <- strsplit(data, ",")
lths <- lengths(data)
summary(lths)
data_good <- data[lths == 14]
data_bad <- data[lths != 14]

# format up the good data
# One row per record, 14 character columns.
data_good <- data.frame(
  matrix(unlist(data_good), ncol = 14, byrow = TRUE),
  stringsAsFactors = FALSE
)

# Column order as listed in the notes further down this file.
mot_cols <- c(
  "test_id", "vehicle_id", "test_date", "test_class_id", "test_type",
  "test_result", "test_mileage", "postcode_area", "make", "model",
  "colour", "fuel_type", "cylinder_capacity", "first_use_date"
)

# The reading step above already discards the first line of each file, so the
# first row here is normally a real record and must not be consumed as the
# header. Only when it really is a header row is it used for the names and
# removed from the data.
first_row <- as.character(data_good[1, ])
if (nrow(data_good) > 0 && identical(first_row[1], "test_id")) {
  data_good <- data_good[-1, , drop = FALSE]
  names(data_good) <- first_row
} else {
  names(data_good) <- mot_cols
}
# Repair rows that did not split into exactly 14 fields.
#
# Takes a list of character vectors (one per row) that all have 13, 15, 16 or
# 17 fields and returns a character matrix with 14 columns and row names
# 1..n, or NULL when `rows` is empty.
#   * 13 fields: an empty string is appended as field 14 (strsplit() returns
#     no trailing empty field, so this presumably means the last column was
#     blank - TODO confirm).
#   * more than 14 fields: the surplus is assumed to come from commas inside
#     field 10, so fields 10..(n - 4) are joined with a space and the final
#     four fields are kept as they are. Every fragment is kept for all
#     widths, so 15-, 16- and 17-field rows are treated alike.
fix_field_count <- function(rows) {
  if (length(rows) == 0) {
    return(NULL)
  }
  fixed <- lapply(rows, function(x) {
    n <- length(x)
    if (n == 13) {
      c(x, "")
    } else {
      c(x[1:9], paste(x[10:(n - 4)], collapse = " "), x[(n - 3):n])
    }
  })
  # Row-bind the equal-length vectors directly into a matrix.
  fixed <- do.call(rbind, fixed)
  rownames(fixed) <- seq_len(nrow(fixed))
  fixed
}

bad_lths <- lengths(data_bad)

# Rows of any other width cannot be repaired by the rules above; report how
# many are being left out instead of dropping them without notice.
n_unhandled <- sum(!(bad_lths %in% c(13, 15, 16, 17)))
if (n_unhandled > 0) {
  warning(n_unhandled, " rows with an unexpected number of fields were dropped",
          call. = FALSE)
}

data_bad_13 <- fix_field_count(data_bad[bad_lths == 13])
data_bad_15 <- fix_field_count(data_bad[bad_lths == 15])
data_bad_16 <- fix_field_count(data_bad[bad_lths == 16])
data_bad_17 <- fix_field_count(data_bad[bad_lths == 17])
# Stack the repaired rows under the well-formed ones, convert the columns
# that are used as numbers or dates, and write the result as an .Rds file.

# rbind() skips NULL arguments, so the result is NULL only when there were no
# repaired rows at all.
data_fixed <- rbind(data_bad_13, data_bad_15, data_bad_16, data_bad_17)
if (is.null(data_fixed)) {
  # Nothing to append; assigning 14 names to an empty frame would fail.
  data_final <- data_good
} else {
  data_fixed <- as.data.frame(data_fixed)
  names(data_fixed) <- names(data_good)
  data_fixed[] <- lapply(data_fixed, as.character)
  data_final <- rbind(data_good, data_fixed)
}

# Remove any header lines that ended up among the data rows.
data_final <- data_final[data_final$test_id != "test_id", ]

# Values that cannot be parsed become NA (with a warning).
for (col in c("test_id", "vehicle_id", "test_mileage", "cylinder_capacity")) {
  data_final[[col]] <- as.numeric(data_final[[col]])
}
for (col in c("test_date", "first_use_date")) {
  data_final[[col]] <- lubridate::ymd(data_final[[col]])
}

# Make sure the output folder exists so the final write cannot fail on a
# missing directory.
out_file <- paste0(path, "MOT anoymised/clean/test_result_", year, ".Rds")
dir.create(dirname(out_file), showWarnings = FALSE, recursive = TRUE)
saveRDS(data_final, out_file)
# import_mot = function(file){
# data <- readLines(file, n = 100000)
# data <- strsplit(data,",")
# lths <- lengths(data)
#
# data_good <- data[lths == 14]
# data_bad <- data[lths != 14]
# #rm(data)
#
#
#
# #handle the bad data
# fix_mot <- function(sub){
# #cols are
# #test_id number
# #vehicle_id number
# #test_date date
# #test_class_id number 1 digit
# #test_type character 2 letters
# #test_result character 1-3 letters
# #test_mileage number
# #postcode_area character 2 letters
# #make character
# #model character
# #colour character
# #fuel_type character 2 letters
# #cylinder_capacity number
# # first_use_date date
#
# is_int <- !is.na(as.integer(sub))
# is_date <- !is.na(lubridate::ymd(sub))
# n_char <- nchar(sub)
#
# if()
#
#
#
# }
#
#
# }
# # problem reading the data
# foo <- (1:nrow(main_anon))[!main_anon$fuel_type %in% c("DI","PE","EL","HY","OT","GB","LP","FC","ED","GD","CN","GA","LN","ST")]
# foo2 <- main_anon[!main_anon$fuel_type %in% c("DI","PE","EL","HY","OT","GB","LP","FC","ED","GD","CN","GA","LN","ST"),]
# main_anon2 <- readLines(con = "E:/OneDrive - University of Leeds/CREDS Data/MOT anoymised/test_result_2017/test_result_2017.csv", n = 1)
# main_anon2[13833:13836]
# main_anon2[13834]
#
# # Match formats
# main_api$firstUsedDate <- lubridate::ymd(main_api$firstUsedDate)