-
Notifications
You must be signed in to change notification settings - Fork 17
/
Copy pathOutliers.R
187 lines (148 loc) · 4.92 KB
/
Outliers.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
rm(list=ls(all=TRUE))
setwd("../data/outlier/")
library("lubridate")
library("outliers")
library("moments")
library("seasonal")
library("forecast")
library("ForeCA")
# Function to calculate desc. stats.
desc.stat <- function(z)
{
cc <- c(mean(z), median(z), sd(z), min(z),max(z), skewness(z), kurtosis(z))
cc <- as.matrix(cc)
rownames(cc) <- c("Mean", "Median", "Std. Dev.", "Min", "Max", "Skewness", "Kurtosis")
return(cc)
}
# Set the seed
set.seed(123456)
# Starting date
start <- as.POSIXct("2017-01-01")
interval <- 1 # observation every minute
ndays <- 500
end <- start + as.difftime(ndays, units="days") # say for 500 days
d <- seq(from=start, by=interval*60, to=end)
# Generate a dataset
# which takes values 0:4
# this could also be a categorical dataset translated to numerical
x <- round(runif(NROW(d), min=0, max=4))
# Add some outliers
o <- 2*round(NROW(x)^0.5)
o <- round(runif(o, min=1, max=NROW(x)))
x[o] <- mean(x)+6*sd(x)
# Now, show a day
ysel <- which(date(d)==as.Date("2017-01-01"))
y <- x[ysel]
yd <- d[ysel]
plot(yd, y, type="l", xlab="Time", ylab="", main="2017-01-01")
od1 <- scores(y, type="z", prob=0.90)
od2 <- scores(y, type="z", prob=0.95)
od3 <- scores(y, type="z", prob=0.99)
op <- od1
op[which(op==TRUE)] <- y[which(op==TRUE)]
op[op==FALSE] <- NA
points(yd, op, col="green", pch=19)
op <- od2
op[which(op==TRUE)] <- y[which(op==TRUE)]
op[op==FALSE] <- NA
points(yd, op, col="blue", pch=19)
op <- od3
op[which(op==TRUE)] <- y[which(op==TRUE)]
op[op==FALSE] <- NA
points(yd, op, col="red", pch=19)
boxplot(y, main="2017-01-01")
# Remove outliers on a daily basis and assign the day median
# Then, aggregate daily
aggd <- unique(date(d))
aggx <- rep(NA, NROW(aggd))
aggx2 <- aggx
aggdl <- list()
for(i in 1:NROW(aggd))
{
xsel <- which(date(d)==aggd[i])
od <- scores(x[xsel], type="z", prob=0.99)
x2 <- x[xsel]
x2[which(od==TRUE)] <- median(x2)
aggx[i] <- sum(x[xsel])
aggx2[i] <- sum(x2)
aggdl <- c(aggdl, list(x[xsel]))
cat("Now done ", i, " of ", NROW(aggd), "\n")
}
aggd <- aggd[1:ndays]
aggx <- aggx[1:ndays]
aggx2 <- aggx2[1:ndays]
aggdl <- aggdl[,,1:ndays]
# Do some daily box-plots
aggdl <- matrix(unlist(aggdl), ncol=ndays, byrow=TRUE)
bp <- boxplot(aggdl[,1:50], main="First 50 days")
# Compare with and without
plot(aggd, aggx, type="l", main="Daily Aggregation", xlab="", ylab="")
lines(aggd, aggx2, col="blue")
dstat <- round(cbind(desc.stat(aggx), desc.stat(aggx2)),3)
write.csv(dstat, "./out/dstat.csv")
boxplot(aggx2, main="Daily Time Series, N=500")
od4 <- scores(aggx2, type="z", prob=0.99)
plot(aggd, aggx2, type="l", xlab="Time", ylab="", main="Daily Time Series, N=500")
op <- od4
op[which(op==TRUE)] <- aggx2[which(op==TRUE)]
op[op==FALSE] <- NA
points(aggd, op, col="red", pch=19)
# Now let's try to introduce seasonal effects which will lead
# to seasonal daily time series
# we re-generate the series without the outliers
# using the set.seed() this does not cause any problems
x <- round(runif(NROW(d), min=0, max=4))*(1+wday(d)/100)
aggd <- unique(date(d))
aggx <- rep(NA, NROW(aggd))
aggx2 <- aggx
aggdl <- list()
for(i in 1:NROW(aggd))
{
xsel <- which(date(d)==aggd[i])
od <- scores(xs[xsel], type="z", prob=0.99)
x2 <- x[xsel]
x2[which(od==TRUE)] <- median(x2)
aggx[i] <- sum(x[xsel])
aggx2[i] <- sum(x2)
aggdl <- c(aggdl, list(x[xsel]))
cat("Now done ", i, " of ", NROW(aggd), "\n")
}
aggd <- aggd[1:ndays]
aggx <- aggx[1:ndays]
aggx2 <- aggx2[1:ndays]
aggdl <- aggdl[,,1:ndays]
# Compare
plot(aggd, aggx, type="l",
main="Daily Aggregation, Weekly Pattern", xlab="", ylab="")
par(mfrow=c(1,2))
xsel <- which(date(d)=="2017-01-01")
plot(d[xsel], x[xsel], type="l", main="2017-01-01", xlab="", ylab="")
xsel <- which(date(d)=="2017-01-07")
plot(d[xsel], x[xsel], type="l", main="2017-01-07", xlab="", ylab="")
par(mfrow=c(1,1))
plot(d[1:(14*1440)], x[1:(14*1440)], type="l",
main="Daily Aggregation, Weekly Pattern", xlab="", ylab="")
# Decompose
# Input: aggx is a numeric vector of aggregated time series in weekly frequency
# First, we transform the numeric vector in a time series (ts) object
# correctly specifying the frequency.
tsaggx <- ts(aggx, frequency=7)
# Then, we use the ts object, tsaggx, as the input in the LOESS function.
# s.window: can be a string "periodic" or "per" which reads the frequency from the ts transformation
# otherwise it can be a user choice.
ss <- stl(tsaggx, s.window="per")
# Plot the output
plot(ss, main="Daily Aggregation, Weekly Pattern")
# Extract the seasonal component (xs) and the trend component (xt)
xs <- ss$time.series[,1]
xt <- ss$time.series[,2]
# Calculate the cleaned series
xc <- tsaggx-xs-xt
# And create a plot
# plot(xc, type="l", xlab="", ylab="", main="Detrended and Deseasonalised")
plot(aggd, xc, type="l", xlab="", ylab="", main="Detrended and Deseasonalised")
od5 <- scores(xc, type="z", prob=0.99)
op <- od5
op[which(op==TRUE)] <- xc[which(op==TRUE)]
op[op==FALSE] <- NA
points(aggd, op, col="red", pch=19)