-
Notifications
You must be signed in to change notification settings - Fork 29.6k
/
scatter.R
86 lines (73 loc) Β· 2.37 KB
/
scatter.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
#!/usr/bin/env Rscript
library(ggplot2);
library(plyr);
# Locate the directory that contains this script and load ./_cli.R from it.
#
# Rscript hands the script path to R as a single `--file=<path>` argument.
# The pattern is anchored on purpose: an unanchored "--file" would also match
# user-supplied options such as `--filename`, producing several candidate
# paths and making source() fail.
args = commandArgs(trailingOnly = FALSE);
script.arg = grep("^--file=", args, value = TRUE);
if (length(script.arg) == 0) {
  stop("cannot determine the script location: no --file= argument found");
}
# R's own --file= argument precedes any user arguments, so take the first hit.
dirname = dirname(sub("^--file=", "", script.arg[1]));
# chdir = TRUE makes source() evaluate _cli.R with its directory as the
# working directory (restored afterwards).
source(file.path(dirname, '_cli.R'), chdir = TRUE);
# Read and validate the command-line options.
#
# `args.options` is not defined in this file; it is expected to be created by
# the sourced _cli.R. NOTE(review): a flag given without a value appears to be
# stored as logical TRUE (the previous check compared --plot against TRUE) --
# confirm against _cli.R.
#
# Options are looked up with `[[ ]]` rather than `$` because `$` on a list
# partial-matches names, so e.g. an unrelated `--logfile` option would have
# been mistaken for `--log`.
x.axis.name = args.options[["xaxis"]];
category.name = args.options[["category"]];
plot.filename = args.options[["plot"]];

# --xaxis and --category are required and need a value; --plot is optional
# but, when present, needs a filename. isTRUE() only matches a bare flag, so
# a value that merely spells "TRUE" is still accepted.
if (is.null(x.axis.name) || isTRUE(x.axis.name) ||
    is.null(category.name) || isTRUE(category.name) ||
    isTRUE(plot.filename)) {
  stop("usage: cat file.csv | Rscript scatter.R [variable=value ...]
--xaxis variable variable name to use as xaxis (required)
--category variable variable name to use as colored category (required)
--plot filename save plot to filename
--log use a log-2 scale for xaxis in the plot");
}

# --log is a pure switch: its presence alone enables the log-2 x scale.
use.log2 = !is.null(args.options[["log"]]);
# Read the benchmark results as CSV from standard input (stripping white
# space around unquoted fields) and hold them as a data frame.
dat = data.frame(read.csv(file("stdin"), strip.white = TRUE));
# Determine which variables are aggregated (averaged over): start from every
# column that is neither one of the fixed columns nor one of the two grouping
# variables ...
column.names = names(dat);
is.reserved = column.names %in%
  c('rate', 'time', 'filename', x.axis.name, category.name);
aggregate = column.names[!is.reserved];

# ... then drop the columns that hold exactly one distinct value, since a
# variable that never changes is not really aggregated.
is.varying = vapply(
  aggregate,
  function(key) length(unique(dat[[key]])) != 1,
  logical(1)
);
aggregate = aggregate[is.varying];

# Report the aggregated variables, one per line, followed by a blank line
# when there is at least one.
cat(sprintf('aggregating variable: %s\n', aggregate), sep = '');
if (length(aggregate) > 0) {
  cat('\n');
}
# Summarise one group of rows: the mean rate plus the half-width of the 95 %
# confidence interval of that mean (Student's t distribution). The interval
# is NA when the group holds a single sample, as no variance can be estimated.
summarise.rate = function(subdat) {
  samples = subdat$rate;
  n = length(samples);
  half.width = if (n > 1) {
    sqrt(var(samples) / n) * qt(0.975, n - 1)
  } else {
    NA
  };
  data.frame(rate = mean(samples), confidence.interval = half.width)
}

# One summary row per (x-axis value, category) combination, printed as a
# table without row names.
stats = ddply(dat, c(x.axis.name, category.name), summarise.rate);
print(stats, row.names = FALSE);
# When a plot filename was given, draw the statistics and save the figure:
# mean rate against the x-axis variable, one coloured series per category,
# with error bars for the confidence interval.
if (!is.null(plot.filename)) {
  # Columns chosen at run time are mapped through the `.data` pronoun;
  # aes_string(), which used to do this, is deprecated in ggplot2.
  p = ggplot(stats, aes(
    x = .data[[x.axis.name]],
    y = rate,
    colour = .data[[category.name]]
  ));
  # Label the x axis and the legend with the plain variable names, exactly as
  # aes_string() did, independent of how `.data[[...]]` mappings get labelled.
  p = p + labs(x = x.axis.name, colour = category.name);
  if (use.log2) {
    p = p + scale_x_continuous(trans = 'log2');
  }
  # na.rm = TRUE silently skips groups whose interval is NA.
  p = p + geom_errorbar(
    aes(ymin = rate - confidence.interval, ymax = rate + confidence.interval),
    width = .1, na.rm = TRUE
  );
  p = p + geom_point();
  p = p + geom_line();
  p = p + ylab("rate of operations (higher is better)");
  # The title is the first cell of the input data -- presumably the benchmark
  # filename column; confirm against the CSV producer.
  p = p + ggtitle(dat[1, 1]);
  ggsave(plot.filename, p);
}