General update, mostly sourcecode, Jul 2018

Derek-Jones · Jul 1, 2018 · c96a988 · c96a988
1 parent e7a2d74
commit c96a988
Show file tree

Hide file tree

Showing 257 changed files with 3,492 additions and 653 deletions.
diff --git a/CHANGES b/CHANGES
@@ -1,5 +1,9 @@
 Change log
 
+ 1 Jul 18
+
+General update, mostly source code related, plus added TAGs
+
  2 Apr 18
 
 Reliability draft pdf release, plus more data and tweaks

diff --git a/benchmark/1509-all64.R b/benchmark/1509-all64.R
@@ -0,0 +1,50 @@
+#
+# 1509-all64.R, 28 May 18
+# Data from:
+# Array Layouts for Comparison-Based Searching
+# Paul-Virak Khuong and Pat Morin
+#
+# Example from:
+# Empirical Software Engineering using R
+# Derek M. Jones
+
+source("ESEUR_config.r")
+
+
+# library("dplyr")
+
+
+pal_col=rainbow(5)
+
+
+plot_run=function(df, col_str="black")  # Add one line to the current plot, in colour col_str
+{
+lines(df$items, df$seconds, col=col_str)  # Reads the items (x) and seconds (y) columns of df
+}
+
+
+all64=read.csv(paste0(ESEUR_dir, "benchmark/1509-all64.csv.xz"), as.is=TRUE)
+
+# Empty log-log frame sized to the range of the data; the runs are drawn onto it below
+plot(0, type="n", log="xy",
+	xaxs="i", yaxs="i",
+	xlim=range(all64$items), ylim=range(all64$seconds),
+	xlab="Array size", ylab="Runtime (secs)\n")
+
+# d_ply(all64, .(alg), plot_run)
+# One line per selected alg value, each drawn in its own palette colour
+plot_run(subset(all64, alg == "eytzinger_bf"), pal_col[1])
+plot_run(subset(all64, alg == "eytzinger_branchy"), pal_col[2])
+plot_run(subset(all64, alg == "sorted_bfp"), pal_col[3])
+plot_run(subset(all64, alg == "btree16_bf_a"), pal_col[4])
+plot_run(subset(all64, alg == "btree32_a"), pal_col[5])
+
+
+# Sizes of L1, L2, and L3 cache, marked by labelled vertical grey lines at 2^13, 2^16 and 2^21
+lines(c(2^13, 2^13), c(1e-3, 2), col="grey")
+text(2^13, 1, "L1")
+lines(c(2^16, 2^16), c(1e-3, 2), col="grey")
+text(2^16, 1, "L2")
+lines(c(2^21, 2^21), c(1e-3, 2), col="grey")
+text(2^21, 1, "L3")
+
diff --git a/benchmark/1509-all64.csv.xz b/benchmark/1509-all64.csv.xz
diff --git a/communicating/github-lang-pairs.R b/communicating/github-lang-pairs.R
@@ -1,5 +1,5 @@
 #
-# github-lang-pairs.R, 14 May 14
+# github-lang-pairs.R, 31 May 18
 #
 # Data from:
 # Popularity, interoperability, and impact of programming languages in 100,000 open source projects
@@ -15,7 +15,7 @@ library("igraph")
 library("plyr")
 
 
-git_lang=read.csv(paste0(ESEUR_dir, "src_measure/github-lang-use.csv.xz"), as.is=TRUE)
+git_lang=read.csv(paste0(ESEUR_dir, "sourcecode/github-lang-use.csv.xz"), as.is=TRUE)
 
 git_lang=subset(git_lang, language != "")
 

diff --git a/developers/a013582.R b/developers/a013582.R
@@ -0,0 +1,29 @@
+#
+# a013582.R, 15 May 18
+# Data from:
+# A MODEL OF HUMAN COGNITIVE BEHAVIOR IN WRITING CODE FOR COMPUTER PROGRAMS, VOL I
+# Ruven Brooks
+# 
+# Example from:
+# Empirical Software Engineering using R
+# Derek M. Jones
+#
+# TAG cognition debugging experiment LOC
+
+
+source("ESEUR_config.r")
+
+
+a013=read.csv(paste0(ESEUR_dir, "developers/a013582.csv.xz"), as.is=TRUE)
+a013$position=1:nrow(a013)  # Row number; presumably rows are in the order the work was done - TODO confirm
+# One-sided formula: plots the three log-transformed columns against each other
+plot(~ log(Writing)+log(Debugging)+log(Lines), data=a013)
+
+# d_mod=glm(log(Debugging) ~ log(Lines), data=a013)
+# There is a learning effect, so row position is included in the model
+d_mod=glm(log(Debugging) ~ log(position)*log(Lines)-log(Lines), data=a013)  # Terms: log(position) + log(position):log(Lines)
+summary(d_mod)
+
+w_mod=glm(log(Lines) ~ log(Writing), data=a013)
+summary(w_mod)
+
diff --git a/developers/a013582.csv.xz b/developers/a013582.csv.xz
diff --git a/developers/adelson1981.R b/developers/adelson1981.R
@@ -0,0 +1,106 @@
+#
+# adelson1981.R, 12 May 18
+# Data from:
+# Problem solving and the development of abstract categories in programming languages
+# Beth Adelson
+#
+# Example from:
+# Empirical Software Engineering using R
+# Derek M. Jones
+#
+# TAG experiment cognition memory-recall
+
+
+source("ESEUR_config.r")
+
+
+plot_layout(2, 1)
+
+
+# Matrix for holding the item pairwise distances: all zeros, rows/columns named by the global line_loc
+mk_item_mat=function()
+{
+item_mat=matrix(data=0, nrow=length(line_loc), ncol=length(line_loc))  # Square, one row/column per item
+colnames(item_mat)=line_loc
+rownames(item_mat)=line_loc
+
+return(item_mat)
+}
+
+
+
+# Calculate the distance (difference in list position) between all pairs of items in a list
+calc_dist=function(items)  # items: vector of line_loc values, may contain NA; returns a line_loc x line_loc matrix
+{
+
+   # Calculate distance between one item and all other items; fills row X of recall_mat
+   item_dist=function(X)
+   {
+   items=items[!is.na(items)]  # Local copy with NAs dropped; the enclosing items is unchanged
+
+   op_pos=which(items == X)  # Position of X in the list, empty if X is absent
+   # Missing items are given fixed distance from all other items
+   if (length(op_pos) == 0)
+      {
+      recall_mat[cbind(rep(X, length(line_loc)), line_loc)] <<- item_na_len  # Whole of row X
+      recall_mat[cbind(line_loc, rep(X, length(line_loc)))] <<- item_na_len  # Whole of column X
+      recall_mat[X, X] <<- 0  # Distance of an item from itself stays zero
+      return(0)  # Value is ignored; the result is the update of recall_mat
+      }
+
+   dist_vec=abs(1:length(items) - op_pos)  # assumes X occurs at most once in items - TODO confirm
+
+   # print(c(X, dist_vec))
+
+   recall_mat[cbind(rep(X, length(items)), items)] <<- dist_vec  # <<- updates recall_mat of the enclosing calc_dist call
+
+   return(0)
+   }
+
+recall_mat=mk_item_mat()  # Must exist before item_dist runs, so that <<- finds it here
+d_mat=sapply(line_loc, item_dist)  # Called for its side effect on recall_mat; d_mat is not used
+
+return(recall_mat)
+}
+
+
+adel=read.csv(paste0(ESEUR_dir, "developers/adelson1981.csv.xz"), as.is=TRUE)
+
+# Item names: three programs and the lines they contain (presumably program_line)
+line_loc=c("1_0", "1_1", "1_2", "1_3", "1_4",
+		"2_0", "2_1", "2_2", "2_3", "2_4",
+		"3_0", "3_1", "3_2", "3_3", "3_4", "3_5")
+# Program statement kind.  This entry has no NAs
+line_kind=adel$e7C[order(adel$e7p)]  # NOTE(review): lines up with line_loc only if sorted e7p equals line_loc - confirm
+
+# Distance given to items missing from a list (used by calc_dist).  Seems as good a value as any other
+item_na_len=length(line_loc)/2
+# Sum the distance matrices of columns e6p..e10p; divided by 5 below to give the mean
+teacher_dist=calc_dist(adel$e6p)
+teacher_dist=teacher_dist+calc_dist(adel$e7p)
+teacher_dist=teacher_dist+calc_dist(adel$e8p)
+teacher_dist=teacher_dist+calc_dist(adel$e9p)
+teacher_dist=teacher_dist+calc_dist(adel$e10p)
+
+d_dist=dist(teacher_dist/5)  # Distances between the rows of the mean matrix
+hc=hclust(d_dist)
+# point_col is not defined in this script; presumably set by ESEUR_config.r
+plot(hc, main="Teachers", sub="", col=point_col,
+	xlab="Items", ylab="")
+
+# Same analysis for columns n1p..n5p
+student_dist=calc_dist(adel$n1p)
+student_dist=student_dist+calc_dist(adel$n2p)
+student_dist=student_dist+calc_dist(adel$n3p)
+student_dist=student_dist+calc_dist(adel$n4p)
+student_dist=student_dist+calc_dist(adel$n5p)
+
+rownames(student_dist)=paste0(line_loc, "-", line_kind)  # NOTE(review): teacher_dist rows are not relabelled this way - confirm intended
+
+d_dist=dist(student_dist/5)
+hc=hclust(d_dist)
+
+plot(hc, main="Students", sub="", col=point_col,
+	xlab="Items", ylab="")
+
+
diff --git a/developers/adelson1981.csv.xz b/developers/adelson1981.csv.xz
diff --git a/developers/api-struct/api-robinson.R b/developers/api-struct/api-robinson.R
@@ -1,5 +1,5 @@
 #
-# api-robinson.R, 10 Oct 16
+# api-robinson.R, 18 May 18
 #
 # Data from:
 # Developer characterization of data structure fields decisions
@@ -16,7 +16,7 @@ library("seriation")
 library("grid") # Yes, seriation uses grid graphics
 
 
-pal_col=diverge_hcl(10)
+pal_col=heat_hcl(10)
 
 fields=read.csv(paste0(ESEUR_dir, "developers/api-struct/similar_08.csv.xz"), as.is=TRUE)
 rownames(fields)=colnames(fields)
@@ -26,5 +26,5 @@ fmat=as.matrix(fields)
 fdist = as.dist(1 - fmat/max(fmat))
 fser = seriate(fdist, method="BBURCG")
 
-pimage(fdist, fser, col=pal_col, key=FALSE, gp=gpar(cex=0.8))
+pimage(fdist, fser, col=rev(pal_col), key=FALSE, gp=gpar(cex=0.8))
 
diff --git a/developers/like-n-dis.R b/developers/like-n-dis.R
@@ -1,7 +1,7 @@
 #
-# like-n-dis.R, 17 Apr 17
+# like-n-dis.R, 23 Apr 18
 # Data from:
-# The sources and Consequences of the Fluent Processing of numbers
+# The Sources and Consequences of the Fluent Processing of numbers
 # Dan King and Chris Janiszewski
 #
 # Example from:
@@ -11,18 +11,21 @@
 source("ESEUR_config.r")
 
 
-plot_wide()
-
-plot_layout(3, 1, default_width=14)
+plot_layout(2, 1)
 
 
 lnd=read.csv(paste0(ESEUR_dir, "developers/like-n-dis.csv.xz"), as.is=TRUE)
 
-plot(lnd$Number, lnd$Like, type="l",
-	xlab="Like", ylab="Number")
+plot(lnd$Number, lnd$Like, type="l", col=point_col,
+	xaxs="i",
+	xlab="Number", ylab="Like\n")
+lines(loess.smooth(lnd$Number, lnd$Like, span=0.3), col=loess_col)
+
+
+spectrum(lnd$Like, main="Spectrum density", sub="", col=point_col,
+		xlab="Frequency", ylab="Density\n")
 
+# spectrum(lnd$Dislike)
 
-spectrum(lnd$Like)
-spectrum(lnd$Dislike)
-spectrum(lnd$Neutral)
+# spectrum(lnd$Neutral)
 
diff --git a/ecosystems/CompWorld85.R b/ecosystems/CompWorld85.R
@@ -0,0 +1,43 @@
+#
+# CompWorld85.R,  3 Jun 18
+# Data from:
+# Hardware Roundup
+# Tom Henkel
+# ComputerWorld, 19 Aug 1985, pages 23--34
+#
+# Example from:
+# Empirical Software Engineering using R
+# Derek M. Jones
+#
+# TAG hardware performance price 1985
+
+
+source("ESEUR_config.r")
+
+
+library("plyr")
+
+
+plot_pts=function(df)  # Add the rows of df to the current plot, coloured by their col_str column
+{
+points(df$Memory.Size, df$Purchase.Price, col=df$col_str)
+# Alternative x-axis (Relative.Performance), currently disabled:
+# points(as.numeric(df$Relative.Performance), df$Purchase.Price, col=df$col_str)
+}
+
+
+cw85=read.csv(paste0(ESEUR_dir, "ecosystems/CompWorld85.csv.xz"), as.is=TRUE)
+# One rainbow colour per distinct Category
+cat_str=unique(cw85$Category)
+pal_col=rainbow(length(cat_str))
+cw85$col_str=mapvalues(cw85$Category, cat_str, pal_col)  # Per-row colour: each Category replaced by its palette entry
+# Empty log-log frame spanning the data; the points are added per Category below
+plot(1, type="n", log="xy",
+	# xlim=c(5, 5000), ylim=range(cw85$Purchase.Price),
+	xlim=range(cw85$Memory.Size), ylim=range(cw85$Purchase.Price),
+	xlab="Memory (MB)", ylab="Purchase price ($)\n")
+# plot_pts is called once per Category, for its plotting side effect only
+d_ply(cw85, .(Category), plot_pts)
+
+legend(x="bottomright", legend=cat_str, bty="n", fill=pal_col, cex=1.2)  # cat_str and pal_col are in matching order
+
diff --git a/ecosystems/CompWorld85.csv.xz b/ecosystems/CompWorld85.csv.xz
diff --git a/ecosystems/GovMachineHistComputer.R b/ecosystems/GovMachineHistComputer.R
@@ -0,0 +1,30 @@
+#
+# GovMachineHistComputer.R, 19 May 18
+# Data from:
+# The Government Machine {A} Revolutionary History of the Computer
+# Jon Agar
+#
+# Example from:
+# Empirical Software Engineering using R
+# Derek M. Jones
+#
+# TAG government hardware card office-equipment
+
+source("ESEUR_config.r")
+
+
+pal_col=rainbow(4)  # One colour per series plotted below
+
+
+cards=read.csv(paste0(ESEUR_dir, "ecosystems/GovMachineHistComputer.csv.xz"), as.is=TRUE)
+# First series creates the plot; ylim is fixed, presumably chosen to fit all four series
+plot(cards$Financial_year, cards$Typewriters_duplicators, log="y", col=pal_col[1],
+	ylim=c(9e3, 1e6),
+	xlab="Year", ylab="Expenditure (in #)\n")  # NOTE(review): "#" looks like a stand-in for a currency symbol - confirm
+# Remaining series are added to the same axes
+points(cards$Financial_year, cards$Other, col=pal_col[2])
+points(cards$Financial_year, cards$Tabulators_rent, col=pal_col[3])
+points(cards$Financial_year, cards$Cards, col=pal_col[4])
+# Legend entries are listed in the same order as pal_col[1..4] is used above
+legend(x="topleft", legend=c("Typewriters/duplicators", "Other", "Tabulator rental", "Cards"), bty="n", fill=pal_col, cex=1.2)
+
diff --git a/ecosystems/GovMachineHistComputer.csv.xz b/ecosystems/GovMachineHistComputer.csv.xz
diff --git a/ecosystems/eindor1985.R b/ecosystems/eindor1985.R
@@ -0,0 +1,41 @@
+#
+# eindor1985.R,  1 Jun 18
+# Data from:
+# Grosch's Law Re-revisited: {CPU} Power and the Cost of Computation
+# Phillip Ein-Dor
+#
+# Example from:
+# Empirical Software Engineering using R
+# Derek M. Jones
+#
+# TAG hardware performance MIPS cost memory
+
+
+source("ESEUR_config.r")
+
+
+library("plyr")
+
+
+plot_pts=function(df)  # Add the rows of df to the current plot, coloured by their col_str column
+{
+points(df$Max_Memory, df$Average_cost, col=df$col_str)
+# Alternative x-axis (MIPS), currently disabled: points(df$MIPS, df$Average_cost, col=df$col_str)
+}
+
+
+ein=read.csv(paste0(ESEUR_dir, "ecosystems/eindor1985.csv.xz"), as.is=TRUE)
+# One rainbow colour per distinct Category
+cat_str=unique(ein$Category)
+pal_col=rainbow(length(cat_str))
+ein$col_str=mapvalues(ein$Category, cat_str, pal_col)  # Per-row colour: each Category replaced by its palette entry
+# Empty log-log frame; the points are added per Category below
+plot(0.1, type="n", log="xy",
+	yaxs="i",
+	xlim=range(ein$Max_Memory), ylim=c(1, 800),  # Fixed y-range: costs outside 1..800 fall outside the plot region
+	xlab="Maximum memory (Kbytes)", ylab="Average cost\n")
+# plot_pts is called once per Category, for its plotting side effect only
+d_ply(ein, .(Category), plot_pts)
+
+legend(x="bottomright", legend=cat_str, bty="n", fill=pal_col, cex=1.2)  # cat_str and pal_col are in matching order
+
diff --git a/ecosystems/eindor1985.csv.xz b/ecosystems/eindor1985.csv.xz