
Commit 1617deb

add json parsing for serengeti wildebeest count

1 parent bb6e43c

File tree

8 files changed: +34323 −81 lines

scripts/in-development/flattening_functions.R renamed to functions/flattening_functions.R

Lines changed: 0 additions & 18 deletions
@@ -8,24 +8,6 @@
 # library(lubridate)


-###################### EXPLORE DATA #######################
-# Need to evaluate and limit to proper subsets
-
-# check workflow
-check_workflow <- function(data){
-  data %>% group_by(workflow_id, workflow_version) %>%
-    summarise(date = max(created_at), count = n()) %>%
-    print
-}
-
-
-# View classifications and dates of workflows to limit data out to proper workflow version and number
-
-View_json <- function(jdata) {
-  for (i in 1:50) {
-    jdata$annotations[i] %>% prettify %>% print
-  }
-}

 ####################### FLATTEN #######################
 # Data formats:

functions/quick_functions.R

Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
+###################### EXPLORE DATA #######################
+# Need to evaluate and limit to proper subsets
+
+# check workflow
+check_workflow <- function(data){
+  data %>% group_by(workflow_id, workflow_version) %>%
+    summarise(date = max(created_at), count = n()) %>%
+    print
+}
+
+
+# View classifications and dates of workflows to limit data out to proper workflow version and number
+
+View_json <- function(jdata) {
+  for (i in 1:50) {
+    jdata$annotations[i] %>% prettify %>% print
+  }
+}
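
As committed, View_json assumes the export has at least 50 rows; 1:50 will index past the end of a shorter data frame. A minimal hardening sketch (the n argument and the seq_len guard are editorial suggestions, not part of this commit):

View_json <- function(jdata, n = 50) {
  # cap the loop at the number of rows actually present
  for (i in seq_len(min(n, nrow(jdata)))) {
    jdata$annotations[i] %>% prettify %>% print
  }
}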
File renamed without changes.

scripts/.DS_Store

4 KB
Binary file not shown.

scripts/in-development/json-parsing-wildebeest.R

Lines changed: 0 additions & 63 deletions
This file was deleted.

Lines changed: 76 additions & 0 deletions (new file; path not shown in this view)
@@ -0,0 +1,76 @@
+library(tidyjson)
+library(magrittr)
+library(jsonlite)
+library(dplyr)
+library(stringr)
+library(tidyr)
+
+source("functions/quick_functions.R") # adds the check_workflow and View_json functions
+
+# this works for the old version of the Serengeti wildebeest count; set the working directory as appropriate
+wilde <- read.csv("scripts/points-wildebeest/wildebeest_2016_sample.csv", stringsAsFactors = F)
+
+check_workflow(wilde)
+
+# Filter to the relevant workflow version. You may want to combine multiple versions, depending on the changes that have been made to the project.
+dat <- wilde %>% filter(., workflow_id == 78, workflow_version == 36.60)
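# (sketch, not part of the diff: to keep several compatible workflow versions,
#  filter with %in% instead; the second version number here is hypothetical)
# dat <- wilde %>% filter(., workflow_id == 78, workflow_version %in% c(36.60, 36.61))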
+
+View_json(dat)
+dat$annotations[1] %>% prettify
+
+
+# View the data structure, note that anything with zero length "value" field is dropped
+dat$annotations %>% as.tbl_json %>%
+  gather_array() %>%
+  spread_values(task = jstring("task"), tasklabel = (jstring("task_label"))) %>%
+  enter_object("value") %>%
+  gather_array() %>%
+  gather_keys() %>%
+  append_values_string() %>% head %>% View
+
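# (sketch, not part of the diff: the pipeline above assumes each annotation is a
#  JSON array of task objects whose "value" holds one object per mark, roughly
#  [{"task":"T0","task_label":"...","value":[{"x":102.5,"y":310.2,"tool":0,"tool_label":"..."}]}]
#  the numbers are invented; the field names are the ones the code extracts)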
+# Grab the top-level info for ALL classifications
+# produces one row per classification per subject; final column indicates how many x-y coordinates were made in that classification.
+all_submissions <- dat %>%
+  select(., subject_ids, classification_id, user_name, workflow_id, workflow_version, created_at, annotations) %>%
+  as.tbl_json(json.column = "annotations") %>%
+  gather_array(column.name = "task_index") %>%
+  spread_values(task = jstring("task"), task_label = jstring("task_label")) %>%
+  gather_keys() %>%
+  json_lengths(column.name = "total_marks") %>%
+  filter(., key == "value")
+
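# (sketch, not part of the diff: a quick way to eyeball how marks are distributed)
# all_submissions %>% count(total_marks)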
+# produces one row per mark per classification per subject, but only keeps classifications with >0 marks
+flattened <- dat %>%
+  select(., subject_ids, classification_id, user_name, workflow_id, workflow_version, created_at, annotations) %>%
+  as.tbl_json(json.column = "annotations") %>%
+  gather_array(column.name = "task_index") %>%
+  spread_values(task = jstring("task"), task_label = (jstring("task_label"))) %>%
+  enter_object("value") %>%
+  gather_array(column.name = "mark_index") %>% # don't gather keys here; the whole point is to spread the keys out
+  spread_values(tool_label = jstring("tool_label"), xcoord = jnumber("x"), ycoord = jnumber("y"), tool = jstring("tool"))
+
+
+
+# check that this captures all the data; the pieces should sum to the original total classifications
+# dat %>% summarise(., n(), n_distinct(classification_id), n_distinct(subject_ids)) # original data
+# all_submissions %>% summarise(., n(), n_distinct(classification_id), n_distinct(subject_ids)) # maintains one row per classification
+# all_submissions %>% filter(., total_marks == 0) %>% summarise(., n(), n_distinct(classification_id), n_distinct(subject_ids)) # number of "empty" classifications
+# flattened %>% summarise(., n(), n_distinct(classification_id), n_distinct(subject_ids)) # number of non-empty classifications
+
+original_class <- n_distinct(dat$classification_id)
+empty_class <- n_distinct(filter(all_submissions, total_marks == 0)$classification_id)
+nonempty_class <- n_distinct(flattened$classification_id)
+
+ifelse(empty_class + nonempty_class == original_class, "yay", "boo")
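# (sketch, not part of the diff: a stricter alternative that halts the script on a mismatch)
# stopifnot(empty_class + nonempty_class == original_class)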
+
+# recombine the datasets: join the per-mark rows back onto the full set of classifications
+# all_submissions has one record per classification per subject
+# flattened has one record per mark per classification, but only for classifications with >0 marks
+
+tot <- left_join(all_submissions, flattened)
+
+data_out <- tot %>%
+  mutate(., task_label = str_trunc(task_label, width = 25)) %>%
+  select(., -task_index, -key)
+
+write.csv(x = data_out, file = "flattened-wildebeest_2016_sample.csv")
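
Because left_join is called without a by argument, dplyr joins on every column the two tables share and prints a message naming them. A sketch of the explicit equivalent, with the key inferred from the two pipelines above (it is not stated in the commit itself):

tot <- left_join(
  all_submissions, flattened,
  by = c("subject_ids", "classification_id", "user_name", "workflow_id",
         "workflow_version", "created_at", "task_index", "task", "task_label")
)

Passing row.names = FALSE to write.csv would also drop the unnamed row-index column from the output file.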
