
Commit aa53b01

initiating git for data processing repo


44 files changed, +5131 -0 lines

DataProcessing.Rproj

Lines changed: 13 additions & 0 deletions
```
Version: 1.0

RestoreWorkspace: No
SaveWorkspace: No
AlwaysSaveHistory: Default

EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 5
Encoding: UTF-8

RnwWeave: Sweave
LaTeX: pdfLaTeX
```

notebooks/.DS_Store

6 KB binary file, not shown.

notebooks/json-parsing-chicago.Rmd

Lines changed: 108 additions & 0 deletions

---
title: "JSON-Parsing Survey Tasks: Chicago"
output: html_notebook
---

This code flattens the Chicago Wildlife Watch data.

```{r}
library(tidyjson)
library(magrittr)
library(jsonlite)
library(dplyr)
library(stringr)
library(tidyr)

chicago_unfiltered <- read.csv("../data/chicago-wildlife-watch-classifications.csv", stringsAsFactors = F)
```

First, we need to limit the classification data to the final workflow version and, if necessary, split by task. T0 is clearly the only task we care about in this dataset (though note the changed format of the current-site field).

```{r}
# check which workflow version we want:
chicago_unfiltered %>% summarise(., n_distinct(subject_ids), n_distinct(classification_id), n_distinct(workflow_version))

quick_check <- chicago_unfiltered %>%
  select(., subject_ids, classification_id, workflow_version, annotations) %>%
  as.tbl_json(json.column = "annotations") %>%
  gather_array(column.name = "task_index") %>% # really important for joining later
  spread_values(task = jstring("task"), task_label = jstring("task_label"), value = jstring("value")) %>%
  gather_keys() %>%
  append_values_string()

quick_check %>% data.frame %>% group_by(., workflow_version, key, task) %>% summarise(., classification_count = n()) %>% print
```

So filter to the appropriate workflow and get going! Let's take a quick peek at the data.

```{r}
chicago <- chicago_unfiltered %>% filter(., workflow_version == 397.41)
chicago$annotations[1] %>% prettify()
```
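
For orientation, a survey-task annotation here has roughly the shape sketched below. The field names match the spread_values() calls in the following chunks; the values (and the task_label) are invented for illustration, so the real prettify() output will differ in detail.

```{r}
# a hand-written sketch of the annotation shape; values are invented
'[{"task":"T0","task_label":"...","value":[{"choice":"RACCOON","answers":{"HWMN":"1"}}]}]' %>%
  prettify()
```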

```{r}
# preliminary flat
basic_flat_with_values <- chicago %>%
  select(., subject_ids, classification_id, workflow_version, annotations) %>%
  as.tbl_json(json.column = "annotations") %>%
  gather_array(column.name = "task_index") %>% # really important for joining later
  spread_values(task = jstring("task"), task_label = jstring("task_label"), value = jstring("value"))

basic_flat_with_values %>% data.frame %>% head

chicago_summary <- basic_flat_with_values %>%
  gather_keys() %>%
  append_values_string()

chicago_summary %>% data.frame %>% head # this has every classification ID; if value is empty, the field is null. There are multiple rows per classification when multiple tasks were completed.

chicago_summary %>% data.frame %>% group_by(., workflow_version, key, task) %>% summarise(., n())

# quick check the filtered original data
chicago %>% summarise(., n_distinct(subject_ids), n_distinct(classification_id), n_distinct(workflow_version))
```

Now dive into the first nested object, the species choice. Note that if you have task types you haven't filtered out, or if you have null objects, this step might break or silently drop rows.

```{r}
# grab choices; append embedded array values just for tracking.
# Note that this will break if any of the tasks are simple questions; you would need to split by task before this point.
chicago_choices <- basic_flat_with_values %>%
  enter_object("value") %>% json_lengths(column.name = "total_species") %>%
  gather_array(column.name = "species_index") %>% # each classification is an array, so you need to gather up multiple arrays
  spread_values(choice = jstring("choice"), answers = jstring("answers")) # append the answers as characters just in case

# if multiple species are identified, there will be multiple rows and species_index will be > 1
chicago_choices %>% data.frame %>% head
chicago_choices %>% group_by(., classification_id) %>% summarise(., count = n(), max(species_index)) %>% arrange(., -count)
```
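
If your export does mix task types, the safest guard is to split by task before entering the value object, just as the Michigan notebook does with its survey task. A minimal sketch (T0 is the survey task here):

```{r}
# a sketch: keep only the survey task before flattening choices,
# mirroring the filter(., task == "T3") step in the Michigan notebook
survey_only <- basic_flat_with_values %>% filter(., task == "T0")
```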

Now dive into the second nested object, the subquestions. Since these aren't arrays, it's okay if they're empty: the rows are still kept.
```{r}
# grab answers - this keeps rows even when there are no answers (the answers field is an object, not an array)
# Note that this last bit is the part that would need to be customized per team.
chicago_answers <- chicago_choices %>%
  enter_object("answers") %>%
  spread_values(how_many = jstring("HWMN"), wow = jstring("CLCKWWFTHSSNWSMPHT"), off_leash = jstring("CLCKSFDGSFFLSH"))

chicago_answers %>% data.frame %>% head
# chicago_answers %>% group_by(classification_id) %>% summarise(., n())
```

Put everything back together, which is important if you've dropped rows because of empty arrays and the like.
```{r}
# in theory, you want to tie all of these back together just in case there are missing values
add_choices <- left_join(basic_flat_with_values, chicago_choices)
tot <- left_join(add_choices, chicago_answers)
flat_data <- tot %>% select(., -task_index, -task_label, -value, -answers)

flat_data %>% data.frame %>% head
```

Here's your file out!
```{r}
write.csv(flat_data, file = "../data/chicago-flattened.csv")
```

notebooks/json-parsing-chicago.nb.html

Lines changed: 439 additions & 0 deletions
Large diffs are not rendered by default.

notebooks/json-parsing-examples.Rmd

Lines changed: 114 additions & 0 deletions

---
title: "R Notebook"
output: html_notebook
---

```{r}
library(tidyjson)
library(magrittr)
library(jsonlite)
library(dplyr)
```

# JSON Parsing

Each classification is an array. Depending on the workflow and how it has changed over time, classification arrays may vary in structure within a single project. Empty arrays also seem to be problematic. Depending on the type of project, you probably want to split the data by workflow, and even limit the workflow version, prior to flattening.

---

#### Load example data
```{r load example data}
sas <- read.csv("../data/questions-SAS-1000.csv", stringsAsFactors = F)
kitteh <- read.csv("../data/kitteh-zoo-classifications.csv", stringsAsFactors = F)
wilde <- read.csv("../data/points-wildebeest.csv", stringsAsFactors = F)
chicago <- read.csv("../data/chicago-wildlife-watch-classifications.csv", stringsAsFactors = F)
```

#### Simple Yes or No Questions

```{r display example annotation formats}
sas$annotations[1] %>% prettify
```

#### Simple Point Marking
```{r}
wilde$annotations[2] %>% prettify()
```

#### Combination Question and Marking

Note that the format of the value array varies by task.
```{r}
kitteh$annotations[1] %>% prettify
```

# Flattening the Files

It's much easier to parse/flatten the JSON when everything is in a standard format, so you probably want to split your raw file by workflow and even by task ID. You also want to limit to only the workflow version(s) with actual data, because earlier versions, especially those with empty data, may structure the classification data differently, which is annoying and problematic.

Note: you may need to dig into your raw data a bit to identify which workflow and version you need. Some projects have many workflows and versions; others not so many.

```{r workflow_fun_definition}
fun_check_workflow <- function(data){
  data %>% group_by(workflow_id, workflow_version) %>%
    summarise(date = max(created_at), count = n()) %>%
    print
}
```

For example, these are the Snapshots at Sea classifications by workflow:

```{r}
sas %>% fun_check_workflow()
```

vs. those of the Wildebeest Marking Project:
```{r}
wilde %>% fun_check_workflow()
```

vs. Chicago Wildlife Watch:
```{r}
chicago %>% fun_check_workflow()
```

## Basic Flattening

With jsonlite, you can flatten all of the JSON data into a series of nested lists. This works really well for simple data, like questions, but marking tasks and more complex workflows get a bit complicated.

```{r flattening}
library(jsonlite)

# Basic flattening function: parse each row's annotation JSON, then bind everything together
basic_flattening <- function(jdata) {
  out <- list() # create a list to hold everything

  for (i in seq_len(nrow(jdata))) { # loop through one row of the dataset at a time
    classification_id <- jdata$classification_id[i]
    subject_id <- jdata$subject_ids[i]
    split_anno <- fromJSON(txt = jdata$annotations[i], simplifyDataFrame = T)
    out[[i]] <- cbind(classification_id, subject_id, split_anno)
  }

  # rbind will fail if the per-row annotation structures differ,
  # which is one more reason to filter to a single workflow version first
  do.call(what = rbind, args = out)
}
```

Single questions flatten alright:
```{r flatten sas}
flat_sas <- sas %>% basic_flattening()
str(flat_sas)
```

But more complex questions produce embedded lists inside the "value" column.

```{r}
flat_wilde <- wilde[1:10,] %>% basic_flattening()
str(flat_wilde, max.level = 2)
```

```{r}
flat_kitteh <- kitteh %>% basic_flattening()
str(flat_kitteh, max.level = 3)
```
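
Where basic flattening leaves list-columns like these, the tidyjson pipeline used in the other notebooks unpacks the nesting explicitly instead. A minimal sketch on the wildebeest markings: x, y, and tool are assumed field names for a point-marking task, not confirmed against this export.

```{r}
# a sketch: flatten point marks with tidyjson instead of jsonlite.
# x, y, and tool are assumed field names for a point-marking task.
wilde_points <- wilde[1:10,] %>%
  select(., subject_ids, classification_id, annotations) %>%
  as.tbl_json(json.column = "annotations") %>%
  gather_array(column.name = "task_index") %>%
  spread_values(task = jstring("task")) %>%
  enter_object("value") %>%
  gather_array(column.name = "mark_index") %>% # one row per mark
  spread_values(x = jnumber("x"), y = jnumber("y"), tool = jnumber("tool"))

wilde_points %>% data.frame %>% head
```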

notebooks/json-parsing-examples.nb.html

Lines changed: 501 additions & 0 deletions
Large diffs are not rendered by default.

notebooks/json-parsing-michigan.Rmd

Lines changed: 130 additions & 0 deletions

---
title: "JSON parsing: survey tasks with multiple-choice subquestions"
output: html_notebook
---

This project has two tasks in its workflow: a survey task and a follow-up question task asking about the weather. The survey task also has subquestions that ask the volunteer to select all that apply, meaning we have an extra step to flatten out the annotations.

```{r}
library(tidyjson)
library(magrittr)
library(jsonlite)
library(dplyr)
library(stringr)
library(tidyr)
library(lubridate)
```
```{r}
jdata_unfiltered <- read.csv(file = "../data/michigan-zoomin-classifications.csv", stringsAsFactors = F)

# you'd probably need to include multiple versions (as these likely differ only in minor text changes), but for this demo we'll choose 463.55
jdata_unfiltered %>% mutate(., created_at = ymd_hms(created_at)) %>%
  group_by(., workflow_id, workflow_version) %>% summarise(., max(created_at), n()) %>% head

jdata <- jdata_unfiltered %>% filter(., workflow_version == 463.55) %>% head(., n = 5000)
jdata %>% summarise(., n_distinct(subject_ids), n_distinct(classification_id), n_distinct(workflow_version))
```

Take a peek at the data structure. There are two tasks, and within the survey task, only some species have subquestions.
```{r}
############### SURVEY TASK
head(jdata)
for (i in 15:17) {
  jdata$annotations[i] %>% prettify %>% print
}
```
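
For reference, an annotation containing both tasks has roughly the shape sketched below. Field names come from the filter(), spread_values(), and enter_object() calls in the following chunks, while the values (including the T2 weather answer) are invented for illustration. The key difference from the Chicago data is that WHATISTHEANIMALSDOING holds an array, since volunteers can select several behaviors.

```{r}
# a hand-written sketch of the two-task annotation shape; values are invented
'[{"task":"T3","value":[{"choice":"DEER","answers":{"HOWMANYANIMALSDOYOUSEE":"1","WHATISTHEANIMALSDOING":["MOVING","EATING"]}}]},{"task":"T2","value":"sunny"}]' %>%
  prettify()
```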

```{r}
# preliminary flat
basic_flat_with_values <- jdata %>%
  select(., subject_ids, classification_id, workflow_version, annotations) %>%
  as.tbl_json(json.column = "annotations") %>%
  gather_array(column.name = "task_index") %>% # really important for joining later
  spread_values(task = jstring("task"), task_label = jstring("task_label"), value = jstring("value"))

basic_flat_with_values %>% data.frame %>% head

# summarise keys per task, as in the Chicago notebook
basic_summary <- basic_flat_with_values %>%
  gather_keys() %>%
  append_values_string()

basic_summary %>% data.frame %>% group_by(., workflow_version, key, task) %>% summarise(., n())
```

```{r}
#--------------------------------------------------------------------------------#
# split into survey vs. non-survey data frames. The question task is already flat and can be exported as a separate file now.
survey <- basic_flat_with_values %>% filter(., task == "T3")
question <- basic_flat_with_values %>% filter(., task == "T2")

###----------------------------### SURVEY FLATTENING ###----------------------------###

# grab choices; species_index counts through the species recorded in a given classification (usually maxes out at 2)
with_choices <- survey %>%
  enter_object("value") %>% json_lengths(column.name = "total_species") %>%
  gather_array(column.name = "species_index") %>% # each classification is an array, so you need to gather up multiple arrays
  spread_values(choice = jstring("choice"))

# if multiple species are identified, there will be multiple rows and species_index will be > 1
with_choices %>% data.frame %>% head
with_choices %>% summarise(., n_distinct(subject_ids), n_distinct(classification_id))
```

Let's start grabbing and flattening the nested data. Note that this section references the specific subquestion labels, so if they change over the life of your project, you MUST create a script to handle the revisions.
```{r}
# grab answers. Note that the spread_values() call needs to be customized per team and subquestion label.
with_answers <- with_choices %>%
  enter_object("answers") %>%
  spread_values(how_many = jstring("HOWMANYANIMALSDOYOUSEE")) %>%
  enter_object("WHATISTHEANIMALSDOING") %>% # enter into the list of behaviors
  gather_array("behavior_index") %>% # gather into one behavior per row
  append_values_string("behavior")

# note that behaviors come out in a "long" format, one row per behavior, which is probably unwieldy
with_answers %>% data.frame %>% head
```
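
If the labels were revised mid-project, one workable approach is to gather the answers by key and recode retired labels onto current ones before spreading. A sketch for the scalar subquestion, where the retired label "HOWMANYANIMALSCANYOUSEE" is invented for illustration:

```{r}
# a sketch for harmonizing revised subquestion labels; the old label is invented
answers_by_key <- with_choices %>%
  enter_object("answers") %>%
  gather_keys() %>% # one row per subquestion label
  append_values_string("answer") %>% # scalar answers only; array answers come out NA
  data.frame %>%
  mutate(., key = recode(key, "HOWMANYANIMALSCANYOUSEE" = "HOWMANYANIMALSDOYOUSEE"))
```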

Let's spread the answers into individual columns with 1/0 indicators for whether or not each behavior was identified.
```{r}
# spread answers into separate columns: you have to drop behavior_index or else the rows won't combine!
with_answers_spread <- with_answers %>% data.frame %>%
  select(., -behavior_index) %>%
  mutate(., behavior_present = 1) %>%
  spread(., key = behavior, value = behavior_present, fill = 0)

with_answers_spread %>% data.frame %>% head
with_answers_spread %>% summarise(., n_distinct(subject_ids), n_distinct(classification_id))
```

You could also, in theory, create a column containing an actual list of the behaviors. Note that the values look similar to how tidyjson displays them, but they are actual lists rather than character strings that say "list(...)".
```{r}
# spread answers into a list-column
test <- with_answers %>% data.frame %>%
  select(., -behavior_index) %>% nest(behavior)

test %>% head
```

```{r}
# in theory, you want to tie all of these back together just in case there are missing values
add_choices <- left_join(survey, with_choices)
tot <- left_join(add_choices, with_answers_spread)
flat_data <- tot %>% select(., -task_index, -task_label, -value)

flat_data %>% data.frame %>% head
```

```{r}
# check that the number of distinct subject IDs and classification IDs is still the same
flat_data %>% summarise(., n_distinct(subject_ids), n_distinct(classification_id), n()) # flattened
jdata %>% summarise(., n_distinct(subject_ids), n_distinct(classification_id), n()) # original

# save your files for aggregation!
write.csv(flat_data, file = "../data/T3-flattened.csv")
write.csv(question, file = "../data/T2-flattened.csv")
```

notebooks/json-parsing-michigan.nb.html

Lines changed: 535 additions & 0 deletions
Large diffs are not rendered by default.
