# Packages for JSON flattening (tidyjson/jsonlite) and data manipulation
# (dplyr/tidyr/stringr); magrittr supplies the %>% pipe.
library(tidyjson)
library(magrittr)
library(jsonlite)
library(dplyr)
library(stringr)
library(tidyr)

source("functions/quick_functions.R") # adds check_workflow() and View_json() helpers
| 9 | + |
# This works for the old version of the Serengeti wildebeest count.
# NOTE: set the working directory as appropriate before running.
wilde <- read.csv("scripts/points-wildebeest/wildebeest_2016_sample.csv",
                  stringsAsFactors = FALSE)

check_workflow(wilde)

# Filter to the relevant workflow version. You might want to combine multiple
# versions; it depends on the changes that have been made to the project.
dat <- wilde %>% filter(workflow_id == 78, workflow_version == 36.60)

# Inspect the raw annotation JSON for one classification.
View_json(dat)
dat$annotations[1] %>% prettify
| 20 | + |
| 21 | + |
# View the data structure; note that anything with a zero-length "value"
# field is dropped by enter_object().
dat$annotations %>% as.tbl_json %>%
  gather_array() %>%
  spread_values(task = jstring("task"), tasklabel = jstring("task_label")) %>%
  enter_object("value") %>%
  gather_array() %>%
  gather_keys() %>%
  append_values_string() %>% head %>% View
| 30 | + |
# Grab the top-level info for ALL classifications.
# Produces one row per classification per subject; the final column
# (total_marks) indicates how many x-y coordinates were made in that
# classification (0 for empty classifications, which are kept here).
all_submissions <- dat %>%
  select(subject_ids, classification_id, user_name, workflow_id,
         workflow_version, created_at, annotations) %>%
  as.tbl_json(json.column = "annotations") %>%
  gather_array(column.name = "task_index") %>%
  spread_values(task = jstring("task"), task_label = jstring("task_label")) %>%
  gather_keys() %>%
  json_lengths(column.name = "total_marks") %>%
  filter(key == "value")
| 41 | + |
# Produces one row per mark per classification per subject, but only keeps
# classifications with > 0 marks (enter_object("value") drops empty ones).
flattened <- dat %>%
  select(subject_ids, classification_id, user_name, workflow_id,
         workflow_version, created_at, annotations) %>%
  as.tbl_json(json.column = "annotations") %>%
  gather_array(column.name = "task_index") %>%
  spread_values(task = jstring("task"), task_label = jstring("task_label")) %>%
  enter_object("value") %>%
  # Don't gather keys here: the whole point is to spread the keys out
  # into their own columns (one column per coordinate/tool field).
  gather_array(column.name = "mark_index") %>%
  spread_values(tool_label = jstring("tool_label"),
                xcoord = jnumber("x"),
                ycoord = jnumber("y"),
                tool = jstring("tool"))
| 51 | + |
| 52 | + |
| 53 | + |
# Check that the flattening captured all the data: empty + non-empty
# classifications should equal the original total classifications.
# dat %>% summarise(n(), n_distinct(classification_id), n_distinct(subject_ids))                                    # original data
# all_submissions %>% summarise(n(), n_distinct(classification_id), n_distinct(subject_ids))                        # one row per classification
# all_submissions %>% filter(total_marks == 0) %>% summarise(n(), n_distinct(classification_id), n_distinct(subject_ids)) # "empty" classifications
# flattened %>% summarise(n(), n_distinct(classification_id), n_distinct(subject_ids))                              # non-empty classifications

original_class <- n_distinct(dat$classification_id)
empty_class <- n_distinct(filter(all_submissions, total_marks == 0)$classification_id)
nonempty_class <- n_distinct(flattened$classification_id)

# Scalar condition, so use if/else rather than the vectorized ifelse().
if (empty_class + nonempty_class == original_class) "yay" else "boo"
| 65 | + |
# Recombine datasets: merge flat and empty classifications via a left join.
# all_submissions has one record per classification per subject;
# flattened has one record per mark per classification (only when marks > 0),
# so empty classifications get NA mark columns after the join.
tot <- left_join(all_submissions, flattened)

data_out <- tot %>%
  mutate(task_label = str_trunc(task_label, width = 25)) %>%
  select(-task_index, -key)

# NOTE(review): write.csv() will also emit a row-names column; pass
# row.names = FALSE if downstream consumers don't expect it.
write.csv(x = data_out, file = "flattened-wildebeest_2016_sample.csv")