
Commit 1617deb

add json parsing for serengeti wildebeest count

1 parent bb6e43c

File tree

8 files changed: +34323 −81 lines

scripts/in-development/flattening_functions.R renamed to functions/flattening_functions.R

Lines changed: 0 additions & 18 deletions
@@ -8,24 +8,6 @@
 # library(lubridate)


-###################### EXPLORE DATA #######################
-# Need to evaluate and limit to proper subsets
-
-# check workflow
-check_workflow <- function(data){
-  data %>% group_by(workflow_id, workflow_version) %>%
-    summarise(date = max(created_at), count = n()) %>%
-    print
-}
-
-
-# View classifications and dates of workflows to limit data out to proper workflow version and number
-
-View_json <- function(jdata) {
-  for (i in 1:50) {
-    jdata$annotations[i] %>% prettify %>% print
-  }
-}

 ####################### FLATTEN #######################
 # Data formats:

functions/quick_functions.R

Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
+###################### EXPLORE DATA #######################
+# Need to evaluate and limit to proper subsets
+
+# check workflow
+check_workflow <- function(data){
+  data %>% group_by(workflow_id, workflow_version) %>%
+    summarise(date = max(created_at), count = n()) %>%
+    print
+}
+
+
+# View classifications and dates of workflows to limit data out to proper workflow version and number
+
+View_json <- function(jdata) {
+  for (i in 1:50) {
+    jdata$annotations[i] %>% prettify %>% print
+  }
+}
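
As committed, View_json assumes the export has at least 50 rows; 1:50 will index past the end of a shorter data frame. A minimal hardening sketch (the n argument and the seq_len guard are editorial suggestions, not part of this commit):

View_json <- function(jdata, n = 50) {
  # cap the loop at the number of rows actually present
  for (i in seq_len(min(n, nrow(jdata)))) {
    jdata$annotations[i] %>% prettify %>% print
  }
}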
File renamed without changes.

scripts/.DS_Store

4 KB
Binary file not shown.

scripts/in-development/json-parsing-wildebeest.R

Lines changed: 0 additions & 63 deletions
This file was deleted.

Lines changed: 76 additions & 0 deletions (new file; path not shown in this view)
@@ -0,0 +1,76 @@
+library(tidyjson)
+library(magrittr)
+library(jsonlite)
+library(dplyr)
+library(stringr)
+library(tidyr)
+
+source("functions/quick_functions.R") # adds the check_workflow and View_json functions
+
+# this works for the old version of the Serengeti wildebeest count; set the working directory as appropriate
+wilde <- read.csv("scripts/points-wildebeest/wildebeest_2016_sample.csv", stringsAsFactors = F)
+
+check_workflow(wilde)
+
+# Filter to the relevant workflow version. You may want to combine multiple versions, depending on the changes that have been made to the project.
+dat <- wilde %>% filter(., workflow_id == 78, workflow_version == 36.60)
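# (sketch, not part of the diff: to keep several compatible workflow versions,
#  filter with %in% instead; the second version number here is hypothetical)
# dat <- wilde %>% filter(., workflow_id == 78, workflow_version %in% c(36.60, 36.61))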
+
+View_json(dat)
+dat$annotations[1] %>% prettify
+
+
+# View the data structure, note that anything with zero length "value" field is dropped
+dat$annotations %>% as.tbl_json %>%
+  gather_array() %>%
+  spread_values(task = jstring("task"), tasklabel = (jstring("task_label"))) %>%
+  enter_object("value") %>%
+  gather_array() %>%
+  gather_keys() %>%
+  append_values_string() %>% head %>% View
+
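# (sketch, not part of the diff: the pipeline above assumes each annotation is a
#  JSON array of task objects whose "value" holds one object per mark, roughly
#  [{"task":"T0","task_label":"...","value":[{"x":102.5,"y":310.2,"tool":0,"tool_label":"..."}]}]
#  the numbers are invented; the field names are the ones the code extracts)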
+# Grab the top-level info for ALL classifications
+# produces one row per classification per subject; final column indicates how many x-y coordinates were made in that classification.
+all_submissions <- dat %>%
+  select(., subject_ids, classification_id, user_name, workflow_id, workflow_version, created_at, annotations) %>%
+  as.tbl_json(json.column = "annotations") %>%
+  gather_array(column.name = "task_index") %>%
+  spread_values(task = jstring("task"), task_label = jstring("task_label")) %>%
+  gather_keys() %>%
+  json_lengths(column.name = "total_marks") %>%
+  filter(., key == "value")
+
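# (sketch, not part of the diff: a quick way to eyeball how marks are distributed)
# all_submissions %>% count(total_marks)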
+# produces one row per mark per classification per subject, but only keeps classifications with >0 marks
+flattened <- dat %>%
+  select(., subject_ids, classification_id, user_name, workflow_id, workflow_version, created_at, annotations) %>%
+  as.tbl_json(json.column = "annotations") %>%
+  gather_array(column.name = "task_index") %>%
+  spread_values(task = jstring("task"), task_label = (jstring("task_label"))) %>%
+  enter_object("value") %>%
+  gather_array(column.name = "mark_index") %>% # don't gather keys here; the whole point is to spread the keys out
+  spread_values(tool_label = jstring("tool_label"), xcoord = jnumber("x"), ycoord = jnumber("y"), tool = jstring("tool"))
+
+
+
+# check that this captures all the data; the pieces should sum to the original total classifications
+# dat %>% summarise(., n(), n_distinct(classification_id), n_distinct(subject_ids)) # original data
+# all_submissions %>% summarise(., n(), n_distinct(classification_id), n_distinct(subject_ids)) # maintains one row per classification
+# all_submissions %>% filter(., total_marks == 0) %>% summarise(., n(), n_distinct(classification_id), n_distinct(subject_ids)) # number of "empty" classifications
+# flattened %>% summarise(., n(), n_distinct(classification_id), n_distinct(subject_ids)) # number of non-empty classifications
+
+original_class <- n_distinct(dat$classification_id)
+empty_class <- n_distinct(filter(all_submissions, total_marks == 0)$classification_id)
+nonempty_class <- n_distinct(flattened$classification_id)
+
+ifelse(empty_class + nonempty_class == original_class, "yay", "boo")
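# (sketch, not part of the diff: a stricter alternative that halts the script on a mismatch)
# stopifnot(empty_class + nonempty_class == original_class)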
+
+# recombine the datasets: join the per-mark rows back onto the full set of classifications
+# all_submissions has one record per classification per subject
+# flattened has one record per mark per classification, but only for classifications with >0 marks
+
+tot <- left_join(all_submissions, flattened)
+
+data_out <- tot %>%
+  mutate(., task_label = str_trunc(task_label, width = 25)) %>%
+  select(., -task_index, -key)
+
+write.csv(x = data_out, file = "flattened-wildebeest_2016_sample.csv")
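
Because left_join is called without a by argument, dplyr joins on every column the two tables share and prints a message naming them. A sketch of the explicit equivalent, with the key inferred from the two pipelines above (it is not stated in the commit itself):

tot <- left_join(
  all_submissions, flattened,
  by = c("subject_ids", "classification_id", "user_name", "workflow_id",
         "workflow_version", "created_at", "task_index", "task", "task_label")
)

Passing row.names = FALSE to write.csv would also drop the unnamed row-index column from the output file.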
