Skip to content

Commit 23a4961

Browse files
bcbi-bot and ibacher committed
Merge #51
51: Improvements r=ibacher a=ibacher Usually PRs should be a single change, but here we are... Uses `writedlm()` to generate output rather than the custom implementation we were using. In addition, it adds support for date shifting columns where there are non-date values as well as valid date values. This sometimes happens when the system generating the raw data marks missing data. Co-authored-by: Ian <ian_bacher@brown.edu>
2 parents c23f7a2 + 92ae019 commit 23a4961

File tree

6 files changed

+423
-58
lines changed

6 files changed

+423
-58
lines changed

Project.toml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "DeIdentification"
22
uuid = "b905b068-7150-5b22-bc23-80596c88c6a6"
33
authors = ["Brown Center for Biomedical Informatics"]
4-
version = "0.7.0"
4+
version = "0.8.0"
55

66
[deps]
77
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
@@ -11,21 +11,23 @@ DelimitedFiles = "8bb1440f-4735-579b-a4ab-409b98df4dab"
1111
Glob = "c27321d9-0574-5035-807b-f59d2c89b15c"
1212
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
1313
Memento = "f28f55f0-a522-5efc-85c2-fe41dfb9b2d9"
14+
Parsers = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0"
1415
REPL = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"
1516
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
1617
SHA = "ea8e919c-243c-51af-8825-aaa63cd721ce"
1718
Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
1819
YAML = "ddb6d928-2868-570f-bddf-ab3f9cf99eb6"
1920

2021
[compat]
21-
julia = "^1.0.0"
2222
CSV = "^0.5.14"
23+
Parsers = "^0.3.7"
2324
DataStructures = "^0.17.2"
2425
Glob = "^1.2.0"
2526
JSON = "^0.21.0"
2627
Memento = "^0.12.1"
2728
Tables = "^0.2.11"
2829
YAML = "^0.3.2"
30+
julia = "^1.0.0"
2931

3032
[extras]
3133
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"

src/DeIdentification.jl

Lines changed: 62 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ import Random: shuffle, randstring, seed!, make_seed
1313
import Memento
1414
import DataStructures: OrderedDict
1515
import REPL
16+
import Parsers
1617
using REPL.TerminalMenus
1718
using DelimitedFiles
1819

@@ -30,7 +31,14 @@ tracking identifier mappings.
3031
"""
3132
function deid_file!(dicts::DeIdDicts, fc::FileConfig, pc::ProjectConfig, logger)
3233
# Initiate new file
33-
infile = CSV.File(fc.filename, dateformat = fc.dateformat)
34+
infile = try
35+
CSV.File(fc.filename, dateformat = fc.dateformat)
36+
catch ArgumentError
37+
CSV.File(fc.filename)
38+
end
39+
40+
dicts = DeIdDicts(dicts, fc.dateformat)
41+
3442
outfile = joinpath(pc.outdir, "deid_" * fc.name * "_" * getcurrentdate() * ".csv")
3543

3644
ncol = length(infile.names)
@@ -75,52 +83,60 @@ function deid_file!(dicts::DeIdDicts, fc::FileConfig, pc::ProjectConfig, logger)
7583
writedlm(io, reshape(header, 1, length(header)), ',')
7684

7785
# Process each row
78-
for row in infile
79-
80-
val = getoutput(dicts, Hash, getproperty(row, pcol), 0)
81-
pid = setrid(val, dicts)
82-
83-
for col in infile.names
84-
colname = get(fc.rename_cols, col, col)
85-
86-
action = get(fc.colmap, colname, Missing) ::Type
87-
# drop cols
88-
action == Drop && continue
89-
90-
VAL = getproperty(row, col)
91-
92-
# apply pre-processing transform
93-
if haskey(fc.preprocess, colname) && !ismissing(VAL)
94-
transform = fc.preprocess[colname]
95-
transform = replace(transform, "VAL" => "\"$VAL\"")
96-
expr = Meta.parse(transform)
97-
VAL = Core.eval(@__MODULE__, expr)
98-
end
99-
100-
VAL = getoutput(dicts, action, VAL, pid)
101-
102-
if col == pcol
103-
VAL = pid
86+
for (i, row) in Iterators.enumerate(infile)
87+
try
88+
val = getoutput(dicts, Hash, getproperty(row, pcol), 0)
89+
pid = setrid(val, dicts)
90+
columns = Vector{String}()
91+
92+
for col in infile.names
93+
colname = get(fc.rename_cols, col, col)
94+
95+
action = get(fc.colmap, colname, Missing) ::Type
96+
97+
if action == Drop
98+
continue
99+
end
100+
101+
VAL = getproperty(row, col)
102+
103+
# apply pre-processing transform
104+
if haskey(fc.preprocess, colname) && !ismissing(VAL)
105+
transform = fc.preprocess[colname]
106+
transform = replace(transform, "VAL" => "\"$VAL\"")
107+
expr = Meta.parse(transform)
108+
VAL = Core.eval(@__MODULE__, expr)
109+
end
110+
111+
VAL = getoutput(dicts, action, VAL, pid)
112+
113+
if col == pcol
114+
VAL = pid
115+
end
116+
117+
# apply post-processing transform
118+
if haskey(fc.postprocess, colname) && !ismissing(VAL)
119+
transform = fc.postprocess[colname]
120+
transform = replace(transform, "VAL" => "\"$VAL\"")
121+
expr = Meta.parse(transform)
122+
VAL = Core.eval(@__MODULE__, expr)
123+
end
124+
125+
if eltype(VAL) <: String
126+
VAL = replace(VAL, "\"" => "\\\"")
127+
end
128+
129+
if VAL !== nothing && !ismissing(VAL)
130+
push!(columns, string(VAL))
131+
else
132+
push!(columns, "")
133+
end
104134
end
105135

106-
# apply post-processing transform
107-
if haskey(fc.postprocess, colname) && !ismissing(VAL)
108-
transform = fc.postprocess[colname]
109-
transform = replace(transform, "VAL" => "\"$VAL\"")
110-
expr = Meta.parse(transform)
111-
VAL = Core.eval(@__MODULE__, expr)
112-
end
113-
114-
if eltype(VAL) <: String
115-
VAL = replace(VAL, "\"" => "\\\"")
116-
end
117-
118-
write(io, "\"$VAL\"")
119-
if lastcol == col
120-
write(io, '\n')
121-
else
122-
write(io, ",")
123-
end
136+
writedlm(io, reshape(columns, 1, length(columns)), ',')
137+
catch e
138+
Memento.error(logger, "$(Dates.now()) Error occurred while processing row $i")
139+
rethrow(e)
124140
end
125141
end
126142

@@ -129,8 +145,6 @@ function deid_file!(dicts::DeIdDicts, fc::FileConfig, pc::ProjectConfig, logger)
129145
return nothing
130146
end
131147

132-
133-
134148
"""
135149
deidentify(cfg::ProjectConfig)
136150
This is the constructor for the `DeIdentified` struct. We use this type to store
@@ -142,7 +156,7 @@ digest of the original primary ID to our new research IDs.
142156
"""
143157
function deidentify(cfg::ProjectConfig)
144158
num_files = length(cfg.file_configs)
145-
dicts = DeIdDicts(cfg.maxdays, cfg.shiftyears)
159+
dicts = DeIdDicts(cfg.maxdays, cfg.shiftyears, cfg.dateformat)
146160

147161
if !isdir(cfg.outdir)
148162
# mkpath also creates any intermediate paths

src/de_identify.jl

Lines changed: 32 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ struct FileConfig
2525
rename_cols::Dict{Symbol,Symbol}
2626
preprocess::Dict{Symbol, String}
2727
postprocess::Dict{Symbol, String}
28-
dateformat::String
28+
dateformat::Dates.DateFormat
2929
end
3030

3131

@@ -38,7 +38,7 @@ struct ProjectConfig
3838
maxdays::Int
3939
shiftyears::Int
4040
primary_id::Symbol
41-
dateformat::String
41+
dateformat::Dates.DateFormat
4242
end
4343

4444
"""
@@ -54,7 +54,7 @@ function ProjectConfig(cfg_file::String)
5454
num_file = length(cfg["datasets"])
5555
outdir = cfg["output_path"]
5656
pk = Symbol(cfg["primary_id"])
57-
dateformat = get(cfg, "date_format", "y-m-dTH:M:S.s")
57+
dateformat = Dates.DateFormat(get(cfg, "date_format", "y-m-dTH:M:S.s"))
5858

5959
seed = get(_ -> make_seed()[1], cfg, "project_seed")
6060
maxdays = get(cfg, "max_dateshift_days", 30)
@@ -66,7 +66,12 @@ function ProjectConfig(cfg_file::String)
6666
# populate File Configs
6767
for (i, ds) in enumerate(cfg["datasets"])
6868
name = ds["name"]
69-
file_dateformat = get(ds, "date_format", dateformat)
69+
if haskey(ds, "date_format")
70+
file_dateformat = Dates.DateFormat(get(ds, "date_format", "y-m-dTH:M:S.s"))
71+
else
72+
file_dateformat = dateformat
73+
end
74+
7075
rename_dict = Dict{Symbol,Symbol}()
7176
for pair in get(ds, "rename_cols", [])
7277
rename_dict[Symbol(pair["in"])] = Symbol(pair["out"])
@@ -105,17 +110,22 @@ struct DeIdDicts
105110
dateshift::Dict{Int, Int}
106111
maxdays::Int
107112
shiftyears::Int
113+
dateformat::Dates.DateFormat
108114
end
109115

110116
"""
111-
DeIdDicts(maxdays)
117+
DeIdDicts(maxdays, shiftyears, dateformat)
112118
113119
Structure containing dictionaries for project level mappings
114120
- Primary ID -> Research ID
115121
- Research ID -> DateShift number of days
116122
- Research ID -> Salt value
117123
"""
118-
DeIdDicts(maxdays, shiftyears) = DeIdDicts(Dict{String, Int}(), Dict{Int, String}(), Dict{Int, Int}(), maxdays, shiftyears)
124+
DeIdDicts(maxdays, shiftyears, dateformat) = DeIdDicts(
125+
Dict{String, Int}(), Dict{Int, String}(), Dict{Int, Int}(), maxdays, shiftyears, dateformat)
126+
127+
DeIdDicts(current::DeIdDicts, dateformat::Dates.DateFormat) = DeIdDicts(
128+
current.id, current.salt, current.dateshift, current.maxdays, current.shiftyears, dateformat)
119129

120130

121131
"""
@@ -162,6 +172,22 @@ function dateshift_val!(dicts::DeIdDicts, val::Union{Dates.Date, Dates.DateTime,
162172

163173
end
164174

175+
function dateshift_val!(dicts::DeIdDicts, val::String, pid::Int)
176+
177+
newval = Parsers.tryparse(Dates.DateTime, val, Parsers.Options(dateformat=dicts.dateformat))
178+
if newval === nothing
179+
newval = Parsers.tryparse(Dates.Date, val, Parsers.Options(dateformat=dicts.dateformat))
180+
end
181+
182+
if newval === nothing
183+
@warn "Could not date shift non-date value $val"
184+
return missing
185+
end
186+
187+
return dateshift_val!(dicts, newval, pid)
188+
189+
end
190+
165191
"""
166192
setrid(val, dicts)
167193

0 commit comments

Comments (0)