Skip to content

Commit

Permalink
add hash verification to datasets
Browse files Browse the repository at this point in the history
  • Loading branch information
KristofferC committed Jan 30, 2019
1 parent 4be08fe commit f60079d
Show file tree
Hide file tree
Showing 6 changed files with 40 additions and 19 deletions.
1 change: 1 addition & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
Requires = "ae029012-a4dd-5104-9daa-d747884805df"
SHA = "ea8e919c-243c-51af-8825-aaa63cd721ce"
SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
Expand Down
16 changes: 16 additions & 0 deletions src/data/Data.jl
Original file line number Diff line number Diff line change
@@ -1,11 +1,27 @@
module Data

import ..Flux
import SHA

export CMUDict, cmudict

"""
    deps(path...)

Return the path formed by joining the `deps` directory located two levels
above this source file's directory with the given `path` components. With no
arguments, the `deps` directory itself is returned.
"""
function deps(path...)
    return joinpath(@__DIR__, "..", "..", "deps", path...)
end

"""
    download_and_verify(url, path, hash)

Download `url` to a temporary file, compare its SHA-256 digest with `hash`,
and move the file to `path` only when the digests agree.

# Arguments
- `url`: location handed to `download`.
- `path`: final destination; an existing file there is overwritten.
- `hash`: expected SHA-256 digest as a hexadecimal string. Any
  `AbstractString` is accepted and letter case is ignored.

# Returns
The result of `mv(tmppath, path; force=true)`.

# Throws
Calls `error` with the expected and calculated digests when they differ. In
that case (and when the download itself fails) the temporary file is removed
and `path` is left untouched.
"""
function download_and_verify(url, path, hash)
    tmppath = tempname()
    try
        download(url, tmppath)
        hash_download = open(tmppath) do f
            bytes2hex(SHA.sha256(f))
        end
        # Compare by value (`!=`), not identity (`!==`): identity is always
        # "different" when `hash` is e.g. a SubString. `bytes2hex` emits
        # lowercase hex, so normalise the expected digest the same way.
        if hash_download != lowercase(hash)
            msg = "Hash Mismatch!\n"
            msg *= " Expected sha256: $hash\n"
            msg *= " Calculated sha256: $hash_download"
            error(msg)
        end
        return mv(tmppath, path; force=true)
    finally
        # After a successful `mv` the temp file is already gone; `force=true`
        # makes this a no-op then, and cleans up after any failure otherwise.
        rm(tmppath; force=true)
    end
end

# Module initialiser: create the directory returned by `deps()` (and any
# missing parents) so later code can place files under it.
__init__() = mkpath(deps())
Expand Down
14 changes: 8 additions & 6 deletions src/data/cmudict.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,23 +2,25 @@ module CMUDict

export cmudict

using ..Data: deps
using ..Data: deps, download_and_verify

const version = "0.7b"
const cache_prefix = "https://cache.julialang.org"

"""
    load()

Ensure the CMUDict files (`cmudict`, `cmudict.phones`, `cmudict.symbols`) are
present under `deps("cmudict")`.

Returns immediately when the directory exists and all three files are already
there. Otherwise the directory is created and every file is fetched with
`download_and_verify`, which checks it against the SHA-256 digest pinned
below before it is moved into place.
"""
function load()
    # (file-name suffix, expected SHA-256 of the downloaded file)
    suffixes_and_hashes = [
        ("",         "209a8b4cd265013e96f4658632a9878103b0c5abf62b50d4ef3ae1be226b29e4"),
        (".phones",  "ffb588a5e55684723582c7256e1d2f9fadb130011392d9e59237c76e34c2cfd6"),
        (".symbols", "408ccaae803641c6d7b626b6299949320c2dbca96b2220fd3fb17887b023b027"),
    ]
    if isdir(deps("cmudict")) &&
       all(isfile(deps("cmudict", "cmudict$x")) for (x, _) in suffixes_and_hashes)
        return
    end
    @info "Downloading CMUDict dataset"
    mkpath(deps("cmudict"))
    # Only the verified download is performed; no unverified `download` call
    # may write to the destination files.
    for (x, hash) in suffixes_and_hashes
        download_and_verify("$cache_prefix/http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/cmudict-$version$x",
                            deps("cmudict", "cmudict$x"), hash)
    end
    return nothing
end

Expand Down
11 changes: 6 additions & 5 deletions src/data/fashion-mnist.jl
Original file line number Diff line number Diff line change
@@ -1,19 +1,20 @@
module FashionMNIST

using ..MNIST: gzopen, imageheader, rawimage, labelheader, rawlabel
using ..Data: download_and_verify

const dir = joinpath(@__DIR__, "../../deps/fashion-mnist")

function load()
mkpath(dir)
cd(dir) do
for file in ["train-images-idx3-ubyte",
"train-labels-idx1-ubyte",
"t10k-images-idx3-ubyte",
"t10k-labels-idx1-ubyte"]
for (file, hash) in [("train-images-idx3-ubyte", "3aede38d61863908ad78613f6a32ed271626dd12800ba2636569512369268a84"),
("train-labels-idx1-ubyte", "a04f17134ac03560a47e3764e11b92fc97de4d1bfaf8ba1a3aa29af54cc90845"),
("t10k-images-idx3-ubyte" , "346e55b948d973a97e58d2351dde16a484bd415d4595297633bb08f03db6a073"),
("t10k-labels-idx1-ubyte" , "67da17c76eaffca5446c3361aaab5c3cd6d1c2608764d35dfb1850b086bf8dd5")]
isfile(file) && continue
@info "Downloading Fashion-MNIST dataset"
download("http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/$file.gz", "$file.gz")
download_and_verify("http://fashion-mnist.s3-website.eu-central-1.amazonaws.com/$file.gz", "$file.gz", hash)
open(file, "w") do io
write(io, gzopen(read, "$file.gz"))
end
Expand Down
11 changes: 6 additions & 5 deletions src/data/mnist.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
module MNIST

using CodecZlib, Colors
using ..Data: download_and_verify

const Gray = Colors.Gray{Colors.N0f8}

Expand All @@ -15,13 +16,13 @@ end
function load()
mkpath(dir)
cd(dir) do
for file in ["train-images-idx3-ubyte",
"train-labels-idx1-ubyte",
"t10k-images-idx3-ubyte",
"t10k-labels-idx1-ubyte"]
for (file, hash) in [("train-images-idx3-ubyte", "440fcabf73cc546fa21475e81ea370265605f56be210a4024d2ca8f203523609"),
("train-labels-idx1-ubyte", "3552534a0a558bbed6aed32b30c495cca23d567ec52cac8be1a0730e8010255c"),
("t10k-images-idx3-ubyte" , "8d422c7b0a1c1c79245a5bcf07fe86e33eeafee792b84584aec276f5a2dbc4e6"),
("t10k-labels-idx1-ubyte" , "f7ae60f92e00ec6debd23a6088c31dbd2371eca3ffa0defaefb259924204aec6")]
isfile(file) && continue
@info "Downloading MNIST dataset"
download("https://cache.julialang.org/http://yann.lecun.com/exdb/mnist/$file.gz", "$file.gz")
download_and_verify("https://cache.julialang.org/http://yann.lecun.com/exdb/mnist/$file.gz", "$file.gz", hash)
open(file, "w") do io
write(io, gzopen(read, "$file.gz"))
end
Expand Down
6 changes: 3 additions & 3 deletions src/data/sentiment.jl
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
module Sentiment

using ZipFile
using ..Data: deps
using ..Data: deps, download_and_verify

"""
    load()

Ensure the sentiment treebank archive exists at `deps("sentiment.zip")`.

Does nothing when the file is already present. Otherwise the archive is
fetched with `download_and_verify`, so it only reaches its final location
after its SHA-256 digest has been checked.
"""
function load()
    isfile(deps("sentiment.zip")) && return
    @info "Downloading sentiment treebank dataset"
    # Do not `download` straight to the final path first: an unverified file
    # left there would satisfy the `isfile` check above on the next call and
    # bypass hash verification entirely.
    download_and_verify("https://cache.julialang.org/https://nlp.stanford.edu/sentiment/trainDevTestTrees_PTB.zip",
                        deps("sentiment.zip"), "5c613a4f673fc74097d523a2c83f38e0cc462984d847b82c7aaf36b01cbbbfcc")
end

getfile(r, name) = r.files[findfirst(x -> x.name == name, r.files)]
Expand Down

0 comments on commit f60079d

Please sign in to comment.