Skip to content

Commit

Permalink
Merge pull request #142 from commonsense/no-more-checksum
Browse files (browse the repository at this point in the history)
Download files with wget over https; don't bother checksumming them anymore
  • Loading branch information
jlowryduda authored Nov 3, 2017
2 parents be5a039 + 04a1b60 commit 7b1d103
Show file tree
Hide file tree
Showing 6 changed files with 12 additions and 48 deletions.
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number | Diff line number | Diff line change
Expand Up @@ -9,7 +9,7 @@ ENV PYTHON python3

# Install system dependencies (the overall form of this command is recommended by https://docs.docker.com/engine/userguide/eng-image/dockerfile_best-practices/)
RUN apt-get update \
&& apt-get install -y build-essential python3-pip libatlas-dev liblapack-dev libhdf5-dev libmecab-dev mecab-ipadic-utf8 nginx supervisor \
&& apt-get install -y build-essential python3-pip libatlas-dev liblapack-dev libhdf5-dev libmecab-dev mecab-ipadic-utf8 nginx supervisor wget \
&& rm -rf /var/lib/apt/lists/*

ADD conceptnet5 /src/conceptnet/conceptnet5
Expand Down
29 changes: 7 additions & 22 deletions Snakefile
Original file line number | Diff line number | Diff line change
Expand Up @@ -76,9 +76,9 @@ CORE_DATASET_NAMES += ["emoji/{}".format(lang) for lang in EMOJI_LANGUAGES]

DATASET_NAMES = CORE_DATASET_NAMES + ["dbpedia/dbpedia_en"]

RAW_DATA_URL = "http://conceptnet.s3.amazonaws.com/raw-data/2016"
RAW_DATA_URL = "https://conceptnet.s3.amazonaws.com/raw-data/2016"
PRECOMPUTED_DATA_PATH = "/precomputed-data/2016"
PRECOMPUTED_DATA_URL = "http://conceptnet.s3.amazonaws.com" + PRECOMPUTED_DATA_PATH
PRECOMPUTED_DATA_URL = "https://conceptnet.s3.amazonaws.com" + PRECOMPUTED_DATA_PATH
PRECOMPUTED_S3_UPLOAD = "s3://conceptnet" + PRECOMPUTED_DATA_PATH

INPUT_EMBEDDINGS = [
Expand Down Expand Up @@ -119,8 +119,7 @@ rule all:
DATA + "/stats/language_edges.txt",
DATA + "/stats/relations.txt",
DATA + "/assoc/reduced.csv",
DATA + "/vectors/mini.h5",
"data-loader/sha256sums.txt"
DATA + "/vectors/mini.h5"

rule evaluation:
input:
Expand Down Expand Up @@ -158,25 +157,25 @@ rule download_raw:
output:
DATA + "/raw/{dirname}/{filename}"
shell:
"wget {RAW_DATA_URL}/{wildcards.dirname}/{wildcards.filename} -O {output}"
"wget -nv {RAW_DATA_URL}/{wildcards.dirname}/{wildcards.filename} -O {output}"

rule download_conceptnet_ppmi:
output:
DATA + "/precomputed/vectors/conceptnet-55-ppmi.h5"
shell:
"wget {PRECOMPUTED_DATA_URL}/numberbatch/16.09/conceptnet-55-ppmi.h5 -O {output}"
"wget -nv {PRECOMPUTED_DATA_URL}/numberbatch/16.09/conceptnet-55-ppmi.h5 -O {output}"

rule download_numberbatch:
output:
DATA + "/precomputed/vectors/numberbatch.h5"
shell:
"wget {PRECOMPUTED_DATA_URL}/numberbatch/16.09/numberbatch.h5 -O {output}"
"wget -nv {PRECOMPUTED_DATA_URL}/numberbatch/16.09/numberbatch.h5 -O {output}"

rule download_opensubtitles_ppmi:
output:
DATA + "/precomputed/vectors/opensubtitles-ppmi-5.h5"
shell:
"wget {PRECOMPUTED_DATA_URL}/numberbatch/17.02/opensubtitles-ppmi-5.h5 -O {output}"
"wget -nv {PRECOMPUTED_DATA_URL}/numberbatch/17.02/opensubtitles-ppmi-5.h5 -O {output}"


# Precomputation
Expand Down Expand Up @@ -649,20 +648,6 @@ rule export_english_text:
"cn5-vectors export_text -l en {input} {output}"


rule sha256sums:
input:
DATA + "/psql/edge_features.csv.gz",
DATA + "/psql/edges.csv.gz",
DATA + "/psql/edge_sources.csv.gz",
DATA + "/psql/node_prefixes.csv.gz",
DATA + "/psql/nodes.csv.gz",
DATA + "/psql/relations.csv.gz",
DATA + "/psql/sources.csv.gz"
output:
"data-loader/sha256sums.txt"
shell:
"sha256sum {input} | sed -e 's:%(data)s:/data/conceptnet:' > {output}" % {'data': DATA}

# Evaluation
# ==========

Expand Down
13 changes: 2 additions & 11 deletions data-loader/10-check-download.sh → data-loader/10-download.sh
Original file line number | Diff line number | Diff line change
Expand Up @@ -8,17 +8,8 @@ NAMES='edges edge_sources edge_features nodes node_prefixes sources relations'

get_db_files() {
for name in $NAMES; do
curl $PRECOMPUTED_PSQL_URL/$name.csv.gz > $DATA/psql/$name.csv.gz
wget -nv -O $DATA/psql/$name.csv.gz $PRECOMPUTED_PSQL_URL/$name.csv.gz
done
sha256sum $DATA/psql/*.csv.gz > $CHECKSUM/sha256sums.computed.txt
diff $CHECKSUM/sha256sums.txt $CHECKSUM/sha256sums.computed.txt || panic
}

panic() {
rm $DATA/psql/*.csv.gz
echo "SHA-256 hashes of input files don't match. The database will not be built."
echo "This could indicate a failed download, a version mismatch, or your HTTP connection getting hijacked."
exit 1
}

mkdir -p $DATA/psql
Expand All @@ -27,7 +18,7 @@ mkdir -p $DATA/vectors
# Get semantic vectors (ConceptNet Numberbatch Mini) that would be
# computationally expensive to compute
if [ ! -e $DATA/vectors/mini.h5 ]; then
curl $PRECOMPUTED_VECTOR_URL/mini.h5 > $DATA/vectors/mini.h5
wget -nv -O $DATA/vectors/mini.h5 $PRECOMPUTED_VECTOR_URL/mini.h5
fi
# Get the database input files
if [ ! -e $DATA/psql/edges.csv.gz ]; then
Expand Down
8 changes: 2 additions & 6 deletions data-loader/Dockerfile
Original file line number | Diff line number | Diff line change
Expand Up @@ -3,13 +3,9 @@ MAINTAINER Rob Speer <rob@luminoso.com>

# Install system dependencies (the overall form of this command is recommended by https://docs.docker.com/engine/userguide/eng-image/dockerfile_best-practices/)
RUN apt-get update \
&& apt-get install -y coreutils diffutils curl gzip \
&& apt-get install -y coreutils diffutils wget gzip \
&& rm -rf /var/lib/apt/lists/*

ADD 10-check-download.sh /docker-entrypoint-initdb.d/10-check-download.sh
ADD 10-download.sh /docker-entrypoint-initdb.d/10-check-download.sh
ADD 20-load-db.sql /docker-entrypoint-initdb.d/20-load-db.sql
ADD 30-done.sh /docker-entrypoint-initdb.d/30-done.sh
ADD sha256sums.txt /checksum/sha256sums.txt

RUN chown -R root.postgres /data/conceptnet
RUN chmod -R g+w /data/conceptnet
7 changes: 0 additions & 7 deletions data-loader/sha256sums.txt

This file was deleted.

1 change: 0 additions & 1 deletion scripts/build.sh

This file was deleted.

0 comments on commit 7b1d103

Please sign in to comment.