diff --git a/Dockerfile b/Dockerfile index 061ac9f8..32d27ca7 100644 --- a/Dockerfile +++ b/Dockerfile @@ -9,7 +9,7 @@ ENV PYTHON python3 # Install system dependencies (the overall form of this command is recommended by https://docs.docker.com/engine/userguide/eng-image/dockerfile_best-practices/) RUN apt-get update \ - && apt-get install -y build-essential python3-pip libatlas-dev liblapack-dev libhdf5-dev libmecab-dev mecab-ipadic-utf8 nginx supervisor \ + && apt-get install -y build-essential python3-pip libatlas-dev liblapack-dev libhdf5-dev libmecab-dev mecab-ipadic-utf8 nginx supervisor wget \ && rm -rf /var/lib/apt/lists/* ADD conceptnet5 /src/conceptnet/conceptnet5 diff --git a/Snakefile b/Snakefile index 5349e058..27e1add8 100644 --- a/Snakefile +++ b/Snakefile @@ -76,9 +76,9 @@ CORE_DATASET_NAMES += ["emoji/{}".format(lang) for lang in EMOJI_LANGUAGES] DATASET_NAMES = CORE_DATASET_NAMES + ["dbpedia/dbpedia_en"] -RAW_DATA_URL = "http://conceptnet.s3.amazonaws.com/raw-data/2016" +RAW_DATA_URL = "https://conceptnet.s3.amazonaws.com/raw-data/2016" PRECOMPUTED_DATA_PATH = "/precomputed-data/2016" -PRECOMPUTED_DATA_URL = "http://conceptnet.s3.amazonaws.com" + PRECOMPUTED_DATA_PATH +PRECOMPUTED_DATA_URL = "https://conceptnet.s3.amazonaws.com" + PRECOMPUTED_DATA_PATH PRECOMPUTED_S3_UPLOAD = "s3://conceptnet" + PRECOMPUTED_DATA_PATH INPUT_EMBEDDINGS = [ @@ -119,8 +119,7 @@ rule all: DATA + "/stats/language_edges.txt", DATA + "/stats/relations.txt", DATA + "/assoc/reduced.csv", - DATA + "/vectors/mini.h5", - "data-loader/sha256sums.txt" + DATA + "/vectors/mini.h5" rule evaluation: input: @@ -158,25 +157,25 @@ rule download_raw: output: DATA + "/raw/{dirname}/{filename}" shell: - "wget {RAW_DATA_URL}/{wildcards.dirname}/{wildcards.filename} -O {output}" + "wget -nv {RAW_DATA_URL}/{wildcards.dirname}/{wildcards.filename} -O {output}" rule download_conceptnet_ppmi: output: DATA + "/precomputed/vectors/conceptnet-55-ppmi.h5" shell: - "wget {PRECOMPUTED_DATA_URL}/numberbatch/16.09/conceptnet-55-ppmi.h5 -O {output}" + "wget -nv {PRECOMPUTED_DATA_URL}/numberbatch/16.09/conceptnet-55-ppmi.h5 -O {output}" rule download_numberbatch: output: DATA + "/precomputed/vectors/numberbatch.h5" shell: - "wget {PRECOMPUTED_DATA_URL}/numberbatch/16.09/numberbatch.h5 -O {output}" + "wget -nv {PRECOMPUTED_DATA_URL}/numberbatch/16.09/numberbatch.h5 -O {output}" rule download_opensubtitles_ppmi: output: DATA + "/precomputed/vectors/opensubtitles-ppmi-5.h5" shell: - "wget {PRECOMPUTED_DATA_URL}/numberbatch/17.02/opensubtitles-ppmi-5.h5 -O {output}" + "wget -nv {PRECOMPUTED_DATA_URL}/numberbatch/17.02/opensubtitles-ppmi-5.h5 -O {output}" # Precomputation @@ -649,20 +648,6 @@ rule export_english_text: "cn5-vectors export_text -l en {input} {output}" -rule sha256sums: - input: - DATA + "/psql/edge_features.csv.gz", - DATA + "/psql/edges.csv.gz", - DATA + "/psql/edge_sources.csv.gz", - DATA + "/psql/node_prefixes.csv.gz", - DATA + "/psql/nodes.csv.gz", - DATA + "/psql/relations.csv.gz", - DATA + "/psql/sources.csv.gz" - output: - "data-loader/sha256sums.txt" - shell: - "sha256sum {input} | sed -e 's:%(data)s:/data/conceptnet:' > {output}" % {'data': DATA} - # Evaluation # ========== diff --git a/data-loader/10-check-download.sh b/data-loader/10-download.sh similarity index 61% rename from data-loader/10-check-download.sh rename to data-loader/10-download.sh index 5cbf227d..9f767bb8 100644 --- a/data-loader/10-check-download.sh +++ b/data-loader/10-download.sh @@ -8,17 +8,8 @@ NAMES='edges edge_sources edge_features nodes node_prefixes sources relations' get_db_files() { for name in $NAMES; do - curl $PRECOMPUTED_PSQL_URL/$name.csv.gz > $DATA/psql/$name.csv.gz + wget -nv -O $DATA/psql/$name.csv.gz $PRECOMPUTED_PSQL_URL/$name.csv.gz done - sha256sum $DATA/psql/*.csv.gz > $CHECKSUM/sha256sums.computed.txt - diff $CHECKSUM/sha256sums.txt $CHECKSUM/sha256sums.computed.txt || panic -} - -panic() { - rm $DATA/psql/*.csv.gz - echo "SHA-256 hashes of input files don't match. The database will not be built." - echo "This could indicate a failed download, a version mismatch, or your HTTP connection getting hijacked." - exit 1 } mkdir -p $DATA/psql @@ -27,7 +18,7 @@ mkdir -p $DATA/vectors # Get semantic vectors (ConceptNet Numberbatch Mini) that would be # computationally expensive to compute if [ ! -e $DATA/vectors/mini.h5 ]; then - curl $PRECOMPUTED_VECTOR_URL/mini.h5 > $DATA/vectors/mini.h5 + wget -nv -O $DATA/vectors/mini.h5 $PRECOMPUTED_VECTOR_URL/mini.h5 fi # Get the database input files if [ ! -e $DATA/psql/edges.csv.gz ]; then diff --git a/data-loader/Dockerfile b/data-loader/Dockerfile index 23340fbc..a7eafa24 100644 --- a/data-loader/Dockerfile +++ b/data-loader/Dockerfile @@ -3,13 +3,9 @@ MAINTAINER Rob Speer # Install system dependencies (the overall form of this command is recommended by https://docs.docker.com/engine/userguide/eng-image/dockerfile_best-practices/) RUN apt-get update \ - && apt-get install -y coreutils diffutils curl gzip \ + && apt-get install -y coreutils diffutils wget gzip \ && rm -rf /var/lib/apt/lists/* -ADD 10-check-download.sh /docker-entrypoint-initdb.d/10-check-download.sh +ADD 10-download.sh /docker-entrypoint-initdb.d/10-check-download.sh ADD 20-load-db.sql /docker-entrypoint-initdb.d/20-load-db.sql ADD 30-done.sh /docker-entrypoint-initdb.d/30-done.sh -ADD sha256sums.txt /checksum/sha256sums.txt - -RUN chown -R root.postgres /data/conceptnet -RUN chmod -R g+w /data/conceptnet diff --git a/data-loader/sha256sums.txt b/data-loader/sha256sums.txt deleted file mode 100644 index 6cf5204f..00000000 --- a/data-loader/sha256sums.txt +++ /dev/null @@ -1,7 +0,0 @@ -386450767a6e2bcb8fb33ccfed382b56739c40e017bb0ba0f85e08b669658bc4 /data/conceptnet/psql/edge_features.csv.gz -8d65673c7bc3dec449c968c96034770ba70cd238bbf94475a29cc47907d882de /data/conceptnet/psql/edges.csv.gz -c323a36e4388de9c4530785c75d12161f93f2ada22aefebb3965f52aed3879b1 /data/conceptnet/psql/edge_sources.csv.gz -8ff28c2be67de8996866f083d9e91f469575b8b67341de3aa57d0f0d25324b5e /data/conceptnet/psql/node_prefixes.csv.gz -b96bce7155251ce572b778933728979cf7fc3a5cce949c6e441cea355df10804 /data/conceptnet/psql/nodes.csv.gz -274c2a19f3c0fd0a0e6f47fa1ecfeaea93c22679c676519f5fc103a3f04a611e /data/conceptnet/psql/relations.csv.gz -c9ab7b011478e626cf8ec9ad2a49a8c8583674dcbfc3d9949107a193ad891290 /data/conceptnet/psql/sources.csv.gz diff --git a/scripts/build.sh b/scripts/build.sh deleted file mode 120000 index c07a74de..00000000 --- a/scripts/build.sh +++ /dev/null @@ -1 +0,0 @@ -build.sh \ No newline at end of file