Skip to content

Commit

Permalink
Update pdb database creation script for cluster searches
Browse files Browse the repository at this point in the history
  • Loading branch information
milot-mirdita committed May 10, 2023
1 parent d6743f3 commit 8eb36a2
Show file tree
Hide file tree
Showing 2 changed files with 56 additions and 137 deletions.
7 changes: 4 additions & 3 deletions util/update_webserver_pdb/config.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,11 @@ CIF_SUBDIR="::ftp_data/structures/divided/mmCIF/" # subdirectory
foldseek="./foldseek/bin/foldseek"

# directories
SCRATCH="/scratch_dir_that_will_be_deleted_rmrf_after_completion"
MIRRORDIR="/parent_directory_to_download_all_pdb_files"
PAST_DB_FOLDER="/directory_to_store_previously_generated_foldseek_databases"
SYMLINK_FOLDER="/directory_for_symlinks_to_pdb_files"
# log file

# pdb download log file
LOGFILE="file_to_record_logs_for_downloading_all_pdb_files"

# cloud storage upload endpoint
cloudflare_endpoint_url="endpoint_url_for_r2_cloudflarestorage_with_bucket_name=foldseek"
186 changes: 52 additions & 134 deletions util/update_webserver_pdb/single-script.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,13 @@
# Exit when any command fails
set -e
set -x
set -u

last_command=""
current_command=""
# Keep track of last executed command & echo error message before exit
trap 'last_command=$current_command; current_command=$BASH_COMMAND' DEBUG
trap 'echo "\"${last_command}\" command filed with exit code $?."' EXIT
trap 'echo "\"${last_command}\" command filed with exit code $?"' EXIT

source ./config.sh

Expand All @@ -16,6 +19,7 @@ else
echo "foldseek not found"
exit 1
fi
foldseekVersion=$($foldseek version)

# Check if aws cli is installed
if command -v aws >/dev/null 2>&1; then
Expand All @@ -25,137 +29,52 @@ else
exit 1
fi

## 1) Restore backup of last database
lastDB=$(find pdb100.tar.gz)
thisDate=$(cat pdb_download_date)
mv $lastDB $lastDB$thisDate
mv $lastDB$thisDate $PAST_DB_FOLDER
rm pdb*

## 2) Download mmCIF files from ftp.pdbj.org to PDB/FTP/
# Download mmCIF files from ftp.pdbj.org to PDB/FTP/
# rsync PDB files with accession number in string uniqueProteins, each separated by NEWLINE
date -u +%Y-%m-%d > pdb_download_date
rsync -av -zP ${SERVER}${CIF_SUBDIR} $MIRRORDIR > $LOGFILE 2>/dev/null

# store absolute paths of all files nested in PDB/FTP directory
find $MIRRORDIR -type f > full_filepaths.txt

## 3) Download ID Mapping file from ftp.uniprot.org to Taxonomy
rm -f zipped_idmap.gz idmap.txt
aria2c -x 8 --log=idmap_log --out=zipped_idmap.gz https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/idmapping/idmapping_selected.tab.gz
gzip -c -d zipped_idmap.gz > idmap.txt

## 4) Process idmap.txt into two-column format (PDBID "\t" TaxonID)
awk -F"\t" '{ for(N=1; N<=NF; N++) if($N=="") $N="BAD" } 1' OFS="\t" idmap.txt | awk -F '\t' '{
if ($6 != "BAD" && $13 != "BAD")
print $6 "\t" $13
else if ($6 != "BAD" && $13 = "BAD")
print $6 "\t" "0"
}' > idmap_columns_6_and_13

awk -F"\t" 'split($1,f,/; /)>1{for (i=1; i in f; i++) {$1=f[i]; print} next } 1' idmap_columns_6_and_13 > temp.txt
cat temp.txt | awk '{print tolower(substr($0,1,4)) substr($0,5,length($0)-4)}' | awk -F' ' 'BEGIN{OFS=FS} $2 == "" {$2 = "0"} 1' | sed 's/:/.cif.gz_/' > id_mapping_file
echo "$(sort -u id_mapping_file)" > id_mapping_file

rm idmap_columns_6_and_13 temp.txt

# Create symlinks to each pdb file
while read p; do
fileName=$(echo $p | awk -F"/" '{print $NF}')
if ! test -f "${SYMLINK_FOLDER}/$fileName"; then
ln -s $p ./symlinks
fi
done < full_filepaths.txt

## 5) run foldseek createdb, cluster workflows
# Createdb
$foldseek createdb ./symlinks targetDB --chain-name-mode 1
#rsync -av -zP ${SERVER}${CIF_SUBDIR} $MIRRORDIR > $LOGFILE 2>/dev/null
#date -u +%Y-%m-%d > pdb_download_date
timestamp=$(cat pdb_download_date)

mkdir -p ${SCRATCH}
pushd ${SCRATCH}
# Run foldseek createdb, cluster workflows
$foldseek createdb $MIRRORDIR pdb_seq --chain-name-mode 1 --write-mapping 1
# Get rid of .cif.gz from foldseek search results
$foldseek prefixid targetDB_h targetDB_h.tsv --tsv
awk '{gsub(".cif.gz",""); print $0}' targetDB_h.tsv > targetDB_h_new.tsv
$foldseek tsv2db targetDB_h_new.tsv targetDB_h --output-dbtype 12
$foldseek prefixid pdb_seq_h pdb_seq_h.tsv --tsv
sed -i 's|\.cif\.gz||g' pdb_seq_h.tsv
sed -i 's|\.cif\.gz||g' pdb_seq.lookup
sed -i 's|\.cif\.gz||g' pdb_seq.source
$foldseek tsv2db pdb_seq_h.tsv pdb_seq_h --output-dbtype 12
rm -f -- pdb_seq_h.tsv

# Clustering: Create 'targetDB_clu100' by removing redundancies from 'targetDB'
$foldseek cluster targetDB targetDB_clu100 tmp -c 1.0 --min-seq-id 1.0 --cov-mode 0
# Create sub-database 'pdb', so we don't have to remove redundancies everytime we execute 'foldseek search'
$foldseek createsubdb targetDB_clu100 targetDB pdb
$foldseek createsubdb targetDB_clu100 targetDB_h pdb_h
$foldseek createsubdb targetDB_clu100 targetDB_ca pdb_ca
$foldseek createsubdb targetDB_clu100 targetDB_ss pdb_ss

## 6) Assign Taxonomy
$foldseek createtaxdb pdb tmp --tax-mapping-file id_mapping_file

# Append taxonomy ID = 0 to protein entries which are present in lookup file but missing in mapping file
TEMP_MAPPING="appendToMapping.txt"

MAX_ID=$(wc -l pdb.lookup | awk '{ print $1 }')
MAX_ID=$MAX_ID+1
LAST_OBSERVED_ID=-1

rm -f $TEMP_MAPPING
touch $TEMP_MAPPING

while read -r line
do
THIS_ID=$(cut -f1 <<< "$line")
for (( i=$LAST_OBSERVED_ID+1; i<$THIS_ID; i++ ))
do
echo "$i 0" >> $TEMP_MAPPING
done
LAST_OBSERVED_ID=$THIS_ID
done < pdb_mapping

cat $TEMP_MAPPING >> pdb_mapping
sort -k1 -n pdb_mapping -o pdb_mapping
rm $TEMP_MAPPING

# Convert softlinks to real files
SOFTLINKS=("pdb.lookup" "pdb.source" "pdb_ss_h" "pdb_ss_h.dbtype" "pdb_ss_h.index")
backup="-backup"

for softlink in ${SOFTLINKS[@]}
do
if [ -e $softlink ]
then
cp $softlink $softlink$backup
rm $softlink
mv $softlink$backup $softlink
fi
done

# Remove redundant indices from lookup, mapping files
DB="pdb"
indices="indices.txt"
lookupExtension=".lookup"
mappingExtension="_mapping"

awk -F"\t" '{print $1}' $DB.index > $indices

INFLATED=("$DB$lookupExtension" "$DB$mappingExtension")
extract="-extract"

for file in ${INFLATED[@]}
do
echo $file
awk -F"\t" 'NR==FNR{a[$1]; next} $1 in a' $indices $file > $file$extract
rm $file
mv $file$extract $file
done
rm $indices

# Remove useless database
rm targetDB*

## 7) Three example runs - sanity check
MMSEQS_FORCE_MERGE=1 $foldseek cluster pdb_seq pdb_clu tmp -c 0.95 --min-seq-id 1.0 --cov-mode 0
# Create sub-database 'pdb'
$foldseek createsubdb pdb_clu pdb_seq pdb --subdb-mode 1
rm pdb_h*
$foldseek lndb pdb_seq_h pdb_h
$foldseek createsubdb pdb_clu pdb_seq_ca pdb_ca --subdb-mode 1
$foldseek createsubdb pdb_clu pdb_seq_ss pdb_ss --subdb-mode 1
# Assign Taxonomy
$foldseek createtaxdb pdb_seq tmp
ln -sf -- pdb_seq.lookup pdb.lookup
ln -sf -- pdb_seq_mapping pdb_mapping
ln -sf -- pdb_seq_taxonomy pdb_taxonomy

# turn absolute symlinks to relative ones so we can make a correctly working tar
while read LINK TARGET; do
rm -f -- $(basename $LINK)
ln -sf -- $(basename $TARGET) $(basename $LINK)
done < <(find . -maxdepth 1 -type l -printf "%p\t%l\n")

# Three example runs - sanity check
# Check if top result for 7p03 is 7p03, 3ojp is 3ojp, 9rub is 9rub
# If a redundant entry is added to PDB (ex: xyzw which is identical to 7p03),
# topID may become xyzw => sanity check failing although there's actually nothing wrong.
# If such an event occurs in the future, we shall simply replace p0/7p03 in example_runs with yz/xyzw
if false; then
declare -a example_runs=("p0/7p03" "oj/3ojp" "ru/9rub")
for id in "${example_runs[@]}"
do
for id in "${example_runs[@]}"; do
$foldseek easy-search ${MIRRORDIR}/${id}.cif.gz pdb aln.m8 tmp
topResult=$(head -n 1 aln.m8 | awk -F"\t" '{print $2}')
topID=${topResult:0:4}
Expand All @@ -165,26 +84,25 @@ do
exit 1
fi
done
fi

## 8) Upload to R2 storage
# Formatting: prepare upload to cloudflare
dbname="pdb100"
timestamp=$(cat pdb_download_date)
foldseekVersion=$($foldseek | grep "foldseek Version: " | sed 's/foldseek Version: //')
zip_filename=${dbname}.tar.gz
md5_file="pdb100.version"
md5sum pdb* > pdb.md5sum
tar --owner=0 --group=0 -czvf pdb100.tar.gz pdb*
md5sum pdb100.tar.gz > pdb100.version
echo -e "${timestamp}\tPDB_DATE\n${foldseekVersion}\tFOLDSEEK_COMMIT" >> pdb100.version
popd

md5sum pdb pdb_ca pdb_ca.dbtype pdb_ca.index pdb_h pdb_h.dbtype pdb_h.index pdb_mapping pdb_ss pdb_ss.dbtype pdb_ss.index pdb_taxonomy pdb.dbtype pdb.index pdb.lookup > pdb.md5sum
tar czf $zip_filename pdb pdb_ca pdb_ca.dbtype pdb_ca.index pdb_h pdb_h.dbtype pdb_h.index pdb_mapping pdb_ss pdb_ss.dbtype pdb_ss.index pdb_taxonomy pdb.dbtype pdb.index pdb.lookup pdb.md5sum
md5sum $zip_filename > $md5_file
stamp="${timestamp}\tPDB_DATE\n${foldseekVersion}\tFOLDSEEK_COMMIT"
echo -e $stamp >> $md5_file
cp -f -- ${SCRATCH}/pdb100.tar.gz .
cp -f -- ${SCRATCH}/pdb100.version .
rm -rf -- "${SCRATCH}"

# Make sure to not hit R2 upload rate limits
aws configure set default.s3.max_concurrent_requests 2
aws configure set default.s3.multipart_threshold 128MB
aws configure set default.s3.multipart_chunksize 128MB

# Upload zipped file and md5 file to cloudflare r2 storage
aws s3 cp $zip_filename s3://foldseek/$zip_filename --endpoint-url $cloudflare_endpoint_url
aws s3 cp $md5_file s3://foldseek/$md5_file --endpoint-url $cloudflare_endpoint_url
aws s3 cp pdb100.tar.gz s3://foldseek/pdb100.tar.gz --endpoint-url $cloudflare_endpoint_url
aws s3 cp pdb100.version s3://foldseek/pdb100.version --endpoint-url $cloudflare_endpoint_url

0 comments on commit 8eb36a2

Please sign in to comment.