Let's help Dora find the BAM #5

Closed · wants to merge 4 commits
45 changes: 30 additions & 15 deletions scripts/collect_metadata.sh
@@ -50,24 +50,30 @@ then
>&2 echo "WARNING: No secondary run/experiment (SRP/SRX) IDs in the family.soft file; this often happens in datasets that are restricted access (dbGap, etc)."
fi

## curl info about each run (SRR/ERR/DRR) from ENA API; v2 pulls GSM data etc
## curl info about each run (SRR/ERR/DRR) from ENA and SRA APIs; v2 pulls GSM data etc
RET=1
RET_SRA=1
TRIES=1
until (( $RET == 0 ))
do
./curl_ena_metadata.sh $SERIES
./curl_ena_metadata.sh $SERIES.project.list > $SERIES.ena.tsv
RET=$?

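## try the SRA fallback only while RET_SRA is still 1, i.e. until curl_sra_metadata.sh has succeeded once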
if [[ $RET_SRA -eq 1 ]]
then
./curl_sra_metadata.sh $SERIES
RET_SRA=$?
fi

## this either pulls sub-series data (and replaces $SERIES.project.list with useful PRJNA* IDs), or just quits after 5 tries
TRIES=$((TRIES+1))
if (( $TRIES > 5 ))
then
>&2 echo "WARNING: No ENA records can be retrieved for GEO projects listed in $SERIES.project.list!"
if [[ $SUBGSE == "" ]]
then
>&2 echo "ERROR: No GSE subseries were listed in ${SERIES}_family.soft - no alternative PRJNA* to be found, and no ENA entries can be retrieved!"
exit 1
else
>&2 echo "WARNING: replacing $SERIES.project.list with sub-series projects.."
rm $SERIES.project.list
for i in $SUBGSE
@@ -84,22 +90,31 @@ then
TRIES=1
until (( $RET == 0 ))
do
./curl_ena_metadata.sh $SERIES
./curl_ena_metadata.sh $SERIES.project.list > $SERIES.ena.tsv
RET=$?
TRIES=$((TRIES+1))
if (( $TRIES > 5 ))
then
>&2 echo "ERROR: Still no ENA records can be retrieved for the GEO SUBSERIES projects listed in $SERIES.project.list, I quit!"
exit 1
>&2 echo "ERROR: Still no ENA records can be retrieved for the GEO SUBSERIES projects listed in $SERIES.project.list!"
fi
done
fi

## ena metadata loading failed, now check if sra was loaded
if [[ $RET_SRA -eq 1 ]]
then
>&2 echo "ERROR: No SRA records can be retrieved for the $SERIES, I quit!"
exit 1
else
META="$SERIES.sra.tsv"
RET=0
fi
fi
sleep 1
done

# Checking which metadata file was loaded
if [[ -s $SERIES.sra.tsv ]]
# checking if ENA metadata file is empty
if [[ ! -s $SERIES.ena.tsv ]]
then
META="$SERIES.sra.tsv"
fi
@@ -144,7 +159,7 @@ then
until (( $RET == 0 ))
do
## for ArrayExpress, we query by sample ID because sdrf doesn't list the BioProject ID
./curl_ena_metadata.sh $SERIES
./curl_ena_metadata.sh $SERIES.sample.list > $SERIES.ena.tsv
RET=$?
TRIES=$((TRIES+1))
if (( $TRIES > 5 ))
@@ -182,7 +197,7 @@ then
TRIES=1
until (( $RET == 0 ))
do
./curl_ena_metadata.sh $SERIES
./curl_ena_metadata.sh $SERIES.project.list > $SERIES.ena.tsv
RET=$?
TRIES=$((TRIES+1))
if (( $TRIES > 5 ))
@@ -226,8 +241,8 @@ then
>&2 cat $SUBSET
grep -f $SUBSET $SERIES.sample.list > $SERIES.sample.list.tmp
mv $SERIES.sample.list.tmp $SERIES.sample.list
grep -f $SUBSET $SERIES.ena.tsv > $SERIES.ena.tsv.tmp
mv $SERIES.ena.tsv.tmp $SERIES.ena.tsv
grep -f $SUBSET $META > $META.tmp
mv $META.tmp $META
grep -f $SUBSET $SERIES.accessions.tsv > $SERIES.accessions.tsv.tmp
mv $SERIES.accessions.tsv.tmp $SERIES.accessions.tsv

25 changes: 3 additions & 22 deletions scripts/curl_ena_metadata.sh
@@ -1,7 +1,6 @@
#!/bin/bash -e

SERIES=$1
RUNS="$SERIES.project.list"
RUNS=$1

if [[ ! -f $RUNS ]]
then
@@ -11,28 +10,10 @@ fi

KK=`cat $RUNS`

: > $SERIES.ena.tsv

for i in $KK
do
>&2 echo "Processing run ID $i.."
curl "https://www.ebi.ac.uk/ena/portal/api/filereport?accession=$i&result=read_run&fields=study_accession,secondary_study_accession,sample_accession,secondary_sample_accession,experiment_accession,experiment_alias,run_accession,run_alias,tax_id,scientific_name,fastq_ftp,submitted_ftp,sra_ftp&format=tsv&download=true&limit=0" 2> /dev/null | sed '/study_accession/d' >> $SERIES.ena.tsv
curl "https://www.ebi.ac.uk/ena/portal/api/filereport?accession=$i&result=read_run&fields=study_accession,secondary_study_accession,sample_accession,secondary_sample_accession,experiment_accession,experiment_alias,run_accession,run_alias,tax_id,scientific_name,fastq_ftp,submitted_ftp,sra_ftp&format=tsv&download=true&limit=0" 2> /dev/null | grep -v study_accession
done


if [[ ! -s $SERIES.ena.tsv && $SERIES == GSE* ]]
then
>&2 echo "WARNING: Was not able to load metadata from ENA. Loading it from SRA..."
rm $SERIES.ena.tsv
: > $SERIES.sra.tsv
for i in $KK
do
wget --quiet --output-document="$i.xml" "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=sra&term=${i}&usehistory=y"
WebEnv=$(grep -oP '<WebEnv>\K[^<]+' $i.xml)
QueryKey=$(grep -oP '<QueryKey>\K[^<]+' $i.xml)
rm -f $i.xml
curl "https://trace.ncbi.nlm.nih.gov/Traces/sra-db-be/sra-db-be.cgi?rettype=runinfo&WebEnv=${WebEnv}&query_key=${QueryKey}" 2> /dev/null | sed '/BioProject/d' | sed 's/,/\t/g' > $SERIES.sra.tsv
done
fi

>&2 echo "CURL METADATA: ALL DONE!"
>&2 echo "CURL ENA METADATA: ALL DONE!"
26 changes: 26 additions & 0 deletions scripts/curl_sra_metadata.sh
@@ -0,0 +1,26 @@
#!/bin/bash -e

SERIES=$1
RUNS="$SERIES.project.list"

if [[ ! -f $RUNS ]]
then
>&2 echo "ERROR: file $RUNS not found!"
exit 1
fi

KK=`cat $RUNS`

for i in $KK
do
>&2 echo "Processing run ID $i.."
wget --quiet --output-document="$i.xml" "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=sra&term=${i}&usehistory=y"
WebEnv=$(grep -oP '<WebEnv>\K[^<]+' $i.xml)
QueryKey=$(grep -oP '<QueryKey>\K[^<]+' $i.xml)
rm -f $i.xml
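## runinfo comes back as CSV: drop the header line (it contains BioProject) and convert commas to tabs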
curl "https://trace.ncbi.nlm.nih.gov/Traces/sra-db-be/sra-db-be.cgi?rettype=runinfo&WebEnv=${WebEnv}&query_key=${QueryKey}" 2> /dev/null | sed '/BioProject/d' | sed 's/,/\t/g' > $i.sra.tsv
done

cat *.sra.tsv > $SERIES.sra.tsv

>&2 echo "CURL SRA METADATA: ALL DONE!"
16 changes: 14 additions & 2 deletions scripts/parse_ena_metadata.sh
@@ -21,8 +21,14 @@ do
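## ENA report columns (order set in curl_ena_metadata.sh): f11 = fastq_ftp, f12 = submitted_ftp, f13 = sra_ftp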
ENAGZ=`grep -w $i $SERIES.ena.tsv | cut -f11 | grep "_1\.fastq.gz" | grep "_2\.fastq.gz"` ## ENA formatting is strict
ORIFQ=`grep -w $i $SERIES.ena.tsv | cut -f12 | grep "f.*q"` ## ppl name files *all kinds of random shiz*, really
ORIBAM=`grep -w $i $SERIES.ena.tsv | cut -f12 | tr ';' '\n' | grep -v "\.bai" | grep "\.bam"` ## don't need the BAM index which is often there

SRA=`grep -w $i $SERIES.ena.tsv | cut -f13`
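## ask NCBI's SDL (locate) API where this run's files live; keep only BAM links that require neither rehydration nor payment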
SRABAM=`curl -s "https://locate.ncbi.nlm.nih.gov/sdl/2/retrieve?acc=$i&accept-alternate-locations=yes" | jq -r '
.result[].files[] |
select(.name | contains("bam")) |
.locations[] |
select((.rehydrationRequired // false) == false and (.payRequired // false) == false) |
.link
'`

if [[ $AEGZ != "" ]]
then
@@ -48,6 +54,12 @@ do
LOC=$ORIBAM
echo $ORIBAM >> $SERIES.urls.list
>&2 echo "Sample $i is available via ENA as an original submitter's BAM file: $LOC"
elif [[ $SRABAM != "" ]]
then
TYPE="BAM"
LOC=$SRABAM
echo $SRABAM >> $SERIES.urls.list
>&2 echo "Sample $i is available via SRA as an original submitter's BAM file: $LOC"
elif [[ $SRA != "" ]]
then
TYPE="SRA"
17 changes: 15 additions & 2 deletions scripts/parse_sra_metadata.sh
@@ -15,8 +15,21 @@ do

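## SRA runinfo columns: f29 = ScientificName, f10 = download_path (the .sra download URL)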
SPECIES=`grep -w $i $SERIES.sra.tsv | cut -f29`
SRA=`grep -w $i $SERIES.sra.tsv | cut -f10`

if [[ $SRA != "" ]]
SRABAM=`curl -s "https://locate.ncbi.nlm.nih.gov/sdl/2/retrieve?acc=$i&accept-alternate-locations=yes" | jq -r '
.result[].files[] |
select(.name | contains("bam")) |
.locations[] |
select((.rehydrationRequired // false) == false and (.payRequired // false) == false) |
.link
'`

if [[ $SRABAM != "" ]]
then
TYPE="BAM"
LOC=$SRABAM
echo $SRABAM >> $SERIES.urls.list
>&2 echo "Sample $i is available via SRA as an original submitter's BAM file: $LOC"
elif [[ $SRA != "" ]]
then
LOC=$SRA
echo $SRA >> $SERIES.urls.list
9 changes: 5 additions & 4 deletions scripts/starsolo_10x_auto.sh
@@ -204,6 +204,7 @@ then
>&2 echo "WARNING: Read 1 length ($R1LEN) is more than the sum of appropriate barcode and UMI ($BCUMI)."
fi


## it's hard to come up with a universal rule to correctly infer strand-specificity of the experiment.
## this is the best I could come up with: 1) check if fraction of test reads (200k random ones) maps to GeneFull forward strand
## with higher than 50% probability; 2) if not, run the same quantification with "--soloStand Reverse" and calculate the same stat;
@@ -213,14 +214,14 @@ STRAND=Forward

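## --genomeLoad LoadAndKeep holds the genome index in shared memory across the two strand-test runs; the full runs further down use LoadAndRemove to release it when finished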
$CMD STAR --runThreadN $CPUS --genomeDir $REF --readFilesIn test.R2.fastq test.R1.fastq --runDirPerm All_RWX --outSAMtype None \
--soloType CB_UMI_Simple --soloCBwhitelist $BC --soloBarcodeReadLength 0 --soloCBlen $CBLEN --soloUMIstart $((CBLEN+1)) \
--soloUMIlen $UMILEN --soloStrand Forward \
--soloUMIlen $UMILEN --soloStrand Forward --genomeLoad LoadAndKeep \
--soloUMIdedup 1MM_CR --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts --soloUMIfiltering MultiGeneUMI_CR \
--soloCellFilter EmptyDrops_CR --clipAdapterType CellRanger4 --outFilterScoreMin 30 \
--soloFeatures Gene GeneFull --soloOutFileNames test_forward/ features.tsv barcodes.tsv matrix.mtx &> /dev/null

$CMD STAR --runThreadN $CPUS --genomeDir $REF --readFilesIn test.R2.fastq test.R1.fastq --runDirPerm All_RWX --outSAMtype None \
--soloType CB_UMI_Simple --soloCBwhitelist $BC --soloBarcodeReadLength 0 --soloCBlen $CBLEN --soloUMIstart $((CBLEN+1)) \
--soloUMIlen $UMILEN --soloStrand Reverse \
--soloUMIlen $UMILEN --soloStrand Reverse --genomeLoad LoadAndKeep \
--soloUMIdedup 1MM_CR --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts --soloUMIfiltering MultiGeneUMI_CR \
--soloCellFilter EmptyDrops_CR --clipAdapterType CellRanger4 --outFilterScoreMin 30 \
--soloFeatures Gene GeneFull --soloOutFileNames test_reverse/ features.tsv barcodes.tsv matrix.mtx &> /dev/null
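## the comparison itself happens in a part of the script not shown in this diff; a minimal sketch (an assumption,
## not taken from this PR) of how step 3 could pick the strand, assuming the usual STARsolo GeneFull/Summary.csv layout:
## FWD=$(grep "Reads Mapped to GeneFull" test_forward/GeneFull/Summary.csv | head -n1 | cut -d, -f2)
## REV=$(grep "Reads Mapped to GeneFull" test_reverse/GeneFull/Summary.csv | head -n1 | cut -d, -f2)
## if (( $(echo "$REV > $FWD" | bc -l) )); then STRAND=Reverse; fi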
@@ -266,13 +267,13 @@ then
$CMD STAR --runThreadN $CPUS --genomeDir $REF --readFilesIn $R1 $R2 --runDirPerm All_RWX $GZIP $BAM --soloBarcodeMate 1 --clip5pNbases 39 0 \
--soloType CB_UMI_Simple --soloCBwhitelist $BC --soloCBstart 1 --soloCBlen $CBLEN --soloUMIstart $((CBLEN+1)) --soloUMIlen $UMILEN --soloStrand Forward \
--soloUMIdedup 1MM_CR --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts --soloUMIfiltering MultiGeneUMI_CR \
--soloCellFilter EmptyDrops_CR --outFilterScoreMin 30 \
--soloCellFilter EmptyDrops_CR --outFilterScoreMin 30 --genomeLoad LoadAndRemove \
--soloFeatures Gene GeneFull Velocyto --soloOutFileNames output/ features.tsv barcodes.tsv matrix.mtx --soloMultiMappers EM --outReadsUnmapped Fastx
else
$CMD STAR --runThreadN $CPUS --genomeDir $REF --readFilesIn $R2 $R1 --runDirPerm All_RWX $GZIP $BAM \
--soloType CB_UMI_Simple --soloCBwhitelist $BC --soloBarcodeReadLength 0 --soloCBlen $CBLEN --soloUMIstart $((CBLEN+1)) --soloUMIlen $UMILEN --soloStrand $STRAND \
--soloUMIdedup 1MM_CR --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts --soloUMIfiltering MultiGeneUMI_CR \
--soloCellFilter EmptyDrops_CR --clipAdapterType CellRanger4 --outFilterScoreMin 30 \
--soloCellFilter EmptyDrops_CR --clipAdapterType CellRanger4 --outFilterScoreMin 30 --genomeLoad LoadAndRemove \
--soloFeatures Gene GeneFull Velocyto --soloOutFileNames output/ features.tsv barcodes.tsv matrix.mtx --soloMultiMappers EM --outReadsUnmapped Fastx
fi
