From 2fad39d763c7867690ff5c1d521c2ca29e342161 Mon Sep 17 00:00:00 2001 From: "Robert A. Petit III" Date: Sat, 17 Jun 2023 14:03:43 -0500 Subject: [PATCH] add paper for JOSS submission --- citation.cff | 12 ++++++++- paper.bib | 45 +++++++++++++++++++++++++++++++++ paper.md | 71 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 127 insertions(+), 1 deletion(-) create mode 100644 paper.bib create mode 100644 paper.md diff --git a/citation.cff b/citation.cff index 50e6e23..ac0a28b 100644 --- a/citation.cff +++ b/citation.cff @@ -7,6 +7,16 @@ authors: - family-names: "Hall" given-names: "Micheal B." orcid: "https://orcid.org/0000-0003-3683-6208" +- family-names: "Tonkin-Hill" + given-names: "Gerry" + orcid: "https://orcid.org/0000-0002-1350-9426" +- family-names: "Zhu" + given-names: "Jie" +- family-names: "Read" + given-names: "Timothy D." + orcid: "https://orcid.org/0000-0001-8966-9680" title: "fastq-dl: efficiently download FASTQ files from SRA or ENA repositories" url: "https://github.com/rpetit3/fastq-dl" -version: 2.0.0 +version: 2.0.2 + + diff --git a/paper.bib b/paper.bib new file mode 100644 index 0000000..7d525be --- /dev/null +++ b/paper.bib @@ -0,0 +1,45 @@ +@article{Burgin_2023, + title = {The {European Nucleotide Archive} in 2022}, + author = {Burgin, Josephine and Ahamed, Alisha and Cummins, Carla and Devraj, Rajkumar and Gueye, Khadim and Gupta, Dipayan and Gupta, Vikas and Haseeb, Muhammad and Ihsan, Maira and Ivanov, Eugene and Jayathilaka, Suran and Balavenkataraman Kadhirvelu, Vishnukumar and Kumar, Manish and Lathi, Ankur and Leinonen, Rasko and Mansurova, Milena and McKinnon, Jasmine and O’Cathail, Colman and Paupério, Joana and Pesant, Stéphane and Rahman, Nadim and Rinck, Gabriele and Selvakumar, Sandeep and Suman, Swati and Vijayaraja, Senthilnathan and Waheed, Zahra and Woollard, Peter and Yuan, David and Zyoud, Ahmad and Burdett, Tony and Cochrane, Guy}, + year = 2023, + month = jan, + journal = {Nucleic Acids Research}, + 
volume = 51, + number = {D1}, + pages = {D121--D125}, + doi = {10.1093/nar/gkac1051}, + issn = {0305-1048}, + url = {https://doi.org/10.1093/nar/gkac1051}, + abstractnote = {The European Nucleotide Archive (ENA; https://www.ebi.ac.uk/ena), maintained by the European Molecular Biology Laboratory’s European Bioinformatics Institute (EMBL-EBI), offers those producing data an open and supported platform for the management, archiving, publication, and dissemination of data; and to the scientific community as a whole, it offers a globally comprehensive data set through a host of data discovery and retrieval tools. Here, we describe recent updates to the ENA's submission and retrieval services as well as focused efforts to improve connectivity, reusability, and interoperability of ENA data and metadata.}, + language = {en} +} +@article{Choudhary_2019, + title = {{pysradb}: A {Python} package to query next-generation sequencing metadata and data from {NCBI Sequence Read Archive}}, + author = {Choudhary, Saket}, + year = 2019, + month = apr, + journal = {F1000Research}, + volume = 8, + pages = 532, + doi = {10.12688/f1000research.18676.1}, + issn = {2046-1402}, + url = {https://doi.org/10.12688/f1000research.18676.1}, + abstractnote = {The NCBI Sequence Read Archive (SRA) is the primary archive of next-generation sequencing datasets. SRA makes metadata and raw sequencing data available to the research community to encourage reproducibility and to provide avenues for testing novel hypotheses on publicly available data. However, methods to programmatically access this data are limited. We introduce the Python package, pysradb, which provides a collection of command line methods to query and download metadata and data from SRA, utilizing the curated metadata database available through the SRAdb project. We demonstrate the utility of pysradb on multiple use cases for searching and downloading SRA datasets. 
It is available freely at https://github.com/saketkc/pysradb.}, + keywords = {GEO; NCBI; NGS; SRA; bioinformatics; metadata}, + language = {en} +} +@article{Katz_2022, + title = {The {Sequence Read Archive}: a decade more of explosive growth}, + author = {Katz, Kenneth and Shutov, Oleg and Lapoint, Richard and Kimelman, Michael and Brister, J. Rodney and O’Sullivan, Christopher}, + year = 2022, + month = jan, + journal = {Nucleic Acids Research}, + volume = 50, + number = {D1}, + pages = {D387--D390}, + doi = {10.1093/nar/gkab1053}, + issn = {0305-1048}, + url = {https://doi.org/10.1093/nar/gkab1053}, + abstractnote = {The Sequence Read Archive (SRA, https://www.ncbi.nlm.nih.gov/sra/) stores raw sequencing data and alignment information to enhance reproducibility and facilitate new discoveries through data analysis. Here we note changes in storage designed to increase access and highlight analyses that augment metadata with taxonomic insight to help users select data. In addition, we present three unanticipated applications of taxonomic analysis.}, + language = {en} +} diff --git a/paper.md b/paper.md new file mode 100644 index 0000000..d47fbb2 --- /dev/null +++ b/paper.md @@ -0,0 +1,71 @@ +--- +title: 'fastq-dl: efficiently download sequences from ENA and SRA' +tags: + - fastq + - download + - python + - bioinformatics +authors: + - name: Robert A. Petit III + orcid: 0000-0002-1350-9426 + affiliation: "1, 2" + - name: Michael B. Hall + orcid: 0000-0003-3683-6208 + affiliation: "3" + - name: Gerry Tonkin-Hill + orcid: 0000-0002-1350-9426 + affiliation: "4" + - name: Jie Zhu + affiliation: "5" + - name: Timothy D. 
Read + orcid: 0000-0001-8966-9680 + affiliation: "2" +affiliations: + - name: Wyoming Public Health Laboratory, Wyoming Department of Health, Wyoming, USA + index: 1 + - name: Division of Infectious Diseases, Department of Medicine, Emory University School of Medicine, Atlanta, Georgia, USA + index: 2 + - name: Department of Microbiology and Immunology, Peter Doherty Institute for Infection and Immunity, The University of Melbourne, Melbourne, Australia + index: 3 + - name: Department of Biostatistics, University of Oslo, Oslo, Norway + index: 4 + - name: Li Ka Shing Institute of Health Sciences, Faculty of Medicine, The Chinese University of Hong Kong, Hong Kong SAR, PR China + index: 5 +date: 17 June 2023 +bibliography: paper.bib +--- + +# Summary + +High-throughput sequencing technologies have revolutionized the field of genomics, enabling +researchers to generate vast amounts of data quickly and at relatively low cost. The European +Nucleotide Archive (ENA) [@Burgin_2023] and the Sequence Read Archive (SRA) [@Katz_2022] are +two major repositories for publicly hosting next-generation sequencing data from many research +projects. Retrieving sequences from these repositories is often a multi-step process and can be +difficult for researchers who lack experience with bioinformatics. fastq-dl is a bioinformatic +tool that simplifies the process of downloading sequences from SRA and ENA. + +fastq-dl is written in Python and is designed to be user-friendly and simple to use. Users can +submit queries to the ENA, via a REST API [@Burgin_2023], or SRA, via pysradb [@Choudhary_2019], +with fallback mechanisms in the event either repository is down. fastq-dl supports a range of +query types, including taxon IDs, species names, and accessions (BioSample, BioProject, +Experiment, and Run accessions). A query returns metadata for each hit, and this metadata is saved +to a tab-delimited file. 
Unless disabled by the user, fastq-dl will then proceed to download +available sequences for each hit of the query. If using ENA, raw FASTQs are downloaded using +its FTP service; otherwise, fasterq-dump [@Katz_2022] is used to download from SRA. +In the event a repository is unresponsive, download attempts will be made against the other +repository. When an Experiment or BioSample has multiple Run accessions associated with it, +users can optionally choose to merge these Run accessions. Upon completion, users are provided +with a summary file, a metadata file, and FASTQ files for each query hit. + +fastq-dl is a convenient bioinformatic tool that simplifies the process of retrieving FASTQ files +from ENA and SRA. It was developed to be easy to use and accessible to researchers from all +backgrounds. By facilitating efficient downloading of publicly available FASTQ files, fastq-dl lets users +easily integrate these data into their own research. fastq-dl is available from PyPI and Bioconda +for simple installation, and the source code is available at https://github.com/rpetit3/fastq-dl. + +# Funding + +This project was partially supported by the Georgia Emerging Infections Program and the Wyoming Department of Health. + +# References