Commit
Merge branch 'master' of github.com:NCBI-Hackathons/EZData
schae234 committed Jun 23, 2018
2 parents e1249fe + aca151b commit 9d4155c
Showing 9 changed files with 173 additions and 71 deletions.
18 changes: 17 additions & 1 deletion README.md
@@ -1,5 +1,5 @@
# *Find hot data sets in your area (of research)!*
![logo](/docs/logo.png)
![logo](/docs/SRA_Tinder_logo_full_medium_copy.png)


Have you ever spent weeks interacting with SRA data and then decided it just wasn't going to work? That's like going on a blind date with someone you have no interest in. It's a huge waste of your time!
@@ -11,6 +11,11 @@ Our goal is to show you only the most essential information about your SRA data

![alpha_output_example_1](/docs/alpha_output_example_1.png)

## Dependencies
- Python 3.6
- setuptools (https://pip.pypa.io/en/stable/installing/)

## Installation
Installation is a three-step process:
### Step 1:
@@ -46,6 +51,17 @@ To get your own SRA_Acc_list.txt go to https://www.ncbi.nlm.nih.gov/Traces/study/
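The downloaded accession list is plain text, one run accession per line; the values below are placeholders:

```
SRR3403834
SRR5164431
```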

### Example run:


### Implementation

SRA Tinder is implemented through three primary functions (see the sketch below):

Streaming: an `SRA_Stream` object streams read data for each SRA accession to a local fastq file, so statistics can be computed as the data arrives.

Trimming/Counting: `trimandcount.basesleftaftertriming()` adapter-trims the streamed reads and reports total reads checked, reads with adapter, and read-length mean/std before and after trimming.

Scraping: `sra_tinder_web.scrape_run()` scrapes the SRA run page for quality scores, organism signals, and study/library metadata.
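
A minimal sketch of how the three pieces fit together, based on the calls made in `SRA_Tinder/CLI/sra_tinder` in this commit; the import paths are assumptions:

```python
# Minimal sketch; the import paths are assumptions, while the call
# signatures are taken from SRA_Tinder/CLI/sra_tinder in this commit.
from SRA_Tinder import sra_tinder, trimandcount

accession = 'SRR3403834'

# Scraping: quality/organism/library metadata from the SRA run page
run_info = sra_tinder.sra_tinder_web(accession).scrape_run()

# Trimming/Counting: stats from a fastq that has already been streamed to disk
(totalreads, withadapter, mean_readlen, std_readlen,
 readlen_trimmed, std_readlen_trimmed) = trimandcount.basesleftaftertriming(accession + '.fastq')

print(run_info['top_org'], run_info['%q30'], totalreads)
```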

# Stretch goals
- add the ngs code instead of scraping the web. This means we don't break when SRA changes their website, and we could easily take in fastq files instead of SRA accessions.
- graph/summarize the output table
80 changes: 40 additions & 40 deletions SRA_Tinder/CLI/sra_tinder
@@ -17,7 +17,7 @@ import pandas
import asyncio


async def generate_acc_statistics(fastqfile,event=None):
def generate_acc_statistics(fastqfile):
'''
Generates statistics from a fastq file generated from
an SRA accession
@@ -32,47 +32,42 @@ async def generate_acc_statistics(fastqfile,event=None):
a list of per-accession statistics, as strings
'''
# Wait for the pipe to start to be filled
if event is not None:
await event.wait()
def do_stats(fastqfile):
print(f'Generating stats for {fastqfile}')
with open(fastqfile) as IN:
i = 0
for _ in IN:
i+=1
if i % 1000:
print(f'counted {i} lines for {fastqfile}')
print(i)
return
# Do stuff
titleline = [
"Accession", "mean_quality_score", "most_abundent_organism",
"percent_abundence", "number_of_organims_greater_than_1%_abundence",
"total_reads_checked", "total_reads_withadapter", "mean_readlen_before_trim", "std_readlen_before_trim",
"mean_readlen_of_trimmed_reads", "std_readlen_of_trimmed_reads"
]
accession = fastqfile.replace('.fastq','')
# Get some data about the SRA ACC from the web
my_tinder = sra_tinder.sra_tinder_web(accession)
i = my_tinder.scrape_qc()
iii = my_tinder.scrape_organisms()
# Get some data from trimmed data
print (fastqfile)
totalreads, withadapter, mean_readlen, \
std_readlen, readlen_trimmed, \
std_readlen_trimmed = trimandcount.basesleftaftertriming(fastqfile)
# Generate a list of the info
listofinfo = [accession, str(i), iii[0], iii[1], iii[2], totalreads, withadapter, mean_readlen, std_readlen, readlen_trimmed, std_readlen_trimmed]
# Generate the resultant Pandas frame and create output
df = pandas.DataFrame.from_records(listofinfo, columns=titleline)
return listofinfo
await asyncio.get_event_loop().run_in_executor(None,do_stats,fastqfile)

print(f'Generating stats for {fastqfile}')
titleline = [
"Accession", "mean_quality_score", "most_abundant_organism",
"percent_abundance", "number_of_organisms_greater_than_1%_abundance",
"total_reads_checked", "total_reads_withadapter", "mean_readlen_before_trim", "std_readlen_before_trim",
"mean_readlen_of_trimmed_reads", "std_readlen_of_trimmed_reads"
]
final_output_line = []
accession = fastqfile.replace('.fastq','')
url = "https://trace.ncbi.nlm.nih.gov/Traces/sra/?run={}".format(accession)
# Get some data about the SRA ACC from the web
my_tinder = sra_tinder.sra_tinder_web(accession)
run_info = my_tinder.scrape_run()
m = {True: 'Pass', False: 'Fail'}
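# NOTE: `args` is the module-level argparse namespace parsed under __main__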
if args.full:
final_output_line += [accession, run_info['study'], run_info['%q30'], m[(run_info['%q30']>70)], run_info['mean_qual'], run_info['top_org'], run_info['top_org_%'], run_info['#_1%_orgs'], run_info['source'], run_info['strategy'], run_info['selection'], run_info['layout'], url]
else:
final_output_line += [run_info['%q30'], m[(run_info['%q30']>70)], run_info['top_org'], run_info['top_org_%'], run_info['#_1%_orgs'], run_info['source']]
# Get some data from trimmed data
totalreads, withadapter, mean_readlen, \
std_readlen, readlen_trimmed, \
std_readlen_trimmed = trimandcount.basesleftaftertriming(fastqfile)
# Generate a list of the info
final_output_line += [totalreads, withadapter, mean_readlen, std_readlen, readlen_trimmed, std_readlen_trimmed]
# Generate the resultant Pandas frame and create output
final_output_line = [str(x) for x in final_output_line]
# Wrap the single record in a list; titleline does not match the assembled
# field list exactly, so the column names are omitted here
df = pandas.DataFrame.from_records([final_output_line])
return final_output_line
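# Hypothetical usage sketch (outside the CLI event loop): stats for a
# single already-downloaded fastq:
#   line = generate_acc_statistics('SRR3403834.fastq')
#   print('\t'.join(line))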

# Event Loops
def run_matching_event_loop(args):
# Create some base objects
"""
:param args:
:return:
"""
streamer = SRA_Stream()
loop = asyncio.get_event_loop()
pool = concurrent.futures.ProcessPoolExecutor(max_workers=100)
# Create a task list
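# Sketch of the intended fan-out (the full body is collapsed in this diff
# view); `accessions` is a hypothetical list of run accessions:
#   futures = [loop.run_in_executor(pool, generate_acc_statistics, f'{acc}.fastq')
#              for acc in accessions]
#   loop.run_until_complete(asyncio.gather(*futures))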
@@ -156,8 +151,13 @@ if __name__ == '__main__':
'-o',
help='output file'
)
matches.add_argument(
'--full',
action='store_true',  # used as a boolean flag by generate_acc_statistics
help='output LOTS of data - can be overwhelming if you are new to this'
)
matches.set_defaults(func=run_matching_event_loop)


# Parse and run
args = parser.parse_args()
try:
args.func(args)
146 changes: 116 additions & 30 deletions SRA_Tinder/sra_tinder.py
@@ -13,15 +13,53 @@ class sra_tinder_web:
def __init__(self, sra_file_name):
self.sra_file_name = sra_file_name

def scrape_organisms(self):
# ABSORBED INTO scrape_run()
# def scrape_organisms(self):
# """
# Scrapes the taxonomic information for a run accession.
# :return: a list representing [most_abundant_organism, its_%_abundance, #_organisms_>_1%_abundance]
# """
# url = "https://trace.ncbi.nlm.nih.gov/Traces/sra/?run={}".format(self.sra_file_name)
# string = ur.urlopen(url).read().decode() # This holds the full URL data
# output = [] # This is the final returned value
# ret = {} #its a hackathon, ret will hold k,v style output
#
# count_organisms = 0
# table = str(re.findall("<h3>Strong signals<\/h3>.*?<\/table>", string, re.DOTALL))
# rows = re.findall("<tr>.*?<\/tr>", table, re.DOTALL)
# for row in rows:
# values = re.findall("<tdstyle=\"padding:.*?\">(.*?)<\/td>", str(row.replace('\n', '').replace(' ', '')),
# re.DOTALL)
# if len(values) < 4:
# # print("Error: {}".format(values))
# continue
# count_organisms += 1
# if len(output) == 0 or float(values[3]) > output[1]:
# output += [values[1], float(values[3])]
# output += [count_organisms]
#
# ret['top_org'] = output[0]
# ret['top_org_%'] = output[1]
# ret['#_1%_orgs'] = output[2]
#
# return ret


def scrape_run(self):
"""
Scrapes the taxonomic information for a run accession.
:return: a list representing [most_abundant_organism, its_%_abundance, #_organisms_>_1%_abundance]
Scrapes the SRA run metadata page for the run accession
:return: a dict of run metadata (mean_qual, %q30, top_org, top_org_%, #_1%_orgs, study, source, strategy, selection, layout, pmids, taxon_id)
"""
url = "https://trace.ncbi.nlm.nih.gov/Traces/sra/?run={}".format(self.sra_file_name)
string = ur.urlopen(url).read().decode()
output = []
ret = {}

#Get organism information from the reads
count_organisms = 0
output = []
table = str(re.findall("<h3>Strong signals<\/h3>.*?<\/table>", string, re.DOTALL))
rows = re.findall("<tr>.*?<\/tr>", table, re.DOTALL)
for row in rows:
@@ -34,48 +34,72 @@ def scrape_organisms(self):
if len(output) == 0 or float(values[3]) > output[1]:
output += [values[1], float(values[3])]
output += [count_organisms]
return [str(x) for x in output]

ret['top_org'] = output[0]
ret['top_org_%'] = output[1]
ret['#_1%_orgs'] = output[2]

def scrape_qc(self):
"""
Scrapes the URL metadata page for the run accession
:return: a float rounded to two decimal places representing the average quality score of a SRA run
"""
url = "https://trace.ncbi.nlm.nih.gov/Traces/sra/?run={}".format(self.sra_file_name)
string = ur.urlopen(url).read().decode()
#scrape and count qc data for reads
table = re.findall('<table class="zebra run-metatable">.*?<div class="center">Phred quality score<\/div>', string, re.DOTALL)
table = str(table)
entries = re.findall('<span title="(.*?) : (.*?)" style=', table, re.DOTALL)
d = {}
for score in entries:
d[int(score[0])] = int(score[1].replace(',', ''))
total = 0
total_qual = 0
count = 0
q30_plus_count = 0

for k, v in d.items():
total += k * v
total_qual += k * v
count += v
return round(total/count, 2)
if k >= 30:
q30_plus_count += v

ret['mean_qual'] = round(total_qual/count, 2)
# percent of bases with Phred quality >= 30 (multiply before rounding)
ret['%q30'] = round(q30_plus_count/count * 100, 2)

#Scrape the project that contains this run
study = re.findall("<a href=\"\?study=(.*?)\">", string, re.DOTALL)
ret['study'] = study[0]

#Scrape the source, strategy, layout, and selection
experiment_table = re.findall('<table class=\"zebra\">(.*?)<\/tr>(.*?)<\/table>', string, re.DOTALL)
headers = re.findall('<th>(.*?)</th>', experiment_table[0][0], re.DOTALL)
values = re.findall('<td>(.*?)</td>', experiment_table[0][1], re.DOTALL)

headers = headers[1:]
ret['source'] = values[3]
ret['strategy'] = values[2]
ret['selection'] = values[4]
ret['layout'] = values[5]

#scraping bioproject and pubmed links
project = re.findall('<a href=\"https://www.ncbi.nlm.nih.gov/bioproject/(.*?)\">', string, re.DOTALL)
project = project[0]
project_string = ur.urlopen("https://www.ncbi.nlm.nih.gov/bioproject/{}".format(project)).read().decode()
taxonomy_id = re.findall('<td class="CTtitle">Organism</td><td class="CTcontent"><a href=".*?" class="RegularLinkB" title=\"(.*?)\"', project_string, re.DOTALL)
publications = re.findall('<td class="CTtitle">Publications</td><td class="CTcontent">(.*?)Publications</td></tr><tr><td>', project_string, re.DOTALL)
pmids = re.findall('href=\"/pubmed/(.*?)\"', publications[0], re.DOTALL)
print(pmids, taxonomy_id)
pmids = ['https://www.ncbi.nlm.nih.gov/pubmed/?term={}'.format(x) for x in pmids]

ret['pmids'] = pmids
ret['taxon_id'] = taxonomy_id

return ret

# testing cases, please ignore -- commented out so that importing this
# module does not trigger a live web scrape
# my_tinder = sra_tinder_web('SRR3403834').scrape_run()
# x = sra_tinder('SRR3403834').scrape_organisms()
# print(x)
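# Hypothetical usage sketch of the dict returned by scrape_run():
#   info = sra_tinder_web('SRR3403834').scrape_run()
#   print(info['study'], info['mean_qual'], info['%q30'], info['top_org'])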

if __name__ == '__main__':
parser = argparse.ArgumentParser(description='')
parser.add_argument('-i', '--input', help='Input File', required=True)

# if __name__ == '__main__':
# parser = argparse.ArgumentParser(description='')
# parser.add_argument('-i', '--input', help='Input File', required=True)
# parser.add_argument('-e', '--essential', help='Run with only the essential fields (accession, average_qual_Score, pass/fail, top_organism, top_organism%, #_organisms >1%, USER_SUBMITTED_Type', action='store_true')
#
#
#
# try:
# args = parser.parse_args()
# except:
# parser.print_help()
# sys.exit(1)

try:
args = parser.parse_args()
except:
parser.print_help()
sys.exit(1)


accession = args.input
my_tinder = sra_tinder(accession)
i = my_tinder.scrape_qc()
# ii = my_tinder.adapters()
iii = my_tinder.scrape_organisms()
print('\t'.join([accession, str(i), iii[0], iii[1], iii[2]]))
# accession = args.input
# url = "https://trace.ncbi.nlm.nih.gov/Traces/sra/?run={}".format(accession)
#
# my_tinder = sra_tinder_web(accession)
#
# run_info = my_tinder.scrape_run()
# org_info = my_tinder.scrape_organisms()
#
# m = {True: 'Pass', False: 'Fail'}
#
# if args.essential:
# output = [accession, run_info['%q30'], m[(run_info['%q30']>70)], org_info['top_org'], org_info['top_org_%'], org_info['#_1%_orgs'], run_info['source']]
# else:
# output = [accession, run_info['study'], run_info['%q30'], m[(run_info['%q30']>70)], run_info['mean_qual'], org_info['top_org'], org_info['top_org_%'], org_info['#_1%_orgs'], run_info['source'], run_info['strategy'], run_info['selection'], run_info['layout'], url]
# output = [str(x) for x in output]
# sys.stdout.write('\t'.join(output))
#
#
Binary file added docs/SRA_Tinder_logo_full_medium_copy.png
Binary file added docs/SRA_Tinder_logo_larger_text.png
Binary file added docs/SRA_Tinder_logo_larger_text_small.png
Binary file added docs/SRA_Tinder_logo_smaller_text.png
Binary file added docs/logo_smaller.png
Binary file modified docs/nope2.jpg
