Commit
Merge branch 'master' of github.com:NCBI-Hackathons/EZData
schae234 committed Jun 23, 2018
2 parents e1249fe + aca151b commit 9d4155c
Showing 9 changed files with 173 additions and 71 deletions.
18 changes: 17 additions & 1 deletion README.md
@@ -1,5 +1,5 @@
# *Find hot data sets in your area (of research)!*
![logo](/docs/logo.png)
![logo](/docs/SRA_Tinder_logo_full_medium_copy.png)


Have you ever spent weeks interacting with SRA data and then decided it just wasn't going to work? That's like going on a blind date with someone you have no interest in. It's a huge waste of your time!
@@ -11,6 +11,11 @@ Our goal is to show you only the most essential information about your SRA data

![alpha_output_example_1](/docs/alpha_output_example_1.png)

## Dependencies
- Python 3.6
- setuptools (https://pip.pypa.io/en/stable/installing/)

## Installation
Installation is a three-step process:
### Step 1:
@@ -46,6 +51,17 @@ To get your own SRA_Acc_list.txt go to https://www.ncbi.nlm.nih.gov/Traces/study/
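The downloaded accession list is plain text, one run accession per line; the values below are placeholders:

```
SRR3403834
SRR5164431
```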

### Example run:


### Implementation

SRA Tinder is implemented through three primary functions (see the sketch below):

Streaming: an `SRA_Stream` object streams read data for each SRA accession to a local fastq file, so statistics can be computed as the data arrives.

Trimming/Counting: `trimandcount.basesleftaftertriming()` adapter-trims the streamed reads and reports total reads checked, reads with adapter, and read-length mean/std before and after trimming.

Scraping: `sra_tinder_web.scrape_run()` scrapes the SRA run page for quality scores, organism signals, and study/library metadata.
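
A minimal sketch of how the three pieces fit together, based on the calls made in `SRA_Tinder/CLI/sra_tinder` in this commit; the import paths are assumptions:

```python
# Minimal sketch; the import paths are assumptions, while the call
# signatures are taken from SRA_Tinder/CLI/sra_tinder in this commit.
from SRA_Tinder import sra_tinder, trimandcount

accession = 'SRR3403834'

# Scraping: quality/organism/library metadata from the SRA run page
run_info = sra_tinder.sra_tinder_web(accession).scrape_run()

# Trimming/Counting: stats from a fastq that has already been streamed to disk
(totalreads, withadapter, mean_readlen, std_readlen,
 readlen_trimmed, std_readlen_trimmed) = trimandcount.basesleftaftertriming(accession + '.fastq')

print(run_info['top_org'], run_info['%q30'], totalreads)
```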

# Stretch goals
- add the ngs code instead of scraping the web. This means we don't break when SRA changes their website, and we could easily take in fastq files instead of SRA accessions.
- graph/summarize the output table
80 changes: 40 additions & 40 deletions SRA_Tinder/CLI/sra_tinder
@@ -17,7 +17,7 @@ import pandas
import asyncio


async def generate_acc_statistics(fastqfile,event=None):
def generate_acc_statistics(fastqfile):
'''
Generates statistics from a fastq file generated from
an SRA accession
@@ -32,47 +32,42 @@ async def generate_acc_statistics(fastqfile,event=None):
a list of per-accession statistics, as strings
'''
# Wait for the pipe to start to be filled
if event is not None:
await event.wait()
def do_stats(fastqfile):
print(f'Generating stats for {fastqfile}')
with open(fastqfile) as IN:
i = 0
for _ in IN:
i+=1
if i % 1000:
print(f'counted {i} lines for {fastqfile}')
print(i)
return
# Do stuff
titleline = [
"Accession", "mean_quality_score", "most_abundent_organism",
"percent_abundence", "number_of_organims_greater_than_1%_abundence",
"total_reads_checked", "total_reads_withadapter", "mean_readlen_before_trim", "std_readlen_before_trim",
"mean_readlen_of_trimmed_reads", "std_readlen_of_trimmed_reads"
]
accession = fastqfile.replace('.fastq','')
# Get some data about the SRA ACC from the web
my_tinder = sra_tinder.sra_tinder_web(accession)
i = my_tinder.scrape_qc()
iii = my_tinder.scrape_organisms()
# Get some data from trimmed data
print (fastqfile)
totalreads, withadapter, mean_readlen, \
std_readlen, readlen_trimmed, \
std_readlen_trimmed = trimandcount.basesleftaftertriming(fastqfile)
# Generate a list of the info
listofinfo = [accession, str(i), iii[0], iii[1], iii[2], totalreads, withadapter, mean_readlen, std_readlen, readlen_trimmed, std_readlen_trimmed]
# Generate the resultant Pandas frame and create output
df = pandas.DataFrame.from_records(listofinfo, columns=titleline)
return listofinfo
await asyncio.get_event_loop().run_in_executor(None,do_stats,fastqfile)

print(f'Generating stats for {fastqfile}')
titleline = [
"Accession", "mean_quality_score", "most_abundant_organism",
"percent_abundance", "number_of_organisms_greater_than_1%_abundance",
"total_reads_checked", "total_reads_withadapter", "mean_readlen_before_trim", "std_readlen_before_trim",
"mean_readlen_of_trimmed_reads", "std_readlen_of_trimmed_reads"
]
final_output_line = []
accession = fastqfile.replace('.fastq','')
url = "https://trace.ncbi.nlm.nih.gov/Traces/sra/?run={}".format(accession)
# Get some data about the SRA ACC from the web
my_tinder = sra_tinder.sra_tinder_web(accession)
run_info = my_tinder.scrape_run()
m = {True: 'Pass', False: 'Fail'}
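# NOTE: `args` is the module-level argparse namespace parsed under __main__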
if args.full:
final_output_line += [accession, run_info['study'], run_info['%q30'], m[(run_info['%q30']>70)], run_info['mean_qual'], run_info['top_org'], run_info['top_org_%'], run_info['#_1%_orgs'], run_info['source'], run_info['strategy'], run_info['selection'], run_info['layout'], url]
else:
final_output_line += [run_info['%q30'], m[(run_info['%q30']>70)], run_info['top_org'], run_info['top_org_%'], run_info['#_1%_orgs'], run_info['source']]
# Get some data from trimmed data
totalreads, withadapter, mean_readlen, \
std_readlen, readlen_trimmed, \
std_readlen_trimmed = trimandcount.basesleftaftertriming(fastqfile)
# Generate a list of the info
final_output_line += [totalreads, withadapter, mean_readlen, std_readlen, readlen_trimmed, std_readlen_trimmed]
# Generate the resultant Pandas frame and create output
final_output_line = [str(x) for x in final_output_line]
# Wrap the single record in a list; titleline does not match the assembled
# field list exactly, so the column names are omitted here
df = pandas.DataFrame.from_records([final_output_line])
return final_output_line
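# Hypothetical usage sketch (outside the CLI event loop): stats for a
# single already-downloaded fastq:
#   line = generate_acc_statistics('SRR3403834.fastq')
#   print('\t'.join(line))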

# Event Loops
def run_matching_event_loop(args):
# Create some base objects
"""
:param args:
:return:
"""
streamer = SRA_Stream()
loop = asyncio.get_event_loop()
pool = concurrent.futures.ProcessPoolExecutor(max_workers=100)
# Create a task list
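# Sketch of the intended fan-out (the full body is collapsed in this diff
# view); `accessions` is a hypothetical list of run accessions:
#   futures = [loop.run_in_executor(pool, generate_acc_statistics, f'{acc}.fastq')
#              for acc in accessions]
#   loop.run_until_complete(asyncio.gather(*futures))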
@@ -156,8 +151,13 @@ if __name__ == '__main__':
'-o',
help='output file'
)
matches.add_argument(
'--full',
action='store_true',  # used as a boolean flag by generate_acc_statistics
help='output LOTS of data - can be overwhelming if you are new to this'
)
matches.set_defaults(func=run_matching_event_loop)


# Parse and run
args = parser.parse_args()
try:
args.func(args)
146 changes: 116 additions & 30 deletions SRA_Tinder/sra_tinder.py
@@ -13,15 +13,53 @@ class sra_tinder_web:
def __init__(self, sra_file_name):
self.sra_file_name = sra_file_name

def scrape_organisms(self):
# ABSORBED INTO scrape_run()
# def scrape_organisms(self):
# """
# Scrapes the taxonomic information for a run accession.
# :return: a list representing [most_abundant_organism, its_%_abundance, #_organisms_>_1%_abundance]
# """
# url = "https://trace.ncbi.nlm.nih.gov/Traces/sra/?run={}".format(self.sra_file_name)
# string = ur.urlopen(url).read().decode() # This holds the full URL data
# output = [] # This is the final returned value
# ret = {} #its a hackathon, ret will hold k,v style output
#
# count_organisms = 0
# table = str(re.findall("<h3>Strong signals<\/h3>.*?<\/table>", string, re.DOTALL))
# rows = re.findall("<tr>.*?<\/tr>", table, re.DOTALL)
# for row in rows:
# values = re.findall("<tdstyle=\"padding:.*?\">(.*?)<\/td>", str(row.replace('\n', '').replace(' ', '')),
# re.DOTALL)
# if len(values) < 4:
# # print("Error: {}".format(values))
# continue
# count_organisms += 1
# if len(output) == 0 or float(values[3]) > output[1]:
# output += [values[1], float(values[3])]
# output += [count_organisms]
#
# ret['top_org'] = output[0]
# ret['top_org_%'] = output[1]
# ret['#_1%_orgs'] = output[2]
#
# return ret


def scrape_run(self):
"""
Scrapes the taxonomic information for a run accession.
:return: a list representing [most_abundant_organism, its_%_abundance, #_organisms_>_1%_abundance]
Scrapes the SRA run metadata page for the run accession
:return: a dict of run metadata (mean_qual, %q30, top_org, top_org_%, #_1%_orgs, study, source, strategy, selection, layout, pmids, taxon_id)
"""
url = "https://trace.ncbi.nlm.nih.gov/Traces/sra/?run={}".format(self.sra_file_name)
string = ur.urlopen(url).read().decode()
output = []
ret = {}

#Get organism information from the reads
count_organisms = 0
output = []
table = str(re.findall("<h3>Strong signals<\/h3>.*?<\/table>", string, re.DOTALL))
rows = re.findall("<tr>.*?<\/tr>", table, re.DOTALL)
for row in rows:
@@ -34,48 +34,72 @@ def scrape_organisms(self):
if len(output) == 0 or float(values[3]) > output[1]:
output += [values[1], float(values[3])]
output += [count_organisms]
return [str(x) for x in output]

ret['top_org'] = output[0]
ret['top_org_%'] = output[1]
ret['#_1%_orgs'] = output[2]

def scrape_qc(self):
"""
Scrapes the URL metadata page for the run accession
:return: a float rounded to two decimal places representing the average quality score of a SRA run
"""
url = "https://trace.ncbi.nlm.nih.gov/Traces/sra/?run={}".format(self.sra_file_name)
string = ur.urlopen(url).read().decode()
#scrape and count qc data for reads
table = re.findall('<table class="zebra run-metatable">.*?<div class="center">Phred quality score<\/div>', string, re.DOTALL)
table = str(table)
entries = re.findall('<span title="(.*?) : (.*?)" style=', table, re.DOTALL)
d = {}
for score in entries:
d[int(score[0])] = int(score[1].replace(',', ''))
total = 0
total_qual = 0
count = 0
q30_plus_count = 0

for k, v in d.items():
total += k * v
total_qual += k * v
count += v
return round(total/count, 2)
if k >= 30:
q30_plus_count += v

ret['mean_qual'] = round(total_qual/count, 2)
# percent of bases with Phred quality >= 30 (multiply before rounding)
ret['%q30'] = round(q30_plus_count/count * 100, 2)

#Scrape the project that contains this run
study = re.findall("<a href=\"\?study=(.*?)\">", string, re.DOTALL)
ret['study'] = study[0]

#Scrape the source, strategy, layout, and selection
experiment_table = re.findall('<table class=\"zebra\">(.*?)<\/tr>(.*?)<\/table>', string, re.DOTALL)
headers = re.findall('<th>(.*?)</th>', experiment_table[0][0], re.DOTALL)
values = re.findall('<td>(.*?)</td>', experiment_table[0][1], re.DOTALL)

headers = headers[1:]
ret['source'] = values[3]
ret['strategy'] = values[2]
ret['selection'] = values[4]
ret['layout'] = values[5]

#scraping bioproject and pubmed links
project = re.findall('<a href=\"https://www.ncbi.nlm.nih.gov/bioproject/(.*?)\">', string, re.DOTALL)
project = project[0]
project_string = ur.urlopen("https://www.ncbi.nlm.nih.gov/bioproject/{}".format(project)).read().decode()
taxonomy_id = re.findall('<td class="CTtitle">Organism</td><td class="CTcontent"><a href=".*?" class="RegularLinkB" title=\"(.*?)\"', project_string, re.DOTALL)
publications = re.findall('<td class="CTtitle">Publications</td><td class="CTcontent">(.*?)Publications</td></tr><tr><td>', project_string, re.DOTALL)
pmids = re.findall('href=\"/pubmed/(.*?)\"', publications[0], re.DOTALL)
print(pmids, taxonomy_id)
pmids = ['https://www.ncbi.nlm.nih.gov/pubmed/?term={}'.format(x) for x in pmids]

ret['pmids'] = pmids
ret['taxon_id'] = taxonomy_id

return ret

# testing cases, please ignore -- commented out so that importing this
# module does not trigger a live web scrape
# my_tinder = sra_tinder_web('SRR3403834').scrape_run()
# x = sra_tinder('SRR3403834').scrape_organisms()
# print(x)
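# Hypothetical usage sketch of the dict returned by scrape_run():
#   info = sra_tinder_web('SRR3403834').scrape_run()
#   print(info['study'], info['mean_qual'], info['%q30'], info['top_org'])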

if __name__ == '__main__':
parser = argparse.ArgumentParser(description='')
parser.add_argument('-i', '--input', help='Input File', required=True)

# if __name__ == '__main__':
# parser = argparse.ArgumentParser(description='')
# parser.add_argument('-i', '--input', help='Input File', required=True)
# parser.add_argument('-e', '--essential', help='Run with only the essential fields (accession, average_qual_Score, pass/fail, top_organism, top_organism%, #_organisms >1%, USER_SUBMITTED_Type', action='store_true')
#
#
#
# try:
# args = parser.parse_args()
# except:
# parser.print_help()
# sys.exit(1)

try:
args = parser.parse_args()
except:
parser.print_help()
sys.exit(1)


accession = args.input
my_tinder = sra_tinder(accession)
i = my_tinder.scrape_qc()
# ii = my_tinder.adapters()
iii = my_tinder.scrape_organisms()
print('\t'.join([accession, str(i), iii[0], iii[1], iii[2]]))
# accession = args.input
# url = "https://trace.ncbi.nlm.nih.gov/Traces/sra/?run={}".format(accession)
#
# my_tinder = sra_tinder_web(accession)
#
# run_info = my_tinder.scrape_run()
# org_info = my_tinder.scrape_organisms()
#
# m = {True: 'Pass', False: 'Fail'}
#
# if args.essential:
# output = [accession, run_info['%q30'], m[(run_info['%q30']>70)], org_info['top_org'], org_info['top_org_%'], org_info['#_1%_orgs'], run_info['source']]
# else:
# output = [accession, run_info['study'], run_info['%q30'], m[(run_info['%q30']>70)], run_info['mean_qual'], org_info['top_org'], org_info['top_org_%'], org_info['#_1%_orgs'], run_info['source'], run_info['strategy'], run_info['selection'], run_info['layout'], url]
# output = [str(x) for x in output]
# sys.stdout.write('\t'.join(output))
#
#
Binary file added docs/SRA_Tinder_logo_full_medium_copy.png
Binary file added docs/SRA_Tinder_logo_larger_text.png
Binary file added docs/SRA_Tinder_logo_larger_text_small.png
Binary file added docs/SRA_Tinder_logo_smaller_text.png
Binary file added docs/logo_smaller.png
Binary file modified docs/nope2.jpg
