
Commit 2149762

Merge branch 'master' of github.com:SciLifeLab/standalone_scripts
2 parents 1a62113 + 25b4d3a

26 files changed (+1374, −38 lines)

DupRateTrends_from_charon.py

Lines changed: 152 additions & 0 deletions
```python
import os
import glob
import re
import sys
import argparse
import random
import requests
import json
import datetime
import shutil
from datetime import date


# Static list of all NeoPrep projects. As NeoPrep is discontinued this is a stable list.
NeoPrepProjects = ["P6971", "P6810", "P6651", "P6406", "P6254", "P6252", "P6152", "P5951", "P5514", "P5503", "P5476", "P5470", "P5370", "P5364", "P5301", "P5206", "P5201", "P5151", "P4903", "P4805", "P4753", "P4751", "P4729", "P4710", "P4651", "P4552", "P4454", "P4453", "P4401", "P4353", "P4206", "P4105", "P4056", "P4055", "P4004", "P3966", "P3719", "P3452", "P3451", "P2954", "P2806", "P2703", "P2477", "P2468", "P2456", "P2282", "P1888"]
ToExludeProjectForReallyGoodReasons = ["P4752"]


def duplicate_trends(args):
    """Fetch all projects and loop over them, printing duplication-rate trends per date."""
    token = args.token
    url = args.url
    session = requests.Session()
    headers = {'X-Charon-API-token': token, 'content-type': 'application/json'}
    duplications_per_date = {}
    projects_per_date = {}
    projects = {}
    for project in session.get(url + '/api/v1/projects', headers=headers).json()['projects']:
        if project['sequencing_facility'] != 'NGI-S':
            continue
        if "G.A16" in project['name'] or "G.A17" in project['name']:
            continue
        pid = project['projectid']  # project id
        # if pid in NeoPrepProjects or pid in ToExludeProjectForReallyGoodReasons:
        #     continue
        request = session.get(url + '/api/v1/samples/{}'.format(pid), headers=headers)
        if request.status_code != 200:
            print(pid)
            continue
        for sample in request.json()['samples']:
            if sample.get('analysis_status') != 'ANALYZED':
                continue
            if 'duplication_pc' not in sample:
                continue
            if sample['duplication_pc'] == 0:
                continue
            if project['name'] not in projects:
                projects[project['name']] = 0
            # store this as an analysed project, then fetch its sample runs
            sid = sample['sampleid']
            dup_rate = sample['duplication_pc']
            oldest_run_date = date.today()  # no analysed run can be newer than today
            for sample_run in session.get(url + '/api/v1/seqruns/{}/{}'.format(pid, sid), headers=headers).json()['seqruns']:
                rid = sample_run['seqrunid']
                sequencing_start_date = rid.split("_")[0]  # the first 6 digits are the date (YYMMDD)
                year = int("20" + sequencing_start_date[0:2])
                month = int(sequencing_start_date[2:4])
                day = int(sequencing_start_date[4:6])
                if oldest_run_date > datetime.date(year, month, day):
                    oldest_run_date = datetime.date(year, month, day)
            # at this point I have the oldest run date
            if oldest_run_date not in duplications_per_date:
                duplications_per_date[oldest_run_date] = []
            duplications_per_date[oldest_run_date].append(dup_rate)
            if oldest_run_date not in projects_per_date:
                projects_per_date[oldest_run_date] = {}
            if pid not in projects_per_date[oldest_run_date]:
                projects_per_date[oldest_run_date][pid] = [1, dup_rate]
            else:
                projects_per_date[oldest_run_date][pid][0] += 1
                projects_per_date[oldest_run_date][pid][1] += dup_rate
        if len(duplications_per_date) > 0:
            continue  # has no effect: this is already the end of the loop body
    for cur_date in sorted(duplications_per_date):
        average = sum(duplications_per_date[cur_date]) / float(len(duplications_per_date[cur_date]))
        sys.stdout.write("{} {} {} ".format(cur_date, average, len(duplications_per_date[cur_date])))
        for pid in projects_per_date[cur_date]:
            num_samples = projects_per_date[cur_date][pid][0]
            average_dup_rate_proj = projects_per_date[cur_date][pid][1] / float(projects_per_date[cur_date][pid][0])
            sys.stdout.write("({},{},{}) ".format(pid, num_samples, average_dup_rate_proj))
        sys.stdout.write("\n")
    for project in projects:
        print(project)


def compute_human_genomes(args):
    """Fetch all projects and loop over them, summing coverage over analysed samples."""
    token = args.token
    url = args.url
    session = requests.Session()
    headers = {'X-Charon-API-token': token, 'content-type': 'application/json'}
    projects = {}
    samples_without_autosome_cov = 0
    total_coverage = 0
    total_samples = 0
    for project in session.get(url + '/api/v1/projects', headers=headers).json()['projects']:
        if project['sequencing_facility'] != 'NGI-S':
            continue
        pid = project['projectid']  # project id
        request = session.get(url + '/api/v1/samples/{}'.format(pid), headers=headers)
        if request.status_code != 200:
            # skip projects that cannot be fetched
            print(pid)
            continue
        for sample in request.json()['samples']:
            if sample.get('analysis_status') != 'ANALYZED':
                continue
            if 'duplication_pc' not in sample:
                continue
            if sample['duplication_pc'] == 0:
                continue
            if project['name'] not in projects:
                projects[project['name']] = 0
            # store this as an analysed project and add its coverage to the total;
            # fall back to target_coverage when autosomal coverage is missing or zero
            coverage_field = 'total_autosomal_coverage'
            if sample.get(coverage_field, 0) == 0:
                coverage_field = 'target_coverage'
                samples_without_autosome_cov += 1
            total_coverage += sample[coverage_field]
            total_samples += 1
    print("TOTAL SAMPLES {}".format(total_samples))
    print("TOTAL SAMPLES no cov {}".format(samples_without_autosome_cov))
    print("TOTAL COVERAGE {}".format(total_coverage))
    print("AVERAGE COVERAGE PER SAMPLE {}".format(total_coverage / total_samples))
    print("NUMBER OF 30X HG EQUIVALENTS {}".format(total_coverage / 30))


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="""This script connects to Charon and fetches information about duplication rates for all human samples we are able to find. For each sample it approximates the sequencing date to the oldest sequencing run. It can also be used to compute the total amount of Whole Human Genomes sequenced by NGI-S.""")
    # general options
    parser.add_argument("-t", "--token", dest="token", default=os.environ.get('CHARON_API_TOKEN'),
                        help="Charon API Token. Will be read from the env variable CHARON_API_TOKEN if not provided")
    parser.add_argument("-u", "--url", dest="url", default=os.environ.get('CHARON_BASE_URL'),
                        help="Charon base url. Will be read from the env variable CHARON_BASE_URL if not provided")
    args = parser.parse_args()
    if not args.token:
        print("No valid token found in arg or in environment. Exiting.")
        sys.exit(-1)
    if not args.url:
        print("No valid url found in arg or in environment. Exiting.")
        sys.exit(-1)

    compute_human_genomes(args)
    duplicate_trends(args)
```
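
The run-date convention the script assumes can be sanity-checked in isolation; a minimal sketch, using the flowcell ID that also appears in the `use_undetermined.sh` example later on this page:

```python
# Illustrative check of the seqrunid convention used above: the first six
# digits of the run ID encode the sequencing start date as YYMMDD.
import datetime

rid = "160901_ST-E00214_0087_BH33GHALXX"
stamp = rid.split("_")[0]  # "160901"
run_date = datetime.date(int("20" + stamp[0:2]), int(stamp[2:4]), int(stamp[4:6]))
print(run_date)  # 2016-09-01
```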

LICENSE

File mode changed: 100644 → 100755.

README.md

File mode changed: 100644 → 100755.
Lines changed: 100 additions & 9 deletions

@@ -2,6 +2,89 @@
Repository to store standalone scripts that do not belong to any bigger package or repository.

### compute_undet_index_stats.py
Used to fetch stats about undetermined indexes.
This script queries the statusdb x_flowcell_db database and fetches information about runs.
The following operations are supported:

- check_undet_index: given a specific index, checks all FCs and prints all FCs and lanes where the index appears as undetermined
- most_undet: outputs a summary of undetermined indexes, printing the 20 most occurring indexes for each instrument type
- single_sample_lanes: prints stats about HiSeqX lanes run with a single sample in them
- workset_undet: prints for each workset the FCs, lanes and samples where the specified index has been found among the undetermined reads. For each sample the plate position is printed.
- fetch_pooled_projects: returns pooled projects, that is, projects that have been run in a pool

#### Usage
Examples:

- compute for each workset the FCs that contain a lane with index CTTGTAAT present among the undetermined at least 0.5M times:
  - `python compute_undet_index_stats.py --config couch_db.yaml --index CTTGTAAT --mode workset_undet --min_occurences 500000`
- compute a list of the most occurring undetermined indexes for HiSeqX runs:
  - `python compute_undet_index_stats.py --config couch_db.yaml --mode most_undet --instrument-type HiSeqX`

### runs_per_week.sh
Run on Irma, it prints three columns:

- first column: the week number
- second column: the number of HiSeqX runs in that week
- third column: the number of HiSeq2500 runs in that week

#### Usage
Example: `runs_per_week.sh`

### compute_production_stats.py
This script queries the statusdb x_flowcell_db and project databases and fetches information useful to plot trends and aggregated data. It can be run in three modalities:

- production-stats: for each instrument type it prints the number of FCs, number of lanes, etc. It then prints a summary of all stats
- instrument-usage: for each instrument type and year it prints the different run set-ups and the samples run with each set-up
- year-stats: cumulative data production by month

#### Usage
Example: `compute_production_stats.py --config couchdb.yaml --mode year-stats`
```
Usage: compute_production_stats.py --config couchdb.yaml

Options:
  --config CONFIG  configuration file
```
#### Configuration
Requires a config file to access statusdb:
```
statusdb:
    url: path_to_tool
    username: Username
    password: *********
    port: port_number
```
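
As a rough illustration of how such a config might be consumed, here is a minimal sketch assuming the `couchdb` and `yaml` packages named in the dependency lists elsewhere in this README; the key names follow the block above, and the connection-URL layout is an assumption:

```python
# Minimal sketch: load the statusdb block shown above and open the
# x_flowcell_db database queried by these scripts.
import couchdb
import yaml

with open("couchdb.yaml") as fh:
    conf = yaml.safe_load(fh)["statusdb"]

server = couchdb.Server("http://{username}:{password}@{url}:{port}".format(**conf))
db = server["x_flowcell_db"]
print(db.info()["doc_count"])  # quick connectivity check
```
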
### backup_zendesk_tickets.py
Used to automatically back up tickets from zendesk

@@ -15,13 +98,14 @@ Options:
```
  --config-file PATH  Path to the config file [required]
  --days INTEGER      Since how many days ago to backup tickets
  --help              Show this message and exit.
```

#### Dependencies
* zendesk
* click
* yaml
* requests

#### Configuration
Requires a config file:
@@ -66,7 +150,7 @@ Prints a list of analyzed samples with user_id and ngi_id
```
get_sample_names.py P1234
```

### index_fixer.py
Takes in a SampleSheet.csv and generates a new one with swapped or reverse complemented indexes.
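
The reverse-complement step is easy to illustrate; a minimal sketch (an illustrative stand-in, not the script's actual code), reusing the index from the examples above:

```python
# Reverse-complement an index sequence, as done when fixing samplesheet
# indexes. The mapping covers the four bases plus N.
COMPLEMENT = {"A": "T", "C": "G", "G": "C", "T": "A", "N": "N"}

def reverse_complement(index):
    return "".join(COMPLEMENT[base] for base in reversed(index))

assert reverse_complement("CTTGTAAT") == "ATTACAAG"
```
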
###### Dependencies
@@ -78,12 +162,12 @@
Merges all fastq_files from a sample into one file.
```
merge_and_rename_NGI_fastq_files.py path/to/dir/with/inputfiles/ path/to/output/directory
```
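
Conceptually the merge is a concatenation of the sample's gzipped fastq files (gzip members can be concatenated as-is); a minimal sketch under that assumption, ignoring the R1/R2 split and the NGI renaming the real script handles, with hypothetical paths:

```python
# Illustrative sketch: concatenate one sample's fastq.gz files into a
# single output file. The glob pattern and file layout are assumptions.
import glob
import shutil

def merge_fastq(input_dir, out_path):
    with open(out_path, "wb") as out:
        for path in sorted(glob.glob(input_dir + "/*.fastq.gz")):
            with open(path, "rb") as fh:
                shutil.copyfileobj(fh, out)

merge_fastq("path/to/dir/with/inputfiles", "path/to/output/merged.fastq.gz")
```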
82166

83167

84168

85169
### project_status_extended.py
Collects information about the specified project from the filesystem of irma.
Without any arguments prints statistics for each sample, such as:
* Number of reads
* Coverage
@@ -138,21 +222,28 @@ Returns a summary of quota usage in Uppmax
* couchdb
* pprint

### Samplesheet_converter.py
Converts an Illumina samplesheet that contains Chromium 10X indexes for demultiplexing. Headers and lines with ordinary indexes are passed through without any change. Lines with Chromium 10X indexes are expanded into 4 lines, with one index in each line, and a suffix 'Sx' added at the end of the sample names.
#### Usage
`python main.py -i <inputfile> -o <outputfile> -x <indexlibrary>`
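
A minimal sketch of that expansion rule, with a hypothetical index-library entry standing in for the `-x <indexlibrary>` file:

```python
# Expand one samplesheet line carrying a Chromium 10X index set into four
# lines, one index per line, suffixing the sample name as described above.
# INDEX_LIBRARY is a hypothetical stand-in for the real index library file.
INDEX_LIBRARY = {"SI-GA-A1": ["GGTTTACT", "CTAAACGG", "TCGGCGTC", "AACCGTAA"]}

def expand_10x_line(sample_name, index_name):
    return ["{}_S{},{}".format(sample_name, i + 1, idx)
            for i, idx in enumerate(INDEX_LIBRARY[index_name])]

for line in expand_10x_line("P1234_101", "SI-GA-A1"):
    print(line)
```
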
### set_bioinforesponsible.py
Calls up the genologics LIMS directly in order to more quickly set a bioinformatics responsible.

###### Dependencies

* Genologics: lims, config

### use_undetermined.sh
Creates softlinks of undetermined for the specified flowcell and lane to be used in the analysis.
To be run on irma.
#### Usage
Usage: `use_undetermined.sh <flowcell> <lane> <sample>`
Example: `use_undetermined.sh 160901_ST-E00214_0087_BH33GHALXX 1 P4601_273`
#### Important
After running the script, don't forget to (re-)**ORGANIZE FLOWCELL**.
Then the analysis can be started.

### ZenDesk Attachments Backup
