Skip to content

Commit

Permalink
🔥 keep caller-specific fields (dynamic)
Browse files Browse the repository at this point in the history
  • Loading branch information
mflevine committed Dec 17, 2018
1 parent 429cdea commit 024d3d7
Show file tree
Hide file tree
Showing 2 changed files with 136 additions and 75 deletions.
8 changes: 7 additions & 1 deletion README.rst
Original file line number Diff line number Diff line change
@@ -1,8 +1,14 @@
Merge SV VCFs
==========
=============

This set of routines merges SV VCF files by calls, labeling calls by the callers that
made them in an INFO field, Callers=. Most of the work goes into normalizing
SV calls, which are treated as paired of breakpoints that are equal if they fall within
a user-specified window.

------------
Known Issues
------------

* INFO and FORMAT merged for multiple from same caller with only be for last record
* Individual Caller Genotype not kept
203 changes: 129 additions & 74 deletions mergesvvcf/mergedfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,39 +26,39 @@ def int_if_possible(val):
i = val
return i

def make_info_dict(callers, records, pos1, pos2):
""" generate 'median' results for info fields from all records """
callermap = {
"delly": {"fields": ["DV", "RV"], "tumor": 0},
"svaba": {"fields": ["SR", "DR"], "tumor": 1},
"gridss": {"fields": ["RP", "SR"], "tumor": 1},
"brass": {"fields": ["PS", "RC"], "tumor": 1},
"smoove": {"fields": ["SR", "PE"], "tumor": 0},
}
# fields = ['CHR2', 'END', 'SVTYPE', 'SVLEN'] + ["SR", "DR"] + ["DV", "RV"] + ["RP"]
fields = [x["fields"] for x in callermap.values()]
fields = [item for sublist in fields for item in sublist]
info = {}
for field in fields:
answers = []
for caller, record in zip(callers, records):
if caller in callermap.keys() and field in callermap[caller]["fields"]:
if field in record.format:
answers.append([caller,int_if_possible(record.samples[callermap[caller]["tumor"]][field])])
elif field in record.info:
answers.append([caller,int_if_possible(record.info[field])])
# elif field in record.format:
# answers.append([caller,int_if_possible(record.samples[callermap[caller]["tumor"]][field])])
nanswers = len(answers)
if nanswers > 0:
# sorted_answers = sorted(answers)
# # doesn't quite deal with even #s correctly - can't average strings
# median_pos = int(nanswers / 2)
# median_answer = sorted_answers[median_pos]
# if not median_answer == 0:
# info[field] = median_answer
for a in answers:
info[a[0] + "_" + field] = a[1]
# def make_info_dict(callers, records, pos1, pos2):
# """ generate 'median' results for info fields from all records """
# callermap = {
# "delly": {"fields": ["DV", "RV"], "tumor": 0},
# "svaba": {"fields": ["SR", "DR"], "tumor": 1},
# "gridss": {"fields": ["RP", "SR"], "tumor": 1},
# "brass": {"fields": ["PS", "RC"], "tumor": 1},
# "smoove": {"fields": ["SR", "PE"], "tumor": 0},
# }
# # fields = ['CHR2', 'END', 'SVTYPE', 'SVLEN'] + ["SR", "DR"] + ["DV", "RV"] + ["RP"]
# fields = [x["fields"] for x in callermap.values()]
# fields = [item for sublist in fields for item in sublist]
# info = {}
# for field in fields:
# answers = []
# for caller, record in zip(callers, records):
# if caller in callermap.keys() and field in callermap[caller]["fields"]:
# if field in record.format:
# answers.append([caller,int_if_possible(record.samples[callermap[caller]["tumor"]][field])])
# elif field in record.info:
# answers.append([caller,int_if_possible(record.info[field])])
# # elif field in record.format:
# # answers.append([caller,int_if_possible(record.samples[callermap[caller]["tumor"]][field])])
# nanswers = len(answers)
# if nanswers > 0:
# # sorted_answers = sorted(answers)
# # # doesn't quite deal with even #s correctly - can't average strings
# # median_pos = int(nanswers / 2)
# # median_answer = sorted_answers[median_pos]
# # if not median_answer == 0:
# # info[field] = median_answer
# for a in answers:
# info[a[0] + "_" + field] = a[1]

if 'SVTYPE' in info and info['SVTYPE'] in ['DUP', 'DUP:TANDEM', 'DEL', 'INV']:
if not 'SVLEN' in info:
Expand Down Expand Up @@ -118,38 +118,84 @@ def passed_variant(record):
"""Did this variant pass?"""
return "PASS" in list(record.filter) or len(list(record.filter)) == 0 or noFilter

def infoString(callers, infodict):
"""
Generate an INFO string from the INFO dictionary plus
the list of callers.
"""
info = {}
# fields = ['SVTYPE', 'SVLEN']
# fields = ["svaba_SR", "svaba_DR"] + ["delly_DV", "delly_RV"] + ["gridss_RP", "gridss_SR"]
# for field in fields:
for field in infodict.keys():
if field in infodict:
res = infodict[field]
if type(res) is list:
res = res[0]
info[field] = res
if output_ncallers:
info["NumCallers"] = int(len(set(callers)))
info["Callers"] = ",".join(list(set(callers)))
return info
# def infoString(callers, infodict):
# """
# Generate an INFO string from the INFO dictionary plus
# the list of callers.
# """
# info = {}
# # fields = ['SVTYPE', 'SVLEN']
# # fields = ["svaba_SR", "svaba_DR"] + ["delly_DV", "delly_RV"] + ["gridss_RP", "gridss_SR"]
# # for field in fields:
# for field in infodict.keys():
# if field in infodict:
# res = infodict[field]
# if type(res) is list:
# res = res[0]
# info[field] = res
# if output_ncallers:
# info["NumCallers"] = int(len(set(callers)))
# info["Callers"] = ",".join(list(set(callers)))
# return info

samplemap = {} # the samples in each file
infomap = {} # the info fields in each file
formatmap = {} # the format fields in each file
outsamples = set() # standardized samples

calldict = variantdict.variantmap(awindow=0, svwindow=slop)
for (infile, program) in zip(filenames, programs):
count = 0
try:
vcf_reader = pysam.VariantFile(infile)

# Add contigs from all files
for contig in sorted(vcf_reader.header.contigs):
try:
outvcf.header.contigs.add(contig)
except ValueError:
pass

outvcf.header.add_meta(
key="Caller",
items=[
("ID", program),
("File", infile)
]
)

# Add filters from all files
for hfilter in vcf_reader.header.filters.itervalues():
if hfilter.name not in ["PASS","LOWSUPPORT"]:
outvcf.header.filters.add("{}_{}".format(program,hfilter.name),None,None,hfilter.description)
# Add info fields from all files
infos = []
for info in vcf_reader.header.info.itervalues():
if info.name not in ["SVTYPE","SVLEN","END","CHR2","NumCallers"]:
outvcf.header.info.add("{}_{}".format(program,info.name),info.number,info.type,info.description)
infos.append(info.name)
infomap[program] = infos
# Add format fields from all files
formats = []
for hformat in vcf_reader.header.formats.itervalues():
num = None
if hformat.name == "GT":
num = "G"
continue # Genotype is special cant do multiple
outvcf.header.formats.add("{}_{}".format(program,hformat.name),num or hformat.number,hformat.type,hformat.description)
formats.append(hformat.name)
formatmap[program] = formats
# Collect samples all files
samples = {}
for meta in vcf_reader.header.records:
if meta.key == "SAMPLE" and "SampleName" in meta.keys():
samples[meta["ID"]] = meta["SampleName"]
if not samples:
for samp in vcf_reader.header.samples:
samples[samp] = samp.rsplit("/")[-1].replace(".bam","")
samplemap[program] = samples
outsamples.update(samples.values())

for record in vcf_reader.fetch():

if not passed_variant(record):
Expand All @@ -172,13 +218,9 @@ def infoString(callers, infodict):
else:
pass

outvcf.header.add_meta(
key="caller",
items=[
("ID", program),
("File", infile)
]
)
# Add samples
for samp in sorted(outsamples):
outvcf.header.samples.add(samp)

# Write the results in a master vcf file for the sample
outvcf.header.info.add("Callers", ".", "String", "Callers that made this call")
Expand All @@ -188,21 +230,9 @@ def infoString(callers, infodict):

outvcf.header.info.add("CHR2", "1", "String", "Chromosome for END coordinate in case of a translocation")
outvcf.header.info.add("END", "1", "Integer", "End position of the structural variant")

outvcf.header.info.add("SVTYPE", "1", "String", "Type of structural variant")
outvcf.header.info.add("SVLEN", "1", "String", "Length of structural variant")

outvcf.header.info.add("brass_PS", "1", "Integer", "Count of pairs that span this breakpoint")
outvcf.header.info.add("brass_RC", "1", "Integer", "Count of countributing reads")
outvcf.header.info.add("delly_DV", "1", "Integer", "# high-quality variant pairs")
outvcf.header.info.add("delly_RV", "1", "Integer", "# high-quality variant junction reads")
outvcf.header.info.add("gridss_RP", "1", "Integer", "Count of read pairs supporting breakpoint")
outvcf.header.info.add("gridss_SR", "1", "Integer", "Count of split reads supporting breakpoint")
outvcf.header.info.add("smoove_PE", "1", "Integer", "Number of paired-end reads supporting the variant")
outvcf.header.info.add("smoove_SR", "1", "Integer", "Number of split reads supporting the variant")
outvcf.header.info.add("svaba_DR", "1", "Integer", "Number of discordant-supported reads for this variant")
outvcf.header.info.add("svaba_SR", "1", "Integer", "Number of spanning reads for this variants")

print(outvcf.header, end="", file=outfile)

for variant in sorted(calldict, key=operator.itemgetter(0)):
Expand All @@ -223,7 +253,7 @@ def infoString(callers, infodict):
vcfrec.ref = allele[0]
vcfrec.alts = [allele[1]]
vcfrec.filter.add(filterstring)
vcfrec.info.__setitem__("Callers", ",".join(callers))
vcfrec.info.__setitem__("Callers", ",".join(list(set(callers))))
if output_ncallers:
vcfrec.info.__setitem__("NumCallers", str(len(set(callers))))
print(vcfrec, end="", file=outfile)
Expand All @@ -244,14 +274,39 @@ def infoString(callers, infodict):
vcfrec.ref = ref
vcfrec.alts = [alt]
vcfrec.filter.add(filterstring)
for key, val in sorted(infoString(callers, make_info_dict(callers, records, medianPos1, medianPos2)).items()):
vcfrec.info.__setitem__(key, val)
# for key, val in sorted(infoString(callers, make_info_dict(callers, records, medianPos1, medianPos2)).items()):
# vcfrec.info.__setitem__(key, val)
vcfrec.info.__setitem__("Callers", ",".join(list(set(callers))))
if output_ncallers:
vcfrec.info.__setitem__("NumCallers", len(set(callers)))
vcfrec.info.__setitem__("CHR2", avgloc2.chrom)
vcfrec.stop = avgloc2.pos
svtype = getSVTYPE(avgloc1.chrom, avgloc2.chrom, extend1, extend2)
vcfrec.info.__setitem__("SVTYPE", svtype)
if avgloc1.chrom == avgloc2.chrom:
vcfrec.info.__setitem__("SVLEN", str(avgloc2.pos - avgloc1.pos))
for caller, rec in recordscalled:
if noFilter:
for fil in rec.filter:
if not fil in [".","PASS"]:
vcfrec.filter.add("{}_{}".format(caller,fil))
for info in infomap[caller]:
try:
vcfrec.info.__setitem__("{}_{}".format(caller,info), rec.info[info])
except KeyError:
continue
except:
print(info,rec.info[info])
raise
for samp in samplemap[caller].items():
for form in formatmap[caller]:
try:
vcfrec.samples[samp[1]].__setitem__("{}_{}".format(caller,form),rec.samples[samp[0]][form])
except KeyError:
continue
except:
print(form,rec.samples[samp[0]][form])
raise
print(vcfrec, end="", file=outfile)
if debug:
for caller, rec in recordscalled:
Expand Down

0 comments on commit 024d3d7

Please sign in to comment.