Skip to content

Commit

Permalink
update
Browse files Browse the repository at this point in the history
  • Loading branch information
quentin0515 committed Sep 19, 2021
1 parent 9ca9b47 commit c668ebc
Show file tree
Hide file tree
Showing 18 changed files with 1,013,257 additions and 1,305 deletions.
51 changes: 51 additions & 0 deletions .ipynb_checkpoints/README-checkpoint.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# TT-Mars

TT-Mars: S**t**ructural Varian**t**s Assess**m**ent B**a**sed on Haplotype-**r**esolved A**s**semblies.
A sequence content based structural variants (SVs) validation method which aims to efficiently and accurately assess SV calls through assembly contigs and established alignment information.

## Requirement

Python 3 packages: pysam (https://github.com/pysam-developers/pysam).
Softwares: samtools (https://github.com/samtools/samtools), samliftover (https://github.com/mchaisso/mcutils).

## Positional parameters

1. Output directory
2. if hg38: if reference is hg38 (True/False). If False, TT-Mars will use hs37d5 as reference
3. centromere file: centromere_positions.bed
4. Assembly alignment bam file: assembly1.bam (haplotype-resolved)
5. Assembly alignment bam file: assembly2.bam (haplotype-resolved)
6. Average read depth: the average read depth of the read file
7. Read file: read.bam (for high-depth-read calls only, need to modify the method to get high-depth regions)
8. A callset file: callset.vcf.gz
9. Referemce genome sequence file: reference_genome.fasta
10. Assembly sequence files: assembly1.fasta/.fa (haplotype-resolved)
11. Assembly sequence files: assembly2.fasta/.fa (haplotype-resolved)
12. liftover file: assembly1_liftover.bed (provided)
13. liftover file: assembly2_liftover.bed (provided)

## Usage

python ttmars.py output_dir if_hg38 centromere_positions.bed assembly1.bam assembly2.bam avg_read_depth read.bam callset.vcf.gz assembly1.fasta assembly2.fasta assembly1_liftover.bed assembly2_liftover.bed

## Example Output

ttmars_res.txt:
(chr start end type relative_length relative_score validation_result)
chr1 893792 893827 DEL 1.03 3.18 True

## Available Data

### Liftover files and assemblies
| Samples | Reference Liftover Hap1 | Reference Liftover Hap2 | Assembly Liftover Hap1 | Assembly Liftover Hap2 |
| :----: | :----: | :----: | :----: | :----: |
| HG00096 | https://figshare.com/ndownloader/files/30817390 | https://figshare.com/ndownloader/files/30817384 | https://figshare.com/ndownloader/files/30817387 | https://figshare.com/ndownloader/files/30817381 |
| HG00171 | https://figshare.com/ndownloader/files/30817402 | https://figshare.com/ndownloader/files/30817396 | https://figshare.com/ndownloader/files/30817399 | https://figshare.com/ndownloader/files/30817393 |
| HG00513 | https://figshare.com/ndownloader/files/30817411 | https://figshare.com/ndownloader/files/30817405 | https://figshare.com/ndownloader/files/30817408 | https://figshare.com/ndownloader/files/30817414 |
| HG00731 | https://figshare.com/ndownloader/files/30817426 | https://figshare.com/ndownloader/files/30817420 | https://figshare.com/ndownloader/files/30817423 | https://figshare.com/ndownloader/files/30817417 |
| HG00732 | https://figshare.com/ndownloader/files/30817435 | https://figshare.com/ndownloader/files/30817429 | https://figshare.com/ndownloader/files/30817432 | https://figshare.com/ndownloader/files/30817438 |
| HG00864 | https://figshare.com/ndownloader/files/30817450 | https://figshare.com/ndownloader/files/30817444 | https://figshare.com/ndownloader/files/30817447 | https://figshare.com/ndownloader/files/30817441 |
| HG01114 | https://figshare.com/ndownloader/files/30817459 | https://figshare.com/ndownloader/files/30817453 | https://figshare.com/ndownloader/files/30817456 | https://figshare.com/ndownloader/files/30817462 |
| HG01505 | https://figshare.com/ndownloader/files/30817471 | https://figshare.com/ndownloader/files/30817465 | https://figshare.com/ndownloader/files/30817468 | https://figshare.com/ndownloader/files/30817474 |
| HG01596 | https://figshare.com/ndownloader/files/30817486 | https://figshare.com/ndownloader/files/30817480 | https://figshare.com/ndownloader/files/30817483 | https://figshare.com/ndownloader/files/30817477 |
| HG03009 | https://figshare.com/ndownloader/files/30817498 | https://figshare.com/ndownloader/files/30817492 | https://figshare.com/ndownloader/files/30817495 | https://figshare.com/ndownloader/files/30817489 |
48 changes: 48 additions & 0 deletions combine_dup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import csv
import sys

output_dir = sys.argv[1] + "/"
other_sv_res_file = output_dir+"ttmars_res.txt"
regdup_res_file = output_dir+"ttmars_regdup_res.txt"

with open(other_sv_res_file) as f:
reader = csv.reader(f, delimiter="\t")
other_sv_res = list(reader)
f.close()

with open(regdup_res_file) as f:
reader = csv.reader(f, delimiter="\t")
regdup_res = list(reader)
f.close()

sv_dict = {}
for rec in other_sv_res:
ref_name = rec[0]
sv_pos = int(rec[1])
sv_end = int(rec[2])
sv_type = rec[3]
sv_dict[(ref_name, int(sv_pos), int(sv_end), sv_type)] = [rec[4], rec[5], rec[6]]
for rec in regdup_res:
ref_name = rec[0]
sv_pos = int(rec[1])
sv_end = int(rec[2])
sv_type = rec[3]
if (ref_name, int(sv_pos), int(sv_end), sv_type) in sv_dict:
if rec[6] == 'True' and sv_dict[(ref_name, int(sv_pos), int(sv_end), sv_type)][2] == 'False':
sv_dict[(ref_name, int(sv_pos), int(sv_end), sv_type)] = [rec[4], rec[5], rec[6]]
else:
sv_dict[(ref_name, int(sv_pos), int(sv_end), sv_type)] = [rec[4], rec[5], rec[6]]

g = open(output_dir + "/ttmars_combined_res.txt", "w")
for key in sv_dict:
res = sv_dict[key]

g.write(str(key[0]) + "\t")
g.write(str(key[1]) + "\t")
g.write(str(key[2]) + "\t")
g.write(str(key[3]) + "\t")
g.write(str(res[0]) + "\t")
g.write(str(res[1]) + "\t")
g.write(str(res[2]))
g.write("\n")
g.close()
103 changes: 103 additions & 0 deletions compress_liftover.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
import csv
import sys
interval = 20

output_dir = sys.argv[1] + "/"
in_file_name = output_dir + sys.argv[2]
out_file_name = output_dir + sys.argv[3]

#consider reverse strand also
with open(in_file_name, "r") as file:
reader = csv.reader(file, delimiter="\t")
start_row = next(reader)
chr_name = start_row[4]
contig_name = start_row[0]
chr_pos = int(start_row[5])
contig_pos = int(start_row[1])
pre_chr_pos = chr_pos
pre_contig_pos = contig_pos
ctr = 1
forward_strand = True
out_file = open(out_file_name, "w")
out_file.write(chr_name + "\t" + contig_name + "\t")
out_file.write(str(chr_pos)+":"+str(contig_pos)+":")

for row in reader:
if (row[4] == chr_name) and (row[0] == contig_name) and (int(row[5]) - pre_chr_pos == interval) and \
(int(row[1]) - pre_contig_pos == interval) and forward_strand:
pre_chr_pos = int(row[5])
pre_contig_pos = int(row[1])
ctr += 1
elif (row[4] == chr_name) and (row[0] == contig_name) and (int(row[5]) - pre_chr_pos == interval) and \
(int(row[1]) - pre_contig_pos == -interval) and not forward_strand:
pre_chr_pos = int(row[5])
pre_contig_pos = int(row[1])
ctr += 1
else:
#if different chr or contig
if row[4] != chr_name or row[0] != contig_name:
#print(chr_name, contig_name)
out_file.write(str(ctr)+":")
if forward_strand:
out_file.write("+;")
else:
out_file.write("-;")
out_file.write("\n")
chr_name = row[4]
contig_name = row[0]
chr_pos = int(row[5])
contig_pos = int(row[1])
pre_chr_pos = chr_pos
pre_contig_pos = contig_pos
ctr = 1
forward_strand = True
out_file.write(chr_name + "\t" + contig_name + "\t")
out_file.write(str(chr_pos)+":"+str(contig_pos)+":")
elif int(row[5]) - pre_chr_pos != interval:
out_file.write(str(ctr)+":")
if forward_strand:
out_file.write("+;")
else:
out_file.write("-;")
chr_pos = int(row[5])
contig_pos = int(row[1])
pre_chr_pos = chr_pos
pre_contig_pos = contig_pos
out_file.write(str(chr_pos)+":"+str(contig_pos)+":")
ctr = 1
forward_strand = True
elif (int(row[1]) - pre_contig_pos != interval) and (int(row[1]) - pre_contig_pos != -interval):
out_file.write(str(ctr)+":")
if forward_strand:
out_file.write("+;")
else:
out_file.write("-;")
chr_pos = int(row[5])
contig_pos = int(row[1])
pre_chr_pos = chr_pos
pre_contig_pos = contig_pos
out_file.write(str(chr_pos)+":"+str(contig_pos)+":")
ctr = 1
forward_strand = True
elif (int(row[1]) - pre_contig_pos == interval) and not forward_strand:
out_file.write(str(ctr)+":")
out_file.write("-;")
chr_pos = int(row[5])
contig_pos = int(row[1])
pre_chr_pos = chr_pos
pre_contig_pos = contig_pos
out_file.write(str(chr_pos)+":"+str(contig_pos)+":")
ctr = 1
forward_strand = True
elif (int(row[1]) - pre_contig_pos == -interval) and forward_strand:
out_file.write(str(ctr)+":")
out_file.write("+;")
chr_pos = int(row[5])
contig_pos = int(row[1])
pre_chr_pos = chr_pos
pre_contig_pos = contig_pos
out_file.write(str(chr_pos)+":"+str(contig_pos)+":")
ctr = 1
forward_strand = False
#write last lines
out_file.close()
Loading

0 comments on commit c668ebc

Please sign in to comment.