# ClusterList.py

from collections import deque
from Cluster import *
from ClusterPair import *
#import pp
# from pp import *
import pysam
#from pysam import csamtools
import gc
#from memory_profiler import profile

from multiprocessing import Pool
import cPickle as pickle
import y_serial_v060 as y_serial


class ClusterList:
    """Cluster read pairs and pair fwd clusters with rev clusters.

    The clustering and pairing themselves are delegated to the module-level
    helpers (cluster_read_pairs_*, remove_overlapping_clusters and
    pair_clusters_by_bin); the methods below only differ in where the read
    pairs come from and in how the work is split up.
    """

    def __init__(self, read_pair_list):
        """Store the read pairs that the generate_* methods will cluster."""
        #list of AlignedReadPair objects
        self.read_pair_list = read_pair_list

        #list of Cluster Objects
        self.cluster_list = []

    #cluster the read pairs according to the interval defined by the non-TE mapped read
    def generate_clusters_parallel(self, verbose, num_CPUs, bin_size, psorted_bamfile_name, bed_file_handle, streaming, min_cluster_size,output_prefix):
        """Cluster the read pairs bin by bin and pair fwd/rev clusters per bin.

        Despite its name this method currently processes the bins one after
        the other in the calling process: the multiprocessing Pool that used
        to map pair_clusters_by_bin over the bins is disabled, so num_CPUs is
        only reported in the log output.

        Only bins that contain both fwd and rev clusters are processed;
        clusters of bins that exist in a single direction are not returned.

        self.read_pair_list is deleted once the clusters are built (to free
        memory), so this method can be called only once per instance.

        Returns:
            when streaming: '' -- the per-bin results are pickled to
            output_prefix + 'all_clusters.pkl' and the bed lines are written
            to bed_file_handle, which is then closed.
            otherwise: a list with one (cluster_pairs, unpaired_fwd_clusters,
            unpaired_rev_clusters) tuple per processed bin.
        """
        # sys is not imported at the top of this file; import it locally so
        # the getsizeof() debug output below cannot fail with a NameError.
        import sys

        ################ CLUSTER BY BIN #######################
        #cluster fwd intervals
        fwd_read_pairs = [read_pair for read_pair in self.read_pair_list if read_pair.interval_direction == "fwd"]
        fwd_clusters_by_bin = cluster_read_pairs_by_chr_and_bin(fwd_read_pairs, bin_size)
        print("********************* total fwd non-overlapping clusters found by bin: %d" % sum([len(chr_list) for chr_list in fwd_clusters_by_bin.values()]))

        ####DEBUG
        print("size of fwd_read_pairs list %s"%(sys.getsizeof(fwd_read_pairs)))
        del fwd_read_pairs
        gc.collect()

        #cluster rev intervals
        rev_read_pairs = [read_pair for read_pair in self.read_pair_list if read_pair.interval_direction == "rev"]
        rev_clusters_by_bin = cluster_read_pairs_by_chr_and_bin(rev_read_pairs, bin_size)
        print("********************* total rev non-overlapping clusters found by bin: %d" % sum([len(chr_list) for chr_list in rev_clusters_by_bin.values()]))

        ###DEBUG
        print("size of rev_read_pairs list %s"%(sys.getsizeof(rev_read_pairs)))
        print("size of read pair list initial %s"%(sys.getsizeof(self.read_pair_list)))
        del rev_read_pairs
        # the read pairs now live inside the clusters; drop the original list
        del self.read_pair_list
        gc.collect()
        ############################ END CLUSTER BY BIN ###########################

        print("Debug : len FWD = %d"%(len(fwd_clusters_by_bin)))
        print("Debug : len REV = %d"%(len(rev_clusters_by_bin)))
        print("size of fwd list %s"%(sys.getsizeof(fwd_clusters_by_bin)))
        print("size of rev list %s"%(sys.getsizeof(rev_clusters_by_bin)))

        # one job per bin that has clusters in both directions; the membership
        # test is done on the dict itself (O(1)) rather than on a key list
        input_arg_list = []
        for key in fwd_clusters_by_bin:
            if key in rev_clusters_by_bin:
                input_arg_list.append((key, fwd_clusters_by_bin[key], rev_clusters_by_bin[key], psorted_bamfile_name, verbose, bed_file_handle, streaming, min_cluster_size))

        ###DEBUG
        del fwd_clusters_by_bin
        del rev_clusters_by_bin
        gc.collect()

        print("sending %d jobs to %d processes" % (len(input_arg_list), num_CPUs))
        print("#DEBUG : ")
        print("size of input_arg_list %s"%(sys.getsizeof(input_arg_list)))

        # Pool(num_CPUs).map(pair_clusters_by_bin, input_arg_list) is disabled
        # (profiling); the jobs are run sequentially instead.
        all_clusters_by_bin = [pair_clusters_by_bin(job) for job in input_arg_list]

        # every per-bin result starts with (pairs, unpaired fwd, unpaired rev)
        total_pairs = sum([len(bin_result[0]) for bin_result in all_clusters_by_bin])
        total_fwd = sum([len(bin_result[1]) for bin_result in all_clusters_by_bin])
        total_rev = sum([len(bin_result[2]) for bin_result in all_clusters_by_bin])
        print("******************total fwd single clusters found: %d" %  total_fwd)
        print("******************total rev single clusters found: %d" %  total_rev)
        print("******************total cluster pairs found: %d" %  total_pairs)

        if not streaming:
            return all_clusters_by_bin

        # streaming: each result carries its bed lines as a fourth element
        bed_string = "\n".join([s for (p,f,r,s) in all_clusters_by_bin if s != ""])
        bed_file_handle.write(bed_string)
        bed_file_handle.close()
        with open(output_prefix+'all_clusters.pkl', 'wb') as output:
            ### saving the biggest object in a file to avoid os.fork later
            pickle.dump(all_clusters_by_bin, output, pickle.HIGHEST_PROTOCOL)
        return '' #all_clusters_by_bin

    def generate_clusters(self, verbose, psorted_bamfile_name, bed_file_handle, streaming, min_cluster_size):
        """Cluster all read pairs at once and pair fwd clusters with rev clusters.

        Non-parallel counterpart of generate_clusters_parallel: no binning is
        done. bed_file_handle is accepted for signature compatibility but is
        not used here; the bed lines are returned instead.

        Returns:
            (cluster_pairs, unpaired_fwd_clusters, unpaired_rev_clusters,
            bed_string). bed_string is only filled in when streaming is true;
            zygosity is only computed (from the bam file) when it is false.
        """
        #cluster fwd intervals
        fwd_read_pairs = [read_pair for read_pair in self.read_pair_list if read_pair.interval_direction == "fwd"]
        fwd_clusters = cluster_read_pairs_all(fwd_read_pairs)
        print("******************total fwd clusters found: %d" %  len(fwd_clusters))
        non_overlapping_fwd_clusters = remove_overlapping_clusters(fwd_clusters,min_cluster_size)
        print("******************total fwd non-overlapping clusters found: %d" %  len(non_overlapping_fwd_clusters))

        #cluster rev intervals
        rev_read_pairs = [read_pair for read_pair in self.read_pair_list if read_pair.interval_direction == "rev"]
        rev_clusters = cluster_read_pairs_all(rev_read_pairs)
        print("******************total rev clusters found: %d" % len(rev_clusters))
        non_overlapping_rev_clusters = remove_overlapping_clusters(rev_clusters,min_cluster_size)
        print("******************total rev non-overlapping clusters found: %d" %  len(non_overlapping_rev_clusters))

        # the bam file is only read to compute zygosity, i.e. when not
        # streaming (same rule as pair_clusters_by_bin)
        psorted_bamfile = None
        if not streaming:
            psorted_bamfile = pysam.Samfile(psorted_bamfile_name, "rb")

        #pair clusters by genomic location, keeping track of which indices have been paired, so that the unpaired ones can be picked out after
        cluster_pairs = []
        paired_fwd_indices = set()
        paired_rev_indices = set()
        bed_lines = []

        # rev clusters before the last match cannot overlap later fwd clusters,
        # so every inner scan resumes from there
        last_intersect = 0
        for fwd_index, fwd_cluster in enumerate(non_overlapping_fwd_clusters):
            for rev_index in range(last_intersect, len(non_overlapping_rev_clusters)):
                rev_cluster = non_overlapping_rev_clusters[rev_index]
                if fwd_cluster.is_overlapping_strict(rev_cluster):
                    last_intersect = rev_index
                    new_cluster_pair = ClusterPair(fwd_cluster, rev_cluster)
                    if not streaming:
                        # this used to reference an undefined proper_pair_bam
                        reads = psorted_bamfile.fetch(new_cluster_pair.get_chr(), new_cluster_pair.get_insertion_int_start(), new_cluster_pair.get_insertion_int_end())
                        new_cluster_pair.calc_zygosity(reads)
                    else:
                        # NOTE(review): the bed line is recorded even if the
                        # pair is rejected just below -- confirm this is wanted
                        bed_lines.append(new_cluster_pair.to_bed())
                    if new_cluster_pair.get_insertion_int_end() < new_cluster_pair.get_insertion_int_start():
                        print("cluster pair not paired!")
                    else:
                        cluster_pairs.append(new_cluster_pair)
                        paired_fwd_indices.add(fwd_index)
                        paired_rev_indices.add(rev_index)
                elif  fwd_cluster.intersection_end < rev_cluster.intersection_start:
                    # this rev cluster lies entirely after the fwd cluster
                    break

        #make lists of unpaired clusters
        unpaired_fwd_clusters = [cluster for index, cluster in enumerate(non_overlapping_fwd_clusters) if index not in paired_fwd_indices]
        unpaired_rev_clusters = [cluster for index, cluster in enumerate(non_overlapping_rev_clusters) if index not in paired_rev_indices]

        print("******************total cluster pairs found: %d" %  len(cluster_pairs))
        if verbose:
            # NOTE(review): this unpacks each ClusterPair as a (fwd, rev) pair
            # and indexes the clusters like lists of reads -- confirm that
            # ClusterPair and Cluster support this
            for (fwd_cluster, rev_cluster) in cluster_pairs:
                print("*************************cluster_pair:**************************************")
                print("fwd cluster:")
                print("cluster coordinates: %s %d %d" % (fwd_cluster[0].interval_chr, fwd_cluster[0].interval_start, fwd_cluster[-1].interval_end ))
                print(" ".join(read.str_int() for read in fwd_cluster))
                print(" ".join(read.str_TE_annot_list() for read in fwd_cluster))
                print("rev cluster:")
                print("cluster coordinates: %s %d %d" % (rev_cluster[0].interval_chr, rev_cluster[0].interval_start, rev_cluster[-1].interval_end ))
                print(" ".join(read.str_int() for read in rev_cluster))
                print(" ".join(read.str_TE_annot_list() for read in rev_cluster))

        return (cluster_pairs, unpaired_fwd_clusters, unpaired_rev_clusters, "\n".join(bed_lines))

    def generate_clusters_db(self,db,binsize,output_prefix,bam_file_name, verbose, bed_file_handle, streaming, min_cluster_size):
        """Cluster and pair the read pairs stored per bin in a y_serial database.

        The table names are read from the 'bin_list' entry of the database and
        are expected to contain '_fwd' or '_rev' after the bin name. For every
        bin present in both directions the read pairs are loaded, clustered
        and paired, and the per-bin result is pickled straight away (one
        pickle.dump per bin) to output_prefix + 'all_clusters.pkl'.

        The bed lines are written to bed_file_handle, which is then closed.
        binsize is accepted for signature compatibility but is not used here.

        Raises:
            ValueError: if a table name is neither a fwd nor a rev table.
        """
        print("Generating clusters for each bin")
        total_fwd_clusters = 0
        total_rev_clusters = 0
        total_pairs = 0

        read_pair_database = y_serial.Main(db)
        table_list=read_pair_database.select(0,'bin_list')

        fwd_bins = []
        rev_bins = []
        for tablename in table_list:
            if 'fwd' in tablename:
                fwd_bins.append(tablename.split('_fwd')[0])
            elif 'rev' in tablename:
                rev_bins.append(tablename.split('_rev')[0])
            else:
                print(tablename)
                # a bare raise was used here, which fails with an unrelated
                # error because no exception is being handled
                raise ValueError("table name %r is neither a fwd nor a rev bin table" % (tablename,))

        #bins that have read pairs in both directions
        common_bins = list(set(fwd_bins) & set(rev_bins))

        bed_chunks = []
        with open(output_prefix +'all_clusters.pkl' ,'wb') as of:
            for bin_key in common_bins:
                table_prefix = bin_key.replace('[','')

                #retrieve all fwd read pairs of this bin and cluster them
                fwd_rows = read_pair_database.selectdic(table_prefix+'_fwd','read_pairs')
                fwd_read_pairs = [v[2] for (k, v) in fwd_rows.items()]
                fwd_clusters=cluster_read_pairs_all(fwd_read_pairs)
                del fwd_read_pairs

                #same for the rev read pairs
                rev_rows = read_pair_database.selectdic(table_prefix+'_rev','read_pairs')
                rev_read_pairs = [v[2] for (k, v) in rev_rows.items()]
                rev_clusters=cluster_read_pairs_all(rev_read_pairs)
                del rev_read_pairs

                #pair the clusters in this bin and save the result iteratively in the pickled file
                tmp = pair_clusters_by_bin((bin_key, fwd_clusters, rev_clusters, bam_file_name, verbose, bed_file_handle, streaming, min_cluster_size))
                pickle.dump(tmp,of,pickle.HIGHEST_PROTOCOL)

                # pair_clusters_by_bin only returns the bed lines (fourth
                # element) when streaming
                if streaming:
                    bed_chunks.append(tmp[3] + '\n')
                total_fwd_clusters+=len(tmp[1])
                total_rev_clusters+=len(tmp[2])
                total_pairs+=len(tmp[0])

        #Get back to the caller with the all_cluster file already saved
        print("******************total fwd single clusters found: %d"%(total_fwd_clusters))
        print("******************total rev single clusters found: %d"%(total_rev_clusters))
        print("******************total cluster pairs found: %d"%(total_pairs))
        bed_file_handle.write("".join(bed_chunks))
        bed_file_handle.close()

        return


#@profile
def pair_clusters_by_bin(job):
    """Pair the fwd clusters of one bin with the rev clusters of the same bin.

    job is a single 8-item sequence (this keeps the function usable with
    Pool.map): (key, fwd_clusters, rev_clusters, bam_file_name, verbose,
    bed_file_handle, streaming, min_cluster_size). key is only used for
    logging and bed_file_handle is not used here.

    Both cluster lists are first reduced with remove_overlapping_clusters.
    When streaming is false the bam file is opened and the zygosity of each
    pair is computed from the reads of its insertion interval; when it is true
    the bed line of each pair is collected instead.

    Returns:
        streaming: (cluster_pairs, unpaired_fwd_clusters,
        unpaired_rev_clusters, bed_string), every bed line being preceded by
        a newline.
        otherwise: (cluster_pairs, unpaired_fwd_clusters,
        unpaired_rev_clusters).
    """
    # unpacked here rather than in the signature: tuple parameters are
    # Python-2-only syntax, and callers pass one positional tuple either way
    (key, fwd_clusters, rev_clusters, bam_file_name, verbose, bed_file_handle, streaming, min_cluster_size) = job

    print("processing cluster pairs on %s" % (key))
    non_overlapping_fwd_clusters = remove_overlapping_clusters(fwd_clusters,min_cluster_size)
    if verbose:
        print("non overlapping fwd clusters\t%d" % (len(non_overlapping_fwd_clusters)))
    non_overlapping_rev_clusters = remove_overlapping_clusters(rev_clusters,min_cluster_size)
    if verbose:
        print("non overlapping rev clusters\t%d" % (len(non_overlapping_rev_clusters)))

    # the bam file is only read to compute zygosity
    proper_pair_bam = None
    if not streaming:
        proper_pair_bam = pysam.Samfile(bam_file_name, "rb")

    #pair clusters by genomic location, keeping track of which indices have been paired, so that the unpaired ones can be picked out after
    cluster_pairs = []
    paired_fwd_indices = set()
    paired_rev_indices = set()
    bed_lines = []

    # rev clusters before the last match cannot overlap later fwd clusters,
    # so every inner scan resumes from there
    last_intersect = 0
    for fwd_index, fwd_cluster in enumerate(non_overlapping_fwd_clusters):
        for rev_index in range(last_intersect, len(non_overlapping_rev_clusters)):
            rev_cluster = non_overlapping_rev_clusters[rev_index]
            if fwd_cluster.is_overlapping_strict(rev_cluster):
                new_cluster_pair = ClusterPair(fwd_cluster, rev_cluster)
                last_intersect = rev_index
                if not streaming:
                    reads = proper_pair_bam.fetch(new_cluster_pair.get_chr(), new_cluster_pair.get_insertion_int_start(), new_cluster_pair.get_insertion_int_end())
                    new_cluster_pair.calc_zygosity(reads)
                else:
                    # NOTE(review): the bed line is recorded even if the pair
                    # is rejected just below -- confirm this is wanted
                    bed_lines.append(new_cluster_pair.to_bed())
                if new_cluster_pair.get_insertion_int_end() < new_cluster_pair.get_insertion_int_start():
                    print("cluster pair not paired!")
                else:
                    cluster_pairs.append(new_cluster_pair)
                    paired_fwd_indices.add(fwd_index)
                    paired_rev_indices.add(rev_index)
            elif  fwd_cluster.intersection_end < rev_cluster.intersection_start:
                # this rev cluster lies entirely after the fwd cluster
                break

    #make lists of unpaired clusters
    unpaired_fwd_clusters = [cluster for index, cluster in enumerate(non_overlapping_fwd_clusters) if index not in paired_fwd_indices]
    unpaired_rev_clusters = [cluster for index, cluster in enumerate(non_overlapping_rev_clusters) if index not in paired_rev_indices]

    if streaming:
        # same text as the former incremental concatenation: "\n" + line for
        # every line, and "" when there is none
        bed_string = "".join(["\n" + bed_line for bed_line in bed_lines])
        return (cluster_pairs, unpaired_fwd_clusters, unpaired_rev_clusters, bed_string)
    else:
        return (cluster_pairs, unpaired_fwd_clusters, unpaired_rev_clusters)

#helper functions
##@profile
def cluster_read_pairs_by_chr(read_pair_list):
    """Generate the maximal clusters, ie sets of overlapping read pairs, of every chromosome.

    note: these clusters can be themselves overlapping.

    read_pair_list is sorted in place (by chromosome, then by interval end).
    The read pairs must expose interval_chr, interval_start and interval_end.

    Returns a dictionary of lists of Cluster objects, one entry per
    chromosome; an empty dictionary when read_pair_list is empty.
    """
    #store a seperate cluster_list for each chromosome
    chr_cluster_lists = {}
    if not read_pair_list:
        return chr_cluster_lists

    #sort by chromosome, then by end position (same order as sorting by end and then stably by chromosome)
    read_pair_list.sort(key=lambda read_pair: (read_pair.interval_chr, read_pair.interval_end))

    #read_pair_Q stores the currently overlapping read pair intervals. It is
    #seeded with the first read pair, which is therefore skipped by the loop
    #(it used to be visited again and ended up twice in the first cluster)
    remaining_read_pairs = iter(read_pair_list)
    read_pair_Q = deque([next(remaining_read_pairs)])

    #clusters found so far on the current chromosome
    cluster_list = []

    for read_pair in remaining_read_pairs:
        queue_head = read_pair_Q[0]

        if read_pair.interval_chr != queue_head.interval_chr:
            #new chromosome: close the current cluster and the current chromosome, start afresh
            cluster_list.append(Cluster(list(read_pair_Q)))
            chr_cluster_lists[queue_head.interval_chr] = cluster_list
            cluster_list = []
            read_pair_Q.clear()
        elif read_pair.interval_start > queue_head.interval_end:
            #the interval no longer overlaps the oldest one: save the current cluster
            cluster_list.append(Cluster(list(read_pair_Q)))
            #and pop off the intervals that do not overlap the current one -- these cannot constitute another maximal cluster
            while read_pair_Q and read_pair.interval_start > read_pair_Q[0].interval_end:
                read_pair_Q.popleft()
        #in every case the current read pair joins the Q
        read_pair_Q.append(read_pair)

    #whatever is left in the Q is the last cluster of the last chromosome
    cluster_list.append(Cluster(list(read_pair_Q)))
    chr_cluster_lists[read_pair_Q[0].interval_chr] = cluster_list

    return chr_cluster_lists


#@profile
def _bin_bounds_for_position(position, bin_size):
    """Return (start, end) of the fixed-width bin that contains position (negative positions go to the first bin)."""
    bin_start = (max(position, 0) // bin_size) * bin_size
    return bin_start, bin_start + bin_size - 1


def cluster_read_pairs_by_chr_and_bin(read_pair_list, bin_size):
    """Generate the maximal clusters, ie sets of overlapping read pairs, grouped by genomic bin.

    note: these clusters can be themselves overlapping.

    read_pair_list is sorted in place (by chromosome, then by interval end).
    The read pairs must expose interval_chr, interval_start and interval_end.

    Bins are bin_size wide and keyed "<chr> <start>-<end>". A new bin is
    opened when a read pair that does not overlap the current cluster starts
    beyond the current bin (or on another chromosome); its coordinates are
    those of the bin containing that read pair's interval_start. The keys
    therefore depend only on coordinates, so the fwd and rev results of the
    same region get the same key.

    Returns a dictionary of lists of Cluster objects, one entry per bin; an
    empty dictionary when read_pair_list is empty.

    Raises:
        ValueError: if bin_size is not positive.
    """
    if bin_size <= 0:
        raise ValueError("bin_size must be positive, got %r" % (bin_size,))

    #store a seperate cluster_list for each bin
    bin_cluster_lists = {}
    if not read_pair_list:
        return bin_cluster_lists

    #sort by chromosome, then by end position (same order as sorting by end and then stably by chromosome)
    read_pair_list.sort(key=lambda read_pair: (read_pair.interval_chr, read_pair.interval_end))

    #the first read pair seeds the Q and the first bin, so the loop below must
    #not visit it again (it used to end up twice in the first cluster)
    remaining_read_pairs = iter(read_pair_list)
    first_read_pair = next(remaining_read_pairs)

    current_chr = first_read_pair.interval_chr
    current_bin_start, current_bin_end = _bin_bounds_for_position(first_read_pair.interval_start, bin_size)
    current_bin_key = "%s %d-%d" % (current_chr, current_bin_start, current_bin_end)

    #clusters found so far in the current bin
    current_bin_cluster_list = []

    #read_pair_Q stores the currently overlapping read pair intervals
    read_pair_Q = deque([first_read_pair])

    for read_pair in remaining_read_pairs:
        #if you can add the next interval to the current list of overlapping intervals, do so
        if read_pair.interval_chr == read_pair_Q[0].interval_chr and read_pair.interval_start <= read_pair_Q[0].interval_end:
            read_pair_Q.append(read_pair)

        #if the current read is from another chromosome, or from another bin, save the currently overlapping intervals as a cluster and close the bin
        elif read_pair.interval_chr != current_chr or read_pair.interval_start > current_bin_end:
            current_bin_cluster_list.append(Cluster(list(read_pair_Q)))
            bin_cluster_lists[current_bin_key] = current_bin_cluster_list

            #empty queue and current cluster list since we are starting with a new bin
            current_bin_cluster_list = []
            read_pair_Q.clear()
            read_pair_Q.append(read_pair)

            #the new bin is the one that actually contains this read pair
            current_chr = read_pair.interval_chr
            current_bin_start, current_bin_end = _bin_bounds_for_position(read_pair.interval_start, bin_size)
            current_bin_key = "%s %d-%d" % (current_chr, current_bin_start, current_bin_end)

        #otherwise, save the list of currently overlapping intervals as a cluster
        else:
            current_bin_cluster_list.append(Cluster(list(read_pair_Q)))
            #and pop off the intervals that do not overlap the current one -- these cannot constitute another maximal cluster
            while read_pair_Q and read_pair.interval_start > read_pair_Q[0].interval_end:
                read_pair_Q.popleft()
            #then add your current read to the Q
            read_pair_Q.append(read_pair)

    #whatever is left in the Q is the last cluster of the last bin
    current_bin_cluster_list.append(Cluster(list(read_pair_Q)))
    bin_cluster_lists[current_bin_key] = current_bin_cluster_list

    return bin_cluster_lists

#@profile
def cluster_read_pairs_all(read_pair_list):
    """Generate the maximal clusters, ie sets of overlapping read pairs, of the whole list.

    note: these clusters can be themselves overlapping.

    read_pair_list is sorted in place (by chromosome, then by interval end).
    The read pairs must expose interval_chr, interval_start and interval_end.

    Returns a single list of Cluster objects covering all chromosomes, in
    sorted order; an empty list when read_pair_list is empty.
    """
    #every cluster found so far
    cluster_list = []
    if not read_pair_list:
        return cluster_list

    #sort by chromosome, then by end position (same order as sorting by end and then stably by chromosome)
    read_pair_list.sort(key=lambda read_pair: (read_pair.interval_chr, read_pair.interval_end))

    #read_pair_Q stores the currently overlapping read pair intervals. It is
    #seeded with the first read pair, which is therefore skipped by the loop
    #(it used to be visited again and ended up twice in the first cluster)
    remaining_read_pairs = iter(read_pair_list)
    read_pair_Q = deque([next(remaining_read_pairs)])

    for read_pair in remaining_read_pairs:
        queue_head = read_pair_Q[0]

        if read_pair.interval_chr != queue_head.interval_chr:
            #new chromosome: close the current cluster and start afresh
            cluster_list.append(Cluster(list(read_pair_Q)))
            read_pair_Q.clear()
        elif read_pair.interval_start > queue_head.interval_end:
            #the interval no longer overlaps the oldest one: save the current cluster
            cluster_list.append(Cluster(list(read_pair_Q)))
            #and pop off the intervals that do not overlap the current one -- these cannot constitute another maximal cluster
            while read_pair_Q and read_pair.interval_start > read_pair_Q[0].interval_end:
                read_pair_Q.popleft()
        #in every case the current read pair joins the Q
        read_pair_Q.append(read_pair)

    #whatever is left in the Q is the last cluster
    cluster_list.append(Cluster(list(read_pair_Q)))

    return cluster_list


#@profile
def remove_overlapping_clusters(cluster_list,min_size=0):
    """Return the clusters that overlap neither their predecessor nor their successor.

    cluster_list is a list of cluster objects exposing cluster_start,
    cluster_end and num_reads, in the order produced by the clustering helpers
    (sorted by end position). Two consecutive clusters overlap unless the
    first one ends strictly before the second one starts; both are then
    discarded.

    Clusters with fewer than min_size reads are discarded as well, but they
    still count when deciding whether their neighbours are overlapped.

    The last cluster of the list is kept when it qualifies (it used to be
    dropped unconditionally), and an empty list yields an empty list.

    NOTE(review): only coordinates are compared, not chromosomes -- confirm
    that callers passing clusters of several chromosomes expect this.
    """
    non_overlapping_clusters = []
    if not cluster_list:
        return non_overlapping_clusters

    current_cluster = cluster_list[0]
    #whether current_cluster overlaps the cluster that precedes it
    current_cluster_is_overlapped = False

    for next_cluster in cluster_list[1:]:
        overlaps_next = not (current_cluster.cluster_end < next_cluster.cluster_start)
        #keep the current cluster only if it is free on both sides and big enough
        if not overlaps_next and not current_cluster_is_overlapped and current_cluster.num_reads >= min_size:
            non_overlapping_clusters.append(current_cluster)
        #update current to next; an overlap is symmetric, so next inherits the flag
        current_cluster = next_cluster
        current_cluster_is_overlapped = overlaps_next

    #the last cluster has no successor: it only has to be free of its predecessor
    if not current_cluster_is_overlapped and current_cluster.num_reads >= min_size:
        non_overlapping_clusters.append(current_cluster)

    return non_overlapping_clusters


def table_header(library_name,  bam_file_name, te_annot):
    """Return the explanatory header placed at the top of the cluster table.

    The header names the bam file and the TE annotation the table was built
    from, then describes the three kinds of lines (I, C and R) of the table.
    Every explanatory line starts with '#'; the three column-header lines
    start with I, C and R respectively.

    library_name is accepted for signature compatibility but is not used.
    """
    param_string = "#this table describes the read clusters identified in the bam file %s and corresponding to the transposon annotations in %s\n" % (bam_file_name, te_annot)

    # One entry per header line. The first entry and the "#span is defined"
    # entry used to lack their newline, which glued the following '#' comment
    # onto the same line.
    title_lines = (
        "#this table contains three types of lines:\n",
        "#** insertion lines: one per predicted insertion site, corresponding to a pair of overlapping clusters, one fwd, one rev\n",
        "I\tcluster_pair_ID\tlib\tchrom\tstart\tend\tnum_fwd_reads\tnum_rev_reads\tfwd_span\trev_span\tbest_sc_pos_st\tbest_sc_pos_end\tsc_pos_support\n",
        "#here the start and end are defined as the intersection of the intervals predicted by the leftmost forward read and the rightmost reverse read.\n\n",
        "#** cluster lines (two per insertion, one fwd and one rev):\n",
        "C\tcluster_pair_ID\tlib\tdirection\tstart\tend\tchrom\tnum_reads\tspan\n",
        "#span is defined as the range of start positions in the cluster. A span of 0 means that all the reads originate at the same start site, and are probably an artifact. \n",
        "#a span the size of the fragment length indicates good coverage. \n\n",
        "#**read lines (fwd reads consitute the fwd clusters, rev reads the rev clusters)\n",
        "#the reads that are \"anchor\" are those that consitute the cluster, the reads that are \"mate\" are the anchors' mates, which map to a TE\n",
        "R\tcluster_pair_ID\tlib\tdirection\tinterval_start\tinterval_end\tchrom\tstatus\tbam_line\n\n",
        "#this file is meant to be easily manipulated with tools like grep and sed, for example\n",
        "#grep ^C table_file > cluster_pairs.out\n",
        "#will give you a list of all the clusters pairs\n",
        "#the R lines sharing the same ID all come from the same cluster pair, with itself the same ID, corresponding to the the insertion of that same ID, thus \n",
        "#grep -w cluster_pair_ID_X table_file > predicted_insertion_X.table.out\n",
        "#will give you the definition line of the the predicted insertion site, the fwd and rev clusters comprising insertion X and the description of the reads that constitute them. \n",
    )

    return param_string + "".join(title_lines)