Skip to content

Commit eb48143

Browse files
committed
Got the basic functions for producing more realistic ancestral reference
assemblies all working.
1 parent 5859912 commit eb48143

File tree

5 files changed

+275
-111
lines changed

5 files changed

+275
-111
lines changed

impl/referenceProblem2.c

Lines changed: 144 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -318,6 +318,34 @@ void reference_makeNewInterval(reference *ref, int64_t firstNode, int64_t lastNo
318318
stList_append(ref->referenceIntervals, rTF);
319319
}
320320

321+
void reference_removeIntervals(reference *ref, stSortedSet *firstNodesOfIntervalsToRemove) {
322+
stList *updatedReferenceIntervals = stList_construct();
323+
for(int64_t i=0;i<stList_length(ref->referenceIntervals); i++) {
324+
int64_t node = reference_getFirstOfInterval(ref, i);
325+
referenceTerm *rT = reference_getTerm(ref, node);
326+
assert(rT != NULL);
327+
stIntTuple *j = stIntTuple_construct1(node);
328+
if(stSortedSet_search(firstNodesOfIntervalsToRemove, j) == NULL) { //It is not in the list of intervals to delete,
329+
//so we add it to the list of intervals to keep
330+
stList_append(updatedReferenceIntervals, reference_getTerm(ref, node));
331+
}
332+
else {
333+
//Release the term from the node array
334+
assert(rT->pTerm == NULL);
335+
while(rT != NULL) {
336+
ref->nodesInGraph[llabs(rT->node)-1] = NULL;
337+
referenceTerm *rTP = rT;
338+
rT = rT->nTerm;
339+
assert(rT == NULL || rT->pTerm == rTP);
340+
free(rTP);
341+
}
342+
}
343+
stIntTuple_destruct(j);
344+
}
345+
stList_destruct(ref->referenceIntervals);
346+
ref->referenceIntervals = updatedReferenceIntervals;
347+
}
348+
321349
void reference_insertNode(reference *ref, int64_t pNode, int64_t node) {
322350
referenceTerm *rT = st_malloc(sizeof(referenceTerm)), *rTP;
323351
rT->node = node;
@@ -459,33 +487,60 @@ void reference_log(reference *ref) {
459487
}
460488
}
461489

490+
static void setFirstPointer(referenceTerm *term, referenceTerm *firstTerm) {
491+
assert(firstTerm->pTerm == NULL);
492+
assert(firstTerm->first == firstTerm);
493+
do {
494+
term->first = firstTerm;
495+
term = term->nTerm;
496+
} while(term != NULL);
497+
}
498+
499+
void reference_translocateIntervals(reference *ref, int64_t pNode1, int64_t nNode2) {
500+
referenceTerm *pNode1Term = reference_getTerm(ref, pNode1);
501+
assert(pNode1Term != NULL);
502+
referenceTerm *nNode1Term = pNode1Term->nTerm;
503+
assert(nNode1Term != NULL); //pNode1 is not a stub end.
504+
referenceTerm *nNode2Term = reference_getTerm(ref, nNode2);
505+
referenceTerm *pNode2Term = nNode2Term->pTerm;
506+
assert(pNode2Term != NULL); //nNode2 is not a stub end.
507+
//The critical translocation lines
508+
pNode1Term->nTerm = nNode2Term;
509+
nNode2Term->pTerm = pNode1Term;
510+
pNode2Term->nTerm = nNode1Term;
511+
nNode1Term->pTerm = pNode2Term;
512+
//Correct the "first" pointers.
513+
assert(nNode1Term->first == pNode1Term->first);
514+
assert(nNode2Term->first == pNode2Term->first);
515+
setFirstPointer(nNode1Term, pNode2Term->first);
516+
setFirstPointer(nNode2Term, pNode1Term->first);
517+
}
518+
462519
void reference_splitInterval(reference *ref, int64_t pNode, int64_t stub1, int64_t stub2) {
463520
assert(reference_getNext(ref, pNode) != INT64_MAX);
464521
assert(!reference_inGraph(ref, stub1));
465522
assert(!reference_inGraph(ref, stub2));
466523
assert(stub1 != stub2);
467524
reference_makeNewInterval(ref, stub2, stub1);
468-
referenceTerm *pNodeTerm = reference_getTerm(ref, pNode);
469-
assert(pNodeTerm != NULL);
470-
referenceTerm *nNodeTerm = pNodeTerm->nTerm;
471-
assert(nNodeTerm != NULL); //Is not a stub end.
472-
referenceTerm *stub1Term = reference_getTerm(ref, stub1);
473-
referenceTerm *stub2Term = stub1Term->pTerm;
474-
assert(stub2Term != NULL && stub2Term == reference_getTerm(ref, stub2));
475-
assert(stub1Term->nTerm == NULL);
476-
assert(stub2Term->pTerm == NULL);
477-
pNodeTerm->nTerm = stub1Term;
478-
stub1Term->pTerm = pNodeTerm;
479-
stub2Term->nTerm = nNodeTerm;
480-
nNodeTerm->pTerm = stub2Term;
481-
assert(stub1Term->first == stub2Term);
482-
stub1Term->first = pNodeTerm->first;
483-
assert(nNodeTerm->first == pNodeTerm->first);
484-
do {
485-
nNodeTerm->first = stub2Term;
486-
nNodeTerm = nNodeTerm->nTerm;
487-
} while(nNodeTerm != NULL);
488-
assert(stub2Term->first == stub2Term);
525+
reference_translocateIntervals(ref, pNode, stub1);
526+
}
527+
528+
/*
529+
* Returns the integer value of the absolute highest valued node in the reference.
530+
*/
531+
int64_t reference_getMaximumNode(reference *ref) {
532+
int64_t maxNode = INT64_MIN;
533+
for(int64_t interval=0; interval<reference_getIntervalNumber(ref); interval++) {
534+
int64_t node = reference_getFirstOfInterval(ref, interval);
535+
while(node != INT64_MAX) {
536+
if(llabs(node) > maxNode) {
537+
maxNode = llabs(node);
538+
}
539+
node = reference_getNext(ref, node);
540+
}
541+
}
542+
assert(maxNode != INT64_MAX);
543+
return maxNode;
489544
}
490545

491546
/*
@@ -774,6 +829,7 @@ static insertPoint *connectedNodes_popBestInsert(connectedNodes *cN, refAdjList
774829
cN->misses++;
775830
free(iP);
776831
}
832+
return NULL;
777833
}
778834

779835
/*
@@ -1049,46 +1105,74 @@ void reorderReferenceToAvoidBreakpoints(refAdjList *aL, reference *ref) {
10491105
}
10501106
}
10511107

1052-
void translocateInterval(reference *ref, int64_t n, int64_t length, int64_t pNode) {
1053-
assert(pNode != INT64_MAX);
1054-
assert(n != INT64_MAX);
1055-
int64_t i=0;
1056-
while(i++<length) {
1057-
int64_t m = reference_getNext(ref, n);
1058-
assert(m != INT64_MAX);
1059-
reference_removeNode(ref, n);
1060-
reference_insertNode(ref, pNode, n);
1061-
pNode = n;
1062-
n = m;
1063-
}
1064-
}
1065-
1066-
int64_t getShortestInterval(reference *ref) {
1067-
int64_t shortestInterval = INT64_MAX;
1068-
int64_t shortestIntervalLength = INT64_MAX;
1069-
for(int64_t i=0; i<reference_getIntervalNumber(ref); i++) {
1070-
int64_t n = reference_getFirstOfInterval(ref, i);
1071-
int64_t length = reference_getRemainingIntervalLength(ref, n);
1072-
if(length == 0) {
1073-
return n;
1108+
stList *splitReferenceAtIndicatedLocations(reference *ref, bool (*refSplitFn)(int64_t, reference *, void *), void *extraArgs) {
1109+
int64_t indexOfFirstNewStub = reference_getMaximumNode(ref)+1; //The new stubs need to be unique in the reference
1110+
assert(indexOfFirstNewStub < INT64_MAX);
1111+
assert(!reference_inGraph(ref, indexOfFirstNewStub));
1112+
stList *newStubs = stList_construct3(0, (void (*)(void *))stIntTuple_destruct);
1113+
for(int64_t interval=reference_getIntervalNumber(ref)-1; interval>=0; interval--) { //Iterate over only the old intervals, not those involving new stubs.
1114+
int64_t pNode = reference_getFirstOfInterval(ref, interval);
1115+
int64_t node = reference_getNext(ref, pNode);
1116+
while(node != INT64_MAX) {
1117+
if(refSplitFn(pNode, ref, extraArgs)) { //Determine if a split is needed.
1118+
int64_t newStub1 = indexOfFirstNewStub++;
1119+
int64_t newStub2 = indexOfFirstNewStub++;
1120+
stList_append(newStubs, stIntTuple_construct1(newStub1));
1121+
stList_append(newStubs, stIntTuple_construct1(-newStub2)); //Invert the sign, because we use signs to refer to sides of a node
1122+
reference_splitInterval(ref, pNode, newStub1, newStub2);
1123+
}
1124+
pNode = node;
1125+
node = reference_getNext(ref, pNode);
1126+
}
1127+
}
1128+
return newStubs;
1129+
}
1130+
1131+
static void removeStub(stSortedSet *extraStubNodesSet, int64_t node) {
1132+
stIntTuple *stub = stIntTuple_construct1(node);
1133+
assert(stSortedSet_search(extraStubNodesSet, stub) != NULL);
1134+
stSortedSet_remove(extraStubNodesSet, stub);
1135+
stIntTuple_destruct(stub);
1136+
}
1137+
1138+
stList *remakeReferenceIntervals(reference *ref, stList *referenceIntervalsToPreserve, stList *extraStubNodes) {
1139+
stSortedSet *extraStubNodesSet = stList_getSortedSet(extraStubNodes, (int (*)(const void *, const void *))stIntTuple_cmpFn);
1140+
stSortedSet *firstNodesOfIntervalsToRemove = stSortedSet_construct3((int (*)(const void *, const void *))stIntTuple_cmpFn, (void (*)(void *))stIntTuple_destruct);
1141+
for(int64_t i=0; i<stList_length(referenceIntervalsToPreserve); i++) {
1142+
stIntTuple *intervalToPreserve = stList_get(referenceIntervalsToPreserve, i);
1143+
int64_t startNode = stIntTuple_get(intervalToPreserve, 0);
1144+
assert(reference_getFirst(ref, startNode) == startNode);
1145+
int64_t endNode = stIntTuple_get(intervalToPreserve, 1);
1146+
assert(reference_getLast(ref, endNode) == endNode);
1147+
int64_t nodeAdjacentStartNode = reference_getLast(ref, startNode);
1148+
int64_t nodeAdjacentEndNode = reference_getFirst(ref, endNode);
1149+
if(nodeAdjacentStartNode != endNode) { //We have identified a case where we need to scaffold across a break.
1150+
assert(llabs(nodeAdjacentStartNode) != llabs(endNode));
1151+
//void reference_translocateIntervals(reference *ref, int64_t pNode1, int64_t nNode2)
1152+
reference_translocateIntervals(ref, reference_getPrevious(ref, nodeAdjacentStartNode), reference_getNext(ref, nodeAdjacentEndNode));
1153+
//Check we have the correct connectivity
1154+
assert(reference_getLast(ref, startNode) == endNode);
1155+
assert(reference_getFirst(ref, endNode) == startNode);
1156+
assert(reference_getLast(ref, nodeAdjacentEndNode) == nodeAdjacentStartNode);
1157+
assert(reference_getNext(ref, nodeAdjacentEndNode) == nodeAdjacentStartNode);
1158+
assert(reference_getFirst(ref, nodeAdjacentStartNode) == nodeAdjacentEndNode);
1159+
assert(reference_getPrevious(ref, nodeAdjacentStartNode) == nodeAdjacentEndNode);
1160+
//Append to list of stub intervals to delete.
1161+
assert(reference_getFirst(ref, nodeAdjacentEndNode) == nodeAdjacentEndNode);
1162+
stSortedSet_insert(firstNodesOfIntervalsToRemove, stIntTuple_construct1(nodeAdjacentEndNode));
1163+
//Delete the stubs
1164+
removeStub(extraStubNodesSet, nodeAdjacentStartNode);
1165+
removeStub(extraStubNodesSet, -nodeAdjacentEndNode); //The minus sign is to refer to the 3' end of the node.
10741166
}
1075-
if(length < shortestIntervalLength) {
1076-
shortestInterval = n;
1077-
shortestIntervalLength = length;
1078-
}
1079-
}
1080-
return shortestInterval;
1081-
}
1082-
1083-
void reorderToAvoidOverlargeChromosome(reference *ref, bool (*tooLarge)(reference *, int64_t n)) {
1084-
for(int64_t i=0; i<reference_getIntervalNumber(ref); i++) {
1085-
int64_t n = reference_getFirstOfInterval(ref, i);
1086-
int64_t length = reference_getRemainingIntervalLength(ref, n);
1087-
if(tooLarge(ref, n) && length > 2) {
1088-
int64_t m = getShortestInterval(ref);
1089-
if(m != n) {
1090-
translocateInterval(ref, reference_getNext(ref, n), length/2, m);
1091-
}
1167+
else {
1168+
assert(nodeAdjacentEndNode == startNode);
10921169
}
10931170
}
1171+
//Remove the old stub intervals
1172+
reference_removeIntervals(ref, firstNodesOfIntervalsToRemove);
1173+
stSortedSet_destruct(firstNodesOfIntervalsToRemove);
1174+
//Revise list of extra stub nodes.
1175+
extraStubNodes = stSortedSet_getList(extraStubNodesSet);
1176+
stSortedSet_destruct(extraStubNodesSet);
1177+
return extraStubNodes;
10941178
}

inc/stReferenceProblem.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88
#ifndef REFERENCEPROBLEM_H_
99
#define REFERENCEPROBLEM_H_
1010

11+
#include "sonLib.h"
12+
1113
stList *makeReferenceGreedily(stList *stubs, stList *chains,
1214
float *z, int64_t nodeNumber, double *totalScore, bool fast);
1315

inc/stReferenceProblem2.h

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,12 @@ void reference_log(reference *ref);
107107
//Splits an interval into two, making the existing interval end with stub1, and the new interval (which will be last interval) start with stub2.
108108
void reference_splitInterval(reference *ref, int64_t pNode, int64_t stub1, int64_t stub2);
109109

110+
//Gets the maximum value of a node in the reference. Makes it easy to generate a unique new node.
111+
int64_t reference_getMaximumNode(reference *ref);
112+
113+
//Remove intervals, input is set of first nodes of intervals. The intervals containing these first nodes are removed, including all members of the interval.
114+
void reference_removeIntervals(reference *ref, stSortedSet *firstNodesOfIntervalsToRemove);
115+
110116
/*
111117
* Reference algorithms
112118
*/
@@ -135,6 +141,16 @@ int64_t getBadAdjacencyCount(refAdjList *aL, reference *ref);
135141
/*
136142
* Breaks up chromosomes.
137143
*/
138-
void reorderToAvoidOverlargeChromosome(reference *ref, bool (*tooLarge)(reference *, int64_t n));
144+
145+
/*
146+
* Splits the reference up at any adjacency that refSplitFn returns non-zero for.
147+
*/
148+
stList *splitReferenceAtIndicatedLocations(reference *ref, bool (*refSplitFn)(int64_t, reference *, void *), void *extraArgs);
149+
150+
/*
151+
* Rejoins reference intervals that are specified by referenceIntervalsToPreserve. Returns a modified list of extraStubNodes with
152+
* unneeded stubs removed.
153+
*/
154+
stList *remakeReferenceIntervals(reference *ref, stList *referenceIntervalsToPreserve, stList *extraStubNodes);
139155

140156
#endif /* REFERENCEPROBLEM2_H_ */

tests/allTests.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,10 @@ CuSuite* referenceProblem2TestSuite(void);
1616
int referenceRunAllTests(void) {
1717
CuString *output = CuStringNew();
1818
CuSuite* suite = CuSuiteNew();
19-
CuSuiteAddSuite(suite, matchingAlgorithmsTestSuite());
19+
/*CuSuiteAddSuite(suite, matchingAlgorithmsTestSuite());
2020
CuSuiteAddSuite(suite, cyclesConstrainedMatchingAlgorithmsTestSuite());
2121
CuSuiteAddSuite(suite, referenceProblemTestSuite());
22-
CuSuiteAddSuite(suite, referenceProblemExamplesTestSuite());
22+
CuSuiteAddSuite(suite, referenceProblemExamplesTestSuite());*/
2323
CuSuiteAddSuite(suite, referenceProblem2TestSuite());
2424

2525
CuSuiteRun(suite);

0 commit comments

Comments
 (0)