Skip to content

Commit 200fadd

Browse files
committed
added sampling and raw data output to TnfDistance
1 parent ec132b3 commit 200fadd

File tree

2 files changed

+151
-9
lines changed

2 files changed

+151
-9
lines changed

apps/TnfDistance.cpp

Lines changed: 86 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -77,9 +77,11 @@ class _TnfDistanceBaseOptions : public OptionsBaseInterface {
7777
interDistanceFile(),
7878
intraInterFile(),
7979
clusterFile(),
80+
includeIntraInterDataFile(false),
8081
likelihoodBins(100),
8182
windowSize(10000),
8283
windowStep(1000),
84+
maxSamples(MAX_I64),
8385
clusterThresholdDistance(.175),
8486
referenceFiles(),
8587
distanceFormula(DistanceFormula::EUCLIDEAN) {}
@@ -95,6 +97,9 @@ class _TnfDistanceBaseOptions : public OptionsBaseInterface {
9597
string &getIntraInterFile() {
9698
return intraInterFile;
9799
}
100+
bool &getIncludeIntraInterDataFile() {
101+
return includeIntraInterDataFile;
102+
}
98103
int &getLikelihoodBins() {
99104
return likelihoodBins;
100105
}
@@ -107,6 +112,9 @@ class _TnfDistanceBaseOptions : public OptionsBaseInterface {
107112
int &getWindowStep() {
108113
return windowStep;
109114
}
115+
long &getMaxSamples() {
116+
return maxSamples;
117+
}
110118
float &getClusterThreshold() {
111119
return clusterThresholdDistance;
112120
}
@@ -139,12 +147,16 @@ class _TnfDistanceBaseOptions : public OptionsBaseInterface {
139147

140148
("intra-inter-file", po::value<string>()->default_value(intraInterFile), "output two discrete likelihood functions of intra vs inter-distances between all windows fasta between separate files. Assumes intra are calculated within a file and inter between files")
141149

150+
("include-intra-inter-data-file", po::value<bool>()->default_value(includeIntraInterDataFile), "if set then intra-inter-file.data will be created too (VERY LARGE) one per thread")
151+
142152
("likelihood-bins", po::value<int>()->default_value(likelihoodBins), "How many bins to create for the discrete likelihood functions")
143153

144154
("window-size", po::value<int>()->default_value(windowSize), "size of adjacent intra/inter-distance windows")
145155

146156
("window-step", po::value<int>()->default_value(windowStep), "step size of adjacent intra/inter-distance windows")
147157

158+
("max-samples", po::value<long>()->default_value(maxSamples), "maximum number of samples of intra and inter distances")
159+
148160
("cluster-file", po::value<string>()->default_value(clusterFile), "cluster output filename")
149161

150162
("cluster-threshold-distance", po::value<float>()->default_value(clusterThresholdDistance), "Euclidean distance threshold for clusters")
@@ -165,9 +177,11 @@ class _TnfDistanceBaseOptions : public OptionsBaseInterface {
165177
setOpt("kmer-size", kmerSize);
166178
setOpt("inter-distance-file", interDistanceFile);
167179
setOpt("intra-inter-file", intraInterFile);
180+
setOpt("include-intra-inter-data-file", includeIntraInterDataFile);
168181
setOpt("likelihood-bins", likelihoodBins);
169182
setOpt("window-size", windowSize);
170183
setOpt("window-step", windowStep);
184+
setOpt("max-samples", maxSamples);
171185
setOpt("cluster-file", clusterFile);
172186
setOpt("cluster-threshold-distance", clusterThresholdDistance);
173187
setOpt2("reference-file", referenceFiles);
@@ -185,8 +199,10 @@ class _TnfDistanceBaseOptions : public OptionsBaseInterface {
185199
private:
186200
int kmerSize;
187201
string interDistanceFile, intraInterFile, clusterFile;
202+
bool includeIntraInterDataFile;
188203
int likelihoodBins;
189204
int windowSize, windowStep;
205+
long maxSamples;
190206
float clusterThresholdDistance;
191207
FileListType referenceFiles;
192208
DistanceFormula::Enum distanceFormula;
@@ -604,14 +620,32 @@ int main(int argc, char *argv[]) {
604620
long window = TnfDistanceBaseOptions::getOptions().getWindowSize();
605621
long step = TnfDistanceBaseOptions::getOptions().getWindowStep();
606622
int numBins = TnfDistanceBaseOptions::getOptions().getLikelihoodBins();
623+
bool dataFile = TnfDistanceBaseOptions::getOptions().getIncludeIntraInterDataFile();
607624

608-
GH intraHist(0.0, 1.0, numBins, true), interHist(0.0, 1.0, numBins, true);
625+
float maxDistance = TNF::distanceFormula == DistanceFormula::EUCLIDEAN ? sqrt(2.0) : 1.0;
626+
GH intraHist(0.0, maxDistance, numBins, true), interHist(0.0, maxDistance, numBins, true);
609627

610628
std::vector< TNFS > interTnfs;
611629
interTnfs.resize( reads.getReadFileNum( reads.getSize() - 1 ));
612630

613631
LOG_VERBOSE(1, "Creating intra cluster TNF distance histogram for " << interTnfs.size() << " different sets.");
614632
LOG_DEBUG(1, MemoryUtils::getMemoryUsage());
633+
long maxSamples = TnfDistanceBaseOptions::getOptions().getMaxSamples();
634+
long maxIntraSamples = maxSamples / inputs.size();
635+
LongRand randGen[ omp_get_max_threads() ];
636+
ostream *dataPtr[omp_get_max_threads()];
637+
OfstreamMap om(intraInterFile, "");
638+
#pragma omp parallel
639+
{
640+
int threadId = omp_get_thread_num();
641+
randGen[threadId] = LongRand();
642+
if (dataFile) {
643+
dataPtr[threadId] = & om.getOfstream(".data." + boost::lexical_cast<string>( omp_get_thread_num() ));
644+
*dataPtr[threadId] << "Distance\tIntra\tInter\n";
645+
} else {
646+
dataPtr[threadId] = NULL;
647+
}
648+
}
615649

616650
ReadSet shrededReads;
617651
int fileIdx = 0;
@@ -626,11 +660,33 @@ int main(int argc, char *argv[]) {
626660
assert(fileIdx < (int) interTnfs.size());
627661
interTnfs[fileIdx] = buildIntraTNFs(shrededReads, true);
628662
TNFS &intraTnfs = interTnfs[fileIdx];
663+
LowerTriangle<false, long> lt(intraTnfs.size());
664+
long numSamples = lt.getNumValues();
665+
assert(numSamples == (long) ( intraTnfs.size() * (intraTnfs.size() - 1) / 2) );
666+
667+
if (numSamples < (long) maxIntraSamples) {
668+
// calculate everything
629669
#pragma omp parallel for schedule(dynamic,1)
630-
for(long i = 0 ; i < (long) intraTnfs.size(); i++) {
631-
for(long j = 0 ; j < i ; j++) { // lower triangle only
632-
float dist = intraTnfs[i].getDistance(intraTnfs[j]);
670+
for(long i = 0 ; i < (long) intraTnfs.size(); i++) {
671+
for(long j = 0 ; j < i ; j++) { // lower triangle only
672+
float dist = intraTnfs[i].getDistance(intraTnfs[j]);
673+
intraHist.observe(dist);
674+
if (dataPtr[omp_get_thread_num()] != NULL)
675+
*dataPtr[omp_get_thread_num()] << dist << "\t1\t0\n";
676+
}
677+
}
678+
} else {
679+
// subsample the full data set
680+
LOG_DEBUG(1, "Subsampling all possible: " << numSamples << " to " << numSamples);
681+
682+
#pragma omp parallel for
683+
for(long k = 0; k < (long) maxIntraSamples; k++) {
684+
long x,y, indexSample = randGen[omp_get_thread_num()].getRand() % numSamples;
685+
lt.getXY(indexSample, x, y);
686+
float dist = intraTnfs[x].getDistance(intraTnfs[y]);
633687
intraHist.observe(dist);
688+
if (dataPtr[omp_get_thread_num()] != NULL)
689+
*dataPtr[omp_get_thread_num()] << dist << "\t1\t0\n";
634690
}
635691
}
636692

@@ -641,28 +697,50 @@ int main(int argc, char *argv[]) {
641697
}
642698
}
643699

700+
LOG_VERBOSE(1, "intra-histogram totalCount: " << intraHist.getTotalCount() << " outliers: " << intraHist.getOutlierCount());
644701
LOG_VERBOSE(1, "Creating inter cluster TNF distance histogram.");
645702
LOG_DEBUG(1, MemoryUtils::getMemoryUsage());
646703

704+
long maxInterSamples = maxSamples / (inputs.size() * (inputs.size()-1) / 2);
705+
647706
#pragma omp parallel for schedule(dynamic,1)
648707
for(long i = 0; i < (long) interTnfs.size(); i++) {
649708
TNFS &interi = interTnfs[i];
650709
for(long j = 0 ; j < i; j++) { // lower triangle only
651710
TNFS &interj = interTnfs[j];
652711
long isize = interi.size(), jsize = interj.size();
653-
for(long k = 0 ; k < isize; k++) {
654-
for(long l = 0; l < jsize; l++) {
655-
float dist = interi[k].getDistance(interj[l]);
712+
long numSamples = isize * jsize;
713+
if (numSamples < maxInterSamples) {
714+
// calculate everything
715+
for(long k = 0 ; k < isize; k++) {
716+
for(long l = 0; l < jsize; l++) {
717+
float dist = interi[k].getDistance(interj[l]);
718+
interHist.observe(dist);
719+
if (dataPtr[omp_get_thread_num()] != NULL)
720+
*dataPtr[omp_get_thread_num()] << dist << "\t0\t1\n";
721+
}
722+
}
723+
} else {
724+
// subsample
725+
for(long k = 0; k < (long) maxInterSamples ; k++) {
726+
long indexSample = randGen[omp_get_thread_num()].getRand() % numSamples;
727+
long x = indexSample / jsize, y = indexSample % jsize;
728+
assert(x < isize);
729+
assert(y < jsize);
730+
float dist = interi[x].getDistance(interj[y]);
656731
interHist.observe(dist);
732+
if (dataPtr[omp_get_thread_num()] != NULL)
733+
*dataPtr[omp_get_thread_num()] << dist << "\t0\t1\n";
657734
}
658735
}
659736
}
660737
}
661738

739+
LOG_VERBOSE(1, "inter-histogram totalCount: " << interHist.getTotalCount() << " outliers: " << interHist.getOutlierCount());
740+
662741
assert(interHist.getOutlierCount() == 0);
663742
assert(intraHist.getOutlierCount() == 0);
664743

665-
OfstreamMap om(intraInterFile, "");
666744
ostream &os = om.getOfstream("");
667745

668746
double totalIntra = intraHist.getTotalCount(), totalInter = interHist.getTotalCount();

src/Utils.h

Lines changed: 65 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1066,7 +1066,16 @@ class _Rand {
10661066
typedef std::vector< Instance > Instances;
10671067

10681068
_Rand( uint64_t _seed = getSeed( ) ) : engine( _seed ), dist() { }
1069-
1069+
_Rand(const _Rand &copy) {
1070+
*this = copy;
1071+
}
1072+
_Rand &operator=(const _Rand &copy) {
1073+
if (this == &copy)
1074+
return *this;
1075+
engine = copy.engine;
1076+
dist = copy.dist;
1077+
return *this;
1078+
}
10701079
Type getRand() {
10711080
return dist(engine);
10721081
}
@@ -1865,5 +1874,60 @@ class GenericHistogram {
18651874
std::vector<CountType> _bins;
18661875
};
18671876

1877+
template<bool diagonal = false, typename IndexType = int>
1878+
class LowerTriangle {
1879+
public:
1880+
struct XY {
1881+
IndexType x,y;
1882+
};
1883+
LowerTriangle(IndexType _size) {
1884+
assert(_size > 1);
1885+
size = diagonal ? _size : _size + 1;
1886+
}
1887+
IndexType getIndex(IndexType x, IndexType y) const {
1888+
assert(x >= 0 && y >= 0);
1889+
assert(y < size);
1890+
if (diagonal) {
1891+
assert(x <= y);
1892+
} else {
1893+
assert(y > 0);
1894+
y--;
1895+
assert(x < y);
1896+
}
1897+
return x + (y+1) * y / 2;
1898+
}
1899+
void getXY(IndexType index, IndexType &x, IndexType &y) {
1900+
assert(index >= 0);
1901+
y = (IndexType)((-1 + sqrt(8*index+1))/2);
1902+
x = index - y * (y+1) / 2;
1903+
1904+
if (diagonal) {
1905+
assert(index < size * (size + 1) / 2);
1906+
assert(x <= y);
1907+
} else {
1908+
assert(index < size * (size + 1) / 2);
1909+
y++;
1910+
assert(x < y);
1911+
assert(y < size - 1);
1912+
}
1913+
assert(y < size);
1914+
assert(x >= 0 && y >= 0);
1915+
}
1916+
XY getXY(IndexType index) const {
1917+
XY xy;
1918+
getXY(index, xy.x, xy.y);
1919+
return xy;
1920+
}
1921+
IndexType getNumValues() const {
1922+
if (diagonal) {
1923+
return size * (size+1) / 2;
1924+
} else {
1925+
return (size-2) * (size-1) / 2;
1926+
}
1927+
}
1928+
private:
1929+
IndexType size;
1930+
};
1931+
18681932
#endif
18691933

0 commit comments

Comments
 (0)