Skip to content

Commit

Permalink
Merge pull request #6 from ProjectAussie/INF-435/mz/extend-germline-t…
Browse files Browse the repository at this point in the history
…o-produce-desired-output-structure

INF-435/mz/extend germline to produce desired output structure
  • Loading branch information
Mike Zhong authored Dec 22, 2021
2 parents a1bdd07 + 0c31dea commit cff6dbc
Show file tree
Hide file tree
Showing 58 changed files with 8,900 additions and 20 deletions.
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
minimal_test/*.log
.vscode
test/output
*.o
parse_bmatch
germline
germline
21 changes: 19 additions & 2 deletions GERMLINE_0001.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,16 @@ int MAX_ERR_HET = 1;
int main(int argc, char* argv[])
{
// parse arguments
string rs_range[2], map, samples_to_compare_to, new_samples;
string rs_range[2], map, samples_to_compare_to, new_samples, chromosome;
samples_to_compare_to = "";
new_samples = "";
map = "";
rs_range[0] = "";
rs_range[1] = "";
string params = argv[0];
ALL_SAMPLES.useEmbarkRFGermlineOutput = false;
ALL_SAMPLES.chromosome = "";
ALL_SAMPLES.individualOutputFolder = "";

bool bad_param = false;
for(int i=1; i<argc; i++) {
Expand Down Expand Up @@ -91,7 +94,19 @@ int main(int argc, char* argv[])
else if (strncmp(argv[i], "-new_samples", strlen("-new_samples")) == 0 && i < argc-1) {
new_samples = argv[++i];
}
else if (strncmp(argv[i], "-chromosome", strlen("-chromosome")) == 0 && i < argc-1) {
chromosome = argv[++i];
chromosome.insert(chromosome.begin(), 2 - chromosome.length(), '0');
ALL_SAMPLES.chromosome = chromosome;
cout << "Generating outputs for chromosome: " << chromosome << endl;
}
else if (strncmp(argv[i], "-individual_outputs", strlen("-individual_outputs")) == 0) {
ALL_SAMPLES.individualOutputFolder = argv[++i];
ALL_SAMPLES.useEmbarkRFGermlineOutput = true;
cout << "Generating individual outputs" << endl;
}
else {
cout << argv[i] << endl;
bad_param = true;
}
}
Expand Down Expand Up @@ -126,7 +141,9 @@ int main(int argc, char* argv[])
<< '\t' << "-h_extend" << '\t' << "Extend from seeds if *haplotypes* match" << endl
<< '\t' << "-w_extend" << '\t' << "Extend, one marker at a time, beyong the boundaries of a found match" << endl
<< '\t' << "-samples_to_compare_to" << '\t' << "List of individuals to cross compare." << endl
<< '\t' << "-new_samples" << '\t' << "List of new individuals." << endl;
<< '\t' << "-new_samples" << '\t' << "List of new individuals." << endl
<< '\t' << "-chromosome" << '\t' << "Chromosome number for individual match/homoz file outputs." << endl
<< '\t' << "-individual_outputs" << '\t' << "Top level directory to generate individual match/homoz file outputs to" << endl;
return 1;
}

Expand Down
37 changes: 37 additions & 0 deletions Individual.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,7 @@ void Individual::clearMatch( size_t id )
void Individual::deleteMatch( size_t id )
{
// try to print it
// cout << "Writing results for: " << single_id << endl;
all_matches[ id ]->print( MATCH_FILE );
delete all_matches[ id ];

Expand Down Expand Up @@ -216,5 +217,41 @@ ostream& operator<<(ostream &fout, Individual& ind)
return fout;
}

void Individual::setIndividualMatchFile(string chromosome)
{
string ext = ".tsv";
string dir = ALL_SAMPLES.individualOutputFolder + "/dog_level_match_files/" + single_id;
experimental::filesystem::path _dir(dir);
if ( !experimental::filesystem::exists(_dir) ) {
experimental::filesystem::create_directories(_dir);
}
string fileHandleName = dir + "/chr" + chromosome + ext;
// cout << fileHandleName << endl;
individualMatchFile = new ofstream(fileHandleName, ofstream::app);
}

void Individual::setIndividualHomozFile(string chromosome)
{
string ext = ".tsv";
string dir = ALL_SAMPLES.individualOutputFolder + "/dog_level_homoz_files/" + single_id;
experimental::filesystem::path _dir(dir);
if ( !experimental::filesystem::exists(_dir) ) {
experimental::filesystem::create_directories(_dir);
}
string fileHandleName = dir + "/chr" + chromosome + ext;
// cout << fileHandleName << endl;
individualHomozFile = new ofstream(fileHandleName, ofstream::app);
}

ofstream* Individual::getIndividualMatchFile()
{
return individualMatchFile;
}

ofstream* Individual::getIndividualHomozFile()
{
return individualHomozFile;
}


// end Individual.cpp
12 changes: 12 additions & 0 deletions Individual.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include <map>
#include <set>
#include <iostream>
#include <experimental/filesystem>

using namespace std;

Expand Down Expand Up @@ -96,6 +97,15 @@ class Individual
// clearMarkers(): clears all MarkerSets from this individual
void clearMarkers();

void setIndividualMatchFile(string chromosome);
void setIndividualHomozFile(string chromosome);
ofstream* getIndividualMatchFile();
ofstream* getIndividualHomozFile();

bool is_new;
string single_id;
string haplotype;

private:

// ID of the individual
Expand All @@ -108,6 +118,8 @@ class Individual
streamoff offset;

Match ** all_matches;
ofstream* individualMatchFile;
ofstream* individualHomozFile;
};


Expand Down
10 changes: 10 additions & 0 deletions Individuals.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,16 @@ void Individuals::initialize()
for ( iter = 0 ; iter < pedigree.size() ; iter++ ) pedigree[ iter ]->reserveMemory();
}

void Individuals::initializeOutputFileHandles(string chromosome)
{
for ( iter = 0 ; iter < pedigree.size() ; iter++ ) {
if ( pedigree[ iter ]->is_new ) {
pedigree[ iter ]->setIndividualMatchFile(chromosome);
pedigree[ iter ]->setIndividualHomozFile(chromosome);
}
}
}

void Individuals::freeMatches()
{
for(begin();more();next()) pedigree[ iter ]->freeMatches();
Expand Down
5 changes: 5 additions & 0 deletions Individuals.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ class Individuals
void begin();
size_t size() { return pedigree.size(); }
void initialize();
void initializeOutputFileHandles(string chromosome);
void print( ostream& );

void freeMatches();
Expand All @@ -42,6 +43,10 @@ class Individuals
bool isNew(string);
bool hasRestrictions();

bool useEmbarkRFGermlineOutput;
string chromosome; // used for individual file handle outputs
string individualOutputFolder; // top level folder for individual outputs

private:

void permuteMarkerSet(Chromosome *, int, MarkerSet);
Expand Down
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
CC= g++
OPT= -O3 -I include
OPT= -std=c++17 -O3 -I include
SRCS= GERMLINE_0001.cpp GERMLINE.cpp Share.cpp Chromosome.cpp ChromosomePair.cpp HMIndividualsExtractor.cpp MarkerSet.cpp Individual.cpp Individuals.cpp InputManager.cpp MatchFactory.cpp MatchesBuilder.cpp NucleotideMap.cpp PEDIndividualsExtractor.cpp Match.cpp PolymorphicIndividualsExtractor.cpp SNP.cpp SNPPositionMap.cpp SNPs.cpp
OBJS= GERMLINE_0001.o GERMLINE.o Chromosome.o Share.o ChromosomePair.o HMIndividualsExtractor.o MarkerSet.o Individual.o Individuals.o InputManager.o MatchFactory.o MatchesBuilder.o NucleotideMap.o PEDIndividualsExtractor.o Match.o PolymorphicIndividualsExtractor.o SNP.o SNPPositionMap.o SNPs.o
MAIN= germline
Expand All @@ -14,7 +14,7 @@ $(OBJS): $(SRCS)
$(CC) $(OPT) -c $*.cpp

germline: $(OBJS)
$(CC) $(OPT) -o $(MAIN) $(OBJS)
$(CC) $(OPT) -o $(MAIN) $(OBJS) -lstdc++fs

clean:
-rm -f *.o $(MAIN) $(BMATCH) test/generated.match test/generated.log test/generated.err test/generated.out
Expand Down
121 changes: 107 additions & 14 deletions Match.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -331,21 +331,114 @@ void Match::print( ostream& fout )
fout.write( (char*) &hom[1] , sizeof( bool ) );
} else
{
fout << node[0]->getID() << '\t';
fout << node[1]->getID() << '\t';
fout << ALL_SNPS.getSNP(snp_start).getChr() << '\t';
fout << ALL_SNPS.getSNP(snp_start).getPhysPos() << ' ';
fout << ALL_SNPS.getSNP(snp_end).getPhysPos() << '\t';
fout << ALL_SNPS.getSNP(snp_start).getSNPID() << ' ';
fout << ALL_SNPS.getSNP(snp_end).getSNPID() << '\t';
fout << ( snp_end - snp_start + 1) << '\t';
fout << setiosflags(ios::fixed) << setprecision(2) << distance << '\t';
if ( genetic ) fout << "cM" << '\t'; else fout << "MB" << '\t';
fout << dif;
for ( int n = 0 ; n < 2 ; n++ )
if ( hom[n] ) fout << '\t' << 1; else fout << '\t' << 0;
fout << endl;
if ( ALL_SAMPLES.useEmbarkRFGermlineOutput ) {
int key1 = stoi(node[0]->single_id);
int key2 = stoi(node[1]->single_id);
vector<string> oline;
string joined_oline;
ofstream* ofs;
string chromosome = ALL_SNPS.getSNP(snp_start).getChr();
string start_pos = to_string(ALL_SNPS.getSNP(snp_start).getPhysPos());
string end_pos = to_string(ALL_SNPS.getSNP(snp_end).getPhysPos());

// homoz
if ( key1 == key2 && node[0]->is_new ) {
// cout << "Writing out homoz" << endl;
oline.push_back(chromosome);
oline.push_back(start_pos);
oline.push_back(end_pos);
ofs = node[0]->getIndividualHomozFile();
join(oline, '\t', joined_oline);
*ofs << joined_oline << endl;
}
// key1 is new, key2 is old
else if ( key1 > key2 && node[0]->is_new ) {
// cout << "Writing match tracts for new key1 " << key1 << endl;
oline.push_back(node[0]->single_id);
oline.push_back(node[0]->haplotype);
oline.push_back(node[1]->single_id);
oline.push_back(node[1]->haplotype);
oline.push_back(chromosome);
oline.push_back(start_pos);
oline.push_back(end_pos);
ofs = node[0]->getIndividualMatchFile();
join(oline, '\t', joined_oline);
*ofs << joined_oline << endl;
}
// key2 is new, key1 is old
else if ( key2 > key1 && node[1]->is_new ) {
// cout << "Writing match tracts for new key2 " << key2 << endl;
oline.push_back(node[1]->single_id);
oline.push_back(node[1]->haplotype);
oline.push_back(node[0]->single_id);
oline.push_back(node[0]->haplotype);
oline.push_back(chromosome);
oline.push_back(start_pos);
oline.push_back(end_pos);
ofs = node[1]->getIndividualMatchFile();
join(oline, '\t', joined_oline);
*ofs << joined_oline << endl;
}
// newdog : newdog comparison, write out same record twice
else if ( node[0]->is_new && node[1]->is_new && key1 != key2 ) {
// cout << "Writing records for both keys";
oline.push_back(node[0]->single_id);
oline.push_back(node[0]->haplotype);
oline.push_back(node[1]->single_id);
oline.push_back(node[1]->haplotype);
oline.push_back(chromosome);
oline.push_back(start_pos);
oline.push_back(end_pos);
ofs = node[0]->getIndividualMatchFile();
join(oline, '\t', joined_oline);
*ofs << joined_oline << endl;

oline.push_back(node[1]->single_id);
oline.push_back(node[1]->haplotype);
oline.push_back(node[0]->single_id);
oline.push_back(node[0]->haplotype);
oline.push_back(chromosome);
oline.push_back(start_pos);
oline.push_back(end_pos);
ofs = node[1]->getIndividualMatchFile();
join(oline, '\t', joined_oline);
*ofs << joined_oline << endl;
}
else {
throw runtime_error("Unable to process match");
}
}
// Generate regular germline outputs for matches against ref panel or other use cases
else {

cout << stoi(node[1]->single_id) << endl;
fout << node[0]->getID() << '\t';
fout << node[1]->getID() << '\t';
fout << ALL_SNPS.getSNP(snp_start).getChr() << '\t';
fout << ALL_SNPS.getSNP(snp_start).getPhysPos() << ' ';
fout << ALL_SNPS.getSNP(snp_end).getPhysPos() << '\t';
fout << ALL_SNPS.getSNP(snp_start).getSNPID() << ' ';
fout << ALL_SNPS.getSNP(snp_end).getSNPID() << '\t';
fout << ( snp_end - snp_start + 1) << '\t';
fout << setiosflags(ios::fixed) << setprecision(2) << distance << '\t';
if ( genetic ) fout << "cM" << '\t'; else fout << "MB" << '\t';
fout << dif;
for ( int n = 0 ; n < 2 ; n++ )
if ( hom[n] ) fout << '\t' << 1; else fout << '\t' << 0;
fout << endl;
}
}
num_matches++;
}

void join(const vector<string>& v, char c, string& s) {

s.clear();

for (vector<string>::const_iterator p = v.begin();
p != v.end(); ++p) {
s += *p;
if (p != v.end() - 1)
s += c;
}
}
4 changes: 4 additions & 0 deletions Match.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include "Individual.h"
#include "math.h"
#include <iomanip>
#include <stdexcept>

class Individual;

Expand Down Expand Up @@ -45,5 +46,8 @@ class Match
bool isHom( int n , unsigned int ms );
};


void join(const vector<string>& v, char c, string& s);

#endif

2 changes: 1 addition & 1 deletion MatchesBuilder.h
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// MatchesBuilder.h: builds matches from individuals

#ifndef MATCHESBUILDER_H
#define MATCHESBUILDERs_H
#define MATCHESBUILDER_H

#include "BasicDefinitions.h"
#include "Individuals.h"
Expand Down
18 changes: 18 additions & 0 deletions PEDIndividualsExtractor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@ void PEDIndividualsExtractor::loadInput()
}

individualsP->initialize();
if ( ALL_SAMPLES.useEmbarkRFGermlineOutput ) {
individualsP->initializeOutputFileHandles(ALL_SAMPLES.chromosome);
}
stream.clear();
}

Expand Down Expand Up @@ -85,6 +88,21 @@ void PEDIndividualsExtractor::getIndividuals()
new_ind[1]->setID( famID + " " + ID + ".1" );
new_ind[0]->setBaseID( baseID );
new_ind[1]->setBaseID( baseID );
new_ind[0]->haplotype = "0";
new_ind[1]->haplotype = "1";
new_ind[0]->single_id = ID;
new_ind[1]->single_id = ID;

if ( ALL_SAMPLES.isNew(new_ind[0]->getBaseID()) ) {
new_ind[0]->is_new = true;
new_ind[1]->is_new = true;
// cout << "Loaded new sample: " << new_ind[0]->single_id << endl;
}
else {
new_ind[0]->is_new = false;
new_ind[1]->is_new = false;
// cout << "Loaded old sample: " << new_ind[0]->single_id << endl;
}

individualsP->addIndividual( new_ind[0] );
individualsP->addIndividual( new_ind[1] );
Expand Down
5 changes: 5 additions & 0 deletions minimal_test/0000_new_dog_proxy_keys_germline_output.plinky
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
1027227 1027227
1027228 1027228
1027229 1027229
1027230 1027230
1027231 1027231
5 changes: 5 additions & 0 deletions minimal_test/0000_old_dog_proxy_keys_germline_output.plinky
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
967975 967975
967976 967976
967977 967977
967978 967978
967979 967979
Loading

0 comments on commit cff6dbc

Please sign in to comment.