Skip to content

Commit

Permalink
Gene name column is added in the output. The output is incompatible w…
Browse files Browse the repository at this point in the history
…ith previous versions
  • Loading branch information
Jianxing Feng committed Aug 21, 2013
1 parent 6b379dc commit 933ec26
Show file tree
Hide file tree
Showing 5 changed files with 57 additions and 24 deletions.
28 changes: 19 additions & 9 deletions DataProcessor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -409,7 +409,7 @@ void loadSegInfo(string from_file, int sig_len, int strand_specific_code, vector
/*
* Read data from a file containing gene name and number of reads mapped to each gene
*/
void loadGeneReadCounts(string from_file, vector<string>& gene_sym, vector<int>& read_counts, vector<int>& gene_length, int verbos_level = 0)
void loadGeneReadCounts(string from_file, vector<string>& gene_sym, vector<string>& gene_name, vector<int>& read_counts, vector<int>& gene_length, int verbos_level = 0)
{
fstream infile;
infile.open(from_file.data(), ios::in);
Expand All @@ -435,9 +435,9 @@ void loadGeneReadCounts(string from_file, vector<string>& gene_sym, vector<int>&
split(line, '\t', fields);

gene_sym.push_back(fields[0]);
read_counts.push_back(atoi(fields[1].data()));
if (fields.size() > 2)
gene_length.push_back(atoi(fields[2].data()));
gene_name.push_back(fields[1]);
read_counts.push_back(atoi(fields[2].data()));
gene_length.push_back(atoi(fields[3].data()));
}

if (verbos_level > 0)
Expand Down Expand Up @@ -527,7 +527,8 @@ void scanGPF(string from_file, GeneInfo& gene_info, int verbos_level = 0)
//string& tran_id = fields[0];
string& chr = fields[1];
char strand = fields[2][0];
string& gene_id = fields[11];
string& gene_id = fields[0];
string& gene_name = fields[11];
string& tran_id = fields[0];
int exon_cnt = atoi(fields[7].data());
vector<string> starts, ends;
Expand All @@ -538,7 +539,7 @@ void scanGPF(string from_file, GeneInfo& gene_info, int verbos_level = 0)
{
int start = atoi(starts[i].data());
int end = atoi(ends[i].data());
gene_info.OnAnExon(chr, strand, gene_id, tran_id, start, end);
gene_info.OnAnExon(chr, strand, gene_id, gene_name, tran_id, start, end);
}
}
gene_info.FinishLoadingExons();
Expand Down Expand Up @@ -588,13 +589,22 @@ void scanGTF(string from_file, GeneInfo& gene_info, int verbos_level = 0)
int end = fields[8].find("\"", start);
string gene_id = fields[8].substr(start, end - start);

start = fields[8].find("transcript_id") + 16;
start = fields[8].find("transcript_id") + 15;
end = fields[8].find("\"", start);
string tran_id = fields[8].substr(start, end - start);

string gene_name = "NA";
start = fields[8].find("gene_name");
if ((size_t)start != string::npos)
{
start += 11;
end = fields[8].find("\"", start);
gene_name = fields[8].substr(start, end - start);
}

start = atoi(fields[3].data()) - 1; // GTF start coordinates are 1-based
end = atoi(fields[4].data()); // GTF end coordinates are 1-based
gene_info.OnAnExon(chr, strand, gene_id, tran_id, start, end);
gene_info.OnAnExon(chr, strand, gene_id, gene_name, tran_id, start, end);
}
gene_info.FinishLoadingExons();

Expand Down Expand Up @@ -639,7 +649,7 @@ void scanAnnotBED(string from_file, GeneInfo& gene_info, int verbos_level = 0)

int start = atoi(fields[1].data());
int end = atoi(fields[2].data());
gene_info.OnAnExon(chr, strand, gene_id, tran_id, start, end);
gene_info.OnAnExon(chr, strand, gene_id, "NA", tran_id, start, end);
}
gene_info.FinishLoadingExons();

Expand Down
11 changes: 8 additions & 3 deletions GFOLD.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ class GFOLD

string2vec_str_t mGeneSegs;
vector<string> mAllGeneIDs;
vector<string> mAllGeneNames;

double mSignificantCutoff;
int mBurnInCount;
Expand Down Expand Up @@ -101,28 +102,31 @@ class GFOLD
{
string filename = first_group_samples[i] + sample_suffix;
vector<string> gene_ids;
vector<string> gene_names;

vector<vector<int> > dummy;
first_group_gene_read_counts[i].resize(0);
loadGeneReadCounts(filename, gene_ids, first_group_gene_read_counts[i], gene_length, mVerbosLevel);
loadGeneReadCounts(filename, gene_ids, gene_names, first_group_gene_read_counts[i], gene_length, mVerbosLevel);

if (mAllGeneIDs.size() > 0 && mAllGeneIDs != gene_ids)
{
cerr << "ERROR: The read count file " << filename << " is not in the right format. Please refer to the documentation." << endl;
exit(0);
}
mAllGeneIDs = gene_ids;
mAllGeneNames = gene_names;
}

second_group_gene_read_counts.resize(second_group_samples.size());
for (size_t i = 0; i < second_group_samples.size(); ++i)
{
string filename = second_group_samples[i] + sample_suffix;
vector<string> gene_ids;
vector<string> gene_names;

vector<vector<int> > dummy;
second_group_gene_read_counts[i].resize(0);
loadGeneReadCounts(filename, gene_ids, second_group_gene_read_counts[i], gene_length, mVerbosLevel);
loadGeneReadCounts(filename, gene_ids, gene_names, second_group_gene_read_counts[i], gene_length, mVerbosLevel);

if (mAllGeneIDs != gene_ids)
{
Expand Down Expand Up @@ -203,13 +207,14 @@ class GFOLD
output << "# A gene with zero GFOLD value should never be considered as " << endl;
output << "# differentially expressed. For a comprehensive description of " << endl;
output << "# GFOLD, please refer to the manual." << endl;
output << "#GeneSymbol\tGFOLD("<< mSignificantCutoff << ")\tE-FDR\tlog2fdc";
output << "#GeneSymbol\tGeneName\tGFOLD("<< mSignificantCutoff << ")\tE-FDR\tlog2fdc";
if (gene_length.size() > 0)
output << "\t1stRPKM\t2ndRPKM";
output << endl;
for (size_t i = 0; i < mAllGeneIDs.size(); ++i)
{
output << mAllGeneIDs[i] << "\t";
output << mAllGeneNames[i] << "\t";
output << gfold_value[i] << "\t";
output << fdr[i] << "\t";
output << logfdc[i];
Expand Down
10 changes: 8 additions & 2 deletions GeneInfo.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,7 @@ class Gene: public Interval
string mChr;
char mStrand;
string mGeneSymbol;
string mGeneName;
string mTranscriptID;
vector<Interval> mExons;

Expand Down Expand Up @@ -269,7 +270,7 @@ typedef map<string, list<Interval> > map_str2list_inter_t;
}
}

void OnAnExon(const string& chr, char strand, const string& gene_id, const string& tran_id, int start, int end)
void OnAnExon(const string& chr, char strand, const string& gene_id, const string& gene_name, const string& tran_id, int start, int end)
{
if (mAllTranscripts.find(tran_id) == mAllTranscripts.end())
{
Expand All @@ -279,6 +280,7 @@ typedef map<string, list<Interval> > map_str2list_inter_t;
a_gene.mChr += strand;
a_gene.mStrand = strand;
a_gene.mGeneSymbol = gene_id;
a_gene.mGeneName = gene_name;
a_gene.mTranscriptID = tran_id;
mAllTranscripts[tran_id] = a_gene;
}
Expand Down Expand Up @@ -307,6 +309,7 @@ typedef map<string, list<Interval> > map_str2list_inter_t;
new_gene.mChr = a_gene.mChr;
new_gene.mStrand = a_gene.mStrand;
new_gene.mGeneSymbol = a_gene.mGeneSymbol;
new_gene.mGeneName = a_gene.mGeneName;
new_gene.mTranscriptID = a_gene.mTranscriptID;
mAllGenes[a_gene.mGeneSymbol] = new_gene;
}
Expand Down Expand Up @@ -772,6 +775,7 @@ typedef map<string, list<Interval> > map_str2list_inter_t;
typedef map<string, int> str2int_t;
str2int_t gene_cnt;
str2int_t gene_length;
map<string, string> symble2name;

// Note that no need to collect information from mAllJunctions if mbCountJunc is not set
assert(!mbCountJunc);
Expand All @@ -785,6 +789,7 @@ typedef map<string, list<Interval> > map_str2list_inter_t;
for_each_ele_in_group(gene_iter, list<Gene*>, genes)
{
string& gene_sym = (*gene_iter)->mGeneSymbol;
symble2name[gene_sym] = (*gene_iter)->mGeneName;
if (gene_cnt.find(gene_sym) == gene_cnt.end())
{
gene_cnt[gene_sym] = cnt;
Expand Down Expand Up @@ -813,9 +818,10 @@ typedef map<string, list<Interval> > map_str2list_inter_t;

for_each_ele_in_group(iter, str2int_t, gene_cnt)
output << iter->first << "\t"
<< symble2name[iter->first] << "\t"
<< iter->second << "\t"
<< gene_length[iter->first] << "\t"
<< iter->second * 1000.0 / gene_length[iter->first] / seq_depth << endl;
<< iter->second * 1000.0 / gene_length[iter->first] / seq_depth << endl;

output.close();
}
Expand Down
4 changes: 4 additions & 0 deletions README
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
Note that https://bitbucket.org/feeldead/gfold/ may have updated README file

VERSIONS:
V1.1.0. A new column for the gene name is added to the output if the gena name
information is available in the annotation file. Therefore the output format
is different from V1.0.9.

V1.0.9. A bug in calculating RPKM for job 'diff' is corrected. This bug is
caused by integer division of total read count over 1000000 where float
division should be used. It would slightly overestimate the RPKM if the total
Expand Down
28 changes: 18 additions & 10 deletions doc/gfold.pod
Original file line number Diff line number Diff line change
Expand Up @@ -248,17 +248,21 @@ The output file contains 4 columns:

=item 1. B<GeneSymbol>:

GeneSymbol. The order of gene symbol is the same as that appearing in the read count file.
For BED file, this is the 4'th column. For GPF file, this is the first column. For GTF format, this corresponds to 'gene_id' if it exists, 'NA' otherwise.

=item 2. B<Read Count>:
=item 2. B<GeneName>:

For BED file, this is always 'NA'. For GPF file, this is the 12'th column. For GTF format, this corresponds to 'gene_name' if it exists, 'NA' otherwise.

=item 3. B<Read Count>:

The number of reads mapped to this gene.

=item 3. B<Gene exon length>:
=item 4. B<Gene exon length>:

The length sum of all the exons of this gene.

=item 4. B<RPKM>:
=item 5. B<RPKM>:

The expression level of this gene in RPKM.

Expand All @@ -272,9 +276,13 @@ The output file contains 6 columns:

=item 1. B<#GeneSymbol>:

Gene symbols. The order of gene symbol is the same as that appearing in the read count file.
Gene symbols. The order of gene symbol is the same as what appearing in the read count file.

=item 2. B<GeneName>:

Gene name. The order of gene name is the same as what appearing in the read count file.

=item 2. B<GFOLD>:
=item 3. B<GFOLD>:

GFOLD value for every gene. The GFOLD value could be considered as a reliable
log2 fold change. It is positive/negative if the gene is up/down regulated. The
Expand All @@ -292,24 +300,24 @@ never be considered differentially expressed. However, it doesn't mean that all
genes with non-negative GFOLD value are differentially expressed. For taking top
differentially expressed genes, the user is responsible for selecting the cutoff.

=item 3. B<E-FDR>:
=item 4. B<E-FDR>:

Empirical FDR based on replicates. It is always 1 when no replicates are available.

=item 4. B<log2fdc>:
=item 5. B<log2fdc>:

log2 fold change. If no replicate is available, and B<-acc> is T, log2 fold change
is based on read counts and normalization constants. Otherwise, log2 fold change is
based on the sampled expression level from the posterior distribution.

=item 5. B<1stRPKM>:
=item 6. B<1stRPKM>:

The RPKM for the first condition. It is available only if gene length is available.
If multiple replicates are available, the RPKM is calculated simply by summing over
replicates. Because RPKM is acturally using sequencing depth as the normalization
constant, log2 fold change based on RPKM could be different from the log2fdc field.

=item 6. B<2ndRPKM>:
=item 7. B<2ndRPKM>:

The RPKM for the second condition. It is available only if gene length is available.
Please refer to B<1stRPKM> for more information.
Expand Down

0 comments on commit 933ec26

Please sign in to comment.