Gene name column is added in the output. The output is incompatible w…

…ith previous versions
mjoppich · Aug 21, 2013 · 933ec26 · 933ec26
1 parent 6b379dc
commit 933ec26
Show file tree

Hide file tree

Showing 5 changed files with 57 additions and 24 deletions.
diff --git a/DataProcessor.hpp b/DataProcessor.hpp
@@ -409,7 +409,7 @@ void loadSegInfo(string from_file, int sig_len, int strand_specific_code, vector
 /*
  * Read data from a file containing gene name and number of reads mapped to each gene 
  */
-void loadGeneReadCounts(string from_file, vector<string>& gene_sym, vector<int>& read_counts, vector<int>& gene_length, int verbos_level = 0)
+void loadGeneReadCounts(string from_file, vector<string>& gene_sym, vector<string>& gene_name, vector<int>& read_counts, vector<int>& gene_length, int verbos_level = 0)
 {
     fstream infile;
     infile.open(from_file.data(), ios::in);
@@ -435,9 +435,9 @@ void loadGeneReadCounts(string from_file, vector<string>& gene_sym, vector<int>&
         split(line, '\t', fields);
 
         gene_sym.push_back(fields[0]);
-        read_counts.push_back(atoi(fields[1].data()));
-        if (fields.size() > 2)
-            gene_length.push_back(atoi(fields[2].data()));
+        gene_name.push_back(fields[1]);
+        read_counts.push_back(atoi(fields[2].data()));
+        gene_length.push_back(atoi(fields[3].data()));
     }
 
     if (verbos_level > 0)
@@ -527,7 +527,8 @@ void scanGPF(string from_file, GeneInfo& gene_info, int verbos_level = 0)
         //string& tran_id = fields[0];
         string& chr = fields[1];
         char strand = fields[2][0];
-        string& gene_id = fields[11];
+        string& gene_id = fields[0];
+        string& gene_name = fields[11];
         string& tran_id = fields[0];
         int exon_cnt = atoi(fields[7].data());
         vector<string> starts, ends;
@@ -538,7 +539,7 @@ void scanGPF(string from_file, GeneInfo& gene_info, int verbos_level = 0)
         {
             int start = atoi(starts[i].data());
             int end = atoi(ends[i].data());
-            gene_info.OnAnExon(chr, strand, gene_id, tran_id, start, end);
+            gene_info.OnAnExon(chr, strand, gene_id, gene_name, tran_id, start, end);
         }
     }
     gene_info.FinishLoadingExons();
@@ -588,13 +589,22 @@ void scanGTF(string from_file, GeneInfo& gene_info, int verbos_level = 0)
         int end = fields[8].find("\"", start);
         string gene_id = fields[8].substr(start, end - start); 
 
-        start = fields[8].find("transcript_id") + 16;
+        start = fields[8].find("transcript_id") + 15;
         end = fields[8].find("\"", start);
         string tran_id = fields[8].substr(start, end - start); 
 
+        string gene_name = "NA";
+        start = fields[8].find("gene_name");
+        if ((size_t)start != string::npos)
+        {
+            start += 11;
+            end = fields[8].find("\"", start);
+            gene_name = fields[8].substr(start, end - start); 
+        }
+
         start = atoi(fields[3].data()) - 1;  // GTF start coordinates are 1-based
         end = atoi(fields[4].data());   // GTF end coordinates are 1-based
-        gene_info.OnAnExon(chr, strand, gene_id, tran_id, start, end);
+        gene_info.OnAnExon(chr, strand, gene_id, gene_name, tran_id, start, end);
     }
     gene_info.FinishLoadingExons();
 
@@ -639,7 +649,7 @@ void scanAnnotBED(string from_file, GeneInfo& gene_info, int verbos_level = 0)
 
         int start = atoi(fields[1].data());
         int end = atoi(fields[2].data());
-        gene_info.OnAnExon(chr, strand, gene_id, tran_id, start, end);
+        gene_info.OnAnExon(chr, strand, gene_id, "NA", tran_id, start, end);
     }
     gene_info.FinishLoadingExons();
 

diff --git a/GFOLD.hpp b/GFOLD.hpp
@@ -53,6 +53,7 @@ class GFOLD
 
     string2vec_str_t mGeneSegs;
     vector<string> mAllGeneIDs; 
+    vector<string> mAllGeneNames; 
 
     double mSignificantCutoff;
     int mBurnInCount;
@@ -101,28 +102,31 @@ class GFOLD
         { 
             string filename = first_group_samples[i] + sample_suffix;
             vector<string> gene_ids;
+            vector<string> gene_names;
 
             vector<vector<int> > dummy;
             first_group_gene_read_counts[i].resize(0);
-            loadGeneReadCounts(filename, gene_ids, first_group_gene_read_counts[i], gene_length, mVerbosLevel);
+            loadGeneReadCounts(filename, gene_ids, gene_names, first_group_gene_read_counts[i], gene_length, mVerbosLevel);
 
             if (mAllGeneIDs.size() > 0 && mAllGeneIDs != gene_ids)
             {
                 cerr << "ERROR: The read count file " << filename << " is not in the right format. Please refer to the documentation." << endl;
                 exit(0);
             }
             mAllGeneIDs = gene_ids;
+            mAllGeneNames = gene_names;
         }
 
         second_group_gene_read_counts.resize(second_group_samples.size());
         for (size_t i = 0; i < second_group_samples.size(); ++i)
         { 
             string filename = second_group_samples[i] + sample_suffix;
             vector<string> gene_ids;
+            vector<string> gene_names;
 
             vector<vector<int> > dummy;
             second_group_gene_read_counts[i].resize(0);
-            loadGeneReadCounts(filename, gene_ids, second_group_gene_read_counts[i], gene_length, mVerbosLevel);
+            loadGeneReadCounts(filename, gene_ids, gene_names, second_group_gene_read_counts[i], gene_length, mVerbosLevel);
 
             if (mAllGeneIDs != gene_ids)
             {
@@ -203,13 +207,14 @@ class GFOLD
         output << "# A gene with zero GFOLD value should never be considered as " << endl;
         output << "# differentially expressed. For a comprehensive description of " << endl;
         output << "# GFOLD, please refer to the manual." << endl;
-        output << "#GeneSymbol\tGFOLD("<< mSignificantCutoff << ")\tE-FDR\tlog2fdc";
+        output << "#GeneSymbol\tGeneName\tGFOLD("<< mSignificantCutoff << ")\tE-FDR\tlog2fdc";
 	    if (gene_length.size() > 0)
             output << "\t1stRPKM\t2ndRPKM";
         output << endl;
         for (size_t i = 0; i < mAllGeneIDs.size(); ++i)
         {
             output << mAllGeneIDs[i] << "\t";
+            output << mAllGeneNames[i] << "\t";
             output << gfold_value[i] << "\t";
             output << fdr[i] << "\t";
             output << logfdc[i];

diff --git a/GeneInfo.hpp b/GeneInfo.hpp
@@ -162,6 +162,7 @@ class Gene: public Interval
     string mChr;
     char mStrand;
     string mGeneSymbol;
+    string mGeneName;
     string mTranscriptID;
     vector<Interval> mExons;
 
@@ -269,7 +270,7 @@ typedef map<string, list<Interval> > map_str2list_inter_t;
         }
     }
 
-    void OnAnExon(const string& chr, char strand, const string& gene_id, const string& tran_id, int start, int end)
+    void OnAnExon(const string& chr, char strand, const string& gene_id, const string& gene_name, const string& tran_id, int start, int end)
     {
         if (mAllTranscripts.find(tran_id) == mAllTranscripts.end())
         {
@@ -279,6 +280,7 @@ typedef map<string, list<Interval> > map_str2list_inter_t;
                 a_gene.mChr += strand;
             a_gene.mStrand = strand;
             a_gene.mGeneSymbol = gene_id;
+            a_gene.mGeneName = gene_name;
             a_gene.mTranscriptID = tran_id;
             mAllTranscripts[tran_id] = a_gene;
         }
@@ -307,6 +309,7 @@ typedef map<string, list<Interval> > map_str2list_inter_t;
                 new_gene.mChr = a_gene.mChr;
                 new_gene.mStrand = a_gene.mStrand;
                 new_gene.mGeneSymbol = a_gene.mGeneSymbol;
+                new_gene.mGeneName = a_gene.mGeneName;
                 new_gene.mTranscriptID = a_gene.mTranscriptID;
                 mAllGenes[a_gene.mGeneSymbol] = new_gene;
             }
@@ -772,6 +775,7 @@ typedef map<string, list<Interval> > map_str2list_inter_t;
         typedef map<string, int> str2int_t;
         str2int_t gene_cnt;
         str2int_t gene_length;
+        map<string, string> symble2name;
 
         // Note that no need to collect information from mAllJunctions if mbCountJunc is not set
         assert(!mbCountJunc);
@@ -785,6 +789,7 @@ typedef map<string, list<Interval> > map_str2list_inter_t;
                 for_each_ele_in_group(gene_iter, list<Gene*>, genes)
                 {
                     string& gene_sym = (*gene_iter)->mGeneSymbol;
+                    symble2name[gene_sym] = (*gene_iter)->mGeneName;
                     if (gene_cnt.find(gene_sym) == gene_cnt.end())
                     {
                         gene_cnt[gene_sym] = cnt;
@@ -813,9 +818,10 @@ typedef map<string, list<Interval> > map_str2list_inter_t;
 
         for_each_ele_in_group(iter, str2int_t, gene_cnt)
             output << iter->first << "\t" 
+                   << symble2name[iter->first] << "\t"
                    << iter->second << "\t" 
                    << gene_length[iter->first] << "\t" 
-	               << iter->second * 1000.0 / gene_length[iter->first] / seq_depth << endl;
+	               << iter->second * 1000.0 / gene_length[iter->first] / seq_depth << endl; 
 
         output.close();
     }

diff --git a/README b/README
@@ -1,6 +1,10 @@
 Note that https://bitbucket.org/feeldead/gfold/ may have updated README file
 
 VERSIONS:
+V1.1.0. A new column for the gene name is added to the output if the gena name
+information is available in the annotation file. Therefore the output format
+is different from V1.0.9.
+
 V1.0.9. A bug in calculating RPKM for job 'diff' is corrected. This bug is
 caused by integer division of total read count over 1000000 where float
 division should be used. It would slightly overestimate the RPKM if the total

diff --git a/doc/gfold.pod b/doc/gfold.pod
@@ -248,17 +248,21 @@ The output file contains 4 columns:
 
 =item 1. B<GeneSymbol>:
 
-GeneSymbol. The order of gene symbol is the same as that appearing in the read count file.
+For BED file, this is the 4'th column. For GPF file, this is the first column. For GTF format, this corresponds to 'gene_id' if it exists, 'NA' otherwise.
 
-=item 2. B<Read Count>:
+=item 2. B<GeneName>:
+
+For BED file, this is always 'NA'. For GPF file, this is the 12'th column. For GTF format, this corresponds to 'gene_name' if it exists, 'NA' otherwise.
+
+=item 3. B<Read Count>:
 
 The number of reads mapped to this gene.
 
-=item 3. B<Gene exon length>:
+=item 4. B<Gene exon length>:
 
 The length sum of all the exons of this gene.
 
-=item 4. B<RPKM>:
+=item 5. B<RPKM>:
 
 The expression level of this gene in RPKM.
 
@@ -272,9 +276,13 @@ The output file contains 6 columns:
 
 =item  1. B<#GeneSymbol>:
 
-Gene symbols. The order of gene symbol is the same as that appearing in the read count file.
+Gene symbols. The order of gene symbol is the same as what appearing in the read count file.
+
+=item  2. B<GeneName>:
+
+Gene name. The order of gene name is the same as what appearing in the read count file.
 
-=item  2. B<GFOLD>:
+=item  3. B<GFOLD>:
 
 GFOLD value for every gene. The GFOLD value could be considered as a reliable
 log2 fold change. It is positive/negative if the gene is up/down regulated. The
@@ -292,24 +300,24 @@ never be considered differentially expressed. However, it doesn't mean that all
 genes with non-negative GFOLD value are differentially expressed. For taking top
 differentially expressed genes, the user is responsible for selecting the cutoff.
 
-=item  3. B<E-FDR>:
+=item  4. B<E-FDR>:
 
 Empirical FDR based on replicates. It is always 1 when no replicates are available.
 
-=item  4. B<log2fdc>:
+=item  5. B<log2fdc>:
 
 log2 fold change. If no replicate is available, and B<-acc> is T, log2 fold change
 is based on read counts and normalization constants. Otherwise, log2 fold change is
 based on the sampled expression level from the posterior distribution.
 
-=item  5. B<1stRPKM>:
+=item  6. B<1stRPKM>:
 
 The RPKM for the first condition. It is available only if gene length is available. 
 If multiple replicates are available, the RPKM is calculated simply by summing over
 replicates. Because RPKM is acturally using sequencing depth as the normalization 
 constant, log2 fold change based on RPKM could be different from the log2fdc field.
 
-=item  6. B<2ndRPKM>:
+=item  7. B<2ndRPKM>:
 
 The RPKM for the second condition. It is available only if gene length is available.
 Please refer to B<1stRPKM> for more information.