Adjust output to use 'cell'/'damaged' labels instead of 1/0

AlicenJoyHenning · Oct 18, 2024 · 5b4f0a0 · 5b4f0a0
1 parent 7cd251b
commit 5b4f0a0
Show file tree

Hide file tree

Showing 3 changed files with 12 additions and 6 deletions.
diff --git a/example_data/Kolodziejczyk.csv b/example_data/Kolodziejczyk.csv
@@ -1,4 +1,4 @@
-,Actb,Gadph,Metabolic process,#Detected Genes,Mapping rate
+Identifier,Actb,Gadph,Metabolic process,#Detected Genes,Mapping rate
 ola_mES_2i_1_1.counts,0.006805259,0.00015504,0.023556419,11947,0.775385676
 ola_mES_2i_1_10.counts,0.008647994,0.00023053,0.030453829,11894,0.790886637
 ola_mES_2i_1_11.counts,0.009517874,0.000172775,0.02402108,12321,0.774220237

diff --git a/example_data/labeled_Kolodziejczyk.csv b/example_data/labeled_Kolodziejczyk.csv
@@ -1,4 +1,4 @@
-,Actb,Gadph,Metabolic process,#Detected Genes,Mapping rate,Quality
+Identifier,Actb,Gadph,Metabolic process,#Detected Genes,Mapping rate,Quality
 ola_mES_2i_1_1.counts,0.006805259,0.00015504,0.023556419,11947,0.775385676,1
 ola_mES_2i_1_10.counts,0.008647994,0.00023053,0.030453829,11894,0.790886637,1
 ola_mES_2i_1_11.counts,0.009517874,0.000172775,0.02402108,12321,0.774220237,1

diff --git a/runEnsembleKQC.py b/runEnsembleKQC.py
@@ -2,6 +2,7 @@
 import itertools
 import time
 import numpy as np
+import pandas as pd
 from sklearn.cluster import KMeans
 from sklearn.ensemble import IsolationForest
 from sklearn.metrics.pairwise import cosine_similarity
@@ -72,6 +73,10 @@ def run_enumeration_round_wrapper(args):
     cell_num = len(mat[0])
     print("{} features, {} cells".format(feature_num, cell_num))
 
+    # Create a new DataFrame with the 'Identifier' column
+    identifiers = pd.read_csv(file_name, usecols=['Identifier'])
+    result_df = identifiers.copy()
+
     mat = min_max_scale(mat)
     tmat = np.array(list(zip(*mat)))
     similarity = cosine_similarity(tmat)
@@ -103,7 +108,7 @@ def run_enumeration_round_wrapper(args):
     print(f"Total enumeration rounds: {total_rounds}")
 
     # Limit the number of enumeration rounds to 5
-    enumeration_list = enumeration_list[:10]
+    enumeration_list = enumeration_list[:5]
 
     def progress_wrapper(args):
         result = run_enumeration_round_wrapper(args)
@@ -121,10 +126,11 @@ def progress_wrapper(args):
         precision, recall, F1 = calc_pre_recall_F1(result, label)
         print('precision {:2f} recall {:2f} F1Score {:2f}'.format(precision, recall, F1))
 
+    # Add the results as a new column to the DataFrame
+    result_df['Quality'] = ['cell' if res == 1 else 'damaged' for res in result]
+
     # Store the result to the output path
     if result_path:
-        with open(result_path, 'w') as f:
-            f.write('Quality\n')
-            f.writelines('\n'.join(map(str, result)))
+        result_df.to_csv(result_path, index=False, header=False)
     end = time.time()
     print('Done. Total time: {:2f}s. Results have been stored in {}'.format(end - start, result_path))