support '--only-embedding' option

GCS-ZHN · Sep 14, 2022 · 661c69e · 661c69e
1 parent 0918951
commit 661c69e
Show file tree

Hide file tree

Showing 3 changed files with 32 additions and 16 deletions.
diff --git a/src/socube/__init__.py b/src/socube/__init__.py
@@ -20,7 +20,7 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 # SOFTWARE.
 
-__version__ = "1.0"
+__version__ = "1.1rc1"
 __author__ = "Zhang.H.N"
 __email__ = "zhang.h.n@foxmail.com"
 __url__ = "https://github/GCS-ZHN/socube/"
@@ -80,6 +80,11 @@ def main(*args: str):
                             type=str,
                             default=None,
                             help=help["basic_args"]["cube_id"])
+    basic_args.add_argument("--only-embedding",
+                            action="store_true",
+                            default=False,
+                            help=help["basic_args"]["only_embedding"]
+    )
 
     model_args = parser.add_argument_group(help["model_args"]["title"])
     model_args.add_argument("--learning-rate",
@@ -256,10 +261,13 @@ def main(*args: str):
                              header=None)
 
                 checkData(samples)
-                future: Future = createTrainData(samples,
-                                                 output_path=embedding_path,
-                                                 adj=args.adj_factor,
-                                                 seed=args.seed)
+                if args.only_embedding:
+                    train_data = samples
+                else:
+                    future: Future = createTrainData(samples,
+                                                    output_path=embedding_path,
+                                                    adj=args.adj_factor,
+                                                    seed=args.seed)
 
                 samples = samples.T
                 writeHdf(
@@ -284,25 +292,31 @@ def main(*args: str):
                             seed=args.seed,
                             latent_dim=args.dim,
                             job_id=cube_id)
+
+            if not args.only_embedding:
+                train_data, train_label = future.result()
 
-            train_data, train_label = future.result()
             log("Post-processing",
                 "Processing data with log, std and feature minmax")
             train_data = minmax(std(np.log(train_data + 1)))
             log("Post-processing", "Single channels data is tranforming")
-
             my_cube.batchTransform(train_data, train_path)
-            writeCsv(train_label,
-                     join(train_path, "TrainLabel.csv"),
-                     header=None)
-            if checkExist(join(embedding_path, "ExperimentLabel.csv"),
-                          raise_error=False):
-                shutil.copyfile(join(embedding_path, "ExperimentLabel.csv"),
-                                join(train_path, "ExperimentLabel.csv"))
+
+            if not args.only_embedding:
+                writeCsv(train_label,
+                        join(train_path, "TrainLabel.csv"),
+                        header=None)
+                if checkExist(join(embedding_path, "ExperimentLabel.csv"),
+                            raise_error=False):
+                    shutil.copyfile(join(embedding_path, "ExperimentLabel.csv"),
+                                    join(train_path, "ExperimentLabel.csv"))
 
         elif args.input is not None:
             log("Config", "input is ignored because cube id is specified")
 
+        if args.only_embedding:
+            return
+
         log("Train", "Data check before training start")
         dim = my_cube._config["latent_dim"]
         checkExist(join(train_path, "TrainLabel.csv"))

diff --git a/src/socube/help.en_US.json b/src/socube/help.en_US.json
@@ -11,7 +11,8 @@
         "k": "The k-fold cross-validation is used in training, and the resulting k models will be ensembled into one model. Default 5.",
         "adj_factor": "The adjustment factor for the doublet expression level. By default it is assumed that the doublet expression level is twice the singlet, but there are fluctuations in the real situation and the expression level can be changed by adjusting this factor. Default 1.0.",
         "dim": "The target dimension for gene degradation is also the number of channels to train the model. Default 10.",
-        "cube_id": "If you want to reuse the socube embedding features obtained earlier,  just specify the embedding ID, which is a string like \"yyyymmdd-HHMMSS-xxx\", along with the original output path."
+        "cube_id": "If you want to reuse the socube embedding features obtained earlier, just specify the embedding ID, which is a string like \"yyyymmdd-HHMMSS-xxx\", along with the original output path.",
+        "only_embedding": "This option is provided for users who only want to use socube embedding but do not require doublet detection."
     },
     "model_args": {
         "title": "model training configuration",

diff --git a/src/socube/help.zh_CN.json b/src/socube/help.zh_CN.json
@@ -11,7 +11,8 @@
         "k": "在训练中使用k-折交叉验证法，得到的k个模型将被集合成一个模型。默认为5。",
         "adj_factor": "二聚体表达水平的调整系数。默认情况下，假定二聚体的表达水平是单体的两倍，但实际情况存在波动，可以通过调整这个系数改变表达水平。默认为1.0。",
         "dim": "基因特征降维的目标维度也是训练模型的通道数量。默认为10。",
-        "cube_id": "如果你想重新使用先前获得的socube嵌入特征，只需指定embedding ID，这是一个类似于 \"yyymmdd-HHMMSS-xxx\"的字符串，位于embedding子目录下。"
+        "cube_id": "如果你想重新使用先前获得的socube嵌入特征，只需指定embedding ID，这是一个类似于 \"yyyymmdd-HHMMSS-xxx\"的字符串，位于embedding子目录下。",
+        "only_embedding": "这个选项提供给那些只想使用socube的特征嵌入功能的用户，使用后不会进行二聚体检测。"
     },
     "model_args": {
         "title": "模型训练配置",