Skip to content

Commit ceaca3a

Browse files
committed
rearrange code
1 parent c9b28d4 commit ceaca3a

File tree

2 files changed

+42
-36
lines changed

2 files changed

+42
-36
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,4 +55,4 @@ python run_pipeline.py --mode load --save_load_path './cc_100k' --build_hf_ds
5555
# inference mode on new texts from an input dataset
5656
python run_pipeline.py --mode infer --save_load_path './cc_100k' --n_samples <NB_INFERENCE_SAMPLES> --input_dataset <HF_DATA_FOR_INFERENCE>
5757
```
58-
The `build_hf_ds` flag builds and pushes HF datasets for the files and clusters that can be directky used in the FW visualization space (we push the clusters dataset to the hub by default).
58+
The `build_hf_ds` flag builds and pushes HF datasets, for the files and clusters, that can be directly used in the FW visualization space. In `infer` mode, we push the clusters dataset by default.

run_pipeline.py

Lines changed: 41 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,7 @@ def get_args():
1212
parser = argparse.ArgumentParser()
1313
parser.add_argument("--n_samples", type=int, default=100_000)
1414
parser.add_argument("--device", type=str, default="cuda")
15-
parser.add_argument(
16-
"--save_load_path", type=str, default="./fw_afaik_topics_100k_guard_s"
17-
)
15+
parser.add_argument("--save_load_path", type=str, default="./cc_100k")
1816
parser.add_argument(
1917
"--input_dataset", type=str, default="HuggingFaceFW/FW-12-12-2023-CC-2023-06"
2018
)
@@ -26,7 +24,9 @@ def get_args():
2624
help="Run the pipeline from scratch/load existing model to build hf datasets or to infer on new texts",
2725
)
2826
parser.add_argument(
29-
"--inference_repo_name", type=str, default="infer_fw_on_ultrachat",
27+
"--inference_repo_name",
28+
type=str,
29+
default="infer_fw_on_ultrachat",
3030
help="HF repo name for the clusters dataset in inference mode",
3131
)
3232
parser.add_argument(
@@ -47,7 +47,7 @@ def build_hf_data_clusters(cc, texts=None, labels=None):
4747
texts: list of texts used for inference mode.
4848
labels: list of cluster labels corresponding to the texts for inference mode.
4949
50-
If `texts` and `labels` are not provided, the function will use the data available in `cc`
50+
If `texts` and `labels` are not provided, the function will use the data available in `cc`
5151
to construct the dataset. Otherwise it will run in inference mode on texts.
5252
"""
5353
cluster_data = []
@@ -96,6 +96,19 @@ def build_hf_data_files(cc):
9696
return ds
9797

9898

99+
def build_and_push(cc, args):
100+
"""Build HF files & clusters datasets and push them to the hub"""
101+
print("Building HF datasets...")
102+
ds = build_hf_data_clusters(cc)
103+
data_clusters = build_hf_data_files(cc)
104+
print(f"Files dataset {ds}\nClusters dataset {data_clusters}")
105+
106+
repo_name = args.save_load_path.split("/")[-1]
107+
print(f"Pushing to the hub at {repo_name}...")
108+
ds.push_to_hub(f"{args.username}/{repo_name}", private=True)
109+
data_clusters.push_to_hub(f"{args.username}/{repo_name}_clusters", private=True)
110+
111+
99112
def main():
100113
args = get_args()
101114
cc = ClusterClassifier(embed_device=args.device)
@@ -112,41 +125,34 @@ def main():
112125
cc.save(args.save_load_path)
113126
print(f"Saved clusters in {args.save_load_path}.")
114127

128+
if args.build_hf_ds:
129+
build_and_push(cc, args)
130+
115131
elif args.mode == "infer":
116-
cc.load(args.save_load_path)
117-
# run inference mode on texts using an existing pipeline
118-
cc.load(args.save_load_path)
119-
print(
120-
f"Running inference on {args.n_samples} samples of {args.input_dataset} using clusters in {args.save_load_path}"
121-
)
122-
texts = load_dataset(args.input_dataset, split="train", token=True).select(
123-
range(args.n_samples)
124-
)[args.input_content]
125-
cluster_labels, _ = cc.infer(texts, top_k=1)
126-
ds = build_hf_data_clusters(cc, texts, cluster_labels)
127-
print("Pushing to hub...")
128-
ds.push_to_hub(f"{args.username}/{args.inference_repo_name}", private=True)
132+
# Run inference mode on texts using an existing pipeline
133+
cc.load(args.save_load_path)
134+
print(
135+
f"Running inference on {args.n_samples} samples of {args.input_dataset} using clusters in {args.save_load_path}."
136+
)
137+
texts = load_dataset(args.input_dataset, split="train", token=True).select(
138+
range(args.n_samples)
139+
)[args.input_content]
140+
cluster_labels, _ = cc.infer(texts, top_k=1)
129141

130-
else:
131-
if not args.build_hf_ds:
132-
print("Using mode=load but build_hf_ds is False, nothing to be done.")
142+
ds = build_hf_data_clusters(cc, texts, cluster_labels)
143+
target_repo = f"{args.username}/{args.inference_repo_name}"
144+
print(f"Pushing to hub at {target_repo}...")
145+
ds.push_to_hub(f"{target_repo}", private=True)
133146

134-
if args.build_hf_ds:
135-
print("Building HF clustering datasets...")
136-
if args.mode == "load":
147+
else:
148+
# Load existing pipeline
149+
if args.build_hf_ds:
137150
cc.load(args.save_load_path)
138-
ds = build_hf_data_clusters(cc)
139-
data_clusters = build_hf_data_files(cc)
140-
print(f"Files dataset {ds}\nClusters dataset {data_clusters}")
141-
142-
repo_name = args.save_load_path.split("/")[-1]
143-
print(f"Pushing to the hub at {repo_name}...")
144-
ds.push_to_hub(f"{args.username}/{repo_name}", private=True)
145-
data_clusters.push_to_hub(
146-
f"{args.username}/{repo_name}_clusters", private=True
147-
)
151+
build_and_push(cc, args)
152+
else:
153+
print("Using mode=load but build_hf_ds is False, nothing to be done.")
148154

149-
print("Done 🎉!")
155+
print("Done 🎉")
150156

151157

152158
if __name__ == "__main__":

0 commit comments

Comments
 (0)