
Commit f02625b

Merge pull request #1 from huggingface/add-pipeline-script
Add pipeline script
2 parents 8bfd6aa + 3708845 commit f02625b

File tree: 3 files changed (+179, -7 lines changed)


README.md

Lines changed: 16 additions & 5 deletions
````diff
@@ -3,20 +3,20 @@
 ## Install
 
 ```bash
-pip install sklearn umap sentence_transformers faiss-cpu plotly matplotlib datasets
+pip install scikit-learn umap-learn sentence_transformers faiss-cpu plotly matplotlib datasets
 ```
 
 ## Usage
 
 Run pipeline and visualize results:
 
 ```python
-from src.text_cluster import ClusterClassifier
+from src.text_clustering import ClusterClassifier
 from datasets import load_dataset
 
 SAMPLE = 100_000
 
-texts = load_dataset("HuggingFaceFW/FW-12-12-2023-CC-2023-06").select(range(SAMPLE))["content"]
+texts = load_dataset("HuggingFaceFW/FW-12-12-2023-CC-2023-06", split="train").select(range(SAMPLE))["content"]
 
 cc = ClusterClassifier(embed_device="mps")
 
@@ -32,7 +32,7 @@ cc.save("./cc_100k")
 
 Load classifier and run inference:
 ```python
-from src.text_cluster import ClusterClassifier
+from src.text_clustering import ClusterClassifier
 
 cc = ClusterClassifier(embed_device="mps")
 
@@ -44,4 +44,15 @@ cc.show()
 
 # classify new texts with k-nearest neighbour search
 cluster_labels, embeddings = cc.infer(some_texts, top_k=1)
-```
+```
+
+You can also run the pipeline using a script with:
+```bash
+# run a new pipeline
+python run_pipeline.py --mode run --save_load_path './cc_100k' --n_samples 100000 --build_hf_ds
+# load existing pipeline
+python run_pipeline.py --mode load --save_load_path './cc_100k' --build_hf_ds
+# inference mode on new texts from an input dataset
+python run_pipeline.py --mode infer --save_load_path './cc_100k' --n_samples <NB_INFERENCE_SAMPLES> --input_dataset <HF_DATA_FOR_INFERENCE>
+```
+The `build_hf_ds` flag builds HF datasets for the files and the clusters and pushes them to the Hub; these can be used directly in the FW visualization space. In `infer` mode, we push the clusters dataset by default.
````
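If you want to inspect the pushed datasets outside the visualization space, they load back like any other Hub dataset. A minimal sketch, assuming the defaults in `run_pipeline.py` (`--username loubnabnl`, `--save_load_path ./cc_100k`), which would give the hypothetical repo ids `loubnabnl/cc_100k` and `loubnabnl/cc_100k_clusters`; both are pushed as private, so a Hub token with access is needed:

```python
from datasets import load_dataset

# Hypothetical repo ids derived from the run_pipeline.py defaults
# (--username loubnabnl, --save_load_path ./cc_100k); adjust to your own setup.
files_ds = load_dataset("loubnabnl/cc_100k", split="train")
clusters_ds = load_dataset("loubnabnl/cc_100k_clusters", split="train")

print(files_ds)
print(clusters_ds)
```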

run_pipeline.py

Lines changed: 157 additions & 0 deletions
```python
import argparse
import textwrap

import pandas as pd
import numpy as np
from datasets import Dataset, load_dataset

from src.text_clustering import ClusterClassifier


def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument("--n_samples", type=int, default=100_000)
    parser.add_argument("--device", type=str, default="cuda")
    parser.add_argument("--save_load_path", type=str, default="./cc_100k")
    parser.add_argument(
        "--input_dataset", type=str, default="HuggingFaceFW/FW-12-12-2023-CC-2023-06"
    )
    parser.add_argument("--input_content", type=str, default="content")
    parser.add_argument(
        "--mode",
        choices=["run", "load", "infer"],
        default="run",
        help="Run the pipeline from scratch/load existing model to build hf datasets or to infer on new texts",
    )
    parser.add_argument(
        "--inference_repo_name",
        type=str,
        default="infer_fw_on_ultrachat",
        help="HF repo name for the clusters dataset in inference mode",
    )
    parser.add_argument(
        "--build_hf_ds",
        action="store_true",
        help="Builds HF datasets used for space visualization and pushes them to the hub",
    )
    parser.add_argument("--username", type=str, default="loubnabnl")
    return parser.parse_args()


def build_hf_data_clusters(cc, texts=None, labels=None):
    """
    Build an HF dataset containing information on each cluster.

    Args:
        cc: ClusterClassifier object.
        texts: list of texts used for inference mode.
        labels: list of cluster labels corresponding to the texts for inference mode.

    If `texts` and `labels` are not provided, the function will use the data available in `cc`
    to construct the dataset. Otherwise it will run in inference mode on texts.
    """
    cluster_data = []
    for cluster_id in cc.label2docs.keys():
        if cluster_id == -1:
            continue

        # inference mode
        if texts is not None and labels is not None:
            labels_array = np.array(labels)
            files_in_cluster = np.where(labels_array == cluster_id)[0]
            examples = [texts[doc_id] for doc_id in files_in_cluster]
        else:
            doc_ids = cc.label2docs[cluster_id]
            examples = [cc.texts[doc_id] for doc_id in doc_ids]

        cluster_info = {
            "cluster_id": cluster_id,
            "summary": cc.cluster_summaries[cluster_id],
            "examples": examples,
        }

        if not texts:
            cluster_info["position"] = cc.cluster_centers[cluster_id]

        cluster_data.append(cluster_info)

    return Dataset.from_pandas(pd.DataFrame(cluster_data))


def build_hf_data_files(cc):
    """
    Build an HF dataset containing information on each file and the cluster they belong to
    """

    df = pd.DataFrame(
        data={
            "X": cc.projections[:, 0],
            "Y": cc.projections[:, 1],
            "labels": cc.cluster_labels,
            "content_display": [textwrap.fill(txt[:1024], 64) for txt in cc.texts],
        }
    )
    return Dataset.from_pandas(df)

def build_and_push(cc, args):
    """Build HF files & clusters datasets and push them to the hub"""
    print("Building HF datasets...")
    ds = build_hf_data_files(cc)
    data_clusters = build_hf_data_clusters(cc)
    print(f"Files dataset {ds}\nClusters dataset {data_clusters}")

    repo_name = args.save_load_path.split("/")[-1]
    print(f"Pushing to the hub at {repo_name}...")
    ds.push_to_hub(f"{args.username}/{repo_name}", private=True)
    data_clusters.push_to_hub(f"{args.username}/{repo_name}_clusters", private=True)


def main():
    args = get_args()
    cc = ClusterClassifier(embed_device=args.device)

    if args.mode == "run":
        # Run a new pipeline on texts
        texts = load_dataset(args.input_dataset, split="train", token=True).select(
            range(args.n_samples)
        )[args.input_content]

        _, _, summaries = cc.fit(texts)
        print(f"10 example Summaries:\n{[e for e in summaries.values()][:10]}")

        cc.save(args.save_load_path)
        print(f"Saved clusters in {args.save_load_path}.")

        if args.build_hf_ds:
            build_and_push(cc, args)

    elif args.mode == "infer":
        # Run inference mode on texts using an existing pipeline
        cc.load(args.save_load_path)
        print(
            f"Running inference on {args.n_samples} samples of {args.input_dataset} using clusters in {args.save_load_path}."
        )
        texts = load_dataset(args.input_dataset, split="train", token=True).select(
            range(args.n_samples)
        )[args.input_content]
        cluster_labels, _ = cc.infer(texts, top_k=1)

        ds = build_hf_data_clusters(cc, texts, cluster_labels)
        target_repo = f"{args.username}/{args.inference_repo_name}"
        print(f"Pushing to hub at {target_repo}...")
        ds.push_to_hub(target_repo, private=True)

    else:
        # Load existing pipeline
        if args.build_hf_ds:
            cc.load(args.save_load_path)
            build_and_push(cc, args)
        else:
            print("Using mode=load but build_hf_ds is False, nothing to be done.")

    print("Done 🎉")


if __name__ == "__main__":
    main()
```
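To make the two code paths of `build_hf_data_clusters` concrete, here is a small self-contained sketch that calls it on a hypothetical stand-in object exposing only the attributes the function reads; in a real run you would pass a fitted `ClusterClassifier` instead:

```python
from types import SimpleNamespace

# Hypothetical stand-in for a fitted ClusterClassifier; only the attributes
# that build_hf_data_clusters() touches are filled in.
cc_stub = SimpleNamespace(
    label2docs={-1: [], 0: [0, 2], 1: [1]},
    texts=["a recipe for soup", "a football match report", "how to bake bread"],
    cluster_summaries={-1: "None", 0: "Cooking", 1: "Sports"},
    cluster_centers={0: [0.1, 0.4], 1: [0.8, 0.2]},
)

# Fit mode: examples come from cc.texts and each row carries the cluster position.
ds_fit = build_hf_data_clusters(cc_stub)

# Inference mode: examples come from new texts grouped by their inferred labels.
new_texts = ["grilled vegetables recipe", "basketball playoff recap"]
ds_infer = build_hf_data_clusters(cc_stub, texts=new_texts, labels=[0, 1])

print(ds_fit.column_names)    # ['cluster_id', 'summary', 'examples', 'position']
print(ds_infer.column_names)  # ['cluster_id', 'summary', 'examples']
```

Note that the `position` column only appears in fit mode (the `if not texts` branch), so the inference-mode dataset has one fewer column.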

src/text_clustering.py

Lines changed: 6 additions & 2 deletions
```diff
@@ -14,6 +14,7 @@
 import os
 import json
 from collections import Counter
+from tqdm import tqdm
 
 logging.basicConfig(level=logging.INFO)
 
@@ -127,7 +128,7 @@ def infer(self, texts, top_k=1):
 
         dist, neighbours = self.faiss_index.search(embeddings, top_k)
         inferred_labels = []
-        for i in range(embeddings.shape[0]):
+        for i in tqdm(range(embeddings.shape[0])):
             labels = [self.cluster_labels[doc] for doc in neighbours[i]]
             inferred_labels.append(Counter(labels).most_common(1)[0][0])
 
@@ -198,7 +199,10 @@ def save(self, folder):
 
         with open(f'{folder}/texts.json', 'w') as f:
             json.dump(self.texts, f)
-
+
+        with open(f"{folder}/mistral_prompt.txt", "w") as f:
+            f.write(DEFAULT_INSTRUCTION)
+
         if self.cluster_summaries is not None:
             with open(f'{folder}/cluster_summaries.json', 'w') as f:
                 json.dump(self.cluster_summaries, f)
```
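For context on the loop that now gets a tqdm progress bar: `infer` labels each new text with the most common cluster label among its `top_k` FAISS neighbours. A minimal standalone sketch of that majority vote, with made-up neighbour indices standing in for the output of `self.faiss_index.search`:

```python
from collections import Counter

# Made-up data: cluster labels of the fitted documents, and the indices of the
# top_k=3 nearest fitted documents for each of two new texts.
cluster_labels = [0, 0, 1, 1, 1]
neighbours = [[0, 1, 2], [2, 3, 4]]

inferred_labels = []
for row in neighbours:
    labels = [cluster_labels[doc] for doc in row]
    inferred_labels.append(Counter(labels).most_common(1)[0][0])  # majority label wins

print(inferred_labels)  # [0, 1]
```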
