
Commit 18caef8

Revert "Document sampling and fix local run" (microsoft#670)
Reverts microsoft#538. Temporarily reverts the merge until the prerelease branch is merged into dev (microsoft#556).
1 parent 6cd00fa commit 18caef8

12 files changed: +23, -154 lines


01_index.py

Lines changed: 1 addition & 5 deletions
@@ -1,6 +1,5 @@
 import json
 import argparse
-import sys
 
 from rag_experiment_accelerator.run.index import run
 from rag_experiment_accelerator.config.config import Config
@@ -18,12 +17,9 @@
 environment = Environment.from_env_or_keyvault()
 config = Config(environment, args.config_path, args.data_dir)
 
-# Are we running locally and not in AML? We do not want to run sampling on the distributed compute at this stage
-is_local = "01_index.py" in str(sys.argv[0])
-
 file_paths = get_all_file_paths(config.data_dir)
 for index_config in config.index_configs():
-    index_dict = run(environment, config, index_config, file_paths, is_local)
+    index_dict = run(environment, config, index_config, file_paths)
 
 with open(config.GENERATED_INDEX_NAMES_FILE_PATH, "w") as index_name:
     json.dump(index_dict, index_name, indent=4)
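For context, the local-run check that this commit removes from `01_index.py` keyed off the script path. A minimal standalone illustration (not the accelerator's code) of how that removed line behaved:

```python
import sys

# Illustration of the check removed by this revert: when the script is launched
# directly (python 01_index.py), sys.argv[0] contains the script name; an AML
# pipeline step presumably invokes it through a different entry point.
is_local = "01_index.py" in str(sys.argv[0])
print(f"Running locally: {is_local}")
```

After the revert, `run()` no longer receives this flag, so sampling is gated on configuration alone (see `rag_experiment_accelerator/run/index.py` below).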

README.md

Lines changed: 3 additions & 63 deletions
@@ -51,8 +51,6 @@ The custom loader resorts to the simpler 'prebuilt-layout' API model as a fallba
 
 1. **Multi-Lingual**: The tool supports language analyzers for linguistic support on individual languages and specialized (language-agnostic) analyzers for user-defined patterns on search indexes. For more information, see [Types of Analyzers](https://learn.microsoft.com/en-us/azure/search/search-analyzers#types-of-analyzers).
 
-1. **Sampling**: If you have a large dataset and/or want to speed up the experimentation, a sampling process is available to create a small but representative sample of the data for the percentage specified. The data will be clustered by content and a percentage of each cluster will be selected as part of the sample. Results obtained should be roughly indicative of the full dataset within a ~10% margin. Once an approach has been identified, running on the full dataset is recommended for accurate results.
-
 ## Products used
 
 - [Azure AI Search Service](https://learn.microsoft.com/en-us/azure/search/search-create-service-portal) (Note: [Semantic Search](https://learn.microsoft.com/en-us/azure/search/search-get-started-semantic?tabs=dotnet) is available in Azure AI Search, at Basic tier or higher.)
@@ -187,21 +185,21 @@ az deployment sub create --location uksouth --template-file infra/main.bicep \
 
 ## How to use
 
-To use the **RAG Experiment Accelerator** locally, follow these steps:
+To use the **RAG Experiment Accelerator**, follow these steps:
 
 1. Copy the provided `config.sample.json` file to a file named `config.json` and change any hyperparameters to tailor to your experiment.
 2. Run `01_index.py` (python 01_index.py) to create Azure AI Search indexes and load data into them.
 ```bash
 python 01_index.py
 -d "The directory holding the configuration files and data. Defaults to current working directory"
---data_dir "The directory holding the data. Defaults to data"
+-dd "The directory holding the data. Defaults to data"
 -cf "JSON config filename. Defaults to config.json"
 ```
 3. Run `02_qa_generation.py` (python 02_qa_generation.py) to generate question-answer pairs using Azure OpenAI.
 ```bash
 python 02_qa_generation.py
 -d "The directory holding the configuration files and data. Defaults to current working directory"
---data_dir "The directory holding the data. Defaults to data"
+-dd "The directory holding the data. Defaults to data"
 -cf "JSON config filename. Defaults to config.json"
 ```
 4. Run `03_querying.py` (python 03_querying.py) to query Azure AI Search to generate context, re-rank items in context, and get response from Azure OpenAI using the new context.
@@ -219,63 +217,6 @@ To use the **RAG Experiment Accelerator** locally, follow these steps:
 
 Alternatively, you can run the above steps (apart from `02_qa_generation.py`) using an Azure ML pipeline. To do so, follow [the guide here](./docs/azureml-pipeline.md).
 
-### Running with sampling
-
-Sampling will be run locally to create a small but representative slice of the data. This helps with rapid experimentation and keeps costs down. Results obtained should be roughly indicative of the full dataset within a ~10% margin. Once an approach has been identified, running on the full dataset is recommended for accurate results.
-
-**Note**: Sampling can only be run locally, at this stage it is not supported on a distributed AML compute cluster. So the process would be to run sampling locally and then use the generated sample dataset to run on AML.
-
-If you have a very large dataset and want to run a similar approach to sample the data, you can use the pyspark in-memory distributed implementation in the [Data Discovery Toolkit](https://github.com/microsoft/Data-Discovery-Toolkit) for [Microsoft Fabric](https://learn.microsoft.com/en-us/fabric/get-started/microsoft-fabric-overview) or [Azure Synapse Analytics](https://learn.microsoft.com/en-gb/azure/synapse-analytics/).
-
-#### Available sampling parameters in the config.json file
-
-```json
-"sampling": {
-    "sample_data": "Set to true to enable sampling",
-    "only_run_sampling": "If set to true, this will only run the sampling step and will not create an index or any subsequent steps, use this if you want to build a small sampled dataset to run in AML",
-    "sample_percentage": "Percentage of the document corpus to sample",
-    "optimum_k": "Set to 'auto' to automatically determine the optimum cluster number or set to a specific value e.g. 15",
-    "min_cluster": "Used by the automated optimum cluster process, this is the minimum number of clusters e.g. 2",
-    "max_cluster": "Used by the automated optimum cluster process, this is the maximum number of clusters e.g. 30",
-},
-```
-
-
-The sampling process will produce the following artifacts in the sampling directory:
-
-1. A directory named after the config value ```job_name``` containing the subset of files sampled, these can be specified as ```--data_dir``` argument when running the entire process on AML.
-2. A 2 dimensional scatter plot of the clustered files (by content) selected as the sampled dataset in the sampling folder.
-![images/all_cluster_predictions_cluster_number_5.jpg](images/all_cluster_predictions_cluster_number_5.jpg)
-3. A .cvs file of the entire dataset with cluster predictions named "all_cluster_predictions..." and a cvs file with the sampled cluster predictions named "sampled_cluster_predictions...". This can be used for further enriching the dataset, for example, creating a meaningful label per cluster and updates all record. See the [Heuristics classifier in the Data Discovery Toolkit as an example](https://github.com/microsoft/Data-Discovery-Toolkit/blob/main/walkthroughs/heuristics/standalone_text_heuristics.ipynb) or [Pixplotml for image data](https://github.com/microsoft/Data-Discovery-Toolkit?tab=readme-ov-file#using-pixplotml-to-rapidly-visualise-and-label-data-for-training).
-4. If the ```"optimum_k": auto``` config value is set to auto, the sampling process will attempt to set the optimum number of clusters automatically. This can be overridden if you know roughly how many broad buckets of content exist in your data. An elbow graph will be generated in the sampling folder.
-![Optimum k elbow graph](images/elbow_5.png)
-
-Two options exist for running sampling, namely:
-
-1. Run the entire process locally with sampling, including the index generation and subsequent steps
-2. Run only the sampling locally and then use the created sampled dataset to execute on AML
-
-#### Run the entire process locally
-
-Set the following values to run the indexing process locally:
-
-```json
-"sampling": {
-    "sample_data": true,
-    "only_run_sampling": false,
-    "sample_percentage": 10,
-    "optimum_k": auto,
-    "min_cluster": 2,
-    "max_cluster": 30
-},
-```
-
-#### Run only the sampling locally and the subsequent steps on AML
-
-If ```only_run_sampling```config value is set to true, this will only run the sampling step, no index will be created and any other subsequent steps will not executed. Set the ```--data_dir``` argument to directory created by the sampling process which will be:
-
-```artifacts/sampling/config.[job_name]``` and execute the [AML pipeline step.](docs/azureml-pipeline.md)
-
 # Description of configuration elements
 
 ```json
@@ -286,7 +227,6 @@ If ```only_run_sampling```config value is set to true, this will only run the sa
 "job_description": "You may provide a description for the current job run which describes in words what you are about to experiment with",
 "sampling": {
     "sample_data": "Set to true to enable sampling",
-    "only_run_sampling": "If set to true, this will only run the sampling step and will not create an index or any subsequent steps, use this if you want to build a small sampled dataset to run in AML",
     "sample_percentage": "Percentage of the document corpus to sample",
     "optimum_k": "Set to 'auto' to automatically determine the optimum cluster number or set to a specific value e.g. 15",
     "min_cluster": "Used by the automated optimum cluster process, this is the minimum number of clusters e.g. 2",

config.sample.json

Lines changed: 0 additions & 8 deletions
@@ -4,14 +4,6 @@
     "job_name": "",
     "job_description": "",
     "preprocess": false,
-    "sampling": {
-        "sample_data": true,
-        "only_run_sampling": true,
-        "sample_percentage": 5,
-        "optimum_k": "auto",
-        "min_cluster": 2,
-        "max_cluster": 30
-    },
     "chunking": {
         "chunk_size": [1000],
         "overlap_size": [200],
Binary file not shown.

images/elbow_5.png

-36.4 KB
Binary file not shown.

rag_experiment_accelerator/config/config.py

Lines changed: 1 addition & 2 deletions
@@ -91,7 +91,7 @@ def __init__(
         self.EF_CONSTRUCTIONS = config_json["ef_construction"]
         self.EF_SEARCHES = config_json["ef_search"]
         self.INDEX_NAME_PREFIX = config_json["index_name_prefix"]
-        self.EXPERIMENT_NAME = self.INDEX_NAME_PREFIX
+        self.EXPERIMENT_NAME = config_json["experiment_name"] or self.INDEX_NAME_PREFIX
         self.JOB_NAME = config_json["job_name"]
         self.JOB_DESCRIPTION = config_json["job_description"]
         self.SEARCH_VARIANTS = config_json["search_types"]
@@ -157,7 +157,6 @@ def __init__(
         self.SAMPLE_OPTIMUM_K = config_json["sampling"]["optimum_k"]
         self.SAMPLE_MIN_CLUSTER = config_json["sampling"]["min_cluster"]
         self.SAMPLE_MAX_CLUSTER = config_json["sampling"]["max_cluster"]
-        self.ONLY_RUN_SAMPLING = config_json["sampling"]["only_run_sampling"]
 
         # log all the configuration settings in debug mode
         for key, value in config_json.items():
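The one-line change in `Config.__init__` restores a fallback: because `or` returns its right-hand operand when the left side is falsy, an empty `experiment_name` falls back to the index name prefix. A minimal sketch in plain Python (not the accelerator's class) of the behaviour being restored:

```python
# Minimal sketch of the restored fallback: an empty "experiment_name" string is
# falsy, so the experiment name falls back to the index name prefix.
config_json = {"index_name_prefix": "idx-prefix", "experiment_name": ""}
experiment_name = config_json["experiment_name"] or config_json["index_name_prefix"]
print(experiment_name)  # -> "idx-prefix"

config_json["experiment_name"] = "my-experiment"
experiment_name = config_json["experiment_name"] or config_json["index_name_prefix"]
print(experiment_name)  # -> "my-experiment"
```

Note that `config_json["experiment_name"]` assumes the key is present; a missing key would raise `KeyError` rather than falling back.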

rag_experiment_accelerator/config/tests/data/config.json

Lines changed: 0 additions & 1 deletion
@@ -7,7 +7,6 @@
     "preprocess": false,
     "sampling": {
         "sample_data": false,
-        "only_run_sampling": false,
         "sample_percentage": 5,
         "optimum_k": "auto",
         "min_cluster": 2,

rag_experiment_accelerator/config/tests/test_config.py

Lines changed: 1 addition & 1 deletion
@@ -48,7 +48,7 @@ def test_config_init(mock_create_embedding_model):
     config.embedding_models = [embedding_model_1, embedding_model_2]
 
     assert config.INDEX_NAME_PREFIX == mock_config_data["index_name_prefix"]
-    assert config.EXPERIMENT_NAME == mock_config_data["index_name_prefix"]
+    assert config.EXPERIMENT_NAME == mock_config_data["experiment_name"]
     assert config.CHUNK_SIZES == mock_config_data["chunking"]["chunk_size"]
    assert config.OVERLAP_SIZES == mock_config_data["chunking"]["overlap_size"]
     assert config.CHUNKING_STRATEGY == mock_config_data["chunking_strategy"]

rag_experiment_accelerator/run/index.py

Lines changed: 1 addition & 6 deletions
@@ -30,7 +30,6 @@ def run(
     config: Config,
     index_config: IndexConfig,
     file_paths: list[str],
-    is_local: bool = False,
 ) -> dict[str]:
     """
     Runs the main experiment loop, which chunks and uploads data to Azure AI Search indexes based on the configuration specified in the Config class.
@@ -65,14 +64,10 @@
         config.AZURE_DOCUMENT_INTELLIGENCE_MODEL,
     )
 
-    if is_local and config.SAMPLE_DATA:
+    if config.SAMPLE_DATA:
         parser = load_parser()
         docs = cluster(docs, config, parser)
 
-        # If run with "ONLY_RUN_SAMPLING" we exit here after creating the sampled dataset for running in AML
-        if config.ONLY_RUN_SAMPLING:
-            return index_dict
-
     docs_ready_to_index = convert_docs_to_vector_db_records(docs)
     embed_chunks(index_config, pre_process, docs_ready_to_index)
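The net effect in `run()` is that sampling is now gated only on `config.SAMPLE_DATA`, with no `is_local` flag and no early `ONLY_RUN_SAMPLING` exit. A rough, self-contained sketch of that control flow using stand-in stubs (the stubs are not the accelerator's implementations):

```python
from types import SimpleNamespace

# Stand-in stubs, only to make the control flow runnable in isolation.
def load_parser():
    return "stub-spacy-parser"

def cluster(docs, config, parser):
    # Pretend to keep roughly 10% of the documents, as the sampler would.
    return docs[: max(1, len(docs) // 10)]

config = SimpleNamespace(SAMPLE_DATA=True)
docs = [f"doc-{i}" for i in range(20)]

if config.SAMPLE_DATA:  # post-revert: no is_local check
    parser = load_parser()
    docs = cluster(docs, config, parser)

# post-revert: no ONLY_RUN_SAMPLING early return; indexing always continues
print(f"{len(docs)} documents continue to chunk conversion and embedding")
```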

rag_experiment_accelerator/sampling/clustering.py

Lines changed: 11 additions & 47 deletions
@@ -1,4 +1,3 @@
-import os
 import warnings
 import numpy as np
 import matplotlib
@@ -11,7 +10,6 @@
 from umap import UMAP
 from scipy.spatial.distance import cdist
 from rag_experiment_accelerator.utils.logging import get_logger
-import shutil
 
 matplotlib.use("Agg")
 plt.style.use("ggplot")
@@ -47,17 +45,14 @@ def spacy_tokenizer(sentence, parser):
         str: The tokenized sentence.
 
     """
-
-    if not isinstance(sentence, str):
-        sentence = sentence["content"]
-
-    tokens = [
+    mytokens = parser(sentence)
+    mytokens = [
         word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_
-        for word in parser(sentence)
+        for word in mytokens
         if not word.is_stop and not word.is_punct
     ]
-    tokenized_sentence = " ".join([token for token in tokens])
-    return tokenized_sentence
+    mytokens = " ".join([i for i in mytokens])
+    return mytokens
 
 
 def determine_optimum_k_elbow(embeddings_2d, X, min_cluster, max_cluster, result_dir):
@@ -177,20 +172,18 @@ def chunk_dict_to_dataframe(all_chunks):
         all_chunks (list[dict]): A list of dictionaries where each dictionary contains a chunk and its corresponding text.
 
     Returns:
-        df (pandas.DataFrame): A DataFrame with three columns - 'chunk', 'text' and 'filename, where 'chunk' contains the chunks and 'text' contains the corresponding text and 'filename' the file name.
+        df (pandas.DataFrame): A DataFrame with two columns - 'chunk' and 'text', where 'chunk' contains the chunks and 'text' contains the corresponding text.
     """
 
     chunks = []
     text = []
-    filename = []
 
     for row in all_chunks:
         key, value = list(row.items())[0]
         chunks.append(key)
         text.append(value)
-        filename.append(value["metadata"]["source"])
 
-    df = pd.DataFrame({"chunk": chunks, "text": text, "filename": filename})
+    df = pd.DataFrame({"chunk": chunks, "text": text})
 
     return df
 
@@ -214,7 +207,6 @@ def cluster_kmeans(embeddings_2d, optimum_k, df, result_dir):
         - chunk (list): Chunk data from the DataFrame.
         - prediction (list): Cluster labels assigned by K-means.
         - prediction_values (list): Unique cluster labels.
-        - filenames (list): File names of the sampled data.
 
     """
     logger.info("Clustering chunks")
@@ -228,19 +220,15 @@
     )
 
     # Save
-    filenames = (
-        x
-    ) = y = text = processed_text = chunk = prediction = prediction_values = []
     x = embeddings_2d[:, 0].tolist()
     y = embeddings_2d[:, 1].tolist()
     text = df["text"].tolist()
     processed_text = df["processed_text"].tolist()
    chunk = df["chunk"].tolist()
     prediction = kmeans.labels_.tolist()
     prediction_values = list(set(kmeans.labels_.tolist()))
-    filenames = list(set(df["filename"].tolist()))
 
-    return x, y, text, processed_text, chunk, prediction, prediction_values, filenames
+    return x, y, text, processed_text, chunk, prediction, prediction_values
 
 
 def cluster(all_chunks, config, parser):
@@ -286,16 +274,9 @@ def cluster(all_chunks, config, parser):
     optimum_k = config.SAMPLE_OPTIMUM_K
 
     # Cluster
-    (
-        x,
-        y,
-        text,
-        processed_text,
-        chunk,
-        prediction,
-        prediction_values,
-        filenames,
-    ) = cluster_kmeans(embeddings_2d, optimum_k, df, config.sampling_output_dir)
+    x, y, text, processed_text, chunk, prediction, prediction_values = cluster_kmeans(
+        embeddings_2d, optimum_k, df, config.sampling_output_dir
+    )
 
     # Capture all predictions
     data = {"x": x, "y": y, "text": text, "prediction": prediction, "chunk": chunk}
@@ -333,21 +314,4 @@
     sampled_chunks = dataframe_to_chunk_dict(df_concat)
     logger.info(f"Sampled Document chunk length {len(sampled_chunks)}")
 
-    # Preserve the sampled files into directory
-    for filename in filenames:
-        try:
-            fn = os.path.basename(filename)
-            os.makedirs(
-                config.sampling_output_dir + "/" + config.JOB_NAME, exist_ok=True
-            )
-            shutil.copy2(
-                filename, config.sampling_output_dir + "/" + config.JOB_NAME + "/" + fn
-            )
-        except OSError as e:
-            logger.info(f"file {filename} could not be copied with metadata {e}")
-            continue
-    logger.info(
-        f"Sampled Documents have been copied to {config.sampling_output_dir + '/' + config.JOB_NAME + '/'}"
-    )
-
     return sampled_chunks
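For reference, assembling the `+` lines of the tokenizer hunk above gives the restored function (docstring omitted). The usage lines and the `en_core_web_sm` model name are illustrative assumptions; note that the restored version only handles plain strings, since the `isinstance` branch for dict-shaped chunks is removed:

```python
import spacy

def spacy_tokenizer(sentence, parser):
    # Lemmatize, lowercase, and drop stop words and punctuation.
    mytokens = parser(sentence)
    mytokens = [
        word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_
        for word in mytokens
        if not word.is_stop and not word.is_punct
    ]
    mytokens = " ".join([i for i in mytokens])
    return mytokens

# Illustrative usage; assumes the small English spaCy model is installed.
parser = spacy.load("en_core_web_sm")
print(spacy_tokenizer("The clusters were sampled from the documents.", parser))
```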
