Commit b543a32

Improve API for stability

1 parent: 9a80de0

5 files changed: +57 -44 lines

daswow/daswow_model.py

Lines changed: 1 addition & 2 deletions

```diff
@@ -12,6 +12,7 @@
 SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))
 MODELS_PATH = os.path.join(SCRIPT_DIR, "models")
 
+download_models_from_github_release()
 
 class Preprocessing:
     # init. set dataframe to be processed
@@ -90,8 +91,6 @@ def __init__(self, nb_path, models_path=MODELS_PATH):
         cf = CellFeatures()
         self.df = cf.get_cell_features_nb(nb_path)
 
-        download_models_from_github_release()
-
         self.preprocesser = Preprocessing(self.df)
         self.model = joblib.load(f"{models_path}/rf_code_scaled.pkl")
         self.tfidf = joblib.load(f"{models_path}/tfidf_vectorizer.pkl")
```
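
Note: moving download_models_from_github_release() from the predictor's __init__ to module level means the download check runs once, at first import, rather than on every object construction. A minimal sketch of that semantics (package path as in the diff; assumes the package is importable):

```python
# Module-level statements execute once per process, on first import only.
import daswow.daswow_model  # triggers download_models_from_github_release()
import daswow.daswow_model  # already cached in sys.modules; nothing re-runs
```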

daswow/model_download.py

Lines changed: 8 additions & 3 deletions

```diff
@@ -25,9 +25,13 @@ def download_models_from_github_release(repo_owner="secure-software-engineering"
     os.makedirs(download_path)
 
     # check if files already exist and remove from the list
+    assets_to_download = []
     for asset_name in asset_names:
-        if os.path.exists(os.path.join(download_path, asset_name)):
-            asset_names.remove(asset_name)
+        if not os.path.exists(os.path.join(download_path, asset_name)):
+            assets_to_download.append(asset_name)
+
+    if not assets_to_download:
+        return "Models already exist"
 
     # API endpoint to get release info
     url = f"https://api.github.com/repos/{repo_owner}/{repo_name}/releases/tags/{release_tag}"
@@ -41,7 +45,7 @@ def download_models_from_github_release(repo_owner="secure-software-engineering"
     for asset in release_data['assets']:
         print(asset['name']) # Add this line
 
-    for asset_name in asset_names:
+    for asset_name in assets_to_download:
        # Find the download URL of the asset
        asset_url = None
        for asset in release_data['assets']:
@@ -53,6 +57,7 @@ def download_models_from_github_release(repo_owner="secure-software-engineering"
            raise ValueError(f"Asset '{asset_name}' not found in the release.")
 
        # Download the file
+       print(f"Downloading model file: {asset['name']}")
        response = requests.get(asset_url, stream=True)
        response.raise_for_status()
```
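
Note: the substantive fix here is the loop. The old code called asset_names.remove() on the very list it was iterating; removing an item shifts the rest left while the loop index still advances, so the element after each removal is skipped and an already-cached file could still be re-downloaded. A minimal reproduction and the corrected pattern (file names taken from the diff; assume both files are already cached):

```python
import os

download_path = "models"  # assumed local cache directory
asset_names = ["rf_code_scaled.pkl", "tfidf_vectorizer.pkl"]

# Old approach: mutating the list during iteration skips elements.
names = list(asset_names)
for name in names:
    names.remove(name)  # the loop index jumps past the shifted item
print(names)  # ['tfidf_vectorizer.pkl'] -- survived the filter by accident

# New approach, as in the diff: collect the missing files into a fresh list.
assets_to_download = [
    name for name in asset_names
    if not os.path.exists(os.path.join(download_path, name))
]
```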

headergen/server.py

Lines changed: 46 additions & 37 deletions

```diff
@@ -8,7 +8,7 @@
 from fastapi.middleware.gzip import GZipMiddleware
 from fastapi.responses import JSONResponse
 
-from framework_models import get_high_level_phase
+from framework_models import get_high_level_phase, DASWOW_PHASES
 from headergen import headergen
 
 app = FastAPI()
```
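
Note: DASWOW_PHASES is used below as a plain dict lookup from low-level DASWOW tags to high-level phase names, with "Unknown" as the fallback. A hypothetical sketch of its shape (the real entries live in framework_models and are not part of this commit; keys and values here are invented):

```python
# Hypothetical stand-in for framework_models.DASWOW_PHASES.
DASWOW_PHASES = {
    "load_data": "Data Collection",   # invented entry
    "train_model": "Model Training",  # invented entry
}

# Unrecognized tags degrade to "Unknown" instead of raising KeyError:
tags = ["train_model", "never_seen_tag"]
ml_phases = sorted({DASWOW_PHASES.get(tag, "Unknown") for tag in tags})
# -> ['Model Training', 'Unknown']
```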

```diff
@@ -38,44 +38,53 @@
 @app.post("/get_analysis_notebook/")
 async def get_analysis(file: UploadFile = File(...)):
     """Upload a notebook file, analyze chunks of it, and add metadata."""
-    # Save the uploaded file to the uploads directory
-    file_location = f"{UPLOAD_DIR}/{file.filename}"
-
-    async with aiofiles.open(file_location, "wb") as f:
-        content = await file.read()
-        await f.write(content)
-
-    # Load the notebook
-    async with aiofiles.open(file_location, "r", encoding="utf-8") as file:
-        notebook_content = await file.read()
-    notebook = nbformat.reads(notebook_content, as_version=4)
-
-    # Perform analysis on the uploaded notebook
     try:
-        analysis_meta = headergen.start_headergen(
-            file_location, OUTPUT_DIR, debug_mode=True
-        )
+        # Save the uploaded file to the uploads directory
+        file_location = f"{UPLOAD_DIR}/{file.filename}"
+
+        async with aiofiles.open(file_location, "wb") as f:
+            content = await file.read()
+            await f.write(content)
+
+        # Load the notebook
+        async with aiofiles.open(file_location, "r", encoding="utf-8") as file:
+            notebook_content = await file.read()
+        notebook = nbformat.reads(notebook_content, as_version=4)
+
+        # Perform analysis on the uploaded notebook
+        try:
+            analysis_meta = headergen.start_headergen(
+                file_location, OUTPUT_DIR, debug_mode=True
+            )
+        except Exception as e:
+            logger.error(f"Analysis failed: {str(e)}")
+            raise HTTPException(status_code=500, detail=f"Analysis failed: {str(e)}")
+
+        # Prepare the analysis output in a chunked dictionary, mapping analysis to cells
+        analysis_output = {"cell_mapping": {}}
+
+        if "block_mapping" in analysis_meta:
+            for cell_index, cell_results in analysis_meta["block_mapping"].items():
+                # Get high-level phases and convert set to list
+                ml_phases = list(set([DASWOW_PHASES.get(tag, "Unknown") for tag in cell_results["dl_pipeline_tag"]]))
+                func_list = {k:{"doc_string":v, "arguments":[]} for k,v in cell_results.get("doc_string", {}).items()}
+
+                for call_args in cell_results["call_args"].values():
+                    for call, args in call_args.items():
+                        if call in func_list:
+                            func_list[call]["arguments"].append(args)
+
+                # Add to the chunked dictionary without modifying the content
+                analysis_output["cell_mapping"][cell_index] = {
+                    "ml_phase": ml_phases,  # Ensure ml_phases is a list, not a set
+                    "functions": func_list,
+                }
+
+        # Return the chunked analysis output without overwriting notebook content
+        return JSONResponse(content=analysis_output)
     except Exception as e:
-        logger.error(f"Analysis failed: {str(e)}")
-        raise HTTPException(status_code=500, detail=f"Analysis failed: {str(e)}")
-
-    # Prepare the analysis output in a chunked dictionary, mapping analysis to cells
-    analysis_output = {"cell_mapping": {}}
-
-    if "block_mapping" in analysis_meta:
-        for cell_index, cell_results in analysis_meta["block_mapping"].items():
-            # Get high-level phases and convert set to list
-            ml_phases = list(set(cell_results["dl_pipeline_tag"]))
-            func_list = cell_results.get("doc_string", "")
-
-            # Add to the chunked dictionary without modifying the content
-            analysis_output["cell_mapping"][cell_index] = {
-                "ml_phase": ml_phases,  # Ensure ml_phases is a list, not a set
-                "functions": func_list,
-            }
-
-    # Return the chunked analysis output without overwriting notebook content
-    return JSONResponse(content=analysis_output)
+        return JSONResponse(content={"error": str(e)}, status_code=500)
+
 
 
 if __name__ == "__main__":
```
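
Note: the whole handler body is now wrapped in a try/except that returns a structured JSON error instead of letting exceptions surface as an unhandled 500, and each cell's entry now carries both its high-level phases and the arguments observed for each documented call. A hypothetical client round-trip (host, port, and the response values are invented for illustration):

```python
# Hypothetical client call; host/port and response contents are assumptions.
import requests

with open("notebook.ipynb", "rb") as fh:
    resp = requests.post(
        "http://localhost:8000/get_analysis_notebook/",
        files={"file": ("notebook.ipynb", fh)},
    )

print(resp.json())
# Illustrative shape of the payload on success:
# {"cell_mapping": {"3": {
#     "ml_phase": ["Model Training"],
#     "functions": {"keras.Model.fit": {
#         "doc_string": "...",
#         "arguments": [{"epochs": "10"}]}}}}}
# On failure: {"error": "<message>"} with HTTP status 500.
```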

scripts/simple_example.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -14,7 +14,7 @@
 script_dir = os.path.abspath(os.path.dirname(__file__))
 
 # Careful, the out_path folder will be removed
-file_path = f"/mnt/Projects/PhD/Research/HeaderGen/git_sources/HeaderGen_github/.scrapy/test/test.py"
+file_path = f"/mnt/Projects/PhD/Research/HeaderGen/git_sources/headergen_githib/.scrapy/notebooks/01-keras-deep-learning-to-solve-titanic.ipynb"
 out_path = f"{script_dir}/results/"
```
setup.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -34,7 +34,7 @@ def package_files(directory):
 
 setuptools.setup(
     name="headergen",
-    version="2.0.0",
+    version="2.0.1",
     description="HeaderGen: Automated cell header generator",
     long_description=open("README.md").read(),
     long_description_content_type="text/markdown",
```
