
Commit cb969d6

Add logging, pre-commit hooks, and transactional clearing of the db before insertion
1 parent 89010a2 commit cb969d6

18 files changed, +317 −132 lines changed

.env

Lines changed: 1 addition & 1 deletion
@@ -2,4 +2,4 @@ CLICKHOUSE_HOST=localhost
 CLICKHOUSE_PORT=9000
 CLICKHOUSE_USER=default
 CLICKHOUSE_PASSWORD=
-CLICKHOUSE_DATABASE=test_db
+CLICKHOUSE_DATABASE=test_db

.gitignore

Lines changed: 1 addition & 1 deletion
@@ -4,4 +4,4 @@ venv
 analytics-obfuscated-faked.csv
 __pycache__/
 remove_pycache.sh
-tasks.txt
+tasks.txt

.pre-commit-config.yaml

Lines changed: 32 additions & 0 deletions
@@ -0,0 +1,32 @@
+# .pre-commit-config.yaml
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.5.0
+    hooks:
+      - id: trailing-whitespace
+      - id: end-of-file-fixer
+      - id: check-yaml
+      - id: check-added-large-files
+
+  - repo: https://github.com/psf/black
+    rev: 24.4.2
+    hooks:
+      - id: black
+        args: ['--line-length=79']
+
+  - repo: https://github.com/charliermarsh/ruff-pre-commit
+    rev: v0.4.7
+    hooks:
+      - id: ruff
+        args: ['--fix', '--line-length=79']
+
+  - repo: https://github.com/PyCQA/flake8
+    rev: 7.0.0
+    hooks:
+      - id: flake8
+
+  - repo: https://github.com/pre-commit/mirrors-mypy
+    rev: v1.10.0
+    hooks:
+      - id: mypy
+        args: ["--ignore-missing-imports"]
app.py

Lines changed: 24 additions & 12 deletions
@@ -6,6 +6,7 @@
 
 app = FastAPI()
 
+
 class TimeFrameEnum(str, Enum):
     next_1_hour = "next_1_hour"
     next_4_hour = "next_4_hour"
@@ -16,33 +17,43 @@ class TimeFrameEnum(str, Enum):
     next_168_hour = "next_168_hour"
 
 
-
 @app.get("/predict/")
 def get_predictions(pid: str, timeframe: TimeFrameEnum):
     """Get predictions for the specified `pid` and `timeframe`"""
 
-    project_exists_query = f"SELECT pid FROM predictions WHERE pid = '{pid}' LIMIT 1"
+    project_exists_query = (
+        f"SELECT pid FROM predictions WHERE pid = '{pid}' LIMIT 1"
+    )
     project = clickhouse_client.execute_query(project_exists_query)
-
+
     if not project:
         raise HTTPException(status_code=404, detail="Project does not exist.")
-
-    prediction_query = f"SELECT {timeframe} FROM predictions WHERE pid = '{pid}'"
+
+    prediction_query = (
+        f"SELECT {timeframe} FROM predictions WHERE pid = '{pid}'"
+    )
     result = clickhouse_client.execute_query(prediction_query)
-
+
     if not result or not result[0][0]:
-        raise HTTPException(status_code=404, detail="Data not found. Prediction is not available.")
-
+        raise HTTPException(
+            status_code=404,
+            detail="Data not found. Prediction is not available.",
+        )
+
     prediction_data = json.loads(result[0][0])
-
+
     if not prediction_data:
-        raise HTTPException(status_code=404, detail="Data not found. Prediction is not available.")
-
+        raise HTTPException(
+            status_code=404,
+            detail="Data not found. Prediction is not available.",
+        )
+
     return {timeframe: prediction_data}
-
+
 
 if __name__ == "__main__":
     import uvicorn
+
     uvicorn.run(app, host="0.0.0.0", port=8000)
 
 
@@ -52,6 +63,7 @@ def trigger_training():
     run_training_module.delay()
     return {"message": "Training module triggered"}
 
+
 @app.post("/run_prediction/")
 def trigger_prediction():
     """Trigger the prediction module via Celery"""

celery_tasks/celery_config.py

Lines changed: 6 additions & 3 deletions
@@ -3,15 +3,18 @@
 celery_app = Celery(
     "swetrix-ai-celery",
     broker="redis://localhost:6379/0",
-    backend="redis://localhost:6379/0"
+    backend="redis://localhost:6379/0",
 )
 
 celery_app.conf.update(
     result_expires=3600,
     task_serializer="json",
     result_serializer="json",
-    accept_content=["json"]
+    accept_content=["json"],
+    task_soft_time_limit=3600,  # 1 hour soft time limit
+    task_time_limit=3700,  # 1 hour 10 minutes hard time limit
 )
 
 from celery_tasks.tasks import *
-# celery_app.autodiscover_tasks(['celery_tasks'])
+
+# celery_app.autodiscover_tasks(['celery_tasks'])
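
With the new task_soft_time_limit/task_time_limit settings, a task receives SoftTimeLimitExceeded after 3600 s and is killed outright at 3700 s. A minimal sketch of how a task could react to the soft limit; the train() call and the cleanup behaviour are assumptions for illustration, not part of this commit.

    from celery.exceptions import SoftTimeLimitExceeded

    from celery_tasks.celery_config import celery_app


    @celery_app.task
    def run_training_module_safe():
        # Hypothetical variant of run_training_module: the soft limit fires inside
        # the task, leaving ~100 s before the hard limit kills the worker process.
        try:
            train()  # assumed long-running training entry point
        except SoftTimeLimitExceeded:
            print("Training exceeded the soft time limit; aborting cleanly")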

celery_tasks/tasks.py

Lines changed: 1 addition & 1 deletion
@@ -10,4 +10,4 @@ def run_training_module():
 
 @celery_app.task
 def run_prediction_module():
-    predict()
+    predict()

clickhouse/client.py

Lines changed: 15 additions & 10 deletions
@@ -2,29 +2,34 @@
 from dotenv import load_dotenv
 from clickhouse_driver import Client
 
+
 class ClickHouseClient:
     def __init__(self):
         load_dotenv()
-        self.host = os.getenv('CLICKHOUSE_HOST')
-        self.port = os.getenv('CLICKHOUSE_PORT')
-        self.user = os.getenv('CLICKHOUSE_USER')
-        self.password = os.getenv('CLICKHOUSE_PASSWORD')
-        self.database = os.getenv('CLICKHOUSE_DATABASE')
-
+        self.host = os.getenv("CLICKHOUSE_HOST")
+        self.port = os.getenv("CLICKHOUSE_PORT")
+        self.user = os.getenv("CLICKHOUSE_USER")
+        self.password = os.getenv("CLICKHOUSE_PASSWORD")
+        self.database = os.getenv("CLICKHOUSE_DATABASE")
+
         self.client = Client(
             host=self.host,
             port=self.port,
             user=self.user,
             password=self.password,
-            database=self.database
+            database=self.database,
         )
-
+
     def execute_query(self, query: str):
         return self.client.execute(query)
-
+
     def insert_data(self, table: str, data: list):
         self.client.execute(f"INSERT INTO {table} VALUES", data)
 
+    def drop_all_data_from_table(self, table_name: str):
+        """Drop all data from the table as we require to store only one record in the meantime"""
+        query = f"TRUNCATE TABLE {table_name}"
+        self.execute_query(query)
 
-clickhouse_client = ClickHouseClient()
 
+clickhouse_client = ClickHouseClient()
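
The added drop_all_data_from_table helper is simply a TRUNCATE TABLE statement issued through the shared client, so clearing and re-checking the predictions table looks roughly like this (assuming the table created by migrations_tables.py exists):

    from clickhouse.client import clickhouse_client

    # Wipe the single-snapshot table before a fresh insert.
    clickhouse_client.drop_all_data_from_table("predictions")

    # execute_query returns a list of tuples, e.g. [(0,)] once the table is empty.
    print(clickhouse_client.execute_query("SELECT count() FROM predictions"))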

clickhouse/migrations_tables.py

Lines changed: 6 additions & 5 deletions
@@ -1,18 +1,19 @@
 from client import ClickHouseClient
 
+
 def create_tables():
     client = ClickHouseClient()
-
+
     training_tmp_query = """
     CREATE TABLE IF NOT EXISTS training_tmp (
         cat_features Array(String),
         cols Array(String),
         next_hrs Array(String),
-        model String
+        model String
     ) ENGINE = MergeTree()
     ORDER BY tuple()
     """
-
+
     predictions_query = """
     CREATE TABLE IF NOT EXISTS predictions (
         pid String,
@@ -26,12 +27,12 @@ def create_tables():
     ) ENGINE = MergeTree()
     ORDER BY pid
     """
-
+
     client.execute_query(training_tmp_query)
     client.execute_query(predictions_query)
 
 
 client = ClickHouseClient()
 
 if __name__ == "__main__":
-    create_tables()
+    create_tables()
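
Both CREATE TABLE statements use IF NOT EXISTS, so create_tables() is safe to run repeatedly. The module is written to be executed from inside the clickhouse/ directory (it imports client without a package prefix); the equivalent programmatic call is simply:

    # Run from inside the clickhouse/ directory, matching the module's own import style.
    from migrations_tables import create_tables

    create_tables()  # idempotent: IF NOT EXISTS makes repeated runs a no-op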

clickhouse/utils.py

Lines changed: 18 additions & 12 deletions
@@ -1,46 +1,52 @@
 import base64
 import pickle
 import json
-from data.serialisation import serialise_predictions, serialise_data_for_clickhouse
+from data.serialisation import (
+    serialise_predictions,
+    serialise_data_for_clickhouse,
+)
 from clickhouse.client import clickhouse_client
 
 
 """
 Clickhouse does not support the pickled objects yet, and it is a problem.
-There is a solution to use `base64` encoding, store the model as a string and then decode it and use as a pickle object
+There is a solution to use `base64` encoding, store the model as a string and then decode it and use as a pickle object
 
-Though it is a subject of discussion in the future. I personally prefer to store the model in S3 bucket, but this will require an
-additional time for development which we do not have to test the model completely in production.
+Though it is a subject of discussion in the future. I personally prefer to store the model in S3 bucket, but this will require an
+additional time for development which we do not have, as the priority is to test the model in production.
"""
 
-def serialize_model(file_path):
-    with open(file_path, 'rb') as f:
-        pickled_model = f.read()
-    base64_model = base64.b64encode(pickled_model).decode('utf-8')
+
+def serialize_model(model):
+    pickled_model = pickle.dumps(model)
+    base64_model = base64.b64encode(pickled_model).decode("utf-8")
     return base64_model
 
 
 def deserialize_model(base64_model):
-    pickled_model = base64.b64decode(base64_model.encode('utf-8'))
+    pickled_model = base64.b64decode(base64_model.encode("utf-8"))
     model = pickle.loads(pickled_model)
     return model
 
 
 def fetch_model():
-    result = clickhouse_client.execute_query("SELECT model FROM training_tmp LIMIT 1")
+    """Get the serialized model from the database for predictions"""
+    result = clickhouse_client.execute_query("SELECT model FROM training_tmp")
     if result:
         serialized_model = result[0][0]
         model = deserialize_model(serialized_model)
         return model
     else:
         print("No model found")
         return None
-
+
 
 def insert_predictions(predictions):
     """Insert serialised JSON data into the predictions table"""
     predictions_data = json.loads(predictions)
     processed_data = serialise_predictions(predictions_data)
     serialized_data = serialise_data_for_clickhouse(processed_data)
-    clickhouse_client.insert_data('predictions', serialized_data)
 
+    # Drop previous(not relevant) data before the insertion of new predictions
+    clickhouse_client.drop_all_data_from_table("predictions")
+    clickhouse_client.insert_data("predictions", serialized_data)
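
Since serialize_model now pickles the in-memory model and base64-encodes it for storage in a String column, the round trip through deserialize_model must be lossless. A quick sketch with a throwaway object standing in for the trained model:

    from clickhouse.utils import serialize_model, deserialize_model

    model = {"weights": [0.1, 0.2, 0.3]}  # any picklable object stands in for the real model

    encoded = serialize_model(model)       # pickle -> base64 string, storable in ClickHouse
    restored = deserialize_model(encoded)  # base64 string -> pickle -> original object

    assert restored == model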

constants.py

Lines changed: 23 additions & 3 deletions
@@ -1,3 +1,23 @@
-columns = ("psid","sid","pid","pg","prev","dv","br","os","lc","ref","so","me","ca","cc","rg","ct","sdur","unique","created")
-agg_cols = ["year","month","day","day_of_week","hour","pid"]
-date_col = 'created'
+columns = (
+    "psid",
+    "sid",
+    "pid",
+    "pg",
+    "prev",
+    "dv",
+    "br",
+    "os",
+    "lc",
+    "ref",
+    "so",
+    "me",
+    "ca",
+    "cc",
+    "rg",
+    "ct",
+    "sdur",
+    "unique",
+    "created",
+)
+agg_cols = ["year", "month", "day", "day_of_week", "hour", "pid"]
+date_col = "created"
