feat: update data.py and data_v3, which contains an unfinished function

james397520 · Oct 11, 2023 · 8b54bb8 · 8b54bb8
1 parent 1a68d1e
commit 8b54bb8
Show file tree

Hide file tree

Showing 2 changed files with 98 additions and 63 deletions.
diff --git a/data.py b/data.py
@@ -5,89 +5,63 @@
 # import zipcodetw
 from sklearn.preprocessing import StandardScaler, MinMaxScaler
 from torch.utils.data import Dataset, DataLoader
+import pickle
 
 
 
-# # Custom dataset
-# class HousePriceDataset(Dataset):
-#     def __init__(self, csv_file, transform=None):
-#         # Load data
-#         data = pd.read_csv(csv_file)
-#         # district_names = pd.read_csv('data/district_names.csv')
-#         # name_code = pd.get_dummies(district_names['行政區名稱'])
-#         # pd.DataFrame(name_code)
-#         # print(name_code['行政區名稱'].tolist())
-
-#         # Drop the '備註' column
-#         # data = data.drop(columns=['備註'])
-#         # Separate features and target variable
-
-#         self.X = data.iloc[:, :-1].values
-#         print(self.X.shape)
-#         print(self.X[0])
-#         self.ground_size = self.X[:,4].reshape(-1, 1)
-#         self.floor = self.X[:,6].reshape(-1, 1)
-#         self.all_floor = self.X[:,7].reshape(-1, 1)
-#         self.age = self.X[:,10].reshape(-1, 1)
-#         self.house_size = self.X[:,11].reshape(-1, 1)
-#         self.parking_size = self.X[:,12].reshape(-1, 1)
-#         self.parking_cnt = self.X[:,13].reshape(-1, 1)
-#         self.lng = self.X[:,14].reshape(-1, 1)
-#         self.lat = self.X[:,15].reshape(-1, 1)
-#         self.main_size = self.X[:,17].reshape(-1, 1)
-#         self.balcony = self.X[:,18].reshape(-1, 1)
-#         self.ancillar_size = self.X[:,19].reshape(-1, 1)
-
-
-
-#         self.y = data.iloc[:, -1].values.reshape(-1, 1)
-#         scaler_x = MinMaxScaler()
-#         scaler_y = MinMaxScaler()
-#         self.ground_size = scaler_x.fit_transform(self.X[:,4].reshape(-1, 1))
-#         # self.X = scaler_x.fit_transform(self.X)
-#         self.ground_size = scaler_x.fit_transform(self.X[:,4].reshape(-1, 1))
-#         self.y = scaler_y.fit_transform(self.y).flatten()
-
-#         self.transform = transform
-
-#     def __len__(self):
-#         return len(self.y)
-
-#     def __getitem__(self, idx):
-#         sample = torch.tensor(self.X[idx], dtype=torch.float32), torch.tensor(self.y[idx], dtype=torch.float32)
-
-#         if self.transform:
-#             sample = self.transform(sample)
-
-#         return sample
-
 # Z-Score Normalization Function
-def z_score_normalize(df, columns):
+def z_score_normalize(df, columns, save_scaler_path=None):
+    """Scale the given columns of ``df`` with a newly fitted StandardScaler.
+
+    The scaler is fitted on ``df[columns]`` and the scaled values are written
+    back into those same columns of ``df``; the same ``df`` object is returned.
+
+    Args:
+        df: Table indexable by ``columns`` (callers in this file pass a
+            pandas DataFrame).
+        columns: Column labels to scale.
+        save_scaler_path: Optional file path. When truthy, the fitted scaler
+            is pickled to this path; the file is opened in ``'wb'`` mode, so
+            any existing content at that path is replaced.
+
+    Returns:
+        ``df`` with the selected columns replaced by their scaled values.
+    """
     scaler = StandardScaler()
     df[columns] = scaler.fit_transform(df[columns])
+
+    if save_scaler_path:
+        with open(save_scaler_path, 'wb') as f:
+            pickle.dump(scaler, f)
+
     return df
 
 # Min-Max Normalization Function
-def min_max_normalize(df, columns):
+def min_max_normalize(df, columns, save_scaler_path=None):
+    """Scale the given columns of ``df`` with a newly fitted MinMaxScaler.
+
+    The scaler is fitted on ``df[columns]`` and the scaled values are written
+    back into those same columns of ``df``; the same ``df`` object is returned.
+
+    Args:
+        df: Table indexable by ``columns`` (callers in this file pass a
+            pandas DataFrame).
+        columns: Column labels to scale.
+        save_scaler_path: Optional file path. When truthy, the fitted scaler
+            is pickled to this path; the file is opened in ``'wb'`` mode, so
+            any existing content at that path is replaced.
+
+    Returns:
+        ``df`` with the selected columns replaced by their scaled values.
+    """
     scaler = MinMaxScaler()
     df[columns] = scaler.fit_transform(df[columns])
+
+    if save_scaler_path:
+        with open(save_scaler_path, 'wb') as f:
+            pickle.dump(scaler, f)
+
+    return df
+
+# Denormalization Function for Z-Score
+def z_score_denormalize(df, columns, scaler_path):
+    """Undo a previous scaling of ``df[columns]`` using a pickled scaler.
+
+    Loads the object pickled at ``scaler_path`` and writes the result of its
+    ``inverse_transform`` back into ``df[columns]``; the same ``df`` object is
+    returned.
+
+    Args:
+        df: Table indexable by ``columns``.
+        columns: Column labels to map back to the original scale.
+        scaler_path: Path of the pickle file holding the fitted scaler.
+
+    Returns:
+        ``df`` with the selected columns replaced by the inverse-transformed
+        values.
+    """
+    with open(scaler_path, 'rb') as f:
+        # NOTE(review): the type of the loaded object is not checked, so this
+        # applies whichever scaler was pickled, z-score or otherwise.
+        # pickle.load can run arbitrary code; only load trusted files.
+        scaler = pickle.load(f)
+
+    df[columns] = scaler.inverse_transform(df[columns])
+    return df
+
+# Denormalization Function for Min-Max
+def min_max_denormalize(df, columns, scaler_path):
+    """Undo a previous scaling of ``df[columns]`` using a pickled scaler.
+
+    Loads the object pickled at ``scaler_path`` and writes the result of its
+    ``inverse_transform`` back into ``df[columns]``; the same ``df`` object is
+    returned.
+
+    Args:
+        df: Table indexable by ``columns``.
+        columns: Column labels to map back to the original scale.
+        scaler_path: Path of the pickle file holding the fitted scaler.
+
+    Returns:
+        ``df`` with the selected columns replaced by the inverse-transformed
+        values.
+    """
+    with open(scaler_path, 'rb') as f:
+        # NOTE(review): the type of the loaded object is not checked, so this
+        # applies whichever scaler was pickled, min-max or otherwise.
+        # pickle.load can run arbitrary code; only load trusted files.
+        scaler = pickle.load(f)
+
+    df[columns] = scaler.inverse_transform(df[columns])
     return df
 
 
 
 # Custom Dataset Class with Normalization Option
-class HousePriceDataset(Dataset):
+class HousePriceTrainDataset(Dataset):
     def __init__(self, dataframe, feature_columns, target_column, normalize_columns=None):
         self.dataframe = dataframe.copy()  # Creating a copy to avoid modifying the original dataframe
 
         # Applying the specified normalization methods to the specified columns
         if normalize_columns:
             for column, method in normalize_columns.items():
                 if method == 'z-score':
-                    self.dataframe = z_score_normalize(self.dataframe, [column])
+                    self.dataframe = z_score_normalize(self.dataframe, [column],save_scaler_path="z_score_normalize_data.pkl")
                 elif method == 'min-max':
-                    self.dataframe = min_max_normalize(self.dataframe, [column])
-                    
+                    self.dataframe = min_max_normalize(self.dataframe, [column],save_scaler_path="min_max_normalize_data.pkl")
+
         self.features = self.dataframe[feature_columns].values
         self.target = self.dataframe[target_column].values
 
@@ -100,6 +74,30 @@ def __getitem__(self, idx):
         return sample
 
 
+class HousePriceTestDataset(Dataset):
+    """Dataset that yields feature tensors only (no target column).
+
+    Args:
+        dataframe: Source table; it is copied, so the caller's object is not
+            modified.
+        feature_columns: Column labels whose values become the features.
+        normalize_columns: Optional mapping of column label to method name.
+            ``'z-score'`` and ``'min-max'`` are handled; any other method
+            name is silently ignored.
+
+    NOTE(review): each listed column is scaled by calling z_score_normalize /
+    min_max_normalize without ``save_scaler_path``. Those helpers fit a new
+    scaler on the frame they are given, so these features are scaled with
+    statistics of this dataframe, not with a scaler saved during training.
+    Confirm whether loading the pickled training scaler was intended.
+    """
+    def __init__(self, dataframe, feature_columns, normalize_columns=None):
+        self.dataframe = dataframe.copy()  # Creating a copy to avoid modifying the original dataframe
+
+        # Applying the specified normalization methods to the specified columns
+        if normalize_columns:
+            for column, method in normalize_columns.items():
+                if method == 'z-score':
+                    self.dataframe = z_score_normalize(self.dataframe, [column])
+                elif method == 'min-max':
+                    self.dataframe = min_max_normalize(self.dataframe, [column])
+
+        self.features = self.dataframe[feature_columns].values
+
+
+    def __len__(self):
+        # One sample per row of the (copied) dataframe.
+        return len(self.dataframe)
+
+    def __getitem__(self, idx):
+        # Returns a dict with the single key 'features' holding row ``idx``
+        # of the feature array as a float32 tensor.
+        sample = {'features': torch.tensor(self.features[idx], dtype=torch.float32)
+                  }
+        return sample
+
+
 if __name__ == "__main__":
     # Load dataset
     # train_dataset = HousePriceDataset('data/training_data.csv')
@@ -109,16 +107,16 @@ def __getitem__(self, idx):
     data = pd.read_csv('data/training_data.csv')
     # 指定要標準化的列和標準化方法
     normalize_columns = {
-    '土地面積': 'z-score',
-    '建物面積': 'min-max'
+    '橫坐標': 'min-max', #z-score
+    '縱坐標': 'min-max'
     }
 
     # 選擇要用作特徵的列
-    selected_features = ['土地面積', '建物面積']
+    selected_features = ['橫坐標', '縱坐標']
     target_column = '單價'
 
     # 創建標準化後的數據集
-    normalized_dataset = HousePriceDataset(data, selected_features, target_column, normalize_columns)
+    normalized_dataset = HousePriceTrainDataset(data, selected_features, target_column, normalize_columns)
 
     # 訪問標準化後的數據集中的樣本
     sample = normalized_dataset[0]  # 這將顯示標準化後的第一個樣本

diff --git a/updated_data_v3.py b/updated_data_v3.py
@@ -0,0 +1,37 @@
+
+import json
+import numpy as np
+
+
+
+# Function to save scaler parameters to a JSON file
+def save_scaler_params(scaler, path):
+    """Write selected fitted attributes of ``scaler`` to ``path`` as JSON.
+
+    The JSON object has the keys ``"mean"``, ``"scale"``, ``"min"`` and
+    ``"max"``, taken from ``scaler.mean_``, ``scaler.scale_``,
+    ``scaler.data_min_`` and ``scaler.data_max_`` respectively. Each value is
+    the attribute converted with ``.tolist()``, or ``None`` when the scaler
+    has no such attribute.
+
+    Args:
+        scaler: Object exposing some of the attributes listed above.
+        path: Destination file path; opened in ``'w'`` mode.
+    """
+    params = {
+        "mean": scaler.mean_.tolist() if hasattr(scaler, 'mean_') else None,
+        "scale": scaler.scale_.tolist() if hasattr(scaler, 'scale_') else None,
+        "min": scaler.data_min_.tolist() if hasattr(scaler, 'data_min_') else None,
+        "max": scaler.data_max_.tolist() if hasattr(scaler, 'data_max_') else None
+    }
+    with open(path, 'w') as file:
+        json.dump(params, file)
+
+# Function to load scaler parameters from a JSON file
+def load_scaler_params(path):
+    """Read the JSON file at ``path`` and return its parsed content.
+
+    The content is returned exactly as ``json.load`` produces it; no scaler
+    object is rebuilt from it here.
+    """
+    with open(path, 'r') as file:
+        params = json.load(file)
+    return params
+
+
+
+class HousePriceTestDataset(Dataset):
+    """Unfinished test dataset: copies the dataframe and normalizes columns.
+
+    NOTE(review): ``Dataset``, ``z_score_normalize`` and ``min_max_normalize``
+    are not imported or defined in the visible part of this file (only
+    ``json`` and ``numpy`` are imported), so this class looks unusable as
+    shown - confirm the intended imports. ``feature_columns`` is accepted but
+    not used, and no ``__len__`` / ``__getitem__`` is defined here.
+    """
+    def __init__(self, dataframe, feature_columns, normalize_columns=None):
+        self.dataframe = dataframe.copy()  # Creating a copy to avoid modifying the original dataframe
+
+        # Applying the specified normalization methods to the specified columns
+        if normalize_columns:
+            for column, method in normalize_columns.items():
+                if method == 'z-score':
+                    self.dataframe = z_score_normalize(self.dataframe, [column])
+                elif method == 'min-max':
+                    self.dataframe = min_max_normalize(self.dataframe, [column])
+