Skip to content

Commit

Permalink
data validation is working fine
Browse files Browse the repository at this point in the history
  • Loading branch information
sunnysavita10 committed Aug 24, 2024
1 parent 8d9ac67 commit 7993e18
Show file tree
Hide file tree
Showing 19 changed files with 44,403 additions and 8 deletions.
11,056 changes: 11,056 additions & 0 deletions Artifacts/08_24_2024_11_49_25/data_ingestion/feature_store/NetworkData.csv

Large diffs are not rendered by default.

2,212 changes: 2,212 additions & 0 deletions Artifacts/08_24_2024_11_49_25/data_ingestion/ingested/test.csv

Large diffs are not rendered by default.

8,845 changes: 8,845 additions & 0 deletions Artifacts/08_24_2024_11_49_25/data_ingestion/ingested/train.csv

Large diffs are not rendered by default.

11,056 changes: 11,056 additions & 0 deletions Artifacts/08_24_2024_11_51_48/data_ingestion/feature_store/NetworkData.csv

Large diffs are not rendered by default.

2,212 changes: 2,212 additions & 0 deletions Artifacts/08_24_2024_11_51_48/data_ingestion/ingested/test.csv

Large diffs are not rendered by default.

8,845 changes: 8,845 additions & 0 deletions Artifacts/08_24_2024_11_51_48/data_ingestion/ingested/train.csv

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
Abnormal_URL:
drift_status: false
p_value: 1.0
DNSRecord:
drift_status: false
p_value: 0.8151359073090507
Domain_registeration_length:
drift_status: false
p_value: 0.9659080032037665
Favicon:
drift_status: false
p_value: 0.9563660841655501
Google_Index:
drift_status: false
p_value: 0.9999996521355587
HTTPS_token:
drift_status: false
p_value: 0.9998299425911609
Iframe:
drift_status: false
p_value: 0.9999989383967115
Links_in_tags:
drift_status: false
p_value: 1.0
Links_pointing_to_page:
drift_status: false
p_value: 0.8923284435665428
Page_Rank:
drift_status: false
p_value: 1.0
Prefix_Suffix:
drift_status: false
p_value: 1.0
Redirect:
drift_status: false
p_value: 0.9999953843756324
Request_URL:
drift_status: false
p_value: 0.4527621454370569
Result:
drift_status: false
p_value: 0.9903208205295254
RightClick:
drift_status: false
p_value: 1.0
SFH:
drift_status: false
p_value: 0.9999836499887566
SSLfinal_State:
drift_status: false
p_value: 0.44566877193471394
Shortining_Service:
drift_status: false
p_value: 0.9999062353866229
Statistical_report:
drift_status: false
p_value: 0.9985597993583677
Submitting_to_email:
drift_status: false
p_value: 0.9739932435377546
URL_Length:
drift_status: false
p_value: 0.9925611176019148
URL_of_Anchor:
drift_status: false
p_value: 0.9687622789382879
age_of_domain:
drift_status: false
p_value: 0.6369650807361981
double_slash_redirecting:
drift_status: false
p_value: 0.9999836499887566
having_At_Symbol:
drift_status: false
p_value: 0.9925611176019148
having_IP_Address:
drift_status: false
p_value: 1.0
having_Sub_Domain:
drift_status: false
p_value: 0.9990438890770315
on_mouseover:
drift_status: false
p_value: 0.9999971010402764
popUpWidnow:
drift_status: false
p_value: 0.999931781258988
port:
drift_status: false
p_value: 0.9528546927737338
web_traffic:
drift_status: false
p_value: 1.0
Binary file not shown.
89 changes: 82 additions & 7 deletions networksecurity/components/data_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,16 +20,38 @@ def __init__(self,data_ingestion_artifact:DataIngestionArtifact,
raise NetworkSecurityException(e,sys)


def validate_number_of_columns(self, dataframe: pd.DataFrame) -> bool:
    """Check that the dataframe has as many columns as the schema lists.

    Only the column *count* is compared against the length of the schema's
    "columns" entry; column names are not inspected here.

    Args:
        dataframe: The dataframe whose column count is checked.

    Returns:
        True when the counts match, False otherwise.

    Raises:
        NetworkSecurityException: Wraps any error raised while reading the
            schema or the dataframe.
    """
    try:
        number_of_columns = len(self._schema_config["columns"])
        logging.info(f"Required number of columns: {number_of_columns}")
        logging.info(f"Data frame has columns: {len(dataframe.columns)}")
        return len(dataframe.columns) == number_of_columns
    except Exception as e:
        raise NetworkSecurityException(e, sys)

def is_numerical_column_exist(self, dataframe: pd.DataFrame) -> bool:
    """Check that every numerical column named in the schema is present.

    Args:
        dataframe: The dataframe whose column labels are checked against the
            schema's "numerical_columns" entry.

    Returns:
        True when no listed numerical column is missing, False otherwise.

    Raises:
        NetworkSecurityException: Wraps any error raised while reading the
            schema or the dataframe.
    """
    try:
        numerical_columns = self._schema_config["numerical_columns"]
        dataframe_columns = dataframe.columns

        missing_numerical_columns = [
            num_column
            for num_column in numerical_columns
            if num_column not in dataframe_columns
        ]

        # The list repr already carries its own brackets.
        logging.info(f"Missing numerical columns: {missing_numerical_columns}")
        return not missing_numerical_columns
    except Exception as e:
        raise NetworkSecurityException(e, sys)

Expand All @@ -43,14 +65,67 @@ def read_data(file_path)->pd.DataFrame:

def detect_dataset_drift(self, base_df, current_df, threshold=0.05) -> bool:
    """Compare each column of two dataframes and write a drift report.

    For every column of ``base_df`` a two-sample Kolmogorov-Smirnov test is
    run against the same-named column of ``current_df``. A column is flagged
    as drifted unless ``threshold <= p-value``. The per-column results are
    written as YAML to the configured drift report path.

    Args:
        base_df: Reference dataframe; its columns drive the comparison.
        current_df: Dataframe compared against the reference.
        threshold: p-value cut-off below which a column counts as drifted.

    Returns:
        True when no column is flagged as drifted, False otherwise.

    Raises:
        NetworkSecurityException: Wraps any error raised during the
            comparison or while writing the report.
    """
    try:
        status = True
        report = {}
        for column in base_df.columns:
            ks_result = ks_2samp(base_df[column], current_df[column])
            # Same comparison as before, so a NaN p-value still counts as drift.
            drift_found = not (threshold <= ks_result.pvalue)
            if drift_found:
                status = False
            report[column] = {
                "p_value": float(ks_result.pvalue),
                "drift_status": drift_found,
            }

        drift_report_file_path = self.data_validation_config.drift_report_file_path

        # os.makedirs("") raises FileNotFoundError, so skip it when the report
        # path has no directory component.
        dir_path = os.path.dirname(drift_report_file_path)
        if dir_path:
            os.makedirs(dir_path, exist_ok=True)
        write_yaml_file(file_path=drift_report_file_path, content=report)
        return status

    except Exception as e:
        raise NetworkSecurityException(e, sys)

def initiate_data_validation(self) -> DataValidationArtifact:
    """Validate the ingested train/test splits and report dataset drift.

    Reads the train and test files named by the data ingestion artifact,
    compares each dataframe's column count against the schema, runs the drift
    check of the test split against the train split, and packages the result.

    Returns:
        A DataValidationArtifact whose ``validation_status`` is True only
        when both column-count checks pass and no drift is detected.

    Raises:
        NetworkSecurityException: Wraps any error raised by the steps above.
    """
    try:
        train_file_path = self.data_ingestion_artifact.trained_file_path
        test_file_path = self.data_ingestion_artifact.test_file_path

        # Read data from the train and test file locations.
        train_dataframe = DataValidation.read_data(train_file_path)
        test_dataframe = DataValidation.read_data(test_file_path)

        # Start empty so the appends below never read an unbound name.
        error_message = ""
        if not self.validate_number_of_columns(dataframe=train_dataframe):
            error_message = f"{error_message}Train dataframe does not contain all columns.\n"
        if not self.validate_number_of_columns(dataframe=test_dataframe):
            error_message = f"{error_message}Test dataframe does not contain all columns.\n"

        columns_valid = not error_message
        if not columns_valid:
            # Logged rather than raised: a column mismatch is recorded in the
            # artifact's status and the drift report is still produced.
            logging.error(error_message)

        # Check data drift of the test split against the train split.
        drift_free = self.detect_dataset_drift(
            base_df=train_dataframe, current_df=test_dataframe
        )

        data_validation_artifact = DataValidationArtifact(
            validation_status=columns_valid and drift_free,
            valid_train_file_path=train_file_path,
            valid_test_file_path=test_file_path,
            invalid_train_file_path=None,
            invalid_test_file_path=None,
            drift_report_file_path=self.data_validation_config.drift_report_file_path,
        )

        logging.info(f"Data validation artifact: {data_validation_artifact}")

        return data_validation_artifact
    except Exception as e:
        raise NetworkSecurityException(e, sys)
Binary file not shown.
Binary file modified networksecurity/entity/__pycache__/artifact_entity.cpython-310.pyc
Binary file not shown.
Binary file modified networksecurity/entity/__pycache__/config_entity.cpython-310.pyc
Binary file not shown.
Binary file modified networksecurity/exception/__pycache__/exception.cpython-310.pyc
Binary file not shown.
Binary file modified networksecurity/logger/__pycache__/logger.cpython-310.pyc
Binary file not shown.
Binary file not shown.
3 changes: 2 additions & 1 deletion networksecurity/pipeline/training_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,8 @@ def run_pipeline(self):
try:
data_ingestion_artifact=self.start_data_ingestion()
print(data_ingestion_artifact)
data_validation=self.start_data_validation()
data_validation=self.start_data_validation(data_ingestion_artifact=data_ingestion_artifact)
print(data_validation)
except Exception as e:
raise NetworkSecurityException(e,sys)

Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.

0 comments on commit 7993e18

Please sign in to comment.