forked from VincentGranville/Main
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSDV_example.py
69 lines (54 loc) · 2.07 KB
/
SDV_example.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
# https://docs.sdv.dev/sdv/installation
# "!pip install sdv" is IPython shell syntax and a SyntaxError in a plain .py
# file. Running pip through the current interpreter performs the same install
# and works both as a script and inside a notebook cell.
import subprocess
import sys

subprocess.check_call([sys.executable, "-m", "pip", "install", "sdv"])

import sdv
# Show which SDV release is in use.
print(sdv.__version__)
#--------------
import pandas as pd
# Local alternative to the download below:
#real_data = pd.read_csv('diabetes.csv')
url = "https://raw.githubusercontent.com/VincentGranville/Main/main/diabetes.csv"
real_data = pd.read_csv(url)
# Rows with missing data (recorded as 0 in Insulin, Glucose or BMI) must be
# treated separately: they are removed here. A single boolean mask keeps
# exactly the rows the three successive drops would have kept.
real_data = real_data[(real_data[["Insulin", "Glucose", "BMI"]] != 0).all(axis=1)].copy()
# no further data transformation used beyond this point
# index=False: otherwise the row labels are written as an unnamed first
# column, which comes back as "Unnamed: 0" on re-read and would be picked up
# by metadata detection and synthesized as if it were a real feature.
real_data.to_csv('diabetes_clean.csv', index=False)
# print() so the preview also shows when run as a script, not only in a notebook
print(real_data.head())
#-------------
# Reload from disk so the DataFrame matches the file the metadata is detected from.
real_data = pd.read_csv('diabetes_clean.csv')
print(real_data.head())
#-------------
# Metadata API reference:
# https://docs.sdv.dev/sdv/single-table-data/data-preparation/single-table-metadata-api
from sdv.metadata import SingleTableMetadata

# Start from an empty description and let SDV fill it in from the cleaned CSV.
metadata = SingleTableMetadata()
metadata.detect_from_csv('diabetes_clean.csv')

# Snapshot the detected metadata as a plain dict, validate it, then print the
# snapshot for inspection.
python_dict = metadata.to_dict()
metadata.validate()
print(python_dict)
#---------------
from sdv.lite import SingleTablePreset

# 'FAST_ML' preset synthesizer, configured with the metadata detected above.
synthesizer = SingleTablePreset(metadata, name='FAST_ML')
#------------
# Fit on the cleaned real data, then draw 500 synthetic rows.
synthesizer.fit(real_data)
synthetic_data = synthesizer.sample(num_rows=500)

# Preview the first rows and save the full synthetic table.
print(synthetic_data.head(5))
synthetic_data.to_csv(path_or_buf='diabetes_sdv_synth1.csv')
#--------------------
# Second example: same workflow applied to the circle8d data set.
url = "https://raw.githubusercontent.com/VincentGranville/Main/main/circle8d.csv"
real_data = pd.read_csv(url)
# index=False: do not write the row labels as an extra unnamed column, which
# would otherwise be detected as metadata and synthesized as a feature.
real_data.to_csv('circle8d.csv', index=False)
metadata = SingleTableMetadata()
metadata.detect_from_csv(filepath='circle8d.csv')
python_dict = metadata.to_dict()
metadata.validate()
print(python_dict)
#-------------
real_data = pd.read_csv('circle8d.csv')
print(real_data.head())
# Build a fresh synthesizer from the circle8d metadata. The previous one was
# constructed with the diabetes metadata and does not describe this table.
synthesizer = SingleTablePreset(metadata, name='FAST_ML')
synthesizer.fit(data=real_data)
synthetic_data = synthesizer.sample(num_rows=1500)
print(synthetic_data.head())
synthetic_data.to_csv('circle8d_sdv_synth1.csv')
#----------- not used, ignore
# sensitive_column_names = ['guest_email', 'billing_address', 'credit_card_number']
# real_data[sensitive_column_names].head(3)