forked from VincentGranville/Main
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsynthesize_categories.py
50 lines (44 loc) · 1.18 KB
/
synthesize_categories.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import pandas as pd
from scipy.stats import norm
import numpy as np
# Source of the input table: insurance.csv fetched as raw text over HTTPS.
url = "https://raw.githubusercontent.com/VincentGranville/Main/main/insurance.csv"

# The file is parsed as comma-separated values, so no field may itself
# contain a comma (it would be split into extra columns).
data = pd.read_csv(filepath_or_buffer=url)

# Print the first 10 rows as a quick visual check that the load worked.
print(data.head(n=10))
# Tally how often each combination of the categorical columns at positions
# 1, 4 and 5 occurs in `data` (presumably sex, smoker and region in
# insurance.csv -- confirm against the file header).
#
# A group key is the three field values joined by tabs. Groups receive
# consecutive integer IDs in order of first appearance.
groupID = {}     # group key -> integer ID
groupLabel = {}  # integer ID -> group key (inverse of groupID)
groupCount = {}  # group key -> number of observations in that group
ID = 0           # next unused group ID; equals the number of groups at the end
Nobs = len(data)

# Columns are selected by position with .iloc: indexing a label-indexed row
# with a bare integer (obs[1]) relies on a deprecated positional fallback.
# Iterating the three columns in lockstep also avoids building one row
# Series per observation.
for field_a, field_b, field_c in zip(
    data.iloc[:, 1], data.iloc[:, 4], data.iloc[:, 5]
):
    group = field_a + "\t" + field_b + "\t" + field_c
    if group in groupID:
        groupCount[group] += 1
    else:
        groupCount[group] = 1
        groupID[group] = ID
        groupLabel[ID] = group
        ID += 1

Ngroups = len(groupID)
# Draw a group for each synthetic observation so that group frequencies
# follow the empirical proportions groupCount[group] / Nobs.
#
# Method: inverse-CDF sampling. Draw u uniformly, then walk the groups in ID
# order, accumulating their probabilities, until the running total reaches u.
Nobs_synth = 1300  # number of synthetic observations to generate
seed = 453         # fixed seed so that runs are reproducible
np.random.seed(seed)

GroupCountSynth = {}  # group key -> number of synthetic observations drawn
Synth_group = {}      # synthetic observation index -> group key

last_ID = len(groupLabel) - 1  # upper bound for the cumulative scan

for k in range(Nobs_synth):
    u = np.random.uniform(0.0, 1.0)
    # Start at the first group rather than at ID = -1, so that a draw of
    # exactly u == 0.0 still selects a valid group.
    ID = 0
    p = groupCount[groupLabel[ID]] / Nobs
    # Stop at the last group even if rounding leaves the accumulated
    # probability marginally below u; otherwise the scan would run past
    # the end of groupLabel and raise KeyError.
    while p < u and ID < last_ID:
        ID += 1
        p += groupCount[groupLabel[ID]] / Nobs
    group = groupLabel[ID]
    # The first draw of a group counts as 1 (it was previously recorded as
    # 0, which made every synthetic count one too low).
    GroupCountSynth[group] = GroupCountSynth.get(group, 0) + 1
    Synth_group[k] = group  # group assigned to synth observation k
# Report, for every observed group: its key, its count in the real data and
# its count in the synthetic data.
for group in groupCount:
    # A group that was never drawn has no entry in GroupCountSynth;
    # report it as 0 instead of raising KeyError.
    print(group, groupCount[group], GroupCountSynth.get(group, 0))