Skip to content

Commit 12fc94f

Browse files
committed
1 parent eabb1e2 commit 12fc94f

File tree

1 file changed

+116
-0
lines changed

1 file changed

+116
-0
lines changed
Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
install.packages("hdm")
install.packages("xtable")
install.packages("igraph")
install.packages("ggraph")

# install.packages() only installs; attach the packages whose functions are
# called unqualified further down (hdm is always called as hdm::).
library(xtable)
library(igraph)
library(ggplot2) # aes(), theme_void(), ggtitle(), theme(), element_text()
library(ggraph)
## Part 1: Double Lasso and DAGs (20 points)

## Consider the US census data from the year 2015 to analyse the effect of college graduate (clg) status
## and its interaction effects with gender (sex), location (mw, so, we, ne) and both on wage jointly. All
## other variables denote some other socio-economic characteristics, e.g. marital status, occupation, and
## experience.

# Load the data from the URL given for the exercise.
file <- "https://raw.githubusercontent.com/CausalAIBook/MetricsMLNotebooks/main/data/wage2015_subsample_inference.csv"
data <- read.csv(file)

# Explore the structure of the data.
str(data)
head(data)

# Outcome: the lwage column. Covariates: every column except wage and lwage.
y <- data[["lwage"]]
Z <- data[, !(names(data) %in% c("wage", "lwage")), drop = FALSE]
# Log of the wage column computed directly; it is not referenced again in the
# visible part of the script.
y_log <- log(data[["wage"]])

## A. Generate the dataset with all the two-way interactions between variables.
## Make sure that the categorical variables are transformed to dummies properly.
## Also, note that the resulting dataset contains the treatment and its interactions
## with the other variables of interest, so you don’t need to generate them separately. (2 pts)

# Generate the two-way interactions.

# Subtract each column's mean from that column.
#
# x: a matrix-like object accepted by colMeans().
# Returns an object of the same dimensions as x with the column means removed.
center_colmeans <- function(x) {
  col_means <- colMeans(x)
  # Repeat every mean nrow(x) times so the offsets line up column by column.
  x - rep(col_means, each = nrow(x))
}

# Controls: main effects and all pairwise interactions of the listed terms
# (the **2 power), without an intercept, then mean-centered column by column.
controls_formula <- "~ 0 + (sex + mw + so + we + C(occ2) + C(ind2) + exp1 + exp2 + exp3 + exp4)**2"
Zcontrols <- center_colmeans(
  model.matrix(as.formula(controls_formula), data = Z)
)

# Heterogeneity variables: mean-centered sex and region columns, with the
# uncentered clg column appended under the name "clg".
linear_het_formula <- "~ 0 + (sex + mw + so + we)"
Zhet <- as.data.frame(cbind(
  center_colmeans(model.matrix(as.formula(linear_het_formula), data = Z)),
  "clg" = Z[["clg"]]
))

# clg together with its interactions with the centered heterogeneity variables.
nonlin_het_formula <- "~ 0 + clg + clg * (sex + mw + so + we)"
Zinteractions <- model.matrix(as.formula(nonlin_het_formula), data = Zhet)
# Columns of Zinteractions whose name contains "clg".
interaction_cols <- Zinteractions[
  ,
  grepl("clg", colnames(Zinteractions), fixed = TRUE)
]

# Final regressor matrix: the treatment/interaction columns come first,
# followed by the centered controls.
X <- cbind(Zinteractions, Zcontrols)

## B. Use the double lasso technique to find the effect of the treatment and
## its relevant interactions on the wage. To tune the penalization parameter
## in the lasso step, cross-validate it. (4 pts)
# Positions of the clg-related columns. Zinteractions supplies the leading
# columns of X, so positions found in Zinteractions are valid positions in X.
index_clg <- grep("clg", colnames(Zinteractions))
# NOTE(review): the task text asks for a cross-validated penalty; this call
# relies on rlassoEffects' own penalty settings - confirm that is acceptable.
effects_clg <- hdm::rlassoEffects(x = X, y = y, index = index_clg, post = FALSE)
result <- summary(effects_clg)
# digits is an argument of xtable() and type an argument of print(); passing
# them the other way round leaves both silently ignored. Columns 1, 2 and 4
# of the coefficient table are kept.
print(xtable(result$coef[, c(1, 2, 4)], digits = 3), type = "latex")

## C. Report a summary of the estimation of the parameters of interest. (2 pts)

# 95% confidence intervals for the clg effects: first the default (pointwise)
# intervals, then the ones requested with joint = TRUE.
pointwise_ci <- confint(effects_clg, level = 0.95)
joint_ci <- confint(effects_clg, level = 0.95, joint = TRUE)

# Emit both tables as LaTeX.
print(xtable(pointwise_ci), type = "latex")
print(xtable(joint_ci), type = "latex")

## D. A college degree has a positive impact on wages, but its interactions
## with factors such as gender and geographic location are not statistically
## significant. This indicates that the economic benefits of obtaining a degree
## are consistent and do not vary noticeably across the groups analyzed.

## DAG for the smoking exercise: effect of individual smoking behavior on
## forced respiratory volume, with age, sex and height as the other variables.
## NOTE(review): the header that stood here repeated the breast-feeding
## exercise text; confirm the intended exercise label and point value.
# Each row is one directed edge: from (column 1) -> to (column 2).
edges <- matrix(c(
  "Individual smoking behavior", "Forced respiratory volume",
  "Age", "Forced respiratory volume",
  "Age", "Individual smoking behavior",
  "Sex", "Individual smoking behavior",
  "Sex", "Forced respiratory volume",
  "Height", "Forced respiratory volume"
), byrow = TRUE, ncol = 2)

graph <- graph_from_edgelist(edges, directed = TRUE)

ggraph(graph, layout = "fr") +
  # Arrowheads make the edge direction visible; end_cap stops each edge short
  # of the node marker so the head is not hidden underneath it. The alpha is a
  # fixed setting, so it is passed as a parameter instead of inside aes().
  geom_edge_link(
    arrow = grid::arrow(length = grid::unit(3, "mm")),
    end_cap = circle(6, "mm"),
    edge_alpha = 0.7
  ) +
  geom_node_point(color = "skyblue", size = 10) +
  geom_node_text(aes(label = name), color = "black", fontface = "bold", size = 5) +
  theme_void() +
  ggtitle("Directed Acyclic Graph (DAG)") +
  theme(plot.title = element_text(size = 16, face = "bold"))
## B. You are trying to study the effect of breast feeding on the number of
## infections a baby is likely to have. Your dataset contains the following
## variables: (5 pts)

# Each row is one directed edge: from (column 1) -> to (column 2).
edges <- matrix(c(
  "Breast fed", "Number of infections",
  "Marital status", "Family income",
  "Family income", "Breast fed",
  "Family income", "Childcare outside home",
  "Education", "Family income",
  "Number of children", "Breast fed",
  "Childcare outside home", "Number of infections"
), byrow = TRUE, ncol = 2)

graph <- graph_from_edgelist(edges, directed = TRUE)

ggraph(graph, layout = "fr") +
  # Arrowheads make the edge direction visible; end_cap stops each edge short
  # of the node marker so the head is not hidden underneath it. The alpha is a
  # fixed setting, so it is passed as a parameter instead of inside aes().
  geom_edge_link(
    arrow = grid::arrow(length = grid::unit(3, "mm")),
    end_cap = circle(6, "mm"),
    edge_alpha = 0.7
  ) +
  geom_node_point(color = "skyblue", size = 10) +
  geom_node_text(aes(label = name), color = "black", fontface = "bold", size = 5) +
  theme_void() +
  ggtitle("Directed Acyclic Graph (DAG)") +
  theme(plot.title = element_text(size = 16, face = "bold"))

0 commit comments

Comments
 (0)