Mind map

Goal

  1. Impute the missing data (multiple imputation with mice)
  2. Fit a regression on each imputed data set and pool the results
library(tidyverse)
library(AER)
library(naniar)
library(mice)

Steps

Step 01. Prepare the data and introduce missing values

# Load the Mroz data, keep only the variables used below, and coerce the
# column types: the listed measurement columns become numeric, while the
# `city` and `inlf` columns become character.
# NOTE(review): the path is absolute and machine-specific; a project-relative
# path would make the script portable. It is left unchanged here.
wages_data <- read_csv("/Users/zero/myrepo/jixingBlogdown/data/Mroz.csv") %>%
  select(
    wage, educ, fatheduc, motheduc, inlf, hours,
    kidslt6, kidsge6, age, huswage,
    mtr, unem, city, exper
  ) %>%
  # across() is the current replacement for the superseded mutate_at().
  mutate(
    across(
      c(
        kidslt6, kidsge6, hours, educ, age, wage, huswage, mtr,
        motheduc, fatheduc, unem, exper
      ),
      as.numeric
    ),
    across(c(city, inlf), as.character)
  )

# Replace every missing wage with 0, then plot the remaining missingness
# pattern of the data set.
wages_data <- wages_data %>%
  mutate(wage = replace_na(wage, 0))
vis_miss(wages_data)

# Generate missing values with mice::ampute() and keep the amputed data set
# (the `amp` element of the result); then plot the new missingness pattern.
# NOTE(review): ampute() is random and no seed is set, so the amputed data
# will differ between runs.
wages_mis <- ampute(wages_data)[["amp"]]
vis_miss(wages_mis)

Step 02. impute data

# Run the multiple imputation in parallel: m = 10 imputations are requested,
# each with 20 iterations.
# NOTE(review): cl.type = "FORK" relies on forked processes, which are not
# available on Windows; no seed is passed, so results vary between runs.
imp_wages <- mice::parlmice(
  data = wages_mis,
  m = 10,
  maxit = 20,
  cl.type = "FORK"
)

# Stack all completed data sets into one long data frame; the `.imp` column
# identifies the imputation each row belongs to.
imp_wages_df <- mice::complete(imp_wages, action = "long")

# Inverse hyperbolic sine transformation, log(x + sqrt(x^2 + 1)).
#
# x: numeric vector.
# Returns a numeric vector of the same length as `x`.
#
# Base R's asinh() computes exactly this function. The hand-written formula
# loses all precision for large negative inputs (x + sqrt(x^2 + 1) rounds to
# 0, giving log(0) = -Inf), whereas asinh() stays finite.
ihs <- function(x) {
  asinh(x)
}
# Add the regression variables to every completed data set: the inverse
# hyperbolic sine of the wage and the square of experience.
# Both columns are computed element-wise; the grouping by `.imp` is kept so
# the result remains a data frame grouped by imputation.
imp_wages_df <- mutate(
  group_by(imp_wages_df, .imp),
  ihs_wage = ihs(wage),
  exper2 = exper^2
)

Step 03. Nest the data by imputation

# Collapse the long data frame to one row per imputation; the `data`
# list-column holds the completed data set for that imputation.
# Note that this reuses the name `imp_wages`, replacing the object returned
# by parlmice().
imp_wages <- nest(group_by(imp_wages_df, .imp))
imp_wages
## # A tibble: 10 x 2
##     .imp data               
##    <int> <list>             
##  1     1 <tibble [753 × 17]>
##  2     2 <tibble [753 × 17]>
##  3     3 <tibble [753 × 17]>
##  4     4 <tibble [753 × 17]>
##  5     5 <tibble [753 × 17]>
##  6     6 <tibble [753 × 17]>
##  7     7 <tibble [753 × 17]>
##  8     8 <tibble [753 × 17]>
##  9     9 <tibble [753 × 17]>
## 10    10 <tibble [753 × 17]>

Step 04. fit

# Fit the same linear model on each completed data set and store the fitted
# models in the `lin_reg` list-column (one lm object per imputation).
imp_wages_reg <- mutate(
  imp_wages,
  lin_reg = map(
    data,
    ~lm(ihs_wage ~ educ + inlf + hours +
          kidslt6 + kidsge6 + age + huswage +
          mtr + unem + city + exper + exper2,
        data = .)
  )
)

Step 05. pool

# Combine the per-imputation model fits into a single pooled result with
# mice::pool().
pool_lin_reg <- imp_wages_reg[["lin_reg"]] %>%
  pool()

Step 06. summary

# Print the pooled coefficient table (estimate, standard error, statistic,
# degrees of freedom, p-value).
pool_lin_reg %>%
  summary()
##                  estimate    std.error   statistic       df      p.value
## (Intercept)  1.012423e+00 3.461251e-01  2.92501944 400.7382 3.557091e-03
## educ         4.539668e-02 8.424241e-03  5.38881542 656.8772 9.735809e-08
## inlf         1.866331e+00 5.583397e-02 33.42644968 231.0175 0.000000e+00
## hours       -6.421963e-05 3.382794e-05 -1.89841948 135.1251 5.805680e-02
## kidslt6     -5.544361e-03 3.795262e-02 -0.14608637 609.2085 8.838958e-01
## kidsge6     -5.237696e-03 1.509400e-02 -0.34700510 301.2112 7.286930e-01
## age          2.382734e-04 2.760765e-03  0.08630701 549.0705 9.312473e-01
## huswage     -2.568072e-02 6.643815e-03 -3.86535757 319.0680 1.213720e-04
## mtr         -2.089116e+00 3.418482e-01 -6.11123791 434.8243 1.650737e-09
## unem         1.802800e-03 5.595457e-03  0.32218990 573.9496 7.474062e-01
## city         1.423946e-02 3.801650e-02  0.37456010 432.9497 7.081025e-01
## exper        1.279662e-02 6.353684e-03  2.01404701 579.9830 4.439128e-02
## exper2      -1.819858e-04 1.987354e-04 -0.91571864 691.7068 3.601336e-01