## ---- include = FALSE---------------------------------------------------------
# Document-wide knitr chunk options: collapse = TRUE, and every line of
# printed output is prefixed with "#>".
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")

## ----setup--------------------------------------------------------------------
library(sketching)
# Fix the RNG state up front so that results computed from random draws
# further down the script are reproducible from run to run.
seed <- 220526
set.seed(seed)

## -----------------------------------------------------------------------------
# Pull the pieces of the regression out of the AK data set.
Y <- AK$LWKLYWGE          # dependent variable
intercept <- AK$CNST      # constant column
X_end <- AK$EDUC          # regressor later instrumented in the 2SLS section
X_exg <- AK[, 3:11]       # columns 3-11: the remaining regressors

# Regressor block: X_end goes last, so (after the intercept) its coefficient
# sits in position ncol(X) + 1 of any fit on cbind(intercept, X).
X <- cbind(X_exg, X_end)

# Instrument block: columns 12 up to the second-to-last column of AK,
# combined with the X_exg columns.
Z_inst <- AK[, 12:(ncol(AK) - 1)]
Z <- cbind(X_exg, Z_inst)

# Data used by the OLS section: outcome, intercept, regressors.
fullsample <- cbind(Y, intercept, X)
n <- nrow(fullsample)     # full sample size
d <- ncol(X)              # number of non-constant regressors

## -----------------------------------------------------------------------------
# Data-oblivious choice of the sketch size m for OLS:
#   m = floor(n * S / tau^2),
# where S = (z_{1 - size} + z_{power})^2 is built from standard normal
# quantiles and tau is the value assigned to tau_limit below.
target_size <- 0.05
target_power <- 0.8
tau_limit <- 10
S_constant <- (stats::qnorm(1 - target_size) + stats::qnorm(target_power))^2
m_ols <- floor(n * S_constant / tau_limit^2)
print(m_ols)

## -----------------------------------------------------------------------------
# OLS on the full sample. Column 1 of fullsample is the outcome; the remaining
# columns (intercept followed by X) are the regressors, so the model is fitted
# without lm()'s own intercept.
ys <- fullsample[, 1]
reg <- as.matrix(fullsample[, -1])
fullmodel <- lm(ys ~ reg - 1)

# Row d + 1 of the coefficient table is the last column of X (X_end);
# columns 1 and 2 of the table are the estimate and its standard error.

# use homoskedasticity-only asymptotic variance
ztest <- lmtest::coeftest(fullmodel, df = Inf)
est <- ztest[d + 1, 1]
se <- ztest[d + 1, 2]
print(c(est, se))

# use heteroskedasticity-robust asymptotic variance
# (the formal argument is `vcov.`; spelled out in full rather than relying on
# partial matching of `vcov`. `type` is forwarded to sandwich::vcovHC.)
ztest_hc <- lmtest::coeftest(fullmodel, vcov. = sandwich::vcovHC, df = Inf,
                             type = "HC0")
est_hc <- ztest_hc[d + 1, 1]
se_hc <- ztest_hc[d + 1, 2]
print(c(est_hc, se_hc))

## -----------------------------------------------------------------------------
# OLS on a sketch of size m_ols drawn with method = "bernoulli".
# As in the full-sample fit, column 1 is the outcome and the rest (intercept
# followed by X) are the regressors.
subsample <- sketch(fullsample, m_ols, method = "bernoulli")
ys <- subsample[, 1]
reg <- subsample[, -1]
submodel <- lm(ys ~ reg - 1)

# Row d + 1 of the coefficient table is the last column of X (X_end).

# use homoskedasticity-only asymptotic variance
ztest <- lmtest::coeftest(submodel, df = Inf)
est <- ztest[d + 1, 1]
se <- ztest[d + 1, 2]
print(c(est, se))

# use heteroskedasticity-robust asymptotic variance
# (`vcov.` is the full argument name; `type` is forwarded to vcovHC)
ztest_hc <- lmtest::coeftest(submodel, vcov. = sandwich::vcovHC, df = Inf,
                             type = "HC0")
est_hc <- ztest_hc[d + 1, 1]
se_hc <- ztest_hc[d + 1, 2]
print(c(est_hc, se_hc))

## -----------------------------------------------------------------------------
# OLS on a sketch of size m_ols drawn with method = "unif".
# As in the full-sample fit, column 1 is the outcome and the rest (intercept
# followed by X) are the regressors.
subsample <- sketch(fullsample, m_ols, method = "unif")
ys <- subsample[, 1]
reg <- subsample[, -1]
submodel <- lm(ys ~ reg - 1)

# Row d + 1 of the coefficient table is the last column of X (X_end).

# use homoskedasticity-only asymptotic variance
ztest <- lmtest::coeftest(submodel, df = Inf)
est <- ztest[d + 1, 1]
se <- ztest[d + 1, 2]
print(c(est, se))

# use heteroskedasticity-robust asymptotic variance
# (`vcov.` is the full argument name; `type` is forwarded to vcovHC)
ztest_hc <- lmtest::coeftest(submodel, vcov. = sandwich::vcovHC, df = Inf,
                             type = "HC0")
est_hc <- ztest_hc[d + 1, 1]
se_hc <- ztest_hc[d + 1, 2]
print(c(est_hc, se_hc))

## -----------------------------------------------------------------------------
# OLS on a sketch of size m_ols drawn with method = "countsketch".
# As in the full-sample fit, column 1 is the outcome and the rest (intercept
# followed by X) are the regressors.
subsample <- sketch(fullsample, m_ols, method = "countsketch")
ys <- subsample[, 1]
reg <- subsample[, -1]
submodel <- lm(ys ~ reg - 1)

# Row d + 1 of the coefficient table is the last column of X (X_end).

# use homoskedasticity-only asymptotic variance
ztest <- lmtest::coeftest(submodel, df = Inf)
est <- ztest[d + 1, 1]
se <- ztest[d + 1, 2]
print(c(est, se))

# use heteroskedasticity-robust asymptotic variance
# (`vcov.` is the full argument name; `type` is forwarded to vcovHC)
ztest_hc <- lmtest::coeftest(submodel, vcov. = sandwich::vcovHC, df = Inf,
                             type = "HC0")
est_hc <- ztest_hc[d + 1, 1]
se_hc <- ztest_hc[d + 1, 2]
print(c(est_hc, se_hc))

## -----------------------------------------------------------------------------
# OLS on a sketch of size m_ols drawn with method = "srht".
# As in the full-sample fit, column 1 is the outcome and the rest (intercept
# followed by X) are the regressors.
subsample <- sketch(fullsample, m_ols, method = "srht")
ys <- subsample[, 1]
reg <- subsample[, -1]
submodel <- lm(ys ~ reg - 1)

# Row d + 1 of the coefficient table is the last column of X (X_end).

# use homoskedasticity-only asymptotic variance
ztest <- lmtest::coeftest(submodel, df = Inf)
est <- ztest[d + 1, 1]
se <- ztest[d + 1, 2]
print(c(est, se))

# use heteroskedasticity-robust asymptotic variance
# (`vcov.` is the full argument name; `type` is forwarded to vcovHC)
ztest_hc <- lmtest::coeftest(submodel, vcov. = sandwich::vcovHC, df = Inf,
                             type = "HC0")
est_hc <- ztest_hc[d + 1, 1]
se_hc <- ztest_hc[d + 1, 2]
print(c(est_hc, se_hc))

## -----------------------------------------------------------------------------
# Data for the 2SLS section. Column layout of fullsample:
#   1                 outcome Y
#   2 .. p + 2        intercept followed by the p columns of X
#   p + 3 .. end      intercept followed by the q columns of Z
fullsample <- cbind(Y, intercept, X, intercept, Z)
n <- nrow(fullsample)
p <- ncol(X)
q <- ncol(Z)

# choice of m (data-oblivious sketch size):
#   m = floor(n * S / tau^2), with S = (z_{1 - size} + z_{power})^2
target_size <- 0.05
target_power <- 0.8
tau_limit <- 5
S_constant <- (stats::qnorm(1 - target_size) + stats::qnorm(target_power))^2
m_2sls <- floor(n * S_constant / tau_limit^2)
print(m_2sls)

## -----------------------------------------------------------------------------
# 2SLS on the full sample. Columns 2 .. p + 2 of fullsample hold the
# regressors (intercept followed by X); the remaining columns hold the
# instruments (intercept followed by Z). Both sides already carry their own
# constant column, hence the "- 1" in each part of the formula.
ys <- fullsample[, 1]
reg <- as.matrix(fullsample[, 2:(p + 2)])
inst <- as.matrix(fullsample[, (p + 3):ncol(fullsample)])
fullmodel <- ivreg::ivreg(ys ~ reg - 1 | inst - 1)

# Row p + 1 of the coefficient table is the last column of X (X_end);
# indexed with p, the regressor count defined for this section.

# use homoskedasticity-only asymptotic variance
ztest <- lmtest::coeftest(fullmodel, df = Inf)
est <- ztest[p + 1, 1]
se <- ztest[p + 1, 2]
print(c(est, se))

# use heteroskedasticity-robust asymptotic variance
# (`vcov.` is the full argument name; `type` is forwarded to vcovHC)
ztest_hc <- lmtest::coeftest(fullmodel, vcov. = sandwich::vcovHC, df = Inf,
                             type = "HC0")
est_hc <- ztest_hc[p + 1, 1]
se_hc <- ztest_hc[p + 1, 2]
print(c(est_hc, se_hc))

## -----------------------------------------------------------------------------
# sketching methods for 2SLS
# For each sketching method, draw a sketch of size m_2sls, refit the 2SLS
# model on it, and record the estimate for the last column of X (X_end)
# together with its non-robust and HC0-robust standard errors.
methods <- c("bernoulli", "unif", "countsketch", "srht")
results_2sls <- array(NA, dim = c(length(methods), 3))
for (met in seq_along(methods)) {
  method <- methods[met]
  # generate a sketch
  subsample <- sketch(fullsample, m_2sls, method = method)
  ys <- subsample[, 1]
  # columns 2 .. p + 2: intercept and X; remaining columns: intercept and Z
  reg <- as.matrix(subsample[, 2:(p + 2)])
  inst <- as.matrix(subsample[, (p + 3):ncol(subsample)])
  submodel <- ivreg::ivreg(ys ~ reg - 1 | inst - 1)
  # use homoskedasticity-only asymptotic variance
  ztest <- lmtest::coeftest(submodel, df = Inf)
  est <- ztest[p + 1, 1]
  se <- ztest[p + 1, 2]
  # use heteroskedasticity-robust asymptotic variance
  # (`vcov.` is the full argument name; `type` is forwarded to vcovHC)
  ztest_hc <- lmtest::coeftest(submodel, vcov. = sandwich::vcovHC, df = Inf,
                               type = "HC0")
  est_hc <- ztest_hc[p + 1, 1]
  se_hc <- ztest_hc[p + 1, 2]
  results_2sls[met, ] <- c(est, se, se_hc)
}
rownames(results_2sls) <- methods
colnames(results_2sls) <- c("est", "non-robust se", "robust se")
print(results_2sls)