## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = FALSE,
  comment = "#>"
)
options(rmarkdown.html_vignette.check_title = FALSE)

## ----eval = FALSE-------------------------------------------------------------
# install.packages("multid")

## ----eval = FALSE-------------------------------------------------------------
# install.packages("devtools")
# devtools::install_github("vjilmari/multid")

## ----setup, warning = FALSE, message = FALSE-----------------------------------
library(rio)
library(multid)
library(dplyr)
library(overlapping)

## ----importdata----------------------------------------------------------------
# create a temporary directory
td <- tempdir()

# create a temporary file
tf <- tempfile(tmpdir = td, fileext = ".zip")

# download the file from the internet into the temporary location
download.file("http://openpsychometrics.org/_rawdata/BIG5.zip", tf)

# list the zip archive
file_names <- unzip(tf, list = TRUE)

# extract files from the zip file
unzip(tf, exdir = td, overwrite = TRUE)

# use when the zip file has only one file
dat <- import(file.path(td, "BIG5/data.csv"))

# delete the files and directories
unlink(td)

## -----------------------------------------------------------------------------
# save the names of the personality items to a vector
per.items <- names(dat)[which(names(dat) == "E1"):which(names(dat) == "O10")]

# recode item response 0 (zero) as not available (NA) on all these items
dat[, per.items] <-
  sapply(dat[, per.items], function(x) {ifelse(x == 0, NA, x)})

# check that the numerical range makes sense
range(dat[, per.items], na.rm = TRUE)

# calculate mean scores for the Big Five (reverse-coded items subtracted from 6)
# see the codebook at https://openpsychometrics.org for item content and keys
dat$N <- rowMeans(
  cbind(dat[, c("N1", "N3", "N5", "N6", "N7", "N8", "N9", "N10")],
        6 - dat[, c("N2", "N4")]), na.rm = TRUE)

dat$E <- rowMeans(
  cbind(dat[, c("E1", "E3", "E5", "E7", "E9")],
        6 - dat[, c("E2", "E4", "E6", "E8", "E10")]), na.rm = TRUE)

dat$O <- rowMeans(
  cbind(dat[, c("O1", "O3", "O5", "O7", "O8", "O9", "O10")],
        6 - dat[, c("O2", "O4", "O6")]), na.rm = TRUE)

dat$A <- rowMeans(
  cbind(dat[, c("A2", "A4", "A6", "A8", "A9", "A10")],
        6 - dat[, c("A1", "A3", "A5", "A7")]), na.rm = TRUE)

dat$C <- rowMeans(
  cbind(dat[, c("C1", "C3", "C5", "C7", "C9", "C10")],
        6 - dat[, c("C2", "C4", "C6", "C8")]), na.rm = TRUE)

## -----------------------------------------------------------------------------
US.dat <-
  na.omit(dat[dat$country == "US" &
                (dat$gender == 1 | dat$gender == 2),
              c("gender", per.items, "N", "E", "O", "A", "C")])

# code a categorical gender variable
US.dat$gender.cat <-
  ifelse(US.dat$gender == 1, "Male", "Female")

## -----------------------------------------------------------------------------
d.N <- d_pooled_sd(data = US.dat, var = "N",
                   group.var = "gender.cat", group.values = c("Male", "Female"),
                   infer = TRUE)

d.E <- d_pooled_sd(data = US.dat, var = "E",
                   group.var = "gender.cat", group.values = c("Male", "Female"),
                   infer = TRUE)

d.O <- d_pooled_sd(data = US.dat, var = "O",
                   group.var = "gender.cat", group.values = c("Male", "Female"),
                   infer = TRUE)

d.A <- d_pooled_sd(data = US.dat, var = "A",
                   group.var = "gender.cat", group.values = c("Male", "Female"),
                   infer = TRUE)

d.C <- d_pooled_sd(data = US.dat, var = "C",
                   group.var = "gender.cat", group.values = c("Male", "Female"),
                   infer = TRUE)

# compile into a table
uni.d <- rbind(d.N, d.E, d.O, d.A, d.C)
rownames(uni.d) <- c("N", "E", "O", "A", "C")
round(uni.d, 2)
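## -----------------------------------------------------------------------------
# As a sanity check, Cohen's d for a single trait can also be computed by hand:
# the Male-Female mean difference divided by the pooled standard deviation.
# This is a minimal sketch (the helper objects m.N, f.N, and pooled.sd are
# introduced only for illustration); assuming d_pooled_sd standardizes with the
# pooled SD, the value should closely reproduce uni.d["N", "D"] above.
m.N <- US.dat[US.dat$gender.cat == "Male", "N"]
f.N <- US.dat[US.dat$gender.cat == "Female", "N"]
pooled.sd <-
  sqrt(((length(m.N) - 1) * var(m.N) + (length(f.N) - 1) * var(f.N)) /
         (length(m.N) + length(f.N) - 2))
(mean(m.N) - mean(f.N)) / pooled.sd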
## -----------------------------------------------------------------------------
# let's use the previously obtained univariate d-values
d_vector <- uni.d[, "D"]

# calculate intercorrelations
cor_mat <- cor(US.dat[, c("N", "E", "O", "A", "C")])

# test that the ordering of these variables matches
names(d_vector) == colnames(cor_mat)

# print the correlation matrix
round(cor_mat, 2)

# calculate D
D_maha <- sqrt(t(d_vector) %*% solve(cor_mat) %*% d_vector)
D_maha

## -----------------------------------------------------------------------------
# set seed number for reproducibility
set.seed(37127)

# decide the size of the regularization (training data) fold
# here, half of the smaller of the male and female groups is used
size <- round(min(table(US.dat$gender.cat)) / 2)

# run D_regularized
D_out <-
  D_regularized(data = US.dat,
                mv.vars = c("N", "E", "O", "A", "C"),
                group.var = "gender.cat",
                group.values = c("Male", "Female"),
                out = TRUE,
                size = size,
                pcc = TRUE,
                auc = TRUE,
                pred.prob = TRUE)

# print the output
round(D_out$D, 2)

## -----------------------------------------------------------------------------
round(100 * D_out$P.table, 1)

## -----------------------------------------------------------------------------
coefficients(D_out$cv.mod, s = "lambda.min")
coefficients(D_out$cv.mod, s = "lambda.1se")
plot(D_out$cv.mod)

## ----fig.height=4, fig.width=8------------------------------------------------
# obtain the D-estimate
D <- unname(D_out$D[, "D"])

# overlap relative to a single distribution (OVL)
OVL <- 2 * pnorm((-D / 2))

# proportion of overlap relative to the joint distribution (OVL2)
OVL2 <- OVL / (2 - OVL)

# non-parametric overlap with the overlapping package
np.overlap <-
  overlap(x = list(D_out$pred.dat[
    D_out$pred.dat$group == "Male", "pred"],
    D_out$pred.dat[
      D_out$pred.dat$group == "Female", "pred"]),
    plot = TRUE)

# this corresponds to the proportion of overlap relative to the joint distribution (OVL2)
np.OVL2 <- unname(np.overlap$OV)

# from which the proportion of overlap relative to a single distribution (OVL) is approximated
np.OVL <- (2 * np.OVL2) / (1 + np.OVL2)

# comparison of the different overlap estimates
round(cbind(OVL, np.OVL, OVL2, np.OVL2), 2)
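## -----------------------------------------------------------------------------
# A minimal sketch of where the parametric formula above comes from: assuming
# the predicted values are approximately normal with equal variance in both
# groups and a standardized mean difference of D, the overlapping coefficient
# of N(0, 1) and N(D, 1) obtained by numerical integration should match
# 2 * pnorm(-D / 2). The object ovl_check is introduced only for this check.
ovl_check <-
  integrate(function(x) pmin(dnorm(x, mean = 0), dnorm(x, mean = D)),
            lower = -10, upper = D + 10)$value
round(cbind(ovl_check, OVL), 2)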
## -----------------------------------------------------------------------------
# number of redraws
reps <- 10

# placeholder for the D-estimates
D_out_boot <- rep(NA, reps)

# repeat the procedure to obtain a distribution
t1 <- Sys.time()

for (i in 1:reps) {
  # draw a bootstrap sample (with replacement) of the same size as the original data
  boot.dat <- sample_n(US.dat, size = nrow(US.dat), replace = TRUE)

  # run D_regularized for each sample
  D_out_boot[i] <-
    D_regularized(data = boot.dat,
                  mv.vars = c("N", "E", "O", "A", "C"),
                  group.var = "gender.cat",
                  group.values = c("Male", "Female"),
                  out = TRUE,
                  size = size)$D[, "D"]
}

t2 <- Sys.time()
t2 - t1

# print the 95% bootstrap percentile interval
round(quantile(D_out_boot, c(.025, .975)), 2)

## -----------------------------------------------------------------------------
# placeholder vector for the univariate Cohen's d values of the items
d_vector_items <- rep(NA, length(per.items))

# loop through the items and obtain d for each item
for (i in 1:length(per.items)) {
  d_vector_items[i] <-
    d_pooled_sd(data = US.dat,
                var = per.items[i],
                group.var = "gender.cat",
                group.values = c("Male", "Female"))[, "D"]
}

# calculate item intercorrelations
cor_mat_items <- cor(US.dat[, per.items])

# calculate D
D_maha_items <-
  sqrt(t(d_vector_items) %*% solve(cor_mat_items) %*% d_vector_items)
D_maha_items

## -----------------------------------------------------------------------------
D_items_out <-
  D_regularized(data = US.dat,
                mv.vars = per.items,
                group.var = "gender.cat",
                group.values = c("Male", "Female"),
                out = TRUE,
                size = size,
                pcc = TRUE,
                auc = TRUE,
                pred.prob = TRUE)

# item-based D
round(D_items_out$D, 2)

# Big Five domain-based D
round(D_out$D, 2)

## -----------------------------------------------------------------------------
round(100 * D_items_out$P.table, 1)
round(100 * D_out$P.table, 1)

## ----fig.height=4, fig.width=8------------------------------------------------
# obtain the D-estimate
D_items <- unname(D_items_out$D[, "D"])

# overlap relative to a single distribution (OVL)
OVL_items <- 2 * pnorm((-D_items / 2))

# proportion of overlap relative to the joint distribution (OVL2)
OVL2_items <- OVL_items / (2 - OVL_items)

# non-parametric overlap
np.overlap_items <-
  overlap(x = list(D_items_out$pred.dat[
    D_items_out$pred.dat$group == "Male", "pred"],
    D_items_out$pred.dat[
      D_items_out$pred.dat$group == "Female", "pred"]),
    plot = TRUE)

# this corresponds to the proportion of overlap relative to the joint distribution (OVL2)
np.OVL2_items <- unname(np.overlap_items$OV)

# from which the proportion of overlap relative to a single distribution (OVL) is approximated
np.OVL_items <- (2 * np.OVL2_items) / (1 + np.OVL2_items)

# compare the different overlap estimates
round(cbind(OVL_items, np.OVL_items,
            OVL2_items, np.OVL2_items), 2)

# print the Big Five domain-based overlaps for comparison
round(cbind(OVL, np.OVL, OVL2, np.OVL2), 2)

## -----------------------------------------------------------------------------
sessionInfo()