Friday, August 27, 2021

Preprocess clustered binary data before inference on sample proportion
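
Following Rao & Scott (1992), the function below estimates the overall proportion p = sum(x) / sum(n) from cluster-level numerators x and denominators n, computes a ratio-estimator variance that respects the clustering, and derives a design effect (variance inflation factor). Dividing the raw numerator and denominator by the design effect yields an "effective" incidence count and sample size that can be passed to standard binomial procedures such as prop.test.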

preprocess.clustered.binary.data <- function(x, n)
{
    ## Purpose: Preprocess clustered binary data before inference on sample proportion
    ##          Reference: Rao, J. N. K., & Scott, A. J. (1992). A simple method for
    ##                     the analysis of clustered binary data. Biometrics, 577-585.
    ## Keywords: variance adjustment, ratio estimator, correlated data
    ## Arguments:
    ##   x: a vector for the numerators across clusters.
    ##   n: a vector for the denominators across clusters.
    ## Return: a pre-processed numerator and denominator for the overall sample proportion
    ## Author: Feiming Chen
    ## ________________________________________________

    m <- length(x)
    cat("Number of Clusters =", m, "\n")

    n0 <- sum(n)
    cat("Raw Sample Size =", n0, "\n")

    x0 <- sum(x)
    cat("Raw Incidence Count =", x0, "\n")

    cat("\nEstimates Regarding the Overall Sample Proportion:\n")

    p <- x0 / n0
    cat("  Point Estimate =", round(p, 4), "\n")

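    ## Variance under the naive assumption that all n0 observations are
    ## independent Bernoulli(p) trials (ignores the clustering).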
    v0 <- p * (1 - p) / n0
    cat("  Naive Binomial Variance =", round(v0, 4), "\n")

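    ## Ratio-estimator variance (Rao & Scott 1992): treat each cluster's
    ## residual x_i - n_i * p as a single observation, so between-cluster
    ## variability (and hence any intra-cluster correlation) is captured.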
    r <- x - n * p
    v <- m * sum(r^2) / ((m - 1) * n0^2)
    cat("  Cluster-Adjusted Variance =", round(v, 4), "\n")

    d <- v / v0
    cat("\nDesign Effect (Variance Inflation Factor due to Clustering) =", round(d, 4), "\n")

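    ## Deflate both the denominator and the numerator by the design effect:
    ## the point estimate x1 / n1 is unchanged, but downstream binomial
    ## procedures now see a smaller effective sample size and thus produce
    ## appropriately wider confidence intervals.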
    n1 <- n0 / d
    cat("Effective Sample Size (n) =", round(n1), "\n")

    x1 <- x0 / d
    cat("Effective Incidence Count (x) =", round(x1), "\n")

    ## Return the pre-processed numerator (x) and denominator (n) for
    ## the overall sample proportion (p = x / n). 
    list(x = x1, n = n1)

}
if (FALSE) {                            # Unit Test
    x <- c(1,1,2,0,5,0,1,4,0,1,0,0,0,0,0,4,3,0,1,1,1,2,0,1,0,2,1,0,1,5,0,0,0,0,0,0,2,0,2,0,0,2,1,2,2,0,0,1,0,1)
    n <- c(2,2,3,0,7,1,2,5,1,2,0,0,4,2,0,7,4,1,1,1,4,2,3,1,0,2,1,0,1,5,3,0,1,3,0,0,2,1,2,0,0,5,3,2,2,1,0,2,1,1)

    a <- preprocess.clustered.binary.data(x, n)
    ## Number of Clusters = 50 
    ## Raw Sample Size = 93 
    ## Raw Incidence Count = 50 

    ## Estimates Regarding the Overall Sample Proportion:
    ##   Point Estimate = 0.5376 
    ##   Naive Binomial Variance = 0.0027 
    ##   Cluster-Adjusted Variance = 0.004

    ## Design Effect (Variance Inflation Factor due to Clustering) = 1.4895 
    ## Effective Sample Size (n) = 62 
    ## Effective Incidence Count (x) = 34 

    prop.test(a$x, a$n)
    ## 95 percent confidence interval:
    ##  0.40774 0.66290
    ## sample estimates:
    ##       p 
    ## 0.53763 

    ## Compare to a cluster bootstrap confidence interval (resample whole clusters)
    library(boot)
    r <- boot(data.frame(x = x, n = n),
              function(dat, idx) { d <- dat[idx, ]; sum(d$x) / sum(d$n) },
              R = 10000)
    boot.ci(r)
    ## Bootstrap Percentile CI: ( 0.4118,  0.6569 ), which is a bit tighter than the VIF method. 

    ## Compare to Unadjusted (Wrong) CI: 
    prop.test(sum(x), sum(n))
    ## 95 percent confidence interval:
    ##  0.43159 0.64053, which is too narrow because it ignores the within-cluster correlation.
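
    ## A further cross-check (a sketch; assumes the `survey` package is installed,
    ## and the subject-level expansion below is an added illustration, not part of
    ## the original method): declare each cluster as a primary sampling unit and
    ## let the package compute a design-based CI for the proportion.
    library(survey)
    keep <- n > 0                                  # drop empty clusters
    dat <- data.frame(cluster = rep(seq_along(n)[keep], n[keep]),
                      y = unlist(mapply(function(xi, ni) rep(c(1, 0), c(xi, ni - xi)),
                                        x[keep], n[keep])))
    dat$w <- 1                                     # equal weights
    des <- svydesign(ids = ~cluster, weights = ~w, data = dat)
    svyciprop(~y, des)
    ## The resulting interval should be comparable to the VIF-adjusted and
    ## bootstrap intervals above.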

}