preprocess.clustered.binary.data <- function(x, n) { ## Purpose: Preprocess clustered binary data before inference on sample proportion ## Reference: Rao, J. N. K., & Scott, A. J. (1992). A simple method for ## the analysis of clustered binary data. Biometrics, 577-585. ## Keywords: variance adjustment, ratio estimator, correlated data ## Arguments: ## x: a vector for the numerators across clusters. ## n: a vector for the denominators across clusters. ## Return: a pre-processed numerator and denominator for overall sample proportion ## Author: Feiming Chen ## ________________________________________________ m <- length(x) cat("Number of Clusters =", m, "\n") n0 <- sum(n) cat("Raw Sample Size =", n0, "\n") x0 <- sum(x) cat("Raw Incidence Count =", x0, "\n") cat("\nEstimates Regarding the Overall Sample Proportion:\n") p <- x0 / n0 cat(" Point Estimate =", round(p, 4), "\n") v0 <- p * (1 - p) / n0 cat(" Naive Binomial Variance =", round(v0, 4), "\n") r <- x - n * p v <- m * sum(r^2) / (m - 1) / n0^2 cat(" Correct Variance =", round(v, 4), "\n") d <- v / v0 cat("\nDesign Effect (Variance Inflation Factor due to Clustering) =", round(d, 4), "\n") n1 <- n0 / d cat("Effective Sample Size (n) =", round(n1), "\n") x1 <- x0 / d cat("Effective Incidence Count (x) =", round(x1), "\n") ## Return the pre-processed numerator (x) and denominator (n) for ## the overall sample proportion (p = x / n). list(x = x1, n = n1) } if (F) { # Unit Test x = c(1,1,2,0,5,0,1,4,0,1,0,0,0,0,0,4,3,0,1,1,1,2,0,1,0,2,1,0,1,5,0,0,0,0,0,0,2,0,2,0,0,2,1,2,2,0,0,1,0,1) n = c(2,2,3,0,7,1,2,5,1,2,0,0,4,2,0,7,4,1,1,1,4,2,3,1,0,2,1,0,1,5,3,0,1,3,0,0,2,1,2,0,0,5,3,2,2,1,0,2,1,1) a <- preprocess.clustered.binary.data(x, n) ## Number of Clusters = 50 ## Raw Sample Size = 93 ## Raw Incidence Count = 50 ## Estimates Regarding the Overall Sample Proportion: ## Point Estimate = 0.5376 ## Naive Binomial Variance = 0.0027 ## Correct Variance = 0.004 ## Design Effect (Variance Inflation Factor due to Clustering) = 1.4895 ## Effective Sample Size (n) = 62 ## Effective Incidence Count (x) = 34 prop.test(a$x, a$n) ## 95 percent confidence interval: ## 0.40774 0.66290 ## sample estimates: ## p ## 0.53763 ## Compare to Bootstrap Confidence Intervals library(boot) r <- boot(data.frame(x=x, n=n), function(dat, idx) { d <- dat[idx,]; sum(d$x)/sum(d$n)}, R = 10000) boot.ci(r) ## Bootstrap Percentile CI: ( 0.4118, 0.6569 ), which is a bit tighter than the VIF method. ## Compare to Unadjusted (Wrong) CI: prop.test(sum(x), sum(n)) ## 95 percent confidence interval: ## 0.43159 0.64053, which is too narrow and is wrong. }
Friday, August 27, 2021
Preprocess clustered binary data before inference on sample proportion
Subscribe to:
Post Comments (Atom)
No comments:
Post a Comment