plot.compare.CI <- function(x, CI, ylab = "Estimates", ...) { ## Purpose: Make a plot to compare point estimates and their confidence intervals ## (presumably derived from different methods) ## Arguments: ## x: a vector of point estimates ## CI: a matrix of CI's. Each column is a CI pair. ## 1st row is lower CI. 2nd row is upper CI ## ...: pass to the plot function. ## Return: a plot. ## Author: Feiming Chen ## ________________________________________________ require(gplots) ## Determine how best to put X-axis Labels -------------------- lx <- length(x) # number of point estimates if (lx > 7) { xlas <- 2 o <- par(mar = c(10, 4, 4, 2)) on.exit(par(o)) } li <- CI[1,] ui <- CI[2,] plotCI(x, li = li, ui = ui, barcol="blue", xaxt="n", xlab = "", ylab = ylab, sfrac = 0.005, xlim = c(0.7, lx + 0.3), ...) s <- 1:lx ss <- names(x) axis(side=1, at = s, labels= ss, tick = F, las = 0) ## put on numerical labels on the CI's. s1 <- s + 0.18 # offset ## median & CI labels text(s1, y = li, labels = round(li, 2)) text(s1, y = ui, labels = round(ui, 2)) text(s1, y = x, labels = round(x, 2)) } if (F) { # Unit Test x <- 1:3 names(x) <- c("A", "B", "C") CI <- matrix(c(0.8, 1.2, 1.7, 2.3, 2.9, 3.1), nrow = 2) plot.compare.CI(x, CI) }
Wednesday, October 20, 2021
Make a plot to compare point estimates and their confidence intervals
Friday, August 27, 2021
Preprocess clustered binary data before inference on sample proportion
preprocess.clustered.binary.data <- function(x, n) { ## Purpose: Preprocess clustered binary data before inference on sample proportion ## Reference: Rao, J. N. K., & Scott, A. J. (1992). A simple method for ## the analysis of clustered binary data. Biometrics, 577-585. ## Keywords: variance adjustment, ratio estimator, correlated data ## Arguments: ## x: a vector for the numerators across clusters. ## n: a vector for the denominators across clusters. ## Return: a pre-processed numerator and denominator for overall sample proportion ## Author: Feiming Chen ## ________________________________________________ m <- length(x) cat("Number of Clusters =", m, "\n") n0 <- sum(n) cat("Raw Sample Size =", n0, "\n") x0 <- sum(x) cat("Raw Incidence Count =", x0, "\n") cat("\nEstimates Regarding the Overall Sample Proportion:\n") p <- x0 / n0 cat(" Point Estimate =", round(p, 4), "\n") v0 <- p * (1 - p) / n0 cat(" Naive Binomial Variance =", round(v0, 4), "\n") r <- x - n * p v <- m * sum(r^2) / (m - 1) / n0^2 cat(" Correct Variance =", round(v, 4), "\n") d <- v / v0 cat("\nDesign Effect (Variance Inflation Factor due to Clustering) =", round(d, 4), "\n") n1 <- n0 / d cat("Effective Sample Size (n) =", round(n1), "\n") x1 <- x0 / d cat("Effective Incidence Count (x) =", round(x1), "\n") ## Return the pre-processed numerator (x) and denominator (n) for ## the overall sample proportion (p = x / n). list(x = x1, n = n1) } if (F) { # Unit Test x = c(1,1,2,0,5,0,1,4,0,1,0,0,0,0,0,4,3,0,1,1,1,2,0,1,0,2,1,0,1,5,0,0,0,0,0,0,2,0,2,0,0,2,1,2,2,0,0,1,0,1) n = c(2,2,3,0,7,1,2,5,1,2,0,0,4,2,0,7,4,1,1,1,4,2,3,1,0,2,1,0,1,5,3,0,1,3,0,0,2,1,2,0,0,5,3,2,2,1,0,2,1,1) a <- preprocess.clustered.binary.data(x, n) ## Number of Clusters = 50 ## Raw Sample Size = 93 ## Raw Incidence Count = 50 ## Estimates Regarding the Overall Sample Proportion: ## Point Estimate = 0.5376 ## Naive Binomial Variance = 0.0027 ## Correct Variance = 0.004 ## Design Effect (Variance Inflation Factor due to Clustering) = 1.4895 ## Effective Sample Size (n) = 62 ## Effective Incidence Count (x) = 34 prop.test(a$x, a$n) ## 95 percent confidence interval: ## 0.40774 0.66290 ## sample estimates: ## p ## 0.53763 ## Compare to Bootstrap Confidence Intervals library(boot) r <- boot(data.frame(x=x, n=n), function(dat, idx) { d <- dat[idx,]; sum(d$x)/sum(d$n)}, R = 10000) boot.ci(r) ## Bootstrap Percentile CI: ( 0.4118, 0.6569 ), which is a bit tighter than the VIF method. ## Compare to Unadjusted (Wrong) CI: prop.test(sum(x), sum(n)) ## 95 percent confidence interval: ## 0.43159 0.64053, which is too narrow and is wrong. }
Wednesday, June 30, 2021
Bootstrap Confidence Interval for Data with Repeated Measures
stat.bootstrap <- function(id, val, func = mean, N.bootstrap = 10000) { ## Purpose: Calculate bootstrap-based 95% confidence interval for data with repeated measures ## Arguments: ## id: uniquely identifies a subject (patient) ## val: a numeric vector to be summarized. It contains repeated measures per subject. ## func: a function for calculating the summary statistic from a numeric vector. Default to "mean". ## N.bootstrap: number of bootstrap samples. Default to 10000. ## Return: Point estimate, bootstrap percentile 95% CI, histogram for the bootstrap distribution of the target statistic ## Author: Feiming Chen ## ________________________________________________ fname <- deparse(substitute(func)) cat("Summary Statistic:", fname, "=", func(val), "\n") id.uniq <- unique(id) N <- length(id.uniq) r <- rep(NA, N.bootstrap) for (i in 1:N.bootstrap) { s <- sample(id.uniq, N, replace = TRUE) v <- c() for (j in s) v <- c(v, val[id == j]) r[i] <- func(v) } hist(r, xlab = fname, main = paste("Bootstrap Distribution: ", fname)) cat("95% Bootstrap Percentile Confidence Interval:\n") quantile(r, c(0.025, 0.975)) } if (F) { # Unit Test id <- c(1, 1, 1, 2, 2, 3) val <- c(3, 3, 3, 4, 4, 5) stat.bootstrap(id, val) ## Summary Statistic: mean = 3.6667 ## 95% Bootstrap Percentile Confidence Interval: ## 2.5% 97.5% ## 3 5 stat.bootstrap(id, val, func = sd) ## Summary Statistic: sd = 0.8165 ## 95% Bootstrap Percentile Confidence Interval: ## 2.5% 97.5% ## 0.0000 1.0954 }
Thursday, May 20, 2021
Calculate the 95% CI of the ratio of two proportions, with application to diagnostic likelihood ratios
ratio.of.two.prop <- function(x, n) { ## Purpose: Calculate the 95% CI of the ratio of two proportions, with application to diagnostic likelihood ratios. ## Arguments: ## x: a vector of two integers (numerator of the two proportions). ## n: a vector of two integers (denominator of the two proportions). ## Return: The point estimate and the 95% confidence interval of the ratio of two proportions, where the ratio is the first proportion divided by the second proportion. ## Author: Feiming Chen ## ________________________________________________ if (x[1] == 0 || x[1] == n[1] || x[2] == 0 || x[2] == n[2]) { x <- x + 0.5 n <- n + 1 } p <- x / n se <- sqrt((1 - p[1]) / (p[1] * n[1]) + (1 - p[2]) / (p[2] * n[2])) pp <- p[1] / p[2] # point estimate LB <- exp(log(pp) - 1.96 * se) UB <- exp(log(pp) + 1.96 * se) c(pp, LB, UB) } if (F) { # Unit Test ratio.of.two.prop(c(431, 30), c(460, 146)) ## [1] 4.5599 3.3116 6.2786 ratio.of.two.prop(c(0, 146), c(460, 146)) ## [1] 0.00108830 0.00006817 0.01737423 }
Tuesday, May 4, 2021
Calculate the correlation within subjects
repeated.measures.correlation <- function(subject, x, y) { ## Purpose: Calculate the correlation within subjects based on the paper: ## Bland, J. Martin, and Douglas G. Altman. "Statistics notes: Calculating ## correlation coefficients with repeated observations: Part 1—correlation ## within subjects." Bmj 310.6977 (1995): 446. ## Arguments: ## subject: ID for subject. Each subject has multiple correlated measurements (x, y) ## x: the variable that correlates with y. ## y: the variable that correlates with x. ## Return: ## Author: Feiming Chen ## ________________________________________________ dat <- data.frame(subject = factor(subject), x = x, y = y) b <- summary(aov(y ~ subject + x, dat)) print(b) s <- b[[1]][[2]][-1] r0 <- sqrt(s[1]/sum(s)) # within-subject correlation cat("\nWithin-Subject Correlation =", round(r0, 4), "\n") r0 } if (F) { # Unit Test ## simulated data r <- 0.7 # True Within-Subject Correlation z <- atanh(r) N <- 30 se <- 1/sqrt(N-3) zz <- rnorm(N, z, se) rr <- tanh(zz) library(MASS) set.seed(1) dat <- data.frame(subject = factor(rep(1:N, each = 100))) dat$x <- dat$y <- NA for (i in 1:N) { a <- mvrnorm(n = 100, mu = c(0, 0), Sigma = matrix(c(1, rr[i], rr[i], 1), 2)) dat[dat$subject == i, c(2, 3)] <- a } ## ANOVA analysis b <- summary(aov(y ~ subject + x, dat)) s <- b[[1]][[2]][-1] r0 <- sqrt(s[1]/sum(s)) # 0.70471 ## using average of correlations d1 <- as.numeric(by(dat, dat$subject, function(d) cor(d$x, d$y))) d2 <- atanh(d1) d3 <- sample.mean.CI(d2) tanh(d3) # mean = 0.72 (0.68,0.75) (with Fisher's Z) sample.mean.CI(d1) # mean = 0.71 (0.67,0.74) (without Fisher's Z) repeated.measures.correlation(dat$subject, dat$x, dat$y) # 0.7047 }
Thursday, April 29, 2021
Fisher's Z Transform
Fisher.Z <- function(x, inverse = FALSE) { ## Purpose: Fisher's Z Transform (application to correlation confidence interval) ## Arguments: ## x: a positive numeric vector ## inverse: if TRUE, do the inverse transform. ## Return: a transformed (or back-transformed) vector ## Author: Feiming Chen ## ________________________________________________ if (inverse) { y <- tanh(x) ## equivalent to the following: ## e <- exp(2 * x) ## y <- (e - 1) / (e + 1) } else { y <- atanh(x) ## equivalent to the following: ## y <- 1/2 * log((1 + x) / (1 - x)) } y } if (F) { # Unit Test Fisher.Z(0.8) # 1.0986 Fisher.Z(1.2, inverse = TRUE) # 0.83365 x <- rbeta(10, 1, 1) # simulate correlations Fisher.Z(x) Fisher.Z(x, inverse = TRUE) all.equal(x, Fisher.Z(Fisher.Z(x), inverse = TRUE)) # TRUE }
Calculate bootstrap confidence intervals for a statistic of a numeric vector
Using the library "boot"
my.boot <- function(x, func = mean, R = 10000) { ## Purpose: Calculate bootstrap confidence interval for a statistic of a numeric vector ## Arguments: ## x: a vector of numerical values ## func: a function to calculate the statistic (default to "mean") ## R: number of replicate bootstrap samples (default to 10000) ## Return: the bootstrap confidence intervals (percentile and BCa) ## Author: Feiming Chen ## ________________________________________________ library(boot) b <- boot(x, function(x, i) func(x[i]), R = R) print(boot.ci(b, type = c("perc", "bca"))) } if (F) { x <- rnorm(100) my.boot(x) my.boot(x, func = median) ## Level Percentile BCa ## 95% (-0.1284, 0.2726 ) (-0.1371, 0.2533 ) }Not using the library "boot"
stat.bootstrap.independent <- function(val, func = mean, N.bootstrap = 10000) { ## Purpose: Calculate bootstrap-based 95% confidence interval for independent univariate data ## Arguments: ## val: a numeric vector to be summarized. ## func: a function for calculating the summary statistic from a numeric vector. Default to "mean". ## N.bootstrap: number of bootstrap samples. Default to 10000. ## Return: Point estimate, bootstrap percentile 95% CI, histogram for the bootstrap distribution of the target statistic ## Author: Feiming Chen ## ________________________________________________ fname <- deparse(substitute(func)) ans <- func(val) cat("Summary Statistic:", fname, "=", ans, "\n") N <- length(val) r <- rep(NA, N.bootstrap) set.seed(1) for (i in 1:N.bootstrap) { s <- sample(val, size = N, replace = TRUE) r[i] <- func(s) } hist(r, xlab = fname, main = paste("Bootstrap Distribution: ", fname)) cat("95% Bootstrap Percentile Confidence Interval:\n") quantile(r, c(0.025, 0.975)) } if (F) { # Unit Test val <- c(3, 3, 3, 4, 4, 5) stat.bootstrap.independent(val) # compare with t-test based CI: (2.8098 4.5235) ## Summary Statistic: mean = 3.6667 ## 95% Bootstrap Percentile Confidence Interval: ## 2.5% 97.5% ## 3.1667 4.3333 }
Monday, April 26, 2021
Simple Survival Analysis of time-to-event data with missing data
my.survival.analysis <- function(dat, study.end.time = NULL) { ## Purpose: Survival Analysis of time-to-event data with missing data. Assume two groups. ## Arguments: ## dat: time-to-event data that look like (variable name is fixed): ## ## time status arm ## 9 1 1 ## 13 1 0 ## 13 0 1 ## ## The meaning of each variable is as follows: ## time: the follow-up time (numeric) for right censored data. ## status: the status indicator. 0 = right censored, 1 = event. ## arm: the group indicator for comparison. 0 = control group, 1 = treatment group. ## ## study.end.time: truncation time (the end of the time window in which to compare two groups). ## It needs to be smaller than the default value (minimum of the largest observed time in each of the two groups) ## ## Return: Survival Data Analysis Summary and Plots ## Author: Feiming Chen ## ________________________________________________ library(survival) dat <<- dat # so that "cox.zph(f2)" below works. fmla <- Surv(time, status) ~ arm # formula for survival time analysis f <- survfit(formula = fmla, data = dat) print(f) ## Simple plot of survival curves plot(f, lwd = 2, col = c("red", "blue"), mark.time = TRUE, xlab = "Time", ylab = "Survival Probability") legend("topright", legend = names(f$strata), bg = "lightyellow", col = c("red", "blue"), title = "Groups", lwd = 2) cat("\n------------------------------\nLog-Rank test for comparing two survival curves:\n") print(survdiff(fmla, dat)) cat("\n------------------------------\nSurvival Probability at Study End:\n") if (missing(study.end.time)) { study.end.time <- max(f$time) tau <- NULL } else tau <- study.end.time ans <- summary(f, times = study.end.time, extend = TRUE, rmean = study.end.time) print(ans) ## Post-test disease risk = 1 - survival.probability.at.end ## ARR (Absolute Risk Reduction) - Check the direction of the risk reduction manually! ARR <- diff(ans$surv) cat("ARR (Absolute Risk Reduction) =", round(ARR, 2), "\n") NNT <- ceiling(1 / ARR) cat("NNT (Number Needed to Treat) =", NNT, "\n") OR <- ans$surv[1] / (1 - ans$surv[1]) / (ans$surv[2] / (1 - ans$surv[2])) cat("Odds Ratio:", round(OR, 2), "\n") cat("\n------------------------------\nUsing Cox Regression Model:\n") f2 <- coxph(formula = fmla, data = dat) print(f2) cat("Hazard Ratio =", round(exp(f2$coefficients), 2), "\n") cat("\n------------------------------\nAssessing Proportional Hazards Assumptions:\n") print(cox.zph(f2)) cat("\n------------------------------\nRMST (Restricted Mean Survival Time) Analysis\n") library(survRM2) r <- rmst2(time = dat$time, status = dat$status, arm = dat$arm, tau = tau) print(r) ## plot(r, xlab = "Time", ylab = "Survival Probability") f } if (F) { # Unit Test d <- data.frame(time = aml$time, status = aml$status, arm = ifelse(aml$x == "Maintained", 1, 0)) f <- my.survival.analysis(d, study.end.time = 35) d <- survRM2::rmst2.sample.data() f <- my.survival.analysis(d, study.end.time = 10) ## cannot do it inside a function. ## make survival curve with cumulative events/incidences f <- survfit(Surv(time, status) ~ x, data = aml) library(survminer) ggsurvplot(f, data = aml, fun = "event", conf.int = TRUE, pval = TRUE, risk.table = "abs_pct", ggtheme = theme_bw(), palette = c("red", "blue"), risk.table.y.text.col = TRUE, risk.table.y.text = FALSE, ncensor.plot = TRUE) }
Call: survfit(formula = fmla, data = dat) n events median 0.95LCL 0.95UCL arm=0 12 11 23 8 NA arm=1 11 7 31 18 NA ------------------------------ Log-Rank test for comparing two survival curves: Call: survdiff(formula = fmla, data = dat) N Observed Expected (O-E)^2/E (O-E)^2/V arm=0 12 11 7.31 1.86 3.4 arm=1 11 7 10.69 1.27 3.4 Chisq= 3.4 on 1 degrees of freedom, p= 0.07 ------------------------------ Survival Probability at Study End: Call: survfit(formula = fmla, data = dat) arm=0 time n.risk n.event survival std.err lower 95% CI upper 95% CI 35.0000 2.0000 9.0000 0.1944 0.1219 0.0569 0.6642 arm=1 time n.risk n.event survival std.err lower 95% CI upper 95% CI 35.000 3.000 6.000 0.368 0.163 0.155 0.875 ARR (Absolute Risk Reduction) = 0.17 NNT (Number Needed to Treat) = 6 Odds Ratio: 0.41 ------------------------------ Using Cox Regression Model: Call: coxph(formula = fmla, data = dat) coef exp(coef) se(coef) z p arm -0.92 0.40 0.51 -1.8 0.07 Likelihood ratio test=3.4 on 1 df, p=0.066 n= 23, number of events= 18 Hazard Ratio = 0.4 ------------------------------ Assessing Proportional Hazards Assumptions: chisq df p arm 0.00788 1 0.93 GLOBAL 0.00788 1 0.93 ------------------------------ RMST (Restricted Mean Survival Time) Analysis The truncation time: tau = 35 was specified. Restricted Mean Survival Time (RMST) by arm Est. se lower .95 upper .95 RMST (arm=1) 27.057 2.906 21.361 32.753 RMST (arm=0) 20.958 3.456 14.185 27.732 Restricted Mean Time Lost (RMTL) by arm Est. se lower .95 upper .95 RMTL (arm=1) 7.943 2.906 2.247 13.639 RMTL (arm=0) 14.042 3.456 7.268 20.815 Between-group contrast Est. lower .95 upper .95 p RMST (arm=1)-(arm=0) 6.098 -2.752 14.949 0.177 RMST (arm=1)/(arm=0) 1.291 0.878 1.899 0.194 RMTL (arm=1)/(arm=0) 0.566 0.238 1.342 0.196
Friday, April 16, 2021
Search for the sample size that produces a specific (lower) confidence limit
search.n.binary.proportion <- function(p, cl, alpha = 0.05) { ## Purpose: Search for the sample size that produces a specific (lower) confidence limit. ## Arguments: ## p: reported point estimate ## cl: reported lowered confidence limit ## alpha: type I error rate for two-sided confidence interval. Default to 5%. ## Return: a conservative estimate for the 95% confidence interval ## Author: Feiming Chen ## ________________________________________________ N <- 50 # start at sample size of 50 while (N < 2000) { # end at sample size of 1000 x <- round(N * p) r <- prop.test(x, N, conf.level = 1 - alpha, correct = FALSE) cl2 <- r$conf.int[1] if (cl2 > cl) break N <- N + 1 } N <- N - 1 x <- floor(N * p) cat("Sample Size =", N, ", X =", x, "\n") prop.test(x, N, conf.level = 0.95) } if (F) { # Unit Test search.n.binary.proportion(0.849, 0.804, alpha = 0.1) ## Sample Size = 194 , X = 164 ## 1-sample proportions test with continuity correction ## data: x out of N, null probability 0.5 ## X-squared = 91.2, df = 1, p-value <2e-16 ## alternative hypothesis: true p is not equal to 0.5 ## 95 percent confidence interval: ## 0.78497 0.89167 ## sample estimates: ## p ## 0.84536 }
Tuesday, April 6, 2021
Calculate sample proportion and its confidence interval based on the Score Test
sample.prop.CI <- function(x, n) { ## Purpose: Calculate sample proportion and its confidence interval based on the Score Test ## Arguments: ## x: a count in the numerator ## n: total count in the denominator ## Return: print sample proportion and its confidence interval ## Author: Feiming Chen ## ________________________________________________ r <- prop.test(x, n, correct = FALSE) paste0(round(r$estimate, 2), " (", paste0(round(r$conf.int, 2), collapse = ","), ")") } if (F) { # Unit Test set.seed(1) sample.prop.CI(10, 100) # "0.1 (0.06,0.17)" }
Assess risk stratification performance
risk.stratification.analysis <- function(x) { ## Purpose: Assess risk stratification performance. ## Arguments: ## x: a numeric vector of length 2K for the raw confusion matrix. ## First K numbers: count of each risk group with reference positive results ## Second K numbers: count of each risk group with reference negative results ## Return: ## an expanded confusion matrix with post-test disease risks and likelihood ratios. ## Author: Feiming Chen ## ________________________________________________ d <- as.data.frame(addmargins(matrix(x, ncol = 2))) n <- nrow(d) - 1 # number of risk groups g.names <- paste0("Group ", 1:n) # group names dimnames(d) <- list(c(g.names, "Sum"), c("Ref.Pos", "Ref.Neg", "Sum")) d$Post.Test.Risk <- with(d, Ref.Pos / Sum, 2) d$Likelihood.Ratio <- with(d, Ref.Pos / Ref.Pos[n+1] / (Ref.Neg / Ref.Neg[n+1])) d } if (F) { # Unit Test x <- c(38, 27, 35, 5, 10, 41, 137, 158) risk.stratification.analysis(x) ## Ref.Pos Ref.Neg Sum Post.Test.Risk Likelihood.Ratio ## Group 1 38 10 48 0.79 12.52 ## Group 2 27 41 68 0.40 2.17 ## Group 3 35 137 172 0.20 0.84 ## Group 4 5 158 163 0.03 0.10 ## Sum 105 346 451 0.23 1.00 }
Friday, March 26, 2021
Calculate sample mean and its confidence interval using t-distribution
sample.mean.CI <- function(x, log.transform = FALSE) { ## Purpose: Calculate sample mean and its confidence interval using t-distribution ## Arguments: ## x: a vector of numeric values ## log.transform: if T, apply log transformation to the data first. ## Return: print sample mean and its confidence interval ## Author: Feiming Chen ## ________________________________________________ if (log.transform) { x <- log(x) } r <- t.test(x) e <- r$estimate c1 <- r$conf.int[1] c2 <- r$conf.int[2] if (log.transform) { e <- exp(e) c1 <- exp(c1) c2 <- exp(c2) } ans <- c(e, c1, c2) cat("Mean = ", convert.to.CI.text(ans), "\n") ans } if (F) { # Unit Test set.seed(1) sample.mean.CI(rnorm(100)) # Mean = 0.11 (-0.07 , 0.29) sample.mean.CI(c(1, 2, 5, 10, 100), log.transform = TRUE) # Mean = 6.31 (0.7 , 57.23) }
Thursday, March 25, 2021
Sample Size for One-Sample Z test
sample.size.one.sample.Z.test <- function(E, S, alpha = 0.025, beta = 0.2) { ## Purpose: Sample Size for One-sample Z test ## H1: mu1 (expected) > mu0 (target) ## Assume type I error of 2.5% (alpha), and power of 80% (1 - beta). ## Arguments: ## E: effect size (difference of expected mean and target mean) ## S: estimated spread (standard deviation) from pilot sample ## alpha: type I error (default to 2.5%) ## beta: type II error (default to 20%) ## Return: sample size ## Author: Feiming Chen ## ________________________________________________ a <- qnorm(1 - alpha) - qnorm(beta) # default = 2.8 ## Note: a minimum sample size of 30 is required for the sample mean to approximate the normal distribution N <- max(30, ceiling((a * S / E)^2)) cat("Sample Size =", N, ", when Effect Size =", E, ", SD =", S, ", Type I Error =", alpha, ", Power = ", 1 - beta, "\n") } if (F) { # Unit Test sample.size.one.sample.Z.test(0.1, 0.2) # 32 for (E in seq(0.05, 0.2, 0.05)) { for (SD in seq(0.2, 0.5, 0.1)) { sample.size.one.sample.Z.test(E, SD) } } ## Sample Size = 126 , when Effect Size = 0.05 , SD = 0.2 , Type I Error = 0.025 , Power = 0.8 ## Sample Size = 283 , when Effect Size = 0.05 , SD = 0.3 , Type I Error = 0.025 , Power = 0.8 ## Sample Size = 503 , when Effect Size = 0.05 , SD = 0.4 , Type I Error = 0.025 , Power = 0.8 ## Sample Size = 785 , when Effect Size = 0.05 , SD = 0.5 , Type I Error = 0.025 , Power = 0.8 ## Sample Size = 32 , when Effect Size = 0.1 , SD = 0.2 , Type I Error = 0.025 , Power = 0.8 ## Sample Size = 71 , when Effect Size = 0.1 , SD = 0.3 , Type I Error = 0.025 , Power = 0.8 ## Sample Size = 126 , when Effect Size = 0.1 , SD = 0.4 , Type I Error = 0.025 , Power = 0.8 ## Sample Size = 197 , when Effect Size = 0.1 , SD = 0.5 , Type I Error = 0.025 , Power = 0.8 ## Sample Size = 30 , when Effect Size = 0.15 , SD = 0.2 , Type I Error = 0.025 , Power = 0.8 ## Sample Size = 32 , when Effect Size = 0.15 , SD = 0.3 , Type I Error = 0.025 , Power = 0.8 ## Sample Size = 56 , when Effect Size = 0.15 , SD = 0.4 , Type I Error = 0.025 , Power = 0.8 ## Sample Size = 88 , when Effect Size = 0.15 , SD = 0.5 , Type I Error = 0.025 , Power = 0.8 ## Sample Size = 30 , when Effect Size = 0.2 , SD = 0.2 , Type I Error = 0.025 , Power = 0.8 ## Sample Size = 30 , when Effect Size = 0.2 , SD = 0.3 , Type I Error = 0.025 , Power = 0.8 ## Sample Size = 32 , when Effect Size = 0.2 , SD = 0.4 , Type I Error = 0.025 , Power = 0.8 ## Sample Size = 50 , when Effect Size = 0.2 , SD = 0.5 , Type I Error = 0.025 , Power = 0.8 }
Wednesday, February 24, 2021
Prevalence Scaling
prevalence.scaling <- function(model.P, model.prevalence = 0.5, target.prevalence = 0.1) { ## Purpose: Convert a probability based on training data prevalence to ## a probability based on target population prevalence. ## Arguments: ## model.P: a vector of model-based probability (risk score) ## model.prevalence: prevalence of the model's training data ## target.prevalence: prevalence of the target population ## Return: ## a vector of scaled probabilities reflecting the prevalence of the target population ## Author: Feiming Chen ## ________________________________________________ ## convert a probability to the odds O <- function(p) p / (1-p) 1 / ( 1 + O(model.prevalence) / O(target.prevalence) / O(model.P) ) } if (F) { # Unit Test set.seed(1) prevalence.scaling(runif(5)) ## [1] 0.038614 0.061784 0.129688 0.523663 0.027304 }
Thursday, February 18, 2021
Confidence Interval of the Difference between Two Correlated Proportions (McNemar Test)
CI.diff.two.correlated.proportions <- function(A, B, C, D, alpha = 0.05) { ## Purpose: CI of the difference between two correlated proportions (McNemar Test) ## Arguments: ## A, B, C, D: the 2x2 incidence count table for the matched pairs. ## A = both positive; ## B = treatment positive, control negative; ## C = treatment negative; control positive; ## D = both negative; ## alpha: type I error. Default to 0.05 (for two-sided CI). ## Return: 95% Confidence Interval of the Difference ## Author: Feiming Chen ## Method: Wald Z Method with Continuity Correction ## Reference: Newcombe (1998c), page 2638. ## ________________________________________________ N <- A + B + C + D # total number of matched pairs Delta <- (B - C) / N sw <- sqrt(((A + D) * (B + C) + 4 * B * C) / N^3) z <- qnorm(1 - alpha / 2) # default: 1.96 corr.term <- 1 / N # continuity correction L <- Delta - z * sw - corr.term U <- Delta + z * sw + corr.term x <- matrix(c(A, B, C, D), 2, 2, byrow = TRUE, dimnames = list(c("Test.POS", "Test.NEG"), c("Control.POS", "Control.NEG"))) cat("Raw Data (x):\n") print(x) cat("\nData Converted to Proportions:\n") print(round(x/N, 2)) cat("\nDifference of Two Paired Proportions =", round(Delta, 3), "\n") p.discordant <- (B+C) / N cat("Proportion of Discordant Pairs =", round(p.discordant, 3), "\n") N.future <- sample.size.for.two.correlated.proportions.test(Delta, p.discordant) cat("\nCurrent Sample Size =", N, "\n") cat("Future Sample Size =", N.future, "\n") print(mcnemar.test(x)) cat("\nConfidence Interval = (", round(L, 4), ",", round(U, 4), ")\n\n") } if (F) { # Unit Test CI.diff.two.correlated.proportions(12, 9, 2, 21) ## Confidence Interval = ( -0.0037 , 0.3219 ) CI.diff.two.correlated.proportions(25, 20, 5, 25) ## Raw Data (x): ## Control.POS Control.NEG ## Test.POS 25 20 ## Test.NEG 5 25 ## Data Converted to Proportions: ## Control.POS Control.NEG ## Test.POS 0.33 0.27 ## Test.NEG 0.07 0.33 ## Difference of Two Paired Proportions = 0.2 ## Proportion of Discordant Pairs = 0.333 ## Current Sample Size = 75 ## Future Sample Size = 63 ## McNemar's Chi-squared test with continuity correction ## data: x ## McNemar's chi-squared = 7.84, df = 1, p-value = 0.00511 ## Confidence Interval = ( 0.0641 , 0.3359 ) }
Sample Size for Two Correlated Proportions Based on McNemar Test
sample.size.for.two.correlated.proportions.test <- function(p.diff = 0.1, p.discordant = 0.2, alpha = 0.025, beta = 0.2) { ## Purpose: Sample Size for Two Correlated Proportions Test based on McNemar Test ## H0: p2 <= p1 ## H1: p2 > p1 ## (Application: tests of sensitivity, specificity with three-way, matched-pairs study) ## Arguments: ## p.diff: Effect size for the difference between two correlated proportions. Default to 10%. ## p.discordant: Proportion of pairs for which the responses differed. Default to 20%. ## alpha: type I error. Default to 0.025 (for one-sided test). ## beta: type II error (1 - power). Default to 0.2 (so power is 80%). ## Return: ## Sample Size ## Author: Feiming Chen ## Reference: Machin, Campbell, Fayers, and Pinol (1997). ## ________________________________________________ p10 <- (p.discordant + p.diff) / 2 # with default, p10 = 0.15 p01 <- (p.discordant - p.diff) / 2 # default: p01 = 0.05 OR <- p10 / p01 # default: OR = 3 OR.plus.one <- OR + 1 # default: 4 OR.minus.one <- OR - 1 # default: 2 N = (qnorm(1 - alpha) * OR.plus.one + qnorm(1 - beta) * sqrt(OR.plus.one^2 - OR.minus.one^2 * p.discordant))^2 / (OR.minus.one^2 * p.discordant) ceiling(N) } if (F) { # Unit Test sample.size.for.two.correlated.proportions.test() # 155 (PASS output: 155 under normal approx.) }
Wednesday, February 17, 2021
Sample Size for One Proportion Test
sample.size.for.one.proportion.test <- function(p0 = 0.8, p1 = 0.9, alpha = 0.025, beta = 0.2) { ## Purpose: Sample Size for One Proportion Test. ## H0: p <= p0 ## H1: p > p0 ## (Application: tests of sensitivity, specificity with performance goals) ## Arguments: ## p0: performance goal (minimally acceptable proportion). Default to 80%. ## p1: expected performance (minimal proportion under the alternative hypothesis). Defaul to 90%. ## alpha: type I error. Default to 0.025 (for one-sided test). ## beta: type II error (1 - power). Default to 0.2 (so power is 80%). ## Return: ## Sample Size ## Author: Feiming Chen ## Reference: Biswas, Bipasa. "Clinical performance evaluation of molecular diagnostic tests." ## The Journal of Molecular Diagnostics 18.6 (2016): 803-812. ## ________________________________________________ N = (qnorm(1 - alpha) * sqrt(p0 * (1 - p0)) + qnorm(1 - beta) * sqrt(p1 * (1 - p1)))^2 / (p1 - p0)^2 ceiling(N) } if (F) { # Unit Test sample.size.for.one.proportion.test() # 108 (PASS output: 107 under exact test; 108 under normal approx.) }
Subscribe to:
Posts (Atom)