Thursday, April 29, 2021

Fisher's Z Transform

Fisher.Z <- function(x, inverse = FALSE) {
  ## Purpose: Fisher's Z Transform (application to correlation confidence interval)
  ## Arguments:
  ##   x: a numeric vector of correlations in (-1, 1), or of Z values if inverse = TRUE
  ##   inverse: if TRUE, do the inverse transform.
  ## Return: a transformed (or back-transformed) vector
  ## Author: Feiming Chen
  ## ________________________________________________
  if (inverse) {
    y <- tanh(x)
    ## equivalent to the following:
    ## e <- exp(2 * x)
    ## y <- (e - 1) / (e + 1)
  } else {
    y <- atanh(x)
    ## equivalent to the following:
    ## y <- 1/2 * log((1 + x) / (1 - x))
  }
  y
}

if (F) {                                # Unit Test
  Fisher.Z(0.8)                         # 1.0986
  Fisher.Z(1.2, inverse = TRUE)         # 0.83365
  x <- rbeta(10, 1, 1)                  # simulate correlations
  Fisher.Z(x)
  Fisher.Z(x, inverse = TRUE)
  all.equal(x, Fisher.Z(Fisher.Z(x), inverse = TRUE)) # TRUE
}
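The Purpose comment mentions the usual application: a confidence interval for a correlation. Here is a minimal sketch of that use (not from the original post; the correlation r and sample size n are made up for illustration):

r <- 0.8; n <- 30                       # hypothetical sample correlation and sample size
z <- Fisher.Z(r)                        # transform to the Z scale
se <- 1 / sqrt(n - 3)                   # approximate standard error of Z
ci.z <- z + c(-1, 1) * qnorm(0.975) * se
Fisher.Z(ci.z, inverse = TRUE)          # 95% CI back on the correlation scale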
Calculate bootstrap confidence intervals for a statistic of a numeric vector
Using the library "boot"
my.boot <- function(x, func = mean, R = 10000) {
  ## Purpose: Calculate bootstrap confidence interval for a statistic of a numeric vector
  ## Arguments:
  ##   x: a vector of numerical values
  ##   func: a function to calculate the statistic (default to "mean")
  ##   R: number of replicate bootstrap samples (default to 10000)
  ## Return: the bootstrap confidence intervals (percentile and BCa)
  ## Author: Feiming Chen
  ## ________________________________________________
  library(boot)
  b <- boot(x, function(x, i) func(x[i]), R = R)
  print(boot.ci(b, type = c("perc", "bca")))
}

if (F) {                                # Unit Test
  x <- rnorm(100)
  my.boot(x)
  my.boot(x, func = median)
  ## Level     Percentile            BCa
  ## 95%   (-0.1284,  0.2726 )   (-0.1371,  0.2533 )
}
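As a quick sanity check (my example, not from the post): for the mean of roughly normal data, the bootstrap percentile CI should sit close to the classical t-based interval:

set.seed(1)
x <- rnorm(100)
my.boot(x)                              # bootstrap percentile and BCa CIs for the mean
t.test(x)$conf.int                      # classical t-based CI, for comparison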
Not using the library "boot"

stat.bootstrap.independent <- function(val, func = mean, N.bootstrap = 10000) {
  ## Purpose: Calculate bootstrap-based 95% confidence interval for independent univariate data
  ## Arguments:
  ##   val: a numeric vector to be summarized.
  ##   func: a function for calculating the summary statistic from a numeric vector. Default to "mean".
  ##   N.bootstrap: number of bootstrap samples. Default to 10000.
  ## Return: point estimate, bootstrap percentile 95% CI, and a histogram of the
  ##   bootstrap distribution of the target statistic
  ## Author: Feiming Chen
  ## ________________________________________________
  fname <- deparse(substitute(func))
  ans <- func(val)
  cat("Summary Statistic:", fname, "=", ans, "\n")
  N <- length(val)
  r <- rep(NA, N.bootstrap)
  set.seed(1)
  for (i in 1:N.bootstrap) {
    s <- sample(val, size = N, replace = TRUE)
    r[i] <- func(s)
  }
  hist(r, xlab = fname, main = paste("Bootstrap Distribution:", fname))
  cat("95% Bootstrap Percentile Confidence Interval:\n")
  quantile(r, c(0.025, 0.975))
}

if (F) {                                # Unit Test
  val <- c(3, 3, 3, 4, 4, 5)
  stat.bootstrap.independent(val)       # compare with t-test based CI: (2.8098, 4.5235)
  ## Summary Statistic: mean = 3.6667
  ## 95% Bootstrap Percentile Confidence Interval:
  ##   2.5%  97.5%
  ## 3.1667 4.3333
}
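The explicit for-loop can also be written with replicate(), which avoids the preallocation. A minimal equivalent sketch (same resampling scheme, my variable names):

set.seed(1)
val <- c(3, 3, 3, 4, 4, 5)
r <- replicate(10000, mean(sample(val, replace = TRUE)))
quantile(r, c(0.025, 0.975))            # bootstrap percentile 95% CI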
Monday, April 26, 2021
Simple Survival Analysis of time-to-event data with missing data
my.survival.analysis <- function(dat, study.end.time = NULL) {
  ## Purpose: Survival Analysis of time-to-event data with missing (censored) data.
  ##   Assume two groups.
  ## Arguments:
  ##   dat: time-to-event data that look like (variable names are fixed):
  ##
  ##        time status arm
  ##           9      1   1
  ##          13      1   0
  ##          13      0   1
  ##
  ##        The meaning of each variable is as follows:
  ##        time: the follow-up time (numeric) for right-censored data.
  ##        status: the status indicator. 0 = right censored, 1 = event.
  ##        arm: the group indicator for comparison. 0 = control group, 1 = treatment group.
  ##
  ##   study.end.time: truncation time (the end of the time window in which to compare two groups).
  ##        It needs to be smaller than the default value (the minimum of the largest
  ##        observed time in each of the two groups).
  ##
  ## Return: Survival Data Analysis Summary and Plots
  ## Author: Feiming Chen
  ## ________________________________________________
  library(survival)
  dat <<- dat                           # put data in the global environment so that "cox.zph(f2)" below works
  fmla <- Surv(time, status) ~ arm      # formula for survival time analysis
  f <- survfit(formula = fmla, data = dat)
  print(f)

  ## Simple plot of survival curves
  plot(f, lwd = 2, col = c("red", "blue"), mark.time = TRUE,
       xlab = "Time", ylab = "Survival Probability")
  legend("topright", legend = names(f$strata), bg = "lightyellow",
         col = c("red", "blue"), title = "Groups", lwd = 2)

  cat("\n------------------------------\nLog-Rank test for comparing two survival curves:\n")
  print(survdiff(fmla, dat))

  cat("\n------------------------------\nSurvival Probability at Study End:\n")
  if (is.null(study.end.time)) {
    study.end.time <- max(f$time)
    tau <- NULL
  } else tau <- study.end.time
  ans <- summary(f, times = study.end.time, extend = TRUE, rmean = study.end.time)
  print(ans)

  ## Post-test disease risk = 1 - survival probability at study end.
  ## ARR (Absolute Risk Reduction) - check the direction of the risk reduction manually!
  ARR <- diff(ans$surv)
  cat("ARR (Absolute Risk Reduction) =", round(ARR, 2), "\n")
  NNT <- ceiling(1 / ARR)
  cat("NNT (Number Needed to Treat) =", NNT, "\n")
  OR <- ans$surv[1] / (1 - ans$surv[1]) / (ans$surv[2] / (1 - ans$surv[2]))
  cat("Odds Ratio:", round(OR, 2), "\n")

  cat("\n------------------------------\nUsing Cox Regression Model:\n")
  f2 <- coxph(formula = fmla, data = dat)
  print(f2)
  cat("Hazard Ratio =", round(exp(f2$coefficients), 2), "\n")

  cat("\n------------------------------\nAssessing Proportional Hazards Assumptions:\n")
  print(cox.zph(f2))

  cat("\n------------------------------\nRMST (Restricted Mean Survival Time) Analysis\n")
  library(survRM2)
  r <- rmst2(time = dat$time, status = dat$status, arm = dat$arm, tau = tau)
  print(r)
  ## plot(r, xlab = "Time", ylab = "Survival Probability")

  f
}

if (F) {                                # Unit Test
  d <- data.frame(time = aml$time, status = aml$status,
                  arm = ifelse(aml$x == "Maintained", 1, 0))
  f <- my.survival.analysis(d, study.end.time = 35)

  d <- survRM2::rmst2.sample.data()
  f <- my.survival.analysis(d, study.end.time = 10)

  ## make survival curve with cumulative events/incidences
  ## (cannot do it inside a function)
  f <- survfit(Surv(time, status) ~ x, data = aml)
  library(survminer)
  ggsurvplot(f, data = aml, fun = "event", conf.int = TRUE, pval = TRUE,
             risk.table = "abs_pct", ggtheme = theme_bw(),
             palette = c("red", "blue"),
             risk.table.y.text.col = TRUE, risk.table.y.text = FALSE,
             ncensor.plot = TRUE)
}
Call: survfit(formula = fmla, data = dat)

       n events median 0.95LCL 0.95UCL
arm=0 12     11     23       8      NA
arm=1 11      7     31      18      NA

------------------------------
Log-Rank test for comparing two survival curves:
Call:
survdiff(formula = fmla, data = dat)

       N Observed Expected (O-E)^2/E (O-E)^2/V
arm=0 12       11     7.31      1.86       3.4
arm=1 11        7    10.69      1.27       3.4

 Chisq= 3.4  on 1 degrees of freedom, p= 0.07

------------------------------
Survival Probability at Study End:
Call: survfit(formula = fmla, data = dat)

                arm=0
        time  n.risk n.event survival std.err lower 95% CI upper 95% CI
     35.0000  2.0000  9.0000   0.1944  0.1219       0.0569       0.6642

                arm=1
        time  n.risk n.event survival std.err lower 95% CI upper 95% CI
      35.000   3.000   6.000    0.368   0.163        0.155        0.875

ARR (Absolute Risk Reduction) = 0.17
NNT (Number Needed to Treat) = 6
Odds Ratio: 0.41

------------------------------
Using Cox Regression Model:
Call:
coxph(formula = fmla, data = dat)

       coef exp(coef) se(coef)    z    p
arm   -0.92      0.40     0.51 -1.8 0.07

Likelihood ratio test=3.4 on 1 df, p=0.066
n= 23, number of events= 18
Hazard Ratio = 0.4

------------------------------
Assessing Proportional Hazards Assumptions:
         chisq df    p
arm    0.00788  1 0.93
GLOBAL 0.00788  1 0.93

------------------------------
RMST (Restricted Mean Survival Time) Analysis
The truncation time: tau = 35 was specified.

Restricted Mean Survival Time (RMST) by arm
               Est.    se lower .95 upper .95
RMST (arm=1) 27.057 2.906    21.361    32.753
RMST (arm=0) 20.958 3.456    14.185    27.732

Restricted Mean Time Lost (RMTL) by arm
               Est.    se lower .95 upper .95
RMTL (arm=1)  7.943 2.906     2.247    13.639
RMTL (arm=0) 14.042 3.456     7.268    20.815

Between-group contrast
                       Est. lower .95 upper .95     p
RMST (arm=1)-(arm=0)  6.098    -2.752    14.949 0.177
RMST (arm=1)/(arm=0)  1.291     0.878     1.899 0.194
RMTL (arm=1)/(arm=0)  0.566     0.238     1.342 0.196
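To make the arithmetic behind the printed summary explicit, here is a minimal check using the survival probabilities printed above (the direction of the subtraction is chosen manually, as the function's comment warns):

s0 <- 0.1944; s1 <- 0.368               # S(35) in arm=0 and arm=1 from the output above
ARR <- s1 - s0                          # 0.17, absolute risk reduction
ceiling(1 / ARR)                        # 6, number needed to treat
(s0 / (1 - s0)) / (s1 / (1 - s1))       # 0.41, odds ratio (odds of surviving, arm=0 vs arm=1)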
Friday, April 16, 2021
Search for the sample size that produces a specific (lower) confidence limit
search.n.binary.proportion <- function(p, cl, alpha = 0.05) {
  ## Purpose: Search for the sample size that produces a specific (lower) confidence limit.
  ## Arguments:
  ##   p: reported point estimate
  ##   cl: reported lower confidence limit
  ##   alpha: type I error rate for the two-sided confidence interval. Default to 5%.
  ## Return: a conservative estimate for the 95% confidence interval
  ## Author: Feiming Chen
  ## ________________________________________________
  N <- 50                               # start at sample size of 50
  while (N < 2000) {                    # end at sample size of 2000
    x <- round(N * p)
    r <- prop.test(x, N, conf.level = 1 - alpha, correct = FALSE)
    cl2 <- r$conf.int[1]
    if (cl2 > cl) break                 # stop at the first N whose lower limit exceeds "cl"
    N <- N + 1
  }
  N <- N - 1                            # back off to the largest N whose lower limit is <= "cl"
  x <- floor(N * p)                     # floor() for a conservative event count
  cat("Sample Size =", N, ", X =", x, "\n")
  prop.test(x, N, conf.level = 0.95)    # report a standard 95% CI (with continuity correction)
}

if (F) {                                # Unit Test
  search.n.binary.proportion(0.849, 0.804, alpha = 0.1)
  ## Sample Size = 194 , X = 164
  ##
  ##  1-sample proportions test with continuity correction
  ##
  ## data:  x out of N, null probability 0.5
  ## X-squared = 91.2, df = 1, p-value <2e-16
  ## alternative hypothesis: true p is not equal to 0.5
  ## 95 percent confidence interval:
  ##  0.78497 0.89167
  ## sample estimates:
  ##       p
  ## 0.84536
}
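The same search can be written without the explicit loop. A minimal vectorized sketch (my variable names), which reproduces the unit-test result:

p <- 0.849; cl <- 0.804; alpha <- 0.1
N <- 50:2000
low <- sapply(N, function(n)
  prop.test(round(n * p), n, conf.level = 1 - alpha, correct = FALSE)$conf.int[1])
min(N[low > cl]) - 1                    # 194, as in the unit test above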
Tuesday, April 6, 2021
Calculate sample proportion and its confidence interval based on the Score Test
sample.prop.CI <- function(x, n) {
  ## Purpose: Calculate sample proportion and its confidence interval based on the Score Test
  ## Arguments:
  ##   x: a count in the numerator
  ##   n: total count in the denominator
  ## Return: print sample proportion and its confidence interval
  ## Author: Feiming Chen
  ## ________________________________________________
  r <- prop.test(x, n, correct = FALSE)
  paste0(round(r$estimate, 2), " (", paste0(round(r$conf.int, 2), collapse = ","), ")")
}

if (F) {                                # Unit Test
  sample.prop.CI(10, 100)               # "0.1 (0.06,0.17)"
}
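With correct = FALSE, prop.test() computes the Wilson score interval. A minimal sketch (my variable names) computing that interval directly, to show what the function reports:

x <- 10; n <- 100
p.hat <- x / n
z <- qnorm(0.975)
center <- (p.hat + z^2 / (2 * n)) / (1 + z^2 / n)
half <- z * sqrt(p.hat * (1 - p.hat) / n + z^2 / (4 * n^2)) / (1 + z^2 / n)
round(center + c(-1, 1) * half, 2)      # 0.06 0.17, matching sample.prop.CI(10, 100)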
Assess risk stratification performance
risk.stratification.analysis <- function(x) {
  ## Purpose: Assess risk stratification performance.
  ## Arguments:
  ##   x: a numeric vector of length 2K for the raw confusion matrix.
  ##      First K numbers: count of each risk group with reference positive results
  ##      Second K numbers: count of each risk group with reference negative results
  ## Return:
  ##   an expanded confusion matrix with post-test disease risks and likelihood ratios.
  ## Author: Feiming Chen
  ## ________________________________________________
  d <- as.data.frame(addmargins(matrix(x, ncol = 2)))
  n <- nrow(d) - 1                      # number of risk groups
  g.names <- paste0("Group ", 1:n)      # group names
  dimnames(d) <- list(c(g.names, "Sum"), c("Ref.Pos", "Ref.Neg", "Sum"))
  d$Post.Test.Risk <- with(d, round(Ref.Pos / Sum, 2))
  d$Likelihood.Ratio <- with(d, round(Ref.Pos / Ref.Pos[n+1] / (Ref.Neg / Ref.Neg[n+1]), 2))
  d
}

if (F) {                                # Unit Test
  x <- c(38, 27, 35, 5, 10, 41, 137, 158)
  risk.stratification.analysis(x)
  ##         Ref.Pos Ref.Neg Sum Post.Test.Risk Likelihood.Ratio
  ## Group 1      38      10  48           0.79            12.52
  ## Group 2      27      41  68           0.40             2.17
  ## Group 3      35     137 172           0.20             0.84
  ## Group 4       5     158 163           0.03             0.10
  ## Sum         105     346 451           0.23             1.00
}
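As a consistency check (my example, using the unit-test counts above): multiplying the pre-test odds by a group's likelihood ratio and converting back to a probability recovers that group's post-test risk:

pre.odds <- 105 / 346                   # pre-test odds of disease (Sum row)
LR1 <- (38 / 105) / (10 / 346)          # likelihood ratio for Group 1
post.odds <- pre.odds * LR1
post.odds / (1 + post.odds)             # 0.79, the Group 1 post-test risk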