Thursday, April 29, 2021

Fisher's Z Transform

Fisher.Z <- function(x, inverse = FALSE)
{
    ## Purpose: Fisher's Z Transform (application to correlation confidence interval)
    ## Arguments:
    ##   x: a numeric vector of correlations in (-1, 1); for the inverse transform, any real values
    ##   inverse: if TRUE, do the inverse transform. 
    ## Return: a transformed (or back-transformed) vector
    ## Author: Feiming Chen
    ## ________________________________________________

    if (inverse) {
        y <- tanh(x)
        ## equivalent to the following:
        ## e <- exp(2 * x)
        ## y <- (e - 1) / (e + 1)
    } else {
        y <- atanh(x)
        ## equivalent to the following:
        ## y <- 1/2 * log((1 + x) / (1 - x))
    }
    y
}
if (F) {                                # Unit Test
    Fisher.Z(0.8)                       # 1.0986
    Fisher.Z(1.2, inverse = TRUE)       # 0.83365

    x <- rbeta(10, 1, 1)                # simulate correlations
    Fisher.Z(x)
    Fisher.Z(x, inverse = TRUE)
    all.equal(x, Fisher.Z(Fisher.Z(x), inverse = TRUE)) # TRUE
}
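
The application mentioned in the header comment: atanh(r) of a sample correlation is approximately normal with standard error 1/sqrt(n - 3), so a confidence interval built on the Z scale can be back-transformed to the correlation scale. A minimal sketch (the helper "cor.CI" is mine, not part of the original function):

cor.CI <- function(r, n, alpha = 0.05) {
    ## Approximate CI for a correlation via Fisher's Z (assumes bivariate normal data)
    z <- Fisher.Z(r)                    # to the Z scale
    se <- 1 / sqrt(n - 3)               # approximate standard error on the Z scale
    q <- qnorm(1 - alpha / 2)
    Fisher.Z(c(z - q * se, z + q * se), inverse = TRUE)  # back to the correlation scale
}
if (F) {                                # Unit Test
    cor.CI(0.8, n = 30)                 # roughly (0.62, 0.90)
}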

Calculate bootstrap confidence intervals for a statistic of a numeric vector

Using the library "boot"

my.boot <- function(x, func = mean, R = 10000) {
    ## Purpose: Calculate bootstrap confidence interval for a statistic of a numeric vector
    ## Arguments:
    ##   x: a vector of numerical values
    ##   func: a function to calculate the statistic (default to "mean")
    ##   R: number of replicate bootstrap samples (default to 10000)
    ## Return: the bootstrap confidence intervals (percentile and BCa)
    ## Author: Feiming Chen
    ## ________________________________________________

    library(boot)
    b <- boot(x, function(x, i) func(x[i]), R = R)
    print(boot.ci(b, type = c("perc", "bca")))
}
if (F) {
    x <- rnorm(100)
    my.boot(x)
    my.boot(x, func = median)
    ## Level     Percentile            BCa          
    ## 95%   (-0.1284,  0.2726 )   (-0.1371,  0.2533 )  
}
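
The same library handles statistics that need more than one column: the bootstrap indices are then applied to the rows of a data frame. A hedged sketch (the simulated data and the correlation statistic are mine, for illustration):

library(boot)
set.seed(1)
d <- data.frame(x = rnorm(50))
d$y <- d$x + rnorm(50)                  # a correlated pair of variables
b <- boot(d, function(d, i) cor(d$x[i], d$y[i]), R = 10000)  # resample rows
boot.ci(b, type = c("perc", "bca"))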

Not using the library "boot"

stat.bootstrap.independent <- function(val, func = mean, N.bootstrap = 10000)
{
    ## Purpose: Calculate bootstrap-based 95% confidence interval for independent univariate data
    ## Arguments:
    ##   val: a numeric vector to be summarized. 
    ##   func: a function for calculating the summary statistic from a numeric vector.  Default to "mean". 
    ##   N.bootstrap: number of bootstrap samples.  Default to 10000. 
    ## Return: Point estimate, bootstrap percentile 95% CI, histogram for the bootstrap distribution of the target statistic
    ## Author: Feiming Chen
    ## ________________________________________________

    fname <- deparse(substitute(func))
    ans <- func(val)
    cat("Summary Statistic:", fname, "=", ans, "\n")

    N <- length(val)
    r <- rep(NA, N.bootstrap)
    set.seed(1)
    for (i in 1:N.bootstrap) {
        s <- sample(val, size = N, replace = TRUE)
        r[i] <- func(s)
    }

    hist(r, xlab = fname, main = paste("Bootstrap Distribution: ", fname))

    cat("95% Bootstrap Percentile Confidence Interval:\n")
    quantile(r, c(0.025, 0.975))
}
if (F) {                                # Unit Test
    val <- c(3, 3, 3, 4, 4, 5)
    stat.bootstrap.independent(val)          # compare with t-test based CI: (2.8098 4.5235)
    ## Summary Statistic: mean = 3.6667 
    ## 95% Bootstrap Percentile Confidence Interval:
    ##   2.5%  97.5% 
    ## 3.1667 4.3333 
}
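
The t-based interval quoted in the comment above can be reproduced directly:

t.test(c(3, 3, 3, 4, 4, 5))$conf.int    # 2.8098 4.5235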



Monday, April 26, 2021

Simple survival analysis of time-to-event data with right censoring

my.survival.analysis <- function(dat, study.end.time = NULL)
{
    ## Purpose: Survival analysis of right-censored time-to-event data. Assumes two comparison groups. 
    ## Arguments:
    ##   dat: time-to-event data that look like (variable name is fixed):
    ## 
    ##         time status  arm
    ##           9      1    1
    ##          13      1    0
    ##          13      0    1
    ## 
    ##     The meaning of each variable is as follows: 
    ##       time: the follow-up time (numeric) for right censored data.
    ##       status: the status indicator. 0 = right censored, 1 = event. 
    ##       arm: the group indicator for comparison. 0 = control group, 1 = treatment group. 
    ## 
    ##   study.end.time: truncation time (the end of the time window in which to compare two groups).
    ##                   It needs to be smaller than the default value (minimum of the largest observed time in each of the two groups)
    ## 
    ## Return: Survival Data Analysis Summary and Plots
    ## Author: Feiming Chen
    ## ________________________________________________

    library(survival)

    dat <<- dat                         # so that "cox.zph(f2)" below works. 

    fmla <- Surv(time, status) ~ arm    # formula for survival time analysis

    f <- survfit(formula = fmla, data = dat)

    print(f)

    ## Simple plot of survival curves
    plot(f, lwd = 2, col = c("red", "blue"), mark.time = TRUE, xlab = "Time", ylab = "Survival Probability")
    legend("topright", legend = names(f$strata), bg = "lightyellow", col = c("red", "blue"), title = "Groups", lwd = 2)

    cat("\n------------------------------\nLog-Rank test for comparing two survival curves:\n")
    print(survdiff(fmla, dat))

    cat("\n------------------------------\nSurvival Probability at Study End:\n")
    if (is.null(study.end.time)) {      # no truncation time supplied
        study.end.time <- max(f$time)
        tau <- NULL                     # let rmst2 pick its default tau
    } else tau <- study.end.time

    ans <- summary(f, times = study.end.time, extend = TRUE, rmean = study.end.time)
    print(ans)
    ## Post-test disease risk = 1 - survival.probability.at.end

    ## ARR (Absolute Risk Reduction) - Check the direction of the risk reduction manually! 
    ARR <- diff(ans$surv)
    cat("ARR (Absolute Risk Reduction) =", round(ARR, 2), "\n")
    NNT <- ceiling(1 / ARR)
    cat("NNT (Number Needed to Treat) =", NNT, "\n")
    OR <- ans$surv[1] / (1 - ans$surv[1]) / (ans$surv[2] / (1 - ans$surv[2]))  # survival odds, arm 0 vs arm 1
    cat("Odds Ratio:", round(OR, 2), "\n")

    cat("\n------------------------------\nUsing Cox Regression Model:\n")
    f2 <- coxph(formula = fmla, data = dat)
    print(f2)
    cat("Hazard Ratio =", round(exp(f2$coefficients), 2), "\n")

    cat("\n------------------------------\nAssessing Proportional Hazards Assumptions:\n")
    print(cox.zph(f2))

    cat("\n------------------------------\nRMST (Restricted Mean Survival Time) Analysis\n")
    library(survRM2)
    r <- rmst2(time = dat$time, status = dat$status, arm = dat$arm, tau = tau)
    print(r)
    ## plot(r, xlab = "Time", ylab = "Survival Probability")

    f
}

if (F) {                                # Unit Test
    library(survival)                   # for the "aml" data set
    d <- data.frame(time = aml$time, status = aml$status, arm = ifelse(aml$x == "Maintained", 1, 0))
    f <- my.survival.analysis(d, study.end.time = 35)

    d <- survRM2::rmst2.sample.data()
    f <- my.survival.analysis(d, study.end.time = 10)

    ## The following cannot easily be run inside the function above.
    ## Make a survival curve plot with cumulative events/incidences:
    f <- survfit(Surv(time, status) ~ x, data = aml)
    library(survminer)
    ggsurvplot(f, data = aml, fun = "event", conf.int = TRUE, pval = TRUE, risk.table = "abs_pct", ggtheme = theme_bw(), palette = c("red", "blue"), risk.table.y.text.col = TRUE, risk.table.y.text = FALSE, ncensor.plot = TRUE)

}


Sample output from the first unit-test call (aml data, study.end.time = 35):

Call: survfit(formula = fmla, data = dat)

       n events median 0.95LCL 0.95UCL
arm=0 12     11     23       8      NA
arm=1 11      7     31      18      NA

------------------------------
Log-Rank test for comparing two survival curves:
Call:
survdiff(formula = fmla, data = dat)

       N Observed Expected (O-E)^2/E (O-E)^2/V
arm=0 12       11     7.31      1.86       3.4
arm=1 11        7    10.69      1.27       3.4

 Chisq= 3.4  on 1 degrees of freedom, p= 0.07 

------------------------------
Survival Probability at Study End:
Call: survfit(formula = fmla, data = dat)

                arm=0 
        time       n.risk      n.event     survival      std.err lower 95% CI upper 95% CI 
     35.0000       2.0000       9.0000       0.1944       0.1219       0.0569       0.6642 

                arm=1 
        time       n.risk      n.event     survival      std.err lower 95% CI upper 95% CI 
      35.000        3.000        6.000        0.368        0.163        0.155        0.875 

ARR (Absolute Risk Reduction) = 0.17 
NNT (Number Needed to Treat) = 6 
Odds Ratio: 0.41 

------------------------------
Using Cox Regression Model:
Call:
coxph(formula = fmla, data = dat)

     coef exp(coef) se(coef)    z    p
arm -0.92      0.40     0.51 -1.8 0.07

Likelihood ratio test=3.4  on 1 df, p=0.066
n= 23, number of events= 18 
Hazard Ratio = 0.4 

------------------------------
Assessing Proportional Hazards Assumptions:
         chisq df    p
arm    0.00788  1 0.93
GLOBAL 0.00788  1 0.93

------------------------------
RMST (Restricted Mean Survival Time) Analysis

The truncation time: tau = 35  was specified. 

Restricted Mean Survival Time (RMST) by arm 
               Est.    se lower .95 upper .95
RMST (arm=1) 27.057 2.906    21.361    32.753
RMST (arm=0) 20.958 3.456    14.185    27.732


Restricted Mean Time Lost (RMTL) by arm 
               Est.    se lower .95 upper .95
RMTL (arm=1)  7.943 2.906     2.247    13.639
RMTL (arm=0) 14.042 3.456     7.268    20.815


Between-group contrast 
                      Est. lower .95 upper .95     p
RMST (arm=1)-(arm=0) 6.098    -2.752    14.949 0.177
RMST (arm=1)/(arm=0) 1.291     0.878     1.899 0.194
RMTL (arm=1)/(arm=0) 0.566     0.238     1.342 0.196
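
RMST is simply the area under the Kaplan-Meier curve up to the truncation time tau, so the rmst2 estimates above can be cross-checked against the restricted mean that the survival package itself reports. A sketch, assuming f is the survfit object returned by my.survival.analysis:

print(f, rmean = 35)                    # the "rmean" column should match the RMST table above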

Friday, April 16, 2021

Search for the sample size that produces a specific (lower) confidence limit

search.n.binary.proportion <- function(p, cl, alpha = 0.05)
{
    ## Purpose: Search for the sample size that produces a specific (lower) confidence limit.
    ## Arguments:
    ##   p: reported point estimate
    ##   cl: reported lower confidence limit
    ##   alpha: type I error rate for the two-sided confidence interval used in the search. Default to 5%. 
    ## Return: the recovered sample size and a conservative 95% confidence interval at that size
    ## Author: Feiming Chen
    ## ________________________________________________

    N <- 50                             # start the search at sample size 50
    while (N < 2000) {                  # stop the search at sample size 2000
        x <- round(N * p)
        r <- prop.test(x, N, conf.level = 1 - alpha, correct = FALSE)
        cl2 <- r$conf.int[1]
        if (cl2 > cl) break
        N <- N + 1 
    }
    N <- N - 1                          # largest N whose lower limit is still <= cl
    x <- floor(N * p)                   # floor (instead of round) for a conservative count
    cat("Sample Size =", N, ", X =", x, "\n")
    prop.test(x, N, conf.level = 0.95)  # report a standard 95% CI at the recovered size
}
if (F) {                                # Unit Test
    search.n.binary.proportion(0.849, 0.804, alpha = 0.1)

    ## Sample Size = 194 , X = 164 

    ##  1-sample proportions test with continuity correction

    ## data:  x out of N, null probability 0.5
    ## X-squared = 91.2, df = 1, p-value <2e-16
    ## alternative hypothesis: true p is not equal to 0.5
    ## 95 percent confidence interval:
    ##  0.78497 0.89167
    ## sample estimates:
    ##       p 
    ## 0.84536 

}
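
As a sanity check on the recovered sample size, the exact (Clopper-Pearson) interval at the same counts can be compared; it is typically slightly wider than the score interval:

binom.test(164, 194)$conf.int           # exact 95% CI for comparison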

Tuesday, April 6, 2021

Calculate sample proportion and its confidence interval based on the Score Test

sample.prop.CI <- function(x, n)
{
    ## Purpose: Calculate sample proportion and its confidence interval based on the Score Test
    ## Arguments:
    ##   x: a count in the numerator
    ##   n: total count in the denominator
    ## Return: print sample proportion and its confidence interval
    ## Author: Feiming Chen
    ## ________________________________________________

    r <- prop.test(x, n, correct = FALSE)
    paste0(round(r$estimate, 2), " (", paste0(round(r$conf.int, 2), collapse = ","), ")")
}
if (F) {                                # Unit Test
    sample.prop.CI(10, 100)             # "0.1 (0.06,0.17)"
}
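
Without the continuity correction, prop.test returns the Wilson score interval, which can be verified against the closed-form formula (a hand computation, not part of the original function):

## Wilson score interval for x = 10, n = 100 (should match prop.test above)
x <- 10; n <- 100; p <- x / n; z <- qnorm(0.975)
center <- (p + z^2 / (2 * n)) / (1 + z^2 / n)
half <- z * sqrt(p * (1 - p) / n + z^2 / (4 * n^2)) / (1 + z^2 / n)
round(c(center - half, center + half), 2)  # 0.06 0.17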

Assess risk stratification performance

risk.stratification.analysis <- function(x)
{
    ## Purpose: Assess risk stratification performance. 
    ## Arguments:
    ##   x: a numeric vector of length 2K for the raw confusion matrix.
    ##      First K numbers: count of each risk group with reference positive results
    ##      Second K numbers: count of each risk group with reference negative results
    ## Return:
    ##   an expanded confusion matrix with post-test disease risks and likelihood ratios.
    ## Author: Feiming Chen
    ## ________________________________________________

    d <- as.data.frame(addmargins(matrix(x, ncol = 2)))
    n <- nrow(d) - 1                    # number of risk groups
    g.names <- paste0("Group ", 1:n)    # group names
    dimnames(d) <- list(c(g.names, "Sum"), c("Ref.Pos", "Ref.Neg", "Sum"))

    d$Post.Test.Risk <- with(d, round(Ref.Pos / Sum, 2))
    d$Likelihood.Ratio <- with(d, round(Ref.Pos / Ref.Pos[n+1] / (Ref.Neg / Ref.Neg[n+1]), 2))

    d
}
if (F) {                                # Unit Test
    x <- c(38, 27, 35, 5, 10, 41, 137, 158)
    risk.stratification.analysis(x)

    ##         Ref.Pos Ref.Neg Sum Post.Test.Risk Likelihood.Ratio
    ## Group 1      38      10  48           0.79            12.52
    ## Group 2      27      41  68           0.40             2.17
    ## Group 3      35     137 172           0.20             0.84
    ## Group 4       5     158 163           0.03             0.10
    ## Sum         105     346 451           0.23             1.00

}
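
The two added columns are tied together by Bayes' rule: post-test odds = pre-test odds * likelihood ratio. A quick check on Group 1, using the totals from the table above:

pre.odds <- 105 / 346                   # pre-test odds (total Ref.Pos / total Ref.Neg)
post.odds <- pre.odds * 12.52           # times Group 1's likelihood ratio
post.odds / (1 + post.odds)             # ~0.79 = Group 1's post-test risk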