Wednesday, October 20, 2021

Make a plot to compare point estimates and their confidence intervals

plot.compare.CI <- function(x, CI, ylab = "Estimates", ...)
{
    ## Purpose: Make a plot to compare point estimates and their confidence intervals
    ##          (presumably derived from different methods)
    ## Arguments:
    ##   x: a vector of point estimates
    ##   CI: a matrix of CI's. Each column is a CI pair.
    ##       1st row is lower CI. 2nd row is upper CI
    ##   ...: additional arguments passed to the plotCI function. 
    ## Return: a plot. 
    ## Author: Feiming Chen
    ## ________________________________________________

    require(gplots)

    ## Determine how best to place the X-axis labels --------------------
    lx <- length(x)                     # number of point estimates

    if (lx > 7) {                       # many labels: rotate them and widen the bottom margin
        xlas <- 2
        o <- par(mar = c(10, 4, 4, 2))
        on.exit(par(o))
    } else xlas <- 0

    li <- CI[1,]
    ui <- CI[2,]
    plotCI(x, li = li, ui = ui, barcol="blue", xaxt="n", xlab = "", ylab = ylab, sfrac = 0.005, xlim = c(0.7, lx + 0.3), ...)

    s <- 1:lx
    ss <- names(x)
    axis(side = 1, at = s, labels = ss, tick = FALSE, las = xlas)

    ## add numerical labels next to the point estimates and the CI limits
    s1 <- s + 0.18                      # horizontal offset for the labels
    text(s1, y = li, labels = round(li, 2))
    text(s1, y = ui, labels = round(ui, 2))
    text(s1, y = x,  labels = round(x, 2))
}
if (F) {                                # Unit Test
    x <- 1:3
    names(x) <- c("A", "B", "C")
    CI <- matrix(c(0.8, 1.2, 1.7, 2.3, 2.9, 3.1), nrow = 2)
    plot.compare.CI(x, CI)
}
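
A further sketch (hypothetical, not part of the original unit test): the inputs can come from different CI methods applied to the same data, e.g. the Wilson (score) and exact binomial intervals for one proportion.

if (F) {
    r1 <- prop.test(40, 100, correct = FALSE)   # Wilson (score) interval
    r2 <- binom.test(40, 100)                   # Clopper-Pearson (exact) interval
    x <- c(Wilson = unname(r1$estimate), Exact = unname(r2$estimate))
    CI <- cbind(r1$conf.int, r2$conf.int)       # row 1 = lower limits, row 2 = upper limits
    plot.compare.CI(x, CI)
}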

Friday, August 27, 2021

Preprocess clustered binary data before inference on sample proportion

preprocess.clustered.binary.data <- function(x, n)
{
    ## Purpose: Preprocess clustered binary data before inference on sample proportion
    ##          Reference: Rao, J. N. K., & Scott, A. J. (1992). A simple method for
    ##                     the analysis of clustered binary data. Biometrics, 577-585.
    ## Keywords: variance adjustment, ratio estimator, correlated data
    ## Arguments:
    ##   x: a vector for the numerators across clusters.
    ##   n: a vector for the denominators across clusters.
    ## Return: a pre-processed numerator and denominator for overall sample proportion
    ## Author: Feiming Chen
    ## ________________________________________________

    m <- length(x)
    cat("Number of Clusters =", m, "\n")

    n0 <- sum(n)
    cat("Raw Sample Size =", n0, "\n")

    x0 <- sum(x)
    cat("Raw Incidence Count =", x0, "\n")

    cat("\nEstimates Regarding the Overall Sample Proportion:\n")

    p <- x0 / n0
    cat("  Point Estimate =", round(p, 4), "\n")

    v0 <- p * (1 - p) / n0
    cat("  Naive Binomial Variance =", round(v0, 4), "\n")

    r <- x - n * p
    v <- m * sum(r^2) / (m - 1) / n0^2
    cat("  Correct Variance  =", round(v, 4), "\n")

    d <- v / v0
    cat("\nDesign Effect (Variance Inflation Factor due to Clustering) =", round(d, 4), "\n")

    n1 <- n0 / d
    cat("Effective Sample Size (n) =", round(n1), "\n")

    x1 <- x0 / d
    cat("Effective Incidence Count (x) =", round(x1), "\n")

    ## Return the pre-processed numerator (x) and denominator (n) for
    ## the overall sample proportion (p = x / n). 
    list(x = x1, n = n1)

}
if (F) {                                # Unit Test
    x = c(1,1,2,0,5,0,1,4,0,1,0,0,0,0,0,4,3,0,1,1,1,2,0,1,0,2,1,0,1,5,0,0,0,0,0,0,2,0,2,0,0,2,1,2,2,0,0,1,0,1) 
    n = c(2,2,3,0,7,1,2,5,1,2,0,0,4,2,0,7,4,1,1,1,4,2,3,1,0,2,1,0,1,5,3,0,1,3,0,0,2,1,2,0,0,5,3,2,2,1,0,2,1,1) 

    a <- preprocess.clustered.binary.data(x, n)
    ## Number of Clusters = 50 
    ## Raw Sample Size = 93 
    ## Raw Incidence Count = 50 

    ## Estimates Regarding the Overall Sample Proportion:
    ##   Point Estimate = 0.5376 
    ##   Naive Binomial Variance = 0.0027 
    ##   Correct Variance  = 0.004 

    ## Design Effect (Variance Inflation Factor due to Clustering) = 1.4895 
    ## Effective Sample Size (n) = 62 
    ## Effective Incidence Count (x) = 34 

    prop.test(a$x, a$n)
    ## 95 percent confidence interval:
    ##  0.40774 0.66290
    ## sample estimates:
    ##       p 
    ## 0.53763 

    ## Compare to Bootstrap Confidence Intervals
    library(boot)
    r <- boot(data.frame(x=x, n=n), function(dat, idx) { d <- dat[idx,]; sum(d$x)/sum(d$n)}, R = 10000)
    boot.ci(r)
    ## Bootstrap Percentile CI: ( 0.4118,  0.6569 ), which is a bit tighter than the VIF method. 

    ## Compare to Unadjusted (Wrong) CI: 
    prop.test(sum(x), sum(n))
    ## 95 percent confidence interval:
    ##  0.43159 0.64053, which is too narrow and is wrong. 

}
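
As a rough cross-check (a sketch, continuing with x and n from the unit test above): a Wald interval built directly from the Rao-Scott corrected variance should land close to the prop.test() interval on the effective counts.

if (F) {
    p <- sum(x) / sum(n)                              # overall sample proportion
    m <- length(x)                                    # number of clusters
    v <- m * sum((x - n * p)^2) / (m - 1) / sum(n)^2  # Rao-Scott corrected variance
    p + c(-1, 1) * qnorm(0.975) * sqrt(v)             # roughly (0.41, 0.66)
}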

Wednesday, June 30, 2021

Bootstrap Confidence Interval for Data with Repeated Measures

stat.bootstrap <- function(id, val, func = mean, N.bootstrap = 10000)
{
    ## Purpose: Calculate bootstrap-based 95% confidence interval for data with repeated measures
    ## Arguments:
    ##   id: uniquely identifies a subject (patient)
    ##   val: a numeric vector to be summarized.  It contains repeated measures per subject.
    ##   func: a function for calculating the summary statistic from a numeric vector.  Default to "mean". 
    ##   N.bootstrap: number of bootstrap samples.  Default to 10000. 
    ## Return: Point estimate, bootstrap percentile 95% CI, histogram for the bootstrap distribution of the target statistic
    ## Author: Feiming Chen
    ## ________________________________________________

    fname <- deparse(substitute(func))
    cat("Summary Statistic:", fname, "=", func(val), "\n")

    id.uniq <- unique(id)
    N <- length(id.uniq)

    r <- rep(NA, N.bootstrap)
    for (i in 1:N.bootstrap) {
        s <- sample(id.uniq, N, replace = TRUE)
        v <- c()
        for (j in s) v <- c(v, val[id == j])
        r[i] <- func(v)
    }

    hist(r, xlab = fname, main = paste("Bootstrap Distribution: ", fname))
    cat("95% Bootstrap Percentile Confidence Interval:\n")
    quantile(r, c(0.025, 0.975))
}
if (F) {                                # Unit Test
    id <- c(1, 1, 1, 2, 2, 3)
    val <- c(3, 3, 3, 4, 4, 5)
    stat.bootstrap(id, val)
    ## Summary Statistic: mean = 3.6667 
    ## 95% Bootstrap Percentile Confidence Interval:
    ##  2.5% 97.5% 
    ##     3     5 
    stat.bootstrap(id, val, func = sd)
    ## Summary Statistic: sd = 0.8165 
    ## 95% Bootstrap Percentile Confidence Interval:
    ##   2.5%  97.5% 
    ## 0.0000 1.0954 
}
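
A possible speed-up (a sketch on the same toy data): pre-split the values by subject so each bootstrap replicate is a single subset-and-unlist instead of an inner loop over ids.

if (F) {
    id <- c(1, 1, 1, 2, 2, 3)
    val <- c(3, 3, 3, 4, 4, 5)
    val.by.id <- split(val, id)                   # list of per-subject values
    r <- replicate(10000, {
        s <- sample(names(val.by.id), length(val.by.id), replace = TRUE)
        mean(unlist(val.by.id[s], use.names = FALSE))
    })
    quantile(r, c(0.025, 0.975))                  # should agree closely with stat.bootstrap(id, val)
}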


Thursday, May 20, 2021

Calculate the 95% CI of the ratio of two proportions, with application to diagnostic likelihood ratios

ratio.of.two.prop <- function(x, n)
{
    ## Purpose: Calculate the 95% CI of the ratio of two proportions, with application to diagnostic likelihood ratios. 
    ## Arguments:
    ##   x: a vector of two integers (numerator of the two proportions). 
    ##   n: a vector of two integers (denominator of the two proportions).
    ## Return: The point estimate and the 95% confidence interval of the ratio of two proportions, where the ratio is the first proportion divided by the second proportion. 
    ## Author: Feiming Chen
    ## ________________________________________________

    if (x[1] == 0 || x[1] == n[1] || x[2] == 0 || x[2] == n[2]) {
        x <- x + 0.5
        n <- n + 1
    }

    p <- x / n

    se <- sqrt((1 - p[1]) / (p[1] * n[1]) + (1 - p[2]) / (p[2] * n[2]))
    
    pp <- p[1] / p[2]                   # point estimate
    LB <- exp(log(pp) - 1.96 * se)
    UB <- exp(log(pp) + 1.96 * se)

    c(pp, LB, UB)
}
if (F) {                                # Unit Test
    ratio.of.two.prop(c(431, 30), c(460, 146))
    ## [1] 4.5599 3.3116 6.2786
    ratio.of.two.prop(c(0, 146), c(460, 146))
    ## [1] 0.00108830 0.00006817 0.01737423   
}
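
A hypothetical application sketch: the two proportions can be read off a 2x2 diagnostic table, which yields the positive and negative diagnostic likelihood ratios (the TP/FN/FP/TN values below reproduce the first unit-test call).

if (F) {
    TP <- 431; FN <- 29; FP <- 30; TN <- 116
    ratio.of.two.prop(c(TP, FP), c(TP + FN, FP + TN))  # DLR+ = sensitivity / (1 - specificity)
    ratio.of.two.prop(c(FN, TN), c(TP + FN, FP + TN))  # DLR- = (1 - sensitivity) / specificity
}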

Tuesday, May 4, 2021

Calculate the correlation within subjects

repeated.measures.correlation <- function(subject, x, y)
{
    ## Purpose: Calculate the correlation within subjects based on the paper:
    ##    Bland, J. Martin, and Douglas G. Altman. "Statistics notes: Calculating
    ##    correlation coefficients with repeated observations: Part 1—correlation
    ##    within subjects." Bmj 310.6977 (1995): 446.
    ## Arguments:
    ##    subject: ID for subject.  Each subject has multiple correlated measurements (x, y)
    ##    x: the variable that correlates with y.
    ##    y: the variable that correlates with x. 
    ## Return: 
    ## Author: Feiming Chen
    ## ________________________________________________

    dat <- data.frame(subject = factor(subject), x = x, y = y)
    b <- summary(aov(y ~ subject + x, dat))
    print(b)
    s <- b[[1]][[2]][-1]    
    r0 <- sqrt(s[1]/sum(s))             # within-subject correlation
    cat("\nWithin-Subject Correlation =", round(r0, 4), "\n")
    r0
}
if (F) {                                # Unit Test
    ## simulated data
    r <- 0.7                            # True Within-Subject Correlation
    z <- atanh(r)
    N <- 30
    se <- 1/sqrt(N-3)
    zz <- rnorm(N, z, se)
    rr <- tanh(zz)

    library(MASS)
    set.seed(1)
    dat <- data.frame(subject = factor(rep(1:N, each = 100)))
    dat$x <- dat$y <- NA
    for (i in 1:N) {
        a <- mvrnorm(n = 100, mu = c(0, 0), Sigma = matrix(c(1, rr[i], rr[i], 1), 2))
        dat[dat$subject == i, c(2, 3)] <- a
    }

    ## ANOVA analysis
    b <- summary(aov(y ~ subject + x, dat))
    s <- b[[1]][[2]][-1]    
    r0 <- sqrt(s[1]/sum(s))             # 0.70471

    ## using average of correlations
    d1 <- as.numeric(by(dat, dat$subject, function(d) cor(d$x, d$y)))
    d2 <- atanh(d1)
    d3 <- sample.mean.CI(d2)
    tanh(d3)                            # mean = 0.72 (0.68,0.75) (with Fisher's Z)
    sample.mean.CI(d1)                  # mean = 0.71 (0.67,0.74) (without Fisher's Z)
    repeated.measures.correlation(dat$subject, dat$x, dat$y) # 0.7047
}
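
Equivalently (a sketch, continuing with the simulated dat above): the within-subject correlation is the partial correlation of x and y given subject, i.e. the correlation after removing each subject's means.

if (F) {
    xc <- ave(dat$x, dat$subject, FUN = function(v) v - mean(v))  # center x within subject
    yc <- ave(dat$y, dat$subject, FUN = function(v) v - mean(v))  # center y within subject
    cor(xc, yc)                         # should match r0 from the ANOVA approach (~0.70)
}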

Thursday, April 29, 2021

Fisher's Z Transform

Fisher.Z <- function(x, inverse = FALSE)
{
    ## Purpose: Fisher's Z Transform (application to correlation confidence interval)
    ## Arguments:
    ##   x: a numeric vector (correlations in (-1, 1) for the forward transform; any real values for the inverse)
    ##   inverse: if TRUE, do the inverse transform. 
    ## Return: a transformed (or back-transformed) vector
    ## Author: Feiming Chen
    ## ________________________________________________

    if (inverse) {
        y <- tanh(x)
        ## equivalent to the following:
        ## e <- exp(2 * x)
        ## y <- (e - 1) / (e + 1)
    } else {
        y <- atanh(x)
        ## equivalent to the following:
        ## y <- 1/2 * log((1 + x) / (1 - x))
    }
    y
}
if (F) {                                # Unit Test
    Fisher.Z(0.8)                       # 1.0986
    Fisher.Z(1.2, inverse = TRUE)       # 0.83365

    x <- rbeta(10, 1, 1)                # simulate correlations
    Fisher.Z(x)
    Fisher.Z(x, inverse = TRUE)
    all.equal(x, Fisher.Z(Fisher.Z(x), inverse = TRUE)) # TRUE
}
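
A typical application (sketch): an approximate 95% CI for a correlation r from n pairs, using the approximate standard error 1/sqrt(n - 3) on the Z scale and back-transforming.

if (F) {
    r <- 0.8; n <- 50                   # hypothetical correlation and sample size
    z <- Fisher.Z(r)
    Fisher.Z(z + c(-1, 1) * qnorm(0.975) / sqrt(n - 3), inverse = TRUE)  # roughly (0.67, 0.88)
}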

Calculate bootstrap confidence intervals for a statistic of a numeric vector

Using the library "boot"

my.boot <- function(x, func = mean, R = 10000) {
    ## Purpose: Calculate bootstrap confidence interval for a statistic of a numeric vector
    ## Arguments:
    ##   x: a vector of numerical values
    ##   func: a function to calculate the statistic (default to "mean")
    ##   R: number of replicate bootstrap samples (default to 10000)
    ## Return: the bootstrap confidence intervals (percentile and BCa)
    ## Author: Feiming Chen
    ## ________________________________________________

    library(boot)
    b <- boot(x, function(x, i) func(x[i]), R = R)
    print(boot.ci(b, type = c("perc", "bca")))
}
if (F) {
    x <- rnorm(100)
    my.boot(x)
    my.boot(x, func = median)
    ## Level     Percentile            BCa          
    ## 95%   (-0.1284,  0.2726 )   (-0.1371,  0.2533 )  
}

Not using the library "boot"

stat.bootstrap.independent <- function(val, func = mean, N.bootstrap = 10000)
{
    ## Purpose: Calculate bootstrap-based 95% confidence interval for independent univariate data
    ## Arguments:
    ##   val: a numeric vector to be summarized. 
    ##   func: a function for calculating the summary statistic from a numeric vector.  Default to "mean". 
    ##   N.bootstrap: number of bootstrap samples.  Default to 10000. 
    ## Return: Point estimate, bootstrap percentile 95% CI, histogram for the bootstrap distribution of the target statistic
    ## Author: Feiming Chen
    ## ________________________________________________

    fname <- deparse(substitute(func))
    ans <- func(val)
    cat("Summary Statistic:", fname, "=", ans, "\n")

    N <- length(val)
    r <- rep(NA, N.bootstrap)
    set.seed(1)
    for (i in 1:N.bootstrap) {
        s <- sample(val, size = N, replace = TRUE)
        r[i] <- func(s)
    }

    hist(r, xlab = fname, main = paste("Bootstrap Distribution: ", fname))

    cat("95% Bootstrap Percentile Confidence Interval:\n")
    quantile(r, c(0.025, 0.975))
}
if (F) {                                # Unit Test
    val <- c(3, 3, 3, 4, 4, 5)
    stat.bootstrap.independent(val)          # compare with t-test based CI: (2.8098 4.5235)
    ## Summary Statistic: mean = 3.6667 
    ## 95% Bootstrap Percentile Confidence Interval:
    ##   2.5%  97.5% 
    ## 3.1667 4.3333 
}



Monday, April 26, 2021

Simple Survival Analysis of time-to-event data with missing data

my.survival.analysis <- function(dat, study.end.time = NULL)
{
    ## Purpose: Survival Analysis of time-to-event data with missing data. Assume two groups. 
    ## Arguments:
    ##   dat: time-to-event data that look like (variable name is fixed):
    ## 
    ##         time status  arm
    ##           9      1    1
    ##          13      1    0
    ##          13      0    1
    ## 
    ##     The meaning of each variable is as follows: 
    ##       time: the follow-up time (numeric) for right censored data.
    ##       status: the status indicator. 0 = right censored, 1 = event. 
    ##       arm: the group indicator for comparison. 0 = control group, 1 = treatment group. 
    ## 
    ##   study.end.time: truncation time (the end of the time window over which the two groups are compared).
    ##                   It must not exceed the default value (the minimum of the largest observed time in each of the two groups).
    ## 
    ## Return: Survival Data Analysis Summary and Plots
    ## Author: Feiming Chen
    ## ________________________________________________

    library(survival)

    dat <<- dat                         # so that "cox.zph(f2)" below works. 

    fmla <- Surv(time, status) ~ arm    # formula for survival time analysis

    f <- survfit(formula = fmla, data = dat)

    print(f)

    ## Simple plot of survival curves
    plot(f, lwd = 2, col = c("red", "blue"), mark.time = TRUE, xlab = "Time", ylab = "Survival Probability")
    legend("topright", legend = names(f$strata), bg = "lightyellow", col = c("red", "blue"), title = "Groups", lwd = 2)

    cat("\n------------------------------\nLog-Rank test for comparing two survival curves:\n")
    print(survdiff(fmla, dat))

    cat("\n------------------------------\nSurvival Probability at Study End:\n")
    if (is.null(study.end.time)) {
        study.end.time <- max(f$time)
        tau <- NULL
    } else  tau <- study.end.time

    ans <- summary(f, times = study.end.time, extend = TRUE, rmean = study.end.time)
    print(ans)
    ## Post-test disease risk = 1 - survival.probability.at.end

    ## ARR (Absolute Risk Reduction) - Check the direction of the risk reduction manually! 
    ARR <- diff(ans$surv)
    cat("ARR (Absolute Risk Reduction) =", round(ARR, 2), "\n")
    NNT <- ceiling(1 / ARR)
    cat("NNT (Number Needed to Treat) =", NNT, "\n")
    OR <- ans$surv[1] / (1 - ans$surv[1]) / (ans$surv[2] / (1 - ans$surv[2]))
    cat("Odds Ratio:", round(OR, 2), "\n")

    cat("\n------------------------------\nUsing Cox Regression Model:\n")
    f2 <- coxph(formula = fmla, data = dat)
    print(f2)
    cat("Hazard Ratio =", round(exp(f2$coefficients), 2), "\n")

    cat("\n------------------------------\nAssessing Proportional Hazards Assumptions:\n")
    print(cox.zph(f2))

    cat("\n------------------------------\nRMST (Restricted Mean Survival Time) Analysis\n")
    library(survRM2)
    r <- rmst2(time = dat$time, status = dat$status, arm = dat$arm, tau = tau)
    print(r)
    ## plot(r, xlab = "Time", ylab = "Survival Probability")

    f
}

if (F) {                                # Unit Test
    d <- data.frame(time = aml$time, status = aml$status, arm = ifelse(aml$x == "Maintained", 1, 0))
    f <- my.survival.analysis(d, study.end.time = 35)

    d <- survRM2::rmst2.sample.data()
    f <- my.survival.analysis(d, study.end.time = 10)

    ## cannot do it inside a function.
    ## make survival curve with cumulative events/incidences
    f <- survfit(Surv(time, status) ~ x, data = aml)
    library(survminer)
    ggsurvplot(f, data = aml, fun = "event", conf.int = TRUE, pval = TRUE, risk.table = "abs_pct", ggtheme = theme_bw(), palette = c("red", "blue"), risk.table.y.text.col = TRUE, risk.table.y.text = FALSE, ncensor.plot = TRUE)

}


Example output from the first unit-test call above (aml data, study.end.time = 35):

Call: survfit(formula = fmla, data = dat)

       n events median 0.95LCL 0.95UCL
arm=0 12     11     23       8      NA
arm=1 11      7     31      18      NA

------------------------------
Log-Rank test for comparing two survival curves:
Call:
survdiff(formula = fmla, data = dat)

       N Observed Expected (O-E)^2/E (O-E)^2/V
arm=0 12       11     7.31      1.86       3.4
arm=1 11        7    10.69      1.27       3.4

 Chisq= 3.4  on 1 degrees of freedom, p= 0.07 

------------------------------
Survival Probability at Study End:
Call: survfit(formula = fmla, data = dat)

                arm=0 
        time       n.risk      n.event     survival      std.err lower 95% CI upper 95% CI 
     35.0000       2.0000       9.0000       0.1944       0.1219       0.0569       0.6642 

                arm=1 
        time       n.risk      n.event     survival      std.err lower 95% CI upper 95% CI 
      35.000        3.000        6.000        0.368        0.163        0.155        0.875 

ARR (Absolute Risk Reduction) = 0.17 
NNT (Number Needed to Treat) = 6 
Odds Ratio: 0.41 

------------------------------
Using Cox Regression Model:
Call:
coxph(formula = fmla, data = dat)

     coef exp(coef) se(coef)    z    p
arm -0.92      0.40     0.51 -1.8 0.07

Likelihood ratio test=3.4  on 1 df, p=0.066
n= 23, number of events= 18 
Hazard Ratio = 0.4 

------------------------------
Assessing Proportional Hazards Assumptions:
         chisq df    p
arm    0.00788  1 0.93
GLOBAL 0.00788  1 0.93

------------------------------
RMST (Restricted Mean Survival Time) Analysis

The truncation time: tau = 35  was specified. 

Restricted Mean Survival Time (RMST) by arm 
               Est.    se lower .95 upper .95
RMST (arm=1) 27.057 2.906    21.361    32.753
RMST (arm=0) 20.958 3.456    14.185    27.732


Restricted Mean Time Lost (RMTL) by arm 
               Est.    se lower .95 upper .95
RMTL (arm=1)  7.943 2.906     2.247    13.639
RMTL (arm=0) 14.042 3.456     7.268    20.815


Between-group contrast 
                      Est. lower .95 upper .95     p
RMST (arm=1)-(arm=0) 6.098    -2.752    14.949 0.177
RMST (arm=1)/(arm=0) 1.291     0.878     1.899 0.194
RMTL (arm=1)/(arm=0) 0.566     0.238     1.342 0.196
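
A possible follow-up (a sketch, not part of the function): an approximate Wald 95% CI for the ARR at the study end time, built from the per-arm survival standard errors in the summary above and assuming the two arms are independent.

if (F) {
    s <- summary(f, times = 35, extend = TRUE)   # f from the first unit-test call (aml data)
    ARR <- diff(s$surv)                          # 0.368 - 0.194 = 0.17
    ARR + c(-1, 1) * qnorm(0.975) * sqrt(sum(s$std.err^2))  # roughly (-0.23, 0.57)
}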

Friday, April 16, 2021

Search for the sample size that produces a specific (lower) confidence limit

search.n.binary.proportion <- function(p, cl, alpha = 0.05)
{
    ## Purpose: Search for the sample size that produces a specific (lower) confidence limit.
    ## Arguments:
    ##   p: reported point estimate
    ##   cl: reported lower confidence limit
    ##   alpha: type I error rate for two-sided confidence interval. Default to 5%. 
    ## Return: a conservative estimate for the 95% confidence interval
    ## Author: Feiming Chen
    ## ________________________________________________

    N <- 50                             # start at sample size of 50
    while (N < 2000) {                  # end at sample size of 2000
        x <- round(N * p)
        r <- prop.test(x, N, conf.level = 1 - alpha, correct = FALSE)
        cl2 <- r$conf.int[1]
        if (cl2 > cl) break
        N <- N + 1 
    }
    N <- N - 1
    x <- floor(N * p)
    cat("Sample Size =", N, ", X =", x, "\n")
    prop.test(x, N, conf.level = 0.95)
}
if (F) {                                # Unit Test
    search.n.binary.proportion(0.849, 0.804, alpha = 0.1)

    ## Sample Size = 194 , X = 164 

    ##  1-sample proportions test with continuity correction

    ## data:  x out of N, null probability 0.5
    ## X-squared = 91.2, df = 1, p-value <2e-16
    ## alternative hypothesis: true p is not equal to 0.5
    ## 95 percent confidence interval:
    ##  0.78497 0.89167
    ## sample estimates:
    ##       p 
    ## 0.84536 

}

Tuesday, April 6, 2021

Calculate sample proportion and its confidence interval based on the Score Test

sample.prop.CI <- function(x, n)
{
    ## Purpose: Calculate sample proportion and its confidence interval based on the Score Test
    ## Arguments:
    ##   x: a count in the numerator
    ##   n: total count in the denominator
    ## Return: a text string with the sample proportion and its confidence interval
    ## Author: Feiming Chen
    ## ________________________________________________

    r <- prop.test(x, n, correct = FALSE)
    paste0(round(r$estimate, 2), " (", paste0(round(r$conf.int, 2), collapse = ","), ")")
}
if (F) {                                # Unit Test
    set.seed(1)
    sample.prop.CI(10, 100)             # "0.1 (0.06,0.17)"
}

Assess risk stratification performance

risk.stratification.analysis <- function(x)
{
    ## Purpose: Assess risk stratification performance. 
    ## Arguments:
    ##   x: a numeric vector of length 2K for the raw confusion matrix.
    ##      First K numbers: count of each risk group with reference positive results
    ##      Second K numbers: count of each risk group with reference negative results
    ## Return:
    ##   an expanded confusion matrix with post-test disease risks and likelihood ratios.
    ## Author: Feiming Chen
    ## ________________________________________________

    d <- as.data.frame(addmargins(matrix(x, ncol = 2)))
    n <- nrow(d) - 1                    # number of risk groups
    g.names <- paste0("Group ", 1:n)    # group names
    dimnames(d) <- list(c(g.names, "Sum"), c("Ref.Pos", "Ref.Neg", "Sum"))

    d$Post.Test.Risk <- round(with(d, Ref.Pos / Sum), 2)
    d$Likelihood.Ratio <- round(with(d, (Ref.Pos / Ref.Pos[n+1]) / (Ref.Neg / Ref.Neg[n+1])), 2)

    d
}
if (F) {                                # Unit Test
    x <- c(38, 27, 35, 5, 10, 41, 137, 158)
    risk.stratification.analysis(x)

    ##         Ref.Pos Ref.Neg Sum Post.Test.Risk Likelihood.Ratio
    ## Group 1      38      10  48           0.79            12.52
    ## Group 2      27      41  68           0.40             2.17
    ## Group 3      35     137 172           0.20             0.84
    ## Group 4       5     158 163           0.03             0.10
    ## Sum         105     346 451           0.23             1.00

}
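
Sanity-check sketch (using x from the unit test above): each post-test risk should equal the risk implied by Bayes' theorem, i.e. post-test odds = pre-test odds x likelihood ratio.

if (F) {
    d <- risk.stratification.analysis(x)
    pre.odds <- d["Sum", "Ref.Pos"] / d["Sum", "Ref.Neg"]     # pre-test odds of disease
    post.odds <- pre.odds * d$Likelihood.Ratio
    round(post.odds / (1 + post.odds), 2)                     # should match d$Post.Test.Risk
}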

Friday, March 26, 2021

Calculate sample mean and its confidence interval using t-distribution

sample.mean.CI <- function(x, log.transform = FALSE)
{
    ## Purpose: Calculate sample mean and its confidence interval using t-distribution
    ## Arguments:
    ##   x: a vector of numeric values
    ##   log.transform: if T, apply log transformation to the data first. 
    ## Return: prints and returns the sample mean and its confidence interval
    ## Author: Feiming Chen
    ## ________________________________________________

    if (log.transform) {
        x <- log(x)
    }
    r <- t.test(x)
    e <- r$estimate
    c1 <- r$conf.int[1]
    c2 <- r$conf.int[2]

    if (log.transform) {
        e <- exp(e)
        c1 <- exp(c1)
        c2 <- exp(c2)
    }
    
    ans <- c(e, c1, c2)
    cat("Mean = ", convert.to.CI.text(ans), "\n")
    ans

}
if (F) {                                # Unit Test
    set.seed(1)
    sample.mean.CI(rnorm(100))          # Mean =  0.11 (-0.07 , 0.29) 
    sample.mean.CI(c(1, 2, 5, 10, 100), log.transform = TRUE) # Mean =  6.31 (0.7 , 57.23) 
}
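
Note (a sketch): with log.transform = TRUE the back-transformed point estimate is the geometric mean, e.g.

if (F) {
    exp(mean(log(c(1, 2, 5, 10, 100))))  # 6.31, matching the unit test above
}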


Thursday, March 25, 2021

Sample Size for One-Sample Z test

sample.size.one.sample.Z.test <- function(E, S, alpha = 0.025, beta = 0.2)
{
    ## Purpose: Sample Size for One-sample Z test
    ##    H1: mu1 (expected) > mu0 (target)
    ##    Assume type I error of 2.5% (alpha), and power of 80% (1 - beta). 
    ## Arguments:
    ##   E: effect size (difference of expected mean and target mean)
    ##   S: estimated spread (standard deviation) from pilot sample
    ##   alpha: type I error (default to 2.5%)
    ##   beta: type II error (default to 20%)
    ## Return: sample size
    ## Author: Feiming Chen
    ## ________________________________________________

    a <- qnorm(1 - alpha) - qnorm(beta) # default = 2.8

    ## Note: enforce a common rule of thumb that a minimum sample size of 30 is needed for the sample mean to be approximately normal
    N <- max(30, ceiling((a * S / E)^2))

    cat("Sample Size =", N, ", when Effect Size =", E, ", SD =", S, ", Type I Error =", alpha, ", Power = ", 1 - beta, "\n")
}
if (F) {                                # Unit Test
    sample.size.one.sample.Z.test(0.1, 0.2) # 32

    for (E in seq(0.05, 0.2, 0.05)) {
        for (SD in seq(0.2, 0.5, 0.1)) {
            sample.size.one.sample.Z.test(E, SD)
        }
    }

    ## Sample Size = 126 , when Effect Size = 0.05 , SD = 0.2 , Type I Error = 0.025 , Power =  0.8 
    ## Sample Size = 283 , when Effect Size = 0.05 , SD = 0.3 , Type I Error = 0.025 , Power =  0.8 
    ## Sample Size = 503 , when Effect Size = 0.05 , SD = 0.4 , Type I Error = 0.025 , Power =  0.8 
    ## Sample Size = 785 , when Effect Size = 0.05 , SD = 0.5 , Type I Error = 0.025 , Power =  0.8 
    ## Sample Size = 32 , when Effect Size = 0.1 , SD = 0.2 , Type I Error = 0.025 , Power =  0.8 
    ## Sample Size = 71 , when Effect Size = 0.1 , SD = 0.3 , Type I Error = 0.025 , Power =  0.8 
    ## Sample Size = 126 , when Effect Size = 0.1 , SD = 0.4 , Type I Error = 0.025 , Power =  0.8 
    ## Sample Size = 197 , when Effect Size = 0.1 , SD = 0.5 , Type I Error = 0.025 , Power =  0.8 
    ## Sample Size = 30 , when Effect Size = 0.15 , SD = 0.2 , Type I Error = 0.025 , Power =  0.8 
    ## Sample Size = 32 , when Effect Size = 0.15 , SD = 0.3 , Type I Error = 0.025 , Power =  0.8 
    ## Sample Size = 56 , when Effect Size = 0.15 , SD = 0.4 , Type I Error = 0.025 , Power =  0.8 
    ## Sample Size = 88 , when Effect Size = 0.15 , SD = 0.5 , Type I Error = 0.025 , Power =  0.8 
    ## Sample Size = 30 , when Effect Size = 0.2 , SD = 0.2 , Type I Error = 0.025 , Power =  0.8 
    ## Sample Size = 30 , when Effect Size = 0.2 , SD = 0.3 , Type I Error = 0.025 , Power =  0.8 
    ## Sample Size = 32 , when Effect Size = 0.2 , SD = 0.4 , Type I Error = 0.025 , Power =  0.8 
    ## Sample Size = 50 , when Effect Size = 0.2 , SD = 0.5 , Type I Error = 0.025 , Power =  0.8 

}
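
Cross-check (a sketch): for the same inputs, power.t.test() with a one-sided alternative gives the t-based sample size, which is slightly larger than the Z-based value of 32.

if (F) {
    power.t.test(delta = 0.1, sd = 0.2, sig.level = 0.025, power = 0.8,
                 type = "one.sample", alternative = "one.sided")
}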


Wednesday, February 24, 2021

Prevalence Scaling

prevalence.scaling <- function(model.P, model.prevalence = 0.5, target.prevalence = 0.1)
{
    ## Purpose: Convert a probability based on training data prevalence to
    ##          a probability based on target population prevalence. 
    ## Arguments:
    ##   model.P: a vector of model-based probability (risk score)
    ##   model.prevalence: prevalence of the model's training data
    ##   target.prevalence: prevalence of the target population
    ## Return:
    ##   a vector of scaled probabilities reflecting the prevalence of the target population
    ## Author: Feiming Chen
    ## ________________________________________________

    ## convert a probability to the odds
    O <- function(p)  p / (1-p)

    1 / ( 1 + O(model.prevalence) / O(target.prevalence) / O(model.P) )
}
if (F) {                                # Unit Test
    set.seed(1)
    prevalence.scaling(runif(5))
    ## [1] 0.038614 0.061784 0.129688 0.523663 0.027304
}
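
Sanity-check sketch: a model probability equal to the training prevalence should map exactly to the target prevalence.

if (F) {
    prevalence.scaling(0.5, model.prevalence = 0.5, target.prevalence = 0.1)  # 0.1
}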

Thursday, February 18, 2021

Confidence Interval of the Difference between Two Correlated Proportions (McNemar Test)

CI.diff.two.correlated.proportions <- function(A, B, C, D, alpha = 0.05)
{
    ## Purpose: CI of the difference between two correlated proportions (McNemar Test)
    ## Arguments:
    ##   A, B, C, D: the 2x2 incidence count table for the matched pairs. 
    ##               A = both positive;
    ##               B = treatment positive, control negative;
    ##               C = treatment negative, control positive;
    ##               D = both negative;
    ##   alpha: type I error. Default to 0.05 (for two-sided CI).
    ## Return: 95% Confidence Interval of the Difference
    ## Author: Feiming Chen
    ## Method: Wald Z Method with Continuity Correction
    ## Reference: Newcombe (1998c), page 2638.
    ## ________________________________________________

    N <- A + B + C + D                  # total number of matched pairs
    Delta <- (B - C) / N
    sw <- sqrt(((A + D) * (B + C) + 4 * B * C) / N^3)
    z <- qnorm(1 - alpha / 2)           # default: 1.96
    corr.term <- 1 / N                  # continuity correction 
    L <- Delta - z * sw - corr.term
    U <- Delta + z * sw + corr.term
    x <- matrix(c(A, B, C, D), 2, 2, byrow = TRUE, 
                dimnames = list(c("Test.POS", "Test.NEG"), c("Control.POS", "Control.NEG")))
    cat("Raw Data (x):\n")
    print(x)
    cat("\nData Converted to Proportions:\n")
    print(round(x/N, 2))
    cat("\nDifference of Two Paired Proportions =", round(Delta, 3), "\n")
    p.discordant <- (B+C) / N
    cat("Proportion of Discordant Pairs =", round(p.discordant, 3), "\n")
    N.future <- sample.size.for.two.correlated.proportions.test(Delta, p.discordant)
    cat("\nCurrent Sample Size =", N, "\n")
    cat("Future  Sample Size =", N.future, "\n")

    print(mcnemar.test(x))
    cat("\nConfidence Interval = (", round(L, 4), ",", round(U, 4), ")\n\n")
}
if (F) {                                # Unit Test
    CI.diff.two.correlated.proportions(12, 9, 2, 21) 
    ## Confidence Interval = ( -0.0037 , 0.3219 )
    CI.diff.two.correlated.proportions(25, 20, 5, 25) 
    ## Raw Data (x):
    ##          Control.POS Control.NEG
    ## Test.POS          25          20
    ## Test.NEG           5          25

    ## Data Converted to Proportions:
    ##          Control.POS Control.NEG
    ## Test.POS        0.33        0.27
    ## Test.NEG        0.07        0.33

    ## Difference of Two Paired Proportions = 0.2 
    ## Proportion of Discordant Pairs = 0.333 

    ## Current Sample Size = 75 
    ## Future  Sample Size = 63 

    ##  McNemar's Chi-squared test with continuity correction

    ## data:  x
    ## McNemar's chi-squared = 7.84, df = 1, p-value = 0.00511


    ## Confidence Interval = ( 0.0641 , 0.3359 )
}

Sample Size for Two Correlated Proportions Based on McNemar Test

sample.size.for.two.correlated.proportions.test <- function(p.diff = 0.1, p.discordant = 0.2, alpha = 0.025, beta = 0.2)
{
    ## Purpose: Sample Size for Two Correlated Proportions Test based on McNemar Test
    ##   H0: p2 <= p1
    ##   H1: p2 >  p1
    ##   (Application: tests of sensitivity, specificity with three-way, matched-pairs study)
    ## Arguments:
    ##   p.diff: Effect size for the difference between two correlated proportions. Default to 10%. 
    ##   p.discordant: Proportion of pairs for which the responses differed. Default to 20%. 
    ##   alpha: type I error. Default to 0.025 (for one-sided test).
    ##   beta: type II error (1 - power). Default to 0.2 (so power is 80%). 
    ## Return:
    ##   Sample Size
    ## Author: Feiming Chen
    ## Reference: Machin, Campbell, Fayers, and Pinol (1997).
    ## ________________________________________________

    p10 <- (p.discordant + p.diff) / 2  # with default, p10 = 0.15
    p01 <- (p.discordant - p.diff) / 2  # default: p01 = 0.05
    OR <- p10 / p01                     # default: OR = 3
    OR.plus.one <- OR + 1               # default: 4
    OR.minus.one <- OR - 1              # default: 2
    
    N = (qnorm(1 - alpha) * OR.plus.one + qnorm(1 - beta) * sqrt(OR.plus.one^2 - OR.minus.one^2 * p.discordant))^2 / (OR.minus.one^2 * p.discordant)
    ceiling(N)
}
if (F) {                                # Unit Test
    sample.size.for.two.correlated.proportions.test() # 155 (PASS output: 155 under normal approx.)
}

Wednesday, February 17, 2021

Sample Size for One Proportion Test

sample.size.for.one.proportion.test <- function(p0  = 0.8, p1 = 0.9, alpha = 0.025, beta = 0.2)
{
    ## Purpose: Sample Size for One Proportion Test.
    ##   H0: p <= p0
    ##   H1: p >  p0
    ##   (Application: tests of sensitivity, specificity with performance goals)
    ## Arguments:
    ##   p0: performance goal (minimally acceptable proportion). Default to 80%. 
    ##   p1: expected performance (minimal proportion under the alternative hypothesis). Default to 90%. 
    ##   alpha: type I error. Default to 0.025 (for one-sided test).
    ##   beta: type II error (1 - power). Default to 0.2 (so power is 80%). 
    ## Return:
    ##   Sample Size
    ## Author: Feiming Chen
    ## Reference: Biswas, Bipasa. "Clinical performance evaluation of molecular diagnostic tests."
    ##            The Journal of Molecular Diagnostics 18.6 (2016): 803-812.
    ## ________________________________________________

    N = (qnorm(1 - alpha) * sqrt(p0 * (1 - p0)) + qnorm(1 - beta) * sqrt(p1 * (1 - p1)))^2 / (p1 - p0)^2
    ceiling(N)
}
if (F) {                                # Unit Test
    sample.size.for.one.proportion.test() # 108 (PASS output: 107 under exact test; 108 under normal approx.)
}