Wednesday, October 20, 2021

Make a plot to compare point estimates and their confidence intervals

plot.compare.CI <- function(x, CI, ylab = "Estimates", ...)
{
    ## Purpose: Make a plot to compare point estimates and their confidence intervals
    ##          (presumably derived from different methods)
    ## Arguments:
    ##   x: a vector of point estimates
    ##   CI: a matrix of CI's. Each column is a CI pair.
    ##       1st row is lower CI. 2nd row is upper CI
    ##   ...: additional arguments passed to the plotCI function. 
    ## Return: a plot. 
    ## Author: Feiming Chen
    ## ________________________________________________

    require(gplots)

    ## Determine how best to place the X-axis labels --------------------
    lx <- length(x)                     # number of point estimates

    if (lx > 7) {                       # many labels: rotate them and widen the bottom margin
        xlas <- 2
        o <- par(mar = c(10, 4, 4, 2))
        on.exit(par(o))
    } else xlas <- 0

    li <- CI[1,]
    ui <- CI[2,]
    plotCI(x, li = li, ui = ui, barcol="blue", xaxt="n", xlab = "", ylab = ylab, sfrac = 0.005, xlim = c(0.7, lx + 0.3), ...)

    s <- 1:lx
    ss <- names(x)
    axis(side = 1, at = s, labels = ss, tick = FALSE, las = xlas)

    ## add numerical labels next to the point estimates and the CI limits
    s1 <- s + 0.18                      # horizontal offset for the labels
    text(s1, y = li, labels = round(li, 2))
    text(s1, y = ui, labels = round(ui, 2))
    text(s1, y = x,  labels = round(x, 2))
}
if (F) {                                # Unit Test
    x <- 1:3
    names(x) <- c("A", "B", "C")
    CI <- matrix(c(0.8, 1.2, 1.7, 2.3, 2.9, 3.1), nrow = 2)
    plot.compare.CI(x, CI)
}
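
A further sketch (hypothetical, not part of the original unit test): the inputs can come from different CI methods applied to the same data, e.g. the Wilson (score) and exact binomial intervals for one proportion.

if (F) {
    r1 <- prop.test(40, 100, correct = FALSE)   # Wilson (score) interval
    r2 <- binom.test(40, 100)                   # Clopper-Pearson (exact) interval
    x <- c(Wilson = unname(r1$estimate), Exact = unname(r2$estimate))
    CI <- cbind(r1$conf.int, r2$conf.int)       # row 1 = lower limits, row 2 = upper limits
    plot.compare.CI(x, CI)
}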

Friday, August 27, 2021

Preprocess clustered binary data before inference on sample proportion

preprocess.clustered.binary.data <- function(x, n)
{
    ## Purpose: Preprocess clustered binary data before inference on sample proportion
    ##          Reference: Rao, J. N. K., & Scott, A. J. (1992). A simple method for
    ##                     the analysis of clustered binary data. Biometrics, 577-585.
    ## Keywords: variance adjustment, ratio estimator, correlated data
    ## Arguments:
    ##   x: a vector for the numerators across clusters.
    ##   n: a vector for the denominators across clusters.
    ## Return: a pre-processed numerator and denominator for overall sample proportion
    ## Author: Feiming Chen
    ## ________________________________________________

    m <- length(x)
    cat("Number of Clusters =", m, "\n")

    n0 <- sum(n)
    cat("Raw Sample Size =", n0, "\n")

    x0 <- sum(x)
    cat("Raw Incidence Count =", x0, "\n")

    cat("\nEstimates Regarding the Overall Sample Proportion:\n")

    p <- x0 / n0
    cat("  Point Estimate =", round(p, 4), "\n")

    v0 <- p * (1 - p) / n0
    cat("  Naive Binomial Variance =", round(v0, 4), "\n")

    r <- x - n * p
    v <- m * sum(r^2) / (m - 1) / n0^2
    cat("  Correct Variance  =", round(v, 4), "\n")

    d <- v / v0
    cat("\nDesign Effect (Variance Inflation Factor due to Clustering) =", round(d, 4), "\n")

    n1 <- n0 / d
    cat("Effective Sample Size (n) =", round(n1), "\n")

    x1 <- x0 / d
    cat("Effective Incidence Count (x) =", round(x1), "\n")

    ## Return the pre-processed numerator (x) and denominator (n) for
    ## the overall sample proportion (p = x / n). 
    list(x = x1, n = n1)

}
if (F) {                                # Unit Test
    x = c(1,1,2,0,5,0,1,4,0,1,0,0,0,0,0,4,3,0,1,1,1,2,0,1,0,2,1,0,1,5,0,0,0,0,0,0,2,0,2,0,0,2,1,2,2,0,0,1,0,1) 
    n = c(2,2,3,0,7,1,2,5,1,2,0,0,4,2,0,7,4,1,1,1,4,2,3,1,0,2,1,0,1,5,3,0,1,3,0,0,2,1,2,0,0,5,3,2,2,1,0,2,1,1) 

    a <- preprocess.clustered.binary.data(x, n)
    ## Number of Clusters = 50 
    ## Raw Sample Size = 93 
    ## Raw Incidence Count = 50 

    ## Estimates Regarding the Overall Sample Proportion:
    ##   Point Estimate = 0.5376 
    ##   Naive Binomial Variance = 0.0027 
    ##   Correct Variance  = 0.004 

    ## Design Effect (Variance Inflation Factor due to Clustering) = 1.4895 
    ## Effective Sample Size (n) = 62 
    ## Effective Incidence Count (x) = 34 

    prop.test(a$x, a$n)
    ## 95 percent confidence interval:
    ##  0.40774 0.66290
    ## sample estimates:
    ##       p 
    ## 0.53763 

    ## Compare to Bootstrap Confidence Intervals
    library(boot)
    r <- boot(data.frame(x=x, n=n), function(dat, idx) { d <- dat[idx,]; sum(d$x)/sum(d$n)}, R = 10000)
    boot.ci(r)
    ## Bootstrap Percentile CI: ( 0.4118,  0.6569 ), which is a bit tighter than the VIF method. 

    ## Compare to Unadjusted (Wrong) CI: 
    prop.test(sum(x), sum(n))
    ## 95 percent confidence interval:
    ##  0.43159 0.64053, which is too narrow and is wrong. 

}
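
As a rough cross-check (a sketch, continuing with x and n from the unit test above): a Wald interval built directly from the Rao-Scott corrected variance should land close to the prop.test() interval on the effective counts.

if (F) {
    p <- sum(x) / sum(n)                              # overall sample proportion
    m <- length(x)                                    # number of clusters
    v <- m * sum((x - n * p)^2) / (m - 1) / sum(n)^2  # Rao-Scott corrected variance
    p + c(-1, 1) * qnorm(0.975) * sqrt(v)             # roughly (0.41, 0.66)
}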

Wednesday, June 30, 2021

Bootstrap Confidence Interval for Data with Repeated Measures

stat.bootstrap <- function(id, val, func = mean, N.bootstrap = 10000)
{
    ## Purpose: Calculate bootstrap-based 95% confidence interval for data with repeated measures
    ## Arguments:
    ##   id: uniquely identifies a subject (patient)
    ##   val: a numeric vector to be summarized.  It contains repeated measures per subject.
    ##   func: a function for calculating the summary statistic from a numeric vector.  Default to "mean". 
    ##   N.bootstrap: number of bootstrap samples.  Default to 10000. 
    ## Return: Point estimate, bootstrap percentile 95% CI, histogram for the bootstrap distribution of the target statistic
    ## Author: Feiming Chen
    ## ________________________________________________

    fname <- deparse(substitute(func))
    cat("Summary Statistic:", fname, "=", func(val), "\n")

    id.uniq <- unique(id)
    N <- length(id.uniq)

    r <- rep(NA, N.bootstrap)
    for (i in 1:N.bootstrap) {
        s <- sample(id.uniq, N, replace = TRUE)
        v <- c()
        for (j in s) v <- c(v, val[id == j])
        r[i] <- func(v)
    }

    hist(r, xlab = fname, main = paste("Bootstrap Distribution: ", fname))
    cat("95% Bootstrap Percentile Confidence Interval:\n")
    quantile(r, c(0.025, 0.975))
}
if (F) {                                # Unit Test
    id <- c(1, 1, 1, 2, 2, 3)
    val <- c(3, 3, 3, 4, 4, 5)
    stat.bootstrap(id, val)
    ## Summary Statistic: mean = 3.6667 
    ## 95% Bootstrap Percentile Confidence Interval:
    ##  2.5% 97.5% 
    ##     3     5 
    stat.bootstrap(id, val, func = sd)
    ## Summary Statistic: sd = 0.8165 
    ## 95% Bootstrap Percentile Confidence Interval:
    ##   2.5%  97.5% 
    ## 0.0000 1.0954 
}
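
A possible speed-up (a sketch on the same toy data): pre-split the values by subject so each bootstrap replicate is a single subset-and-unlist instead of an inner loop over ids.

if (F) {
    id <- c(1, 1, 1, 2, 2, 3)
    val <- c(3, 3, 3, 4, 4, 5)
    val.by.id <- split(val, id)                   # list of per-subject values
    r <- replicate(10000, {
        s <- sample(names(val.by.id), length(val.by.id), replace = TRUE)
        mean(unlist(val.by.id[s], use.names = FALSE))
    })
    quantile(r, c(0.025, 0.975))                  # should agree closely with stat.bootstrap(id, val)
}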


Thursday, May 20, 2021

Calculate the 95% CI of the ratio of two proportions, with application to diagnostic likelihood ratios

ratio.of.two.prop <- function(x, n)
{
    ## Purpose: Calculate the 95% CI of the ratio of two proportions, with application to diagnostic likelihood ratios. 
    ## Arguments:
    ##   x: a vector of two integers (numerator of the two proportions). 
    ##   n: a vector of two integers (denominator of the two proportions).
    ## Return: The point estimate and the 95% confidence interval of the ratio of two proportions, where the ratio is the first proportion divided by the second proportion. 
    ## Author: Feiming Chen
    ## ________________________________________________

    if (x[1] == 0 || x[1] == n[1] || x[2] == 0 || x[2] == n[2]) {
        x <- x + 0.5
        n <- n + 1
    }

    p <- x / n

    se <- sqrt((1 - p[1]) / (p[1] * n[1]) + (1 - p[2]) / (p[2] * n[2]))
    
    pp <- p[1] / p[2]                   # point estimate
    LB <- exp(log(pp) - 1.96 * se)
    UB <- exp(log(pp) + 1.96 * se)

    c(pp, LB, UB)
}
if (F) {                                # Unit Test
    ratio.of.two.prop(c(431, 30), c(460, 146))
    ## [1] 4.5599 3.3116 6.2786
    ratio.of.two.prop(c(0, 146), c(460, 146))
    ## [1] 0.00108830 0.00006817 0.01737423   
}
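
A hypothetical application sketch: the two proportions can be read off a 2x2 diagnostic table, which yields the positive and negative diagnostic likelihood ratios (the TP/FN/FP/TN values below reproduce the first unit-test call).

if (F) {
    TP <- 431; FN <- 29; FP <- 30; TN <- 116
    ratio.of.two.prop(c(TP, FP), c(TP + FN, FP + TN))  # DLR+ = sensitivity / (1 - specificity)
    ratio.of.two.prop(c(FN, TN), c(TP + FN, FP + TN))  # DLR- = (1 - sensitivity) / specificity
}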

Tuesday, May 4, 2021

Calculate the correlation within subjects

repeated.measures.correlation <- function(subject, x, y)
{
    ## Purpose: Calculate the correlation within subjects based on the paper:
    ##    Bland, J. Martin, and Douglas G. Altman. "Statistics notes: Calculating
    ##    correlation coefficients with repeated observations: Part 1—correlation
    ##    within subjects." Bmj 310.6977 (1995): 446.
    ## Arguments:
    ##    subject: ID for subject.  Each subject has multiple correlated measurements (x, y)
    ##    x: the variable that correlates with y.
    ##    y: the variable that correlates with x. 
    ## Return: 
    ## Author: Feiming Chen
    ## ________________________________________________

    dat <- data.frame(subject = factor(subject), x = x, y = y)
    b <- summary(aov(y ~ subject + x, dat))
    print(b)
    s <- b[[1]][[2]][-1]    
    r0 <- sqrt(s[1]/sum(s))             # within-subject correlation
    cat("\nWithin-Subject Correlation =", round(r0, 4), "\n")
    r0
}
if (F) {                                # Unit Test
    ## simulated data
    r <- 0.7                            # True Within-Subject Correlation
    z <- atanh(r)
    N <- 30
    se <- 1/sqrt(N-3)
    zz <- rnorm(N, z, se)
    rr <- tanh(zz)

    library(MASS)
    set.seed(1)
    dat <- data.frame(subject = factor(rep(1:N, each = 100)))
    dat$x <- dat$y <- NA
    for (i in 1:N) {
        a <- mvrnorm(n = 100, mu = c(0, 0), Sigma = matrix(c(1, rr[i], rr[i], 1), 2))
        dat[dat$subject == i, c(2, 3)] <- a
    }

    ## ANOVA analysis
    b <- summary(aov(y ~ subject + x, dat))
    s <- b[[1]][[2]][-1]    
    r0 <- sqrt(s[1]/sum(s))             # 0.70471

    ## using average of correlations
    d1 <- as.numeric(by(dat, dat$subject, function(d) cor(d$x, d$y)))
    d2 <- atanh(d1)
    d3 <- sample.mean.CI(d2)
    tanh(d3)                            # mean = 0.72 (0.68,0.75) (with Fisher's Z)
    sample.mean.CI(d1)                  # mean = 0.71 (0.67,0.74) (without Fisher's Z)
    repeated.measures.correlation(dat$subject, dat$x, dat$y) # 0.7047
}
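
Equivalently (a sketch, continuing with the simulated dat above): the within-subject correlation is the partial correlation of x and y given subject, i.e. the correlation after removing each subject's means.

if (F) {
    xc <- ave(dat$x, dat$subject, FUN = function(v) v - mean(v))  # center x within subject
    yc <- ave(dat$y, dat$subject, FUN = function(v) v - mean(v))  # center y within subject
    cor(xc, yc)                         # should match r0 from the ANOVA approach (~0.70)
}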

Thursday, April 29, 2021

Fisher's Z Transform

Fisher.Z <- function(x, inverse = FALSE)
{
    ## Purpose: Fisher's Z Transform (application to correlation confidence interval)
    ## Arguments:
    ##   x: a numeric vector (correlations in (-1, 1) for the forward transform; any real values for the inverse)
    ##   inverse: if TRUE, do the inverse transform. 
    ## Return: a transformed (or back-transformed) vector
    ## Author: Feiming Chen
    ## ________________________________________________

    if (inverse) {
        y <- tanh(x)
        ## equivalent to the following:
        ## e <- exp(2 * x)
        ## y <- (e - 1) / (e + 1)
    } else {
        y <- atanh(x)
        ## equivalent to the following:
        ## y <- 1/2 * log((1 + x) / (1 - x))
    }
    y
}
if (F) {                                # Unit Test
    Fisher.Z(0.8)                       # 1.0986
    Fisher.Z(1.2, inverse = TRUE)       # 0.83365

    x <- rbeta(10, 1, 1)                # simulate correlations
    Fisher.Z(x)
    Fisher.Z(x, inverse = TRUE)
    all.equal(x, Fisher.Z(Fisher.Z(x), inverse = TRUE)) # TRUE
}
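
A typical application (sketch): an approximate 95% CI for a correlation r from n pairs, using the approximate standard error 1/sqrt(n - 3) on the Z scale and back-transforming.

if (F) {
    r <- 0.8; n <- 50                   # hypothetical correlation and sample size
    z <- Fisher.Z(r)
    Fisher.Z(z + c(-1, 1) * qnorm(0.975) / sqrt(n - 3), inverse = TRUE)  # roughly (0.67, 0.88)
}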

Calculate bootstrap confidence intervals for a statistic of a numeric vector

Using the library "boot"

my.boot <- function(x, func = mean, R = 10000) {
    ## Purpose: Calculate bootstrap confidence interval for a statistic of a numeric vector
    ## Arguments:
    ##   x: a vector of numerical values
    ##   func: a function to calculate the statistic (default to "mean")
    ##   R: number of replicate bootstrap samples (default to 10000)
    ## Return: the bootstrap confidence intervals (percentile and BCa)
    ## Author: Feiming Chen
    ## ________________________________________________

    library(boot)
    b <- boot(x, function(x, i) func(x[i]), R = R)
    print(boot.ci(b, type = c("perc", "bca")))
}
if (F) {
    x <- rnorm(100)
    my.boot(x)
    my.boot(x, func = median)
    ## Level     Percentile            BCa          
    ## 95%   (-0.1284,  0.2726 )   (-0.1371,  0.2533 )  
}

Not using the library "boot"

stat.bootstrap.independent <- function(val, func = mean, N.bootstrap = 10000)
{
    ## Purpose: Calculate bootstrap-based 95% confidence interval for independent univariate data
    ## Arguments:
    ##   val: a numeric vector to be summarized. 
    ##   func: a function for calculating the summary statistic from a numeric vector.  Default to "mean". 
    ##   N.bootstrap: number of bootstrap samples.  Default to 10000. 
    ## Return: Point estimate, bootstrap percentile 95% CI, histogram for the bootstrap distribution of the target statistic
    ## Author: Feiming Chen
    ## ________________________________________________

    fname <- deparse(substitute(func))
    ans <- func(val)
    cat("Summary Statistic:", fname, "=", ans, "\n")

    N <- length(val)
    r <- rep(NA, N.bootstrap)
    set.seed(1)
    for (i in 1:N.bootstrap) {
        s <- sample(val, size = N, replace = TRUE)
        r[i] <- func(s)
    }

    hist(r, xlab = fname, main = paste("Bootstrap Distribution: ", fname))

    cat("95% Bootstrap Percentile Confidence Interval:\n")
    quantile(r, c(0.025, 0.975))
}
if (F) {                                # Unit Test
    val <- c(3, 3, 3, 4, 4, 5)
    stat.bootstrap.independent(val)          # compare with t-test based CI: (2.8098 4.5235)
    ## Summary Statistic: mean = 3.6667 
    ## 95% Bootstrap Percentile Confidence Interval:
    ##   2.5%  97.5% 
    ## 3.1667 4.3333 
}



Monday, April 26, 2021

Simple Survival Analysis of time-to-event data with missing data

my.survival.analysis <- function(dat, study.end.time = NULL)
{
    ## Purpose: Survival Analysis of time-to-event data with missing data. Assume two groups. 
    ## Arguments:
    ##   dat: time-to-event data that look like (variable name is fixed):
    ## 
    ##         time status  arm
    ##           9      1    1
    ##          13      1    0
    ##          13      0    1
    ## 
    ##     The meaning of each variable is as follows: 
    ##       time: the follow-up time (numeric) for right censored data.
    ##       status: the status indicator. 0 = right censored, 1 = event. 
    ##       arm: the group indicator for comparison. 0 = control group, 1 = treatment group. 
    ## 
    ##   study.end.time: truncation time (the end of the time window over which the two groups are compared).
    ##                   It must not exceed the default value (the minimum of the largest observed time in each of the two groups).
    ## 
    ## Return: Survival Data Analysis Summary and Plots
    ## Author: Feiming Chen
    ## ________________________________________________

    library(survival)

    dat <<- dat                         # so that "cox.zph(f2)" below works. 

    fmla <- Surv(time, status) ~ arm    # formula for survival time analysis

    f <- survfit(formula = fmla, data = dat)

    print(f)

    ## Simple plot of survival curves
    plot(f, lwd = 2, col = c("red", "blue"), mark.time = TRUE, xlab = "Time", ylab = "Survival Probability")
    legend("topright", legend = names(f$strata), bg = "lightyellow", col = c("red", "blue"), title = "Groups", lwd = 2)

    cat("\n------------------------------\nLog-Rank test for comparing two survival curves:\n")
    print(survdiff(fmla, dat))

    cat("\n------------------------------\nSurvival Probability at Study End:\n")
    if (is.null(study.end.time)) {
        study.end.time <- max(f$time)
        tau <- NULL
    } else  tau <- study.end.time

    ans <- summary(f, times = study.end.time, extend = TRUE, rmean = study.end.time)
    print(ans)
    ## Post-test disease risk = 1 - survival.probability.at.end

    ## ARR (Absolute Risk Reduction) - Check the direction of the risk reduction manually! 
    ARR <- diff(ans$surv)
    cat("ARR (Absolute Risk Reduction) =", round(ARR, 2), "\n")
    NNT <- ceiling(1 / ARR)
    cat("NNT (Number Needed to Treat) =", NNT, "\n")
    OR <- ans$surv[1] / (1 - ans$surv[1]) / (ans$surv[2] / (1 - ans$surv[2]))
    cat("Odds Ratio:", round(OR, 2), "\n")

    cat("\n------------------------------\nUsing Cox Regression Model:\n")
    f2 <- coxph(formula = fmla, data = dat)
    print(f2)
    cat("Hazard Ratio =", round(exp(f2$coefficients), 2), "\n")

    cat("\n------------------------------\nAssessing Proportional Hazards Assumptions:\n")
    print(cox.zph(f2))

    cat("\n------------------------------\nRMST (Restricted Mean Survival Time) Analysis\n")
    library(survRM2)
    r <- rmst2(time = dat$time, status = dat$status, arm = dat$arm, tau = tau)
    print(r)
    ## plot(r, xlab = "Time", ylab = "Survival Probability")

    f
}

if (F) {                                # Unit Test
    d <- data.frame(time = aml$time, status = aml$status, arm = ifelse(aml$x == "Maintained", 1, 0))
    f <- my.survival.analysis(d, study.end.time = 35)

    d <- survRM2::rmst2.sample.data()
    f <- my.survival.analysis(d, study.end.time = 10)

    ## cannot do it inside a function.
    ## make survival curve with cumulative events/incidences
    f <- survfit(Surv(time, status) ~ x, data = aml)
    library(survminer)
    ggsurvplot(f, data = aml, fun = "event", conf.int = TRUE, pval = TRUE, risk.table = "abs_pct", ggtheme = theme_bw(), palette = c("red", "blue"), risk.table.y.text.col = TRUE, risk.table.y.text = FALSE, ncensor.plot = TRUE)

}


Example output from the first unit-test call above (aml data, study.end.time = 35):

Call: survfit(formula = fmla, data = dat)

       n events median 0.95LCL 0.95UCL
arm=0 12     11     23       8      NA
arm=1 11      7     31      18      NA

------------------------------
Log-Rank test for comparing two survival curves:
Call:
survdiff(formula = fmla, data = dat)

       N Observed Expected (O-E)^2/E (O-E)^2/V
arm=0 12       11     7.31      1.86       3.4
arm=1 11        7    10.69      1.27       3.4

 Chisq= 3.4  on 1 degrees of freedom, p= 0.07 

------------------------------
Survival Probability at Study End:
Call: survfit(formula = fmla, data = dat)

                arm=0 
        time       n.risk      n.event     survival      std.err lower 95% CI upper 95% CI 
     35.0000       2.0000       9.0000       0.1944       0.1219       0.0569       0.6642 

                arm=1 
        time       n.risk      n.event     survival      std.err lower 95% CI upper 95% CI 
      35.000        3.000        6.000        0.368        0.163        0.155        0.875 

ARR (Absolute Risk Reduction) = 0.17 
NNT (Number Needed to Treat) = 6 
Odds Ratio: 0.41 

------------------------------
Using Cox Regression Model:
Call:
coxph(formula = fmla, data = dat)

     coef exp(coef) se(coef)    z    p
arm -0.92      0.40     0.51 -1.8 0.07

Likelihood ratio test=3.4  on 1 df, p=0.066
n= 23, number of events= 18 
Hazard Ratio = 0.4 

------------------------------
Assessing Proportional Hazards Assumptions:
         chisq df    p
arm    0.00788  1 0.93
GLOBAL 0.00788  1 0.93

------------------------------
RMST (Restricted Mean Survival Time) Analysis

The truncation time: tau = 35  was specified. 

Restricted Mean Survival Time (RMST) by arm 
               Est.    se lower .95 upper .95
RMST (arm=1) 27.057 2.906    21.361    32.753
RMST (arm=0) 20.958 3.456    14.185    27.732


Restricted Mean Time Lost (RMTL) by arm 
               Est.    se lower .95 upper .95
RMTL (arm=1)  7.943 2.906     2.247    13.639
RMTL (arm=0) 14.042 3.456     7.268    20.815


Between-group contrast 
                      Est. lower .95 upper .95     p
RMST (arm=1)-(arm=0) 6.098    -2.752    14.949 0.177
RMST (arm=1)/(arm=0) 1.291     0.878     1.899 0.194
RMTL (arm=1)/(arm=0) 0.566     0.238     1.342 0.196
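
A possible follow-up (a sketch, not part of the function): an approximate Wald 95% CI for the ARR at the study end time, built from the per-arm survival standard errors in the summary above and assuming the two arms are independent.

if (F) {
    s <- summary(f, times = 35, extend = TRUE)   # f from the first unit-test call (aml data)
    ARR <- diff(s$surv)                          # 0.368 - 0.194 = 0.17
    ARR + c(-1, 1) * qnorm(0.975) * sqrt(sum(s$std.err^2))  # roughly (-0.23, 0.57)
}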

Friday, April 16, 2021

Search for the sample size that produces a specific (lower) confidence limit

search.n.binary.proportion <- function(p, cl, alpha = 0.05)
{
    ## Purpose: Search for the sample size that produces a specific (lower) confidence limit.
    ## Arguments:
    ##   p: reported point estimate
    ##   cl: reported lower confidence limit
    ##   alpha: type I error rate for two-sided confidence interval. Default to 5%. 
    ## Return: a conservative estimate for the 95% confidence interval
    ## Author: Feiming Chen
    ## ________________________________________________

    N <- 50                             # start at sample size of 50
    while (N < 2000) {                  # end at sample size of 2000
        x <- round(N * p)
        r <- prop.test(x, N, conf.level = 1 - alpha, correct = FALSE)
        cl2 <- r$conf.int[1]
        if (cl2 > cl) break
        N <- N + 1 
    }
    N <- N - 1
    x <- floor(N * p)
    cat("Sample Size =", N, ", X =", x, "\n")
    prop.test(x, N, conf.level = 0.95)
}
if (F) {                                # Unit Test
    search.n.binary.proportion(0.849, 0.804, alpha = 0.1)

    ## Sample Size = 194 , X = 164 

    ##  1-sample proportions test with continuity correction

    ## data:  x out of N, null probability 0.5
    ## X-squared = 91.2, df = 1, p-value <2e-16
    ## alternative hypothesis: true p is not equal to 0.5
    ## 95 percent confidence interval:
    ##  0.78497 0.89167
    ## sample estimates:
    ##       p 
    ## 0.84536 

}

Tuesday, April 6, 2021

Calculate sample proportion and its confidence interval based on the Score Test

sample.prop.CI <- function(x, n)
{
    ## Purpose: Calculate sample proportion and its confidence interval based on the Score Test
    ## Arguments:
    ##   x: a count in the numerator
    ##   n: total count in the denominator
    ## Return: a text string with the sample proportion and its confidence interval
    ## Author: Feiming Chen
    ## ________________________________________________

    r <- prop.test(x, n, correct = FALSE)
    paste0(round(r$estimate, 2), " (", paste0(round(r$conf.int, 2), collapse = ","), ")")
}
if (F) {                                # Unit Test
    set.seed(1)
    sample.prop.CI(10, 100)             # "0.1 (0.06,0.17)"
}

Assess risk stratification performance

risk.stratification.analysis <- function(x)
{
    ## Purpose: Assess risk stratification performance. 
    ## Arguments:
    ##   x: a numeric vector of length 2K for the raw confusion matrix.
    ##      First K numbers: count of each risk group with reference positive results
    ##      Second K numbers: count of each risk group with reference negative results
    ## Return:
    ##   an expanded confusion matrix with post-test disease risks and likelihood ratios.
    ## Author: Feiming Chen
    ## ________________________________________________

    d <- as.data.frame(addmargins(matrix(x, ncol = 2)))
    n <- nrow(d) - 1                    # number of risk groups
    g.names <- paste0("Group ", 1:n)    # group names
    dimnames(d) <- list(c(g.names, "Sum"), c("Ref.Pos", "Ref.Neg", "Sum"))

    d$Post.Test.Risk <- round(with(d, Ref.Pos / Sum), 2)
    d$Likelihood.Ratio <- round(with(d, (Ref.Pos / Ref.Pos[n+1]) / (Ref.Neg / Ref.Neg[n+1])), 2)

    d
}
if (F) {                                # Unit Test
    x <- c(38, 27, 35, 5, 10, 41, 137, 158)
    risk.stratification.analysis(x)

    ##         Ref.Pos Ref.Neg Sum Post.Test.Risk Likelihood.Ratio
    ## Group 1      38      10  48           0.79            12.52
    ## Group 2      27      41  68           0.40             2.17
    ## Group 3      35     137 172           0.20             0.84
    ## Group 4       5     158 163           0.03             0.10
    ## Sum         105     346 451           0.23             1.00

}
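
Sanity-check sketch (using x from the unit test above): each post-test risk should equal the risk implied by Bayes' theorem, i.e. post-test odds = pre-test odds x likelihood ratio.

if (F) {
    d <- risk.stratification.analysis(x)
    pre.odds <- d["Sum", "Ref.Pos"] / d["Sum", "Ref.Neg"]     # pre-test odds of disease
    post.odds <- pre.odds * d$Likelihood.Ratio
    round(post.odds / (1 + post.odds), 2)                     # should match d$Post.Test.Risk
}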

Friday, March 26, 2021

Calculate sample mean and its confidence interval using t-distribution

sample.mean.CI <- function(x, log.transform = FALSE)
{
    ## Purpose: Calculate sample mean and its confidence interval using t-distribution
    ## Arguments:
    ##   x: a vector of numeric values
    ##   log.transform: if T, apply log transformation to the data first. 
    ## Return: prints and returns the sample mean and its confidence interval
    ## Author: Feiming Chen
    ## ________________________________________________

    if (log.transform) {
        x <- log(x)
    }
    r <- t.test(x)
    e <- r$estimate
    c1 <- r$conf.int[1]
    c2 <- r$conf.int[2]

    if (log.transform) {
        e <- exp(e)
        c1 <- exp(c1)
        c2 <- exp(c2)
    }
    
    ans <- c(e, c1, c2)
    cat("Mean = ", convert.to.CI.text(ans), "\n")
    ans

}
if (F) {                                # Unit Test
    set.seed(1)
    sample.mean.CI(rnorm(100))          # Mean =  0.11 (-0.07 , 0.29) 
    sample.mean.CI(c(1, 2, 5, 10, 100), log.transform = TRUE) # Mean =  6.31 (0.7 , 57.23) 
}
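
Note (a sketch): with log.transform = TRUE the back-transformed point estimate is the geometric mean, e.g.

if (F) {
    exp(mean(log(c(1, 2, 5, 10, 100))))  # 6.31, matching the unit test above
}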


Thursday, March 25, 2021

Sample Size for One-Sample Z test

sample.size.one.sample.Z.test <- function(E, S, alpha = 0.025, beta = 0.2)
{
    ## Purpose: Sample Size for One-sample Z test
    ##    H1: mu1 (expected) > mu0 (target)
    ##    Assume type I error of 2.5% (alpha), and power of 80% (1 - beta). 
    ## Arguments:
    ##   E: effect size (difference of expected mean and target mean)
    ##   S: estimated spread (standard deviation) from pilot sample
    ##   alpha: type I error (default to 2.5%)
    ##   beta: type II error (default to 20%)
    ## Return: sample size
    ## Author: Feiming Chen
    ## ________________________________________________

    a <- qnorm(1 - alpha) - qnorm(beta) # default = 2.8

    ## Note: enforce a common rule of thumb that a minimum sample size of 30 is needed for the sample mean to be approximately normal
    N <- max(30, ceiling((a * S / E)^2))

    cat("Sample Size =", N, ", when Effect Size =", E, ", SD =", S, ", Type I Error =", alpha, ", Power = ", 1 - beta, "\n")
}
if (F) {                                # Unit Test
    sample.size.one.sample.Z.test(0.1, 0.2) # 32

    for (E in seq(0.05, 0.2, 0.05)) {
        for (SD in seq(0.2, 0.5, 0.1)) {
            sample.size.one.sample.Z.test(E, SD)
        }
    }

    ## Sample Size = 126 , when Effect Size = 0.05 , SD = 0.2 , Type I Error = 0.025 , Power =  0.8 
    ## Sample Size = 283 , when Effect Size = 0.05 , SD = 0.3 , Type I Error = 0.025 , Power =  0.8 
    ## Sample Size = 503 , when Effect Size = 0.05 , SD = 0.4 , Type I Error = 0.025 , Power =  0.8 
    ## Sample Size = 785 , when Effect Size = 0.05 , SD = 0.5 , Type I Error = 0.025 , Power =  0.8 
    ## Sample Size = 32 , when Effect Size = 0.1 , SD = 0.2 , Type I Error = 0.025 , Power =  0.8 
    ## Sample Size = 71 , when Effect Size = 0.1 , SD = 0.3 , Type I Error = 0.025 , Power =  0.8 
    ## Sample Size = 126 , when Effect Size = 0.1 , SD = 0.4 , Type I Error = 0.025 , Power =  0.8 
    ## Sample Size = 197 , when Effect Size = 0.1 , SD = 0.5 , Type I Error = 0.025 , Power =  0.8 
    ## Sample Size = 30 , when Effect Size = 0.15 , SD = 0.2 , Type I Error = 0.025 , Power =  0.8 
    ## Sample Size = 32 , when Effect Size = 0.15 , SD = 0.3 , Type I Error = 0.025 , Power =  0.8 
    ## Sample Size = 56 , when Effect Size = 0.15 , SD = 0.4 , Type I Error = 0.025 , Power =  0.8 
    ## Sample Size = 88 , when Effect Size = 0.15 , SD = 0.5 , Type I Error = 0.025 , Power =  0.8 
    ## Sample Size = 30 , when Effect Size = 0.2 , SD = 0.2 , Type I Error = 0.025 , Power =  0.8 
    ## Sample Size = 30 , when Effect Size = 0.2 , SD = 0.3 , Type I Error = 0.025 , Power =  0.8 
    ## Sample Size = 32 , when Effect Size = 0.2 , SD = 0.4 , Type I Error = 0.025 , Power =  0.8 
    ## Sample Size = 50 , when Effect Size = 0.2 , SD = 0.5 , Type I Error = 0.025 , Power =  0.8 

}
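
Cross-check (a sketch): for the same inputs, power.t.test() with a one-sided alternative gives the t-based sample size, which is slightly larger than the Z-based value of 32.

if (F) {
    power.t.test(delta = 0.1, sd = 0.2, sig.level = 0.025, power = 0.8,
                 type = "one.sample", alternative = "one.sided")
}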


Wednesday, February 24, 2021

Prevalence Scaling

prevalence.scaling <- function(model.P, model.prevalence = 0.5, target.prevalence = 0.1)
{
    ## Purpose: Convert a probability based on training data prevalence to
    ##          a probability based on target population prevalence. 
    ## Arguments:
    ##   model.P: a vector of model-based probability (risk score)
    ##   model.prevalence: prevalence of the model's training data
    ##   target.prevalence: prevalence of the target population
    ## Return:
    ##   a vector of scaled probabilities reflecting the prevalence of the target population
    ## Author: Feiming Chen
    ## ________________________________________________

    ## convert a probability to the odds
    O <- function(p)  p / (1-p)

    1 / ( 1 + O(model.prevalence) / O(target.prevalence) / O(model.P) )
}
if (F) {                                # Unit Test
    set.seed(1)
    prevalence.scaling(runif(5))
    ## [1] 0.038614 0.061784 0.129688 0.523663 0.027304
}
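
Sanity-check sketch: a model probability equal to the training prevalence should map exactly to the target prevalence.

if (F) {
    prevalence.scaling(0.5, model.prevalence = 0.5, target.prevalence = 0.1)  # 0.1
}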

Thursday, February 18, 2021

Confidence Interval of the Difference between Two Correlated Proportions (McNemar Test)

CI.diff.two.correlated.proportions <- function(A, B, C, D, alpha = 0.05)
{
    ## Purpose: CI of the difference between two correlated proportions (McNemar Test)
    ## Arguments:
    ##   A, B, C, D: the 2x2 incidence count table for the matched pairs. 
    ##               A = both positive;
    ##               B = treatment positive, control negative;
    ##               C = treatment negative, control positive;
    ##               D = both negative;
    ##   alpha: type I error. Default to 0.05 (for two-sided CI).
    ## Return: 95% Confidence Interval of the Difference
    ## Author: Feiming Chen
    ## Method: Wald Z Method with Continuity Correction
    ## Reference: Newcombe (1998c), page 2638.
    ## ________________________________________________

    N <- A + B + C + D                  # total number of matched pairs
    Delta <- (B - C) / N
    sw <- sqrt(((A + D) * (B + C) + 4 * B * C) / N^3)
    z <- qnorm(1 - alpha / 2)           # default: 1.96
    corr.term <- 1 / N                  # continuity correction 
    L <- Delta - z * sw - corr.term
    U <- Delta + z * sw + corr.term
    x <- matrix(c(A, B, C, D), 2, 2, byrow = TRUE, 
                dimnames = list(c("Test.POS", "Test.NEG"), c("Control.POS", "Control.NEG")))
    cat("Raw Data (x):\n")
    print(x)
    cat("\nData Converted to Proportions:\n")
    print(round(x/N, 2))
    cat("\nDifference of Two Paired Proportions =", round(Delta, 3), "\n")
    p.discordant <- (B+C) / N
    cat("Proportion of Discordant Pairs =", round(p.discordant, 3), "\n")
    N.future <- sample.size.for.two.correlated.proportions.test(Delta, p.discordant)
    cat("\nCurrent Sample Size =", N, "\n")
    cat("Future  Sample Size =", N.future, "\n")

    print(mcnemar.test(x))
    cat("\nConfidence Interval = (", round(L, 4), ",", round(U, 4), ")\n\n")
}
if (F) {                                # Unit Test
    CI.diff.two.correlated.proportions(12, 9, 2, 21) 
    ## Confidence Interval = ( -0.0037 , 0.3219 )
    CI.diff.two.correlated.proportions(25, 20, 5, 25) 
    ## Raw Data (x):
    ##          Control.POS Control.NEG
    ## Test.POS          25          20
    ## Test.NEG           5          25

    ## Data Converted to Proportions:
    ##          Control.POS Control.NEG
    ## Test.POS        0.33        0.27
    ## Test.NEG        0.07        0.33

    ## Difference of Two Paired Proportions = 0.2 
    ## Proportion of Discordant Pairs = 0.333 

    ## Current Sample Size = 75 
    ## Future  Sample Size = 63 

    ##  McNemar's Chi-squared test with continuity correction

    ## data:  x
    ## McNemar's chi-squared = 7.84, df = 1, p-value = 0.00511


    ## Confidence Interval = ( 0.0641 , 0.3359 )
}

Sample Size for Two Correlated Proportions Based on McNemar Test

sample.size.for.two.correlated.proportions.test <- function(p.diff = 0.1, p.discordant = 0.2, alpha = 0.025, beta = 0.2)
{
    ## Purpose: Sample Size for Two Correlated Proportions Test based on McNemar Test
    ##   H0: p2 <= p1
    ##   H1: p2 >  p1
    ##   (Application: tests of sensitivity, specificity with three-way, matched-pairs study)
    ## Arguments:
    ##   p.diff: Effect size for the difference between two correlated proportions. Default to 10%. 
    ##   p.discordant: Proportion of pairs for which the responses differed. Default to 20%. 
    ##   alpha: type I error. Default to 0.025 (for one-sided test).
    ##   beta: type II error (1 - power). Default to 0.2 (so power is 80%). 
    ## Return:
    ##   Sample Size
    ## Author: Feiming Chen
    ## Reference: Machin, Campbell, Fayers, and Pinol (1997).
    ## ________________________________________________

    p10 <- (p.discordant + p.diff) / 2  # with default, p10 = 0.15
    p01 <- (p.discordant - p.diff) / 2  # default: p01 = 0.05
    OR <- p10 / p01                     # default: OR = 3
    OR.plus.one <- OR + 1               # default: 4
    OR.minus.one <- OR - 1              # default: 2
    
    N = (qnorm(1 - alpha) * OR.plus.one + qnorm(1 - beta) * sqrt(OR.plus.one^2 - OR.minus.one^2 * p.discordant))^2 / (OR.minus.one^2 * p.discordant)
    ceiling(N)
}
if (F) {                                # Unit Test
    sample.size.for.two.correlated.proportions.test() # 155 (PASS output: 155 under normal approx.)
}

Wednesday, February 17, 2021

Sample Size for One Proportion Test

sample.size.for.one.proportion.test <- function(p0  = 0.8, p1 = 0.9, alpha = 0.025, beta = 0.2)
{
    ## Purpose: Sample Size for One Proportion Test.
    ##   H0: p <= p0
    ##   H1: p >  p0
    ##   (Application: tests of sensitivity, specificity with performance goals)
    ## Arguments:
    ##   p0: performance goal (minimally acceptable proportion). Default to 80%. 
    ##   p1: expected performance (minimal proportion under the alternative hypothesis). Default to 90%. 
    ##   alpha: type I error. Default to 0.025 (for one-sided test).
    ##   beta: type II error (1 - power). Default to 0.2 (so power is 80%). 
    ## Return:
    ##   Sample Size
    ## Author: Feiming Chen
    ## Reference: Biswas, Bipasa. "Clinical performance evaluation of molecular diagnostic tests."
    ##            The Journal of Molecular Diagnostics 18.6 (2016): 803-812.
    ## ________________________________________________

    N = (qnorm(1 - alpha) * sqrt(p0 * (1 - p0)) + qnorm(1 - beta) * sqrt(p1 * (1 - p1)))^2 / (p1 - p0)^2
    ceiling(N)
}
if (F) {                                # Unit Test
    sample.size.for.one.proportion.test() # 108 (PASS output: 107 under exact test; 108 under normal approx.)
}