Thursday, April 27, 2023

Calculate the discriminatory power of a model prediction index using the concordance metric c-statistic (ROC-AUC)

c.statistic <- function(R, D)
{
    ## Purpose: Calculate the c-statistic (ROC-AUC), which is a
    ##   concordance measure.  If we take all possible pairs of
    ##   patients where one has disease (1) and the other has no
    ##   disease (0), we can calculate the proportion of all such
    ##   pairs where the diseased one has higher score (device output)
    ##   than the normal one (if they have the same value, we count
    ##   this as ‘half a victory’).
    ## 
    ## Author: Feiming Chen
    ## 
    ## Arguments: 
    ##   R: Clinical Reference Standard (0 = Negative, 1 = Positive)
    ##   D: Device Diagnostic Output (a continuous variable)
    ## 
    ## Return: 
    ##   - c : the c-statistic
    ## ________________________________________________

    f <- split(D, R)
    o <- outer(f$"1", f$"0", "-")
    c.statistic <- mean((o > 0) + (o == 0)/2)
    cat("c-statistic (ROC-AUC) =", c.statistic, "\n")
    c.statistic
}
if (F) {                                # Unit Test
    R <- rep(0:1, each = 5)
    c.statistic(R, 1:10)                # 1
    c.statistic(R, 10:1)                # 0
    c.statistic(R, rep(1, 10))          # 0.5
    neg <- rep(1:5, times=c(33,6,6,11,2))
    pos <- rep(1:5, times=c(3,2,2,11,33))
    R <- c(rep(0, length(neg)), rep(1, length(pos)))
    D <- c(neg, pos)
    c.statistic(R, D)                   # 0.89317
}

Import a text block (with only numeric values) and convert it to data frame

scan.text <- function(b, n.col = 1, v.names = "")
{
    ## Purpose: Import a text block (with only numeric values) and convert it to data frame. 
    ## 
    ## Author: Feiming Chen
    ## 
    ## Arguments: 
    ##   - b : a block of text containing only numeric values separated by spaces
    ##   - n.col : number of columns in the data frame
    ##   - v.names : variable names 
    ## Return: 
    ##   a data frame containing imported numbers
    ## 
    ## ________________________________________________

    d1 <- scan(textConnection(b))
    d2 <- as.data.frame(matrix(d1, ncol = n.col, byrow = T))
    names(d2) <- v.names
    d2
}
if (F) {                                # Unit Test
    scan.text("1 2 3 4 5 6", n.col = 3, v.names = c("ID", "V1", "V2"))
    ##   ID V1 V2
    ## 1  1  2  3
    ## 2  4  5  6
}

Saturday, March 18, 2023

Accuracy Analysis for Binary Data with Repeated Measurements

accuracy.metrics <- function(dat)
{
    ## Purpose: 
    ##   Calculate binary accuracy metrics (sensitivity, specificity, PPV, NPV, PLR, NLR). 
    ## 
    ## Author: Feiming Chen
    ## 
    ## Arguments: 
    ##   - dat : a data frame with the following variable names
    ##           device - binary response from device (1 = Positive; 0 = Negative)
    ##           reference - binary response from reference (1 = Positive; 0 = Negative)
    ##           
    ## Return: 
    ##   - a list of binary accuracy metrics. 
    ## 
    ## ________________________________________________

    d <- dat$device
    r <- dat$reference

    sensitivity = mean(d[r == 1])
    specificity = 1 - mean(d[r == 0])
    PPV  = mean(r[d == 1])
    NPV = 1 - mean(r[d == 0])
    PLR = sensitivity / (1 - specificity)
    NLR = (1 - sensitivity) / specificity

    c(Sensitivity = sensitivity, Specificity = specificity, PPV = PPV, NPV = NPV, PLR = PLR, NLR = NLR)
}
if (F) {
    accuracy.metrics(data.frame(device = rep(c(0,1), 5), reference = rep(c(1, 0), 5)))
    ## [1]   0   0   0   0   0 Inf
    accuracy.metrics(data.frame(device = rep(c(0,1), 5), reference = rep(c(0, 1), 5)))
    ## [1]   1   1   1   1 Inf   0             
}

accuracy.analysis <- function(dat, boot.size = 2000)
{
    ## Purpose: 
    ##   Calculate binary accuracy metrics (sensitivity, specificity, PPV, NPV, PLR, NLR), 
    ##   with their CI's, even in the presence of repeated measurements. 
    ## 
    ## Author: Feiming Chen
    ## 
    ## Arguments: 
    ##   - dat : a data frame with the following variable names
    ##           subject - subject identifier
    ##           device - binary response from device
    ##           reference - binary response from reference
    ##   - boot.size : bootstrap sample size. 
    ## 
    ## Return: 
    ##   - Point estimates and 95% CI's for a list of binary accuracy metrics (sensitivity, specificity, PPV, NPV, PLR, NLR).
    ## 
    ## ________________________________________________

    id <- dat$subject
    unique.ID <- unique(id)
    N <- length(unique.ID)
    cat("Number of Unique Patients =", N, "\n")

    K <- nrow(dat) / N 
    cat("Total number of repeated measurements =", nrow(dat), "\n")
    cat("Average number of repeated measurements per patient =", K, "\n")

    ## Point Estimates
    est <- accuracy.metrics(dat)
    cat("\nPoint Estimates of Accuracy Metrics:\n")
    print(round(est, 4))

    ## Make Bootstrap Samples
    cat("\nBootstrap Sample Size =", boot.size, "\n")
    set.seed(1)
    replicate(n = boot.size, {
        s <- sample(unique.ID, replace = TRUE)  # a bootstrap sample of patient ID's
        ## find all rows with the ID's in the bootstrap sample of ID's 
        v <- c()
        for (j in s) v <- c(v, which(id == j)) # a new bootstrap sample
        accuracy.metrics(dat[v,])
    }) -> est.boot

    CI <- apply(est.boot, 1, function(x) quantile(x, c(0.025, 0.975)))
    cat("95% Bootstrap Percentile Confidence Intervals of Accuracy Metrics:\n")
    print(round(CI, 4))

    invisible(list(est = est, CI = CI))}

if (F) {                                # Unit Test
    ## ans <- accuracy.analysis(dat, boot.size = 10)
}

Tuesday, February 28, 2023

Agreement Analysis for Continuous Data with Repeated Measurements

agreement.metrics <- function(dat)
{
    ## Purpose: 
    ##   Calculate agreement metrics. 
    ## 
    ## Author: Feiming Chen
    ## 
    ## Arguments: 
    ##   - dat : a data frame with the following variable names and with no NA values
    ##           subject - subject identifier
    ##           reference - continuous response from reference
    ##           device - continuous response from device
    ##           
    ## Return: 
    ##   - a list of agreement metrics (MAE, MAE%, RMSE, LOA, Bias, Average Correlation). 
    ## 
    ## ________________________________________________

    reference.mean <- mean(dat$reference)
    dat$difference <- dat$device - dat$reference

    ## Bland-Altman Analysis with Repeated Measures
    BA.analysis <- bland.altman.repeated.vary(difference ~ factor(subject), dat, silent = TRUE)

    ## For MAE and RMSE
    MAE <- mean(abs(dat$difference))    # Mean Absolute Error
    MAE.pct <- MAE / reference.mean     
    RMSE <- sqrt(mean(dat$difference^2)) # Root Mean Squared Error

    ## For Correlation: calculate patient-level summary metrics from repeated measures, then average across patients. 
    ds <- split(dat, dat$subject)
    sapply(ds, function(d) cor(d$reference, d$device, method = "pearson")) -> ds2
    Correlation <- mean(ds2)

    c(ref.mean = reference.mean,
      MAE = MAE,
      MAE.pct = MAE.pct,
      RMSE = RMSE,
      Correlation = Correlation, 
      Bias = BA.analysis$m,
      SD = BA.analysis$s, 
      LoA.L = BA.analysis$loa[1],
      LoA.U = BA.analysis$loa[2],
      SD0 = BA.analysis$s0,             # no adjustment for repeated measurements
      LoA.L0 = BA.analysis$loa0[1],     # no adjustment for repeated measurements
      LoA.U0 = BA.analysis$loa0[2])     # no adjustment for repeated measurements

}
if (F) {
    agreement.metrics(data.frame(subject = rep(1:30, 10), reference = 5 + rnorm(300), device = 6 + rnorm(300)))
}

agreement.analysis <- function(dat, boot.size = 2000)
{
    ## Purpose: 
    ##   Calculate agreement metrics (MAE, MAE%, RMSE, LOA, Bias, Average Correlation)
    ##   with their CI's, even in the presence of repeated measurements. 
    ## 
    ## Author: Feiming Chen
    ## 
    ## Arguments: 
    ##   - dat : a data frame with the following variable names
    ##           subject - subject identifier
    ##           reference - continuous response from reference
    ##           device - continuous response from device
    ##   - boot.size : bootstrap sample size. 
    ## 
    ## Return: 
    ##   - Point estimates and 95% CI's for a list of agreement metrics (MAE, MAE%, RMSE, LOA, Bias, Average Correlation). 
    ## 
    ## ________________________________________________


    id <- dat$subject
    unique.ID <- unique(id)
    N <- length(unique.ID)
    cat("Number of Unique Patients =", N, "\n")

    K <- nrow(dat) / N 
    cat("Total number of repeated measurements =", nrow(dat), "\n")
    cat("Average number of repeated measurements per patient =", K, "\n")

    ## Point Estimates
    est <- agreement.metrics(dat)
    cat("\nPoint Estimates of Agreement Metrics:\n")
    print(round(est, 4))

    ## 2000 Bootstrap Samples
    cat("\nBootstrap Sample Size =", boot.size, "\n")
    set.seed(1)
    replicate(n = boot.size, {
        s <- sample(unique.ID, replace = TRUE)  # a bootstrap sample of patient ID's
        ## find all rows with the ID's in the bootstrap sample of ID's 
        v <- c()
        for (j in s) v <- c(v, which(id == j)) # a new bootstrap sample
        agreement.metrics(dat[v,])
    }) -> est.boot

    CI <- apply(est.boot, 1, function(x) quantile(x, c(0.025, 0.975)))
    cat("95% Bootstrap Percentile Confidence Intervals of Agreement Metrics:\n")
    print(round(CI, 4))

    invisible(list(est = est, CI = CI))}

if (F) {                                # Unit Test
    ## ans <- agreement.analysis(dat, boot.size = 10)
}

Calculate the mean and 95% confidence interval of a sample of numbers under a transform

sample.mean.CI.transform <- function(x, transform.func = identity, anti.transform.func = identity)
{
    ## Purpose: 
    ##   Calculate the mean and 95% CI of a sample of numbers under a transform. 
    ## 
    ## Author: Feiming Chen
    ## 
    ## Arguments: 
    ##   - x : a vector of positive numerical values
    ##   - transform.func: a function for transforming the original values to a new scale.
    ##   - anti.transform.func: a function for back-transforming the values to the original scale.
    ## 
    ## Return: 
    ##   - m : mean based on a particular transform function. 
    ## 
    ## ________________________________________________

    y <- transform.func(x)              # transform to a new scale
    a <- t.test(y)                      # estimate the mean and 95% CI based on t-distribution
    c(anti.transform.func(a$estimate), CI = anti.transform.func(a$conf.int))
}
if (F) {                                # Unit Test
    x <- exp(1:10)
    sample.mean.CI.transform(x)             # no transform
    ## estimate the mean under the log-normal distributional assumption
    sample.mean.CI.transform(x, transform.func = log, anti.transform.func = exp) 
}

Thursday, June 30, 2022

Use simulation to estimate the coverage probability of the quantile rank-based confidence interval

The original code comes from https://stats.stackexchange.com/questions/99829/how-to-obtain-a-confidence-interval-for-a-percentile/284970#284970

test.coverage.via.simulation <- function(lu, n, p, alpha = 0.1)
{
    ## Purpose: 
    ##   Use simulation to estimate the coverage probability of the quantile rank-based CI.
    ##   
    ## Reference (code origin):
    ##   https://stats.stackexchange.com/questions/99829/how-to-obtain-a-confidence-interval-for-a-percentile/284970#284970
    ## 
    ## Arguments:
    ##   - lu: a vector of two numbers for the lower and upper limit of the CI in ranks. 
    ##   - n: sample size
    ##   - p: quantile probability cutpoint (between 0 and 1)
    ##   - alpha: type I error level (default to 0.1 so a 90% CI is calculated)
    ## 
    ## Return: 
    ##   Average coverage probability from 10,000 simulated samples.
    ## ________________________________________________

    ## Generate many random samples of size "n" from a known distribution
    ## and compute actual CI's from those samples using the given rank-based CI. 
    set.seed(1)
    n.sim <- 1e4
    index <- function(x, i) ifelse(i == Inf, Inf, ifelse(i == -Inf, -Inf, x[i]))
    sim <- replicate(n.sim, index(sort(rnorm(n)), lu))

    ## Compute the actual proportion of those intervals that cover the theoretical quantile.
    F.q <- qnorm(p)
    covers <- sim[1, ] <= F.q & F.q <= sim[2, ]
    mean.coverage <- mean(covers)
    message("Mean Coverage Over 10,000 Simulated Samples = ", signif(mean.coverage, 4))
}
if (F) {                                # Unit Test
    lu <- c(85, 97)
    n <- 100
    p <- 0.9
    alpha <- 0.05
    test.coverage.via.simulation(lu, n, p, alpha)
    ## Mean Coverage Over 10,000 Simulated Samples = 0.9528
}

Rank-based Confidence Interval for a Quantile

The original code comes from https://stats.stackexchange.com/questions/99829/how-to-obtain-a-confidence-interval-for-a-percentile/284970#284970


quantile.CI.ranks <- function(n, p, alpha = 0.1) {
    ## Purpose:
    ##   Calculate a two-sided near-symmetric distribution-free
    ##   confidence interval with confidence level of (1 - alpha) for
    ##   a quantile, by searching over a small range of upper and
    ##   lower order statistics for the closest coverage to (1 -
    ##   alpha) (but not less than it, if possible).
    ##
    ## Reference (code origin):
    ##   https://stats.stackexchange.com/questions/99829/how-to-obtain-a-confidence-interval-for-a-percentile/284970#284970
    ## 
    ## Arguments:
    ##   - n: sample size
    ##   - p: quantile probability cutpoint (between 0 and 1)
    ##   - alpha: type I error level (default to 0.1 so a 90% CI is calculated)
    ##
    ## Return:
    ##   - CI: two indices into the order statistics for the two-sided CI of the quantile
    ##   - coverage: theoretical coverage probability of the CI, which should be close to (1 - alpha).

    ## a small candidate list of order statistics for the lower/upper limits of the CI of a quantile
    l <- qbinom(alpha/2, n, p) + (-2:2) + 1 # lower limit candidates
    u <- qbinom(1 - alpha/2, n, p) + (-2:2)  # upper limit candidates

    ## out-of-bound order statistics correspond to no limit
    l[l < 0] <- -Inf                    # no lower limit
    u[u > n] <- Inf                     # no upper limit

    ## for each pair of candidate lower/upper limit, calculate the coverage probability
    coverage <- outer(l, u, function(l, u) pbinom(u - 1, n, p) - pbinom(l - 1, n, p))

    ## if no coverage is above (1 - alpha), choose the max coverage; 
    ## otherwise, look for the smallest coverage that is above (1 - alpha). 
    if (max(coverage) < 1 - alpha) {
        i <- which(coverage == max(coverage)) 
    } else {                            
        i <- which(coverage == min(coverage[coverage >= 1 - alpha]))
    }

    j <- i[1]                           # in case there are multiple candidates

    ## identify the order statistics and its coverage.
    L <- rep(l, 5)[j]
    U <- rep(u, each = 5)[j]
    return(list(CI = c(L, U), coverage = coverage[j]))
}
if (F) {                                # Unit Test
    alpha <- 0.05
    n <- 100
    p <- 0.9
    quantile.CI(n, p, alpha)
    ## $CI
    ## [1] 85 97

    ## $coverage
    ## [1] 0.95227
}