Thursday, August 31, 2023

Diagnostic Evaluation from Data Presented in a Confusion Matrix

confusion.matrix.diagnostic.accuracy <- function(x)
{
    ## Purpose:
    ##   Calculate sensitivity, specificity, and other metrics. 
    ## 
    ## Arguments:
    ##   x: a numeric vector of length 4 for the raw confusion matrix.
    ##      First 2 numbers: count of (Test+, Test-) with reference positive results
    ##      Last 2 numbers: count of (Test+, Test-) with reference negative results
    ##                   Ref+, Ref-
    ##            Test+   x1    x3
    ##            Test-   x2    x4
    ## 
    ## Return:
    ##   Sensitivity, Specificity, PPV, NPV, PLR, NLR and their CI's
    ## 
    ## Author: Feiming Chen
    ## ________________________________________________

    d <- as.data.frame(addmargins(matrix(x, ncol = 2)))
    dimnames(d) <- list(c("Test.Pos", "Test.Neg", "Sum"), c("Ref.Pos", "Ref.Neg", "Sum"))

    d$Likelihood.Ratio <- sapply(1:3, function(i) convert.to.CI.text(ratio.of.two.prop(x=c(d[i, 1], d[i, 2]), n=c(d[3, 1], d[3, 2])), digits = 1)) 

    d$"Performance" <- c(paste(c("PPV =", "NPV ="), my.proportion(c(d[1,1], d[2,2]), c(d[1,3], d[2,3]), digits = 1)),
                                    paste("Prevalence =", my.proportion(d[3,1], d[3,3], digits = 1)))

    sens <- d[1,1] / d[3, 1]
    spec <- d[2,2] / d[3, 2]
    PPV <- d[1,1] / d[1, 3]
    NPV <- d[2,2] / d[2, 3]

    J <- sens + spec - 1                # Youden's index in percentage
    ## J2 <-  d[1,1] / d[3, 1] -  d[1,2] / d[3, 2] # identical in the form of risk difference 
    J.ci <- prop.test(c(d[1,1], d[1,2]), c(d[3,1], d[3,2]))$conf.int
    J.str <- convert.to.CI.text(c(J, J.ci))

    NND <- 1 / (PPV + NPV - 1)          # Number Needed to Diagnose
    ## NND2 <- 1 / (d[1,1] / d[1, 3] -  d[2,1] / d[2, 3])  # identical in the form of risk difference 
    NND.ci <- 1 / prop.test(c(d[1,1], d[2,1]), c(d[1,3], d[2,3]))$conf.int
    NND.str <- convert.to.CI.text(ceiling(c(NND, rev(NND.ci))))

    b <- c(paste(c("Sensitivity =", "Specificity ="), my.proportion(c(d[1,1], d[2,2]), c(d[3,1], d[3,2]), digits = 1)), "", "",
           paste0(paste0("J = ", J.str), ", ", paste0("NND = ", NND.str)))

    d2 <- rbind(d, b)
    rownames(d2)[4] <- "Performance"

    tex.print(d2, file = "~/temp/confusion-matrix-performance", caption = "Performance Evaluation from Confusion Matrix")

    d2
}
if (F) {
    ## bowel cancer diagnostic test (https://en.wikipedia.org/wiki/Sensitivity_and_specificity#Confusion_matrix)
    x <- c(20, 10, 180, 1820)
    confusion.matrix.diagnostic.accuracy(x)
    confusion.matrix.diagnostic.accuracy(c(95, 5, 5, 95))
    confusion.matrix.diagnostic.accuracy(c(95, 5, 500, 9500))
    confusion.matrix.diagnostic.accuracy(c(90, 10, 1000, 9000))
}

Thursday, April 27, 2023

Calculate the discriminatory power of a model prediction index using the concordance metric c-statistic (ROC-AUC)

c.statistic <- function(R, D)
{
    ## Purpose: Calculate the c-statistic (ROC-AUC), which is a
    ##   concordance measure.  If we take all possible pairs of
    ##   patients where one has disease (1) and the other has no
    ##   disease (0), we can calculate the proportion of all such
    ##   pairs where the diseased one has higher score (device output)
    ##   than the normal one (if they have the same value, we count
    ##   this as ‘half a victory’).
    ## 
    ## Author: Feiming Chen
    ## 
    ## Arguments: 
    ##   R: Clinical Reference Standard (0 = Negative, 1 = Positive)
    ##   D: Device Diagnostic Output (a continuous variable)
    ## 
    ## Return: 
    ##   - c : the c-statistic
    ## ________________________________________________

    f <- split(D, R)
    o <- outer(f$"1", f$"0", "-")
    c.statistic <- mean((o > 0) + (o == 0)/2)
    cat("c-statistic (ROC-AUC) =", c.statistic, "\n")
    c.statistic
}
if (F) {                                # Unit Test
    R <- rep(0:1, each = 5)
    c.statistic(R, 1:10)                # 1
    c.statistic(R, 10:1)                # 0
    c.statistic(R, rep(1, 10))          # 0.5
    neg <- rep(1:5, times=c(33,6,6,11,2))
    pos <- rep(1:5, times=c(3,2,2,11,33))
    R <- c(rep(0, length(neg)), rep(1, length(pos)))
    D <- c(neg, pos)
    c.statistic(R, D)                   # 0.89317
}

Import a text block (with only numeric values) and convert it to data frame

scan.text <- function(b, n.col = 1, v.names = "")
{
    ## Purpose: Import a text block (with only numeric values) and convert it to data frame. 
    ## 
    ## Author: Feiming Chen
    ## 
    ## Arguments: 
    ##   - b : a block of text containing only numeric values separated by spaces
    ##   - n.col : number of columns in the data frame
    ##   - v.names : variable names 
    ## Return: 
    ##   a data frame containing imported numbers
    ## 
    ## ________________________________________________

    d1 <- scan(textConnection(b))
    d2 <- as.data.frame(matrix(d1, ncol = n.col, byrow = T))
    names(d2) <- v.names
    d2
}
if (F) {                                # Unit Test
    scan.text("1 2 3 4 5 6", n.col = 3, v.names = c("ID", "V1", "V2"))
    ##   ID V1 V2
    ## 1  1  2  3
    ## 2  4  5  6
}

Saturday, March 18, 2023

Accuracy Analysis for Binary Data with Repeated Measurements

accuracy.metrics <- function(dat)
{
    ## Purpose: 
    ##   Calculate binary accuracy metrics (sensitivity, specificity, PPV, NPV, PLR, NLR). 
    ## 
    ## Author: Feiming Chen
    ## 
    ## Arguments: 
    ##   - dat : a data frame with the following variable names
    ##           device - binary response from device (1 = Positive; 0 = Negative)
    ##           reference - binary response from reference (1 = Positive; 0 = Negative)
    ##           
    ## Return: 
    ##   - a list of binary accuracy metrics. 
    ## 
    ## ________________________________________________

    d <- dat$device
    r <- dat$reference

    sensitivity = mean(d[r == 1])
    specificity = 1 - mean(d[r == 0])
    PPV  = mean(r[d == 1])
    NPV = 1 - mean(r[d == 0])
    PLR = sensitivity / (1 - specificity)
    NLR = (1 - sensitivity) / specificity

    c(Sensitivity = sensitivity, Specificity = specificity, PPV = PPV, NPV = NPV, PLR = PLR, NLR = NLR)
}
if (F) {
    accuracy.metrics(data.frame(device = rep(c(0,1), 5), reference = rep(c(1, 0), 5)))
    ## [1]   0   0   0   0   0 Inf
    accuracy.metrics(data.frame(device = rep(c(0,1), 5), reference = rep(c(0, 1), 5)))
    ## [1]   1   1   1   1 Inf   0             
}

accuracy.analysis <- function(dat, boot.size = 2000)
{
    ## Purpose: 
    ##   Calculate binary accuracy metrics (sensitivity, specificity, PPV, NPV, PLR, NLR), 
    ##   with their CI's, even in the presence of repeated measurements. 
    ## 
    ## Author: Feiming Chen
    ## 
    ## Arguments: 
    ##   - dat : a data frame with the following variable names
    ##           subject - subject identifier
    ##           device - binary response from device
    ##           reference - binary response from reference
    ##   - boot.size : bootstrap sample size. 
    ## 
    ## Return: 
    ##   - Point estimates and 95% CI's for a list of binary accuracy metrics (sensitivity, specificity, PPV, NPV, PLR, NLR).
    ## 
    ## ________________________________________________

    id <- dat$subject
    unique.ID <- unique(id)
    N <- length(unique.ID)
    cat("Number of Unique Patients =", N, "\n")

    K <- nrow(dat) / N 
    cat("Total number of repeated measurements =", nrow(dat), "\n")
    cat("Average number of repeated measurements per patient =", K, "\n")

    ## Point Estimates
    est <- accuracy.metrics(dat)
    cat("\nPoint Estimates of Accuracy Metrics:\n")
    print(round(est, 4))

    ## Make Bootstrap Samples
    cat("\nBootstrap Sample Size =", boot.size, "\n")
    set.seed(1)
    replicate(n = boot.size, {
        s <- sample(unique.ID, replace = TRUE)  # a bootstrap sample of patient ID's
        ## find all rows with the ID's in the bootstrap sample of ID's 
        v <- c()
        for (j in s) v <- c(v, which(id == j)) # a new bootstrap sample
        accuracy.metrics(dat[v,])
    }) -> est.boot

    CI <- apply(est.boot, 1, function(x) quantile(x, c(0.025, 0.975)))
    cat("95% Bootstrap Percentile Confidence Intervals of Accuracy Metrics:\n")
    print(round(CI, 4))

    invisible(list(est = est, CI = CI))}

if (F) {                                # Unit Test
    ## ans <- accuracy.analysis(dat, boot.size = 10)
}

Tuesday, February 28, 2023

Agreement Analysis for Continuous Data with Repeated Measurements

agreement.metrics <- function(dat)
{
    ## Purpose: 
    ##   Calculate agreement metrics. 
    ## 
    ## Author: Feiming Chen
    ## 
    ## Arguments: 
    ##   - dat : a data frame with the following variable names and with no NA values
    ##           subject - subject identifier
    ##           reference - continuous response from reference
    ##           device - continuous response from device
    ##           
    ## Return: 
    ##   - a list of agreement metrics (MAE, MAE%, RMSE, LOA, Bias, Average Correlation). 
    ## 
    ## ________________________________________________

    reference.mean <- mean(dat$reference)
    dat$difference <- dat$device - dat$reference

    ## Bland-Altman Analysis with Repeated Measures
    BA.analysis <- bland.altman.repeated.vary(difference ~ factor(subject), dat, silent = TRUE)

    ## For MAE and RMSE
    MAE <- mean(abs(dat$difference))    # Mean Absolute Error
    MAE.pct <- MAE / reference.mean     
    RMSE <- sqrt(mean(dat$difference^2)) # Root Mean Squared Error

    ## For Correlation: calculate patient-level summary metrics from repeated measures, then average across patients. 
    ds <- split(dat, dat$subject)
    sapply(ds, function(d) cor(d$reference, d$device, method = "pearson")) -> ds2
    Correlation <- mean(ds2)

    c(ref.mean = reference.mean,
      MAE = MAE,
      MAE.pct = MAE.pct,
      RMSE = RMSE,
      Correlation = Correlation, 
      Bias = BA.analysis$m,
      SD = BA.analysis$s, 
      LoA.L = BA.analysis$loa[1],
      LoA.U = BA.analysis$loa[2],
      SD0 = BA.analysis$s0,             # no adjustment for repeated measurements
      LoA.L0 = BA.analysis$loa0[1],     # no adjustment for repeated measurements
      LoA.U0 = BA.analysis$loa0[2])     # no adjustment for repeated measurements

}
if (F) {
    agreement.metrics(data.frame(subject = rep(1:30, 10), reference = 5 + rnorm(300), device = 6 + rnorm(300)))
}

agreement.analysis <- function(dat, boot.size = 2000)
{
    ## Purpose: 
    ##   Calculate agreement metrics (MAE, MAE%, RMSE, LOA, Bias, Average Correlation)
    ##   with their CI's, even in the presence of repeated measurements. 
    ## 
    ## Author: Feiming Chen
    ## 
    ## Arguments: 
    ##   - dat : a data frame with the following variable names
    ##           subject - subject identifier
    ##           reference - continuous response from reference
    ##           device - continuous response from device
    ##   - boot.size : bootstrap sample size. 
    ## 
    ## Return: 
    ##   - Point estimates and 95% CI's for a list of agreement metrics (MAE, MAE%, RMSE, LOA, Bias, Average Correlation). 
    ## 
    ## ________________________________________________


    id <- dat$subject
    unique.ID <- unique(id)
    N <- length(unique.ID)
    cat("Number of Unique Patients =", N, "\n")

    K <- nrow(dat) / N 
    cat("Total number of repeated measurements =", nrow(dat), "\n")
    cat("Average number of repeated measurements per patient =", K, "\n")

    ## Point Estimates
    est <- agreement.metrics(dat)
    cat("\nPoint Estimates of Agreement Metrics:\n")
    print(round(est, 4))

    ## 2000 Bootstrap Samples
    cat("\nBootstrap Sample Size =", boot.size, "\n")
    set.seed(1)
    replicate(n = boot.size, {
        s <- sample(unique.ID, replace = TRUE)  # a bootstrap sample of patient ID's
        ## find all rows with the ID's in the bootstrap sample of ID's 
        v <- c()
        for (j in s) v <- c(v, which(id == j)) # a new bootstrap sample
        agreement.metrics(dat[v,])
    }) -> est.boot

    CI <- apply(est.boot, 1, function(x) quantile(x, c(0.025, 0.975)))
    cat("95% Bootstrap Percentile Confidence Intervals of Agreement Metrics:\n")
    print(round(CI, 4))

    invisible(list(est = est, CI = CI))}

if (F) {                                # Unit Test
    ## ans <- agreement.analysis(dat, boot.size = 10)
}

Calculate the mean and 95% confidence interval of a sample of numbers under a transform

sample.mean.CI.transform <- function(x, transform.func = identity, anti.transform.func = identity)
{
    ## Purpose: 
    ##   Calculate the mean and 95% CI of a sample of numbers under a transform. 
    ## 
    ## Author: Feiming Chen
    ## 
    ## Arguments: 
    ##   - x : a vector of positive numerical values
    ##   - transform.func: a function for transforming the original values to a new scale.
    ##   - anti.transform.func: a function for back-transforming the values to the original scale.
    ## 
    ## Return: 
    ##   - m : mean based on a particular transform function. 
    ## 
    ## ________________________________________________

    y <- transform.func(x)              # transform to a new scale
    a <- t.test(y)                      # estimate the mean and 95% CI based on t-distribution
    c(anti.transform.func(a$estimate), CI = anti.transform.func(a$conf.int))
}
if (F) {                                # Unit Test
    x <- exp(1:10)
    sample.mean.CI.transform(x)             # no transform
    ## estimate the mean under the log-normal distributional assumption
    sample.mean.CI.transform(x, transform.func = log, anti.transform.func = exp) 
}