Monday, March 18, 2024

Make a Sankey diagram to illustrate Diagnostic Performance using R package "NetworkD3"

make.Sankey.diagram.for.Dx.performance <- function(Prevalence = 0.2, Sensitivity = 0.8, Specificity = 0.8, N = 1000)
{
    ## Purpose: Make a Sankey diagram to illustrate Diagnostic Performance using R package "NetworkD3"
    ##   ...
    ## 
    ## Author: Feiming Chen (Copyright 2024; GPLv3; No Warranty; gnu.org/licenses)
    ## 
    ## Arguments: 
    ##   - Prevalence : disease prevalence assumed in the target population
    ##   - Sensitivity : sensitivity estimate
    ##   - Specificity : specificity estimate
    ##   - N : assumed number of patients in a sample from the target population
    ## 
    ## Return: 
    ##   - p1 : a HTML widget to be displayed in the browser. Main branches by disease status. 
    ##   - p2 : a HTML widget to be displayed in the browser. Main branches by device predictions.
    ## ________________________________________________

    np <- round(N * Prevalence)         # number of positives
    nn <- N - np                        # number of negatives
    npp <- round(np * Sensitivity)      # true positives
    npn <- np - npp                     # false negatives
    nnn <- round(nn * Specificity)      # true negatives
    nnp <- nn - nnn                     # false positives
    ndp <- npp + nnp                    # device positives
    ndn <- nnn + npn                    # device negatives


    cat("Sensitivity =", npp, "/", np, "=", Sensitivity, "\n")
    cat("Specificity =", nnn, "/", nn, "=", Specificity, "\n")
    cat("PPV =", npp, "/", ndp, "=", round(npp / ndp, 2), "\n")
    cat("NPV =", nnn, "/", ndn, "=", round(nnn / ndn, 2), "\n")

    library(networkD3)


    ############################# Disease Branches ###########################################

    ## A connection data frame is a list of flows with intensity for each flow
    links <- data.frame(
        source=c("n","n", "np", "np", "nn", "nn"), 
        target=c("np","nn", "npp", "npn", "nnn", "nnp"), 
        value=c(np, nn, npp, npn, nnn, nnp)
    )

    ## From these flows we need to create a node data frame: it lists every entities involved in the flow
    nodes <- data.frame(id = unique(c(as.character(links$source), as.character(links$target))), 
                        name = c(paste0("Patients (", N, ")"),
                                 paste0("Disease Positives (", np, ")"), 
                                 paste0("Disease Negatives (", nn, ")"), 
                                 paste0("True Positives (", npp, ")"), 
                                 paste0("False Negatives (", npn, ")"), 
                                 paste0("True Negatives (", nnn, ")"), 
                                 paste0("False Positives (", nnp, ")")))


    ## With networkD3, connection must be provided using id, not using real name like in the links dataframe.
    links$IDsource <- match(links$source, nodes$id) - 1 
    links$IDtarget <- match(links$target, nodes$id) - 1

    ## Make the Sankey Diagram with Disease Branches
    p1 <- sankeyNetwork(Links = links, Nodes = nodes,
                        Source = "IDsource", Target = "IDtarget",
                        Value = "value", NodeID = "name", 
                        fontSize = 50, 
                        nodeWidth = 30, nodePadding = 200)
    print(p1)

    ############################# Prediction Branches ###########################################

    ## A connection data frame is a list of flows with intensity for each flow
    links <- data.frame(
        source=c("n","n", "ndp", "ndp", "ndn", "ndn"), 
        target=c("ndp","ndn", "npp", "nnp", "nnn", "npn"), 
        value=c(ndp, ndn, npp, nnp, nnn, npn)
    )

    ## From these flows we need to create a node data frame: it lists every entities involved in the flow
    nodes <- data.frame(id = unique(c(as.character(links$source), as.character(links$target))), 
                        name = c(paste0("Patients (", N, ")"),
                                 paste0("Device Positives (", ndp, ")"), 
                                 paste0("Device Negatives (", ndn, ")"), 
                                 paste0("True Positives (", npp, ")"), 
                                 paste0("False Positives (", nnp, ")"), 
                                 paste0("True Negatives (", nnn, ")"), 
                                 paste0("False Negatives (", npn, ")"))) 

    ## With networkD3, connection must be provided using id, not using real name like in the links dataframe.
    links$IDsource <- match(links$source, nodes$id) - 1 
    links$IDtarget <- match(links$target, nodes$id) - 1

    ## Make the Sankey Diagram 
    p2 <- sankeyNetwork(Links = links, Nodes = nodes,
                        Source = "IDsource", Target = "IDtarget",
                        Value = "value", NodeID = "name", 
                        fontSize = 50, 
                        nodeWidth = 30, nodePadding = 200)
    print(p2)
    
}
if (F) {                                # Unit Test
    make.Sankey.diagram.for.Dx.performance(Prevalence = 0.2, Sensitivity = 0.8, Specificity = 0.8)
}



Thursday, August 31, 2023

Diagnostic Evaluation from Data Presented in a Confusion Matrix

confusion.matrix.diagnostic.accuracy <- function(x)
{
    ## Purpose:
    ##   Calculate sensitivity, specificity, and other metrics. 
    ## 
    ## Arguments:
    ##   x: a numeric vector of length 4 for the raw confusion matrix.
    ##      First 2 numbers: count of (Test+, Test-) with reference positive results
    ##      Last 2 numbers: count of (Test+, Test-) with reference negative results
    ##                   Ref+, Ref-
    ##            Test+   x1    x3
    ##            Test-   x2    x4
    ## 
    ## Return:
    ##   Sensitivity, Specificity, PPV, NPV, PLR, NLR and their CI's
    ## 
    ## Author: Feiming Chen
    ## ________________________________________________

    d <- as.data.frame(addmargins(matrix(x, ncol = 2)))
    dimnames(d) <- list(c("Test.Pos", "Test.Neg", "Sum"), c("Ref.Pos", "Ref.Neg", "Sum"))

    d$Likelihood.Ratio <- sapply(1:3, function(i) convert.to.CI.text(ratio.of.two.prop(x=c(d[i, 1], d[i, 2]), n=c(d[3, 1], d[3, 2])), digits = 1)) 

    d$"Performance" <- c(paste(c("PPV =", "NPV ="), my.proportion(c(d[1,1], d[2,2]), c(d[1,3], d[2,3]), digits = 1)),
                                    paste("Prevalence =", my.proportion(d[3,1], d[3,3], digits = 1)))

    sens <- d[1,1] / d[3, 1]
    spec <- d[2,2] / d[3, 2]
    PPV <- d[1,1] / d[1, 3]
    NPV <- d[2,2] / d[2, 3]

    J <- sens + spec - 1                # Youden's index in percentage
    ## J2 <-  d[1,1] / d[3, 1] -  d[1,2] / d[3, 2] # identical in the form of risk difference 
    J.ci <- prop.test(c(d[1,1], d[1,2]), c(d[3,1], d[3,2]))$conf.int
    J.str <- convert.to.CI.text(c(J, J.ci))

    NND <- 1 / (PPV + NPV - 1)          # Number Needed to Diagnose
    ## NND2 <- 1 / (d[1,1] / d[1, 3] -  d[2,1] / d[2, 3])  # identical in the form of risk difference 
    NND.ci <- 1 / prop.test(c(d[1,1], d[2,1]), c(d[1,3], d[2,3]))$conf.int
    NND.str <- convert.to.CI.text(ceiling(c(NND, rev(NND.ci))))

    b <- c(paste(c("Sensitivity =", "Specificity ="), my.proportion(c(d[1,1], d[2,2]), c(d[3,1], d[3,2]), digits = 1)), "", "",
           paste0(paste0("J = ", J.str), ", ", paste0("NND = ", NND.str)))

    d2 <- rbind(d, b)
    rownames(d2)[4] <- "Performance"

    tex.print(d2, file = "~/temp/confusion-matrix-performance", caption = "Performance Evaluation from Confusion Matrix")

    d2
}
if (F) {
    ## bowel cancer diagnostic test (https://en.wikipedia.org/wiki/Sensitivity_and_specificity#Confusion_matrix)
    x <- c(20, 10, 180, 1820)
    confusion.matrix.diagnostic.accuracy(x)
    confusion.matrix.diagnostic.accuracy(c(95, 5, 5, 95))
    confusion.matrix.diagnostic.accuracy(c(95, 5, 500, 9500))
    confusion.matrix.diagnostic.accuracy(c(90, 10, 1000, 9000))
}

Thursday, April 27, 2023

Calculate the discriminatory power of a model prediction index using the concordance metric c-statistic (ROC-AUC)

c.statistic <- function(R, D)
{
    ## Purpose: Calculate the c-statistic (ROC-AUC), which is a
    ##   concordance measure.  If we take all possible pairs of
    ##   patients where one has disease (1) and the other has no
    ##   disease (0), we can calculate the proportion of all such
    ##   pairs where the diseased one has higher score (device output)
    ##   than the normal one (if they have the same value, we count
    ##   this as ‘half a victory’).
    ## 
    ## Author: Feiming Chen
    ## 
    ## Arguments: 
    ##   R: Clinical Reference Standard (0 = Negative, 1 = Positive)
    ##   D: Device Diagnostic Output (a continuous variable)
    ## 
    ## Return: 
    ##   - c : the c-statistic
    ## ________________________________________________

    f <- split(D, R)
    o <- outer(f$"1", f$"0", "-")
    c.statistic <- mean((o > 0) + (o == 0)/2)
    cat("c-statistic (ROC-AUC) =", c.statistic, "\n")
    c.statistic
}
if (F) {                                # Unit Test
    R <- rep(0:1, each = 5)
    c.statistic(R, 1:10)                # 1
    c.statistic(R, 10:1)                # 0
    c.statistic(R, rep(1, 10))          # 0.5
    neg <- rep(1:5, times=c(33,6,6,11,2))
    pos <- rep(1:5, times=c(3,2,2,11,33))
    R <- c(rep(0, length(neg)), rep(1, length(pos)))
    D <- c(neg, pos)
    c.statistic(R, D)                   # 0.89317
}

Import a text block (with only numeric values) and convert it to data frame

scan.text <- function(b, n.col = 1, v.names = "")
{
    ## Purpose: Import a text block (with only numeric values) and convert it to data frame. 
    ## 
    ## Author: Feiming Chen
    ## 
    ## Arguments: 
    ##   - b : a block of text containing only numeric values separated by spaces
    ##   - n.col : number of columns in the data frame
    ##   - v.names : variable names 
    ## Return: 
    ##   a data frame containing imported numbers
    ## 
    ## ________________________________________________

    d1 <- scan(textConnection(b))
    d2 <- as.data.frame(matrix(d1, ncol = n.col, byrow = T))
    names(d2) <- v.names
    d2
}
if (F) {                                # Unit Test
    scan.text("1 2 3 4 5 6", n.col = 3, v.names = c("ID", "V1", "V2"))
    ##   ID V1 V2
    ## 1  1  2  3
    ## 2  4  5  6
}

Saturday, March 18, 2023

Accuracy Analysis for Binary Data with Repeated Measurements

accuracy.metrics <- function(dat)
{
    ## Purpose: 
    ##   Calculate binary accuracy metrics (sensitivity, specificity, PPV, NPV, PLR, NLR). 
    ## 
    ## Author: Feiming Chen
    ## 
    ## Arguments: 
    ##   - dat : a data frame with the following variable names
    ##           device - binary response from device (1 = Positive; 0 = Negative)
    ##           reference - binary response from reference (1 = Positive; 0 = Negative)
    ##           
    ## Return: 
    ##   - a list of binary accuracy metrics. 
    ## 
    ## ________________________________________________

    d <- dat$device
    r <- dat$reference

    sensitivity = mean(d[r == 1])
    specificity = 1 - mean(d[r == 0])
    PPV  = mean(r[d == 1])
    NPV = 1 - mean(r[d == 0])
    PLR = sensitivity / (1 - specificity)
    NLR = (1 - sensitivity) / specificity

    c(Sensitivity = sensitivity, Specificity = specificity, PPV = PPV, NPV = NPV, PLR = PLR, NLR = NLR)
}
if (F) {
    accuracy.metrics(data.frame(device = rep(c(0,1), 5), reference = rep(c(1, 0), 5)))
    ## [1]   0   0   0   0   0 Inf
    accuracy.metrics(data.frame(device = rep(c(0,1), 5), reference = rep(c(0, 1), 5)))
    ## [1]   1   1   1   1 Inf   0             
}

accuracy.analysis <- function(dat, boot.size = 2000)
{
    ## Purpose: 
    ##   Calculate binary accuracy metrics (sensitivity, specificity, PPV, NPV, PLR, NLR), 
    ##   with their CI's, even in the presence of repeated measurements. 
    ## 
    ## Author: Feiming Chen
    ## 
    ## Arguments: 
    ##   - dat : a data frame with the following variable names
    ##           subject - subject identifier
    ##           device - binary response from device
    ##           reference - binary response from reference
    ##   - boot.size : bootstrap sample size. 
    ## 
    ## Return: 
    ##   - Point estimates and 95% CI's for a list of binary accuracy metrics (sensitivity, specificity, PPV, NPV, PLR, NLR).
    ## 
    ## ________________________________________________

    id <- dat$subject
    unique.ID <- unique(id)
    N <- length(unique.ID)
    cat("Number of Unique Patients =", N, "\n")

    K <- nrow(dat) / N 
    cat("Total number of repeated measurements =", nrow(dat), "\n")
    cat("Average number of repeated measurements per patient =", K, "\n")

    ## Point Estimates
    est <- accuracy.metrics(dat)
    cat("\nPoint Estimates of Accuracy Metrics:\n")
    print(round(est, 4))

    ## Make Bootstrap Samples
    cat("\nBootstrap Sample Size =", boot.size, "\n")
    set.seed(1)
    replicate(n = boot.size, {
        s <- sample(unique.ID, replace = TRUE)  # a bootstrap sample of patient ID's
        ## find all rows with the ID's in the bootstrap sample of ID's 
        v <- c()
        for (j in s) v <- c(v, which(id == j)) # a new bootstrap sample
        accuracy.metrics(dat[v,])
    }) -> est.boot

    CI <- apply(est.boot, 1, function(x) quantile(x, c(0.025, 0.975)))
    cat("95% Bootstrap Percentile Confidence Intervals of Accuracy Metrics:\n")
    print(round(CI, 4))

    invisible(list(est = est, CI = CI))}

if (F) {                                # Unit Test
    ## ans <- accuracy.analysis(dat, boot.size = 10)
}

Tuesday, February 28, 2023

Agreement Analysis for Continuous Data with Repeated Measurements

agreement.metrics <- function(dat)
{
    ## Purpose: 
    ##   Calculate agreement metrics. 
    ## 
    ## Author: Feiming Chen
    ## 
    ## Arguments: 
    ##   - dat : a data frame with the following variable names and with no NA values
    ##           subject - subject identifier
    ##           reference - continuous response from reference
    ##           device - continuous response from device
    ##           
    ## Return: 
    ##   - a list of agreement metrics (MAE, MAE%, RMSE, LOA, Bias, Average Correlation). 
    ## 
    ## ________________________________________________

    reference.mean <- mean(dat$reference)
    dat$difference <- dat$device - dat$reference

    ## Bland-Altman Analysis with Repeated Measures
    BA.analysis <- bland.altman.repeated.vary(difference ~ factor(subject), dat, silent = TRUE)

    ## For MAE and RMSE
    MAE <- mean(abs(dat$difference))    # Mean Absolute Error
    MAE.pct <- MAE / reference.mean     
    RMSE <- sqrt(mean(dat$difference^2)) # Root Mean Squared Error

    ## For Correlation: calculate patient-level summary metrics from repeated measures, then average across patients. 
    ds <- split(dat, dat$subject)
    sapply(ds, function(d) cor(d$reference, d$device, method = "pearson")) -> ds2
    Correlation <- mean(ds2)

    c(ref.mean = reference.mean,
      MAE = MAE,
      MAE.pct = MAE.pct,
      RMSE = RMSE,
      Correlation = Correlation, 
      Bias = BA.analysis$m,
      SD = BA.analysis$s, 
      LoA.L = BA.analysis$loa[1],
      LoA.U = BA.analysis$loa[2],
      SD0 = BA.analysis$s0,             # no adjustment for repeated measurements
      LoA.L0 = BA.analysis$loa0[1],     # no adjustment for repeated measurements
      LoA.U0 = BA.analysis$loa0[2])     # no adjustment for repeated measurements

}
if (F) {
    agreement.metrics(data.frame(subject = rep(1:30, 10), reference = 5 + rnorm(300), device = 6 + rnorm(300)))
}

agreement.analysis <- function(dat, boot.size = 2000)
{
    ## Purpose: 
    ##   Calculate agreement metrics (MAE, MAE%, RMSE, LOA, Bias, Average Correlation)
    ##   with their CI's, even in the presence of repeated measurements. 
    ## 
    ## Author: Feiming Chen
    ## 
    ## Arguments: 
    ##   - dat : a data frame with the following variable names
    ##           subject - subject identifier
    ##           reference - continuous response from reference
    ##           device - continuous response from device
    ##   - boot.size : bootstrap sample size. 
    ## 
    ## Return: 
    ##   - Point estimates and 95% CI's for a list of agreement metrics (MAE, MAE%, RMSE, LOA, Bias, Average Correlation). 
    ## 
    ## ________________________________________________


    id <- dat$subject
    unique.ID <- unique(id)
    N <- length(unique.ID)
    cat("Number of Unique Patients =", N, "\n")

    K <- nrow(dat) / N 
    cat("Total number of repeated measurements =", nrow(dat), "\n")
    cat("Average number of repeated measurements per patient =", K, "\n")

    ## Point Estimates
    est <- agreement.metrics(dat)
    cat("\nPoint Estimates of Agreement Metrics:\n")
    print(round(est, 4))

    ## 2000 Bootstrap Samples
    cat("\nBootstrap Sample Size =", boot.size, "\n")
    set.seed(1)
    replicate(n = boot.size, {
        s <- sample(unique.ID, replace = TRUE)  # a bootstrap sample of patient ID's
        ## find all rows with the ID's in the bootstrap sample of ID's 
        v <- c()
        for (j in s) v <- c(v, which(id == j)) # a new bootstrap sample
        agreement.metrics(dat[v,])
    }) -> est.boot

    CI <- apply(est.boot, 1, function(x) quantile(x, c(0.025, 0.975)))
    cat("95% Bootstrap Percentile Confidence Intervals of Agreement Metrics:\n")
    print(round(CI, 4))

    invisible(list(est = est, CI = CI))}

if (F) {                                # Unit Test
    ## ans <- agreement.analysis(dat, boot.size = 10)
}

Calculate the mean and 95% confidence interval of a sample of numbers under a transform

sample.mean.CI.transform <- function(x, transform.func = identity, anti.transform.func = identity)
{
    ## Purpose: 
    ##   Calculate the mean and 95% CI of a sample of numbers under a transform. 
    ## 
    ## Author: Feiming Chen
    ## 
    ## Arguments: 
    ##   - x : a vector of positive numerical values
    ##   - transform.func: a function for transforming the original values to a new scale.
    ##   - anti.transform.func: a function for back-transforming the values to the original scale.
    ## 
    ## Return: 
    ##   - m : mean based on a particular transform function. 
    ## 
    ## ________________________________________________

    y <- transform.func(x)              # transform to a new scale
    a <- t.test(y)                      # estimate the mean and 95% CI based on t-distribution
    c(anti.transform.func(a$estimate), CI = anti.transform.func(a$conf.int))
}
if (F) {                                # Unit Test
    x <- exp(1:10)
    sample.mean.CI.transform(x)             # no transform
    ## estimate the mean under the log-normal distributional assumption
    sample.mean.CI.transform(x, transform.func = log, anti.transform.func = exp) 
}