R Function Library
R Function Code for Data Analysis, Statistics, and Visualization.

Tuesday, October 1, 2024

Expected Specificity Given Sensitivity, PPV, and Prevalence Assumption

specificity.given.PPV <- function(Sensitivity = 0.5, PPV = 0.3, Prevalence = 0.1) {
  ## Purpose: calculate specificity based on sensitivity, PPV, and prevalence.
  ##
  ## Arguments:
  ##   Sensitivity: performance target for sensitivity
  ##   PPV: performance target for PPV (positive predictive value)
  ##   Prevalence: assumed disease prevalence in the target population
  ##
  ## Return:
  ##   Specificity: expected performance target for specificity
  ##
  ## Author: Feiming Chen (Copyright 2024; GPLv3; No Warranty; gnu.org/licenses)
  ## ________________________________________________
  ## Assume 1000 patients
  N <- 1000
  n.pos <- N * Prevalence               # disease positives
  n.neg <- N - n.pos                    # disease negatives
  n.true.pos <- n.pos * Sensitivity     # true positives
  n.false.neg <- n.pos - n.true.pos     # false negatives
  ## Since PPV = TP / (TP + FP), the false-positive count consistent with the
  ## PPV target is FP = TP / (PPV / (1 - PPV)), so Specificity = 1 - FP / n.neg.
  Specificity <- 1 - n.true.pos / (n.neg * (PPV / (1 - PPV)))
  cat("What should the Specificity be if we want",
      paste0(round(100 * Sensitivity), "%"), "Sensitivity and",
      paste0(round(100 * PPV), "%"), "PPV under",
      paste0(round(100 * Prevalence), "%"), "Prevalence assumption?\n")
  cat("Specificity =", round(Specificity, 3), "\n\n")
  make.Sankey.diagram.for.Dx.performance(Sensitivity = Sensitivity,
                                         Specificity = Specificity,
                                         Prevalence = Prevalence)
}
if (F) {                                # Unit Test
  specificity.given.PPV(Sensitivity = 0.7, PPV = 0.3, Prevalence = 0.05)
}
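The algebra can be sanity-checked by plugging the returned specificity back into the usual Bayes form of PPV. A minimal sketch (base R only; targets match the unit test above):

## Check: the Specificity implied by the formula reproduces the PPV target
## when substituted into PPV = Sens*Prev / (Sens*Prev + (1 - Spec)*(1 - Prev)).
Sens <- 0.7; PPV.target <- 0.3; Prev <- 0.05
Spec <- 1 - (Sens * Prev) / ((1 - Prev) * (PPV.target / (1 - PPV.target)))
PPV.check <- (Sens * Prev) / (Sens * Prev + (1 - Spec) * (1 - Prev))
stopifnot(isTRUE(all.equal(PPV.check, PPV.target)))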
Monday, March 18, 2024
Make a Sankey diagram to illustrate Diagnostic Performance using R package "networkD3"
make.Sankey.diagram.for.Dx.performance <- function(Sensitivity = 0.8, Specificity = 0.8, Prevalence = 0.2, N = 1000, by.prediction = FALSE) {
  ## Purpose: Make a Sankey diagram to illustrate Diagnostic Performance using R package "networkD3"
  ##
  ## Reference: https://r-graph-gallery.com/sankey-diagram.html
  ##            https://christophergandrud.github.io/networkD3
  ##
  ## Author: Feiming Chen (Copyright 2024; GPLv3; No Warranty; gnu.org/licenses)
  ##
  ## Arguments:
  ##   - Prevalence   : disease prevalence assumed in the target population
  ##   - Sensitivity  : sensitivity estimate
  ##   - Specificity  : specificity estimate
  ##   - N            : assumed number of patients in a sample from the target population
  ##   - by.prediction: if TRUE, make main branches by device predictions.
  ##
  ## Return:
  ##   - p1 : an HTML widget to be displayed in the browser. Main branches by disease status.
  ##   - p2 : an HTML widget to be displayed in the browser. Main branches by device predictions.
  ## ________________________________________________
  np <- round(N * Prevalence)           # number of positives
  nn <- N - np                          # number of negatives
  npp <- round(np * Sensitivity)        # true positives
  npn <- np - npp                       # false negatives
  nnn <- round(nn * Specificity)        # true negatives
  nnp <- nn - nnn                       # false positives
  ndp <- npp + nnp                      # device positives
  ndn <- nnn + npn                      # device negatives

  cat("Simulation Assumptions:\n")
  cat("Prevalence =", Prevalence, "\n")
  cat("Sensitivity =", Sensitivity, "\n")
  cat("Specificity =", Specificity, "\n\n")
  cat("Among", N, "patients in the target population:\n")
  cat("Number of Positives =", N, "*", Prevalence, "=", np, "\n")
  cat("Number of True Positives =", np, "*", Sensitivity, "=", npp, "\n")
  cat("Number of False Negatives =", np, "-", npp, "=", npn, "\n\n")
  cat("Number of Negatives =", N, "*", (1 - Prevalence), "=", nn, "\n")
  cat("Number of True Negatives =", nn, "*", Specificity, "=", nnn, "\n")
  cat("Number of False Positives =", nn, "-", nnn, "=", nnp, "\n\n")
  cat("Number of Device Positives =", npp, "+", nnp, "=", ndp, "\n")
  cat("Number of Device Negatives =", nnn, "+", npn, "=", ndn, "\n\n")
  cat("Sensitivity =", npp, "/", np, "=", Sensitivity, "\n")
  cat("Specificity =", nnn, "/", nn, "=", Specificity, "\n")
  cat("PPV =", npp, "/", ndp, "=", round(npp / ndp, 2), "\n")
  cat("NPV =", nnn, "/", ndn, "=", round(nnn / ndn, 2), "\n")

  library(networkD3)

  if (!by.prediction) {
    ############################# Disease Branches ###########################################
    ## A connection data frame: a list of flows with an intensity for each flow.
    links <- data.frame(
      source = c("n", "n", "np", "np", "nn", "nn"),
      target = c("np", "nn", "npp", "npn", "nnn", "nnp"),
      value  = c(np, nn, npp, npn, nnn, nnp)
    )
    ## From these flows, create a node data frame that lists every entity involved in the flows.
    nodes <- data.frame(id = unique(c(as.character(links$source), as.character(links$target))),
                        name = c(paste0("Patients (", N, ")"),
                                 paste0("Disease Positives (", np, ")"),
                                 paste0("Disease Negatives (", nn, ")"),
                                 paste0("True Positives (", npp, ")"),
                                 paste0("False Negatives (", npn, ")"),
                                 paste0("True Negatives (", nnn, ")"),
                                 paste0("False Positives (", nnp, ")")))
    ## With networkD3, connections must be specified by zero-based id, not by the
    ## names used in the links data frame.
    links$IDsource <- match(links$source, nodes$id) - 1
    links$IDtarget <- match(links$target, nodes$id) - 1
    ## Make the Sankey diagram with disease branches.
    p1 <- sankeyNetwork(Links = links, Nodes = nodes,
                        Source = "IDsource", Target = "IDtarget",
                        Value = "value", NodeID = "name",
                        fontSize = 50, nodeWidth = 30, nodePadding = 200)
    print(p1)
    ## Save the widget.
    htmlwidgets::saveWidget(p1, file = "~/temp/sankey1.html")
  } else {
    ############################# Prediction Branches ########################################
    ## A connection data frame: a list of flows with an intensity for each flow.
    links <- data.frame(
      source = c("n", "n", "ndp", "ndp", "ndn", "ndn"),
      target = c("ndp", "ndn", "npp", "nnp", "nnn", "npn"),
      value  = c(ndp, ndn, npp, nnp, nnn, npn)
    )
    ## From these flows, create a node data frame that lists every entity involved in the flows.
    nodes <- data.frame(id = unique(c(as.character(links$source), as.character(links$target))),
                        name = c(paste0("Patients (", N, ")"),
                                 paste0("Device Positives (", ndp, ")"),
                                 paste0("Device Negatives (", ndn, ")"),
                                 paste0("True Positives (", npp, ")"),
                                 paste0("False Positives (", nnp, ")"),
                                 paste0("True Negatives (", nnn, ")"),
                                 paste0("False Negatives (", npn, ")")))
    ## With networkD3, connections must be specified by zero-based id, not by the
    ## names used in the links data frame.
    links$IDsource <- match(links$source, nodes$id) - 1
    links$IDtarget <- match(links$target, nodes$id) - 1
    ## Make the Sankey diagram with prediction branches.
    p2 <- sankeyNetwork(Links = links, Nodes = nodes,
                        Source = "IDsource", Target = "IDtarget",
                        Value = "value", NodeID = "name",
                        fontSize = 50, nodeWidth = 30, nodePadding = 200)
    print(p2)
    htmlwidgets::saveWidget(p2, file = "~/temp/sankey2.html")
  }
}
if (F) {                                # Unit Test
  make.Sankey.diagram.for.Dx.performance(Prevalence = 0.1, Sensitivity = 0.8, Specificity = 0.8)
}
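The default view branches first on disease status. The same counts can instead branch first on the device call (the PPV/NPV view); a usage sketch of the by.prediction argument documented above:

## Same assumptions as the unit test, but main branches by device prediction.
make.Sankey.diagram.for.Dx.performance(Prevalence = 0.1, Sensitivity = 0.8,
                                       Specificity = 0.8, by.prediction = TRUE)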
Thursday, August 31, 2023
Diagnostic Evaluation from Data Presented in a Confusion Matrix
confusion.matrix.diagnostic.accuracy <- function(x) {
  ## Purpose:
  ##   Calculate sensitivity, specificity, and other metrics.
  ##
  ## Arguments:
  ##   x: a numeric vector of length 4 for the raw confusion matrix.
  ##      First 2 numbers: count of (Test+, Test-) with reference positive results
  ##      Last 2 numbers:  count of (Test+, Test-) with reference negative results
  ##
  ##              Ref+  Ref-
  ##      Test+    x1    x3
  ##      Test-    x2    x4
  ##
  ## Return:
  ##   Sensitivity, Specificity, PPV, NPV, PLR, NLR and their CI's
  ##
  ## Author: Feiming Chen
  ## ________________________________________________
  d <- as.data.frame(addmargins(matrix(x, ncol = 2)))
  dimnames(d) <- list(c("Test.Pos", "Test.Neg", "Sum"), c("Ref.Pos", "Ref.Neg", "Sum"))
  d$Likelihood.Ratio <- sapply(1:3, function(i)
    convert.to.CI.text(ratio.of.two.prop(x = c(d[i, 1], d[i, 2]), n = c(d[3, 1], d[3, 2])), digits = 1))
  d$"Performance" <- c(paste(c("PPV =", "NPV ="),
                             my.proportion(c(d[1, 1], d[2, 2]), c(d[1, 3], d[2, 3]), digits = 1)),
                       paste("Prevalence =", my.proportion(d[3, 1], d[3, 3], digits = 1)))

  sens <- d[1, 1] / d[3, 1]
  spec <- d[2, 2] / d[3, 2]
  PPV <- d[1, 1] / d[1, 3]
  NPV <- d[2, 2] / d[2, 3]

  J <- sens + spec - 1                  # Youden's index
  ## J2 <- d[1,1] / d[3,1] - d[1,2] / d[3,2]   # identical, in the form of a risk difference
  J.ci <- prop.test(c(d[1, 1], d[1, 2]), c(d[3, 1], d[3, 2]))$conf.int
  J.str <- convert.to.CI.text(c(J, J.ci))

  NND <- 1 / (PPV + NPV - 1)            # Number Needed to Diagnose
  ## NND2 <- 1 / (d[1,1] / d[1,3] - d[2,1] / d[2,3])   # identical, in the form of a risk difference
  NND.ci <- 1 / prop.test(c(d[1, 1], d[2, 1]), c(d[1, 3], d[2, 3]))$conf.int
  NND.str <- convert.to.CI.text(ceiling(c(NND, rev(NND.ci))))

  b <- c(paste(c("Sensitivity =", "Specificity ="),
               my.proportion(c(d[1, 1], d[2, 2]), c(d[3, 1], d[3, 2]), digits = 1)),
         "", "",
         paste0(paste0("J = ", J.str), ", ", paste0("NND = ", NND.str)))
  d2 <- rbind(d, b)
  rownames(d2)[4] <- "Performance"
  tex.print(d2, file = "~/temp/confusion-matrix-performance",
            caption = "Performance Evaluation from Confusion Matrix")
  d2
}
if (F) {
  ## Bowel cancer diagnostic test (https://en.wikipedia.org/wiki/Sensitivity_and_specificity#Confusion_matrix)
  x <- c(20, 10, 180, 1820)
  confusion.matrix.diagnostic.accuracy(x)
  confusion.matrix.diagnostic.accuracy(c(95, 5, 5, 95))
  confusion.matrix.diagnostic.accuracy(c(95, 5, 500, 9500))
  confusion.matrix.diagnostic.accuracy(c(90, 10, 1000, 9000))
}
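The function leans on helpers from elsewhere in this library (convert.to.CI.text, ratio.of.two.prop, my.proportion, tex.print). The headline metrics for the bowel-cancer example can be verified with base R alone; a minimal sketch:

## Base-R check of the bowel-cancer example; x fills the matrix column-wise
## as (TP, FN, FP, TN).
x <- c(20, 10, 180, 1820)
m <- matrix(x, ncol = 2, dimnames = list(Test = c("Pos", "Neg"), Ref = c("Pos", "Neg")))
sens <- m[1, 1] / sum(m[, 1])              # 20 / 30 = 0.667
spec <- m[2, 2] / sum(m[, 2])              # 1820 / 2000 = 0.91
PPV  <- m[1, 1] / sum(m[1, ])              # 20 / 200 = 0.10
NPV  <- m[2, 2] / sum(m[2, ])              # 1820 / 1830 = 0.9945
binom.test(m[1, 1], sum(m[, 1]))$conf.int  # exact 95% CI for sensitivity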
Thursday, April 27, 2023
Calculate the discriminatory power of a model prediction index using the concordance metric c-statistic (ROC-AUC)
c.statistic <- function(R, D) {
  ## Purpose: Calculate the c-statistic (ROC-AUC), which is a
  ##   concordance measure. If we take all possible pairs of
  ##   patients where one has disease (1) and the other has no
  ##   disease (0), we can calculate the proportion of all such
  ##   pairs where the diseased one has a higher score (device
  ##   output) than the normal one (if they have the same value,
  ##   we count this as "half a victory").
  ##
  ## Author: Feiming Chen
  ##
  ## Arguments:
  ##   R: Clinical Reference Standard (0 = Negative, 1 = Positive)
  ##   D: Device Diagnostic Output (a continuous variable)
  ##
  ## Return:
  ##   - c : the c-statistic
  ## ________________________________________________
  f <- split(D, R)                      # split scores by disease status
  o <- outer(f$"1", f$"0", "-")         # all pairwise differences (diseased - normal)
  c.statistic <- mean((o > 0) + (o == 0) / 2)
  cat("c-statistic (ROC-AUC) =", c.statistic, "\n")
  c.statistic
}
if (F) {                                # Unit Test
  R <- rep(0:1, each = 5)
  c.statistic(R, 1:10)                  # 1
  c.statistic(R, 10:1)                  # 0
  c.statistic(R, rep(1, 10))            # 0.5
  neg <- rep(1:5, times = c(33, 6, 6, 11, 2))
  pos <- rep(1:5, times = c(3, 2, 2, 11, 33))
  R <- c(rep(0, length(neg)), rep(1, length(pos)))
  D <- c(neg, pos)
  c.statistic(R, D)                     # 0.89317
}
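Because the c-statistic is the Mann-Whitney probability that a diseased patient outscores a normal one (ties counted as half), it can be cross-checked against wilcox.test(), whose W statistic is the Mann-Whitney U; a quick sketch with simulated scores:

## Cross-check: ROC-AUC = U / (n1 * n0).
set.seed(1)
R <- rep(0:1, each = 50)
D <- rnorm(100, mean = 2 * R)           # diseased scores shifted upward
U <- unname(wilcox.test(D[R == 1], D[R == 0])$statistic)
all.equal(U / (50 * 50), c.statistic(R, D))  # TRUE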
Import a text block (with only numeric values) and convert it to data frame
scan.text <- function(b, n.col = 1, v.names = "") {
  ## Purpose: Import a text block (with only numeric values) and convert it to a data frame.
  ##
  ## Author: Feiming Chen
  ##
  ## Arguments:
  ##   - b       : a block of text containing only numeric values separated by spaces
  ##   - n.col   : number of columns in the data frame
  ##   - v.names : variable names
  ##
  ## Return:
  ##   a data frame containing the imported numbers
  ## ________________________________________________
  d1 <- scan(textConnection(b))
  d2 <- as.data.frame(matrix(d1, ncol = n.col, byrow = TRUE))
  names(d2) <- v.names
  d2
}
if (F) {                                # Unit Test
  scan.text("1 2 3 4 5 6", n.col = 3, v.names = c("ID", "V1", "V2"))
  ##   ID V1 V2
  ## 1  1  2  3
  ## 2  4  5  6
}
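A more typical use is pasting a small whitespace-separated numeric table straight from a report; a sketch with made-up values:

## Paste a numeric table as a text block (values are illustrative only).
b <- "
1  5.2  130
2  4.8  118
3  6.1  142
"
scan.text(b, n.col = 3, v.names = c("ID", "A1c", "SBP"))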
Saturday, March 18, 2023
Accuracy Analysis for Binary Data with Repeated Measurements
accuracy.metrics <- function(dat) {
  ## Purpose:
  ##   Calculate binary accuracy metrics (sensitivity, specificity, PPV, NPV, PLR, NLR).
  ##
  ## Author: Feiming Chen
  ##
  ## Arguments:
  ##   - dat : a data frame with the following variable names
  ##       device    - binary response from device (1 = Positive; 0 = Negative)
  ##       reference - binary response from reference (1 = Positive; 0 = Negative)
  ##
  ## Return:
  ##   - a named vector of binary accuracy metrics.
  ## ________________________________________________
  d <- dat$device
  r <- dat$reference
  sensitivity <- mean(d[r == 1])
  specificity <- 1 - mean(d[r == 0])
  PPV <- mean(r[d == 1])
  NPV <- 1 - mean(r[d == 0])
  PLR <- sensitivity / (1 - specificity)
  NLR <- (1 - sensitivity) / specificity
  c(Sensitivity = sensitivity, Specificity = specificity,
    PPV = PPV, NPV = NPV, PLR = PLR, NLR = NLR)
}
if (F) {                                # Unit Test
  accuracy.metrics(data.frame(device = rep(c(0, 1), 5), reference = rep(c(1, 0), 5)))
  ## 0 0 0 0 0 Inf
  accuracy.metrics(data.frame(device = rep(c(0, 1), 5), reference = rep(c(0, 1), 5)))
  ## 1 1 1 1 Inf 0
}

accuracy.analysis <- function(dat, boot.size = 2000) {
  ## Purpose:
  ##   Calculate binary accuracy metrics (sensitivity, specificity, PPV, NPV, PLR, NLR),
  ##   with their CI's, even in the presence of repeated measurements.
  ##
  ## Author: Feiming Chen
  ##
  ## Arguments:
  ##   - dat : a data frame with the following variable names
  ##       subject   - subject identifier
  ##       device    - binary response from device
  ##       reference - binary response from reference
  ##   - boot.size : bootstrap sample size.
  ##
  ## Return:
  ##   - Point estimates and 95% CI's for the binary accuracy metrics.
  ## ________________________________________________
  id <- dat$subject
  unique.ID <- unique(id)
  N <- length(unique.ID)
  cat("Number of Unique Patients =", N, "\n")
  K <- nrow(dat) / N
  cat("Total number of repeated measurements =", nrow(dat), "\n")
  cat("Average number of repeated measurements per patient =", K, "\n")

  ## Point Estimates
  est <- accuracy.metrics(dat)
  cat("\nPoint Estimates of Accuracy Metrics:\n")
  print(round(est, 4))

  ## Make bootstrap samples by resampling patients (not rows), which preserves
  ## the within-patient correlation of repeated measurements.
  cat("\nBootstrap Sample Size =", boot.size, "\n")
  set.seed(1)
  est.boot <- replicate(n = boot.size, {
    s <- sample(unique.ID, replace = TRUE)  # a bootstrap sample of patient ID's
    ## find all rows with the ID's in the bootstrap sample of ID's
    v <- c()
    for (j in s) v <- c(v, which(id == j))  # a new bootstrap sample
    accuracy.metrics(dat[v, ])
  })
  CI <- apply(est.boot, 1, function(x) quantile(x, c(0.025, 0.975)))
  cat("95% Bootstrap Percentile Confidence Intervals of Accuracy Metrics:\n")
  print(round(CI, 4))
  invisible(list(est = est, CI = CI))
}
if (F) {                                # Unit Test
  ## ans <- accuracy.analysis(dat, boot.size = 10)
}
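The unit test above is commented out because it needs a dataset. A runnable sketch with simulated repeated reads (the 85% per-read agreement rate and the sample sizes are arbitrary choices for illustration):

## Simulate 40 patients with 5 repeated device reads each.
set.seed(2)
n.subj <- 40; k <- 5
truth <- rbinom(n.subj, 1, 0.3)         # one disease state per patient
dat <- data.frame(subject   = rep(1:n.subj, each = k),
                  reference = rep(truth, each = k))
## Each repeated read agrees with the reference 85% of the time.
dat$device <- ifelse(runif(nrow(dat)) < 0.85, dat$reference, 1 - dat$reference)
ans <- accuracy.analysis(dat, boot.size = 200)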
Tuesday, February 28, 2023
Agreement Analysis for Continuous Data with Repeated Measurements
agreement.metrics <- function(dat) {
  ## Purpose:
  ##   Calculate agreement metrics.
  ##
  ## Author: Feiming Chen
  ##
  ## Arguments:
  ##   - dat : a data frame with the following variable names and with no NA values
  ##       subject   - subject identifier
  ##       reference - continuous response from reference
  ##       device    - continuous response from device
  ##
  ## Return:
  ##   - a named vector of agreement metrics (MAE, MAE%, RMSE, LoA, Bias, Average Correlation).
  ## ________________________________________________
  reference.mean <- mean(dat$reference)
  dat$difference <- dat$device - dat$reference

  ## Bland-Altman Analysis with Repeated Measures
  BA.analysis <- bland.altman.repeated.vary(difference ~ factor(subject), dat, silent = TRUE)

  ## MAE and RMSE
  MAE <- mean(abs(dat$difference))      # Mean Absolute Error
  MAE.pct <- MAE / reference.mean
  RMSE <- sqrt(mean(dat$difference^2))  # Root Mean Squared Error

  ## Correlation: calculate patient-level summary metrics from repeated
  ## measures, then average across patients.
  ds <- split(dat, dat$subject)
  ds2 <- sapply(ds, function(d) cor(d$reference, d$device, method = "pearson"))
  Correlation <- mean(ds2)

  c(ref.mean = reference.mean,
    MAE = MAE, MAE.pct = MAE.pct, RMSE = RMSE,
    Correlation = Correlation,
    Bias = BA.analysis$m, SD = BA.analysis$s,
    LoA.L = BA.analysis$loa[1], LoA.U = BA.analysis$loa[2],
    SD0 = BA.analysis$s0,               # no adjustment for repeated measurements
    LoA.L0 = BA.analysis$loa0[1],       # no adjustment for repeated measurements
    LoA.U0 = BA.analysis$loa0[2])       # no adjustment for repeated measurements
}
if (F) {                                # Unit Test
  agreement.metrics(data.frame(subject = rep(1:30, 10),
                               reference = 5 + rnorm(300),
                               device = 6 + rnorm(300)))
}

agreement.analysis <- function(dat, boot.size = 2000) {
  ## Purpose:
  ##   Calculate agreement metrics (MAE, MAE%, RMSE, LoA, Bias, Average Correlation)
  ##   with their CI's, even in the presence of repeated measurements.
  ##
  ## Author: Feiming Chen
  ##
  ## Arguments:
  ##   - dat : a data frame with the following variable names
  ##       subject   - subject identifier
  ##       reference - continuous response from reference
  ##       device    - continuous response from device
  ##   - boot.size : bootstrap sample size.
  ##
  ## Return:
  ##   - Point estimates and 95% CI's for the agreement metrics.
  ## ________________________________________________
  id <- dat$subject
  unique.ID <- unique(id)
  N <- length(unique.ID)
  cat("Number of Unique Patients =", N, "\n")
  K <- nrow(dat) / N
  cat("Total number of repeated measurements =", nrow(dat), "\n")
  cat("Average number of repeated measurements per patient =", K, "\n")

  ## Point Estimates
  est <- agreement.metrics(dat)
  cat("\nPoint Estimates of Agreement Metrics:\n")
  print(round(est, 4))

  ## Make bootstrap samples by resampling patients (not rows), which preserves
  ## the within-patient correlation of repeated measurements.
  cat("\nBootstrap Sample Size =", boot.size, "\n")
  set.seed(1)
  est.boot <- replicate(n = boot.size, {
    s <- sample(unique.ID, replace = TRUE)  # a bootstrap sample of patient ID's
    ## find all rows with the ID's in the bootstrap sample of ID's
    v <- c()
    for (j in s) v <- c(v, which(id == j))  # a new bootstrap sample
    agreement.metrics(dat[v, ])
  })
  CI <- apply(est.boot, 1, function(x) quantile(x, c(0.025, 0.975)))
  cat("95% Bootstrap Percentile Confidence Intervals of Agreement Metrics:\n")
  print(round(CI, 4))
  invisible(list(est = est, CI = CI))
}
if (F) {                                # Unit Test
  ## ans <- agreement.analysis(dat, boot.size = 10)
}
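Similarly, a runnable sketch for the commented-out unit test above (it requires bland.altman.repeated.vary() from elsewhere in this library; the 0.5-unit device bias is an arbitrary choice for illustration):

## Simulate 30 patients with 10 repeated paired measurements each.
set.seed(3)
n.subj <- 30; k <- 10
dat <- data.frame(subject   = rep(1:n.subj, each = k),
                  reference = rep(rnorm(n.subj, mean = 5), each = k) + rnorm(n.subj * k, sd = 0.2))
dat$device <- dat$reference + 0.5 + rnorm(nrow(dat), sd = 0.3)  # device reads about 0.5 high
ans <- agreement.analysis(dat, boot.size = 200)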