Monday, March 18, 2024

Make a Sankey diagram to illustrate Diagnostic Performance using R package "NetworkD3"

make.Sankey.diagram.for.Dx.performance <- function(Prevalence = 0.2, Sensitivity = 0.8, Specificity = 0.8, N = 1000)
{
    ## Purpose: Make a Sankey diagram to illustrate Diagnostic Performance using R package "NetworkD3"
    ##   ...
    ## 
    ## Author: Feiming Chen (Copyright 2024; GPLv3; No Warranty; gnu.org/licenses)
    ## 
    ## Arguments: 
    ##   - Prevalence : disease prevalence assumed in the target population
    ##   - Sensitivity : sensitivity estimate
    ##   - Specificity : specificity estimate
    ##   - N : assumed number of patients in a sample from the target population
    ## 
    ## Return: 
    ##   - p1 : a HTML widget to be displayed in the browser. Main branches by disease status. 
    ##   - p2 : a HTML widget to be displayed in the browser. Main branches by device predictions.
    ## ________________________________________________

    np <- round(N * Prevalence)         # number of positives
    nn <- N - np                        # number of negatives
    npp <- round(np * Sensitivity)      # true positives
    npn <- np - npp                     # false negatives
    nnn <- round(nn * Specificity)      # true negatives
    nnp <- nn - nnn                     # false positives
    ndp <- npp + nnp                    # device positives
    ndn <- nnn + npn                    # device negatives


    cat("Sensitivity =", npp, "/", np, "=", Sensitivity, "\n")
    cat("Specificity =", nnn, "/", nn, "=", Specificity, "\n")
    cat("PPV =", npp, "/", ndp, "=", round(npp / ndp, 2), "\n")
    cat("NPV =", nnn, "/", ndn, "=", round(nnn / ndn, 2), "\n")

    library(networkD3)


    ############################# Disease Branches ###########################################

    ## A connection data frame is a list of flows with intensity for each flow
    links <- data.frame(
        source=c("n","n", "np", "np", "nn", "nn"), 
        target=c("np","nn", "npp", "npn", "nnn", "nnp"), 
        value=c(np, nn, npp, npn, nnn, nnp)
    )

    ## From these flows we need to create a node data frame: it lists every entities involved in the flow
    nodes <- data.frame(id = unique(c(as.character(links$source), as.character(links$target))), 
                        name = c(paste0("Patients (", N, ")"),
                                 paste0("Disease Positives (", np, ")"), 
                                 paste0("Disease Negatives (", nn, ")"), 
                                 paste0("True Positives (", npp, ")"), 
                                 paste0("False Negatives (", npn, ")"), 
                                 paste0("True Negatives (", nnn, ")"), 
                                 paste0("False Positives (", nnp, ")")))


    ## With networkD3, connection must be provided using id, not using real name like in the links dataframe.
    links$IDsource <- match(links$source, nodes$id) - 1 
    links$IDtarget <- match(links$target, nodes$id) - 1

    ## Make the Sankey Diagram with Disease Branches
    p1 <- sankeyNetwork(Links = links, Nodes = nodes,
                        Source = "IDsource", Target = "IDtarget",
                        Value = "value", NodeID = "name", 
                        fontSize = 50, 
                        nodeWidth = 30, nodePadding = 200)
    print(p1)

    ############################# Prediction Branches ###########################################

    ## A connection data frame is a list of flows with intensity for each flow
    links <- data.frame(
        source=c("n","n", "ndp", "ndp", "ndn", "ndn"), 
        target=c("ndp","ndn", "npp", "nnp", "nnn", "npn"), 
        value=c(ndp, ndn, npp, nnp, nnn, npn)
    )

    ## From these flows we need to create a node data frame: it lists every entities involved in the flow
    nodes <- data.frame(id = unique(c(as.character(links$source), as.character(links$target))), 
                        name = c(paste0("Patients (", N, ")"),
                                 paste0("Device Positives (", ndp, ")"), 
                                 paste0("Device Negatives (", ndn, ")"), 
                                 paste0("True Positives (", npp, ")"), 
                                 paste0("False Positives (", nnp, ")"), 
                                 paste0("True Negatives (", nnn, ")"), 
                                 paste0("False Negatives (", npn, ")"))) 

    ## With networkD3, connection must be provided using id, not using real name like in the links dataframe.
    links$IDsource <- match(links$source, nodes$id) - 1 
    links$IDtarget <- match(links$target, nodes$id) - 1

    ## Make the Sankey Diagram 
    p2 <- sankeyNetwork(Links = links, Nodes = nodes,
                        Source = "IDsource", Target = "IDtarget",
                        Value = "value", NodeID = "name", 
                        fontSize = 50, 
                        nodeWidth = 30, nodePadding = 200)
    print(p2)
    
}
if (F) {                                # Unit Test
    make.Sankey.diagram.for.Dx.performance(Prevalence = 0.2, Sensitivity = 0.8, Specificity = 0.8)
}