R Function Library
R Function Code for Data Analysis, Statistics, and Visualization.

Tuesday, October 1, 2024

Expected Specificity Given Sensitivity, PPV, and Prevalence Assumption

specificity.given.PPV <- function(Sensitivity = 0.5, PPV = 0.3, Prevalence = 0.1) {
  ## Purpose: calculate specificity based on sensitivity, PPV, and prevalence.
  ##
  ## Arguments:
  ##   Sensitivity: performance target for sensitivity
  ##   PPV: performance target for PPV (positive predictive value)
  ##   Prevalence: assumed disease prevalence in the target population
  ##
  ## Return:
  ##   Specificity: expected performance target for specificity
  ##
  ## Author: Feiming Chen (Copyright 2024; GPLv3; No Warranty; gnu.org/licenses)
  ## ________________________________________________
  ## Assume 1000 patients
  N <- 1000
  n.pos <- N * Prevalence               # disease positives
  n.neg <- N - n.pos                    # disease negatives
  n.true.pos <- n.pos * Sensitivity     # true positives
  n.false.neg <- n.pos - n.true.pos     # false negatives
  ## Since PPV = TP / (TP + FP), the false-positive count consistent with the
  ## PPV target is FP = TP / (PPV / (1 - PPV)), so Specificity = 1 - FP / n.neg.
  Specificity <- 1 - n.true.pos / (n.neg * (PPV / (1 - PPV)))
  cat("What should the Specificity be if we want",
      paste0(round(100 * Sensitivity), "%"), "Sensitivity and",
      paste0(round(100 * PPV), "%"), "PPV under",
      paste0(round(100 * Prevalence), "%"), "Prevalence assumption?\n")
  cat("Specificity =", round(Specificity, 3), "\n\n")
  make.Sankey.diagram.for.Dx.performance(Sensitivity = Sensitivity,
                                         Specificity = Specificity,
                                         Prevalence = Prevalence)
}
if (F) {                                # Unit Test
  specificity.given.PPV(Sensitivity = 0.7, PPV = 0.3, Prevalence = 0.05)
}
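The algebra can be sanity-checked by plugging the returned specificity back into the usual Bayes form of PPV. A minimal sketch (base R only; targets match the unit test above):

## Check: the Specificity implied by the formula reproduces the PPV target
## when substituted into PPV = Sens*Prev / (Sens*Prev + (1 - Spec)*(1 - Prev)).
Sens <- 0.7; PPV.target <- 0.3; Prev <- 0.05
Spec <- 1 - (Sens * Prev) / ((1 - Prev) * (PPV.target / (1 - PPV.target)))
PPV.check <- (Sens * Prev) / (Sens * Prev + (1 - Spec) * (1 - Prev))
stopifnot(isTRUE(all.equal(PPV.check, PPV.target)))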
Monday, March 18, 2024
Make a Sankey diagram to illustrate Diagnostic Performance using R package "networkD3"
make.Sankey.diagram.for.Dx.performance <- function(Sensitivity = 0.8, Specificity = 0.8, Prevalence = 0.2, N = 1000, by.prediction = FALSE) {
  ## Purpose: Make a Sankey diagram to illustrate Diagnostic Performance using R package "networkD3"
  ##
  ## Reference: https://r-graph-gallery.com/sankey-diagram.html
  ##            https://christophergandrud.github.io/networkD3
  ##
  ## Author: Feiming Chen (Copyright 2024; GPLv3; No Warranty; gnu.org/licenses)
  ##
  ## Arguments:
  ##   - Prevalence   : disease prevalence assumed in the target population
  ##   - Sensitivity  : sensitivity estimate
  ##   - Specificity  : specificity estimate
  ##   - N            : assumed number of patients in a sample from the target population
  ##   - by.prediction: if TRUE, make main branches by device predictions.
  ##
  ## Return:
  ##   - p1 : an HTML widget to be displayed in the browser. Main branches by disease status.
  ##   - p2 : an HTML widget to be displayed in the browser. Main branches by device predictions.
  ## ________________________________________________
  np <- round(N * Prevalence)           # number of positives
  nn <- N - np                          # number of negatives
  npp <- round(np * Sensitivity)        # true positives
  npn <- np - npp                       # false negatives
  nnn <- round(nn * Specificity)        # true negatives
  nnp <- nn - nnn                       # false positives
  ndp <- npp + nnp                      # device positives
  ndn <- nnn + npn                      # device negatives

  cat("Simulation Assumptions:\n")
  cat("Prevalence =", Prevalence, "\n")
  cat("Sensitivity =", Sensitivity, "\n")
  cat("Specificity =", Specificity, "\n\n")
  cat("Among", N, "patients in the target population:\n")
  cat("Number of Positives =", N, "*", Prevalence, "=", np, "\n")
  cat("Number of True Positives =", np, "*", Sensitivity, "=", npp, "\n")
  cat("Number of False Negatives =", np, "-", npp, "=", npn, "\n\n")
  cat("Number of Negatives =", N, "*", (1 - Prevalence), "=", nn, "\n")
  cat("Number of True Negatives =", nn, "*", Specificity, "=", nnn, "\n")
  cat("Number of False Positives =", nn, "-", nnn, "=", nnp, "\n\n")
  cat("Number of Device Positives =", npp, "+", nnp, "=", ndp, "\n")
  cat("Number of Device Negatives =", nnn, "+", npn, "=", ndn, "\n\n")
  cat("Sensitivity =", npp, "/", np, "=", Sensitivity, "\n")
  cat("Specificity =", nnn, "/", nn, "=", Specificity, "\n")
  cat("PPV =", npp, "/", ndp, "=", round(npp / ndp, 2), "\n")
  cat("NPV =", nnn, "/", ndn, "=", round(nnn / ndn, 2), "\n")

  library(networkD3)

  if (!by.prediction) {
    ############################# Disease Branches ###########################################
    ## A connection data frame: a list of flows with an intensity for each flow.
    links <- data.frame(
      source = c("n", "n", "np", "np", "nn", "nn"),
      target = c("np", "nn", "npp", "npn", "nnn", "nnp"),
      value  = c(np, nn, npp, npn, nnn, nnp)
    )
    ## From these flows, create a node data frame that lists every entity involved in the flows.
    nodes <- data.frame(id = unique(c(as.character(links$source), as.character(links$target))),
                        name = c(paste0("Patients (", N, ")"),
                                 paste0("Disease Positives (", np, ")"),
                                 paste0("Disease Negatives (", nn, ")"),
                                 paste0("True Positives (", npp, ")"),
                                 paste0("False Negatives (", npn, ")"),
                                 paste0("True Negatives (", nnn, ")"),
                                 paste0("False Positives (", nnp, ")")))
    ## With networkD3, connections must be specified by zero-based id, not by the
    ## names used in the links data frame.
    links$IDsource <- match(links$source, nodes$id) - 1
    links$IDtarget <- match(links$target, nodes$id) - 1
    ## Make the Sankey diagram with disease branches.
    p1 <- sankeyNetwork(Links = links, Nodes = nodes,
                        Source = "IDsource", Target = "IDtarget",
                        Value = "value", NodeID = "name",
                        fontSize = 50, nodeWidth = 30, nodePadding = 200)
    print(p1)
    ## Save the widget.
    htmlwidgets::saveWidget(p1, file = "~/temp/sankey1.html")
  } else {
    ############################# Prediction Branches ########################################
    ## A connection data frame: a list of flows with an intensity for each flow.
    links <- data.frame(
      source = c("n", "n", "ndp", "ndp", "ndn", "ndn"),
      target = c("ndp", "ndn", "npp", "nnp", "nnn", "npn"),
      value  = c(ndp, ndn, npp, nnp, nnn, npn)
    )
    ## From these flows, create a node data frame that lists every entity involved in the flows.
    nodes <- data.frame(id = unique(c(as.character(links$source), as.character(links$target))),
                        name = c(paste0("Patients (", N, ")"),
                                 paste0("Device Positives (", ndp, ")"),
                                 paste0("Device Negatives (", ndn, ")"),
                                 paste0("True Positives (", npp, ")"),
                                 paste0("False Positives (", nnp, ")"),
                                 paste0("True Negatives (", nnn, ")"),
                                 paste0("False Negatives (", npn, ")")))
    ## With networkD3, connections must be specified by zero-based id, not by the
    ## names used in the links data frame.
    links$IDsource <- match(links$source, nodes$id) - 1
    links$IDtarget <- match(links$target, nodes$id) - 1
    ## Make the Sankey diagram with prediction branches.
    p2 <- sankeyNetwork(Links = links, Nodes = nodes,
                        Source = "IDsource", Target = "IDtarget",
                        Value = "value", NodeID = "name",
                        fontSize = 50, nodeWidth = 30, nodePadding = 200)
    print(p2)
    htmlwidgets::saveWidget(p2, file = "~/temp/sankey2.html")
  }
}
if (F) {                                # Unit Test
  make.Sankey.diagram.for.Dx.performance(Prevalence = 0.1, Sensitivity = 0.8, Specificity = 0.8)
}
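The default view branches first on disease status. The same counts can instead branch first on the device call (the PPV/NPV view); a usage sketch of the by.prediction argument documented above:

## Same assumptions as the unit test, but main branches by device prediction.
make.Sankey.diagram.for.Dx.performance(Prevalence = 0.1, Sensitivity = 0.8,
                                       Specificity = 0.8, by.prediction = TRUE)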
Thursday, August 31, 2023
Diagnostic Evaluation from Data Presented in a Confusion Matrix
confusion.matrix.diagnostic.accuracy <- function(x) {
  ## Purpose:
  ##   Calculate sensitivity, specificity, and other metrics.
  ##
  ## Arguments:
  ##   x: a numeric vector of length 4 for the raw confusion matrix.
  ##      First 2 numbers: count of (Test+, Test-) with reference positive results
  ##      Last 2 numbers:  count of (Test+, Test-) with reference negative results
  ##
  ##              Ref+  Ref-
  ##      Test+    x1    x3
  ##      Test-    x2    x4
  ##
  ## Return:
  ##   Sensitivity, Specificity, PPV, NPV, PLR, NLR and their CI's
  ##
  ## Author: Feiming Chen
  ## ________________________________________________
  d <- as.data.frame(addmargins(matrix(x, ncol = 2)))
  dimnames(d) <- list(c("Test.Pos", "Test.Neg", "Sum"), c("Ref.Pos", "Ref.Neg", "Sum"))
  d$Likelihood.Ratio <- sapply(1:3, function(i)
    convert.to.CI.text(ratio.of.two.prop(x = c(d[i, 1], d[i, 2]), n = c(d[3, 1], d[3, 2])), digits = 1))
  d$"Performance" <- c(paste(c("PPV =", "NPV ="),
                             my.proportion(c(d[1, 1], d[2, 2]), c(d[1, 3], d[2, 3]), digits = 1)),
                       paste("Prevalence =", my.proportion(d[3, 1], d[3, 3], digits = 1)))

  sens <- d[1, 1] / d[3, 1]
  spec <- d[2, 2] / d[3, 2]
  PPV <- d[1, 1] / d[1, 3]
  NPV <- d[2, 2] / d[2, 3]

  J <- sens + spec - 1                  # Youden's index
  ## J2 <- d[1,1] / d[3,1] - d[1,2] / d[3,2]   # identical, in the form of a risk difference
  J.ci <- prop.test(c(d[1, 1], d[1, 2]), c(d[3, 1], d[3, 2]))$conf.int
  J.str <- convert.to.CI.text(c(J, J.ci))

  NND <- 1 / (PPV + NPV - 1)            # Number Needed to Diagnose
  ## NND2 <- 1 / (d[1,1] / d[1,3] - d[2,1] / d[2,3])   # identical, in the form of a risk difference
  NND.ci <- 1 / prop.test(c(d[1, 1], d[2, 1]), c(d[1, 3], d[2, 3]))$conf.int
  NND.str <- convert.to.CI.text(ceiling(c(NND, rev(NND.ci))))

  b <- c(paste(c("Sensitivity =", "Specificity ="),
               my.proportion(c(d[1, 1], d[2, 2]), c(d[3, 1], d[3, 2]), digits = 1)),
         "", "",
         paste0(paste0("J = ", J.str), ", ", paste0("NND = ", NND.str)))
  d2 <- rbind(d, b)
  rownames(d2)[4] <- "Performance"
  tex.print(d2, file = "~/temp/confusion-matrix-performance",
            caption = "Performance Evaluation from Confusion Matrix")
  d2
}
if (F) {
  ## Bowel cancer diagnostic test (https://en.wikipedia.org/wiki/Sensitivity_and_specificity#Confusion_matrix)
  x <- c(20, 10, 180, 1820)
  confusion.matrix.diagnostic.accuracy(x)
  confusion.matrix.diagnostic.accuracy(c(95, 5, 5, 95))
  confusion.matrix.diagnostic.accuracy(c(95, 5, 500, 9500))
  confusion.matrix.diagnostic.accuracy(c(90, 10, 1000, 9000))
}
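The function leans on helpers from elsewhere in this library (convert.to.CI.text, ratio.of.two.prop, my.proportion, tex.print). The headline metrics for the bowel-cancer example can be verified with base R alone; a minimal sketch:

## Base-R check of the bowel-cancer example; x fills the matrix column-wise
## as (TP, FN, FP, TN).
x <- c(20, 10, 180, 1820)
m <- matrix(x, ncol = 2, dimnames = list(Test = c("Pos", "Neg"), Ref = c("Pos", "Neg")))
sens <- m[1, 1] / sum(m[, 1])              # 20 / 30 = 0.667
spec <- m[2, 2] / sum(m[, 2])              # 1820 / 2000 = 0.91
PPV  <- m[1, 1] / sum(m[1, ])              # 20 / 200 = 0.10
NPV  <- m[2, 2] / sum(m[2, ])              # 1820 / 1830 = 0.9945
binom.test(m[1, 1], sum(m[, 1]))$conf.int  # exact 95% CI for sensitivity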
Thursday, April 27, 2023
Calculate the discriminatory power of a model prediction index using the concordance metric c-statistic (ROC-AUC)
c.statistic <- function(R, D) {
  ## Purpose: Calculate the c-statistic (ROC-AUC), which is a
  ##   concordance measure. If we take all possible pairs of
  ##   patients where one has disease (1) and the other has no
  ##   disease (0), we can calculate the proportion of all such
  ##   pairs where the diseased one has a higher score (device
  ##   output) than the normal one (if they have the same value,
  ##   we count this as "half a victory").
  ##
  ## Author: Feiming Chen
  ##
  ## Arguments:
  ##   R: Clinical Reference Standard (0 = Negative, 1 = Positive)
  ##   D: Device Diagnostic Output (a continuous variable)
  ##
  ## Return:
  ##   - c : the c-statistic
  ## ________________________________________________
  f <- split(D, R)                      # split scores by disease status
  o <- outer(f$"1", f$"0", "-")         # all pairwise differences (diseased - normal)
  c.statistic <- mean((o > 0) + (o == 0) / 2)
  cat("c-statistic (ROC-AUC) =", c.statistic, "\n")
  c.statistic
}
if (F) {                                # Unit Test
  R <- rep(0:1, each = 5)
  c.statistic(R, 1:10)                  # 1
  c.statistic(R, 10:1)                  # 0
  c.statistic(R, rep(1, 10))            # 0.5
  neg <- rep(1:5, times = c(33, 6, 6, 11, 2))
  pos <- rep(1:5, times = c(3, 2, 2, 11, 33))
  R <- c(rep(0, length(neg)), rep(1, length(pos)))
  D <- c(neg, pos)
  c.statistic(R, D)                     # 0.89317
}
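Because the c-statistic is the Mann-Whitney probability that a diseased patient outscores a normal one (ties counted as half), it can be cross-checked against wilcox.test(), whose W statistic is the Mann-Whitney U; a quick sketch with simulated scores:

## Cross-check: ROC-AUC = U / (n1 * n0).
set.seed(1)
R <- rep(0:1, each = 50)
D <- rnorm(100, mean = 2 * R)           # diseased scores shifted upward
U <- unname(wilcox.test(D[R == 1], D[R == 0])$statistic)
all.equal(U / (50 * 50), c.statistic(R, D))  # TRUE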
Import a text block (with only numeric values) and convert it to data frame
scan.text <- function(b, n.col = 1, v.names = "") {
  ## Purpose: Import a text block (with only numeric values) and convert it to a data frame.
  ##
  ## Author: Feiming Chen
  ##
  ## Arguments:
  ##   - b       : a block of text containing only numeric values separated by spaces
  ##   - n.col   : number of columns in the data frame
  ##   - v.names : variable names
  ##
  ## Return:
  ##   a data frame containing the imported numbers
  ## ________________________________________________
  d1 <- scan(textConnection(b))
  d2 <- as.data.frame(matrix(d1, ncol = n.col, byrow = TRUE))
  names(d2) <- v.names
  d2
}
if (F) {                                # Unit Test
  scan.text("1 2 3 4 5 6", n.col = 3, v.names = c("ID", "V1", "V2"))
  ##   ID V1 V2
  ## 1  1  2  3
  ## 2  4  5  6
}
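A more typical use is pasting a small whitespace-separated numeric table straight from a report; a sketch with made-up values:

## Paste a numeric table as a text block (values are illustrative only).
b <- "
1  5.2  130
2  4.8  118
3  6.1  142
"
scan.text(b, n.col = 3, v.names = c("ID", "A1c", "SBP"))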
Saturday, March 18, 2023
Accuracy Analysis for Binary Data with Repeated Measurements
accuracy.metrics <- function(dat) {
  ## Purpose:
  ##   Calculate binary accuracy metrics (sensitivity, specificity, PPV, NPV, PLR, NLR).
  ##
  ## Author: Feiming Chen
  ##
  ## Arguments:
  ##   - dat : a data frame with the following variable names
  ##       device    - binary response from device (1 = Positive; 0 = Negative)
  ##       reference - binary response from reference (1 = Positive; 0 = Negative)
  ##
  ## Return:
  ##   - a named vector of binary accuracy metrics.
  ## ________________________________________________
  d <- dat$device
  r <- dat$reference
  sensitivity <- mean(d[r == 1])
  specificity <- 1 - mean(d[r == 0])
  PPV <- mean(r[d == 1])
  NPV <- 1 - mean(r[d == 0])
  PLR <- sensitivity / (1 - specificity)
  NLR <- (1 - sensitivity) / specificity
  c(Sensitivity = sensitivity, Specificity = specificity,
    PPV = PPV, NPV = NPV, PLR = PLR, NLR = NLR)
}
if (F) {                                # Unit Test
  accuracy.metrics(data.frame(device = rep(c(0, 1), 5), reference = rep(c(1, 0), 5)))
  ## 0 0 0 0 0 Inf
  accuracy.metrics(data.frame(device = rep(c(0, 1), 5), reference = rep(c(0, 1), 5)))
  ## 1 1 1 1 Inf 0
}

accuracy.analysis <- function(dat, boot.size = 2000) {
  ## Purpose:
  ##   Calculate binary accuracy metrics (sensitivity, specificity, PPV, NPV, PLR, NLR),
  ##   with their CI's, even in the presence of repeated measurements.
  ##
  ## Author: Feiming Chen
  ##
  ## Arguments:
  ##   - dat : a data frame with the following variable names
  ##       subject   - subject identifier
  ##       device    - binary response from device
  ##       reference - binary response from reference
  ##   - boot.size : bootstrap sample size.
  ##
  ## Return:
  ##   - Point estimates and 95% CI's for the binary accuracy metrics.
  ## ________________________________________________
  id <- dat$subject
  unique.ID <- unique(id)
  N <- length(unique.ID)
  cat("Number of Unique Patients =", N, "\n")
  K <- nrow(dat) / N
  cat("Total number of repeated measurements =", nrow(dat), "\n")
  cat("Average number of repeated measurements per patient =", K, "\n")

  ## Point Estimates
  est <- accuracy.metrics(dat)
  cat("\nPoint Estimates of Accuracy Metrics:\n")
  print(round(est, 4))

  ## Make bootstrap samples by resampling patients (not rows), which preserves
  ## the within-patient correlation of repeated measurements.
  cat("\nBootstrap Sample Size =", boot.size, "\n")
  set.seed(1)
  est.boot <- replicate(n = boot.size, {
    s <- sample(unique.ID, replace = TRUE)  # a bootstrap sample of patient ID's
    ## find all rows with the ID's in the bootstrap sample of ID's
    v <- c()
    for (j in s) v <- c(v, which(id == j))  # a new bootstrap sample
    accuracy.metrics(dat[v, ])
  })
  CI <- apply(est.boot, 1, function(x) quantile(x, c(0.025, 0.975)))
  cat("95% Bootstrap Percentile Confidence Intervals of Accuracy Metrics:\n")
  print(round(CI, 4))
  invisible(list(est = est, CI = CI))
}
if (F) {                                # Unit Test
  ## ans <- accuracy.analysis(dat, boot.size = 10)
}
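The unit test above is commented out because it needs a dataset. A runnable sketch with simulated repeated reads (the 85% per-read agreement rate and the sample sizes are arbitrary choices for illustration):

## Simulate 40 patients with 5 repeated device reads each.
set.seed(2)
n.subj <- 40; k <- 5
truth <- rbinom(n.subj, 1, 0.3)         # one disease state per patient
dat <- data.frame(subject   = rep(1:n.subj, each = k),
                  reference = rep(truth, each = k))
## Each repeated read agrees with the reference 85% of the time.
dat$device <- ifelse(runif(nrow(dat)) < 0.85, dat$reference, 1 - dat$reference)
ans <- accuracy.analysis(dat, boot.size = 200)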
Tuesday, February 28, 2023
Agreement Analysis for Continuous Data with Repeated Measurements
agreement.metrics <- function(dat) {
  ## Purpose:
  ##   Calculate agreement metrics.
  ##
  ## Author: Feiming Chen
  ##
  ## Arguments:
  ##   - dat : a data frame with the following variable names and with no NA values
  ##       subject   - subject identifier
  ##       reference - continuous response from reference
  ##       device    - continuous response from device
  ##
  ## Return:
  ##   - a named vector of agreement metrics (MAE, MAE%, RMSE, LoA, Bias, Average Correlation).
  ## ________________________________________________
  reference.mean <- mean(dat$reference)
  dat$difference <- dat$device - dat$reference

  ## Bland-Altman Analysis with Repeated Measures
  BA.analysis <- bland.altman.repeated.vary(difference ~ factor(subject), dat, silent = TRUE)

  ## MAE and RMSE
  MAE <- mean(abs(dat$difference))      # Mean Absolute Error
  MAE.pct <- MAE / reference.mean
  RMSE <- sqrt(mean(dat$difference^2))  # Root Mean Squared Error

  ## Correlation: calculate patient-level summary metrics from repeated
  ## measures, then average across patients.
  ds <- split(dat, dat$subject)
  ds2 <- sapply(ds, function(d) cor(d$reference, d$device, method = "pearson"))
  Correlation <- mean(ds2)

  c(ref.mean = reference.mean,
    MAE = MAE, MAE.pct = MAE.pct, RMSE = RMSE,
    Correlation = Correlation,
    Bias = BA.analysis$m, SD = BA.analysis$s,
    LoA.L = BA.analysis$loa[1], LoA.U = BA.analysis$loa[2],
    SD0 = BA.analysis$s0,               # no adjustment for repeated measurements
    LoA.L0 = BA.analysis$loa0[1],       # no adjustment for repeated measurements
    LoA.U0 = BA.analysis$loa0[2])       # no adjustment for repeated measurements
}
if (F) {                                # Unit Test
  agreement.metrics(data.frame(subject = rep(1:30, 10),
                               reference = 5 + rnorm(300),
                               device = 6 + rnorm(300)))
}

agreement.analysis <- function(dat, boot.size = 2000) {
  ## Purpose:
  ##   Calculate agreement metrics (MAE, MAE%, RMSE, LoA, Bias, Average Correlation)
  ##   with their CI's, even in the presence of repeated measurements.
  ##
  ## Author: Feiming Chen
  ##
  ## Arguments:
  ##   - dat : a data frame with the following variable names
  ##       subject   - subject identifier
  ##       reference - continuous response from reference
  ##       device    - continuous response from device
  ##   - boot.size : bootstrap sample size.
  ##
  ## Return:
  ##   - Point estimates and 95% CI's for the agreement metrics.
  ## ________________________________________________
  id <- dat$subject
  unique.ID <- unique(id)
  N <- length(unique.ID)
  cat("Number of Unique Patients =", N, "\n")
  K <- nrow(dat) / N
  cat("Total number of repeated measurements =", nrow(dat), "\n")
  cat("Average number of repeated measurements per patient =", K, "\n")

  ## Point Estimates
  est <- agreement.metrics(dat)
  cat("\nPoint Estimates of Agreement Metrics:\n")
  print(round(est, 4))

  ## Make bootstrap samples by resampling patients (not rows), which preserves
  ## the within-patient correlation of repeated measurements.
  cat("\nBootstrap Sample Size =", boot.size, "\n")
  set.seed(1)
  est.boot <- replicate(n = boot.size, {
    s <- sample(unique.ID, replace = TRUE)  # a bootstrap sample of patient ID's
    ## find all rows with the ID's in the bootstrap sample of ID's
    v <- c()
    for (j in s) v <- c(v, which(id == j))  # a new bootstrap sample
    agreement.metrics(dat[v, ])
  })
  CI <- apply(est.boot, 1, function(x) quantile(x, c(0.025, 0.975)))
  cat("95% Bootstrap Percentile Confidence Intervals of Agreement Metrics:\n")
  print(round(CI, 4))
  invisible(list(est = est, CI = CI))
}
if (F) {                                # Unit Test
  ## ans <- agreement.analysis(dat, boot.size = 10)
}
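Similarly, a runnable sketch for the commented-out unit test above (it requires bland.altman.repeated.vary() from elsewhere in this library; the 0.5-unit device bias is an arbitrary choice for illustration):

## Simulate 30 patients with 10 repeated paired measurements each.
set.seed(3)
n.subj <- 30; k <- 10
dat <- data.frame(subject   = rep(1:n.subj, each = k),
                  reference = rep(rnorm(n.subj, mean = 5), each = k) + rnorm(n.subj * k, sd = 0.2))
dat$device <- dat$reference + 0.5 + rnorm(nrow(dat), sd = 0.3)  # device reads about 0.5 high
ans <- agreement.analysis(dat, boot.size = 200)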