c.statistic <- function(R, D) { ## Purpose: Calculate the c-statistic (ROC-AUC), which is a ## concordance measure. If we take all possible pairs of ## patients where one has disease (1) and the other has no ## disease (0), we can calculate the proportion of all such ## pairs where the diseased one has higher score (device output) ## than the normal one (if they have the same value, we count ## this as ‘half a victory’). ## ## Author: Feiming Chen ## ## Arguments: ## R: Clinical Reference Standard (0 = Negative, 1 = Positive) ## D: Device Diagnostic Output (a continuous variable) ## ## Return: ## - c : the c-statistic ## ________________________________________________ f <- split(D, R) o <- outer(f$"1", f$"0", "-") c.statistic <- mean((o > 0) + (o == 0)/2) cat("c-statistic (ROC-AUC) =", c.statistic, "\n") c.statistic } if (F) { # Unit Test R <- rep(0:1, each = 5) c.statistic(R, 1:10) # 1 c.statistic(R, 10:1) # 0 c.statistic(R, rep(1, 10)) # 0.5 neg <- rep(1:5, times=c(33,6,6,11,2)) pos <- rep(1:5, times=c(3,2,2,11,33)) R <- c(rep(0, length(neg)), rep(1, length(pos))) D <- c(neg, pos) c.statistic(R, D) # 0.89317 }
Thursday, April 27, 2023
Calculate the discriminatory power of a model prediction index using the concordance metric c-statistic (ROC-AUC)
Import a text block (with only numeric values) and convert it to data frame
scan.text <- function(b, n.col = 1, v.names = "") { ## Purpose: Import a text block (with only numeric values) and convert it to data frame. ## ## Author: Feiming Chen ## ## Arguments: ## - b : a block of text containing only numeric values separated by spaces ## - n.col : number of columns in the data frame ## - v.names : variable names ## Return: ## a data frame containing imported numbers ## ## ________________________________________________ d1 <- scan(textConnection(b)) d2 <- as.data.frame(matrix(d1, ncol = n.col, byrow = T)) names(d2) <- v.names d2 } if (F) { # Unit Test scan.text("1 2 3 4 5 6", n.col = 3, v.names = c("ID", "V1", "V2")) ## ID V1 V2 ## 1 1 2 3 ## 2 4 5 6 }
Saturday, March 18, 2023
Accuracy Analysis for Binary Data with Repeated Measurements
accuracy.metrics <- function(dat) { ## Purpose: ## Calculate binary accuracy metrics (sensitivity, specificity, PPV, NPV, PLR, NLR). ## ## Author: Feiming Chen ## ## Arguments: ## - dat : a data frame with the following variable names ## device - binary response from device (1 = Positive; 0 = Negative) ## reference - binary response from reference (1 = Positive; 0 = Negative) ## ## Return: ## - a list of binary accuracy metrics. ## ## ________________________________________________ d <- dat$device r <- dat$reference sensitivity = mean(d[r == 1]) specificity = 1 - mean(d[r == 0]) PPV = mean(r[d == 1]) NPV = 1 - mean(r[d == 0]) PLR = sensitivity / (1 - specificity) NLR = (1 - sensitivity) / specificity c(Sensitivity = sensitivity, Specificity = specificity, PPV = PPV, NPV = NPV, PLR = PLR, NLR = NLR) } if (F) { accuracy.metrics(data.frame(device = rep(c(0,1), 5), reference = rep(c(1, 0), 5))) ## [1] 0 0 0 0 0 Inf accuracy.metrics(data.frame(device = rep(c(0,1), 5), reference = rep(c(0, 1), 5))) ## [1] 1 1 1 1 Inf 0 } accuracy.analysis <- function(dat, boot.size = 2000) { ## Purpose: ## Calculate binary accuracy metrics (sensitivity, specificity, PPV, NPV, PLR, NLR), ## with their CI's, even in the presence of repeated measurements. ## ## Author: Feiming Chen ## ## Arguments: ## - dat : a data frame with the following variable names ## subject - subject identifier ## device - binary response from device ## reference - binary response from reference ## - boot.size : bootstrap sample size. ## ## Return: ## - Point estimates and 95% CI's for a list of binary accuracy metrics (sensitivity, specificity, PPV, NPV, PLR, NLR). ## ## ________________________________________________ id <- dat$subject unique.ID <- unique(id) N <- length(unique.ID) cat("Number of Unique Patients =", N, "\n") K <- nrow(dat) / N cat("Total number of repeated measurements =", nrow(dat), "\n") cat("Average number of repeated measurements per patient =", K, "\n") ## Point Estimates est <- accuracy.metrics(dat) cat("\nPoint Estimates of Accuracy Metrics:\n") print(round(est, 4)) ## Make Bootstrap Samples cat("\nBootstrap Sample Size =", boot.size, "\n") set.seed(1) replicate(n = boot.size, { s <- sample(unique.ID, replace = TRUE) # a bootstrap sample of patient ID's ## find all rows with the ID's in the bootstrap sample of ID's v <- c() for (j in s) v <- c(v, which(id == j)) # a new bootstrap sample accuracy.metrics(dat[v,]) }) -> est.boot CI <- apply(est.boot, 1, function(x) quantile(x, c(0.025, 0.975))) cat("95% Bootstrap Percentile Confidence Intervals of Accuracy Metrics:\n") print(round(CI, 4)) invisible(list(est = est, CI = CI))} if (F) { # Unit Test ## ans <- accuracy.analysis(dat, boot.size = 10) }
Tuesday, February 28, 2023
Agreement Analysis for Continuous Data with Repeated Measurements
agreement.metrics <- function(dat) { ## Purpose: ## Calculate agreement metrics. ## ## Author: Feiming Chen ## ## Arguments: ## - dat : a data frame with the following variable names and with no NA values ## subject - subject identifier ## reference - continuous response from reference ## device - continuous response from device ## ## Return: ## - a list of agreement metrics (MAE, MAE%, RMSE, LOA, Bias, Average Correlation). ## ## ________________________________________________ reference.mean <- mean(dat$reference) dat$difference <- dat$device - dat$reference ## Bland-Altman Analysis with Repeated Measures BA.analysis <- bland.altman.repeated.vary(difference ~ factor(subject), dat, silent = TRUE) ## For MAE and RMSE MAE <- mean(abs(dat$difference)) # Mean Absolute Error MAE.pct <- MAE / reference.mean RMSE <- sqrt(mean(dat$difference^2)) # Root Mean Squared Error ## For Correlation: calculate patient-level summary metrics from repeated measures, then average across patients. ds <- split(dat, dat$subject) sapply(ds, function(d) cor(d$reference, d$device, method = "pearson")) -> ds2 Correlation <- mean(ds2) c(ref.mean = reference.mean, MAE = MAE, MAE.pct = MAE.pct, RMSE = RMSE, Correlation = Correlation, Bias = BA.analysis$m, SD = BA.analysis$s, LoA.L = BA.analysis$loa[1], LoA.U = BA.analysis$loa[2], SD0 = BA.analysis$s0, # no adjustment for repeated measurements LoA.L0 = BA.analysis$loa0[1], # no adjustment for repeated measurements LoA.U0 = BA.analysis$loa0[2]) # no adjustment for repeated measurements } if (F) { agreement.metrics(data.frame(subject = rep(1:30, 10), reference = 5 + rnorm(300), device = 6 + rnorm(300))) } agreement.analysis <- function(dat, boot.size = 2000) { ## Purpose: ## Calculate agreement metrics (MAE, MAE%, RMSE, LOA, Bias, Average Correlation) ## with their CI's, even in the presence of repeated measurements. ## ## Author: Feiming Chen ## ## Arguments: ## - dat : a data frame with the following variable names ## subject - subject identifier ## reference - continuous response from reference ## device - continuous response from device ## - boot.size : bootstrap sample size. ## ## Return: ## - Point estimates and 95% CI's for a list of agreement metrics (MAE, MAE%, RMSE, LOA, Bias, Average Correlation). ## ## ________________________________________________ id <- dat$subject unique.ID <- unique(id) N <- length(unique.ID) cat("Number of Unique Patients =", N, "\n") K <- nrow(dat) / N cat("Total number of repeated measurements =", nrow(dat), "\n") cat("Average number of repeated measurements per patient =", K, "\n") ## Point Estimates est <- agreement.metrics(dat) cat("\nPoint Estimates of Agreement Metrics:\n") print(round(est, 4)) ## 2000 Bootstrap Samples cat("\nBootstrap Sample Size =", boot.size, "\n") set.seed(1) replicate(n = boot.size, { s <- sample(unique.ID, replace = TRUE) # a bootstrap sample of patient ID's ## find all rows with the ID's in the bootstrap sample of ID's v <- c() for (j in s) v <- c(v, which(id == j)) # a new bootstrap sample agreement.metrics(dat[v,]) }) -> est.boot CI <- apply(est.boot, 1, function(x) quantile(x, c(0.025, 0.975))) cat("95% Bootstrap Percentile Confidence Intervals of Agreement Metrics:\n") print(round(CI, 4)) invisible(list(est = est, CI = CI))} if (F) { # Unit Test ## ans <- agreement.analysis(dat, boot.size = 10) }
Calculate the mean and 95% confidence interval of a sample of numbers under a transform
sample.mean.CI.transform <- function(x, transform.func = identity, anti.transform.func = identity) { ## Purpose: ## Calculate the mean and 95% CI of a sample of numbers under a transform. ## ## Author: Feiming Chen ## ## Arguments: ## - x : a vector of positive numerical values ## - transform.func: a function for transforming the original values to a new scale. ## - anti.transform.func: a function for back-transforming the values to the original scale. ## ## Return: ## - m : mean based on a particular transform function. ## ## ________________________________________________ y <- transform.func(x) # transform to a new scale a <- t.test(y) # estimate the mean and 95% CI based on t-distribution c(anti.transform.func(a$estimate), CI = anti.transform.func(a$conf.int)) } if (F) { # Unit Test x <- exp(1:10) sample.mean.CI.transform(x) # no transform ## estimate the mean under the log-normal distributional assumption sample.mean.CI.transform(x, transform.func = log, anti.transform.func = exp) }
Thursday, June 30, 2022
Use simulation to estimate the coverage probability of the quantile rank-based confidence interval
The original code comes from https://stats.stackexchange.com/questions/99829/how-to-obtain-a-confidence-interval-for-a-percentile/284970#284970
test.coverage.via.simulation <- function(lu, n, p, alpha = 0.1) { ## Purpose: ## Use simulation to estimate the coverage probability of the quantile rank-based CI. ## ## Reference (code origin): ## https://stats.stackexchange.com/questions/99829/how-to-obtain-a-confidence-interval-for-a-percentile/284970#284970 ## ## Arguments: ## - lu: a vector of two numbers for the lower and upper limit of the CI in ranks. ## - n: sample size ## - p: quantile probability cutpoint (between 0 and 1) ## - alpha: type I error level (default to 0.1 so a 90% CI is calculated) ## ## Return: ## Average coverage probability from 10,000 simulated samples. ## ________________________________________________ ## Generate many random samples of size "n" from a known distribution ## and compute actual CI's from those samples using the given rank-based CI. set.seed(1) n.sim <- 1e4 index <- function(x, i) ifelse(i == Inf, Inf, ifelse(i == -Inf, -Inf, x[i])) sim <- replicate(n.sim, index(sort(rnorm(n)), lu)) ## Compute the actual proportion of those intervals that cover the theoretical quantile. F.q <- qnorm(p) covers <- sim[1, ] <= F.q & F.q <= sim[2, ] mean.coverage <- mean(covers) message("Mean Coverage Over 10,000 Simulated Samples = ", signif(mean.coverage, 4)) } if (F) { # Unit Test lu <- c(85, 97) n <- 100 p <- 0.9 alpha <- 0.05 test.coverage.via.simulation(lu, n, p, alpha) ## Mean Coverage Over 10,000 Simulated Samples = 0.9528 }
Rank-based Confidence Interval for a Quantile
The original code comes from https://stats.stackexchange.com/questions/99829/how-to-obtain-a-confidence-interval-for-a-percentile/284970#284970
quantile.CI.ranks <- function(n, p, alpha = 0.1) { ## Purpose: ## Calculate a two-sided near-symmetric distribution-free ## confidence interval with confidence level of (1 - alpha) for ## a quantile, by searching over a small range of upper and ## lower order statistics for the closest coverage to (1 - ## alpha) (but not less than it, if possible). ## ## Reference (code origin): ## https://stats.stackexchange.com/questions/99829/how-to-obtain-a-confidence-interval-for-a-percentile/284970#284970 ## ## Arguments: ## - n: sample size ## - p: quantile probability cutpoint (between 0 and 1) ## - alpha: type I error level (default to 0.1 so a 90% CI is calculated) ## ## Return: ## - CI: two indices into the order statistics for the two-sided CI of the quantile ## - coverage: theoretical coverage probability of the CI, which should be close to (1 - alpha). ## a small candidate list of order statistics for the lower/upper limits of the CI of a quantile l <- qbinom(alpha/2, n, p) + (-2:2) + 1 # lower limit candidates u <- qbinom(1 - alpha/2, n, p) + (-2:2) # upper limit candidates ## out-of-bound order statistics correspond to no limit l[l < 0] <- -Inf # no lower limit u[u > n] <- Inf # no upper limit ## for each pair of candidate lower/upper limit, calculate the coverage probability coverage <- outer(l, u, function(l, u) pbinom(u - 1, n, p) - pbinom(l - 1, n, p)) ## if no coverage is above (1 - alpha), choose the max coverage; ## otherwise, look for the smallest coverage that is above (1 - alpha). if (max(coverage) < 1 - alpha) { i <- which(coverage == max(coverage)) } else { i <- which(coverage == min(coverage[coverage >= 1 - alpha])) } j <- i[1] # in case there are multiple candidates ## identify the order statistics and its coverage. L <- rep(l, 5)[j] U <- rep(u, each = 5)[j] return(list(CI = c(L, U), coverage = coverage[j])) } if (F) { # Unit Test alpha <- 0.05 n <- 100 p <- 0.9 quantile.CI(n, p, alpha) ## $CI ## [1] 85 97 ## $coverage ## [1] 0.95227 }
Subscribe to:
Posts (Atom)