1 year ago
#343772
Francesco Pisu
Precision-recall curve with bootstrapped confidence-interval with R and pROC
I have ground-truth labels and predicted probabilities and I want to generate a precision-recall (PR) curve with bootstrapped confidence intervals. As far as I know, there is no R package that allows one to generate such PR curves with confidence intervals.
I'm using R with the pROC package and this is what I achieved so far. The confidence area looks too spiky. Did I get something wrong ?
Minimal reproducible example:
library(pROC)
library(ggplot2)
# external is a dataframe with columns target (ground-truths) and proba (predicted probabilities).
target <- c(0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0,
1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0,
0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0)
proba <- c(0.3788080, 0.2307075, 0.3632901, 0.2151600, 0.2219767, 0.2151600,
0.2307075, 0.4866651, 0.2122365, 0.3765080, 0.6658652, 0.3925962, 0.2267252,
0.6605233, 0.2122365, 0.2232976, 0.2388451, 0.2348628, 0.2151600, 0.6961893,
0.6602093, 0.2151600, 0.4539989, 0.3632901, 0.4411348, 0.2307075, 0.3672428,
0.2151600, 0.3632901, 0.2267252, 0.2122365, 0.3649428, 0.4041614, 0.2151600,
0.3672428, 0.6833584, 0.2151600, 0.3765080, 0.3925962, 0.4382865, 0.2151600,
0.2122365, 0.2203741, 0.6605233, 0.6980382, 0.3748553, 0.6409143, 0.3632901,
0.5391744, 0.4411348, 0.4052658, 0.3632901, 0.2203741, 0.2348628, 0.6744679,
0.3765080, 0.6514071, 0.2267252, 0.4782437, 0.6578743, 0.3632901, 0.2232976,
0.2151600, 0.2151600, 0.2151600, 0.3649428, 0.3632901, 0.3788080, 0.3632901,
0.2232976, 0.3748553, 0.2122365, 0.2232976, 0.6510241, 0.2151600, 0.2232976,
0.3748553, 0.2151600, 0.2232976, 0.6386143, 0.3748553, 0.2307075, 0.4321651,
0.3649428, 0.5446964, 0.3655901, 0.2151600, 0.2388451, 0.3655901, 0.3788080,
0.2267252, 0.3649428, 0.3735473, 0.6477646, 0.2151600, 0.2307075, 0.3632901,
0.3788080, 0.2151600, 0.3672428, 0.3748553, 0.2203741, 0.3632901, 0.3655901,
0.3748553, 0.2203741)
external <- as.data.frame(cbind(target, proba))
# generate ROC curve
roc1 <- pROC::roc(external$target, external$proba, plot=FALSE,
legacy.axes=TRUE, percent=FALSE)
# compute recall and precision at each ROC curve's threshold
prcoords <- pROC::coords(roc1, "all", ret = c("threshold", "recall", "precision"), transpose = FALSE)
# compute 95% confidence intervals for recall and precision at each threshold
pr.cis <- pROC::ci.coords(roc1, prcoords$threshold, ret=c("recall", "precision"))
# build a dataframe where columns x contains recall values and lower/upper columns
# contain lower and upper bounds of corresponding precision values.
pr.cis <- data.frame(pr.cis[2]) # convert precision coords to data frame
pr.cis.df <- data.frame(x = prcoords$recall,
lower = pr.cis[, 1],
upper = pr.cis[, 3])
# plot precision recall coordinates along with confidence area
ggplot(prcoords, aes(recall, precision)) +
geom_path(aes(recall, precision), colour="salmon") + # needed to connect points in order of appearance
geom_ribbon(aes(x=pr.cis.df$x, ymin=pr.cis.df$lower, ymax=pr.cis.df$upper),
alpha=0.3,
fill="lightblue") +
theme(aspect.ratio = 1) +
theme(panel.background = element_blank(),
axis.line = element_line(colour = "black"))
r
proc
precision-recall
0 Answers
Your Answer