Eraser


First things first : Guideline regarding code writing style:

Different persons like to write/organize codes in different ways. I generally followed the following points.

 for(i in 1:10){
    a = i
    b = i + 1
 }

I wrote

 for(i in 1:10)
 {
    a = i
    b = i + 1
 }


We are going to import an example SPLC data in R as a data.table object, see how to extract different variables and sort the data in different ways. Then, we will choose variables to include into our analysis and impute the data with those variables only (using R package mice). Then, we will learn to fit penalized regression for competing risk (using R package crrp) under different settings and visualize the estimated regression coefficients for different penalty functions. After that, we will use penalized regression-based estimates/variables in the traditional Fine & Gray (FGR) model structure to perform prediction. Finally, the predictive performance will be evaluated through different measures of discrimination and calibration. (using R packages riskRegression, pec, prodlim).

The data file is in data directory, the source files are in source directory, the analysis result files are saved as R workspace within Results directory, and plots and tables are saved within tables_and_figures directory.


Set directories and load all libraries

# empty workspace
rm(list = ls())

# load library
library(data.table)  # for data input/output
library(mice)  # for imputation using mice() function
library(crrp)  # for penalized regression analysis using gcrrp() function
library(kableExtra)  # for printing nice looking tables using functions like column_spec()
library(ggplot2)  # for making nice looking compact plots using data.table object
library(riskRegression)  # for fitting Fine and Gray model using FGR()
library(pec)  # for predicting cumulative incidence from fitted CR object using predictEventProb()
library(prodlim)

# set directories
workdir = "/Users/nsanyal/Downloads/PR_for_CR/"


Import an example SPLC dataset

D = fread(paste0(workdir, "data/example_SPLC_data.csv"))
dim(D)
## [1] 1000   20


Different ways to extract and sort variables of the data

# copy data.table into another object
D0 = copy(D)  # D0=D will not copy, just will assign another name, so changing one will change the other  

# extract column 'event' as vector
D[, event]  # or D$event  

# extract column 'event' as data.table
D[, "event"]  # or D[,.(event)]   

# extract columns 'Time' and 'event' as data.table
D[, c("Time", "event")]  # or D[,.(Time,event)]  

# extract columns with names saved in a variable
cnames = c("Time", "event", "edu", "sex")
D[, ..cnames]  # or D[,cnames,with=F]  

# sort data by increasing order of 'Time'
setkey(D, Time)  # or setorder(D,Time)   

# sort data by increasing order of 'Time' (first) and 'edu' (next)
setkey(D, Time, edu)  # or setorder(D,Time,edu)  

# sort data by decreasing order of 'Time'
setorder(D, -Time)  # or setorderv(D,'Time',order=-1). setkey allows sorting only by increasing order.  

# sort data by alphabetically ordered column names (convenient for looking up a column name)
setcolorder(D, sort(names(D)))

# reset data as in the beginning
D = copy(D0)


Select variables for analysis

# exclude never smokers (1=never, 2=former, 3=current)
D = D[!D$smkstatus2 == 1, ]
dim(D)
# [1] 6372 102


# Choose variables to include and subset data
vars.include = c("Time", "event", "age.ix.cat", "bmi", "chemo_ix", "cigday2", "copd", "edu", "fh", "hist2.ix", 
    "packyears2", "ph", "quityears2", "race.cat", "radiation_ix", "sex", "smkstatus2", "stage2.ix", "surgery_ix", 
    "USPSTF")
data = D[, ..vars.include]
data$age.ix.cat = unname(sapply(data$age.ix.cat, function(x) switch(x, `71-80` = "71.80", `80+` = "80", 
    `61-70` = "61.70", `<55` = "55", `56-60` = "56.60")))  # replace characters -,< and + by . 


# Set missing code, if any, to NA. Then, convert categorical variables to factor.
data$age.ix.cat = relevel(as.factor(data$age.ix.cat), ref = "61.70")

data$chemo_ix = ifelse(data$chemo_ix == 9, NA, data$chemo_ix)
data$chemo_ix = relevel(as.factor(data$chemo_ix), ref = 1)

data$copd = ifelse(data$copd == 9, NA, data$copd)
data$copd = as.factor(as.character(data$copd))

data$edu = relevel(as.factor(as.character(data$edu)), ref = "2")  # 2 = college group is reference 

data$fh = as.factor(as.character(data$fh))

data$hist2.ix = relevel(as.factor(data$hist2.ix), ref = "SQ")

data$ph = as.factor(as.character(data$ph))

data$race.cat = relevel(as.factor(data$race.cat), ref = "W")

data$radiation_ix = ifelse(data$radiation_ix == 9, NA, data$radiation_ix)
data$radiation_ix = relevel(as.factor(as.character(data$radiation_ix)), ref = "0")  # 0 = No radiotherapy 

data$sex = relevel(as.factor(as.character(data$sex)), ref = "0")  # 0 = Female
data$smkstatus2 = relevel(as.factor(as.character(data$smkstatus2)), ref = "2")  # 2 = Former 

data$stage2.ix = ifelse(data$stage2.ix == 9, NA, data$stage2.ix)
data$stage2.ix = relevel(as.factor(as.character(data$stage2.ix)), ref = "0")  # 0 = No stage 2 primary LC 

data$surgery_ix = ifelse(data$surgery_ix == 9, NA, data$surgery_ix)
data$surgery_ix = relevel(as.factor(data$surgery_ix), ref = 1)

data$USPSTF = relevel(as.factor(as.character(data$USPSTF)), ref = "0")  # 0 = No stage 2 primary LC


Impute data and save in file

seed = c(12)
mi = mice(data = data, m = 1, maxit = 5, print = T, seed = seed)
data.imp = complete(mi, 1)
# save( list = c('mi','data.imp'), file = paste0('Results/data_imp_seed',seed,'.Rdata') )


Penalized regression fit

# load imputed data
load(paste0(workdir, "Results/data_imp_seed12.Rdata"))

# create design matrix and
mat = data.imp[, -c(1:2)]
expr = as.formula(paste(" ~ ", paste(colnames(mat), collapse = " + ")))
X = model.matrix(expr, mat)[, -1]

# create group indicator (for inputing in gcrrp() function) by counting number of groups created in X
group = rep(1:ncol(mat), times = sapply(mat, function(x) {
    if (class(x) == "numeric") return(1) else return(length(unique(x)) - 1)
}))

# fit LASSO
fit.glasso = gcrrp(time = data.imp$Time, fstatus = data.imp$event, failcode = 1, X = X, group = group, 
    penalty = "gLASSO", lambda.min = 0.001, nlambda = 50)  # default choices.
# run str(fit.glasso) to see the structure of fit.glasso object.

# fit adaptive LASSO
fit.gadalasso = gcrrp(time = data.imp$Time, fstatus = data.imp$event, failcode = 1, X = X, group = group, 
    penalty = "gLASSO", weighted = T, lambda.min = 0.001, nlambda = 50)  # default choices.

# fit SCAD
fit.gscad = gcrrp(time = data.imp$Time, fstatus = data.imp$event, failcode = 1, X = X, group = group, 
    penalty = "gSCAD", lambda.min = 0.001, nlambda = 50, gamma = 3.7)  # default choices.

# fit MCP
fit.gmcp = gcrrp(time = data.imp$Time, fstatus = data.imp$event, failcode = 1, X = X, group = group, 
    penalty = "gMCP", lambda.min = 0.001, nlambda = 50, , gamma = 2.7)  # default choices.

# save fits
save(fit.glasso, file = paste0(workdir, "Results/gLASSO_fit_imp_seed12.Rdata"))
save(fit.gadalasso, file = paste0(workdir, "Results/gADALASSO_fit_imp_seed12.Rdata"))
save(fit.gscad, file = paste0(workdir, "Results/gSCAD_fit_imp_seed12.Rdata"))
save(fit.gmcp, file = paste0(workdir, "Results/gMCP_fit_imp_seed12.Rdata"))


Plot penalized regression coefficients

# load fits
load(paste0(workdir, "Results/gLASSO_fit_imp_seed12.Rdata"))
load(paste0(workdir, "Results/gADALASSO_fit_imp_seed12.Rdata"))
load(paste0(workdir, "Results/gSCAD_fit_imp_seed12.Rdata"))
load(paste0(workdir, "Results/gMCP_fit_imp_seed12.Rdata"))

# extract coefficients for both GCV and BIC based tuning parameter selection
beta.glasso.bic = fit.glasso$beta[, which.min(fit.glasso$BIC)]
beta.gadalasso.bic = fit.gadalasso$beta[, which.min(fit.gadalasso$BIC)]
beta.gscad.bic = fit.gscad$beta[, which.min(fit.gscad$BIC)]
beta.gmcp.bic = fit.gmcp$beta[, which.min(fit.gmcp$BIC)]

beta.glasso.gcv = fit.glasso$beta[, which.min(fit.glasso$GCV)]
beta.gadalasso.gcv = fit.gadalasso$beta[, which.min(fit.gadalasso$GCV)]
beta.gscad.gcv = fit.gscad$beta[, which.min(fit.gscad$GCV)]
beta.gmcp.gcv = fit.gmcp$beta[, which.min(fit.gmcp$GCV)]


# make coefficient x method table
beta.all.bic = cbind(beta.glasso.bic, beta.gadalasso.bic, beta.gscad.bic, beta.gmcp.bic)
beta.all.gcv = cbind(beta.glasso.gcv, beta.gadalasso.gcv, beta.gscad.gcv, beta.gmcp.gcv)

kable(cbind(beta.all.bic, beta.all.gcv), row.names = T, caption = "GCV and BIC based variable selection for cause 1", 
    linesep = "") %>% column_spec(5, border_right = T) %>% kable_styling(full_width = F)
GCV and BIC based variable selection for cause 1
beta.glasso.bic beta.gadalasso.bic beta.gscad.bic beta.gmcp.bic beta.glasso.gcv beta.gadalasso.gcv beta.gscad.gcv beta.gmcp.gcv
age.ix.cat55 0.0000000 0.0000000 0.000000 0.000000 -1.4575606 -2.6698418 -6.8191033 -6.8150809
age.ix.cat56.60 0.0000000 0.0000000 0.000000 0.000000 0.0747861 -0.0485878 -0.1423634 -0.1618113
age.ix.cat71.80 0.0000000 0.0000000 0.000000 0.000000 -0.3730483 -0.5435110 -0.6720156 -0.6657066
age.ix.cat80 0.0000000 0.0000000 0.000000 0.000000 -0.0992690 -0.1745293 -0.2088444 -0.1993495
bmi 0.0000000 0.0000000 0.000000 0.000000 0.0233010 0.0149302 0.0406255 0.0414460
chemo_ix1 0.0000000 0.0000000 0.000000 0.000000 -0.8936411 -1.0492702 -1.2220898 -1.2302037
cigday2 0.0000000 0.0000000 0.000000 0.000000 0.0000000 0.0000000 0.0000000 0.0000000
copd1 0.0000000 0.0000000 0.000000 0.000000 0.0052276 0.0000000 0.0276097 0.0000000
edu1 0.0000000 0.0000000 0.000000 0.000000 0.1257211 0.0148201 0.1081530 0.1119839
edu3 0.0000000 0.0000000 0.000000 0.000000 0.3815423 0.0459760 0.2882186 0.2879215
fh1 0.0000000 0.0000000 0.000000 0.000000 -0.4381514 -0.4582016 -0.4893955 -0.5254502
hist2.ixAD 0.0000000 0.0000000 0.000000 0.000000 0.8633487 1.1292301 1.1764403 1.1829045
hist2.ixLC 0.0000000 0.0000000 0.000000 0.000000 1.1540491 1.5064344 1.6203130 1.6199115
hist2.ixOTH 0.0000000 0.0000000 0.000000 0.000000 0.5612671 0.7131332 0.7077201 0.7055595
hist2.ixSC 0.0000000 0.0000000 0.000000 0.000000 -0.4435133 -1.7466435 -5.4763283 -5.4651681
packyears2 0.0000000 0.0000000 0.000000 0.000000 -0.0010113 0.0000000 0.0000000 0.0000000
ph1 0.0000000 0.0000000 0.000000 0.000000 0.0103285 0.0000000 0.0000000 0.0000000
quityears2 0.0000000 0.0000000 0.000000 0.000000 0.0000000 0.0000000 0.0000000 0.0000000
race.catA 0.0000000 0.0000000 0.000000 0.000000 -0.0226699 -0.0784260 -0.0459666 -0.0669693
race.catB 0.0000000 0.0000000 0.000000 0.000000 -0.2293959 -0.3210262 -0.4237903 -0.4832460
race.catH 0.0000000 0.0000000 0.000000 0.000000 -0.5393309 -0.8860188 -0.9087899 -1.0745480
race.catHW 0.0000000 0.0000000 0.000000 0.000000 0.4481362 0.6980772 0.7266344 0.7387063
race.catO 0.0000000 0.0000000 0.000000 0.000000 -1.5362620 -3.2426866 -3.2254721 -6.8849971
radiation_ix1 0.0000000 0.0000000 0.000000 0.000000 0.4642847 0.5082836 0.6016434 0.5908213
sex1 0.0000000 0.0000000 0.000000 0.000000 -0.1925784 -0.1173216 -0.1553838 -0.1590882
smkstatus23 0.0000000 0.0000000 0.000000 0.000000 0.5170287 0.6486791 0.7606614 0.7768355
stage2.ix1 -0.7600735 -0.7120922 0.000000 0.000000 -1.4745838 -1.6264616 -1.6413689 -1.6396480
surgery_ix1 1.2753809 0.9382062 2.342946 2.343131 1.3680460 1.4165489 1.4773960 1.4743209
USPSTF1 0.0000000 0.0000000 0.000000 0.000000 -0.6529241 -0.9045143 -1.0360865 -1.0507383
# do line-plot of GCV based coefficient estimates
range = range(c(beta.all.bic, beta.all.gcv))

par(oma = c(4, 2, 0, 0))
plot(beta.glasso.gcv, main = "GCV based estimates for cause 1", ylim = range, pch = 1, xaxt = "n", xlab = "", 
    ylab = "beta estimates from crrp", yaxt = "n")
points(beta.gadalasso.gcv, col = "yellow", pch = 20)
points(beta.gscad.gcv, col = "red", pch = 1)
points(beta.gmcp.gcv, col = "green", pch = 20)
axis(side = 1, at = 1:length(beta.glasso.gcv), labels = names(beta.glasso.gcv), las = 3)
axis(side = 2, at = c(-2, -1, -0.5, -0.25, 0, 0.25, 0.5, 1, 2), las = 2)
abline(h = c(-2, -1, -0.5, -0.25, 0, 0.25, 0.5, 1, 2), lty = 3)
abline(v = 1:length(beta.glasso.gcv), lty = 3, col = "grey")
legend("bottomleft", legend = c("gLASSO", "gADLASSO", "gSCAD", "gMCP"), pch = c(1, 20, 1, 20), col = c("black", 
    "yellow", "red", "green"), horiz = T)

# do bar-plot of GCV based coefficient estimates that are > 0.1 in magnitude
vars.chosen = names(which(apply(beta.all.gcv, 1, function(x) {
    any(abs(x) > 0.1)
})))
beta.vars.chosen.gcv = beta.all.gcv[rownames(beta.all.gcv) %in% vars.chosen, ]
colnames(beta.vars.chosen.gcv) = c("gLASSO", "gADALASSO", "gSCAD", "gMCP")
beta.vars.chosen.gcv.df = as.data.table(beta.vars.chosen.gcv)
beta.vars.chosen.gcv.df[, `:=`(covariates, rownames(beta.vars.chosen.gcv))]
beta.vars.chosen.gcv.df.melt = melt(beta.vars.chosen.gcv.df, id.var = c("covariates"))

ggplot(beta.vars.chosen.gcv.df.melt) + geom_col(aes(x = variable, y = value, fill = variable), position = "dodge", 
    show.legend = F) + ylab("Coeff. Estimates") + xlab("Penalized regression penalty") + facet_wrap(covariates ~ 
    ., nrow = 4) + theme(axis.text.x = element_text(angle = 45, hjust = 1.1, size = 12), axis.text.y = element_text(size = 12), 
    axis.title.x = element_text(size = 15), axis.title.y = element_text(size = 15), strip.text.x = element_text(size = 12))

# save coefficients
namecombs = as.character(Reduce(function(X, Y) outer(X, Y, FUN = paste, sep = "."), list(c("beta"), c("glasso", 
    "gadalasso", "gscad", "gmcp"), c("gcv", "bic"))))
allcoef = data.frame(coef = names(beta.glasso.bic), mget(namecombs))
# fwrite( allcoef, paste0( 'Results/PR_coef_imp_seed12.csv' ) )


How to incorporate penalized regression results into Fine & Gray (FGR) model based prediction

There are two ways:

  • Method 1 (FGR by penalty): Fit FGR model with all variables to obtain an FGR object. But then within that FGR object, substitute the FGR coefficients by the penalized regression coefficients. Then, use the FGR object to predict.

  • Method 2 (FGR with penalty): Fit FGR model with only those variables which are selected (i.e., |coef|>a given threshold) by penalized regression. Then, use the FGR object to predict.

In Method 1, we are using penalized regression for both variable selection and estimation. In Method 2, we are using penalized regression only for variable seelction whereas estimation is done by FGR.


Method 1: Fit Fine-and-Gray (FGR) model

# fix formula
formula.FGR = as.formula(paste("Hist(Time,event) ~", paste0(colnames(X), collapse = " +")))

# build input data frame
df.FGR = data.frame(Time = data.imp$Time, event = data.imp$event, X)

# fit FGR model
fit.FGR = FGR(formula.FGR, cause = 1, data = df.FGR)
fit.FGR$call$formula = formula.FGR


Method 1: Substitution of FGR coefficients by penalized regression coefficients

fit.FGR.by_glasso_gcv = fit.FGR.by_gadalasso_gcv = fit.FGR.by_gscad_gcv = fit.FGR.by_gmcp_gcv = fit.FGR

fit.FGR.by_glasso_gcv$crrFit$coef = allcoef$beta.glasso.gcv
fit.FGR.by_gadalasso_gcv$crrFit$coef = allcoef$beta.gadalasso.gcv
fit.FGR.by_gscad_gcv$crrFit$coef = allcoef$beta.gscad.gcv
fit.FGR.by_gmcp_gcv$crrFit$coef = allcoef$beta.gmcp.gcv


Method 1: Prediction

# set time-points for prediction and evaluation
times = c(5, 10, 15)  # time-points to predict at
ref.time = 10  # time-point to use for risk stratification and decision analysis later

# prediction using ordinary FGR and Method 1
pred.FGR = predictEventProb(fit.FGR, newdata = df.FGR, cause = 1, times = times)
pred.FGR.by_glasso_gcv = predictEventProb(fit.FGR.by_glasso_gcv, newdata = df.FGR, cause = 1, times = times)
pred.FGR.by_gadalasso_gcv = predictEventProb(fit.FGR.by_gadalasso_gcv, newdata = df.FGR, cause = 1, times = times)
pred.FGR.by_gscad_gcv = predictEventProb(fit.FGR.by_gscad_gcv, newdata = df.FGR, cause = 1, times = times)
pred.FGR.by_gmcp_gcv = predictEventProb(fit.FGR.by_gmcp_gcv, newdata = df.FGR, cause = 1, times = times)


Method 1: Prediction evaluation

# load prediction evaluation source code
source(paste0(workdir, "source/source_general.R"))
source(paste0(workdir, "source/source_predbased.R"))


# FGR prediction evaluation.
cindex.FGR = cindex_predbased(pred.FGR, formula.FGR, df.FGR, times)$AppCindex[[1]]  # Harrell-type C-index
sc = Score_predbased(pred = getpred_for_score(pred.FGR, df.FGR, cause = 1, times), "FGR", Hist(Time, 
    event) ~ 1, df.FGR, cause = 1, times = times, plots = "cal")
auc.FGR = sc$AUC$score  # time-dependent (time-varying) area under the ROC (AUROC)
brier.FGR = sc$Brier$score  # time-dependent (time-varying) Brier score

pdf(paste0(workdir, "tables_and_figures/FGR_performance_imp_seed12.pdf"))

### Calibration plot using pec package
j = 2
x = calPlot_predbased(pred = pred.FGR[, j], formula = formula.FGR, cause = 1, data = df.FGR, time = times[j], 
    method = "quantile", q = 10, ylim = c(0, 0.2), xlim = c(0, 0.2))
calplotFrame = x$plotFrame
prd = as.vector(calplotFrame[, "Pred"])
dif = as.vector(calplotFrame[, "Pred"]) - as.vector(calplotFrame[, "Obs"])
x = plot(prd, dif, xlab = paste0("Predicted ", ref.time, "-year Risk (quantile)"), ylab = "Pred. prob. - Obsd. prob.", 
    pch = 1, col = "blue", cex.lab = 1, cex.axis = 1, cex.main = 1, ylim = c(-0.05, 0.05))
abline(h = 0, col = "red", lty = "dotted")

### Calibration plot using riskRegression package
plotCalibration(sc, ylim = c(0, 0.5), xlim = c(0, 0.5), cens.method = "local", xlab = "Pred. risk", ylab = "Obsd. freq.", 
    mgp = c(3, 3, 0), round = T)

### Decile plot using estimated probability (akin to HL method)
x = myDecilePlot2(k = 10, event = df.FGR$event, Time = df.FGR$Time, PROB = pred.FGR[, j], minTime = 1, 
    maxTime = 20, YLIM = c(0, 0.15), MAIN = paste("Obsd. SPLC incidence by decile of estimated risk"))

### Decision curve analysis
df.FGR1 = as.data.frame(df.FGR)  # Without this stdca won't work
df.FGR1$risk = pred.FGR[, j]
x = stdca(data = df.FGR1, outcome = "event", ttoutcome = "Time", timepoint = 10, xby = 0.03, predictors = "risk", 
    cmprsk = T, probability = T, xstop = 0.2)
title(paste("Decision Curve Analysis", cex = 1))
abline(v = 0.01, col = "gray", lty = "dotted")
legend("left", legend = c("Risk-based", "Screen All", "Screen None"), col = c("red", "blue", "dark green"), 
    lty = c("solid", "dotted", "solid"), lwd = 2, cex = 1, bg = "white")

mtext("fit.FGR", outer = TRUE, line = -2, cex = 1.5)
dev.off()




# LASSO-based FGR prediction evaluation. Save plots.
cindex.FGR.by_glasso = cindex_predbased(pred.FGR.by_glasso_gcv, formula.FGR, df.FGR, times)$AppCindex[[1]]
sc = Score_predbased(pred = getpred_for_score(pred.FGR.by_glasso_gcv, df.FGR, cause = 1, times), "FGR", 
    Hist(Time, event) ~ 1, df.FGR, cause = 1, times = times, plots = "cal")
auc.FGR.by_glasso = sc$AUC$score
brier.FGR.by_glasso = sc$Brier$score



# ADALASSO-based FGR prediction evaluation. Save plots.
cindex.FGR.by_gadalasso = cindex_predbased(pred.FGR.by_gadalasso_gcv, formula.FGR, df.FGR, times)$AppCindex[[1]]
sc = Score_predbased(pred = getpred_for_score(pred.FGR.by_gadalasso_gcv, df.FGR, cause = 1, times), "FGR", 
    Hist(Time, event) ~ 1, df.FGR, cause = 1, times = times, plots = "cal")
auc.FGR.by_gadalasso = sc$AUC$score
brier.FGR.by_gadalasso = sc$Brier$score



# SCAD-based FGR prediction evaluation. Save plots.
cindex.FGR.by_gscad = cindex_predbased(pred.FGR.by_gscad_gcv, formula.FGR, df.FGR, times)$AppCindex[[1]]
sc = Score_predbased(pred = getpred_for_score(pred.FGR.by_gscad_gcv, df.FGR, cause = 1, times), "FGR", 
    Hist(Time, event) ~ 1, df.FGR, cause = 1, times = times, plots = "cal")
auc.FGR.by_gscad = sc$AUC$score
brier.FGR.by_gscad = sc$Brier$score



# MCP-based FGR prediction evaluation. Save plots.
cindex.FGR.by_gmcp = cindex_predbased(pred.FGR.by_gmcp_gcv, formula.FGR, df.FGR, times)$AppCindex[[1]]
sc = Score_predbased(pred = getpred_for_score(pred.FGR.by_gmcp_gcv, df.FGR, cause = 1, times), "FGR", 
    Hist(Time, event) ~ 1, df.FGR, cause = 1, times = times, plots = "cal")
auc.FGR.by_gmcp = sc$AUC$score
brier.FGR.by_gmcp = sc$Brier$score




# print all validation measures (C-index,AUC,Brier)
mget(grep("cindex", ls(), value = T)[-1])
mget(grep("auc", ls(), value = T))
mget(grep("brier", ls(), value = T))


Method 2: Fit Fine-and-Gray (FGR) model with penalized regression selected variables

# Fix selection threshold for penalized regression
sel.thres = 0.01

# LASSO-based selection and FGR fit
glasso.varsel = allcoef$coef[which(abs(allcoef$beta.glasso.gcv) > sel.thres)]
formula.FGR.glasso = as.formula(paste("Hist(Time,event) ~", paste0(glasso.varsel, collapse = " +")))
df.FGR.glasso = data.frame(Time = data.imp$Time, event = data.imp$event, X[, glasso.varsel])
fit.FGR.with_glasso_gcv = FGR(formula.FGR.glasso, cause = 1, data = df.FGR.glasso)
fit.FGR.with_glasso_gcv$call$formula = formula.FGR.glasso

# ADALASSO-based selection and FGR fit
gadalasso.vars = allcoef$coef[which(abs(allcoef$beta.gadalasso.gcv) > sel.thres)]
formula.FGR.gadalasso = as.formula(paste("Hist(Time,event) ~", paste0(gadalasso.vars, collapse = " +")))
df.FGR.gadalasso = data.frame(Time = data.imp$Time, event = data.imp$event, X[, gadalasso.vars])
fit.FGR.with_gadalasso_gcv = FGR(formula.FGR.gadalasso, cause = 1, data = df.FGR.gadalasso)
fit.FGR.with_gadalasso_gcv$call$formula = formula.FGR.gadalasso

# SCAD-based selection and FGR fit
gscad.vars = allcoef$coef[which(abs(allcoef$beta.gscad.gcv) > sel.thres)]
formula.FGR.gscad = as.formula(paste("Hist(Time,event) ~", paste0(gscad.vars, collapse = " +")))
df.FGR.gscad = data.frame(Time = data.imp$Time, event = data.imp$event, X[, gscad.vars])
fit.FGR.with_gscad_gcv = FGR(formula.FGR.gscad, cause = 1, data = df.FGR.gscad)
fit.FGR.with_gscad_gcv$call$formula = formula.FGR.gscad

# MCP-based selection and FGR fit
gmcp.vars = allcoef$coef[which(abs(allcoef$beta.gmcp.gcv) > sel.thres)]
formula.FGR.gmcp = as.formula(paste("Hist(Time,event) ~", paste0(gmcp.vars, collapse = " +")))
df.FGR.gmcp = data.frame(Time = data.imp$Time, event = data.imp$event, X[, gmcp.vars])
fit.FGR.with_gmcp_gcv = FGR(formula.FGR.gmcp, cause = 1, data = df.FGR.gmcp)
fit.FGR.with_gmcp_gcv$call$formula = formula.FGR.gmcp


Method 2: Prediction

# set time-points for prediction and evaluation
times = c(5, 10, 15)  # time-points to predict at
ref.time = 10  # time-point to use for risk stratification and decision analysis later

# prediction using ordinary FGR and Method 1
pred.FGR.with_glasso_gcv = predictEventProb(fit.FGR.with_glasso_gcv, newdata = df.FGR.glasso, cause = 1, 
    times = times)
pred.FGR.with_gadalasso_gcv = predictEventProb(fit.FGR.with_gadalasso_gcv, newdata = df.FGR.gadalasso, 
    cause = 1, times = times)
pred.FGR.with_gscad_gcv = predictEventProb(fit.FGR.with_gscad_gcv, newdata = df.FGR.gscad, cause = 1, 
    times = times)
pred.FGR.with_gmcp_gcv = predictEventProb(fit.FGR.with_gmcp_gcv, newdata = df.FGR.gmcp, cause = 1, times = times)


Method 2: Prediction evaluation

# load prediction evaluation source code
source(paste0(workdir, "source/source_general.R"))
source(paste0(workdir, "source/source_predbased.R"))


# LASSO-based FGR prediction evaluation. Save plots.
cindex.FGR.with_glasso = cindex_predbased(pred.FGR.with_glasso_gcv, formula.FGR.glasso, df.FGR.glasso, 
    times)$AppCindex[[1]]
sc = Score_predbased(pred = getpred_for_score(pred.FGR.with_glasso_gcv, df.FGR.glasso, cause = 1, times), 
    "FGR", Hist(Time, event) ~ 1, df.FGR.glasso, cause = 1, times = times, plots = "cal")
auc.FGR.with_glasso = sc$AUC$score
brier.FGR.with_glasso = sc$Brier$score



# ADALASSO-based FGR prediction evaluation. Save plots.
cindex.FGR.with_gadalasso = cindex_predbased(pred.FGR.with_gadalasso_gcv, formula.FGR.gadalasso, df.FGR.gadalasso, 
    times)$AppCindex[[1]]
sc = Score_predbased(pred = getpred_for_score(pred.FGR.with_gadalasso_gcv, df.FGR.gadalasso, cause = 1, 
    times), "FGR", Hist(Time, event) ~ 1, df.FGR.gadalasso, cause = 1, times = times, plots = "cal")
auc.FGR.with_gadalasso = sc$AUC$score
brier.FGR.with_gadalasso = sc$Brier$score



# SCAD-based FGR prediction evaluation. Save plots.
cindex.FGR.with_gscad = cindex_predbased(pred.FGR.with_gscad_gcv, formula.FGR.gscad, df.FGR.gscad, times)$AppCindex[[1]]
sc = Score_predbased(pred = getpred_for_score(pred.FGR.with_gscad_gcv, df.FGR.gscad, cause = 1, times), 
    "FGR", Hist(Time, event) ~ 1, df.FGR.gscad, cause = 1, times = times, plots = "cal")
auc.FGR.with_gscad = sc$AUC$score
brier.FGR.with_gscad = sc$Brier$score



# MCP-based FGR prediction evaluation. Save plots.
cindex.FGR.with_gmcp = cindex_predbased(pred.FGR.with_gmcp_gcv, formula.FGR.gmcp, df.FGR.gmcp, times)$AppCindex[[1]]
sc = Score_predbased(pred = getpred_for_score(pred.FGR.with_gmcp_gcv, df.FGR.gmcp, cause = 1, times), 
    "FGR", Hist(Time, event) ~ 1, df.FGR.gmcp, cause = 1, times = times, plots = "cal")
auc.FGR.with_gmcp = sc$AUC$score
brier.FGR.with_gmcp = sc$Brier$score


# print all validation measures (C-index,AUC,Brier)
mget(grep("cindex", ls(), value = T)[-1])
mget(grep("auc", ls(), value = T))
mget(grep("brier", ls(), value = T))