Guideline regarding code writing style:

Different persons like to write/organize codes in different ways. I generally followed the following points.

 for(i in 1:10){
  a = i
  b = i + 1
 }

I wrote

 for(i in 1:10)
 {
  a = i
  b = i + 1
 }



We are going to import an example SPLC data in R as a data.table object and fit CoxBoost model to the data (using R package CoxBoost). The data file is in data directory and the analysis result files are saved as R workspace within Results directory. Below, in [1]-[4] we prepare the data for analysis and in [5] we fit the CoxBoost model with user-chosen number of boosting steps and penalty parameter. In [6] and [7], we see how to objectively determine the number of boosting steps and penalty parameter. In practice, we can run [6] and [7] first to get objectively determned values and then use them to fit main CoxBoost model in [5].


[1] Set directories and load all libraries

 # empty workspace
 rm(list=ls())  
 
 # load library
 library(data.table) # for data input/output
 library(mice) # for imputation using mice() function
 library(CoxBoost) # for CoxBoost analysis using CoxBoost() function
 library(kableExtra) # for printing nice looking tables using functions like column_spec()
 library(ggplot2) # for making nice looking compact plots using data.table object
 library(riskRegression) # for fitting Fine and Gray model using FGR()
 library(pec) # for predicting cumulative incidence from fitted CR object using predictEventProb()
 library(prodlim) 
 
 # set directories
 workdir = "/Users/nsanyal/Dropbox/Statistics/Research_Nil/Project-10-CompetingRisk/Introductory_manuals/CB_for_CR/"


[2] Import an example SPLC dataset

 # import data
 D = fread( paste0( workdir, "data/example_SPLC_data.csv" ) )
 dim(D)
## [1] 1000   20


[3] Select variables for analysis

 # exclude never smokers (1=never, 2=former, 3=current)
 D = D[ !D$smkstatus2 == 1,]
 dim(D)
 # [1] 873  20

 
 # Choose variables to include and subset data
 vars.include = c( "Time", "event", "age.ix.cat", "bmi", "chemo_ix", "cigday2", "copd", "edu", "fh", "hist2.ix", "packyears2", "ph", "quityears2", "race.cat", "radiation_ix", "sex", "smkstatus2", "stage2.ix", "surgery_ix", "USPSTF" )   
 data = D[,..vars.include]
 data$age.ix.cat = unname( sapply( data$age.ix.cat, function(x) switch(x, "71-80" = "71.80", "80+" = "80", "61-70" = "61.70", "<55" = "55", "56-60" = "56.60" )  ) )   # replace characters -,< and + by . 


 # Set missing code, if any, to NA. Then, convert categorical variables to factor.
 data$age.ix.cat = relevel( as.factor(data$age.ix.cat), ref='61.70' )   

 data$chemo_ix = ifelse( data$chemo_ix==9, NA, data$chemo_ix )
 data$chemo_ix = relevel( as.factor(data$chemo_ix), ref=1 ) 

 data$copd = ifelse( data$copd==9, NA, data$copd )
 data$copd = as.factor( as.character(data$copd) ) 

 data$edu = relevel( as.factor( as.character(data$edu) ), ref="2") # 2 = college group is reference 

 data$fh = as.factor( as.character(data$fh) ) 

 data$hist2.ix = relevel( as.factor(data$hist2.ix), ref="SQ" ) 

 data$ph = as.factor( as.character(data$ph) ) 

 data$race.cat = relevel( as.factor(data$race.cat), ref="W" ) 

 data$radiation_ix = ifelse( data$radiation_ix==9, NA, data$radiation_ix )
 data$radiation_ix = relevel( as.factor( as.character(data$radiation_ix) ), ref="0") # 0 = No radiotherapy 

 data$sex = relevel( as.factor( as.character(data$sex) ), ref="0") # 0 = Female
 data$smkstatus2 = relevel( as.factor( as.character(data$smkstatus2) ), ref="2") # 2 = Former 

 data$stage2.ix = ifelse( data$stage2.ix==9, NA, data$stage2.ix )
 data$stage2.ix = relevel( as.factor( as.character(data$stage2.ix) ), ref="0") # 0 = No stage 2 primary LC 

 data$surgery_ix = ifelse( data$surgery_ix==9, NA, data$surgery_ix )
 data$surgery_ix = relevel( as.factor(data$surgery_ix), ref=1 ) 

 data$USPSTF = relevel( as.factor( as.character(data$USPSTF) ), ref="0") # 0 = No stage 2 primary LC


[4] Impute data and save in file

 # set a seed
 seed = c(12) 

 # perform single imputation
 mi = mice( data = data, m = 1, maxit = 5, print = T, seed = seed ) 

 # extract imputed dataset
 data.imp = complete(mi,1)

 # # save imputed data
 # save( list = c("mi","data.imp"), file = paste0("Results/data_imp_seed",seed,".Rdata") )


[5] Fit CoxBoost model

 # load imputed data
 load(paste0(workdir,"Results/data_imp_seed12.Rdata"))

 # create covariate matrix and
 mat = data.imp[, -c(1:2)]
 expr = as.formula(paste(" ~ ", paste(colnames(mat), collapse = " + ")))
 X = model.matrix(expr, mat)[, -1]

 # fit CoxBoost model
 fit.cb = CoxBoost( 
           time = data.imp$Time, 
           status = data.imp$event, 
           x = X,                 # n x p matrix of covariates.
           standardize = T,       # whether or not covariates should be standardized for
                                  # estimation. Mandatory covariates are not standardized.
           unpen.index=c(5,25),   # indices of mandatory covariates. here bmi and sex are chosen.
           stepno = 100,          # number of boosting steps (M).
           penalty = 100, 
           criterion = "pscore",  # criterion to be used for selection in each boosting step.
                                  # "pscore" corresponds to the penalized score statistics, "score" to the
                                  # un-penalized score statistics. Different results will only be seen for
                                  # un-standardized covariates ("pscore" will result in preferential
                                  # selection of covariates with larger covariance), or if different
                                  # penalties are used for different covariates.

           return.score = T,      # whether or not the value of the score statistic (or penalized
                                  # score statistic, depending on criterion), as evaluated in each boosting
                                  # step for every covariate, should be returned.

           trace = T              # logical value indicating whether progress in estimation
                                  # should be indicated by printing the name of the covariate updated.
          )  


 # print a summary of the fitted CoxBoost model
 summary(fit.cb)

 # variables contained in the fitted object
 names(fit.cb)

 # coefficient estimates
 dim(fit.cb$coefficients)   # (stepno+1) x p matrix containing the coefficient estimates for the 
                            # (standardized) optional covariates for boosting steps 0 to stepno.

 fit.cb$coefficients[100,]  # check which coefficients are estimated as 0 in the last step
 fit.cb$coefficients[99,]
 fit.cb$coefficients[80,]

 # finally selected and not-selected coefficients
 colnames(X)[ which(fit.cb$coefficients[100,] != 0) ]  # selected
 colnames(X)[ which(fit.cb$coefficients[100,] == 0) ]  # not selected

 # score statistic estimates
 dim(fit.cb$scoremat)   # stepno x (p-p.unpen) matrix containing the value of the score statistic for 
                        # each of the optional covariates before each boosting step.

 which.max(fit.cb$scoremat[1,])   # which coefficient was updated in the first step
 which.max(fit.cb$scoremat[2,])
 which.max(fit.cb$scoremat[3,])
 which.max(fit.cb$scoremat[4,])

 # linear predictor estimates
 dim(fit.cb$linear.predictor)   # (stepno+1) x n matrix giving the linear predictor for boosting steps 
                                # 0 to stepno and every observation.

 # cumulative baseline hazard estimates
 dim(fit.cb$Lambda)   # (stepno+1) x ntime matrix with the Breslow estimate for the cumulative baseline
                      # hazard for boosting steps 0 to stepno for every event time.

 # used penalty values
 fit.cb$penalty

[6] Determining adequate penalty parameter by cross-validation

 # Fit K-fold cross-validation for CoxBoost to determine adequate penalty parameter (to be chosen only very coarsely)
 optim.fit.cb = optimCoxBoostPenalty(time = data.imp$Time, status = data.imp$event, x = X, unpen.index=c(5,25),
                minstepno = 50, maxstepno = 200,   # range of boosting steps in which the “optimal” number 
                                                   # of boosting steps is wanted to be.
                start.penalty = 100,  # start value for the search for the appropriate penalty.
                iter.max = 10, trace = T)

 # the optimal penalty
 optim.fit.cb$penalty

[7] Determining the optimal number of boosting steps by cross-validation

 # Fit K-fold cross-validation for CoxBoost to determine optimal number of boosting steps
 cv.fit.cb = cv.CoxBoost(time = data.imp$Time, status = data.imp$event, x = X, unpen.index=c(5,25),
             maxstepno = 100, K = 10, type = "verweij", trace = T, penalty = 100, 
             criterion = "pscore" ) 
             # maxstepno = maximum number of boosting steps to evaluate. The returned “optimal” number of
             # boosting steps will be in the range [0,maxstepno].

 # plot mean partial log-likelihood
 cv.fit.cb$mean.logplik   # vector of length maxstepno+1 with the mean partial log-likelihood for boosting
                          # steps 0 to maxstepno
 plot(cv.fit.cb$mean.logplik)

 # optimal boosting step number
 cv.fit.cb$optimal.step   # having minimum mean partial log-likelihood.

[9] References and other resources