Random forest analysis: Appropriate Pain Behaviours Questionniare (APBQ)-Male

Peter Kamerman

First version: January 28, 2016

Latest version: September 05, 2016

Session setup

# Load packages
library(ggplot2)
library(scales)
library(grid)
library(cowplot)

## Warning: package 'cowplot' was built under R version 3.2.5

library(readr)

## Warning: package 'readr' was built under R version 3.2.5

library(dplyr)

## Warning: package 'dplyr' was built under R version 3.2.5

library(tidyr)

## Warning: package 'tidyr' was built under R version 3.2.5

library(knitr)

## Warning: package 'knitr' was built under R version 3.2.5

library(party)

## Warning: package 'zoo' was built under R version 3.2.5

# Palette
palette = c('#000000', '#FF0000')

# knitr chunk options
opts_chunk$set(echo = TRUE,
               warning = TRUE,
               message = FALSE,
               cache = TRUE,
               fig.path = './figures/apbq-male/',
               fig.width = 11.69,
               fig.height = 8.27,
               dev = c('png', 'pdf'),
               tidy = FALSE)

Load data

data <- read_csv('./data/random-forest.csv')

Quick look

dim(data)

## [1] 212  14

names(data)

##  [1] "ID"         "CPT"        "PPT"        "Race"       "Sex"       
##  [6] "Anxiety"    "Depression" "PCS_trait"  "PCS_state"  "APBQ-F"    
## [11] "APBQ-M"     "Education"  "Assets"     "Employment"

head(data)

## # A tibble: 6 × 14
##      ID   CPT   PPT  Race   Sex Anxiety Depression PCS_trait PCS_state
##   <int> <int> <int> <chr> <chr>   <dbl>      <dbl>     <int>     <int>
## 1     1   217   874     W     M     1.5       1.00        11        10
## 2     2   300  1154     W     M     2.4       1.53        15        20
## 3     4    39   741     B     F     2.3       1.60        21        11
## 4     5    53  1100     B     F     2.3       1.40        23        24
## 5     6   300  1100     B     M     1.3       1.27        13         5
## 6     7   300  1249     W     M     1.4       1.13         3         2
## # ... with 5 more variables: `APBQ-F` <dbl>, `APBQ-M` <dbl>,
## #   Education <int>, Assets <dbl>, Employment <int>

tail(data)

## # A tibble: 6 × 14
##      ID   CPT   PPT  Race   Sex Anxiety Depression PCS_trait PCS_state
##   <int> <int> <int> <chr> <chr>   <dbl>      <dbl>     <int>     <int>
## 1   254   111   664     W     F     1.6       1.80        12         8
## 2   255    15   524     B     F     2.2       2.13        NA        NA
## 3   258   300  1126     B     M     1.2       1.20        NA         8
## 4   259    70  1117     B     F     1.7       2.00         7        10
## 5   260    53   892     B     F     1.4         NA        37        12
## 6   262    41   471     B     M     1.0       1.27        27        NA
## # ... with 5 more variables: `APBQ-F` <dbl>, `APBQ-M` <dbl>,
## #   Education <int>, Assets <dbl>, Employment <int>

Process data

# Clean data
data <- data %>%
    mutate(Ancestry = factor(Race),
           APBQM = `APBQ-M`,
           Sex = factor(Sex),
           Education = factor(Education, ordered = TRUE)) %>%
    select(-c(ID, CPT, PPT, Race, Depression, Anxiety, 
              PCS_trait, PCS_state, `APBQ-F`, `APBQ-M`))
# Complete X and Y variable dataset
data_complete <- data[complete.cases(data), ] 
# Length of full dataset (with NAs)
nrow(data)

## [1] 212

# Length of complete cases dataset
nrow(data_complete)

## [1] 190

glimpse(data_complete)

## Observations: 190
## Variables: 6
## $ Sex        <fctr> M, M, F, F, M, M, F, M, M, F, M, M, F, F, F, M, F,...
## $ Education  <ord> 2, 3, 3, 3, 3, 3, 3, 3, 2, 3, 3, 3, 3, 3, 2, 3, 3, ...
## $ Assets     <dbl> 1.0, 1.0, 1.0, 1.0, 0.6, 1.0, 1.0, 1.0, 1.0, 1.0, 1...
## $ Employment <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ Ancestry   <fctr> W, W, B, B, B, W, W, W, W, W, W, W, W, W, W, W, W,...
## $ APBQM      <dbl> 1.8, 3.0, 3.7, 1.5, -3.1, 3.4, 5.2, 3.7, 5.5, 3.0, ...

Simple single tree

single_tree <- ctree(APBQM ~ ., data = data_complete)
plot(single_tree)

Random Forest

# Set random seeds (used sampling on first run only)
# seed_1 <- sample(1:10000, 1); seed_1
# seed_2 <- sample(1:10000, 1); seed_2
seed_1 <- 7534
seed_2 <- 597
# Data controls
## mtry estimated as sqrt of variables 
data.control_1 <- cforest_unbiased(ntree = 500, mtry = 3) 
data.control_2 <- cforest_unbiased(ntree = 2000, mtry = 3) 
# Model 1
#########
set.seed(seed_1)
# ntree = 500, mtry = 3, seed = seed_1
# Modelling 
model_1 <- cforest(APBQM ~ .,
                    data = data_complete,
                    controls = data.control_1) 
model_1_varimp <- varimp(model_1, conditional = TRUE)

# Model 2
#########
# ntree = 2000, mtry = 3, seed = seed_1
# Modelling 
model_2 <- cforest(APBQM ~ .,
                    data = data_complete,
                    controls = data.control_2) 
model_2_varimp <- varimp(model_2, conditional = TRUE)

# Model 3
#########
# Set seed
set.seed(seed_2)
# ntree = 500, mtry = 3, seed = seed_2
# Modelling 
model_3 <- cforest(APBQM ~ .,
                    data = data_complete,
                    controls = data.control_1) 
model_3_varimp <- varimp(model_3, conditional = TRUE)

# Model 4
#########
# ntree = 2000, mtry = 3, seed = seed_2
# Modelling 
model_4 <- cforest(APBQM ~ .,
                    data = data_complete,
                    controls = data.control_2) 
model_4_varimp <- varimp(model_4, conditional = TRUE)

Plots

## Generate plot dataframe
plot_list <- list(model_1_varimp, model_2_varimp, 
                  model_3_varimp, model_4_varimp)
plot_list <- lapply(plot_list, function(x)
    data.frame(Variable = names(x), Importance = x, row.names = NULL))
plot_df <- do.call(cbind, plot_list)
plot_df <- plot_df[ , c(1, 2, 4, 6, 8)]
names(plot_df) <- c('Variable', 'Model_1', 'Model_2', 'Model_3', 'Model_4')
plot_df <- plot_df %>%
    gather(Model, Value, -Variable) %>%
    mutate(Model = factor(Model)) %>%
    group_by(Model) %>%
    arrange(desc(Value)) %>%
    mutate(Important = Value > abs(min(Value)))
## Dataframe of variable importance thresholds
v_importance <- plot_df %>%
    summarise(Threshold = abs(min(Value)))
## Vector to order x axis variables
x_order <- plot_df %>%
    group_by(Variable) %>%
    summarise(mean = mean(Value)) %>%
    arrange(mean) %>%
    mutate(Variable = factor(Variable, Variable, ordered = TRUE)) 
x_order <- x_order$Variable
## Vector to label x variables
x_labs <- c(Education = 'Education',
            Assets = 'Household assets',
            Sex = 'Sex',
            Ancestry = 'Ancestry',
            Employment = 'Employment status')
## Vector of facet labels
f_labels <- c(Model_1 = 'Model 1\n(trees built: 500, seed: 7534)',
              Model_2 = 'Model 2\n(trees built: 2000, seed: 7534)',
              Model_3 = 'Model 3\n(trees built: 500, seed: 597)',
              Model_4 = 'Model 4\n(trees built: 2000, seed: 597)')

## Plot
ggplot(data = plot_df, aes(
                  x = Variable,
                  y = Value,
                  colour = Important,
                  fill = Important)) +
    geom_point(size = 4, 
               shape = 21) +
    geom_hline(data = v_importance, 
               aes(yintercept = Threshold), 
               linetype = 'dashed',
               size = 0.8) +
    facet_wrap(~ Model, 
               labeller = labeller(Model = f_labels)) +
    labs(y = 'Variable importance (arbitrary units)\n') +
    scale_x_discrete(labels = x_labs,
                     limits = x_order) +
    scale_colour_manual(values = palette) +
    scale_fill_manual(values = palette) +
    theme(legend.position = 'none',
          plot.margin = unit(c(1, 3, 1, 3), 'lines'),
          panel.margin.x = unit(2, 'lines'),
          axis.title = element_text(size = 18),
          axis.title.x = element_blank(),
          axis.text = element_text(size = 18),
          axis.text.x = element_text(angle = 30, hjust = 1),
          axis.line = element_line(size = 0.9),
          axis.ticks = element_line(size = 0.9),
          strip.text = element_text(size = 14))

Session information

sessionInfo()

## R version 3.2.4 (2016-03-10)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: OS X 10.11.6 (El Capitan)
## 
## locale:
## [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
## 
## attached base packages:
## [1] stats4    grid      stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] party_1.0-25      strucchange_1.5-1 sandwich_2.3-4   
##  [4] zoo_1.7-13        modeltools_0.2-21 mvtnorm_1.0-5    
##  [7] knitr_1.14        tidyr_0.6.0       dplyr_0.5.0      
## [10] readr_1.0.0       cowplot_0.6.2     scales_0.4.0     
## [13] ggplot2_2.1.0    
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.12.6      formatR_1.4      plyr_1.8.4       tools_3.2.4     
##  [5] digest_0.6.10    evaluate_0.9     tibble_1.2       gtable_0.2.0    
##  [9] lattice_0.20-33  Matrix_1.2-6     DBI_0.5          yaml_2.1.13     
## [13] coin_1.1-2       stringr_1.1.0    R6_2.1.3         survival_2.39-5 
## [17] rmarkdown_1.0    multcomp_1.4-6   TH.data_1.0-7    magrittr_1.5    
## [21] codetools_0.2-14 htmltools_0.3.5  splines_3.2.4    MASS_7.3-45     
## [25] assertthat_0.1   colorspace_1.2-6 labeling_0.3     stringi_1.1.1   
## [29] lazyeval_0.2.0   munsell_0.4.3

pain-sex-ethnicity

Random forest analysis of predictors of pain sensitivity and pain beliefs.

Random forest analysis: Appropriate Pain Behaviours Questionniare (APBQ)-Male

Peter Kamerman

Session setup

Load data

Quick look

Process data

Simple single tree

Random Forest

Plots

Session information