9

UPDATE:

I have tried running the code on https://rdrr.io/snippets/ and it works fine. Therefore, I suspect a problem with my R installation, but it is extremely worrying that this can happen without errors or warnings. What are the best steps to investigate this ? I am running R 3.4.4 on Ubuntu 18.04 and gbm 2.1.4


I am fitting a boosted model to a dataset and have noticed some strange predictions. Here is a minimal working example. Please note that this is just a small sample of the dataset I am working with

mydata <- structure(list(Count = c(1L, 3L, 1L, 4L, 1L, 0L, 1L, 2L, 0L, 0L, 1L, 1L, 2L, 1L, 2L, 2L, 1L, 2L, 0L, 2L, 3L, 1L, 4L, 3L, 0L, 4L, 1L, 2L, 1L, 1L, 0L, 2L, 1L, 4L, 1L, 5L, 3L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 2L, 0L, 0L, 1L, 1L, 1L, 0L, 3L, 1L, 1L, 0L, 3L, 1L, 1L, 1L, 1L, 2L, 3L, 2L, 2L, 0L, 0L, 3L, 5L, 1L, 2L, 1L, 1L, 0L, 0L, 1L, 2L, 1L, 3L, 1L, 1L, 0L, 2L, 2L, 1L, 3L, 3L, 2L, 0L, 0L, 1L, 2L, 1L, 0L, 2L, 0L, 0L, 4L, 4L, 2L), Treat1 = structure(c(10L, 14L, 8L, 2L, 3L, 12L, 1L, 10L, 6L, 2L, 11L, 11L, 15L, 1L, 8L, 3L, 13L, 9L, 9L, 11L, 1L, 8L, 14L, 5L, 10L, 8L, 15L, 11L, 7L, 6L, 13L, 11L, 7L, 1L, 1L, 2L, 7L, 12L, 5L, 1L, 8L, 1L, 9L, 8L,12L, 14L, 12L, 7L, 8L, 14L, 3L, 3L, 5L, 1L, 1L, 11L, 6L, 5L, 5L, 13L, 9L, 3L, 8L, 9L, 13L, 9L, 7L, 9L, 2L, 6L, 10L, 3L, 11L, 4L, 3L, 15L, 12L, 6L, 4L, 3L, 8L, 8L, 11L, 1L, 11L, 2L, 11L, 5L, 12L, 6L, 8L, 14L, 1L, 9L, 9L, 10L, 10L, 5L, 14L, 3L), .Label = c("D", "U", "R", "E", "C", "Y", "L", "O", "G", "T", "N", "J", "V", "X", "A"), class = "factor"), Treat2 = structure(c(15L, 13L, 7L, 8L, 2L, 5L, 15L, 4L, 2L, 7L, 6L, 2L, 3L, 14L, 10L, 7L, 7L, 14L, 11L, 7L, 6L, 1L, 5L, 13L, 11L, 6L, 10L, 5L, 3L, 1L, 7L, 9L, 6L, 10L, 5L, 11L, 15L, 9L, 7L, 11L, 10L, 2L, 3L, 3L, 5L, 11L, 8L, 6L,4L, 5L, 15L, 8L, 8L, 2L, 2L, 10L, 4L, 1L, 10L, 11L, 10L, 8L, 7L, 7L, 8L, 14L, 16L, 11L, 10L, 9L, 3L, 15L, 13L, 1L, 11L, 11L, 9L, 7L, 10L, 9L, 3L, 7L, 5L, 13L, 3L, 14L, 10L, 10L, 15L, 13L, 15L, 12L, 14L, 11L, 5L, 4L, 2L, 3L, 11L, 10L), .Label = c("B", "X", "R", "H", "L", "D", "U", "Q", "K", "C", "T", "V", "J", "E", "F", "A"), class = "factor"), Near = c(0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0), Co1 = c(2, 5, 1, 1, 0, 1, 1, 2, 1, 2, 5, 2, 1, 0, 1, 2, 6, 3, 3, 1, 2, 2, 3, 0, 1, 0, 1, 0, 2, 1, 0, 1, 2, 3, 1, 2, 2, 0, 0, 2, 3, 3, 1, 1, NA, 2, 0, 2, 1, NA, 1, 1, 0, 1, 2, 0, 2, 1, 1, 1, 2, 3, 1, 0, 4, 0, 0, 0, 2, 2, 1, 1,2, 0, 1, 2, 1, 0, 0, 0, 0, 2, 1, 2, 2, 2, 2, 1, 0, 1, 1, 1, 1, 1, 0, 2, 0, 0, 5, 1), Co2 = c(1, 1, 2, 2, 4, 1, 3, 0, 5, 2, 2, 4, 1, 1, 2, 1, 2, 3, 0, 2, 3, 3, 0, 3, 1, 0, 1, 1, 1, 2, 0, 1, 1, 1, 2, 3, 2, 2, 3, 0, 0, 0, 1, 2, NA, 1, 1, 1, 0, 2, 1, 1, 2, 5, 0, 2, 1, 4, 1, 1, 3, 0, 1, 1, 1, 1, NA, 0, 2, 1, 1, 3, 2, 1, 2, 1, 3, 1, 2, 0, 1, 5, 2, 2, 1, 2, 3, 4, 3, 1, 1, 0, 5, 1, 1, 0, 1, 1, 2, 0)), .Names = c("Count", "Treat1", "Treat2", "Near", "Co1", "Co2"), row.names = c(1759L, 959L, 1265L, 1504L, 630L, 1905L, 1885L, 1140L, 1187L, 1792L, 1258L, 1125L, 756L, 778L, 1718L, 1797L, 388L, 715L, 63L, 311L, 1492L, 1128L, 629L, 536L, 503L, 651L, 1684L, 1893L, 721L, 1440L, 1872L, 1444L, 1593L, 143L, 1278L, 1558L, 1851L, 1168L, 1829L, 386L, 365L, 849L, 429L, 155L, 11L, 1644L, 101L, 985L, 72L, 459L, 1716L, 844L, 1313L, 77L, 1870L, 744L, 219L, 513L, 644L, 831L, 338L, 284L, 211L, 1096L,243L, 1717L, 1881L, 1784L, 1017L, 992L, 45L, 707L, 489L, 1267L, 1152L, 1819L, 995L, 510L, 1350L, 1700L, 56L, 1754L, 725L, 1625L, 319L, 1818L, 1287L, 1634L, 953L, 1351L, 1787L, 923L, 917L, 484L, 886L, 390L, 1531L, 679L, 1811L, 1736L), class = "data.frame")

set.seed(12345)
require(gbm)

n.trees <- 10000

m1.gbm <- gbm(Count ~ Treat1 + Treat2 + Near + Co1 + Co2, data = mydata, distribution = "poisson", n.trees = n.trees)

head(predict(m1.gbm, newdata = mydata, n.trees = n.trees, type = "response"))
predict(m1.gbm, newdata = head(mydata), n.trees = n.trees, type = "response")

Perhaps naively I assumed that the last lines would output the same results, but no:

[1] 0.994297776 2.995972275 0.817366593 3.984539334 0.977805068 0.004828331
[1] 10.8603111  1.2439321  1.2515243 93.8925370  1.6301918  0.5146144

To look at a specific example:

mydata$predict.gbm <- predict(m1.gbm, newdata = mydata, n.trees = 10000, type = "response")
tail(mydata)

     Count Treat1 Treat2 Near Co1 Co2 predict.gbm
886      2      G      L    1   0   1 1.996664300
390      0      T      H    1   2   0 0.079447326
1531     0      T      X    0   0   1 0.008874954
679      4      C      R    1   0   1 4.023112604
1811     4      X      T    0   5   2 3.994436833
1736     2      R      C    0   1   0 2.003126597

..and looking at just the last 2 rows:

predict(m1.gbm, data.frame(Count=4, Treat1="X", Treat2="T", Near=0, Co1=5, Co2=2), n.trees = 10000, type = "response")
[1] 6.925626

predict(m1.gbm, data.frame(Count=2, Treat1="R", Treat2="C", Near=0, Co1=1, Co2=0), n.trees = 10000, type = "response")
[1] 5.381878

I must be missing something really obvious here, and would appreciate any help in figuring it out !

Robert Long
  • 2,785
  • 4
  • 18
  • 38
  • 1
    Looks like it has to do with your `factor` variables. You have to make sure you're using the correct (i.e. same) levels. – AntoniosK Oct 08 '18 at 13:08
  • 2
    I get the same values on both lines with your `mydata`. – nicola Oct 08 '18 at 13:09
  • Me too - both lines are the same, I cannot seem to reproduce your issue... – desertnaut Oct 08 '18 at 13:19
  • I do reproduce the problem (for `tail(mydata)` I have same last value but not second to last...) I don't get the values you get though. `gbm` version is 2.1.4, tested on R3.5.1 and R3.4.2 (I get exact same result in both sessions) – Cath Oct 08 '18 at 13:24
  • I can reproduce neither reported problem - R 3.4.1, `gbm` 2.1.3 (windows) – desertnaut Oct 08 '18 at 13:26
  • @AntoniosK I also suspected a problem with factors, but that's why I first did the predict on `head(mydata)` which should preserve the factors, right ? – Robert Long Oct 08 '18 at 13:31
  • @desertnaut I am running R 3.4.4 on Ubuntu 18.04 and gbm 2.1.4 – Robert Long Oct 08 '18 at 13:36
  • @container I get what you mean, but it looks that the factor variables and how the model handles them when it's building the various trees is the problem. Try to remove your factor variables from the formula and check again your whole process (i.e. use the formula `Count ~ Near + Co1 + Co2`) – AntoniosK Oct 08 '18 at 14:25
  • @AntoniosK please see my update in the originall post. the problem seems to be with my installation. I am going to post a new question about it. – Robert Long Oct 08 '18 at 14:37
  • I have the same problem as you, even if the predicted values are different. But, when I remove the factor variables it works as expected. I'm using `R 3.5.1` on WIndows and `gbm 2.1.4` – AntoniosK Oct 08 '18 at 14:47
  • @AntoniosK yes, if I remove the factors then it works OK for me too, however that does not explain how it works 100% OK on https://rdrr.io/snippets/ ? – Robert Long Oct 08 '18 at 14:50
  • They mention "Loaded gbm 2.1.3" when I used your link. Works well though :) – AntoniosK Oct 08 '18 at 14:57
  • @desertnaut the problem only occurs with gbm 2.1.4 ! – Robert Long Oct 08 '18 at 16:32
  • 1
    @AntoniosK thanks for poiting me in the right direction. The problem occurs only with 2.1.4 ! – Robert Long Oct 08 '18 at 16:33

1 Answers1

5

The problem appears to be related to the version of gbm that I am using.

By default it installed v 2.1.4

After I removed the package, and installed v 2.1.3, it worked as expected.

I have now posted a new question related to the inconsistency between package versions

Robert Long
  • 2,785
  • 4
  • 18
  • 38