Another option using Rfast::rowTabulate
v <- c('Hard', 'Match', 'Easy', NA)
DT[, (v) := as.data.table(Rfast::rowTabulate(matrix(match(as.matrix(.SD), v), nrow=.N))),
.SDcols=Task_Alpha:Task_Delta]
output:
userID Score Task_Alpha Task_Beta Task_Charlie Task_Delta Hard Match Easy NA
1: 3108 -8.00 Easy Easy Easy Easy 0 0 4 0
2: 3207 3.00 Hard Easy Match Match 1 2 1 0
3: 3350 5.78 Hard Easy Hard Hard 3 0 1 0
4: 3961 10.00 Easy <NA> Hard Hard 2 0 1 1
5: 4021 10.00 Easy Easy <NA> Hard 1 0 2 1
data from Wimpel:
library(data.table)
DT <- fread("userID Score Task_Alpha Task_Beta Task_Charlie Task_Delta
3108 -8.00 Easy Easy Easy Easy
3207 3.00 Hard Easy Match Match
3350 5.78 Hard Easy Hard Hard
3961 10.00 Easy NA Hard Hard
4021 10.00 Easy Easy NA Hard
")
Would be interesting to know how fast this approach works on actual dataset and if actual dataset is large.
edit: Added timings
library(data.table)
set.seed(0L)
nr <- 1e6
v <- c('Hard', 'Match', 'Easy', NA)
DT <- data.table(userID=1:nr, Task_Alpha=sample(v, nr, TRUE),
Task_Beta=sample(v, nr, TRUE), Task_Charlie=sample(v, nr, TRUE),
Task_Delta=sample(v, nr, TRUE))
df <- as.data.frame(DT)
mtd0 <- function() {
t(apply(df[-1L], 1L, function(x)
table(factor(x, levels = c("Easy", "Hard", "Match")))))
}
mtd1 <- function() {
DT.melt <- melt( DT, id.vars = "userID", measure.vars = patterns( task = "^Task_") )
dcast( DT.melt, userID ~ value, fun.aggregate = length )
}
mtd2 <- function() {
DT[, Rfast::rowTabulate(matrix(match(as.matrix(.SD), v), nrow=.N)),
.SDcols=Task_Alpha:Task_Delta]
}
bench::mark(mtd0(), mtd1(), mtd2(), check=FALSE)
timings:
# A tibble: 3 x 13
expression min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time result memory time gc
<bch:expr> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm> <list> <list> <list> <list>
1 mtd0() 54.7s 54.7s 0.0183 137MB 1.70 1 93 54.7s <int[,3] [1,000,000 x 3]> <df[,3] [107,168 x 3]> <bch:tm> <tibble [1 x 3]>
2 mtd1() 2.4s 2.4s 0.417 398MB 0.833 1 2 2.4s <df[,5] [1,000,000 x 5]> <df[,3] [12,517 x 3]> <bch:tm> <tibble [1 x 3]>
3 mtd2() 252.8ms 264.4ms 3.78 107MB 3.78 2 2 528.7ms <int[,4] [1,000,000 x 4]> <df[,3] [6,509 x 3]> <bch:tm> <tibble [2 x 3]>