2

I am trying to compute observed, minimum and maximum values by group in order to plot with ggplot. I was able to do that in the following way:

library(tidyverse)
df <- cbind.data.frame("V1"=rnorm(100,10,2),
                       "V1_std"=rgamma(100,2),
                       "V2"=rnorm(100,10,2),
                       "V2_std"=rgamma(100,2))
df_obs <- df %>%
  mutate("id"=1:nrow(.)) %>%
  select(-c(V1_std,V2_std)) %>%
  gather(...=1:2)

df_min <- df %>%  mutate("V1_min"= V1 - V1_std) %>%
  mutate("V2_min"= V2 - V2_std) %>% 
  select(V1_min,V2_min) %>%
  gather(...=1:2)

df_max <- df %>%  mutate("V1_max"= V1 + V1_std) %>%
  mutate("V2_max"= V2 + V2_std) %>% 
  select(V1_max,V2_max) %>%
  gather(...=1:2)

df_final <- cbind.data.frame(df_obs,'min'=df_min$value,'max'=df_max$value)

which gives me:

    key     value      min       max
1    V1 11.411261  7.2531585 15.569364
2    V1 10.804986 10.3518400 11.258132
3    V1  9.809049  8.5992110 11.018887
4    V1 11.225030 10.3028566 12.147204
5    V1 10.532991 10.2911703 10.774812
6    V1  9.993123  8.8835294 11.102717
7    V1  6.891480  4.3241776  9.458782
8    V1 12.492458 11.6869964 13.297920
9    V1  8.986359  5.0335530 12.939165
10   V1  8.667382  8.3316045  9.003160

Is there a simpler way to do that, using a single pipe?

Thanks

  • A basic approach can be found [here](https://stackoverflow.com/a/28731116/2461552). Because you don't have suffixes on your values, you'll likely need want to rename the new `` column to something nicer, which can be done via `rename`. You'll also still need your `id` variable step prior to gathering and then you can do your math after spreading. – aosmith Aug 31 '17 at 22:34

3 Answers3

2

Here's my attempt. The real issue was the naming of your columns, which made it difficult to clearly split into separate variables which could be used to create the intermediate long dataset:

df %>% 
  set_names(paste(rep(c("V1","V2"),each=2),c("val","std"),sep="_")) %>% 
  mutate(id = row_number()) %>% 
  gather(key,value,-id) %>%
  separate(key, c("key","group")) %>%
  spread(group, value) %>%
  mutate(min=val - std, max=val + std) %>%
  select(-std)

#     id key       val        min       max
#1     1  V1  7.342068  5.7006317  8.983503
#2     1  V2  7.372698  5.0818045  9.663591
#3     2  V1 13.397766 11.1738412 15.621691
# ...

And for fun, here is the base reshape version:

names(df) <- paste(c("val","std"),rep(c("Vl","V2"),each=2),sep="_")
df_final <- reshape(df, direction="long", sep="_", varying=TRUE, timevar="key")
df_final <- transform(df_final, min = val-std, max=val+std)[c("id","key","val","min","max")]
thelatemail
  • 81,120
  • 12
  • 111
  • 172
2

Here's my version. It's a bit messy, but it should continue to work if you add more columns with the same pattern.

df %>% mutate(ID=1:n()) %>% gather("key", "value", -ID) %>% 
  separate(key, c("var", "SD"), sep=2) %>% 
  mutate(SD=replace(SD, SD == "", "value")) %>% spread("SD", "value") %>% 
  mutate(min = value - `_std`, max=value - `_std`) %>% 
  select(-`_std`, -ID) %>% arrange(var)
Christoph Wolk
  • 1,688
  • 1
  • 5
  • 12
2

And yet another version:

bind_rows(df %>% select(v=V1,std=V1_std) %>% mutate(key="V1"),
          df %>% select(v=V2,std=V2_std) %>% mutate(key="V2")) %>% 
  mutate(max = v + std,
         min = v - std) %>% 
  select(key, value=v, min, max)
    key     value       min       max
1    V1  8.747092  7.910670  9.583515
2    V1 10.367287  9.526856 11.207717
3    V1  8.328743  6.628955 10.028531
4    V1 13.190562 12.385141 13.995983
5    V1 10.659016 10.317527 11.000504
6    V1  8.359063  5.537228 11.180899

Reproducible data:

set.seed(1)
df <- data.frame(V1=rnorm(100,10,2),
                 V1_std=rgamma(100,2),
                 V2=rnorm(100,10,2),
                 V2_std=rgamma(100,2))
eipi10
  • 81,881
  • 20
  • 176
  • 248