0

I have lots of data frames with the same columns. What I want is to apply quantile (15% and 80%) function to the 3rd ("cpm") column for all the data frames in my environment and add the result as a new column to each data frame All the data frames in environment are the same, here is the sample of them:

BD.ios = structure(list(geo = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L, 1L), .Label = "BD", class = "factor"), os = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "ios", class = "factor"), 
    cpm = c(0.00026978417266187, 0.000276497695852535, 0.00442228161827238, 
    0.00396317260301814, 0.0191772698764066, 0.700811773637797, 
    0.00482934642627173, 0.00201429499675114, 0.00021494623655914, 
    0.0000520855057351408)), row.names = c(12925L, 13011L, 15189L, 
18469L, 19494L, 22385L, 22594L, 29467L, 31907L, 38037L), class = "data.frame")

AE.mac = structure(list(geo = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L), .Label = "AE", class = "factor"), os = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "mac", class = "factor"), 
    cpm = c(0.000353264424964019, 0.00390138781055901, 0.000893105609526794, 
    0.0099634872417983, 0.00119375573921028, 0.00535134321942833, 
    0.00318471337579618, 0.000983284169124877, 0.116180371352785
    )), row.names = c(2622L, 6483L, 6898L, 9383L, 25280L, 25923L, 
29649L, 37977L, 40411L), class = "data.frame")

AF.android = structure(list(geo = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 
1L, 1L), .Label = "AF", class = "factor"), os = structure(c(1L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "android", class = "factor"), 
    cpm = c(0.193592767295597, 0.153727276424417, 0.30376596601237, 
    0.43615845874945, 0.552450120363948, 0.214786723495654, 0.206123674204523, 
    0.0250727462779332, 0.157723828668625)), row.names = c(955L, 
7975L, 8899L, 9297L, 11223L, 14963L, 17452L, 19883L, 20555L), class = "data.frame")

I believe, that the solution is easy and requires the use of eapply function, but I just can't figure it out

env = .GlobalEnv
eapply(env, quantile, probs = c(.15,.8))

This command results in an error:

    Error in `[.data.frame`(x, order(x, na.last = na.last, decreasing = decreasing)) :
   undefined columns selected

EDIT To make it clear, here is what I did and what I need as a result:

I had Data like this

data = structure(list(geo = structure(c(15L, 1L, 3L, 16L, 1L, 9L, 17L, 
23L, 29L, 52L, 26L, 55L, 34L, 46L, 25L, 52L, 17L, 15L, 27L, 35L, 
45L, 8L, 21L, 24L, 6L, 16L, 52L, 31L, 14L, 38L, 21L, 5L, 41L, 
16L, 34L, 52L, 27L, 16L, 7L, 13L, 10L, 35L, 52L, 44L, 27L, 19L, 
35L, 6L, 42L, 25L, 40L, 31L, 43L, 33L, 13L, 2L, 4L, 12L, 30L, 
44L, 51L, 38L, 35L, 28L, 52L, 32L, 20L, 19L, 34L, 56L, 51L, 53L, 
54L, 22L, 49L, 18L, 4L, 36L, 34L, 4L, 47L, 11L, 25L, 9L, 6L, 
46L, 39L, 25L, 12L, 50L, 27L, 39L, 48L, 27L, 23L, 9L, 19L, 9L, 
44L, 37L), .Label = c("AE", "AR", "AT", "AU", "AZ", "BD", "BG", 
"BO", "CA", "CD", "CH", "CO", "DK", "DZ", "EC", "EG", "ES", "FI", 
"FR", "GA", "GB", "GE", "HK", "HU", "ID", "IE", "IN", "IR", "IT", 
"KE", "KR", "LB", "LY", "MX", "MY", "NL", "PE", "PH", "PK", "PL", 
"PT", "QA", "RO", "RU", "RW", "SE", "SG", "SK", "SY", "TH", "TR", 
"US", "UY", "VN", "YE", "ZA"), class = "factor"), os = structure(c(3L, 
2L, 1L, 1L, 1L, 6L, 4L, 1L, 1L, 4L, 6L, 1L, 1L, 1L, 6L, 7L, 1L, 
4L, 1L, 3L, 1L, 6L, 3L, 1L, 1L, 1L, 1L, 1L, 1L, 4L, 1L, 1L, 1L, 
6L, 1L, 1L, 1L, 1L, 4L, 6L, 1L, 1L, 6L, 6L, 1L, 1L, 1L, 1L, 1L, 
1L, 6L, 1L, 1L, 1L, 4L, 4L, 1L, 3L, 1L, 5L, 1L, 6L, 6L, 1L, 3L, 
1L, 1L, 1L, 1L, 3L, 1L, 1L, 1L, 3L, 1L, 3L, 3L, 1L, 6L, 4L, 2L, 
6L, 1L, 1L, 1L, 1L, 6L, 1L, 1L, 6L, 3L, 3L, 1L, 1L, 1L, 1L, 6L, 
4L, 3L, 1L), .Label = c("android", "blackberry", "ios", "mac", 
"other", "windows", "windows_phone"), class = "factor"), cpm = c(0.259529602595296, 
0.008325, 0.664507018855387, 0.000646161798914448, 0.117647058823529, 
0.630132741077424, 0.00398838150289017, 0.0986788005043583, 0.483832900637243, 
0.631904877252478, 0.00499783423573511, 0.408063887806778, 0.0916731378464372, 
1.3325069724202, 0.0112485708069297, 0.00171537666632221, 0.0129665435458787, 
0.00296443300606869, 0.22941417451864, 0.000426580184572523, 
0.206888580674988, 0.000622490272373541, 0.016084968041569, 0.119169168392267, 
0.0216352172946694, 0.0552526416330796, 0.0150883006745904, 0.324403186817902, 
0.188053932659688, 0.00389006342494715, 0.0625410833224263, 0.00111134385665529, 
0.000198831231813773, 0.00551511140525039, 1.02902374670185, 
0.574300071787509, 0.371022474579782, 0.111970606352996, 0.0000313953488372093, 
0.380035469977198, 0.0159468438538206, 0.0274524158125915, 0.237448482577744, 
0.083452302337827, 0.371352785145889, 0.129754756459319, 0.0261164794985636, 
0.602409638554217, 0.0157611216101295, 0.347620654741816, 0.130193264668441, 
0.34434946165254, 0.0693131695022054, 0.673575129533679, 0.0272002127093858, 
0.0295980803571429, 0.482425913163336, 0.00235336471280429, 0.00508469886782341, 
0.0000840689365279529, 0.236539258503618, 0.0799443865137296, 
0.296296296296296, 0.0236127508854782, 0.0152198636822762, 0.00339285714285714, 
0.150753768844221, 0.0859481582537517, 0.000587920688617856, 
0.00127715231788079, 0.150836862270619, 0.0849810111668886, 0.279757646414598, 
0.00113308871141809, 0.996427153632394, 0.00269808881394042, 
0.374087591240876, 0.228267072474796, 0.0516169572925784, 0.00902986826347305, 
0.000207365145228216, 0.244244977712646, 0.169128424850603, 0.573023255813954, 
0.0152944175375988, 1.11731843575419, 0.426646706586826, 0.0544090571844687, 
0.271433919880195, 0.0271570068233128, 0.00445611403693561, 0.00160892057026477, 
0.671800318640467, 0.0216794334441393, 0.00285318261516391, 0.295866741619575, 
0.0843108504398827, 1.60302577359969, 0.0132230143658259, 0.00246752277351996
)), row.names = c(6L, 22L, 25L, 28L, 31L, 41L, 43L, 45L, 47L, 
59L, 68L, 70L, 71L, 72L, 73L, 80L, 94L, 95L, 96L, 101L, 115L, 
117L, 121L, 123L, 125L, 140L, 144L, 149L, 151L, 165L, 169L, 170L, 
179L, 182L, 186L, 189L, 190L, 206L, 207L, 208L, 221L, 238L, 239L, 
259L, 271L, 275L, 276L, 280L, 281L, 294L, 303L, 308L, 311L, 315L, 
318L, 345L, 354L, 355L, 362L, 374L, 377L, 383L, 384L, 385L, 386L, 
394L, 405L, 407L, 408L, 419L, 422L, 424L, 425L, 427L, 442L, 445L, 
454L, 455L, 465L, 466L, 482L, 484L, 485L, 487L, 496L, 506L, 510L, 
513L, 517L, 518L, 523L, 528L, 544L, 548L, 552L, 557L, 570L, 579L, 
586L, 596L), class = "data.frame")

Used split function to get list of data frames, which separated geo+os combinations from each other and wrote them down in a list of data frames:

X <- split(data, list(data$geo,data$os))

Than I pulled data frames out from that list into the environment and deleted data frames with zero rows

list2env(X, envir = .GlobalEnv)
## create a function that returns a logical value
isEmpty <- function(x) {
  is.data.frame(x) && nrow(x) == 0L
}
## apply it over the environment
empty <- unlist(eapply(.GlobalEnv, isEmpty))
## remove the empties
rm(list = names(empty)[empty])

The desired result is a Data frame, which has 4 columns:

geo, os, quantile_15,quantile_80

Where geo+os are unique and have a certain quantile_15,quantile_80

  • Possible duplicate of [Using lapply to apply a function over list of data frames and saving output to files with different names](https://stackoverflow.com/questions/17018138/using-lapply-to-apply-a-function-over-list-of-data-frames-and-saving-output-to-f) – S Rivero Aug 29 '18 at 14:28
  • why dont you run your code one by one? You have only 3 dataframes – Jimmy Aug 29 '18 at 14:29
  • What's wrong with your code? Why does it not give you what you want? If it threw an error, what was the error? Apply `quantile` to all objects (not just data.frames) in your global environment will likely result in an error since not all objects are numeric vectors. – acylam Aug 29 '18 at 14:33
  • The `probs` should be `probs = c(.15,.8)`. – tmfmnk Aug 29 '18 at 14:36
  • I am tempted to say that it may not be a good idea to loop through all the objects in the environment. What if one of the objects in the environment is not a dataframe? I would suggest explicitly creating a list of all the dataframes that require modifications. Would something like this help? `list(BD.ios, AE.mac, AF.android) %>% map(~mutate(.x, quantile=quantile(cpm, probs = seq(.15,.8))))` (You need `dplyr` and `purrr`) – Vlad C. Aug 29 '18 at 14:44
  • @Salman I have over a thousand data frames like this =) @tmfmnk `probs` isn't the main issue @VladC. yes, this would help, but I have to many data frames to type them all. And I know that all the data frames in environment are exactly the same, I made them myself – Nikita Pronin Aug 29 '18 at 14:46
  • @NikitaPronin Do they perhaps all share some common prefix or suffix in the names? This would make it much easier to retrieve what you want from the environment. – acylam Aug 29 '18 at 14:57
  • @avid_useR they all have the same form - name of the dataframe is geo.os, colnames are = geo, os, cpm. The only value that differs within each data frame is cpm. And I need to find quantile 15 and 80% for that column, which is piece a cake if there weren't so many data frames and function have to apply to a certain column – Nikita Pronin Aug 29 '18 at 15:04

1 Answers1

3

I'd strongly suggest putting your data frames in a list instead of just leaving them in the global environment. The answer I link to should help you understand why lists are better, and also show how you could do lists from the start instead of this "find all data frames and put them in a list" approach.

eapply is difficult because there's nothing built-in to let you apply, say, only to data frames. And eapply returns results as a list, so it doesn't make much sense for adding columns to existing data frames.

df_names = ls()[sapply(mget(ls()), is.data.frame)]
df_list = mget(df_names)
result_list = lapply(df_list, function(d) d$new_col = <code for new column>)

I'm not sure what you want since you don't post your desired output. quantile(x, c(.15, .8)) returns 2 values, and your data frames have more than 2 rows, so I'm not sure what you want added - 2 new columns? 1 new column with recycling? something else?

Alternatively, maybe you just want a 2-number summary for each data frame? In that case sapply does nice simplification and keeps the names:

sapply(df_list, function(d) quantile(d$cpm, c(0.15, 0.8)))
#           AE.mac AF.android       BD.ios
# 15% 0.0009111413  0.1545266 0.0002341395
# 80% 0.0071962008  0.3567230 0.0076989311

EDIT based on your edits, let's work directly with data. We don't need to split, we certainly don't need list2env after the split. Adding columns by group is easy and efficient with dplyr or data.table. For example:

library(dplyr)
data %>%
  group_by(geo, os) %>%
  summarize(quantile_15 = quantile(cpm, .15),
            quantile_80 = quantile(cpm, 0.8))
# # A tibble: 81 x 4
# # Groups:   geo [?]
#    geo   os         quantile_15 quantile_80
#    <fct> <fct>            <dbl>       <dbl>
#  1 AE    android      0.118       0.118    
#  2 AE    blackberry   0.00833     0.00833  
#  3 AR    mac          0.0296      0.0296   
#  4 AT    android      0.665       0.665    
#  5 AU    android      0.482       0.482    
#  6 AU    ios          0.374       0.374    
#  7 AU    mac          0.00903     0.00903  
# ...

Or with data.table:

library(data.table)
setDT(data)
data[, as.list(quantile(cpm, c(0.15, 0.8))), by = .(geo, os)]
 #    geo            os          15%          80%
 # 1:  EC           ios 2.595296e-01 2.595296e-01
 # 2:  AE    blackberry 8.325000e-03 8.325000e-03
 # 3:  AT       android 6.645070e-01 6.645070e-01
 # 4:  EG       android 1.702811e-02 8.928342e-02
 # 5:  AE       android 1.176471e-01 1.176471e-01
 # 6:  CA       windows 6.301327e-01 6.301327e-01
Gregor Thomas
  • 104,719
  • 16
  • 140
  • 257