2

I have a number of data frames (96) whose columns contain only 0s and 1s. If there is more than one "1" in any column of a data frame, I want to replace the 1s with equal fractions so that the values in that column sum to 1, as shown in the code below.

# Example 0/1 indicator vectors used to build the sample data frames.
v1 <- c(0, 1, 0, 1, 1, 0)
v2 <- c(0, 0, 1, 0, 0, 0)
v3 <- c(0, 0, 1, 1, 0, 0)
df1 <- data.frame(v1, v2, v3)
df2 <- data.frame(v3, v3, v1)
df3 <- data.frame(v1, v3, v1)

# Brute-force version, repeated once per data frame.
# For each column: exactly two 1s become 0.5 each, exactly three 1s become
# 1/3 each, and any other column is returned untouched. The result of
# apply() is transposed, so each original column ends up as a row.
new.df1 <- t(apply(df1, 2, FUN = function(column) {
  n_ones <- sum(column == 1, na.rm = TRUE)
  if (n_ones == 2) {
    replace(column, column == 1, 0.5)
  } else if (n_ones == 3) {
    replace(column, column == 1, 1 / 3)
  } else {
    column
  }
}))

new.df2 <- t(apply(df2, 2, FUN = function(column) {
  n_ones <- sum(column == 1, na.rm = TRUE)
  if (n_ones == 2) {
    replace(column, column == 1, 0.5)
  } else if (n_ones == 3) {
    replace(column, column == 1, 1 / 3)
  } else {
    column
  }
}))

new.df3 <- t(apply(df3, 2, FUN = function(column) {
  n_ones <- sum(column == 1, na.rm = TRUE)
  if (n_ones == 2) {
    replace(column, column == 1, 0.5)
  } else if (n_ones == 3) {
    replace(column, column == 1, 1 / 3)
  } else {
    column
  }
}))

I am able to create what I want with brute force as in the above example, but there must be a better (more concise) way. I'd greatly appreciate some help.

Dot Dumuid
  • 69
  • 4

3 Answers

1

Store your code in a function, store your dataframes in a list and then use lapply to loop over each list element with your function.

 # Rescale the 1s in every column of `df` so that each column sums to 1.
 #
 # A column containing n > 1 ones has each of those ones replaced by 1 / n
 # (previously only n == 2 and n == 3 were handled, so columns with four or
 # more ones were silently left unchanged). Columns with zero or one 1 are
 # returned as they are. NAs are ignored when counting the ones.
 #
 # Args:
 #   df: a data frame (or matrix) of 0/1 indicator columns.
 #
 # Returns:
 #   A numeric matrix that is the transpose of the input: one row per
 #   original column, one column per original row.
 recalc <- function(df) {
   t(apply(df, 2, FUN = function(x) {
     n_ones <- sum(x == 1, na.rm = TRUE)
     if (n_ones > 1) {
       # 1 / 2 and 1 / 3 reproduce the old hard-coded 0.5 and 1/3 exactly.
       replace(x, x == 1, 1 / n_ones)
     } else {
       x
     }
   }))
 }


# Collect the data frames in a list (dflist was not defined before), then
# apply recalc() to each element. recalc can be passed to lapply() directly;
# wrapping it in an anonymous function adds nothing.
dflist <- list(df1, df2, df3)
lapply(dflist, recalc)

[[1]]
   [,1]      [,2] [,3]      [,4]      [,5] [,6]
v1    0 0.3333333  0.0 0.3333333 0.3333333    0
v2    0 0.0000000  1.0 0.0000000 0.0000000    0
v3    0 0.0000000  0.5 0.5000000 0.0000000    0

[[2]]
     [,1]      [,2] [,3]      [,4]      [,5] [,6]
v3      0 0.0000000  0.5 0.5000000 0.0000000    0
v3.1    0 0.0000000  0.5 0.5000000 0.0000000    0
v1      0 0.3333333  0.0 0.3333333 0.3333333    0

[[3]]
     [,1]      [,2] [,3]      [,4]      [,5] [,6]
v1      0 0.3333333  0.0 0.3333333 0.3333333    0
v3      0 0.0000000  0.5 0.5000000 0.0000000    0
v1.1    0 0.3333333  0.0 0.3333333 0.3333333    0
DSGym
  • 2,539
  • 1
  • 4
  • 15
  • 1
    And to get all global environment data frames in a list, use `eapply` or `mget`: `Filter(is.data.frame, eapply(.GlobalEnv, identity))` or `Filter(is.data.frame, mget(x=ls(), envir=.GlobalEnv))`. Or by name such as *df* prefix `mget(ls(pattern="df"))`. – Parfait Jul 02 '19 at 15:26
1

Similar to the other answer, but a little more modular and with an improved version of your function:

## Put your data frames in a list
# df_list <- list(df1, df2, df3)
## The pattern is anchored so only names such as df1, df2, ... match; an
## unanchored "df[0-9]" would also pick up objects like new.df1. Filter()
## then drops anything that matched by name but is not a data frame.
df_list <- Filter(is.data.frame, mget(ls(pattern = "^df[0-9]+$")))

## Write a function to modify one column
replace_ones <- function(x) {
  # Count the entries equal to 1, ignoring NAs.
  n_ones <- sum(x == 1, na.rm = TRUE)

  # Zero or one 1: nothing to split, hand the vector back unchanged.
  if (n_ones <= 1) {
    return(x)
  }

  # Share the total of 1 equally among the positions that held a 1.
  x[x == 1] <- 1 / n_ones
  x
}

## Wrap it to modify a data frame:
replace_ones_df <- function(df) {
  # Process each column independently with replace_ones().
  rescaled_cols <- lapply(df, replace_ones)

  # Assigning into df[] swaps in the new columns while keeping the
  # data frame structure (names, row names, class).
  df[] <- rescaled_cols
  df
}

## Run the data-frame wrapper over every element of the list, so that
## every column of every data frame is processed:
result_list <- lapply(X = df_list, FUN = replace_ones_df)
# $df1
#          v1 v2  v3
# 1 0.0000000  0 0.0
# 2 0.3333333  0 0.0
# 3 0.0000000  1 0.5
# 4 0.3333333  0 0.5
# 5 0.3333333  0 0.0
# 6 0.0000000  0 0.0
# 
# $df2
#    v3 v3.1        v1
# 1 0.0  0.0 0.0000000
# 2 0.0  0.0 0.3333333
# 3 0.5  0.5 0.0000000
# 4 0.5  0.5 0.3333333
# 5 0.0  0.0 0.3333333
# 6 0.0  0.0 0.0000000
# 
# $df3
#          v1  v3      v1.1
# 1 0.0000000 0.0 0.0000000
# 2 0.3333333 0.0 0.3333333
# 3 0.0000000 0.5 0.0000000
# 4 0.3333333 0.5 0.3333333
# 5 0.3333333 0.0 0.3333333
# 6 0.0000000 0.0 0.0000000
Gregor Thomas
  • 104,719
  • 16
  • 140
  • 257
  • Thanks @Gregor. With lapply, how do you specify whether you want the function to be run over rows or columns? How would you modify the code if you wanted to apply the function to rows of the data.frame instead of columns? – Dot Dumuid Jul 03 '19 at 01:50
  • 1
    `lapply` runs over columns. If you want to run over rows, I'd suggest using `matrix` not a data frame, and using `apply` not `lapply`. (You can use `apply` on a data frame, but it just converts it to a matrix internally first) – Gregor Thomas Jul 05 '19 at 21:50
1

Instead of manually counting the number of 1s in each binary column and using if/else, place the datasets in a list and divide each one by its column sums (`colSums`).

# Divide every column of each data frame by its column total so that the
# column sums to 1.
lapply(mget(paste0("df", 1:3)), function(x) {
  # na.rm = TRUE keeps a single NA from turning the whole column into NA.
  totals <- colSums(x, na.rm = TRUE)
  # A column with no 1s has total 0; dividing by it would give 0/0 = NaN.
  # Divide by 1 instead so such a column stays all zeros.
  totals[totals == 0] <- 1
  # col(x) gives each cell's column index, which expands the totals to one
  # divisor per cell.
  x / totals[col(x)]
})
#$df1
#         v1 v2  v3
#1 0.0000000  0 0.0
#2 0.3333333  0 0.0
#3 0.0000000  1 0.5
#4 0.3333333  0 0.5
#5 0.3333333  0 0.0
#6 0.0000000  0 0.0

#$df2
#   v3 v3.1        v1
#1 0.0  0.0 0.0000000
#2 0.0  0.0 0.3333333
#3 0.5  0.5 0.0000000
#4 0.5  0.5 0.3333333
#5 0.0  0.0 0.3333333
#6 0.0  0.0 0.0000000

#$df3
#         v1  v3      v1.1
#1 0.0000000 0.0 0.0000000
#2 0.3333333 0.0 0.3333333
#3 0.0000000 0.5 0.0000000
#4 0.3333333 0.5 0.3333333
#5 0.3333333 0.0 0.3333333
#6 0.0000000 0.0 0.0000000
akrun
  • 674,427
  • 24
  • 381
  • 486