0

Let's say I have three tables that overlap.

    A   B   C   D
A   12  16  17  14
B   62  66  9   85
C   37  31  59  75
D   74  76  89  25

    A   B   E   F
A   12  16  11  19
B   62  66  57  28
E   24  21  4   51
F   7   1   68  22

    C   D   E   F
C   59  75  77  80
D   89  25  88  30
E   67  87  4   51
F   39  69  68  22

I would like to combine them by row and by column, without any duplicated rows or columns, and with row and column names intact.

    A   B   C   D   E   F
A   12  16  17  14  11  19
B   62  66  9   85  57  28
C   37  31  59  75  77  80
D   74  76  89  25  88  30
E   24  21  67  87  4   51
F   7   1   39  69  68  22

After three days I managed to cobble this together (with help from here, here, here, and probably others I've forgotten):

#Import tables as dataframes
file.names <- dir(pattern = ".tab")
for(i in 1:length(file.names)){
  nam <- paste("table.", i, sep = "")  #rename the data as table.1 ... table.n
  assign(nam, as.data.frame(as.matrix(read.delim(file.names[i],
         row.names=1, header=TRUE, sep="\t", stringsAsFactors=FALSE))))
}

#Import an empty file (i.e. just column and row names) 
#that you will fill with your smaller data tables
out.file <- as.data.frame(as.matrix(read.delim("Blank_table.csv",
                                               row.names=1, header=TRUE, sep=",")))

#Create a list of the dataframes
file.names = lapply(ls(pattern = "table.[0-9]"), get)

#Add columns that we can use for merging
#because using 'merge' on dataframes destroys row names
out.file$rows <- rownames(out.file)
for(i in 1:length(file.names)){
  rownams <- rownames(file.names[[i]])
  file.names[i] <- lapply(file.names[i], cbind, rows = rownams)
}

#Combine the tables
for(i in 1:length(file.names)){
  file <- file.names[i]
  out.file <- aggregate(. ~ rows, data = merge(out.file, file, all = TRUE),
                        na.action = na.pass, FUN = mean, na.rm = TRUE)
}

This does what I want, but it takes a long time when I'm merging hundreds of tables. I feel like there is probably a simpler way to do it, but I don't want to spend another three days of trial and error to get there.

What I'm imagining, is something along these lines:

  1. Import the empty table n times into a list of dataframes
  2. Import the data tables and merge one each into an empty dataframe in the list
  3. Create a new dataframe that is the mean of the corresponding cells in all the imported dataframes

Any suggestions?

UPDATE: Here are my example tables from dput:

table.1 <- structure(list(A = c(12L, 62L, 37L, 74L), B = c(16L, 66L, 31L, 
76L), C = c(17L, 9L, 59L, 89L), D = c(14L, 85L, 75L, 25L)), .Names = c("A", 
"B", "C", "D"), row.names = c("A", "B", "C", "D"), class = "data.frame")

table.2 <- structure(list(A = c(12L, 62L, 24L, 7L), B = c(16L, 66L, 21L, 
1L), E = c(11L, 57L, 4L, 68L), F = c(19L, 28L, 51L, 22L)), .Names = c("A", 
"B", "E", "F"), row.names = c("A", "B", "E", "F"), class = "data.frame")

table.3 <- structure(list(C = c(59L, 89L, 67L, 39L), D = c(75L, 25L, 87L, 
69L), E = c(77L, 88L, 4L, 68L), F = c(80L, 30L, 51L, 24L)), .Names = c("C", 
"D", "E", "F"), row.names = c("C", "D", "E", "F"), class = "data.frame")

out.file <- structure(list(A = c(NA, NA, NA, NA, NA, NA), B = c(NA, NA, NA, 
NA, NA, NA), C = c(NA, NA, NA, NA, NA, NA), D = c(NA, NA, NA, 
NA, NA, NA), E = c(NA, NA, NA, NA, NA, NA), F = c(NA, NA, NA, 
NA, NA, NA)), .Names = c("A", "B", "C", "D", "E", "F"), row.names = c("A", 
"B", "C", "D", "E", "F"), class = "data.frame")
Community
  • 1
  • 1

2 Answers2

2

A subsetting solution, without additional package (using df1, df2 and df3 as defined by @emehex) :

# List of dataframes to combine
DF<-list(df1, df2, df3)

COL<-unique(unlist(lapply(DF, colnames)))
ROW<-unique(unlist(lapply(DF, rownames)))
# Empty DF with all combinations
TOTAL<-matrix(data=NA, nrow=length(ROW), ncol=length(COL), dimnames=list(ROW, COL))
# Subsetting :
for (df in DF) { 
    TOTAL[rownames(df), colnames(df)] <- as.matrix(df)
}

Subsetting is faster than merging, with numerous dataframes it may be more efficient (see @aichao answer her : For each row extract the value in the column name that match another value in the cell). You just have to adapt the DF list to file.names for your code.

Community
  • 1
  • 1
Jean-Noël
  • 313
  • 1
  • 5
0

Don't know what your .csvs look like so this is the best I can do (with the three example tables above)...

Data Import

df1 <- read.table(header = TRUE, text = 
"A   B   C   D
A   12  16  17  14
B   62  66  9   85
C   37  31  59  75
D   74  76  89  25")

df2 <- read.table(header = TRUE, text = 
"A   B   E   F
A   12  16  11  19
B   62  66  57  28
E   24  21  4   51
F   7   1   68  22")

df3 <- read.table(header = TRUE, text = 
"C   D   E   F
C   59  75  77  80
D   89  25  88  30
E   67  87  4   51
F   39  69  68  22")

Solution with dplyr, tibble and tidyr

library(dplyr)
library(tibble)
library(tidyr)

# intermediate tables for rownames and gathering
df1_c <- df1 %>% 
    rownames_to_column("Name") %>% 
    gather(key, value, -Name)

df2_c <- df2 %>% 
    rownames_to_column("Name") %>% 
    gather(key, value, -Name)

df3_c <- df3 %>% 
    rownames_to_column("Name") %>% 
    gather(key, value, -Name)

# formatted dataframe from spread
df <- bind_rows(df1_c, df2_c, df3_c) %>% 
    group_by(Name, key) %>% 
    distinct(.keep_all = TRUE) %>% 
    spread(key, value)

Output

df
#    Name     A     B     C     D     E     F
# * <chr> <int> <int> <int> <int> <int> <int>
# 1     A    12    16    17    14    11    19
# 2     B    62    66     9    85    57    28
# 3     C    37    31    59    75    77    80
# 4     D    74    76    89    25    88    30
# 5     E    24    21    67    87     4    51
# 6     F     7     1    39    69    68    22
emehex
  • 7,082
  • 8
  • 46
  • 85