1

I have been trying to properly clean and format raw Item price data for (time-series) analysis and I'm curious how any of you professionals would tackle this kind of set up. Every two columns represents a list of dates and a list of prices. These dates are (unfortunately) independent of any other date in the same row (although may have many as the same by happenstance).

My tactic here is to make a new data frame with rows representing days, columns representing prices, and run a loop that matches the Item dates to the correct row and fills in the correct price.

However, I believe I may be doing this inefficiently and my online searches aren't giving me other examples of this procedure.

Below please find example code.

    df <- structure(list(Date1 = c("10/31/2011", "11/7/2011", "11/14/2011", "11/21/2011", "11/28/2011", "12/5/2011", "12/12/2011", "12/19/2011", "1/2/2012", "1/9/2012", "1/16/2012"), Item1 = c(650L, 650L, 635L, 640L, 640L, 625L, 620L, 580L, 550L, 520L, 530L), Date2 = c("10/31/2011", "11/7/2011", "11/14/2011", "11/21/2011", "11/28/2011", "12/5/2011", "12/12/2011", "12/19/2011", "1/2/2012", "1/9/2012", "1/16/2012"), Item2 = c(590L, 590L, 590L, 580L, 580L, 580L, 580L, 580L, 460L, 460L, 395L), Date3 = c("12/5/2011", "12/12/2011", "12/19/2011", "1/2/2012", "1/9/2012", "1/16/2012", "1/23/2012", "1/30/2012", "2/6/2012", "2/13/2012", "2/20/2012"), Item3 = c(775L, 775L, 775L, 750L, 750L, 750L, 750L, 750L, 725L, 725L, 740L), Date4 = c("10/31/2011", "11/7/2011", "11/14/2011", "11/21/2011", "11/28/2011", "12/5/2011", "12/12/2011", "12/19/2011", "1/2/2012", "1/9/2012", "1/16/2012"), Item4 = c(660L, 700L, 700L, 700L, 700L, 700L, 650L, 650L, 650L, 650L, 610L), Date5 = c("10/31/2011", "11/7/2011", "11/14/2011", "11/21/2011", "11/28/2011", "12/5/2011", "12/12/2011", "12/19/2011", "1/2/2012", "1/9/2012", "1/16/2012"), Item5 = c(705L, 705L, 705L, 650L, 650L, 650L, 650L, 555L, 555L, 555L, 555L), Date6 = c("10/31/2011", "11/7/2011", "11/14/2011", "11/21/2011", "11/28/2011", "12/5/2011", "12/12/2011", "12/19/2011", "1/2/2012", "1/9/2012", "1/16/2012"), Item6 = c(612L, 612L, 612L, 612L, 612L, 612L, 612L, 612L, 612L, 612L, 612L), Date7 = c("10/31/2011", "11/7/2011", "11/14/2011", "11/21/2011", "11/28/2011", "12/5/2011", "12/12/2011", "12/19/2011", "1/2/2012", "1/9/2012", "1/16/2012"), Item7 = c(630L, 630L, 625L, 635L, 625L, 615L, 620L, 560L, 550L, 540L, 530L), Date8 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), Item8 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), Date9 = c("10/31/2011", "11/7/2011", "11/14/2011", "11/21/2011", "11/28/2011", "12/5/2011", "12/12/2011", "12/19/2011", "1/2/2012", "1/9/2012", "1/16/2012"), Item9 = c(622L, 622L, 650L, 650L, 650L, 660L, 660L, 660L, 665L, 665L, 665L), Date10 = c("10/31/2011", "11/7/2011", "11/14/2011", "11/21/2011", "11/28/2011", "12/5/2011", "12/12/2011", "12/19/2011", "1/2/2012", "1/9/2012", "1/16/2012"), Item10 = c(1040L, 1040L, 1040L, 1040L, 1040L, 1040L, 1040L, 1040L, 1040L, 1040L, 1040L)), .Names = c("Date1", "Item1", "Date2", "Item2", "Date3", "Item3", "Date4", "Item4", "Date5", "Item5", "Date6", "Item6", "Date7", "Item7", "Date8", "Item8", "Date9", "Item9", "Date10", "Item10"), row.names = 95:105, class = "data.frame")
    df
    class(df)
    # visual inspection for first and last date (10/31/2011, 2/20/2012)

    mdyyyy <- function(x){as.Date(x,"%m/%d/%Y")}

    days <- seq.Date(from = mdyyyy("10/31/2011"), # first date
             to   = mdyyyy("2/20/2012"), # last date
             by   = "day")

    head(days)

    datecolumns <- seq(1,ncol(df),by=2) # (odds) date columns 
    pricecolumns <- seq(2,ncol(df),by=2) # (evens) index columns 

    # Creating a new, cleaned matrix of data where the 
    # rows = days and columns = indices
    newdat    <- matrix(NA, 
                length(days), 
                ncol(df[,pricecolumns])) # indices wide

    # Name rows
    rownames(newdat) <- format(days,"%m/%d/%Y")
    # Each row is a new day
    head(newdat[,1:10]) 

    # Placing prices into the appropriate rows
    for(i in 1:length(datecolumns)){
      pricedates <- 0   # initialize/reset
      pricedates <- mdyyyy(df[,datecolumns[i]]) # column's price dates
      rowlocations <- 0 # initialize/reset
      rowlocations <- match(pricedates, days)   # date's new row number
      for(j in 1:length(rowlocations)){
        # within each cell, place appropriate price
        newdat[rowlocations[j],i] <- df[j,pricecolumns[i]]
      }
    }
    colnames(newdat) <- colnames(df[,pricecolumns])
    head(newdat)

Afterwards I've been looking to the xts package to help me make this into something I can analyze by doing apply.monthly() and rollapply() since the raw data is much more extensive.

Thank you very much for your thoughts and criticisms.

Evan Friedland
  • 2,750
  • 1
  • 9
  • 25

2 Answers2

0

Here's a way, using array indexing which is the most efficient way to populate a matrix with values, AFAIK:

## convert data to long format
long <- within(reshape(df,
                       varying       = list(datecolumns, pricecolumns),
                       v.names       = c('Date', 'Item'),
                       new.row.names = seq(prod(dim(df[datecolumns]))),
                       times         = paste0('Item', seq(datecolumns)),
                       timevar       = 'Id',
                       direction     = 'long')[-4],
               Date <- mdyyyy(Date))

long <- na.omit(long)                   # remove NAs

## create empty matrix
out <- matrix(NA, length(days), length(pricecolumns),
              dimnames=list(as.character(days), names(df)[pricecolumns]))

## fill it with values from long
out[with(long, cbind(as.character(Date), Id))] <- long$Item
Ernest A
  • 6,838
  • 7
  • 30
  • 38
0

Not entirely sure if this is what you're after, but here's a method that uses the dplyr and tidyr packages to convert your data structure into a long format with separate Date and Item (what I assume is price) columns. Whatever you're trying to do, you should find it a bit easier to work with this. Note, df is the data frame provided in the question.

library(tidyr)
library(dplyr)

d <- df %>%
  mutate(row = 1:n()) %>% 
  gather(key, value, -row) %>%
  extract(key, c("var", "ref"), "(Date|Item)([0-9]*)") %>%
  spread(var, value)

head(d)
#>   row ref       Date Item
#> 1   1   1 10/31/2011  650
#> 2   1  10 10/31/2011 1040
#> 3   1   2 10/31/2011  590
#> 4   1   3  12/5/2011  775
#> 5   1   4 10/31/2011  660
#> 6   1   5 10/31/2011  705

Aside, this is based on an answer to a previous post: Gather multiple sets of columns

If you want to spread this out into a table-like structure, here's the same as above with a few extra lines:

d <- df %>%
  mutate(row = 1:n()) %>% 
  gather(key, value, -row) %>%
  extract(key, c("var", "ref"), "(Date|Item)([0-9]*)") %>%
  spread(var, value) %>%
  mutate(ref = paste0("Item", ref)) %>% 
  spread(ref, Item) %>% 
  select(-row)

head(d)
#>         Date Item1 Item10 Item2 Item3 Item4 Item5 Item6 Item7 Item8 Item9
#> 1 10/31/2011   650   1040   590  <NA>   660   705   612   630  <NA>   622
#> 2  12/5/2011  <NA>   <NA>  <NA>   775  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>
#> 3       <NA>  <NA>   <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>
#> 4  11/7/2011   650   1040   590  <NA>   700   705   612   630  <NA>   622
#> 5 12/12/2011  <NA>   <NA>  <NA>   775  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>
#> 6       <NA>  <NA>   <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>  <NA>
Community
  • 1
  • 1
Simon Jackson
  • 2,816
  • 10
  • 22