I'm trying to write this in a very functional manner. We're using Highland.js to manage the stream processing, but because I'm so new to it I think I'm getting really confused about how to deal with this unique situation.

The issue here is that the data in the file stream is not consistent. The first line of a file is typically the header, which we want to store in memory and zip against every row in the stream that follows.
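For example, given a hypothetical input like:

name,age
Alice,30
Bob,25

each data row should come out zipped against the header, e.g. {"name":"Alice","age":"30"}.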

Here's my first go at it:

var _      = require('highland');
var fs     = require('fs');
var stream = fs.createReadStream('./data/gigfile.txt');
var output = fs.createWriteStream('output.txt');

var headers = [];

var through = _.pipeline(
    _.split(),
    _.head(),
    _.doto(function(col) {
        headers = col.split(',');
        return headers;
    }),

    ......

    _.splitBy(','),
    _.zip(headers),
    _.wrapCallback(process)
);

_(stream)
    .pipe(through)
    .pipe(output);

The first command in the pipeline splits the file into lines. The next grabs the header, and the doto stores it in a global variable. The problem is that the rest of the lines never come through the stream, so the process is blocked...likely because of the head() command above it.
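From what I can tell, head() ends the stream after its first value, so nothing downstream ever sees the remaining lines:

var _ = require('highland');

// head() emits only the first value and then ends the stream,
// so later steps in the pipeline never receive the other lines.
_([1, 2, 3])
    .head()
    .toArray(function(xs) {
        console.log(xs); // => [ 1 ]
    });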

I've tried a few other variations, but I feel this example gives you a sense of where I need to go with it.

Any guidance on this would be helpful. It also raises the question: if I have different values in each of my rows, how can I split the processing stream among a number of different stream operations of variable length/complexity?

Thanks.

EDIT: I've produced a better result, but I'm questioning its efficiency. Is there a way I can optimize this so that I'm not checking whether the headers have been recorded on every row? This still feels sloppy.

var headers; // unset until the header row has been seen

var through = _.pipeline(
    _.split(),
    _.filter(function(row) {
        // Pass rows through once the header is known
        // (empty rows also pass and are dealt with later)
        if (!row || headers) {
            return true;
        }
        // First non-empty row is the header: capture and drop it
        headers = row.split(',');
        return false;
    }),
    _.map(function(row) {
        return row.split(',');
    }),
    _.batch(500),
    _.compact(),
    _.map(function(row) {
        return JSON.stringify(row) + "\n";
    })
);

_(stream)
    .pipe(through)
    .pipe(output);

1 Answer

You can use Stream.observe() or Stream.fork() to split the stream.
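Roughly, fork() adds another consumer that shares back-pressure with the original (all forks must pull for data to flow), while observe() gives you a passive copy that sees values as the source is consumed, without applying any back-pressure of its own. A minimal sketch of observe():

var _ = require('highland');

var source = _([1, 2, 3]);

// observe() returns a passive copy: it receives each value as the
// source is consumed elsewhere, without pulling data itself.
var observed = source.observe();

observed.each(function(x) {
    console.log('observed:', x);
});

// Consuming the source drives the observed copy as well.
source.each(function(x) {
    console.log('main:', x);
});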

var _      = require('highland');
var fs     = require('fs');
var stream = fs.createReadStream('./data/gigfile.txt');
var output = fs.createWriteStream('output.txt');
var through = _.pipeline(function(s) {
    var headerStream, headers;
    // setup a shared variable to store the headers
    headers = [];
    // setup the csv processing
    s = s
        // split input into lines
        .split()
        // remove empty lines
        .compact()
        // split lines into arrays
        .map(function(row) {
            return row.split(',');
        });
    // create a new stream to grab the header
    headerStream = s.observe();
    // pause the original stream
    s.pause();
    // setup processing of the non-header rows
    s = s
        // drop the header row
        .drop(1)
        // convert the rest of the rows to objects
        .map(function(row) {
            var obj = headers.reduce(function(obj, key, i) {
                obj[key] = row[i];
                return obj;
            }, {});
            return JSON.stringify(obj) + "\n";
        });
    // grab the first row from the header stream
    // save the headers and then resume the normal stream
    headerStream.head().toArray(function(rows) {
        headers = rows[0];
        s.resume();
    });
    return s;
});
_(stream)
    .pipe(through)
    .pipe(output);

That being said, your CSV parsing doesn't account for newlines and commas escaped within your values. Typically, that is done in CSV files by wrapping the value in double quotes, and a literal double quote is then escaped by putting two next to each other. It's a bit tricky to get right, so I would recommend using a package that handles it, such as fast-csv.
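For illustration, here's how a naive split(',') mangles a quoted field (hypothetical data):

var line = '"Smith, John","She said ""hi"""';

// Two fields, but a naive split sees three, with the quotes left in:
line.split(',');
// => [ '"Smith', ' John"', '"She said ""hi"""' ]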

Then your code could look like this (with headers: true, fast-csv emits each row as an object keyed by the header fields, and ignoreEmpty skips blank lines):

var _      = require('highland');
var fs     = require('fs');
var csv    = require('fast-csv');
var stream = fs.createReadStream('./data/gigfile.txt');
var output = fs.createWriteStream('output.txt');

_(stream.pipe(csv({headers: true, ignoreEmpty: true})))
    .map(function(row) {
        return JSON.stringify(row) + "\n";
    })
    .pipe(output);