I have prepared a small benchmark program for measuring different ways of parsing. The problem comes with the huge decrease in performance when using a stream and a custom function for storing a date as a time_t + double.
The weird boost spirit trait for std::string is because seek backtracking fills the variable string with all the common parts of non-matching lines until a line that matches is found.
Sorry for the source code quality (copy/paste, bad variable names, weak indentation...). I am aware that this benchmark code is not going to be included in Clean Code book, so please ignore this fact and let's focus on the subject.
I understand that the fastest way is using a string without backtracking, but time increment for stream is really strange. Can someone explain me what is going on?
#include <boost/fusion/adapted/struct.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix.hpp>
#include <boost/spirit/repository/include/qi_seek.hpp>
#include <boost/chrono/chrono.hpp>
#include <iomanip>
#include <ctime>
typedef std::string::const_iterator It;
namespace structs {
struct Timestamp {
std::time_t date;
double ms;
friend std::istream& operator>> (std::istream& stream, Timestamp& time)
{
struct std::tm tm;
if (stream >> std::get_time(&tm, "%Y-%b-%d %H:%M:%S") >> time.ms)
time.date = std::mktime(&tm);
return stream;
}
};
struct Record1 {
std::string date;
double time;
std::string str;
};
struct Record2 {
Timestamp date;
double time;
std::string str;
};
typedef std::vector<Record1> Records1;
typedef std::vector<Record2> Records2;
}
BOOST_FUSION_ADAPT_STRUCT(structs::Record1,
(std::string, date)
(double, time)
(std::string, str))
BOOST_FUSION_ADAPT_STRUCT(structs::Record2,
(structs::Timestamp, date)
(double, time)
(std::string, str))
namespace boost { namespace spirit { namespace traits {
template <typename It>
struct assign_to_attribute_from_iterators<std::string, It, void> {
static inline void call(It f, It l, std::string& attr) {
attr = std::string(&*f, std::distance(f,l));
}
};
} } }
namespace qi = boost::spirit::qi;
namespace QiParsers {
template <typename It>
struct Parser1 : qi::grammar<It, structs::Record1()>
{
Parser1() : Parser1::base_type(start) {
using namespace qi;
start = '[' >> raw[*~char_(']')] >> ']'
>> " - " >> double_ >> " s"
>> " => String: " >> raw[+graph]
>> eol;
}
private:
qi::rule<It, structs::Record1()> start;
};
template <typename It>
struct Parser2 : qi::grammar<It, structs::Record2()>
{
Parser2() : Parser2::base_type(start) {
using namespace qi;
start = '[' >> stream >> ']'
>> " - " >> double_ >> " s"
>> " => String: " >> raw[+graph]
>> eol;
}
private:
qi::rule<It, structs::Record2()> start;
};
template <typename It>
struct Parser3 : qi::grammar<It, structs::Records1()>
{
Parser3() : Parser3::base_type(start) {
using namespace qi;
using boost::phoenix::push_back;
line = '[' >> raw[*~char_(']')] >> ']'
>> " - " >> double_ >> " s"
>> " => String: " >> raw[+graph];
ignore = *~char_("\r\n");
start = (line[push_back(_val, _1)] | ignore) % eol;
}
private:
qi::rule<It> ignore;
qi::rule<It, structs::Record1()> line;
qi::rule<It, structs::Records1()> start;
};
template <typename It>
struct Parser4 : qi::grammar<It, structs::Records2()>
{
Parser4() : Parser4::base_type(start) {
using namespace qi;
using boost::phoenix::push_back;
line = '[' >> stream >> ']'
>> " - " >> double_ >> " s"
>> " => String: " >> raw[+graph];
ignore = *~char_("\r\n");
start = (line[push_back(_val, _1)] | ignore) % eol;
}
private:
qi::rule<It> ignore;
qi::rule<It, structs::Record2()> line;
qi::rule<It, structs::Records2()> start;
};
}
template<typename Parser, typename Container>
Container parse_seek(It b, It e, const std::string& message)
{
static const Parser parser;
Container records;
boost::chrono::high_resolution_clock::time_point t0 = boost::chrono::high_resolution_clock::now();
parse(b, e, *boost::spirit::repository::qi::seek[parser], records);
boost::chrono::high_resolution_clock::time_point t1 = boost::chrono::high_resolution_clock::now();
auto elapsed = boost::chrono::duration_cast<boost::chrono::milliseconds>(t1 - t0);
std::cout << "Elapsed time: " << elapsed.count() << " ms (" << message << ")\n";
return records;
}
template<typename Parser, typename Container>
Container parse_ignoring(It b, It e, const std::string& message)
{
static const Parser parser;
Container records;
boost::chrono::high_resolution_clock::time_point t0 = boost::chrono::high_resolution_clock::now();
parse(b, e, parser, records);
boost::chrono::high_resolution_clock::time_point t1 = boost::chrono::high_resolution_clock::now();
auto elapsed = boost::chrono::duration_cast<boost::chrono::milliseconds>(t1 - t0);
std::cout << "Elapsed time: " << elapsed.count() << " ms (" << message << ")\n";
return records;
}
static const std::string input1 = "[2018-Mar-01 00:00:00.000000] - 1.000 s => String: Valid_string\n";
static const std::string input2 = "[2018-Mar-02 00:00:00.000000] - 2.000 s => I dont care\n";
static std::string input("");
int main() {
const int N1 = 10;
const int N2 = 100000;
input.reserve(N1 * (input1.size() + N2*input2.size()));
for (int i = N1; i--;)
{
input += input1;
for (int j = N2; j--;)
input += input2;
}
const auto records1 = parse_seek<QiParsers::Parser1<It>, structs::Records1>(input.begin(), input.end(), "std::string + seek");
const auto records2 = parse_seek<QiParsers::Parser2<It>, structs::Records2>(input.begin(), input.end(), "stream + seek");
const auto records3 = parse_ignoring<QiParsers::Parser3<It>, structs::Records1>(input.begin(), input.end(), "std::string + ignoring");
const auto records4 = parse_ignoring<QiParsers::Parser4<It>, structs::Records2>(input.begin(), input.end(), "stream + ignoring");
return 0;
}
The results in the console are:
Elapsed time: 1445 ms (std::string + seek)
Elapsed time: 21519 ms (stream + seek)
Elapsed time: 860 ms (std::string + ignoring)
Elapsed time: 19046 ms (stream + ignoring)