Recently, we had a requirement where there are more that 100,000 xml files and all of them needed modification of particular data in the xml. Simple perl command would do the job but perl was not installed on the machine where the files are located. Hence I wrote a small C# code to do the job.
private static void ModifyXML(string[] args)
{
Stopwatch sw = new Stopwatch();
sw.Start();
string path = @args[0];
string opath = @args[1];
string token = "value_date=\"20121130\"";
string target = "value_date=\"20121019\"";
Parallel.ForEach(Directory.EnumerateFiles(path), (file) =>
{
StringBuilder sb = new StringBuilder(File.ReadAllText(file));
sb.Remove(0, 55);
sb.Replace(token, target);
var filename = file.Split(new char[] { '\\' }).Last();
File.WriteAllText(string.Format("{0}\\{1}", opath, filename), sb.ToString());
});
TimeSpan ts = sw.Elapsed;
Console.WriteLine("Took {0} secs", ts.TotalSeconds);
}
I decided to implement C++ version. It turned out that the C++ version was not significantly faster than C# version. In ran both versions sevaral times.In fact, it's as fast as C# version during some of the runs.
For C# I used .NET 4.0 and for C++ it's VC10.
void FileHandling(std::string src, std::string dest)
{
namespace fs = boost::filesystem;
auto start = boost::chrono::system_clock::now();
string token = "value_date=\"20121130\"";
string target = "value_date=\"20121019\"";
fs::directory_iterator end_iter;
fs::directory_iterator dir_itr(src);
vector<fs::path> files;
files.insert(files.end(), dir_itr, end_iter);
string dest_path = dest + "\\";
parallel_for_each(files.begin(), files.end(), [=](const fs::path& filepath)
{
ifstream inpfile (filepath.generic_string());
string line;
line.insert(line.end(), istreambuf_iterator<char>(inpfile), istreambuf_iterator<char>());
line.erase(0, 55);
auto index = line.find(token, 0);
if (index != string::npos)
{
line.replace(index, token.size(), target);
}
ofstream outfile(dest_path + filepath.filename().generic_string());
outfile << line;
});
boost::chrono::duration<double> finish = boost::chrono::system_clock::now() - start;
std::cout << "Took " << finish.count() << " secs\n";
}