
I have a big file with more than 89 million lines. I want to read the file, pass each line into a hash table, and then make some calculations.

The problem is that reading the file with `istream` and inserting the lines into the hash table is too slow.

Is there any way to read the file with multiple threads, e.g. with a thread library?

Or should I cut the file into small pieces and use a thread for each piece?

The hash function does not take much time to compute.

For collisions I am using linked lists. The size of the table is 1 million.

    // Adding_date_too_file.cpp : This file contains the 'main' function. Program execution begins and ends there.
    //

    #include "pch.h"
    #include <iostream>
    #include <string>
    #include <fstream>
    #include "hash.h"

    using namespace std;
    int main()
    {
        hasho Hashy;
        string f1, f2, f3, f4, f6;
        fstream file_input;
        fstream file_2_Collums;

        cout << "Please give the name of the input file to load into the hash table:\n(Include the format extension (txt, csv, etc.); its first column must be sorted.)\n" << flush;
        while (true)
        {
            string infilename;
            getline(cin, infilename);
            file_input.open(infilename.c_str());
            if (file_input) break;
            cout << "Invalid file. Please enter a valid input file name> " << flush;
        }

        cout << "Please give the name of the two-column file used for searching:\n(Include the format extension (txt, csv, etc.); its first column must be sorted.)\n" << flush;
        while (true)
        {
            string infilename;
            getline(cin, infilename);
            file_2_Collums.open(infilename.c_str());
            if (file_2_Collums) break;
            cout << "Invalid file. Please enter a valid input file name> " << flush;
        }
        //creating output file

        int count_file_lines = 0;

        if (file_input.is_open())
        {
            // Skip the header line.
            getline(file_input, f1, '\n');

            // Each line has tab-separated columns; only the patent id
            // (column 2) and the date (column 5) are stored in the table.
            while (getline(file_input, f1, '\t'))      // column 1
            {
                getline(file_input, f2, '\t');         // column 2: patent id
                getline(file_input, f3, '\t');         // column 3 (skipped)
                getline(file_input, f3, '\t');         // column 4 (skipped)
                getline(file_input, f6, '\t');         // column 5: date
                getline(file_input, f3, '\n');         // rest of the line (skipped)

                Hashy.AddItem(f2, f6);
                count_file_lines++;
                // Printing every line number is itself a big slowdown over
                // 89 million lines; report progress only once in a while.
                if (count_file_lines % 1000000 == 0)
                    cout << count_file_lines << " lines read" << endl;
            }
        }

        int lines_2 = 0;

        if (file_2_Collums.is_open())
        {
            Hashy.openOutputFile();
            while (getline(file_2_Collums, f1, '\t'))  // patent_id
            {
                getline(file_2_Collums, f4, '\n');     // assignee_id

                Hashy.FindDateId(f1, f4);
                lines_2++;
            }
        }





    system("pause");
    return 0;}

Hash.cpp

  #include "pch.h"
#include <iostream>
#include <string>
#include "hash.h"

#include "hash.h"
#include <fstream>
using namespace std;
static ofstream output_file;

    hasho::hasho()
    {
        // Initialize every bucket with a sentinel "empty" node.
        for (int i = 0; i < tableSize; i++) {
            HashTable[i] = new item;
            HashTable[i]->pattent_id = "empty";
            HashTable[i]->date = "empty";
            HashTable[i]->next = NULL;
        }
    }

    void hasho::openOutputFile() {

        cout << "Please give the name of the output file:\n(The file should end with its format extension (txt, csv, etc.)) " << flush;
        string outfilename;
        getline(cin, outfilename);
        output_file.open(outfilename.c_str(), fstream::out);
    }

    int hasho::NumberOfItemsInIndex(int index) {

        int count = 0;
        if (HashTable[index]->date == "empty") {
            return count;
        }
        else {
            count++;
            item* ptr = HashTable[index];
            while (ptr->next != NULL) {
                count++;
                ptr = ptr->next;
            }
        }
        return count;
    }

    void hasho::PrintTable() {

        int number;

        for (int i = 0; i < tableSize; i++) {
            number = NumberOfItemsInIndex(i);
            cout << "---------------------------------------\n";
            cout << "index= " << i << endl;
            cout << HashTable[i]->pattent_id << endl;
            cout << HashTable[i]->date << endl;
            cout << "# of items= " << number << endl;
            cout << "---------------------------------------\n";
        }
    }


    void hasho::PrintItemsInIndex(int index) {

        item* ptr = HashTable[index];

        if (ptr->date == "empty") {
            cout << "index = " << index << " is empty." << endl;
        }
        else {
            cout << "index = " << index << " contains the following items\n";
            while (ptr != NULL) {
                cout << "-----------" << endl;
                cout << ptr->date << endl;
                cout << ptr->pattent_id << endl;
                cout << "-----------" << endl;
                ptr = ptr->next;
            }
        }
    }



    void hasho::AddItem(string pattend_id, string date)
    {
        int index = Hash(pattend_id);

        if (HashTable[index]->pattent_id == "empty")
        {
            HashTable[index]->pattent_id = pattend_id;
            HashTable[index]->date = date;
        }
        else {
            // Insert right after the bucket head instead of walking to the
            // end of the chain: appending at the tail makes every insertion
            // O(chain length), which dominates the run time when millions of
            // items land in 300003 buckets.
            item* n = new item;
            n->pattent_id = pattend_id;
            n->date = date;
            n->next = HashTable[index]->next;
            HashTable[index]->next = n;
        }
    }

    void hasho::FindDateId(string pattend_id, string assignee_id1) {

        int index = Hash(pattend_id);
        bool foundDateId = false;
        item* ptr = HashTable[index];

        // Walk the chain in this bucket looking for the patent id.
        while (ptr != NULL) {
            if (ptr->pattent_id == pattend_id) {
                foundDateId = true;
                // Write the three fields on one line; the original code
                // printed the date after the endl, which pushed it onto
                // the start of the next record's line.
                output_file << pattend_id << "\t" << assignee_id1 << "\t" << ptr->date << endl;
            }
            ptr = ptr->next;
        }
        // Printing "found"/"not found" for every lookup is a significant
        // slowdown; keep it for debugging only.
        //if (!foundDateId) cout << "not found: " << pattend_id << endl;
    }

    int hasho::Hash(string key)
    {
        unsigned int hash = 0;

        // Weighted sum of the characters of the key.
        for (unsigned int i = 0; i < key.length(); i++) {
            hash = hash + (int)key[i] * (i + 1);
        }

        return hash % tableSize;
    }

Hash.h

    #pragma once

    #ifndef HASH_H
    #define HASH_H

    #include <string>

    #pragma comment(linker, "/STACK:7000000")
    #pragma comment(linker, "/HEAP:7000000")

    class hasho {
    private:
        static const int tableSize = 300003;

        struct item {
            std::string pattent_id;
            std::string date;
            item* next;
        };

        item* HashTable[tableSize];

    public:
        hasho();
        int Hash(std::string key);
        void AddItem(std::string pattend_id, std::string date);
        int NumberOfItemsInIndex(int index);
        void PrintTable();
        void PrintItemsInIndex(int index);
        void FindDateId(std::string pattent_id, std::string assignee_id);
        void openOutputFile();
    };

    #endif // HASH_H
  • Profile your code to see what is slow. Post your code here. – Maxim Egorushkin Jan 07 '19 at 12:11
  • With that said, if you have a text file with variable-length lines, it's *very* hard to use threads to simultaneously read different *lines* of the file. You could let each thread read a fixed-sized *chunk* of the file though, and then puzzle together the chunks into a coherent piece of text afterwards. – Some programmer dude Jan 07 '19 at 12:13
  • Also note that on rotating-platter disks (and not an SSD) using threads could actually *lower* the performance, as more threads attempting to use the disk at the same time need more movement of the read/write heads. – Some programmer dude Jan 07 '19 at 12:16
  • @MaximEgorushkin I uploaded the code – Γιαννης Καβαζιδης Jan 07 '19 at 12:31
  • Now you only need to profile your code. In case you're not familiar with that term, look up what profiling means. Your code isn't suitable for posting here, btw, because it should be minimal and self-contained, i.e. a [mcve]. – Ulrich Eckhardt Jan 07 '19 at 12:36
  • You should also *explain* what kind of data that file contains and what you do with it (and how often). – Basile Starynkevitch Jan 07 '19 at 12:46
  • Please [edit](https://stackoverflow.com/posts/54074180/edit) your question to improve it and give a lot more context and motivation. What is the size (in gigabytes) of your file? What kind of data does it contain? (Genomics, geographical maps, ...?) Are you processing the same file *several times*? Why don't you use standard [containers](https://en.cppreference.com/w/cpp/container)? Have you enough RAM? – Basile Starynkevitch Jan 07 '19 at 12:48
  • On which operating system, which file system, which hardware (RAM, disk)? – Basile Starynkevitch Jan 07 '19 at 12:54
  • Sounds like it's time to learn how to use a database. – The Quantum Physicist Jan 07 '19 at 13:30
  • @TheQuantumPhysicist Do you mean that using only SQL or PL/SQL will be faster than using C++? – Γιαννης Καβαζιδης Jan 07 '19 at 13:59
  • You could use [sqlite](http://sqlite.org/) – Basile Starynkevitch Jan 07 '19 at 14:08
  • @ΓιαννηςΚαβαζιδης I mean that using a flat file for storing this amount of data is a bad idea. If you use a database, like postgresql or mysql or conservatively sqlite (since it produces a file and can have thread locking problems, except when you only read), you may be able to do all kinds of parallelism with no problem. – The Quantum Physicist Jan 07 '19 at 14:14
  • No matter how many CPUs your computer has, your hard drive still has only one interface, and a single CPU probably has enough power to run that interface at full speed. – Solomon Slow Jan 07 '19 at 14:55

1 Answer


> I have a big file, more than 89 million lines

You probably shouldn't, if you are thinking of using several threads to process it. And you should explain what that huge file contains (what kind of data: genomics, time series, ...) and its size (in gigabytes). Are you processing the same file once, or several times? How much time (as measured by time(1)) do you need to process it? How much time does wc(1) need to count the lines?

A possibility might be to split that file in several smaller files (e.g. using split(1)) made of entire lines, and feed your program with those smaller files. I don't know if that would help you (probably not, unless you do several runs of programs reading these files).

Another possibility could be to make two passes over the file. The first pass would count the lines, and perhaps remember the start offset of some of them (e.g. of every 1024th line). Then you might process the file in parallel in the second pass, by reusing the remembered offsets.
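For illustration, here is a minimal sketch of that two-pass scheme (assuming a plain line-oriented text file; the file name, the block size of 1024 lines, and the four worker threads are placeholder choices, and each worker merely counts lines where your real per-line work would go):

    #include <cstddef>
    #include <fstream>
    #include <iostream>
    #include <string>
    #include <thread>
    #include <vector>

    int main() {
        const char* path = "big_file.tsv";            // hypothetical input file

        // Pass 1: remember the byte offset of the start of every 1024th line.
        std::vector<std::streampos> offsets{0};
        {
            std::ifstream in(path);
            std::string line;
            for (long n = 1; std::getline(in, line); ++n)
                if (n % 1024 == 0)
                    offsets.push_back(in.tellg());    // start of line n + 1
        }

        // Pass 2: four threads stride over the 1024-line blocks in parallel.
        std::vector<long> counts(4, 0);
        std::vector<std::thread> workers;
        for (int t = 0; t < 4; ++t) {
            workers.emplace_back([&, t] {
                std::ifstream in(path);               // one stream per thread
                std::string line;
                for (std::size_t b = t; b < offsets.size(); b += 4) {
                    in.clear();                       // reset a possible EOF state
                    in.seekg(offsets[b]);
                    for (int i = 0; i < 1024 && std::getline(in, line); ++i)
                        ++counts[t];                  // real per-line work goes here
                }
            });
        }
        for (auto& w : workers) w.join();

        long total = 0;
        for (long c : counts) total += c;
        std::cout << "lines processed: " << total << '\n';
    }

Each worker needs its own ifstream: a single stream object cannot be shared between threads without locking, which would serialize the reads again.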

BTW, if your huge file is so big that it does not stay in the page cache, your problem is IO bound (and the bottleneck is the physical disk hardware), and you won't gain any speed by trying to parallelize it (even by splitting it in smaller files).

A possibility might be to read and parse once (and slowly) your huge file, and maybe fill some database (perhaps an sqlite one) with its data. Then you might (if you are processing that data several times) take advantage of accessing that database (instead of that file).
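As a rough sketch of that one-time import using the sqlite3 C API (link with -lsqlite3; the database name, the table layout, and the column positions are invented for the example, and error checking is omitted):

    #include <sqlite3.h>
    #include <fstream>
    #include <sstream>
    #include <string>

    int main() {
        sqlite3* db;
        sqlite3_open("patents.db", &db);             // hypothetical database file
        sqlite3_exec(db, "CREATE TABLE IF NOT EXISTS patents(id TEXT, date TEXT);",
                     nullptr, nullptr, nullptr);
        sqlite3_exec(db, "BEGIN;", nullptr, nullptr, nullptr);

        sqlite3_stmt* ins;
        sqlite3_prepare_v2(db, "INSERT INTO patents VALUES(?, ?);", -1, &ins, nullptr);

        std::ifstream in("big_file.tsv");            // hypothetical input file
        std::string line, skip, id, date;
        std::getline(in, line);                      // skip the header line
        while (std::getline(in, line)) {
            std::istringstream fields(line);
            std::getline(fields, skip, '\t');        // column 1
            std::getline(fields, id, '\t');          // column 2: patent id
            std::getline(fields, skip, '\t');        // column 3
            std::getline(fields, skip, '\t');        // column 4
            std::getline(fields, date, '\t');        // column 5: date
            sqlite3_bind_text(ins, 1, id.c_str(), -1, SQLITE_TRANSIENT);
            sqlite3_bind_text(ins, 2, date.c_str(), -1, SQLITE_TRANSIENT);
            sqlite3_step(ins);
            sqlite3_reset(ins);
        }
        sqlite3_finalize(ins);
        sqlite3_exec(db, "COMMIT;", nullptr, nullptr, nullptr);
        sqlite3_close(db);
    }

Wrapping all the inserts in one BEGIN/COMMIT transaction matters here: committing each row separately forces a disk sync per insert, which would be hopeless for 89 million rows. An index on the id column (CREATE INDEX) then makes the later lookups fast.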

Regarding your hash table, consider instead using standard C++ containers (e.g. std::unordered_map).
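A minimal sketch of the loading loop with std::unordered_map, assuming the tab-separated layout from the question's code (note that 89 million string pairs will need several gigabytes of RAM):

    #include <fstream>
    #include <iostream>
    #include <sstream>
    #include <string>
    #include <unordered_map>

    int main() {
        // patent id -> date; the container does the hashing and collision
        // handling that the hand-written hasho class implements itself.
        std::unordered_map<std::string, std::string> dates;
        dates.reserve(89000000);                     // avoid rehashing during the load

        std::ifstream in("big_file.tsv");            // hypothetical input file
        std::string line, skip, id, date;
        std::getline(in, line);                      // skip the header line
        while (std::getline(in, line)) {
            std::istringstream fields(line);
            std::getline(fields, skip, '\t');        // column 1
            std::getline(fields, id, '\t');          // column 2: patent id
            std::getline(fields, skip, '\t');        // column 3
            std::getline(fields, skip, '\t');        // column 4
            std::getline(fields, date, '\t');        // column 5: date
            dates[id] = date;
        }

        auto it = dates.find("some-patent-id");      // average O(1) lookup
        if (it != dates.end())
            std::cout << it->second << '\n';
    }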

PS. We still don't know what that huge file is, what it contains, or how you process it.

Basile Starynkevitch
  • Are these small files that you are suggesting going to be read by multiple threads and inserted into a hash table? – Γιαννης Καβαζιδης Jan 07 '19 at 12:38
  • What is the point of splitting the file? The hard drive I/O speed (like 0.5 GB/s for SSD) will not change by splitting the file up. – Öö Tiib Jan 07 '19 at 12:39
  • Splitting the file and having more threads read it, for example 4 million lines per thread: I suppose the reading time will be reduced since more threads are processing the file. I am not sure though. – Γιαννης Καβαζιδης Jan 07 '19 at 12:45
  • The file contains 89 million lines and 9 columns. I take 3 columns from that file and store them in a hash table. Then I use another file, which contains 3 columns; I match its first column against the column of the bigger file, and create a new file from the comparison. That is why I am using a hash table. – Γιαννης Καβαζιδης Jan 07 '19 at 12:50
  • Don't comment on this answer, but do edit your question. Explain: how did you get that file? Do you process it only once or several times? And why can't you use `std::unordered_map` or `std::map`? – Basile Starynkevitch Jan 07 '19 at 12:51