I have a big file with more than 89 million lines. I want to read the file, pass it into a hash table, and then make some calculations.
The problem is that, with istream, reading the file and passing it into the hash table is too slow.
Is there any possibility of reading a file using more threads? With a thread library?
Or do I have to cut the file into small pieces and then use a thread for each piece?
The hash function does not take much time to be calculated.
For collisions I am using lists. The table size is 1 million.
// Adding_date_too_file.cpp : This file contains the 'main' function. Program execution begins and ends there.
//
#include "pch.h"
#include <iostream>
#include <string>
#include "hash.h"
#include <iostream>
#include <fstream>
using namespace std;
int main()
{
hasho Hashy;
string f1, f2, f3, f4, f5, f6, f7;
bool is_first_line = true;
fstream file_input;
fstream file_2_Collums;
cout << "Please give the name of the file that you want to run: \n(The file should end with the format type (txt,csv etc.)) and it has to have the first column sorted !! (file with only two column\n which is going to be used for searching based on that file)" << flush;
while (true)
{
string infilename;
getline(cin, infilename);
file_input.open(infilename.c_str());
if (file_input)break;
cout << "Invalid file. Please enter a valid input file name> " << flush;
}
cout << "Please give the name of the file that you want to run: \n(The file should end with the format type (txt,csv etc.)) and it has to have the first column sorted !! (file with only one column )" << flush;
while (true)
{
string infilename;
getline(cin, infilename);
file_2_Collums.open(infilename.c_str());
if (file_2_Collums)break;
cout << "Invalid file. Please enter a valid input file name> " << flush;
}
//creating output file
int * table;
table = new int[2];
int count_file_lines = 0;
int line_counter_inventors = 0;
if (file_input.is_open())
{
while (!file_input.eof())
{
if (is_first_line == true) {
getline(file_input, f1, '\n');
is_first_line = false;
}
getline(file_input, f1, '\t');// patent id
getline(file_input, f2, '\t');// patent id
getline(file_input, f3, '\t');// patent id
getline(file_input, f3, '\t');// patent id
getline(file_input, f6, '\t');// patent id
getline(file_input, f3, '\n');//date
//cout << "adding these items " << f1 << '\t' << f6 << endl;
Hashy.AddItem(f2, f6);
cout << count_file_lines << endl;
count_file_lines++;
// cout << f2 << '\t' << f6 << endl;
}
}
int lines_2 = 0;
if (file_2_Collums.is_open())
{
Hashy.openOutputFile();
while (!file_2_Collums.eof())
{
getline(file_2_Collums, f1, '\t');//patent_id
getline(file_2_Collums, f4, '\n');//assignee_id
//cout << f1 << endl;
Hashy.FindDateId(f1, f4);
lines_2++;
}
}
system("pause");
return 0;}
Hash.cpp
#include "pch.h"
#include "hash.h"
#include "hash.h"
#include <fstream>
#include <iostream>
#include <string>
#include <utility>
using namespace std;
// Output stream shared by the hasho member functions in this file:
// openOutputFile() opens it and FindDateId() writes the matches to it.
// `static` keeps the name local to this translation unit.
static ofstream output_file;
// Builds the table: every bucket starts out holding one placeholder node
// whose pattent_id and date are the literal "empty" and which has no successor.
hasho::hasho()
{
    for (int bucket = 0; bucket != tableSize; ++bucket) {
        item* placeholder = new item;
        placeholder->next = NULL;
        placeholder->date = "empty";
        placeholder->pattent_id = "empty";
        HashTable[bucket] = placeholder;
    }
}
void hasho::openOutputFile() {
cout << "Please give the name of the output file: \n(The file should end with the format type (txt,csv etc.)) " << flush;
while (true)
{
string infilename;
getline(cin, infilename);
output_file.open(infilename.c_str(), fstream::out);
break;
}
}
int hasho::NumberOfItemsInIndex(int index) {
int count = 0;
if (HashTable[index]->date == "empty") {
return count;
}
else {
count++;
item* ptr = HashTable[index];
while (ptr->next != NULL) {
count++;
ptr = ptr->next;
}
}
return count;
}
void hasho::PrintTable() {
int number;
for (int i = 0; i < tableSize; i++) {
number = NumberOfItemsInIndex(i);
cout << "---------------------------------------\n";
cout << "index= " << i << endl;
cout << HashTable[i]->pattent_id << endl;
cout << HashTable[i]->date << endl;
cout << "# of items= " << number << endl;
cout << "---------------------------------------\n";
}
}
void hasho::PrintItemsInIndex(int index) {
item* ptr = HashTable[index];
if (ptr->date == "empty") {
cout << "index = " << index << " is empty." << endl;
}
else {
cout << "index = " << index << " contains the following items\n";
while (ptr != NULL) {
cout << "-----------" << endl;
cout << ptr->date << endl;
cout << ptr->pattent_id << endl;
cout << "-----------" << endl;
ptr = ptr->next;
}
}
}
// Stores the pair (pattend_id, date) in the bucket selected by Hash(pattend_id).
//
// If the bucket's first node still holds the "empty" placeholder id, the pair
// is written into that node; otherwise a new node is appended to the end of
// the bucket's chain, so items keep their insertion order. Duplicate ids are
// not merged.
//
// Both parameters are taken by value and moved into the node, which avoids a
// second copy of each string per insertion.
//
// NOTE(review): appending walks the whole chain, so an insertion costs time
// proportional to the chain length. A per-bucket tail pointer would make it
// constant-time without changing the order, but needs a change to the class.
void hasho::AddItem(string pattend_id, string date)
{
    // The index must be computed before the id is moved away.
    item* node = HashTable[Hash(pattend_id)];

    if (node->pattent_id == "empty") {
        node->pattent_id = std::move(pattend_id);
        node->date = std::move(date);
        return;
    }

    item* added = new item;
    added->pattent_id = std::move(pattend_id);
    added->date = std::move(date);
    added->next = nullptr;

    while (node->next != nullptr) {
        node = node->next;
    }
    node->next = added;
}
// Looks up `pattend_id` in its bucket and, for every stored item with that
// id, writes one line to the file-scope `output_file`:
//
//     <pattend_id> TAB <assignee_id1> TAB <date> NEWLINE
//
// Previously the line break was written before the date, so each date landed
// at the start of the following record instead of on its own record.
//
// Afterwards a short per-lookup report goes to stdout: "found 1" if at least
// one item matched (otherwise 0) and "not found 1" if none did (otherwise 0).
//
// '\n' is used instead of endl so the streams are not flushed on every
// lookup; they are flushed when their buffers fill and when they are closed.
void hasho::FindDateId(string pattend_id, string assignee_id1) {
    int found = 0;

    for (const item* node = HashTable[Hash(pattend_id)]; node != nullptr; node = node->next) {
        if (node->pattent_id == pattend_id) {
            output_file << pattend_id << '\t' << assignee_id1 << '\t' << node->date << '\n';
            found = 1;
        }
    }

    const int nfound = (found == 0) ? 1 : 0;
    cout << "found " << found << '\n'
         << "not found " << nfound << '\n'
         << '\n';
}
// Maps `key` to a bucket index in [0, tableSize).
//
// Uses a polynomial hash (multiplier 31) over the key's bytes. The previous
// position-weighted sum of character codes has a very small range for short
// keys: eight characters with codes up to 127 can reach at most
// 127 * (1 + 2 + ... + 8) = 4572, so only a small part of the table was ever
// used and the chains in those few buckets grew very long.
//
// Bytes are read as unsigned char so the result does not depend on whether
// plain char is signed. Unsigned arithmetic wraps around, which is
// well-defined and intended here.
int hasho::Hash(string key)
{
    unsigned int hash = 0;
    for (string::size_type i = 0; i < key.length(); ++i) {
        hash = hash * 31u + static_cast<unsigned char>(key[i]);
    }
    return static_cast<int>(hash % static_cast<unsigned int>(tableSize));
}
Hash.h
#pragma once
#include "pch.h"
#include <iostream>
#include <string>
//#include "hash.cpp"
using namespace std;
#pragma comment(linker, "/STACK:7000000")
#pragma comment(linker, "/HEAP:7000000")
#ifndef HASH_H
#define HASH_H
// Fixed-size hash table with separate chaining that maps a patent id string
// to a date string. Duplicate ids are kept as separate items.
//
// Ownership: every node reachable from HashTable is allocated with `new` and
// owned by the table. The destructor releases them; copying is disabled
// because a copy would share the same nodes and free them twice.
//
// Size: the bucket array lives inside the object (tableSize pointers), so an
// instance is large. Give it static or heap storage if stack space is tight.
class hasho {
private:
    // Number of buckets.
    static const int tableSize = 300003;

    // One node of a bucket's singly linked chain.
    struct item {
        std::string pattent_id;
        std::string date;
        item* next;
    };

    // First node of every bucket; the constructor fills each slot with a
    // placeholder node whose fields read "empty".
    item* HashTable[tableSize];

public:
    // Creates the table with one placeholder node per bucket.
    hasho();

    // Frees every node of every chain.
    ~hasho() {
        for (int i = 0; i < tableSize; ++i) {
            item* node = HashTable[i];
            while (node != nullptr) {
                item* following = node->next;
                delete node;
                node = following;
            }
        }
    }

    // The table owns raw pointers, so it must not be copied.
    hasho(const hasho&) = delete;
    hasho& operator=(const hasho&) = delete;

    // Returns the bucket index for `key`, in [0, tableSize).
    int Hash(std::string key);

    // Stores (pattend_id, date) in the bucket chosen by Hash(pattend_id).
    void AddItem(std::string pattend_id, std::string date);

    // Returns the number of items stored in bucket `index`.
    int NumberOfItemsInIndex(int index);

    // Prints a summary of every bucket to stdout.
    void PrintTable();

    // Prints all items stored in bucket `index` to stdout.
    void PrintItemsInIndex(int index);

    // Writes the stored date(s) for `pattent_id`, together with
    // `assgnee_id`, to the output file opened by openOutputFile().
    void FindDateId(std::string pattent_id, std::string assgnee_id);

    // Asks the user for an output file name and opens that file.
    void openOutputFile();
};
#endif // ! HASH_H