I originally had the below code using std::wstring
and was using wide strings that were statically typed into the code.
Later I learned that UTF-8 will "fit" into std::string
and that there was no real need for std::wstring
but that I might need some encoding translations later on. So I have a UTF-8 encoded text file that I'm reading in.
#include <cstddef>
#include <fstream>
#include <iostream>
#include <string>
/// Minimal keyword scanner over a seekable input stream.
///
/// Recognises the literal tokens "MIDDLE", "BEGIN", "END" and a single space,
/// reporting each one on std::cout. Matching is done by reading ahead and then
/// seeking back, so the stream must support tellg()/seekg().
class A
{
public:
    /// @param stream Input to scan. Held by reference; it must outlive this object.
    A(std::istream& stream)
        : m_stream(stream),
          m_lineNumber(1),
          m_characterNumber(1)
    {
    }

    /// Consumes known tokens until the end of the stream.
    ///
    /// @return true if the whole input consisted of known tokens (an empty
    ///         input counts as success); false if unrecognised input was
    ///         encountered or the stream reported an unrecoverable error.
    bool OutputKnownWords()
    {
        // good() alone stays true after the last token has been consumed, which
        // would trigger one more round of matching that can only fail. Peeking
        // first detects end-of-input before any token is tried.
        while(m_stream.good() &&
              m_stream.peek() != std::istream::traits_type::eof())
        {
            if(Take("MIDDLE"))
                std::cout << "Found middle" << std::endl;
            else if(Take("BEGIN"))
                std::cout << "Found begin" << std::endl;
            else if(Take("END"))
                std::cout << "Found end" << std::endl;
            else if(Take(" "))
                std::cout << "parsed out space" << std::endl;
            else
                return false;
        }
        return !m_stream.bad();
    }

protected:
    /// Extracts one character and updates the line/character counters.
    ///
    /// @return The extracted character. At end of input the (narrowed) EOF
    ///         value is returned and the counters are left untouched.
    std::istream::char_type Get()
    {
        const auto c = m_stream.get();
        if(c == std::istream::traits_type::eof())
            return static_cast<std::istream::char_type>(c);

        ++m_characterNumber;
        if(c == '\n')
        {
            ++m_lineNumber;
            m_characterNumber = 1;
        }
        return static_cast<std::istream::char_type>(c);
    }

    /// Consumes @p str from the stream if it is the next thing in the input.
    ///
    /// @return true if @p str was matched and consumed, false otherwise (the
    ///         read position is then unchanged).
    bool Take(const std::string& str)
    {
        if(!Match(str))
            return false;
        // Consume through Get() so the position counters stay in sync.
        for(std::string::size_type i = 0; i < str.size(); ++i)
            Get();
        return true;
    }

    /// Tests whether the upcoming input equals @p str without consuming it.
    ///
    /// The comparison is byte-wise: str.size() counts chars (bytes), not
    /// Unicode code points, which is what read() needs.
    ///
    /// @return true if the next str.size() bytes equal @p str.
    bool Match(const std::string& str)
    {
        const auto cursorPos = m_stream.tellg();
        if(cursorPos == std::istream::pos_type(-1))
            return false; // position unknown: cannot look ahead safely

        std::string readStr(str.size(), '\0');
        m_stream.read(&readStr[0], static_cast<std::streamsize>(str.size()));
        const bool matched =
            static_cast<std::size_t>(m_stream.gcount()) == str.size() &&
            readStr == str;

        // A short read sets eofbit|failbit; clear it so the seek below works
        // and later attempts with shorter tokens are still possible.
        if(!m_stream.good())
            m_stream.clear();
        m_stream.seekg(cursorPos);
        return matched;
    }

    std::istream& m_stream;        ///< Input being scanned (not owned).
    std::size_t m_lineNumber;      ///< 1-based line of the next character.
    std::size_t m_characterNumber; ///< 1-based column of the next character.
};
/// Opens "test.txt", scans it for known tokens and reports failures on
/// std::cerr.
///
/// @return 0 on success, 1 if the file cannot be opened or contains
///         unrecognised input.
int main()
{
    // Open in binary mode: the scanner relies on tellg()/seekg() round-trips,
    // and text-mode newline translation can make those offsets disagree with
    // the bytes actually read on some platforms. Note that in binary mode a
    // CRLF file delivers '\r' to the scanner unchanged.
    std::ifstream file("test.txt", std::ios::in | std::ios::binary);
    if(!file.is_open())
    {
        std::cerr << "could not open file" << std::endl;
        return 1;
    }

    A a(file);
    if(!a.OutputKnownWords())
    {
        std::cerr << "something went wrong" << std::endl;
        return 1;
    }
    return 0;
}
test.txt
BEGIN MIDDLE
END
So I would expect that this program outputs:
Found begin
parsed out space
Found middle
parsed out space
Found end
However, OutputKnownWords
returns an error. I stepped through with the debugger and I found that the seekg
calls in Match
appear not to be setting the correct position. It is as if each test is off by one character.
When I was doing this with wide strings statically typed I had no problem.
I sort of think this might be related to the difference between UTF-8 encoding vs std::string
's idea of a "character". But I'm not sure how then to handle how many "characters" are in an std::string
.
This isn't related to the question "tellg() function give wrong size of file?", because I'm not doing anything with the cursor from tellg
other than using it to reset the position.