#include "stdafx.h" #include "token.h" using namespace std; Tokenizer::Tokenizer() { _config = NULL; Clear(); } Tokenizer::~Tokenizer() { Clear(); } void Tokenizer::Clear() { if (_config) { delete _config; _config = NULL; } _lineNumber = 0; _position = 0; _nextPosition = 0; _noMoreTokens = true; } bool Tokenizer::InitFromIstream(istream* is) { if (!is || is->eof()) { return false; } Clear(); _config = is; if (!getline(*_config, _line)) { Clear(); return false; } else { _noMoreTokens = false; return Next(); } } bool Tokenizer::InitFromString(const char* spec) { // the tokenizer will free this once it is done with it istringstream *iss = new istringstream(spec); return InitFromIstream(iss); } bool Tokenizer::Init(const char* filename) { ASSERT(filename); ifstream* ifs = new ifstream(filename); ASSERT(ifs); return InitFromIstream(ifs); } // copy the line parsed so far into the whitespace and finish it off with a null terminator void Tokenizer::FinishWhitespace(u4* len, u4 off) { if (*len < c_maxTokenLength) { u4 copyLen = (off - _nextPosition); if (copyLen) { if (copyLen + *len > c_maxTokenLength) copyLen = c_maxTokenLength - off; memcpy(_whitespace+*len, &_line[_nextPosition], copyLen); for (u4 i = 0; i < copyLen; ++i) { if (_whitespace[i+*len] == '\r') { _whitespace[i+*len] = '\n'; } } *len += copyLen; } } _whitespace[*len] = 0; _nextPosition = off; } // if this is the end of the line, move to the next nonempty line // if this is the end of the file, return false bool Tokenizer::HandleNewLine(u4* len, u4* off) { if (*off == _line.length()) { FinishWhitespace(len, *off); if ((*len) < c_maxTokenLength) { _whitespace[(*len)++] = '\n'; } if (!getline(*_config, _line)) { // end of file _position = 0; _nextPosition = 0; *off = 0; _noMoreTokens = true; return false; } else { ++_lineNumber; _nextPosition = 0; } *off = 0; } return true; } // record a token that is a single character void Tokenizer::AcceptSingleChar(u4 *off) { _text[0] = _line[(*off)++]; _text[1] = 0; } // record a token that is a quoted string void Tokenizer::AcceptQuotedString(u4 *off) { ASSERT(_line[*off] == '\"'); ++*off; u4 len = 0; for (;;) { if (*off == _line.length()) { // we are probably parsing paragraphs break; } if (len >= c_maxTokenLength) { // we are probably parsing paragraphs break; } char c = _line[(*off)++]; if (c == '\"') { break; } _text[len++] = c; } _text[len] = 0; } // record a token that is a quoted string void Tokenizer::AcceptSingleQuotedString(u4 *off) { ASSERT(_line[*off] == '\''); ++*off; u4 len = 0; for (;;) { if (*off == _line.length()) { // we are probably parsing paragraphs break; } if (len >= c_maxTokenLength) { // we are probably parsing paragraphs break; } char c = _line[(*off)++]; if (c == '\'') { break; } _text[len++] = c; } _text[len] = 0; } // record a token that is a number void Tokenizer::AcceptNumber(u4 *off) { bool periodSeen = false; u4 start = *off; if (_line[*off] == '-') { ++*off; } for (;;) { if (*off == _line.length()) { break; } char c = _line[*off]; if (c == '.') { if (periodSeen) { break; } else { periodSeen = true; } } else if (c >= '0' && c <= '9') { ; } else { if (c == 'e' || c == 'E') { // handle scientific notation ++*off; if (*off == _line.length()) { break; } c = _line[*off]; if (c == '+' || c == '-') { ++*off; if (*off == _line.length()) { break; } c = _line[*off]; } while (*off < _line.length() && _line[*off] >= '0' && _line[*off] <= '9') { ++*off; } break; } else { break; } } ++*off; } u4 len = *off-start; if (len > c_maxTokenLength) { len = c_maxTokenLength; } 
memcpy(_text, &_line[start], len); _text[*off-start] = 0; } // record a token that is an identifier void Tokenizer::AcceptIdentifier(u4 *off) { u4 len = 0; _text[len++] = _line[(*off)++]; for (;;) { if (len >= c_maxTokenLength) { Throw("identifier is too long"); } if (*off == _line.length()) { break; } char c = _line[*off]; if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_') { _text[len++] = _line[(*off)++]; } else { break; } } _text[len] = 0; } // parse any number of whitespace and then a token, return true if a token was parsed bool Tokenizer::Next() { _position = _nextPosition; _whitespace[0] = 0; _text[0] = 0; bool parsedToken = false; u4 len = 0; // length of whitespace so far u4 off = _nextPosition; // offset of current character in current line while (!parsedToken && HandleNewLine(&len, &off)) { char c = _line[off]; switch(c) { case ' ': case '\t': case '\n': case '\r': // single character whitespace off++; break; case '<': if (off+4 < _line.length() && _line[off+1] == '!' && _line[off+2] == '-' && _line[off+3] == '-') { // xml/html comment off += 4; while (HandleNewLine(&len, &off)) { if (off+3 < _line.length() && _line[off] == '-' && _line[off+1] == '-' && _line[off+2] == '>') { // now we have found the end of the comment off += 3; break; } else { ++off; } } } else { // single character token FinishWhitespace(&len, off); AcceptSingleChar(&off); parsedToken = true; } break; case '\"': // quoted value, find the ending quote FinishWhitespace(&len, off); AcceptQuotedString(&off); parsedToken = true; break; case '\'': // quoted value, find the ending quote FinishWhitespace(&len, off); AcceptSingleQuotedString(&off); parsedToken = true; break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': case '.': case '-': // integer FinishWhitespace(&len, off); AcceptNumber(&off); parsedToken = true; break; default: // identifier FinishWhitespace(&len, off); if (off == _line.length()) { ; } else if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) { AcceptIdentifier(&off); parsedToken = true; } else { AcceptSingleChar(&off); parsedToken = true; } break; } } _nextPosition = off; return parsedToken; } bool Tokenizer::Match(const char* text) { return (_stricmp(text, _text) == 0); } bool Tokenizer::Skip(const char* text) { if (Match(text)) { Next(); return true; } else { return false; } } void Tokenizer::Throw(const char* text, ...) { char x[4096]; char y[4096]; va_list args; va_start(args, text); vsprintf_s(x, text, args); va_end(args); sprintf_s( y, "error at line %lu, position %lu, after ###:\n%.*s ### %s\n%s", _lineNumber, _position, (int)_position, _line.c_str(), &_line[_position], x); throw CosmosException(y); } bool Tokenizer::Expect(const char* text) { if (!Match(text)) { Throw("expected %s", text); } return Next(); } int Tokenizer::Color() { // six hexadecimal digits might form one or many tokens ... this was a bad format design ... 
    char color[7];
    size_t colorlen = 0;
    for (;;) {
        size_t len = strlen(_text);
        if (colorlen + len > 6) {
            Throw("too many digits in 6-digit color");
        }
        memcpy(&color[colorlen], _text, len);
        colorlen += len;
        if (colorlen == 6) { break; }
        if (!Next()) {
            Throw("color had missing digits");
        }
    }
    color[6] = 0;
    int rgb;
    if (1 != sscanf_s(color, "%x", &rgb)) {
        Throw("could not parse 6-digit color");
    }
    // windows wants the bytes backwards: xor-swap the red and blue bytes
    rgb ^= ((rgb & 0xff) << 16);
    rgb ^= (rgb >> 16);
    rgb ^= ((rgb & 0xff) << 16);
    return rgb;
}

void Tokenizer::UnitTest() {
    Tokenizer t;
    ASSERT(t.InitFromString("<a>\r testing .-123 \"hi ho\"\n\r\n\'bob\' 1.4 BOO </a>"));
    ASSERT(strcmp(t.Whitespace(), "") == 0);
    ASSERT(strcmp(t.Text(), "<") == 0);
    ASSERT(t.Next());
    ASSERT(strcmp(t.Whitespace(), "") == 0);
    ASSERT(strcmp(t.Text(), "a") == 0);
    ASSERT(t.Next());
    ASSERT(strcmp(t.Whitespace(), "") == 0);
    ASSERT(strcmp(t.Text(), ">") == 0);
    ASSERT(t.Next());
    ASSERT(strcmp(t.Whitespace(), "\n ") == 0);
    ASSERT(strcmp(t.Text(), "testing") == 0);
    ASSERT(t.Next());
    ASSERT(strcmp(t.Whitespace(), " ") == 0);
    ASSERT(strcmp(t.Text(), ".") == 0);
    ASSERT(t.Next());
    ASSERT(strcmp(t.Whitespace(), "") == 0);
    ASSERT(strcmp(t.Text(), "-123") == 0);
    ASSERT(t.Next());
    ASSERT(strcmp(t.Whitespace(), " ") == 0);
    ASSERT(strcmp(t.Text(), "hi ho") == 0);
    ASSERT(t.Next());
    ASSERT(strcmp(t.Whitespace(), "\n\n\n") == 0);
    ASSERT(strcmp(t.Text(), "bob") == 0);
    ASSERT(t.Next());
    ASSERT(strcmp(t.Whitespace(), " ") == 0);
    ASSERT(strcmp(t.Text(), "1.4") == 0);
    ASSERT(t.Next());
    ASSERT(strcmp(t.Whitespace(), " ") == 0);
    ASSERT(strcmp(t.Text(), "BOO") == 0);
    ASSERT(t.Next());
    ASSERT(strcmp(t.Whitespace(), " ") == 0);
    ASSERT(strcmp(t.Text(), "<") == 0);
    ASSERT(t.Next());
    ASSERT(strcmp(t.Whitespace(), "") == 0);
    ASSERT(strcmp(t.Text(), "/") == 0);
    ASSERT(t.Next());
    ASSERT(strcmp(t.Whitespace(), "") == 0);
    ASSERT(strcmp(t.Text(), "a") == 0);
    ASSERT(t.Next());
    ASSERT(strcmp(t.Whitespace(), "") == 0);
    ASSERT(strcmp(t.Text(), ">") == 0);
    ASSERT(!t.Next());
    printf("unit tested Tokenizer\n");
}
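
// A minimal usage sketch, not part of the original interface: it drives the tokenizer over a
// small made-up tag-style spec using only calls defined above (InitFromString, Expect, Color,
// Next). The tag name "color" and the spec string are invented for illustration.
void TokenizerUsageSketch() {
    Tokenizer t;
    // InitFromString already parses the first token, so Text()/Match() are valid on success
    if (!t.InitFromString("<color> ff8000 </color>")) {
        return;
    }
    t.Expect("<");          // Expect throws CosmosException on a mismatch, otherwise advances
    t.Expect("color");
    t.Expect(">");
    int rgb = t.Color();    // gathers six hex digits, possibly split across several tokens
    t.Next();               // Color() leaves the last hex chunk as the current token
    t.Expect("<");
    t.Expect("/");
    t.Expect("color");
    t.Expect(">");
    printf("parsed color %06x\n", rgb); // bytes are swapped into Windows 0x00BBGGRR order
}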