#include "stdafx.h" #include "token.h" using namespace std; Tokenizer::Tokenizer() { _config = NULL; Clear(); } Tokenizer::~Tokenizer() { Clear(); } void Tokenizer::Clear() { if (_config) { delete _config; _config = NULL; } _lineNumber = 0; _position = 0; _nextPosition = 0; _noMoreTokens = true; } bool Tokenizer::InitFromIstream(istream* is) { if (!is || is->eof()) { return false; } Clear(); _config = is; if (!getline(*_config, _line)) { Clear(); return false; } else { _noMoreTokens = false; return Next(); } } bool Tokenizer::InitFromString(const char* spec) { // the tokenizer will free this once it is done with it istringstream *iss = new istringstream(spec); return InitFromIstream(iss); } bool Tokenizer::Init(const char* filename) { ASSERT(filename); ifstream* ifs = new ifstream(filename); ASSERT(ifs); return InitFromIstream(ifs); } // copy the line parsed so far into the whitespace and finish it off with a null terminator void Tokenizer::FinishWhitespace(u4* len, u4 off) { if (*len < c_maxTokenLength) { u4 copyLen = (off - _nextPosition); if (copyLen) { if (copyLen + *len > c_maxTokenLength) copyLen = c_maxTokenLength - off; memcpy(_whitespace+*len, &_line[_nextPosition], copyLen); for (u4 i = 0; i < copyLen; ++i) { if (_whitespace[i+*len] == '\r') { _whitespace[i+*len] = '\n'; } } *len += copyLen; } } _whitespace[*len] = 0; _nextPosition = off; } // if this is the end of the line, move to the next nonempty line // if this is the end of the file, return false bool Tokenizer::HandleNewLine(u4* len, u4* off) { if (*off == _line.length()) { FinishWhitespace(len, *off); if ((*len) < c_maxTokenLength) { _whitespace[(*len)++] = '\n'; } if (!getline(*_config, _line)) { // end of file _position = 0; _nextPosition = 0; *off = 0; _noMoreTokens = true; return false; } else { ++_lineNumber; _nextPosition = 0; } *off = 0; } return true; } // record a token that is a single character void Tokenizer::AcceptSingleChar(u4 *off) { _text[0] = _line[(*off)++]; _text[1] = 0; } // record a token that is a quoted string void Tokenizer::AcceptQuotedString(u4 *off) { ASSERT(_line[*off] == '\"'); ++*off; u4 len = 0; for (;;) { if (*off == _line.length()) { // we are probably parsing paragraphs break; } if (len >= c_maxTokenLength) { // we are probably parsing paragraphs break; } char c = _line[(*off)++]; if (c == '\"') { break; } _text[len++] = c; } _text[len] = 0; } // record a token that is a quoted string void Tokenizer::AcceptSingleQuotedString(u4 *off) { ASSERT(_line[*off] == '\''); ++*off; u4 len = 0; for (;;) { if (*off == _line.length()) { // we are probably parsing paragraphs break; } if (len >= c_maxTokenLength) { // we are probably parsing paragraphs break; } char c = _line[(*off)++]; if (c == '\'') { break; } _text[len++] = c; } _text[len] = 0; } // record a token that is a number void Tokenizer::AcceptNumber(u4 *off) { bool periodSeen = false; u4 start = *off; if (_line[*off] == '-') { ++*off; } for (;;) { if (*off == _line.length()) { break; } char c = _line[*off]; if (c == '.') { if (periodSeen) { break; } else { periodSeen = true; } } else if (c >= '0' && c <= '9') { ; } else { if (c == 'e' || c == 'E') { // handle scientific notation ++*off; if (*off == _line.length()) { break; } c = _line[*off]; if (c == '+' || c == '-') { ++*off; if (*off == _line.length()) { break; } c = _line[*off]; } while (*off < _line.length() && _line[*off] >= '0' && _line[*off] <= '9') { ++*off; } break; } else { break; } } ++*off; } u4 len = *off-start; if (len > c_maxTokenLength) { len = c_maxTokenLength; } 
memcpy(_text, &_line[start], len); _text[*off-start] = 0; } // record a token that is an identifier void Tokenizer::AcceptIdentifier(u4 *off) { u4 len = 0; _text[len++] = _line[(*off)++]; for (;;) { if (len >= c_maxTokenLength) { Throw("identifier is too long"); } if (*off == _line.length()) { break; } char c = _line[*off]; if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_') { _text[len++] = _line[(*off)++]; } else { break; } } _text[len] = 0; } // parse any number of whitespace and then a token, return true if a token was parsed bool Tokenizer::Next() { _position = _nextPosition; _whitespace[0] = 0; _text[0] = 0; bool parsedToken = false; u4 len = 0; // length of whitespace so far u4 off = _nextPosition; // offset of current character in current line while (!parsedToken && HandleNewLine(&len, &off)) { char c = _line[off]; switch(c) { case ' ': case '\t': case '\n': case '\r': // single character whitespace off++; break; case '<': if (off+4 < _line.length() && _line[off+1] == '!' && _line[off+2] == '-' && _line[off+3] == '-') { // xml/html comment off += 4; while (HandleNewLine(&len, &off)) { if (off+3 < _line.length() && _line[off] == '-' && _line[off+1] == '-' && _line[off+2] == '>') { // now we have found the end of the comment off += 3; break; } else { ++off; } } } else { // single character token FinishWhitespace(&len, off); AcceptSingleChar(&off); parsedToken = true; } break; case '\"': // quoted value, find the ending quote FinishWhitespace(&len, off); AcceptQuotedString(&off); parsedToken = true; break; case '\'': // quoted value, find the ending quote FinishWhitespace(&len, off); AcceptSingleQuotedString(&off); parsedToken = true; break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': case '.': case '-': // integer FinishWhitespace(&len, off); AcceptNumber(&off); parsedToken = true; break; default: // identifier FinishWhitespace(&len, off); if (off == _line.length()) { ; } else if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')) { AcceptIdentifier(&off); parsedToken = true; } else { AcceptSingleChar(&off); parsedToken = true; } break; } } _nextPosition = off; return parsedToken; } bool Tokenizer::Match(const char* text) { return (_stricmp(text, _text) == 0); } bool Tokenizer::Skip(const char* text) { if (Match(text)) { Next(); return true; } else { return false; } } void Tokenizer::Throw(const char* text, ...) { char x[4096]; char y[4096]; va_list args; va_start(args, text); vsprintf_s(x, text, args); va_end(args); sprintf_s( y, "error at line %lu, position %lu, after ###:\n%.*s ### %s\n%s", _lineNumber, _position, (int)_position, _line.c_str(), &_line[_position], x); throw CosmosException(y); } bool Tokenizer::Expect(const char* text) { if (!Match(text)) { Throw("expected %s", text); } return Next(); } int Tokenizer::Color() { // six hexadecimal digits might form one or many tokens ... this was a bad format design ... 
    char color[7];
    size_t colorlen = 0;
    for (;;) {
        size_t len = strlen(_text);
        if (colorlen + len > 6) {
            Throw("too many digits in 6-digit color");
        }
        memcpy(&color[colorlen], _text, len);
        colorlen += len;
        if (colorlen == 6) { break; }
        if (!Next()) {
            Throw("color had missing digits");
        }
    }
    color[6] = 0;
    int rgb;
    if (1 != sscanf_s(color, "%x", &rgb)) {
        Throw("could not parse 6-digit color");
    }
    // windows wants the bytes backwards: xor-swap the red and blue bytes
    rgb ^= ((rgb & 0xff) << 16);
    rgb ^= (rgb >> 16);
    rgb ^= ((rgb & 0xff) << 16);
    return rgb;
}

void Tokenizer::UnitTest() {
    Tokenizer t;
    ASSERT(t.InitFromString("<a>\r testing .-123 \"hi ho\"\n\r\n\'bob\' 1.4 BOO </a>"));
    ASSERT(strcmp(t.Whitespace(), "") == 0);
    ASSERT(strcmp(t.Text(), "<") == 0);
    ASSERT(t.Next());
    ASSERT(strcmp(t.Whitespace(), "") == 0);
    ASSERT(strcmp(t.Text(), "a") == 0);
    ASSERT(t.Next());
    ASSERT(strcmp(t.Whitespace(), "") == 0);
    ASSERT(strcmp(t.Text(), ">") == 0);
    ASSERT(t.Next());
    ASSERT(strcmp(t.Whitespace(), "\n ") == 0);
    ASSERT(strcmp(t.Text(), "testing") == 0);
    ASSERT(t.Next());
    ASSERT(strcmp(t.Whitespace(), " ") == 0);
    ASSERT(strcmp(t.Text(), ".") == 0);
    ASSERT(t.Next());
    ASSERT(strcmp(t.Whitespace(), "") == 0);
    ASSERT(strcmp(t.Text(), "-123") == 0);
    ASSERT(t.Next());
    ASSERT(strcmp(t.Whitespace(), " ") == 0);
    ASSERT(strcmp(t.Text(), "hi ho") == 0);
    ASSERT(t.Next());
    ASSERT(strcmp(t.Whitespace(), "\n\n\n") == 0);
    ASSERT(strcmp(t.Text(), "bob") == 0);
    ASSERT(t.Next());
    ASSERT(strcmp(t.Whitespace(), " ") == 0);
    ASSERT(strcmp(t.Text(), "1.4") == 0);
    ASSERT(t.Next());
    ASSERT(strcmp(t.Whitespace(), " ") == 0);
    ASSERT(strcmp(t.Text(), "BOO") == 0);
    ASSERT(t.Next());
    ASSERT(strcmp(t.Whitespace(), " ") == 0);
    ASSERT(strcmp(t.Text(), "<") == 0);
    ASSERT(t.Next());
    ASSERT(strcmp(t.Whitespace(), "") == 0);
    ASSERT(strcmp(t.Text(), "/") == 0);
    ASSERT(t.Next());
    ASSERT(strcmp(t.Whitespace(), "") == 0);
    ASSERT(strcmp(t.Text(), "a") == 0);
    ASSERT(t.Next());
    ASSERT(strcmp(t.Whitespace(), "") == 0);
    ASSERT(strcmp(t.Text(), ">") == 0);
    ASSERT(!t.Next());
    printf("unit tested Tokenizer\n");
}
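
// A minimal usage sketch, not part of the original interface: it drives the tokenizer over a
// small made-up tag-style spec using only calls defined above (InitFromString, Expect, Color,
// Next). The tag name "color" and the spec string are invented for illustration.
void TokenizerUsageSketch() {
    Tokenizer t;
    // InitFromString already parses the first token, so Text()/Match() are valid on success
    if (!t.InitFromString("<color> ff8000 </color>")) {
        return;
    }
    t.Expect("<");          // Expect throws CosmosException on a mismatch, otherwise advances
    t.Expect("color");
    t.Expect(">");
    int rgb = t.Color();    // gathers six hex digits, possibly split across several tokens
    t.Next();               // Color() leaves the last hex chunk as the current token
    t.Expect("<");
    t.Expect("/");
    t.Expect("color");
    t.Expect(">");
    printf("parsed color %06x\n", rgb); // bytes are swapped into Windows 0x00BBGGRR order
}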