/*
 * A C++ scanner that uses the longest match scanning method. This example
 * differs from other examples of scanning. Each run of the state machine
 * matches one token. This method results in a smaller state machine since the
 * final kleene star is omitted and therefore every state does not need to get
 * all the transitions of the start state.
 *
 * << <= <<= >> >= >>= are left out since angle brackets are used in templates.
 *
 * In this example the finishing action operator "@" is used to remember
 * that a token was matched every time the machine enters into a final state.
 * After the machine moves into the error state we take the last matching
 * token. If two tokens match the same input, the token referenced later in the
 * machine construction will have its actions executed second (Ragel ensures
 * this) and will overwrite any previous token.
 */

#include <stdio.h>
#include <string.h>
#include <stdlib.h>

/*
 * Token ids stored in tok by the scanner. Single character symbols use their
 * own character value as the id, so the named ids start at 256 to stay clear
 * of that range.
 */

/* Literals and identifiers. */
#define TK_Dlit            256
#define TK_Slit            257
#define TK_Float           258
#define TK_Id              259

/* Multi-character operators. TK_ShiftLeft and TK_ShiftRight are defined but
 * no rule of the machine in this file assigns them. */
#define TK_NameSep         260
#define TK_Arrow           261
#define TK_PlusPlus        262
#define TK_MinusMinus      263
#define TK_ArrowStar       264
#define TK_DotStar         265
#define TK_ShiftLeft       266
#define TK_ShiftRight      267

/* Integer literals. */
#define TK_IntegerDecimal  268
#define TK_IntegerOctal    269
#define TK_IntegerHex      270

/* Comparison and logical operators. */
#define TK_EqualsEquals    271
#define TK_NotEquals       272
#define TK_AndAnd          273
#define TK_OrOr            274

/* Compound assignment operators. */
#define TK_MultAssign      275
#define TK_DivAssign       276
#define TK_PercentAssign   277
#define TK_PlusAssign      278
#define TK_MinusAssign     279
#define TK_AmpAssign       280
#define TK_CaretAssign     281
#define TK_BarAssign       282

/* Ellipsis. */
#define TK_DotDotDot       283

/* Tokens that are recognized but never passed to token(). */
#define TK_Whitespace      284
#define TK_Comment         285

/* Size in bytes of the input buffer. A token that is not whitespace or a
 * comment must fit inside it, otherwise main() gives up with
 * "TOKEN TOO BIG". */
#define BUFSIZE (16 * 1024)

/* Character pushed into the buffer once the real input is exhausted, to
 * flush out the last real token. The scanner never emits the final token it
 * sees, so this sentinel has to scan as whitespace; the value 0 does, since
 * whitespace is every character outside 33..126. */
#define LAST_CHAR 0

/* Id of the last token the scanner matched; the machine's init sets it to 0. */
int tok;

/* Input buffer shared by main() and the scanner actions. */
char buf[BUFSIZE];

/* First character of the token currently being scanned. */
char *tokstart;

/* Last character (inclusive) of the most recent match. */
char *tokend;

void token( char *data, int len );

/* Set by the scanner on entering whitespace or a comment, cleared again when
 * that whitespace or comment has been consumed. */
bool discard = false;

/* Position counters maintained by token(). */
int line = 1;
int col = 1;

void token( char *data, int len )
{
	printf( "<%i> ", tok );
	for ( int i = 0; i < len; i++ )
		fputc( data[i], stdout );
	fputc( '\n', stdout );
	
	/* Count newlines and columns. This code is here mainly for having some
	 * code in the token routine when commenting out the above output during
	 * performance testing. */
	for ( int i = 0; i < len; i ++ ) {
		if ( data[i] == '\n' ) {
			line += 1;
			col = 1;
		}
		else {
			col += 1;
		}
	}
}

/*
 * Scanner machine. One run of the machine matches one token:
 *  - the "@" action attached to each token records the id of the most recent
 *    match in tok,
 *  - the "@" action on main records where that match ended in tokend,
 *  - onError runs when the machine fails, hands the last match to token()
 *    (unless it is whitespace or a comment) and restarts the machine.
 * tok == 0 means that no token has been matched in the current run.
 */
%% Scanner
	struct {};
	init { 
		tok = 0;
		tokend = 0;
	}

	# Single and double literals.
	slit = ( 'L'? "'" ( [^'\\\n] | /\\./ )* "'" ) @{tok = TK_Slit;};
	dlit = ( 'L'? '"' ( [^"\\\n] | /\\./ )* '"' ) @{tok = TK_Dlit;};

	# Identifiers
	id = ( [a-zA-Z_] [a-zA-Z0-9_]* ) @{tok = TK_Id;};

	# Floating literals.
	fract_const = digit* '.' digit+ | digit+ '.';
	exponent = [eE] [+\-]? digit+;
	float_suffix = [flFL];
	float = 
		( fract_const exponent? float_suffix? |
		digit+ exponent float_suffix? ) @{tok = TK_Float;};
	
	# Integer decimal. Leading part buffered by float.
	integer_decimal = ( ( '0' | [1-9] [0-9]* ) [ulUL]{0,3} ) @{tok = TK_IntegerDecimal;};

	# Integer octal. Leading part buffered by float.
	integer_octal = ( '0' [0-9]+ [ulUL]{0,2} ) @{tok = TK_IntegerOctal;};

	# Integer hex. Leading 0 buffered by float.
	integer_hex = ( '0' ( 'x' [0-9a-fA-F]+ [ulUL]{0,2} ) ) @{tok = TK_IntegerHex;};

	# Only buffer the second item, first buffered by symbol.
	namesep = '::' @{tok = TK_NameSep;};
	deqs = '==' @{tok = TK_EqualsEquals;};
	neqs = '!=' @{tok = TK_NotEquals;};
	and_and = '&&' @{tok = TK_AndAnd;};
	or_or = '||' @{tok = TK_OrOr;};
	mult_assign = '*=' @{tok = TK_MultAssign;};
	div_assign = '/=' @{tok = TK_DivAssign;};
	percent_assign = '%=' @{tok = TK_PercentAssign;};
	plus_assign = '+=' @{tok = TK_PlusAssign;};
	minus_assign = '-=' @{tok = TK_MinusAssign;};
	amp_assign = '&=' @{tok = TK_AmpAssign;};
	caret_assign = '^=' @{tok = TK_CaretAssign;};
	bar_assign = '|=' @{tok = TK_BarAssign;};
	plus_plus = '++' @{tok = TK_PlusPlus;};
	minus_minus = '--' @{tok = TK_MinusMinus;};
	arrow = '->' @{tok = TK_Arrow;};
	arrow_star = '->*' @{tok = TK_ArrowStar;};
	dot_star = '.*' @{tok = TK_DotStar;};

	# Three char compounds, first item already buffered.
	dot_dot_dot = '...' @{tok = TK_DotDotDot;};

	# All compounds
	compound = namesep | deqs | neqs | and_and | or_or | mult_assign |
			div_assign | percent_assign | plus_assign | minus_assign |
			amp_assign | caret_assign | bar_assign | plus_plus | minus_minus |
			arrow | arrow_star | dot_star | dot_dot_dot;

	# Single char symbols. The token id is the character itself.
	symbol = ( punct - [_"'] ) @{tok = fc;};

	action discard {
		/* Entering whitespace or a comment: tell main() that the buffered
		 * data does not have to be kept. */
		discard = true;
	}

	# Comments and whitespace.
	commc = '/*' @discard ( any* $0 '*/' @1 ) @{tok = TK_Comment;};
	commcc = '//' @discard ( any* $0 '\n' @1 ) @{tok = TK_Comment;};
	whitespace = ( any - 33..126 )+ >discard @{tok = TK_Whitespace;};

	# All outside code tokens.
	tokens = ( 
		id | slit | dlit | float | integer_decimal | 
		integer_octal | integer_hex | compound | symbol |
		commc | commcc | whitespace );

	action onError {
		/* The machine cannot continue. If a token was matched in this run,
		 * handle it and restart; otherwise do nothing here and let the
		 * failure reach the caller. */
		if ( tok != 0 ) {
			char *rst_data;
			int rst_len;

			if ( tok == TK_Comment || tok == TK_Whitespace ) {
				/* Reset comment status, don't send. */
				discard = false;

				/* Restart right at the error point if consuming whitespace or
				 * a comment. Consume may have spanned multiple buffers. */
				rst_data = fpc;
				rst_len = (fbuf + fblen) - fpc;
			}
			else {
				/* Send the token. tokend points at its last character. */
				token( tokstart, tokend - tokstart + 1 );

				/* Restart right after the token. */
				rst_data = tokend+1;
				rst_len = (fbuf + fblen) - tokend - 1;
			}

			tokstart = rst_data;

			/* Forget the token that was just handled. If tok kept its old
			 * value and the next run failed before reaching a final state
			 * (an unterminated literal, for example), the stale tok and
			 * tokend would be used again: a regular token would be re-sent
			 * with length zero and the scan restarted at the same position
			 * over and over, and a stale whitespace or comment id would
			 * silently skip the bad input. With tok cleared such a failure
			 * is left for the caller to report. */
			tok = 0;

			fexec( rst_data, rst_len );
			fgoto main;
		}
	}

	# Remember the end of every match; on failure fall back to the last one.
	main := tokens @{tokend=fpc;} $!onError;
%%


/*
 * Driver. Reads stdin in chunks into buf and feeds each new chunk to the
 * scanner, which calls token() for every token it completes. The part of the
 * input that belongs to a token still in progress is kept at the front of
 * buf between reads. After the input ends a single LAST_CHAR is pushed to
 * flush out the last token.
 *
 * Returns 0 once all input has been scanned. Exits with status 1 after
 * printing "READ ERROR", "PARSE ERROR" or "TOKEN TOO BIG" to stderr.
 */
int main()
{
	Scanner scanner;
	scanner.init();

	/* Do the first read. */
	int have = 0;	/* Bytes carried over at the front of buf. */
	bool sentLastChar = false;
	tokstart = buf;

	while ( true ) {
		int newd = fread( buf+have, 1, BUFSIZE-have, stdin );
		if ( newd == 0 ) {
			/* fread returns zero both at end of file and on failure. Do not
			 * mistake a failed read for the end of the input. */
			if ( ferror( stdin ) ) {
				fprintf(stderr, "READ ERROR\n" );
				exit(1);
			}

			if ( sentLastChar )
				break;
			else {
				/* Push the last character. Note that there is always at least
				 * one free spot. */
				sentLastChar = true;
				buf[have] = LAST_CHAR;
				newd = 1;
			}
		}

		/* Scan only the newly added bytes; the carried over ones have
		 * already been through the machine. */
		int len = have + newd;
		int rtn = scanner.execute( buf+have, newd );
		if ( rtn < 0 ) {
			/* Machine failed before finding a token. */
			fprintf(stderr, "PARSE ERROR\n" );
			exit(1);
		}
		else if ( discard ) {
			/* No failure yet, end of buf in whitespace or comment. Nothing
			 * of it will be sent, so drop the buffered data entirely. */
			have = 0;
			tokend -= (tokstart-buf);
			tokstart = buf;
		}
		else if ( tokstart == buf && len == BUFSIZE ) {
			/* No failure yet, buffer is full. The token in progress already
			 * starts at the front, so shifting cannot free any room. */
			fprintf(stderr, "TOKEN TOO BIG\n" );
			exit(1);
		}
		else {
			/* No failure yet, room still left in buffer. Shift over data and
			 * read more. tokend moves by the same distance as tokstart so
			 * that it keeps pointing at the same character. */
			have = len - (tokstart-buf);
			memmove( buf, tokstart, have );
			tokend -= (tokstart-buf);
			tokstart = buf;
		}
	}

	return 0;
}
