CS143-Lab/assignments/PA2/cool.flex

/*
 *  The scanner definition for COOL.
 */

/*
 *  Stuff enclosed in %{ %} in the first section is copied verbatim to the
 *  output, so headers and global definitions are placed here to be visible
 * to the code in the file.  Don't remove anything that was here initially
 */
%{
#include <cool-parse.h>
#include <stringtab.h>
#include <utilities.h>

/* The compiler assumes these identifiers. */
#define yylval cool_yylval
#define yylex  cool_yylex

/* Max size of string constants */
#define MAX_STR_CONST 1025
#define YY_NO_UNPUT   /* keep g++ happy */

extern FILE *fin; /* we read from this file */

/* define YY_INPUT so we read from the FILE fin:
 * This change makes it possible to use this scanner in
 * the Cool compiler.
 *
 * fread() returns an unsigned item count and can never be negative; a
 * failed read shows up as a short (possibly zero) count with the stream's
 * error indicator set. A zero count is therefore only fatal when ferror()
 * confirms a read error; otherwise it is a normal end of input.
 */
#undef YY_INPUT
#define YY_INPUT(buf,result,max_size) \
	if ( (result = fread( (char*)buf, sizeof(char), max_size, fin)) == 0 && ferror(fin) ) \
		YY_FATAL_ERROR( "read() in flex scanner failed");

char string_buf[MAX_STR_CONST]; /* to assemble string constants */
char *string_buf_ptr;           /* next free position in string_buf */

extern int curr_lineno;
extern int verbose_flag;

extern YYSTYPE cool_yylval;

/*
 *  Add Your own definitions here
 */

/* Number of nested comment openers seen inside the outermost block comment. */
int comment_nest_level;
%}

/*
 * Define names for regular expressions here.
 */

/* Multi-character operators, quoted so that every character is literal. */
DARROW          "=>"
LE              "<="
ASSIGN          "<-"

/* Keyword definitions. */
/* Except for the constants true and false, keywords are case insensitive, */
/* so each letter is written as a class holding both of its cases. */
/*
 * The case-insensitive group syntax (?i:keyword) cannot be used with the
 * flex-old package: it is only available after flex 2.5.34, while our
 * version is 2.5.4.
 */
CLASS         [Cc][Ll][Aa][Ss][Ss]
ELSE          [Ee][Ll][Ss][Ee]
FI            [Ff][Ii]
IF            [Ii][Ff]
IN            [Ii][Nn]
INHERITS      [Ii][Nn][Hh][Ee][Rr][Ii][Tt][Ss]
ISVOID        [Ii][Ss][Vv][Oo][Ii][Dd]
LET           [Ll][Ee][Tt]
LOOP          [Ll][Oo][Oo][Pp]
POOL          [Pp][Oo][Oo][Ll]
THEN          [Tt][Hh][Ee][Nn]
WHILE         [Ww][Hh][Ii][Ll][Ee]
CASE          [Cc][Aa][Ss][Ee]
ESAC          [Ee][Ss][Aa][Cc]
NEW           [Nn][Ee][Ww]
OF            [Oo][Ff]
NOT           [Nn][Oo][Tt]
/* The first letter of true/false must be lower case; the remaining letters may be upper or lower case. */
TRUE          t[Rr][Uu][Ee]
FALSE         f[Aa][Ll][Ss][Ee]


/*
 * Exclusive start conditions used by the rules below:
 *   NCOMMENT  - inside a (possibly nested) block comment
 *   SCOMMENT  - inside a single-line comment started by two dashes
 *   STRING    - inside a string constant
 *   STRINGREC - skipping the rest of a string constant after an error
 */
%x NCOMMENT SCOMMENT STRING STRINGREC

%%

 /*
  * Block comments.
  * Unlike C comments, COOL block comments nest: `(* a (* b *) c *)` is one
  * complete comment, so every opener needs its own closer.
  * comment_nest_level counts the openers seen inside the outermost comment.
  * An <<EOF>> rule may not be combined with other patterns.
  */
"(*"              {
                    /* Outermost opener: no nested opener is pending yet. */
                    comment_nest_level = 0;
                    BEGIN(NCOMMENT);
                  }
<NCOMMENT>"(*"    { ++comment_nest_level; }
<NCOMMENT>"*)"    {
                    /* Leave the comment at the outermost level, otherwise close one nested level. */
                    if (comment_nest_level <= 0) {
                      BEGIN(INITIAL);
                    } else {
                      --comment_nest_level;
                    }
                  }
<NCOMMENT>\n      { ++curr_lineno; }
<NCOMMENT>.       { /* comment text is discarded */ }
<NCOMMENT><<EOF>> {
                    /*
                     * An open comment at EOF is reported as an error and its
                     * contents are not tokenized. The state is reset first,
                     * otherwise this rule would fire again on the next call.
                     */
                    BEGIN(INITIAL);
                    yylval.error_msg = "EOF in comment";
                    return ERROR;
                  }
 /*
  * A closer seen outside of any comment is reported as an error instead of
  * being tokenized as `*` followed by `)`.
  */
"*)"              {
                    yylval.error_msg = "Unmatched *)";
                    return ERROR;
                  }

 /*
  * Single-line comments.
  * Everything from two dashes up to the next newline (or up to EOF when no
  * newline follows) is a comment.
  */

"--"            { BEGIN(SCOMMENT); }
<SCOMMENT>\n    {
                  /* The newline ends the comment. EOF has no rule of its own here and gets the default EOF handling. */
                  BEGIN(INITIAL);
                  ++curr_lineno;
                }
<SCOMMENT>.     { /* comment text is discarded */ }

 /*
  * The multiple-character operators.
  */
{DARROW}    { return DARROW; }
{LE}        { return LE; }
{ASSIGN}    { return ASSIGN; }

 /*
  * Single-character symbols are represented just by the integer (ASCII)
  * value of the character itself, so one rule returns the matched character.
  * Longer operators and comment delimiters still win by longest match.
  */
[-+/=*<.~,;:()@\{\}]   { return yytext[0]; }

 /*
  * Keywords. They are listed before the identifier rules so that a keyword
  * wins when both match the same text. All keywords are case-insensitive
  * except the boolean constants, which must begin with a lower-case letter.
  */
{CLASS}     { return CLASS; }
{ELSE}      { return ELSE; }
{FI}        { return FI; }
{IF}        { return IF; }
{IN}        { return IN; }
{INHERITS}  { return INHERITS; }
{ISVOID}    { return ISVOID; }
{LET}       { return LET; }
{LOOP}      { return LOOP; }
{POOL}      { return POOL; }
{THEN}      { return THEN; }
{WHILE}     { return WHILE; }
{CASE}      { return CASE; }
{ESAC}      { return ESAC; }
{NEW}       { return NEW; }
{OF}        { return OF; }
{NOT}       { return NOT; }
{TRUE}      {
              yylval.boolean = 1;
              return BOOL_CONST;
            }
{FALSE}     {
              yylval.boolean = 0;
              return BOOL_CONST;
            }

 /*
  *  String constants (C syntax)
  *  Escape sequence \c is accepted for all characters c. Except for
  *  \n \t \b \f, the result is c.
  *
  *  STRINGREC is the recovery state entered after a string error (null
  *  character or string too long). Lexing resumes after the end of the
  *  string, which is defined as either:
  *    - the beginning of the next line if an unescaped newline occurs, or
  *    - the position after the closing quote otherwise.
  */
<STRINGREC>\\(.|\n) {
                    /*
                     * Skip a backslash together with the character it escapes.
                     * Consuming every escape as a unit keeps an escaped backslash
                     * from hiding the closing quote that follows it, and keeps an
                     * escaped quote or escaped newline from ending the string.
                     */
                    if (yytext[1] == '\n') curr_lineno += 1;
                  }
<STRINGREC>\"|\n  {
                    /* Unescaped closing quote or newline: the broken string ends here. */
                    BEGIN(INITIAL);
                    if (yytext[0] == '\n') curr_lineno += 1;
                  }
<STRINGREC>.      { /* the rest of the broken string is discarded */ }

\"                {
                    /* Opening quote: start collecting at the beginning of the buffer. */
                    BEGIN(STRING);
                    string_buf_ptr = string_buf;
                  }
<STRING>[^\"\\\n\0] {
                    /* Ordinary character. \0 is excluded so that the null character rule can report it. */
                    *string_buf_ptr = yytext[0];
                    string_buf_ptr ++;
                    if (string_buf_ptr >= string_buf + MAX_STR_CONST) {
                      /*
                       * MAX_STR_CONST characters were collected: report the
                       * error and skip the remainder of the string.
                       */
                      BEGIN(STRINGREC);
                      cool_yylval.error_msg = "String constant too long";
                      return (ERROR);
                    }
                  }
<STRING>\\(.|\n)  {
                    /* An escaped newline continues the string on the next line. */
                    if (yytext[1] == '\n') {
                      curr_lineno += 1;
                    }
                    /*
                     * Within a string, the sequence \c denotes the character c,
                     * except for \b \t \n \f which denote control characters.
                     */
                    switch(yytext[1]) {
                      case 'b': *string_buf_ptr = '\b'; break;
                      case 't': *string_buf_ptr = '\t'; break;
                      case 'n': *string_buf_ptr = '\n'; break;
                      case 'f': *string_buf_ptr = '\f'; break;
                      /*
                       * A backslash followed by an actual null byte. This case is
                       * not described in section 4 of PA2, though it could be
                       * implied by the manual: a string may not contain the null
                       * (character \0).
                       */
                      case '\0': BEGIN(STRINGREC); cool_yylval.error_msg = "String contains escaped null character."; return (ERROR);
                      /*
                       * Everything else stands for itself, including an escaped
                       * newline and the two-character sequence backslash, digit
                       * zero, which yields the digit zero (0x30).
                       */
                      default: *string_buf_ptr = yytext[1]; break;
                    }
                    string_buf_ptr ++;
                    if (string_buf_ptr >= string_buf + MAX_STR_CONST) {
                      BEGIN(STRINGREC);
                      cool_yylval.error_msg = "String constant too long";
                      return (ERROR);
                    }
                  }
<STRING>\\        {
                    /*
                     * A backslash with nothing after it can only occur directly
                     * before EOF, because the longer escape rule wins whenever
                     * another character follows. Without this rule no STRING
                     * pattern would match it and the flex default rule would
                     * echo it to the output. It is discarded here and the
                     * <<EOF>> rule then reports the error.
                     */
                  }
  /*
    * A string may not contain EOF.
    * A string may not contain the null (character \0).
    * Any other character may be included in a string.
    */
<STRING><<EOF>>   {
                    /* EOF before the closing quote. The state is reset before returning. */
                    cool_yylval.error_msg = "EOF in string constant";
                    BEGIN(INITIAL);
                    return (ERROR);
                  }
<STRING>\0        {
                    /* Invalid (null) character: report it and skip the remainder of the string. */
                    BEGIN(STRINGREC);
                    cool_yylval.error_msg = "String contains null character.";
                    return (ERROR);
                  }
<STRING>\n        {
                    /*
                     * Unescaped newline (the escaped case is captured by the
                     * escape rule): report the error and resume lexing at the
                     * beginning of the next line.
                     */
                    BEGIN(INITIAL);
                    curr_lineno += 1;
                    cool_yylval.error_msg = "Unterminated string constant";
                    return (ERROR);
                  }
<STRING>\"        {
                    /* Closing quote: intern the collected characters, passing their count explicitly. */
                    BEGIN(INITIAL);
                    cool_yylval.symbol = stringtable.add_string(string_buf, string_buf_ptr - string_buf);
                    return (STR_CONST);
                  }

 /*
  * Integer constants: non-empty strings of the digits 0-9.
  * The matched text is stored in the integer table.
  */
[0-9]+              {
                      yylval.symbol = inttable.add_string(yytext);
                      return INT_CONST;
                    }

 /*
  * Identifiers: strings (other than keywords) made of letters, digits and
  * the underscore character. The matched text is stored in the identifier
  * table.
  *   - type identifiers begin with a capital letter
  *   - object identifiers begin with a lower-case letter
  */
[A-Z][A-Za-z0-9_]*  {
                      yylval.symbol = idtable.add_string(yytext);
                      return TYPEID;
                    }
[a-z][A-Za-z0-9_]*  {
                      yylval.symbol = idtable.add_string(yytext);
                      return OBJECTID;
                    }

 /*
  * Whitespace is any sequence of: blank (ascii 32), \n (newline, ascii 10),
  * \f (form feed, ascii 12), \r (carriage return, ascii 13), \t (tab, ascii 9)
  * and \v (vertical tab, ascii 11). Newlines get their own rule so that the
  * line counter stays correct.
  */
[ \f\r\t\v]+    { /* whitespace is skipped */ }
\n              { ++curr_lineno; }

 /*
  * Invalid character.
  * Any character that cannot begin a token is returned as an ERROR whose
  * message is a string holding just that character. Lexing resumes at the
  * following character.
  */
.               {
                  yylval.error_msg = strdup(yytext);
                  return ERROR;
                }
%%