/*
 *  The scanner definition for COOL.
 */

/*
 *  Stuff enclosed in %{ %} in the first section is copied verbatim to the
 *  output, so headers and global definitions are placed here to be visible
 *  to the code in the file.  Don't remove anything that was here initially
 */
%{
/*
 * NOTE(review): the header names were not legible in the copy under review.
 * These are the three headers of the stock COOL scanner skeleton (token
 * codes / YYSTYPE, the string tables, utilities) -- confirm against the
 * project tree.
 */
#include <cool-parse.h>
#include <stringtab.h>
#include <utilities.h>

/* The compiler assumes these identifiers. */
#define yylval cool_yylval
#define yylex  cool_yylex

/* Max size of string constants */
#define MAX_STR_CONST 1025
#define YY_NO_UNPUT   /* keep g++ happy */

extern FILE *fin; /* we read from this file */

/* define YY_INPUT so we read from the FILE fin:
 * This change makes it possible to use this scanner in
 * the Cool compiler.
 *
 * NOTE(review): fread() returns an unsigned count, so the "< 0" test below
 * cannot detect a read failure.  Kept as in the provided skeleton.
 */
#undef YY_INPUT
#define YY_INPUT(buf,result,max_size) \
	if ( (result = fread( (char*)buf, sizeof(char), max_size, fin)) < 0) \
		YY_FATAL_ERROR( "read() in flex scanner failed");

char string_buf[MAX_STR_CONST]; /* to assemble string constants */
char *string_buf_ptr;

extern int curr_lineno;
extern int verbose_flag;

extern YYSTYPE cool_yylval;

/*
 *  Add Your own definitions here
 */

/* Depth of "(* ... *)" nesting below the outermost open comment. */
int comment_nest_level;

/*
 * Error state of the string constant currently being scanned.  Once it is
 * non-zero the rest of the string is consumed without being stored, and the
 * error is reported when the string ends.
 */
enum {
    STR_OK       = 0,  /* no error so far */
    STR_TOO_LONG = 1,  /* string_buf has been filled completely */
    STR_HAS_NULL = 2   /* a NUL byte appeared inside the string */
};
int string_recover_error;

/*
 * Append one character to string_buf unless an error is already pending.
 * The string is flagged as too long as soon as MAX_STR_CONST characters
 * have been stored, so no write ever lands outside the buffer.
 */
static void string_append(char c)
{
    if (string_recover_error != STR_OK) {
        return;
    }
    *string_buf_ptr = c;
    string_buf_ptr++;
    if (string_buf_ptr >= string_buf + MAX_STR_CONST) {
        string_recover_error = STR_TOO_LONG;
    }
}

%}

/*
 * Define names for regular expressions here.
 */

DARROW          =>
LE              <=
ASSIGN          <-

/*
 * Keywords.
 * Except for the constants true and false, keywords are case insensitive.
 * The case-insensitive group syntax (?i:keyword) only exists in flex 2.5.34
 * and later; this file targets flex 2.5.4, so every letter is spelled out
 * as a two-character class.
 */
CLASS           [Cc][Ll][Aa][Ss][Ss]
ELSE            [Ee][Ll][Ss][Ee]
FI              [Ff][Ii]
IF              [Ii][Ff]
IN              [Ii][Nn]
INHERITS        [Ii][Nn][Hh][Ee][Rr][Ii][Tt][Ss]
ISVOID          [Ii][Ss][Vv][Oo][Ii][Dd]
LET             [Ll][Ee][Tt]
LOOP            [Ll][Oo][Oo][Pp]
POOL            [Pp][Oo][Oo][Ll]
THEN            [Tt][Hh][Ee][Nn]
WHILE           [Ww][Hh][Ii][Ll][Ee]
CASE            [Cc][Aa][Ss][Ee]
ESAC            [Ee][Ss][Aa][Cc]
NEW             [Nn][Ee][Ww]
OF              [Oo][Ff]
NOT             [Nn][Oo][Tt]

/*
 * The first letter of true/false must be lower case; the remaining letters
 * may be upper or lower case.
 */
TRUE            t[Rr][Uu][Ee]
FALSE           f[Aa][Ll][Ss][Ee]

%x NCOMMENT SCOMMENT STRING

%%

 /*
  *  Nested comments.
  *  Unlike C comments, "(* ... *)" comments nest: the text
  *  (* outer (* inner *) outer *) is one comment.  comment_nest_level counts
  *  how many additional "(*" are open inside the outermost one.
  *  An <<EOF>> rule may not be combined with other patterns, so it gets a
  *  rule of its own.
  */

"(*"			{
    /* Entering a comment from ordinary code: start at depth zero. */
    comment_nest_level = 0;
    BEGIN(NCOMMENT);
}

<NCOMMENT>"(*"		{
    /* One more level of nesting. */
    comment_nest_level += 1;
}

<NCOMMENT>\n		{ curr_lineno += 1; }

<NCOMMENT><<EOF>>	{
    /*
     * Leave the comment state before returning; otherwise the next call
     * would hit this rule again and report the same error forever.
     * The contents of the unterminated comment are not tokenized.
     */
    BEGIN(INITIAL);
    cool_yylval.error_msg = "EOF in comment";
    return (ERROR);
}

<NCOMMENT>"*)"		{
    /* Close one nesting level; the outermost close ends the comment. */
    if (comment_nest_level > 0) {
        comment_nest_level -= 1;
    } else {
        BEGIN(INITIAL);
    }
}

<NCOMMENT>.		{ /* comment text is discarded */ }

 /*
  *  A "*)" outside any comment is reported as an error instead of being
  *  tokenized as '*' followed by ')'.
  */
"*)"			{
    cool_yylval.error_msg = "Unmatched *)";
    return (ERROR);
}

 /*
  *  Single line comments.
  *  Everything from "--" up to the next newline (or EOF when there is no
  *  further newline) is a comment.  EOF needs no rule here: the default
  *  EOF action simply ends the scan.
  */
"--"			{ BEGIN(SCOMMENT); }

<SCOMMENT>.		{ /* comment text is discarded */ }

<SCOMMENT>\n		{
    curr_lineno += 1;
    BEGIN(INITIAL);
}

 /*
  *  The multiple-character operators.
  */
{DARROW}		{ return (DARROW); }
{LE}			{ return (LE); }
{ASSIGN}		{ return (ASSIGN); }

 /*
  *  Single-character symbols are represented by the integer (ASCII) value
  *  of the character itself.
  */
"+"			{ return '+'; }
"/"			{ return '/'; }
"-"			{ return '-'; }
"*"			{ return '*'; }
"="			{ return '='; }
"<"			{ return '<'; }
"."			{ return '.'; }
"~"			{ return '~'; }
","			{ return ','; }
";"			{ return ';'; }
":"			{ return ':'; }
"("			{ return '('; }
")"			{ return ')'; }
"@"			{ return '@'; }
"{"			{ return '{'; }
"}"			{ return '}'; }

 /*
  *  Keywords are case-insensitive except for the values true and false,
  *  which must begin with a lower-case letter.  These rules come before the
  *  identifier rules so that they win when both match the same text.
  */
{CLASS}			{ return (CLASS); }
{ELSE}			{ return (ELSE); }
{FI}			{ return (FI); }
{IF}			{ return (IF); }
{IN}			{ return (IN); }
{INHERITS}		{ return (INHERITS); }
{ISVOID}		{ return (ISVOID); }
{LET}			{ return (LET); }
{LOOP}			{ return (LOOP); }
{POOL}			{ return (POOL); }
{THEN}			{ return (THEN); }
{WHILE}			{ return (WHILE); }
{CASE}			{ return (CASE); }
{ESAC}			{ return (ESAC); }
{NEW}			{ return (NEW); }
{OF}			{ return (OF); }
{NOT}			{ return (NOT); }

{TRUE}			{
    cool_yylval.boolean = 1;
    return (BOOL_CONST);
}

{FALSE}			{
    cool_yylval.boolean = 0;
    return (BOOL_CONST);
}

 /*
  *  String constants (C syntax).
  *  The escape sequence \c is accepted for every character c.  Except for
  *  \n \t \b \f the result is c itself.
  *  A string may not contain EOF or the null character; any other character
  *  is allowed.
  */

\"			{
    /* Opening quote: reset the buffer and the pending-error state. */
    string_buf_ptr = string_buf;
    string_recover_error = STR_OK;
    BEGIN(STRING);
}

 /*
  *  The NUL rule must come before the general character rule below: that
  *  character class also matches NUL, and for matches of equal length flex
  *  picks the rule listed first.
  */
<STRING>\0		{ string_recover_error = STR_HAS_NULL; }

<STRING>[^\"\\\n]	{ string_append(yytext[0]); }

<STRING>\\(.|\n)	{
    char escaped = yytext[1];

    /* An escaped newline always ends a source line, even while the rest
     * of a bad string is only being skipped. */
    if (escaped == '\n') {
        curr_lineno += 1;
    }

    switch (escaped) {
    case 'b':
        string_append('\b');
        break;
    case 't':
        string_append('\t');
        break;
    case 'n':
        string_append('\n');
        break;
    case 'f':
        string_append('\f');
        break;
    case '\0':
        /* A NUL is not allowed in a string, escaped or not. */
        string_recover_error = STR_HAS_NULL;
        break;
    default:
        /* Any other \c, including backslash-newline, stands for c. */
        string_append(escaped);
        break;
    }
}

<STRING><<EOF>>		{
    /* EOF before the closing quote.  Leave the string state first so the
     * next call does not report the same error again. */
    BEGIN(INITIAL);
    cool_yylval.error_msg = "EOF in string constant";
    return (ERROR);
}

 /*
  *  End of a string: either the closing quote or an unescaped newline.
  *  After a "too long" or "null character" error, lexing resumes after the
  *  closing quote, or at the beginning of the next line when an unescaped
  *  newline comes first.
  */
<STRING>\"|\n		{
    int ended_by_newline = (yytext[0] == '\n');

    BEGIN(INITIAL);
    if (ended_by_newline) {
        curr_lineno += 1;
    }

    switch (string_recover_error) {
    case STR_TOO_LONG:
        cool_yylval.error_msg = "String constant too long";
        return (ERROR);
    case STR_HAS_NULL:
        cool_yylval.error_msg = "String contains null character";
        return (ERROR);
    default:
        break;
    }

    if (ended_by_newline) {
        /* Escaped newlines are consumed by the escape rule, so this one
         * is unescaped and the string is unterminated. */
        cool_yylval.error_msg = "Unterminated string constant";
        return (ERROR);
    }

    cool_yylval.symbol = stringtable.add_string(string_buf, string_buf_ptr - string_buf);
    return (STR_CONST);
}

 /*
  *  Integer constants: non-empty strings of the digits 0-9.
  */
[0-9]+			{
    cool_yylval.symbol = inttable.add_string(yytext);
    return (INT_CONST);
}

 /*
  *  Identifiers: strings (other than keywords) of letters, digits and the
  *  underscore character.  Type identifiers begin with a capital letter,
  *  object identifiers with a lower-case letter.
  */
[A-Z][A-Za-z0-9_]*	{
    cool_yylval.symbol = idtable.add_string(yytext);
    return (TYPEID);
}

[a-z][A-Za-z0-9_]*	{
    cool_yylval.symbol = idtable.add_string(yytext);
    return (OBJECTID);
}

 /*
  *  Whitespace: blank (ascii 32), \n (ascii 10), \f (ascii 12),
  *  \r (ascii 13), \t (ascii 9), \v (ascii 11).  Newlines are handled
  *  separately so that the line counter stays correct.
  */
[\x20\f\r\t\v]+		{ /* skip whitespace */ }

\n			{ curr_lineno += 1; }

 /*
  *  Invalid character (one that cannot begin any token): return a string
  *  containing just that character as the error message and resume lexing
  *  at the following character.
  */
.			{
    cool_yylval.error_msg = strdup(yytext);
    return (ERROR);
}

%%