CS143-Lab/assignments/PA2/cool.flex
2023-03-23 04:54:42 +00:00

343 lines
12 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*
* The scanner definition for COOL.
*/
/*
* Stuff enclosed in %{ %} in the first section is copied verbatim to the
* output, so headers and global definitions are placed here to be visible
* to the code in the file. Don't remove anything that was here initially
*/
%{
/*
 * Project headers. The token codes, YYSTYPE and the stringtable / inttable /
 * idtable objects used by the rules are not defined in this file, so they are
 * expected to come from these headers.
 */
#include <cool-parse.h>
#include <stringtab.h>
#include <utilities.h>
/* The compiler assumes these identifiers: the generated yylval / yylex are
 * renamed to the cool_-prefixed names. */
#define yylval cool_yylval
#define yylex cool_yylex
/* Max size of string constants. This is the capacity of string_buf; the string
 * rules report "String constant too long" as soon as string_buf_ptr reaches
 * string_buf + MAX_STR_CONST, so at most MAX_STR_CONST - 1 characters are
 * accepted. */
#define MAX_STR_CONST 1025
#define YY_NO_UNPUT /* keep g++ happy */
extern FILE *fin; /* we read from this file */
/* define YY_INPUT so we read from the FILE fin:
 * This change makes it possible to use this scanner in
 * the Cool compiler.
 * NOTE(review): fread() returns a size_t, which is never negative, so whether
 * the "< 0" test can ever fire depends on the type of `result` supplied by the
 * generated scanner - confirm before relying on this error path.
 */
#undef YY_INPUT
#define YY_INPUT(buf,result,max_size) \
if ( (result = fread( (char*)buf, sizeof(char), max_size, fin)) < 0) \
YY_FATAL_ERROR( "read() in flex scanner failed");
char string_buf[MAX_STR_CONST]; /* to assemble string constants */
/* Next free slot in string_buf; reset to string_buf when a string opens. */
char *string_buf_ptr;
/* Current input line; incremented by every rule that consumes a newline. */
extern int curr_lineno;
/* Declared for the rest of the compiler; not referenced by the rules in this file. */
extern int verbose_flag;
/* Semantic value handed to the parser (error_msg, boolean, symbol are set below). */
extern YYSTYPE cool_yylval;
/*
 * Add Your own definitions here
 */
/* Number of nested comments currently open inside the outermost one: set to 0
 * when the outermost comment opens, +1 / -1 on each inner opener / closer. */
int comment_nest_level;
%}
/*
* Define names for regular expressions here.
*/
DARROW =>
LE <=
ASSIGN <-
/* Keywords Definition */
/* Except for the constants true and false, keywords are case insensitive, */
/* so every letter is spelled out as a two-character class. */
/*
 * In the flex-old package you cannot use the case-insensitive group
 * (?i:keyword); it is only available from flex 2.5.34 on, while the version
 * used here is 2.5.4.
 */
CLASS [Cc][Ll][Aa][Ss][Ss]
ELSE [Ee][Ll][Ss][Ee]
FI [Ff][Ii]
IF [Ii][Ff]
IN [Ii][Nn]
INHERITS [Ii][Nn][Hh][Ee][Rr][Ii][Tt][Ss]
ISVOID [Ii][Ss][Vv][Oo][Ii][Dd]
LET [Ll][Ee][Tt]
LOOP [Ll][Oo][Oo][Pp]
POOL [Pp][Oo][Oo][Ll]
THEN [Tt][Hh][Ee][Nn]
WHILE [Ww][Hh][Ii][Ll][Ee]
CASE [Cc][Aa][Ss][Ee]
ESAC [Ee][Ss][Aa][Cc]
NEW [Nn][Ee][Ww]
OF [Oo][Ff]
NOT [Nn][Oo][Tt]
/* The first letter of true/false must be lowercase; the remaining letters may be upper or lower case. */
TRUE t[Rr][Uu][Ee]
FALSE f[Aa][Ll][Ss][Ee]
/*
 * Exclusive start conditions used by the rules:
 *   NCOMMENT  - inside a (* ... *) comment
 *   SCOMMENT  - inside a -- comment, up to the end of the line
 *   STRING    - inside a string constant, collecting characters
 *   STRINGREC - skipping the rest of a string after an error was reported
 */
%x NCOMMENT SCOMMENT STRING STRINGREC
%%
"(*" {
    /*
     * Nested comments.
     * Unlike C comments, Cool block comments nest: `(* a (* b *) c *)` is a
     * single comment. A closer that appears after the comment has already
     * ended, as in `(* a *) *)`, is an error, and so is a comment that is
     * still open at end of file, as in `(*(* a *)`.
     * This opener was seen outside any comment, so no inner comment is open.
     */
    comment_nest_level = 0;
    BEGIN(NCOMMENT);
}
<NCOMMENT>"(*" {
    /* An inner comment opens: one more closer is needed before leaving. */
    ++comment_nest_level;
}
<NCOMMENT>"*)" {
    /* Close the innermost open comment; closing the outermost one returns
     * to normal scanning. */
    if (comment_nest_level <= 0) {
        BEGIN(INITIAL);
    } else {
        --comment_nest_level;
    }
}
<NCOMMENT>\n {
    /* Comment text is discarded but its lines still count. */
    ++curr_lineno;
}
<NCOMMENT>. {
    /* Discard every other character of the comment. */
}
<NCOMMENT><<EOF>> {
    /*
     * If a comment remains open when EOF is encountered, report this error
     * with the message EOF in comment. Do not tokenize the comment contents
     * simply because the terminator is missing.
     * An EOF rule cannot be combined with other patterns, so it stands alone.
     * The state is reset first; otherwise the next call would hit this rule
     * again and loop on EOF.
     */
    BEGIN(INITIAL);
    cool_yylval.error_msg = "EOF in comment";
    return ERROR;
}
"*)" {
    /*
     * A comment closer seen outside any comment is reported as the error
     * Unmatched *) rather than being tokenized as * followed by ).
     */
    cool_yylval.error_msg = "Unmatched *)";
    return ERROR;
}
"--" {
    /*
     * Single line comment.
     * Any characters between two dashes and the next newline (or EOF, if
     * there is no next newline) are treated as a comment.
     */
    BEGIN(SCOMMENT);
}
<SCOMMENT>\n {
    /* The newline ends the comment and still counts as a line. */
    BEGIN(INITIAL);
    ++curr_lineno;
}
<SCOMMENT>. {
    /* Discard the comment text. No EOF rule is given for this state, so a
     * comment that runs into end of file simply ends with the input. */
}
{DARROW} {
    /* The multiple-character operators: each one has its own token code. */
    return DARROW;
}
{LE}        return LE;
{ASSIGN}    return ASSIGN;
"+" {
    /*
     * The tokens for single character symbols (e.g. ; and ,) are
     * represented just by the integer (ASCII) value of the character itself.
     */
    return '+';
}
"/"     return '/';
"-"     return '-';
"*"     return '*';
"="     return '=';
"<"     return '<';
"."     return '.';
"~"     return '~';
","     return ',';
";"     return ';';
":"     return ':';
"("     return '(';
")"     return ')';
"@"     return '@';
"{"     return '{';
"}"     return '}';
{CLASS} {
    /*
     * Keywords are case-insensitive except for the values true and false,
     * which must begin with a lower-case letter. These rules are listed
     * before the identifier rules so that a keyword spelling is not returned
     * as an identifier of the same length.
     */
    return CLASS;
}
{ELSE}      return ELSE;
{FI}        return FI;
{IF}        return IF;
{IN}        return IN;
{INHERITS}  return INHERITS;
{ISVOID}    return ISVOID;
{LET}       return LET;
{LOOP}      return LOOP;
{POOL}      return POOL;
{THEN}      return THEN;
{WHILE}     return WHILE;
{CASE}      return CASE;
{ESAC}      return ESAC;
{NEW}       return NEW;
{OF}        return OF;
{NOT}       return NOT;
{TRUE} {
    /* Boolean constants carry their value in the semantic record. */
    cool_yylval.boolean = 1;
    return BOOL_CONST;
}
{FALSE} {
    cool_yylval.boolean = 0;
    return BOOL_CONST;
}
<STRINGREC>\\\n {
    /*
     * String constants (C syntax).
     * Escape sequence \c is accepted for all characters c. Except for
     * \n \t \b \f, the result is c.
     *
     * STRINGREC is entered after a string error (null character or too long)
     * has been reported. Lexing must resume after the end of the string,
     * which is defined as either:
     *   - the beginning of the next line, if an unescaped newline occurs
     *     after the error, or
     *   - the position after the closing quote otherwise.
     * An escaped newline does not end the string; only the line is counted.
     */
    curr_lineno += 1;
}
<STRINGREC>\\. {
    /*
     * Skip every other escaped character as a pair. Previously only an
     * escaped quote was treated as a pair, so in a sequence of two
     * backslashes followed by the closing quote the first backslash was
     * skipped alone and the quote was then taken as escaped, which made
     * recovery run past the real end of the string.
     */
}
<STRINGREC>\"|\n {
    /* An unescaped quote or newline ends the broken string. */
    if (yytext[0] == '\n') {
        curr_lineno += 1;
    }
    BEGIN(INITIAL);
}
<STRINGREC>. {
    /* Discard the remaining characters of the broken string. */
}
\" {
    /* Opening quote: start collecting at the beginning of the buffer. */
    string_buf_ptr = string_buf;
    BEGIN(STRING);
}
<STRING>[^\"\\\n\0] {
    /*
     * Ordinary string character. The quote, backslash, newline and null
     * character are excluded here because each has a rule of its own.
     */
    *string_buf_ptr++ = yytext[0];
    if (string_buf_ptr - string_buf >= MAX_STR_CONST) {
        /*
         * When a string is too long, report the error as
         * String constant too long in the error string of the ERROR token,
         * then skip the rest of the string in STRINGREC.
         */
        cool_yylval.error_msg = "String constant too long";
        BEGIN(STRINGREC);
        return ERROR;
    }
}
<STRING>\\(.|\n) {
    /*
     * Within a string, a sequence \c denotes the character c, except for
     * \b \t \n \f which denote backspace, tab, newline and form feed.
     */
    char escaped = yytext[1];
    char decoded;

    /* An escaped newline stays in the string but still counts as a line. */
    if (escaped == '\n') {
        ++curr_lineno;
    }
    /*
     * An escaped null character is rejected. This rule is not described in
     * section 4 of the PA2 handout, though it could be implied by the manual
     * statement that a string may not contain the null character.
     */
    if (escaped == '\0') {
        BEGIN(STRINGREC);
        cool_yylval.error_msg = "String contains escaped null character.";
        return ERROR;
    }
    switch (escaped) {
    case 'b':
        decoded = '\b';
        break;
    case 't':
        decoded = '\t';
        break;
    case 'n':
        decoded = '\n';
        break;
    case 'f':
        decoded = '\f';
        break;
    default:
        /*
         * Everything else stands for itself. This covers the escaped
         * newline and also the two characters backslash and digit zero,
         * which are valid and yield the digit 0 (0x30).
         */
        decoded = escaped;
        break;
    }
    *string_buf_ptr++ = decoded;
    if (string_buf_ptr - string_buf >= MAX_STR_CONST) {
        cool_yylval.error_msg = "String constant too long";
        BEGIN(STRINGREC);
        return ERROR;
    }
}
<STRING><<EOF>> {
    /*
     * A string may not contain EOF.
     * A string may not contain the null (character \0).
     * Any other character may be included in a string.
     *
     * If an EOF is encountered before the close-quote, report this error as
     * EOF in string constant. The state is reset before returning.
     */
    BEGIN(INITIAL);
    cool_yylval.error_msg = "EOF in string constant";
    return ERROR;
}
<STRING>\0 {
    /*
     * If the string contains invalid characters (i.e., the null character),
     * report this as String contains null character. The rest of the string
     * is then skipped in STRINGREC.
     */
    cool_yylval.error_msg = "String contains null character.";
    BEGIN(STRINGREC);
    return ERROR;
}
<STRING>\n {
    /*
     * If a string contains an unescaped newline, report that error as
     * Unterminated string constant and resume lexing at the beginning of the
     * next line. The escaped case has already been captured by the escape
     * rule, which matches two characters and therefore wins.
     */
    ++curr_lineno;
    cool_yylval.error_msg = "Unterminated string constant";
    BEGIN(INITIAL);
    return ERROR;
}
<STRING>\" {
    /*
     * Closing quote: intern the collected characters and return STR_CONST.
     *
     * string_buf is reused for every string and never cleared, so the bytes
     * after the current string are stale; after an earlier too-long error the
     * whole buffer may hold no terminator at all. Terminate the string before
     * handing it over. The write is in bounds: while in STRING the pointer is
     * always below string_buf + MAX_STR_CONST, because the rules that reach
     * that limit switch to STRINGREC.
     * NOTE(review): add_string is declared outside this file; the terminator
     * matters if it measures the buffer with strlen - confirm against the
     * string table implementation.
     */
    *string_buf_ptr = '\0';
    cool_yylval.symbol = stringtable.add_string(string_buf, string_buf_ptr - string_buf);
    BEGIN(INITIAL);
    return STR_CONST;
}
[0-9]+ {
    /*
     * Integer constants.
     * Integers are non-empty strings of digits 0-9; the matched text is
     * stored in the integer table and the entry becomes the token value.
     */
    cool_yylval.symbol = inttable.add_string(yytext);
    return INT_CONST;
}
[A-Z][A-Za-z0-9_]* {
    /*
     * Identifiers.
     * Identifiers are strings (other than keywords) consisting of letters,
     * digits, and the underscore character.
     * Type identifiers begin with a capital letter.
     */
    cool_yylval.symbol = idtable.add_string(yytext);
    return TYPEID;
}
[a-z][A-Za-z0-9_]* {
    /* Object identifiers begin with a lower case letter. */
    cool_yylval.symbol = idtable.add_string(yytext);
    return OBJECTID;
}
[\x20\f\r\t\v]+ {
    /*
     * Whitespace consists of any sequence of the characters: blank
     * (ascii 32), \n (newline, ascii 10), \f (form feed, ascii 12),
     * \r (carriage return, ascii 13), \t (tab, ascii 9) and
     * \v (vertical tab, ascii 11). Everything except the newline is
     * skipped here in one go.
     */
}
\n {
    /* The newline is matched on its own so that the line can be counted. */
    ++curr_lineno;
}
. {
    /*
     * Invalid character.
     * When an invalid character (one that cannot begin any token) is
     * encountered, a string containing just that character is returned as
     * the error string. Lexing resumes at the following character.
     * The message is a heap copy made with strdup; this rule never frees it.
     */
    cool_yylval.error_msg = strdup(yytext);
    return ERROR;
}
%%