/*
 *  The scanner definition for COOL.
 */

/*
 *  Stuff enclosed in %{ %} in the first section is copied verbatim to the
 *  output, so headers and global definitions are placed here to be visible
 *  to the code in the file.  Don't remove anything that was here initially.
 */
%{
#include <cool-parse.h>
#include <stringtab.h>
#include <utilities.h>

/* The compiler assumes these identifiers. */
#define yylval cool_yylval
#define yylex  cool_yylex

/* Max size of string constants */
#define MAX_STR_CONST   1025
#define YY_NO_UNPUT   /* keep g++ happy */

extern FILE *fin; /* we read from this file */

/*
 * Define YY_INPUT so we read from the FILE fin:
 * This change makes it possible to use this scanner in
 * the Cool compiler.
 *
 * fread() returns an unsigned count and therefore can never be negative, so
 * a failed read is recognised by "nothing was read and ferror(fin) is set".
 * A plain end of file leaves result at 0 without raising a fatal error.
 */
#undef YY_INPUT
#define YY_INPUT(buf,result,max_size) \
	if ( ((result) = fread( (char*)(buf), sizeof(char), (max_size), fin)) == 0 \
	     && ferror(fin) ) \
		YY_FATAL_ERROR( "read() in flex scanner failed");

char string_buf[MAX_STR_CONST]; /* to assemble string constants */
char *string_buf_ptr;           /* next free position inside string_buf */

extern int curr_lineno;
extern int verbose_flag;

extern YYSTYPE cool_yylval;

/*
 *  Add Your own definitions here
 */

/*
 * Nesting depth of the block comment currently being skipped.  The rules set
 * it to 0 when a comment is entered from normal code and raise it for every
 * further "(*" seen inside the comment.
 */
int comment_nest_level;

/*
 * Error state of the string constant currently being scanned:
 *   0 - no error so far
 *   1 - string too long
 *   2 - null character in the string
 */
int string_recover_error;
%}

/*
 * Define names for regular expressions here.
 */

DARROW          "=>"
LE              "<="
ASSIGN          "<-"

/*
 * Keyword patterns.
 * Every keyword is case-insensitive, with the exception of the constants
 * true and false (see TRUE and FALSE at the end of this list).
 * The (?i:keyword) notation only exists from flex 2.5.34 on, while this file
 * is written for flex 2.5.4 (flex-old); each letter is therefore spelled out
 * as a two-character class.
 */
CLASS           [cC][lL][aA][sS][sS]
ELSE            [eE][lL][sS][eE]
FI              [fF][iI]
IF              [iI][fF]
IN              [iI][nN]
INHERITS        [iI][nN][hH][eE][rR][iI][tT][sS]
ISVOID          [iI][sS][vV][oO][iI][dD]
LET             [lL][eE][tT]
LOOP            [lL][oO][oO][pP]
POOL            [pP][oO][oO][lL]
THEN            [tT][hH][eE][nN]
WHILE           [wW][hH][iI][lL][eE]
CASE            [cC][aA][sS][eE]
ESAC            [eE][sS][aA][cC]
NEW             [nN][eE][wW]
OF              [oO][fF]
NOT             [nN][oO][tT]
/*
 * true and false must start with a lower-case letter; the remaining letters
 * may be written in either case.
 */
TRUE            t[rR][uU][eE]
FALSE           f[aA][lL][sS][eE]

/* Exclusive start conditions: block comment, line comment, string constant. */
%x NCOMMENT SCOMMENT STRING

%%

 /*
  * Nested comments
  * Unlike C-style comments, COOL block comments may be nested.
  * For example, `(* haha (* nested comment*) fufu*)` is valid, but
  * `(* ho )* *)` or `(*(* hey*)` are invalid.
  * Note that a <<EOF>> rule may not be combined with other patterns.
  */
"(*" {
|
||
BEGIN(NCOMMENT);
|
||
comment_nest_level = 0; /* entering from outside, initialize nest level */
|
||
}
|
||
<NCOMMENT>"(*"      { ++comment_nest_level;  /* one more level of nesting */ }
<NCOMMENT>\n        { ++curr_lineno; }
<NCOMMENT><<EOF>>   {
                        /*
                         * The input ended while a comment was still open.  This is
                         * reported with the message "EOF in comment"; the contents
                         * of the comment are not tokenized.  The scanner goes back
                         * to INITIAL first, otherwise this rule would fire again on
                         * the next call and loop forever.
                         */
                        cool_yylval.error_msg = "EOF in comment";
                        BEGIN(INITIAL);
                        return ERROR;
                    }
<NCOMMENT>.         { /* any other character of the comment is discarded */ }
<NCOMMENT>"*)"      {
                        /* Closing the outermost level ends the comment. */
                        if (comment_nest_level <= 0) {
                            BEGIN(INITIAL);
                        } else {
                            --comment_nest_level;
                        }
                    }

 /*
  * A "*)" that shows up outside of any comment is reported as the error
  * "Unmatched *)" rather than being tokenized as * followed by ).
  */
"*)"                {
                        cool_yylval.error_msg = "Unmatched *)";
                        return ERROR;
                    }

 /*
  * Single-line comments
  * All characters between two dashes "--" and the next newline (or EOF, if
  * there is no further newline) are treated as a comment.
  */

"--" { BEGIN(SCOMMENT); }
|
||
<SCOMMENT>[^\n]+    { /* comment text up to the end of the line is discarded */ }
<SCOMMENT>\n        {
                        /*
                         * The newline terminates the comment.  No <<EOF>> rule is
                         * given for this state: a line comment that runs into the
                         * end of the input simply ends there.
                         */
                        BEGIN(INITIAL);
                        ++curr_lineno;
                    }

 /*
  * The multiple-character operators.
  */
{DARROW}    { return DARROW; }
{LE}        { return LE; }
{ASSIGN}    { return ASSIGN; }

 /*
  * A single-character symbol (for example ; or ,) has no token name of its
  * own: its token code is the integer (ASCII) value of the character, which
  * is exactly the one character held in yytext.
  */
[+/\-*=<.~,;:()@\{\}]   { return yytext[0]; }

 /*
  * Keywords are case-insensitive, except for the values true and false,
  * which must begin with a lower-case letter.
  */
{CASE}          { return CASE; }
{CLASS}         { return CLASS; }
{ELSE}          { return ELSE; }
{ESAC}          { return ESAC; }
{FI}            { return FI; }
{IF}            { return IF; }
{IN}            { return IN; }
{INHERITS}      { return INHERITS; }
{ISVOID}        { return ISVOID; }
{LET}           { return LET; }
{LOOP}          { return LOOP; }
{NEW}           { return NEW; }
{NOT}           { return NOT; }
{OF}            { return OF; }
{POOL}          { return POOL; }
{THEN}          { return THEN; }
{WHILE}         { return WHILE; }

 /* Boolean constants carry their value (1 for true, 0 for false) in the token. */
{TRUE}          {
                    cool_yylval.boolean = 1;
                    return BOOL_CONST;
                }
{FALSE}         {
                    cool_yylval.boolean = 0;
                    return BOOL_CONST;
                }

 /*
  * String constants (C syntax)
  * The escape sequence \c is accepted for every character c.  Except for
  * \n \t \b \f, the result is the character c itself.
  */
\" {
|
||
BEGIN(STRING);
|
||
string_buf_ptr = string_buf; /* reset string buf ptr*/
|
||
string_recover_error = 0; /* reset error flag*/
|
||
}
|
||
<STRING>\0          {
                        /*
                         * A string may not contain the null character.  This rule
                         * is listed before the generic character rule below: both
                         * match exactly one character, flex then picks the rule
                         * that comes first, and the generic class would otherwise
                         * accept the NUL as ordinary text.  Only the first error of
                         * a string is remembered.
                         */
                        if (!string_recover_error) {
                            string_recover_error = 2;   /* null character */
                        }
                    }
<STRING>\\\0        {
                        /* A backslash does not make a NUL legal; flag it like a bare one. */
                        if (!string_recover_error) {
                            string_recover_error = 2;   /* null character */
                        }
                    }
<STRING>[^\"\\\n]   {
                        /*
                         * Ordinary character.  At most MAX_STR_CONST - 1 characters
                         * are stored so that one byte of string_buf always remains
                         * free for a terminating NUL.
                         */
                        if (!string_recover_error) {
                            if (string_buf_ptr - string_buf >= MAX_STR_CONST - 1) {
                                string_recover_error = 1;   /* string too long */
                            } else {
                                *string_buf_ptr++ = yytext[0];
                            }
                        }
                    }
<STRING>\\(.|\n)    {
                        /*
                         * Within a string, the sequence \c denotes the character c,
                         * except for \b \t \n \f which denote the usual control
                         * characters.
                         */
                        char escaped = yytext[1];

                        /*
                         * An escaped newline is still a line break of the input and
                         * is counted even while an earlier error is pending, so
                         * that later line numbers stay correct.
                         */
                        if (escaped == '\n') {
                            ++curr_lineno;
                        }

                        if (!string_recover_error) {
                            if (string_buf_ptr - string_buf >= MAX_STR_CONST - 1) {
                                string_recover_error = 1;   /* string too long */
                            } else {
                                switch (escaped) {
                                case 'b': escaped = '\b'; break;
                                case 't': escaped = '\t'; break;
                                case 'n': escaped = '\n'; break;
                                case 'f': escaped = '\f'; break;
                                default:  break;    /* \c is c, including an escaped newline */
                                }
                                *string_buf_ptr++ = escaped;
                            }
                        }
                    }

 /*
  * A string may not contain EOF.
  * A string may not contain the null character.
  * Any other character may be included in a string.
  */
<STRING><<EOF>>     {
                        /*
                         * The input ended before the closing quote: report the
                         * error "EOF in string constant".  The scanner returns to
                         * INITIAL first so that this rule does not fire again.
                         */
                        cool_yylval.error_msg = "EOF in string constant";
                        BEGIN(INITIAL);
                        return ERROR;
                    }

 /*
  * After a null character or an over-long string, lexing resumes after the
  * end of the string, which is either
  *   - the beginning of the next line, if an unescaped newline occurs after
  *     the error was encountered, or
  *   - the position after the closing quote otherwise.
  */
<STRING>\"|\n       {
                        int ended_by_newline = (yytext[0] == '\n');

                        BEGIN(INITIAL);
                        if (ended_by_newline) {
                            ++curr_lineno;
                        }

                        if (string_recover_error == 1) {
                            /* the string did not fit into the buffer */
                            cool_yylval.error_msg = "String constant too long";
                            return ERROR;
                        }
                        if (string_recover_error == 2) {
                            /* the string contained an invalid (null) character */
                            cool_yylval.error_msg = "String contains null character";
                            return ERROR;
                        }
                        if (ended_by_newline) {
                            /*
                             * An unescaped newline ends the string prematurely
                             * (escaped newlines are consumed by the escape rule);
                             * lexing resumes at the beginning of the next line.
                             */
                            cool_yylval.error_msg = "Unterminated string constant";
                            return ERROR;
                        }

                        /*
                         * Terminate the collected text so that string_buf is a
                         * valid C string no matter what an earlier, longer
                         * constant left behind in the buffer.
                         */
                        *string_buf_ptr = '\0';
                        cool_yylval.symbol = stringtable.add_string(string_buf, string_buf_ptr - string_buf);
                        return STR_CONST;
                    }

 /*
  * Integer constants
  * Integers are non-empty strings of the digits 0-9.
  */
[0-9]+      {
                /*
                 * The digits are entered into inttable as text; the resulting
                 * table entry is the semantic value of the token.
                 */
                cool_yylval.symbol = inttable.add_string(yytext);
                return INT_CONST;
            }

 /*
  * Identifiers
  * Identifiers are strings (other than keywords) consisting of letters,
  * digits, and the underscore character.
  *   - type identifiers begin with a capital letter
  *   - object identifiers begin with a lower-case letter
  */
[A-Z][_0-9a-zA-Z]*  {
                        /* leading capital letter: a type identifier */
                        cool_yylval.symbol = idtable.add_string(yytext);
                        return TYPEID;
                    }
[a-z][_0-9a-zA-Z]*  {
                        /* leading lower-case letter: an object identifier */
                        cool_yylval.symbol = idtable.add_string(yytext);
                        return OBJECTID;
                    }

 /*
  * Whitespace consists of any sequence of the characters: blank (ascii 32),
  * \n (newline, ascii 10), \f (form feed, ascii 12), \r (carriage return,
  * ascii 13), \t (tab, ascii 9), \v (vertical tab, ascii 11).
  */
[ \f\r\t\v]+    { /* blanks other than newlines are skipped */ }
\n              { ++curr_lineno;  /* a newline is skipped too, but counted */ }

 /*
  * Invalid characters
  * When an invalid character (one that cannot begin any token) is
  * encountered, a string containing just that character is returned as the
  * error string.  Lexing resumes at the following character.
  */
.       {
            /*
             * Catch-all for a character that no earlier rule accepted: the
             * error message is a separate copy of the one-character lexeme.
             */
            char *offending = strdup(yytext);

            cool_yylval.error_msg = offending;
            return ERROR;
        }
%%