Java 1.5 Parser - Scanner and Lexer - Part 4

<- back

Next Chapter Grammar Rules - part 5


Literals: Boolean, Char, String and Operators

Boolean Literal

Chapter 3.10.3 Boolean Literals defines boolean literals:

Java Syntax Rule
BooleanLiteral: one of
true false

These rules are converted to the following JFlex grammar:

JFlex grammar Rule
%%

%%
/* Boolean literals: each keyword is reported with its own token kind. */
<YYINITIAL> {
  "true"  { return new Token(Parser._True, yycolumn + 1, yyline + 1, yychar,
                             yytext()); }
  "false" { return new Token(Parser._False, yycolumn + 1, yyline + 1, yychar,
                             yytext()); }
}

The Coco/r parser must also be updated

Coco/R EBNF Rule
TOKENS
True
False

Unittests

Verify that true and false are tokenized correctly.

    /**
     * Scans the input {@code true} and checks that the first token returned
     * by the scanner has the kind {@code Parser._True}.
     */
    @Test
    public void testScan_token_true() throws UnsupportedEncodingException {
        System.out.println("testScan_token_true");
        // Arrange: the scanner reads the UTF-8 bytes of the keyword
        final String source = "true";
        final byte[] utf8 = source.getBytes("UTF-8");
        final InputStream input = new ByteArrayInputStream(utf8);
        final Scanner scanner = new Scanner(input);
        final Token wanted = new Token( Parser._True, 0, 0, 0 );
        // Act
        final Token actual = scanner.Scan();
        // Assert: only the token kind is compared
        assertNotNull( actual );
        assertEquals( wanted.kind, actual.kind );
    }
 
    /**
     * Scans the input {@code false} and checks that the first token returned
     * by the scanner has the kind {@code Parser._False}.
     */
    @Test
    public void testScan_token_false() throws UnsupportedEncodingException {
        System.out.println("testScan_token_false");
        // Arrange: the scanner reads the UTF-8 bytes of the keyword
        final String source = "false";
        final byte[] utf8 = source.getBytes("UTF-8");
        final InputStream input = new ByteArrayInputStream(utf8);
        final Scanner scanner = new Scanner(input);
        final Token wanted = new Token( Parser._False, 0, 0, 0 );
        // Act
        final Token actual = scanner.Scan();
        // Assert: only the token kind is compared
        assertNotNull( actual );
        assertEquals( wanted.kind, actual.kind );
    }

Character Literals

Chapter 3.10.4 Character Literals defines character literals:

Java Syntax Rule
CharacterLiteral:
' SingleCharacter '
' EscapeSequence '

SingleCharacter:
InputCharacter but not ' or \

Chapter 3.10.6 Escape Sequences for Character and String Literals defines escape sequences used in character literals and string literals. Note though, that the unicode escape sequence \uxxxx is not defined below.

Java Syntax Rule
EscapeSequence:
\ b /* \u0008: backspace BS */
\ t /* \u0009: horizontal tab HT */
\ n /* \u000a: linefeed LF */
\ f /* \u000c: form feed FF */
\ r /* \u000d: carriage return CR */
\ " /* \u0022: double quote " */
\ ' /* \u0027: single quote ' */
\ \ /* \u005c: backslash \ */
OctalEscape /* \u0000 to \u00ff: from octal value */

OctalEscape:
\ OctalDigit
\ OctalDigit OctalDigit
\ ZeroToThree OctalDigit OctalDigit

OctalDigit: one of
0 1 2 3 4 5 6 7

ZeroToThree: one of
0 1 2 3

These rules are converted to the following JFlex grammar:

JFlex grammar Rule
%%

%{
// Scratch state shared by the literal-scanning rules below.
// textcontent collects the text between the quotes; nColumn, nLine and nChar
// remember where the opening quote was seen; nCharCount counts how many
// {CharacterLiteral} units were matched inside the current literal.
StringBuffer textcontent = new StringBuffer();
int nColumn, nLine, nChar, nCharCount;
%}

/*
 * One unit of character-literal content: a single character other than a line
 * terminator, single quote or backslash; one of the simple backslash escapes;
 * a backslash-u escape with four hex digits; or an octal escape.
 */
CharacterLiteral = [^\n\r'\\] | \\b | \\f | \\t | \\n | \\r | \\' | \\\" | \\\\ |
\\u[0-9a-fA-F]{4} | \\[0-7]{1,2} | \\[0-3][0-7]{2}

%state CHAR

%%
/* Opening quote: reset the buffer and counter, record the start position
   (column and line are stored 1-based) and switch to the CHAR state. */
<YYINITIAL> {
' { textcontent.setLength(0); nColumn = yycolumn + 1; nLine = yyline + 1;
nChar = yychar; nCharCount = 0; yybegin(CHAR); }
}

<CHAR> {
/* Closing quote: return to YYINITIAL. Exactly one matched unit yields a
   CharacterLiteral token; any other count (including zero, i.e. an empty
   literal) yields an Illegaltoken. The token text is the collected content
   without the surrounding quotes. */
' { yybegin(YYINITIAL);
if ( nCharCount == 1 ) {
return new Token(Parser._CharacterLiteral, nColumn, nLine, nChar,
textcontent.toString());
}
else {
return new Token(Parser._Illegaltoken, nColumn, nLine, nChar,
textcontent.toString());
} }

/* Content unit: append the matched text unchanged (escapes are not decoded)
   and count it. */
{CharacterLiteral} { textcontent.append( yytext() ); ++nCharCount; }
}

The Coco/r parser must also be updated

Coco/R EBNF Rule
TOKENS
CharacterLiteral

Unittest

Character literals come in many forms. The unit tests should cover ordinary ASCII character literals, Unicode character literals and escape sequences. The empty character literal '' should generate an Illegaltoken. A few examples are shown below.

    /**
     * Draws a random character for use inside character and string literals.
     * The draw is {@code randomGenerator.nextInt(94) + 32}; assuming
     * {@code randomGenerator} behaves like {@code java.util.Random}, that is a
     * code between 32 and 125 inclusive. A double quote, single quote or
     * backslash is replaced by the character with the next higher code.
     *
     * @return the selected character
     */
    private char generateRandom_ASCII_character() {
        char candidate = (char) (randomGenerator.nextInt(94) + 32);
        // 34 = double quote, 39 = single quote, 92 = backslash
        if ( candidate == 34 || candidate == 39 || candidate == 92 ) {
            ++candidate;
        }
        return candidate;
    }
 
    /**
     * Wraps a randomly chosen character in single quotes, scans it and checks
     * that the scanner returns a {@code Parser._CharacterLiteral} token whose
     * value is the character without the quotes.
     */
    @Test
    public void testScan_token_CharacterLiteral_ASCII_literal() throws UnsupportedEncodingException {
        System.out.println("testScan_token_CharacterLiteral_ASCII_literal");
        // Arrange
        final char picked = generateRandom_ASCII_character();
        final String unquoted = String.valueOf(picked);
        final String source = "'" + unquoted + "'";
        System.out.println("  Random character literal: " + source );
        final InputStream input = new ByteArrayInputStream(source.getBytes("UTF-8"));
        final Scanner scanner = new Scanner(input);
        final Token wanted = new Token( Parser._CharacterLiteral, 0, 0, 0, unquoted );
        // Act
        final Token actual = scanner.Scan();
        // Assert: kind and text must both match
        assertNotNull( actual );
        assertEquals( wanted.kind, actual.kind );
        assertNotNull( actual.val );
        assertEquals( wanted.val, actual.val );
    }
 
    /**
     * Returns one of the eight simple escape sequences, chosen by
     * {@code randomGenerator.nextInt(8)}. Each sequence is returned as source
     * text: a backslash followed by b, t, n, f, r, a double quote, a single
     * quote or another backslash.
     *
     * @return the two-character escape sequence
     */
    private String generateRandomEscapeCharacters() {
        // Index 7 is the entry the original default branch handled (backslash).
        final String[] escapes = {
            "\\b", "\\t", "\\n", "\\f", "\\r", "\\\"", "\\'", "\\\\"
        };
        return escapes[randomGenerator.nextInt(8)];
    }
 
    /**
     * Wraps a randomly chosen escape sequence in single quotes, scans it and
     * checks that the scanner returns a {@code Parser._CharacterLiteral} token
     * whose value is the escape sequence text without the quotes.
     */
    @Test
    public void testScan_token_CharacterLiteral_escape_literal() throws UnsupportedEncodingException {
        System.out.println("testScan_token_CharacterLiteral_escape_literal");
        // Arrange
        final String escape = generateRandomEscapeCharacters();
        final String source = "'" + escape + "'";
        System.out.println("  Random character literal: " + source );
        final InputStream input = new ByteArrayInputStream(source.getBytes("UTF-8"));
        final Scanner scanner = new Scanner(input);
        final Token wanted = new Token( Parser._CharacterLiteral, 0, 0, 0, escape );
        // Act
        final Token actual = scanner.Scan();
        // Assert: kind and text must both match
        assertNotNull( actual );
        assertEquals( wanted.kind, actual.kind );
        assertNotNull( actual.val );
        assertEquals( wanted.val, actual.val );
    }
 
    /**
     * Scans two adjacent single quotes (an empty character literal) and checks
     * that the scanner reports it with the kind {@code Parser._Illegaltoken}.
     */
    @Test
    public void testScan_token_CharacterLiteral_illegal_empty_char() throws UnsupportedEncodingException {
        System.out.println("testScan_token_CharacterLiteral_illegal_empty_char");
        // Arrange: nothing between the quotes
        final String source = "''";
        final InputStream input = new ByteArrayInputStream(source.getBytes("UTF-8"));
        final Scanner scanner = new Scanner(input);
        final Token wanted = new Token( Parser._Illegaltoken, 0, 0, 0 );
        // Act
        final Token actual = scanner.Scan();
        // Assert: only the token kind is compared
        assertNotNull( actual );
        assertEquals( wanted.kind, actual.kind );
    }

String Literals

Chapter 3.10.5 String Literals defines string literals:

Java Syntax Rule
StringLiteral:
" StringCharactersopt "

StringCharacters:
StringCharacter
StringCharacters StringCharacter

StringCharacter:
InputCharacter but not " or \
EscapeSequence

These rules are converted to the following JFlex grammar:

JFlex grammar Rule
%%

/*
 * One unit of string content: either a single character that is not a line
 * terminator, a double quote or a backslash, or a backslash together with the
 * character it escapes. Escapes are consumed as pairs so that an escaped
 * backslash directly before the closing quote no longer swallows that quote.
 */
StringLiteral = [^\n\r\"\\] | \\[^\n\r]

%state CHAR, STRING

%%
/* Opening quote: reset the buffer, record the start position (column and
   line are stored 1-based) and switch to the STRING state. */
<YYINITIAL> {
\" { textcontent.setLength(0); nColumn = yycolumn + 1; nLine = yyline + 1;
nChar = yychar; yybegin(STRING); }
}

<STRING> {
/* Closing quote: return to YYINITIAL and emit the collected text, without
   the surrounding quotes, as a StringLiteral token. */
\" { yybegin(YYINITIAL); return new Token(Parser._StringLiteral, nColumn, nLine,
nChar, textcontent.toString()); }
/* Content unit: append the matched text unchanged (escapes are not decoded). */
{StringLiteral} { textcontent.append( yytext() ); }
}

The Coco/r parser must also be updated

Coco/R EBNF Rule
TOKENS
StringLiteral

Unittest

Unit tests should be created for the empty string and for a few legal ASCII and Unicode strings.

@Test
    public void testScan_token_StringLiteral_empty_string() throws UnsupportedEncodingException {
        // Scans two adjacent double quotes and expects a StringLiteral token
        // whose value is the empty string.
        System.out.println("testScan_token_StringLiteral_empty_string");
        // Arrange
        final String unquoted = "";
        final String source = '"' + unquoted + '"';
        System.out.println("  String literal: " + source );
        final InputStream input = new ByteArrayInputStream(source.getBytes("UTF-8"));
        final Scanner scanner = new Scanner(input);
        final Token wanted = new Token( Parser._StringLiteral, 0, 0, 0, unquoted );
        // Act
        final Token actual = scanner.Scan();
        // Assert: kind and text must both match
        assertNotNull( actual );
        assertEquals( wanted.kind, actual.kind );
        assertNotNull( actual.val );
        assertEquals( wanted.val, actual.val );
    }
 
    /**
     * Builds a string from characters produced by
     * {@code generateRandom_ASCII_character()}. The length is taken from
     * {@code randomGenerator.nextInt(20)}, so the result may be empty.
     *
     * @return the generated string
     */
    private String randomASCIIString() {
        final StringBuilder text = new StringBuilder();
        int remaining = randomGenerator.nextInt(20);
        while ( remaining > 0 ) {
            text.append( generateRandom_ASCII_character() );
            --remaining;
        }
        return text.toString();
    }
 
    /**
     * Wraps a random string from {@code randomASCIIString()} in double quotes,
     * scans it and checks that the scanner returns a
     * {@code Parser._StringLiteral} token whose value is the string without
     * the quotes.
     */
    @Test
    public void testScan_token_StringLiteral_ASCII_string() throws UnsupportedEncodingException {
        System.out.println("testScan_token_StringLiteral_ASCII_string");
        // Arrange
        final String unquoted = randomASCIIString();
        final String source = '"' + unquoted + '"';
        System.out.println("  String literal: " + source );
        final InputStream input = new ByteArrayInputStream(source.getBytes("UTF-8"));
        final Scanner scanner = new Scanner(input);
        final Token wanted = new Token( Parser._StringLiteral, 0, 0, 0, unquoted );
        // Act
        final Token actual = scanner.Scan();
        // Assert: kind and text must both match
        assertNotNull( actual );
        assertEquals( wanted.kind, actual.kind );
        assertNotNull( actual.val );
        assertEquals( wanted.val, actual.val );
    }
 
    /**
     * Draws a random non-ASCII character. The code is 0x0100 plus
     * {@code randomGenerator.nextInt(0x02B0 - 0x0100)}; assuming
     * {@code randomGenerator} behaves like {@code java.util.Random}, the
     * result lies between 0x0100 and 0x02AF inclusive.
     *
     * @return the selected character
     */
    private char generateUnicodeLiteral() {
        final int lowest = 0x0100;
        final int span = 0x02B0 - lowest;
        final int offset = randomGenerator.nextInt(span);
        return (char) (offset + lowest);
    }
 
    /**
     * Builds a string from characters produced by
     * {@code generateUnicodeLiteral()}. The length is taken from
     * {@code randomGenerator.nextInt(20)}, so the result may be empty.
     *
     * @return the generated string
     */
    private String randomUnicodeString() {
        final StringBuilder text = new StringBuilder();
        int remaining = randomGenerator.nextInt(20);
        while ( remaining > 0 ) {
            text.append( generateUnicodeLiteral() );
            --remaining;
        }
        return text.toString();
    }
 
    /**
     * Wraps a random string from {@code randomUnicodeString()} in double
     * quotes, scans its UTF-8 encoding and checks that the scanner returns a
     * {@code Parser._StringLiteral} token whose value is the string without
     * the quotes.
     */
    @Test
    public void testScan_token_StringLiteral_Unicode_string() throws UnsupportedEncodingException {
        System.out.println("testScan_token_StringLiteral_Unicode_string");
        // Arrange
        final String unquoted = randomUnicodeString();
        final String source = '"' + unquoted + '"';
        System.out.println("  String literal: " + source );
        final InputStream input = new ByteArrayInputStream(source.getBytes("UTF-8"));
        final Scanner scanner = new Scanner(input);
        final Token wanted = new Token( Parser._StringLiteral, 0, 0, 0, unquoted );
        // Act
        final Token actual = scanner.Scan();
        // Assert: kind and text must both match
        assertNotNull( actual );
        assertEquals( wanted.kind, actual.kind );
        assertNotNull( actual.val );
        assertEquals( wanted.val, actual.val );
    }

null Literal

Chapter 3.10.7 The Null Literal defines the null literal:

Java Syntax Rule
NullLiteral:
null

These rules are converted to the following JFlex grammar:

JFlex grammar Rule
%%

%%
/* The null literal is reported with its own token kind. */
<YYINITIAL> {
  "null" { return new Token(Parser._Null, yycolumn + 1, yyline + 1, yychar,
                            yytext()); }
}

The Coco/r parser must also be updated

Coco/R EBNF Rule
TOKENS
 Null

Unittest

Check that null generates the correct token

    @Test
    public void testScan_token_null() throws UnsupportedEncodingException {
        System.out.println("testScan_token_null");
        // Initialize
        String sContent = "null";
        InputStream is = new ByteArrayInputStream(sContent.getBytes("UTF-8"));
        Scanner instance = new Scanner(is);
        Token expected = new Token( Parser._Null, 0, 0, 0 );
        // Test
        Token result = instance.Scan();
        // Validate
        assertNotNull( result );
        assertEquals( expected.kind, result.kind );
    }

Separators

Chapter 3.11 Separators defines java separator characters:

Java Syntax Rule
Separator: one of
( ) { } [ ] ; , .

These rules are converted to the following JFlex grammar. Only a few of the rules are shown because they are all very similar.

JFlex grammar Rule
%%

%%
/* Separators: each one is a single character mapped to its own token kind. */
<YYINITIAL> {
  "(" { return new Token(Parser._LeftParenthesis, yycolumn + 1, yyline + 1, yychar,
                         yytext()); }
  ")" { return new Token(Parser._RightParenthesis, yycolumn + 1, yyline + 1, yychar,
                         yytext()); }

  ";" { return new Token(Parser._SemiColon, yycolumn + 1, yyline + 1, yychar,
                         yytext()); }
  "," { return new Token(Parser._Comma, yycolumn + 1, yyline + 1, yychar,
                         yytext()); }
  "." { return new Token(Parser._Period, yycolumn + 1, yyline + 1, yychar,
                         yytext()); }
}

The Coco/r parser must also be updated

Coco/R EBNF Rule
TOKENS
LeftParenthesis
RightParenthesis
LeftCurlyBrace
RightCurlyBrace
LeftSquareBracket
RightSquareBracket
SemiColon
Comma
Period

Unittests

Verify that the separators are tokenized correctly. Just one is shown:

    /**
     * Scans a single opening parenthesis and checks that the first token
     * returned by the scanner has the kind {@code Parser._LeftParenthesis}.
     */
    @Test
    public void testScan_token_leftParenthesis() throws UnsupportedEncodingException {
        System.out.println("testScan_token_leftParenthesis");
        // Arrange
        final String source = "(";
        final byte[] utf8 = source.getBytes("UTF-8");
        final InputStream input = new ByteArrayInputStream(utf8);
        final Scanner scanner = new Scanner(input);
        final Token wanted = new Token( Parser._LeftParenthesis, 0, 0, 0 );
        // Act
        final Token actual = scanner.Scan();
        // Assert: only the token kind is compared
        assertNotNull( actual );
        assertEquals( wanted.kind, actual.kind );
    }

Operators

Chapter 3.12 Operators defines java operators:

Java Syntax Rule
Operator: one of
= < > ! ~ ? : == <= >=
!= && || ++ -- + - * / &
| ^ % << >> >>> += -= *= /=
&= |= ^= %= <<= >>= >>>=

These rules are converted to the following JFlex grammar:

JFlex grammar Rule
%%

%%
/*
 * Java operators (JLS 3.12), one rule per operator.
 *
 * Every operator is written as a quoted string literal. Many operator
 * characters (angle brackets, exclamation mark, tilde, question mark, star,
 * plus, vertical bar, caret, slash) are JFlex regular-expression
 * metacharacters when unquoted and would not match their own text.
 *
 * The scanner always prefers the longest match, so the four-character
 * unsigned-shift assignment wins over its shorter prefixes regardless of
 * rule order.
 *
 * Token names are kept exactly as declared in the Coco/R TOKENS section.
 */
<YYINITIAL> {
"="    { return new Token(Parser._Assignment, yycolumn + 1, yyline + 1, yychar,
                          yytext()); }
"<"    { return new Token(Parser._LesserThan, yycolumn + 1, yyline + 1, yychar,
                          yytext()); }
">"    { return new Token(Parser._GreaterThan, yycolumn + 1, yyline + 1, yychar,
                          yytext()); }
"!"    { return new Token(Parser._ExclamationMark, yycolumn + 1, yyline + 1, yychar,
                          yytext()); }
"~"    { return new Token(Parser._Tilde, yycolumn + 1, yyline + 1, yychar,
                          yytext()); }
"?"    { return new Token(Parser._QuestonMark, yycolumn + 1, yyline + 1, yychar,
                          yytext()); }
":"    { return new Token(Parser._Colon, yycolumn + 1, yyline + 1, yychar,
                          yytext()); }
"=="   { return new Token(Parser._Equal, yycolumn + 1, yyline + 1, yychar,
                          yytext()); }
"<="   { return new Token(Parser._LessanThanOrEqual, yycolumn + 1, yyline + 1, yychar,
                          yytext()); }
">="   { return new Token(Parser._GreaterThanOrEqual, yycolumn + 1, yyline + 1, yychar,
                          yytext()); }
"!="   { return new Token(Parser._NotEqual, yycolumn + 1, yyline + 1, yychar,
                          yytext()); }
"&&"   { return new Token(Parser._LogicAnd, yycolumn + 1, yyline + 1, yychar,
                          yytext()); }
"||"   { return new Token(Parser._LogicOr, yycolumn + 1, yyline + 1, yychar,
                          yytext()); }
"++"   { return new Token(Parser._Increment, yycolumn + 1, yyline + 1, yychar,
                          yytext()); }
"--"   { return new Token(Parser._Decrement, yycolumn + 1, yyline + 1, yychar,
                          yytext()); }
"+"    { return new Token(Parser._Plus, yycolumn + 1, yyline + 1, yychar,
                          yytext()); }
"-"    { return new Token(Parser._Minus, yycolumn + 1, yyline + 1, yychar,
                          yytext()); }

"*"    { return new Token(Parser._Asterix, yycolumn + 1, yyline + 1, yychar,
                          yytext()); }
"/"    { return new Token(Parser._Slash, yycolumn + 1, yyline + 1, yychar,
                          yytext()); }
"&"    { return new Token(Parser._BitwiseAnd, yycolumn + 1, yyline + 1, yychar,
                          yytext()); }
"|"    { return new Token(Parser._BitwiseOr, yycolumn + 1, yyline + 1, yychar,
                          yytext()); }
"^"    { return new Token(Parser._BitwiseXor, yycolumn + 1, yyline + 1, yychar,
                          yytext()); }
"%"    { return new Token(Parser._Modulo, yycolumn + 1, yyline + 1, yychar,
                          yytext()); }
"<<"   { return new Token(Parser._BitwiseShiftLeft, yycolumn + 1, yyline + 1, yychar,
                          yytext()); }
">>"   { return new Token(Parser._BitwiseShiftRight, yycolumn + 1, yyline + 1, yychar,
                          yytext()); }
">>>"  { return new Token(Parser._BitwiseUnsignedShiftRight, yycolumn + 1, yyline + 1, yychar,
                          yytext()); }
"+="   { return new Token(Parser._PlusAssignment, yycolumn + 1, yyline + 1, yychar,
                          yytext()); }
"-="   { return new Token(Parser._MinusAssignment, yycolumn + 1, yyline + 1, yychar,
                          yytext()); }
"*="   { return new Token(Parser._AsterixAssignment, yycolumn + 1, yyline + 1, yychar,
                          yytext()); }
"/="   { return new Token(Parser._SlashAssignment, yycolumn + 1, yyline + 1, yychar,
                          yytext()); }
"&="   { return new Token(Parser._BitwiseAndAssignment, yycolumn + 1, yyline + 1, yychar,
                          yytext()); }
"|="   { return new Token(Parser._BitwiseOrAssignment, yycolumn + 1, yyline + 1, yychar,
                          yytext()); }
"^="   { return new Token(Parser._BitwiseXorAssignment, yycolumn + 1, yyline + 1, yychar,
                          yytext()); }

"%="   { return new Token(Parser._ModuloAssignment, yycolumn + 1, yyline + 1, yychar,
                          yytext()); }
"<<="  { return new Token(Parser._BitwiseShiftLeftAssignment, yycolumn + 1, yyline + 1, yychar,
                          yytext()); }
">>="  { return new Token(Parser._BitwiseShiftRightAssignment, yycolumn + 1, yyline + 1, yychar,
                          yytext()); }
">>>=" { return new Token(Parser._BitwiseUnsignedShiftRightAssignment, yycolumn + 1, yyline + 1, yychar,
                          yytext()); }
}

The Coco/r parser must also be updated

Coco/R EBNF Rule
TOKENS
Assignment
LesserThan
GreaterThan
ExclamationMark
Tilde
QuestonMark
Colon
Equal
LessanThanOrEqual
GreaterThanOrEqual
NotEqual
LogicAnd
LogicOr
Increment
Decrement
Plus
Minus
Asterix
Slash
BitwiseAnd
BitwiseOr
BitwiseXor
Modulo
BitwiseShiftLeft
BitwiseShiftRight
BitwiseUnsignedShiftRight
PlusAssignment
MinusAssignment
AsterixAssignment
SlashAssignment
BitwiseAndAssignment
BitwiseOrAssignment
BitwiseXorAssignment
ModuloAssignment
BitwiseShiftLeftAssignment
BitwiseShiftRightAssignment
BitwiseUnsignedShiftRightAssignment

Unittest

Quite a few unittests are needed to test all operators. Here are some

    /**
     * Scans a single greater-than sign and checks that the first token
     * returned by the scanner has the kind {@code Parser._GreaterThan}.
     */
    @Test
    public void testScan_token_greaterThan() throws UnsupportedEncodingException {
        System.out.println("testScan_token_greaterThan");
        // Arrange
        final String source = ">";
        final byte[] utf8 = source.getBytes("UTF-8");
        final InputStream input = new ByteArrayInputStream(utf8);
        final Scanner scanner = new Scanner(input);
        final Token wanted = new Token( Parser._GreaterThan, 0, 0, 0 );
        // Act
        final Token actual = scanner.Scan();
        // Assert: only the token kind is compared
        assertNotNull( actual );
        assertEquals( wanted.kind, actual.kind );
    }
 
    /**
     * Scans two consecutive greater-than signs and checks that they come back
     * as one token of kind {@code Parser._BitwiseShiftRight}.
     */
    @Test
    public void testScan_token_BitwiseShiftRight() throws UnsupportedEncodingException {
        System.out.println("testScan_token_BitwiseShiftRight");
        // Arrange
        final String source = ">>";
        final byte[] utf8 = source.getBytes("UTF-8");
        final InputStream input = new ByteArrayInputStream(utf8);
        final Scanner scanner = new Scanner(input);
        final Token wanted = new Token( Parser._BitwiseShiftRight, 0, 0, 0 );
        // Act
        final Token actual = scanner.Scan();
        // Assert: only the token kind is compared
        assertNotNull( actual );
        assertEquals( wanted.kind, actual.kind );
    }
 
    /**
     * Scans three consecutive greater-than signs and checks that they come
     * back as one token of kind {@code Parser._BitwiseUnsignedShiftRight}.
     */
    @Test
    public void testScan_token_BitwiseUnsignedShiftRight() throws UnsupportedEncodingException {
        System.out.println("testScan_token_BitwiseUnsignedShiftRight");
        // Arrange
        final String source = ">>>";
        final byte[] utf8 = source.getBytes("UTF-8");
        final InputStream input = new ByteArrayInputStream(utf8);
        final Scanner scanner = new Scanner(input);
        final Token wanted = new Token( Parser._BitwiseUnsignedShiftRight, 0, 0, 0 );
        // Act
        final Token actual = scanner.Scan();
        // Assert: only the token kind is compared
        assertNotNull( actual );
        assertEquals( wanted.kind, actual.kind );
    }

So far…

Now, all tokens are defined. It is time to define grammar rules.

Next Chapter Grammar Rules - part 5


<- back

Unless otherwise stated, the content of this page is licensed under Creative Commons Attribution-ShareAlike 3.0 License