参加ThoughtWorks University的一个来月没啥事情,闲了写写compiler玩。发现Lexer部分比较基础也比较常用,有很多相似的东西,每次都要写一遍也太麻烦了,下面是我按着JLS(Java Language Specification)写的一个common java-like lexer,对于大多数接近java语法的语言估计是够用了。BTW:这个Lexer定义是TDD出来的,以通过测试为要务,可能可读性不太强。

1.WhiteSpace

 1 WhiteSpace
 2     : (' '    // ASCII SP
 3     |  '\t'   // ASCII HT
 4     |  '\f'   // ASCII FF
 5     |  LineTerminator {newline();}
 6     )+{$setType(Token.SKIP);}
 7     ;
 8 protected LineTerminator
 9     options {generateAmbigWarnings=false;}
10     : '\n'   // ASCII LF
11     | '\r'   // ASCII CR
12     | "\r\n" // ASCII CR followed ASCII LF
13     ;

2.Comments

 1 Comment
 2     : (SingleLineComment | MultiLineComment)
 3     {$setType(Token.SKIP);}
 4     ;
 5 protected SingleLineComment
 6     : "//" (~('\n'|'\r'))* (LineTerminator{newline();})?
 7     ;
 8 protected MultiLineComment
 9     : "/*"
10       (~('\n'|'\r'|'*'| LineTerminator{newline();})* 
11       "*/"
12     ;

3.Escape Sequences

 1 protected EscapeSequence
 2     :'\\'!
 3         ('n' {$setText("\n");}
 4         |'r' {$setText("\r");}
 5         |'t' {$setText("\t");}
 6         |'b' {$setText("\b");}
 7         |'f' {$setText("\f");}
 8         |'"' 
 9         |'\''
10         |'\\'
11         // octal escape
12         |'0'..'3'
13             ( options { warnWhenFollowAmbig = false; }: '0'..'7'
14             ( options { warnWhenFollowAmbig = false; }: '0'..'7')?)?
15         {char c = (char)Integer.parseInt($getText,8); $setText(c);}
16         |'4'..'7'
17             ( options { warnWhenFollowAmbig = false; }: '0'..'7' )?
18         {char c = (char)Integer.parseInt($getText,8); $setText(c);}
19         )
20     | ("\\u"=> UnicodeEscape
21     ;
// A Unicode escape: a backslash, one or more 'u', then exactly four hex
// digits. The backslash is dropped, the run of 'u's is cleared from the text,
// and the four digits are finally replaced by the character they encode.
protected UnicodeEscape
    : '\\'!
      ( 'u' )+ { $setText(""); }
      HexDigit HexDigit HexDigit HexDigit
      {
        char decoded = (char) Integer.parseInt($getText, 16);
        $setText(decoded);
      }
    ;

// A single hexadecimal digit, in either case.
protected HexDigit
    : '0'..'9'
    | 'a'..'f'
    | 'A'..'F'
    ;
27 

这个东西比较麻烦,种类很多:有像\t \n \r这样的escape,也有\uuu1234这样的unicode escape,还有octal escape。说实话,octal escape还是这次写compiler的时候新发现的,以前还真不知道有这么个东西,也从来没用过...汗啊...octal escape是对于0到255(即\0到\377)之间的字符,可以用\012这样的八进制数表示,这个东西没想明白有什么用。反正JLS上写了,就按这个来吧。

4. String & Character Literal

// A double-quoted string. Both quotes are dropped ('!'), so the token text is
// the content of the string with every escape already decoded.
// A raw character may be anything except the closing quote, a backslash or a
// line terminator:
//   - excluding the backslash forces every backslash through EscapeSequence,
//     so a malformed escape is a lexical error instead of being kept verbatim;
//   - excluding CR/LF stops an unterminated string from running on into the
//     following lines without newline() being called (the JLS forbids line
//     terminators inside string literals).
StringLiteral
    : '"'! ( EscapeSequence | ~('"' | '\\' | '\n' | '\r') )* '"'!
    ;
// A single-quoted character. Both quotes are dropped ('!'), so the token text
// is the (decoded) character itself.
// The raw-character set excludes the single quote, not the double quote as
// before, so that a double quote between single quotes is a valid literal and
// the closing quote is never consumed as content. Backslash and line
// terminators are excluded for the same reasons as in a string literal: a
// backslash must start a well-formed EscapeSequence, and a literal may not
// span lines. The content stays optional, as in the original rule.
CharacterLiteral
    : '\''! ( EscapeSequence | ~('\'' | '\\' | '\n' | '\r') )? '\''!
    ;

5. NumericLiteral

 1 NumericLiteral
 2     options{testLiterals = true;}
 3      {int type = 0;}
 4      : ((".end"=> type = EndOfDirective
 5         |(".max"=> type = MaxDirective
 6        |('.' 'a'..'z'=> type = Directives         
 7        | ('+'! | '-')? (type = IntegerLiteral | type = HexIntegerLiteral | type = DoubleLiteral)
 8       )
 9     {$setType(type);}
10     ;
11
// Decimal and octal integers, plus decimal numbers that continue into a
// fraction or an exponent. Returns the token type it settled on.
//
// Part 1 (digits): the type starts out as DecimalIntegerLiteral and is
// switched to OctalIntegerLiteral as soon as an octal digit follows a leading
// zero (that zero is dropped from the text). The octal alternative is listed
// before the bare "0" so that the longer match does not depend on how the
// ambiguity on a leading zero is resolved. The digit alternatives are closed
// by their own parenthesis so that part 2 applies to all of them.
//
// Part 2 (tail): either an optional long suffix, which is dropped from the
// text and upgrades the type to the matching ...LongLiteral, or -- for decimal
// digits only, guarded by the semantic predicate -- a fraction/exponent that
// turns the token into a DoubleLiteral, or a FloatLiteral when an 'f'/'F'
// suffix follows. The floating-point suffixes are dropped from the text too.
protected IntegerLiteral
    returns [int type = 0]
    {$setType(DecimalIntegerLiteral);}
    : ( '0'! ( '0'..'7' {$setType(OctalIntegerLiteral);})+
      | '0'
      | '1'..'9' ('0'..'9')*
      )
      ( (LongTypeSuffix! {
            if (_ttype == OctalIntegerLiteral)
              $setType(OctalLongLiteral);
            else
              $setType(DecimalLongLiteral);
        })?
      | {_ttype == DecimalIntegerLiteral}?
        (FloatingPointPart | ExponentPart) {$setType(DoubleLiteral);}
        (DoubleTypeSuffix! | FloatTypeSuffix!{$setType(FloatLiteral);})?
      ){type = _ttype;}
    ;
// A zero, an 'x' or 'X', then one or more hex digits, optionally ending in a
// long suffix. Prefix and suffix are dropped from the text ('!'); the suffix
// switches the type to HexLongLiteral. The final token type is handed back
// to the caller.
protected HexIntegerLiteral
    returns [int type = 0]
    : ( '0'! ( 'x'! | 'X'! )
        ( HexDigit )+
        ( LongTypeSuffix! { $setType(HexLongLiteral); } )?
      )
      { type = _ttype; }
    ;
// A floating-point number that starts with the decimal point. An optional
// 'd'/'D' or 'f'/'F' suffix is dropped from the text; the float suffix
// switches the type to FloatLiteral. The final token type is handed back to
// the caller.
protected DoubleLiteral
    returns [int type = 0]
    : ( FloatingPointPart
        ( DoubleTypeSuffix!
        | FloatTypeSuffix! { $setType(FloatLiteral); }
        )?
      )
      { type = _ttype; }
    ;
// The fractional part: a dot, at least one digit, and an optional exponent.
protected FloatingPointPart
    : '.'
      ( '0'..'9' )+
      ( ExponentPart )?
    ;

// The exponent: 'E' or 'e', an optional sign, and at least one digit.
protected ExponentPart
    : ( 'E' | 'e' )
      ( '+' | '-' )?
      ( '0'..'9' )+
    ;
// Type suffixes of numeric literals; each one is accepted in either case.
protected LongTypeSuffix
    : 'l' | 'L'
    ;
protected DoubleTypeSuffix
    : 'd' | 'D'
    ;
protected FloatTypeSuffix
    : 'f' | 'F'
    ;

这个是最复杂的一部分...

Unit Test比较长,节选吧

  1 public void testShouldIgnoreWhiteSpaces() throws Exception {
  2   assertRecognized(OctaneTokenTypes.EOF, " ");
  3   assertRecognized(OctaneTokenTypes.EOF, "\t");
  4   assertRecognized(OctaneTokenTypes.EOF, "\f");
  5 }
  6 
  7 public void testShouldIgnoreLineTerminators() throws Exception {
  8   assertRecognized(OctaneTokenTypes.EOF, "\r");
  9   assertRecognized(OctaneTokenTypes.EOF, "\n");
 10   assertRecognized(OctaneTokenTypes.EOF, "\r\n");
 11 }
 12 
 13 public void testShouldIgnoreSingleLineComment() throws Exception {
 14   assertRecognized(OctaneTokenTypes.EOF, "// comments 1234 &*^$\n");
 15 }
 16 
 17 public void testShouldIgnoreMultiLineComment() throws Exception {
 18   assertRecognized(OctaneLexer.EOF, "/* comment line 1\ncomment line 2\n*/");
 19 }
 20 
 21 public void testShouldIncreaseLineNumberIfLineTerminatorsGiven() throws Exception {
 22   assertEquals(2, createLexer("\r").nextToken().getLine());
 23   assertEquals(2, createLexer("\n").nextToken().getLine());
 24   assertEquals(2, createLexer("\r\n").nextToken().getLine());
 25 }
 26 
 27 public void testShouldRecognizeBasicEscapeInCharacterLiteral() throws Exception {
 28   assertRecognized(OctaneTokenTypes.CharacterLiteral, "\n""'\\n'");
 29   assertRecognized(OctaneTokenTypes.CharacterLiteral, "\r""'\\r'");
 30   assertRecognized(OctaneTokenTypes.CharacterLiteral, "\t""'\\t'");
 31   assertRecognized(OctaneTokenTypes.CharacterLiteral, "\b""'\\b'");
 32   assertRecognized(OctaneTokenTypes.CharacterLiteral, "\f""'\\f'");
 33   assertRecognized(OctaneTokenTypes.CharacterLiteral, "\"""'\\\"'");
 34   assertRecognized(OctaneTokenTypes.CharacterLiteral, "\\""'\\\\'");
 35   assertRecognized(OctaneTokenTypes.CharacterLiteral, "\'""'\\\''");
 36 }
 37 
 38 public void testShouldRecognizeBasicEscapeInStringLiteral() throws Exception {
 39   assertRecognized(OctaneTokenTypes.StringLiteral, "\n""\"\\n\"");
 40   assertRecognized(OctaneTokenTypes.StringLiteral, "\r""\"\\r\"");
 41   assertRecognized(OctaneTokenTypes.StringLiteral, "\t""\"\\t\"");
 42   assertRecognized(OctaneTokenTypes.StringLiteral, "\b""\"\\b\"");
 43   assertRecognized(OctaneTokenTypes.StringLiteral, "\f""\"\\f\"");
 44   assertRecognized(OctaneTokenTypes.StringLiteral, "\"""\"\\\"\"");
 45   assertRecognized(OctaneTokenTypes.StringLiteral, "\\""\"\\\\\"");
 46   assertRecognized(OctaneTokenTypes.StringLiteral, "\'""\"\\\'\"");
 47 }
 48 
 49 public void testShouldRecognizeOctalEscapeInCharacterLiteral() throws Exception {
 50   assertRecognized(OctaneTokenTypes.CharacterLiteral, "\077""'\\077'");
 51   assertRecognized(OctaneTokenTypes.CharacterLiteral, "\77""'\\77'");
 52   assertRecognized(OctaneTokenTypes.CharacterLiteral, "\37""'\\37'");
 53   assertRecognized(OctaneTokenTypes.CharacterLiteral, "\7""'\\7'");
 54 }
 55 
 56 public void testShouldRecognizeOctalEscapeInStringLiteral() throws Exception {
 57   assertRecognized(OctaneTokenTypes.StringLiteral, "\077""\"\\077\"");
 58   assertRecognized(OctaneTokenTypes.StringLiteral, "\77""\"\\77\"");
 59   assertRecognized(OctaneTokenTypes.StringLiteral, "\37""\"\\37\"");
 60   assertRecognized(OctaneTokenTypes.StringLiteral, "\7""\"\\7\"");
 61 }
 62 
 63 public void testShouldRecognizeUnicodeEscapeInCharacterLiteral() throws Exception {
 64   assertRecognized(OctaneTokenTypes.CharacterLiteral, "\u1234""'\\u1234'");
 65   assertRecognized(OctaneTokenTypes.CharacterLiteral, "\uu1234","'\\uu1234\'");
 66 }
 67 
 68 public void testShouldRecognizeUnicodeEscapeInStringLiteral() throws Exception {
 69   assertRecognized(OctaneTokenTypes.StringLiteral, "\u1234""\"\\u1234\"");
 70   assertRecognized(OctaneTokenTypes.StringLiteral, "\uu1234""\"\\uu1234\"");
 71 }
 72 
 73 public void testShouldRecognizeUnicodeInStringLiteral() throws Exception {
 74   assertRecognized(OctaneTokenTypes.StringLiteral, "\"这是一行中文\"");
 75 }
 76 
 77 public void testShouldRecognizeDecimalIntegerLiteral() throws Exception {
 78   assertRecognized(OctaneTokenTypes.DecimalIntegerLiteral, "0""0");
 79   assertRecognized(OctaneTokenTypes.DecimalIntegerLiteral, "-123""-123");
 80 }
 81 
 82 public void testShouldRecognizeDecimalLongLiteral() throws Exception {
 83   assertRecognized(OctaneTokenTypes.DecimalLongLiteral, "0""0l");
 84   assertRecognized(OctaneTokenTypes.DecimalLongLiteral, "-123""-123L");
 85 }
 86 
 87 public void testShouldRecognizeHexIntegerLiteral() throws Exception {
 88   assertRecognized(OctaneTokenTypes.HexIntegerLiteral, "1A3B""+0x1A3B");
 89   assertRecognized(OctaneTokenTypes.HexIntegerLiteral, "-1A3B""-0x1A3B");
 90 }
 91 
 92 public void testShouldRecognizeHexLongLiteral() throws Exception {
 93   assertRecognized(OctaneTokenTypes.HexLongLiteral, "1A3B""+0x1A3BL");
 94   assertRecognized(OctaneTokenTypes.HexLongLiteral, "-1A3F""-0x1A3Fl");
 95 }
 96 
 97 public void testShouldRecognizeOctalIntegerLiteral() throws Exception {
 98   assertRecognized(OctaneTokenTypes.OctalIntegerLiteral, "123""+0123");
 99   assertRecognized(OctaneTokenTypes.OctalIntegerLiteral, "-123""-0123");
100 }
101 
102 public void testShouldRecognizeOctalLongLiteral() throws Exception {
103   assertRecognized(OctaneTokenTypes.OctalLongLiteral, "1237""+01237L");
104   assertRecognized(OctaneTokenTypes.OctalLongLiteral, "-1237""-01237l");
105 }
106 
107 public void testShouldRecognizeDoubleLiteral() throws Exception {
108   assertRecognized(OctaneTokenTypes.DoubleLiteral, "0.5""+0.5");
109   assertRecognized(OctaneTokenTypes.DoubleLiteral, "-.5""-.5");
110   assertRecognized(OctaneTokenTypes.DoubleLiteral, "0.5""+0.5D");
111   assertRecognized(OctaneTokenTypes.DoubleLiteral, "-.5""-.5d");
112 }
113 
114 public void testShouldRecognizeDoubleLiteralInExponentialForm() throws Exception {
115   assertRecognized(OctaneTokenTypes.DoubleLiteral, "0.5e+10""+0.5e+10");
116   assertRecognized(OctaneTokenTypes.DoubleLiteral, "-.5E-10""-.5E-10");
117   assertRecognized(OctaneTokenTypes.DoubleLiteral, "0.5E+5""+0.5E+5D");
118   assertRecognized(OctaneTokenTypes.DoubleLiteral, "-.5E-5""-.5E-5d");
119   assertRecognized(OctaneTokenTypes.DoubleLiteral, "10E+5""+10E+5d");
120   assertRecognized(OctaneTokenTypes.DoubleLiteral, "-10e-5""-10e-5D");
121 }
122 
123 public void testShouldRecognizeFloatLiteral() throws Exception {
124   assertRecognized(OctaneTokenTypes.FloatLiteral, "0.5""+0.5F");
125   assertRecognized(OctaneTokenTypes.FloatLiteral, "-.5""-.5f");
126   assertRecognized(OctaneTokenTypes.FloatLiteral, "10E+5""+10E+5f");
127   assertRecognized(OctaneTokenTypes.FloatLiteral, "-10e-5""-10e-5F");
128 }
129 
130 public void testShouldRecognizeFloatLiteralInExponentialForm() throws Exception {
131   assertRecognized(OctaneTokenTypes.FloatLiteral, "0.5E+5""+0.5E+5F");
132   assertRecognized(OctaneTokenTypes.FloatLiteral, "-.5e-5""-.5e-5f");
133 }
134 
135 protected void assertRecognized(int tokenType, String sourceString) throws Exception {
136   assertRecognized(tokenType, null, sourceString);
137 
138 }
139 
140 protected void assertRecognized(int tokenType, String exceptedText, String sourceString) throws Exception {
141   assertRecognized(new int[] { tokenType }, exceptedText == null ? null : new String[] { exceptedText }, sourceString);
142 }
143 
144 protected void assertRecognized(int[] tokenTypes, String[] exceptedText, String sourceString) throws TokenStreamException {
145   TokenStream lexer = createLexer(sourceString);
146   for (int i = 0; i < tokenTypes.length; i++) {
147     Token token = lexer.nextToken();
148     assertEquals(tokenTypes[i], token.getType());
149     if (exceptedText != null) assertEquals(exceptedText[i], token.getText());
150   }
151   assertEquals(OctaneTokenTypes.EOF, lexer.nextToken().getType());
152 }