A general call went out on a compiler message group I'm a member of for example implementations of a simple lexical scanner that meets this specification: http://rosettacode.org/wiki/Compiler/lexical_analyzer. Since a C# example had not yet been supplied, I wrote a straightforward, heavily documented implementation in that language. The full listing follows.
using System;
using System.IO;
using System.Linq;
using System.Collections.Generic;

namespace Rosetta
{
    public enum TokenType
    {
        End_of_input, Op_multiply, Op_divide, Op_mod, Op_add, Op_subtract,
        Op_negate, Op_not, Op_less, Op_lessequal, Op_greater, Op_greaterequal,
        Op_equal, Op_notequal, Op_assign, Op_and, Op_or, Keyword_if,
        Keyword_else, Keyword_while, Keyword_print, Keyword_putc, LeftParen,
        RightParen, LeftBrace, RightBrace, Semicolon, Comma, Identifier,
        Integer, String, None
    }

    /// <summary>
    /// Storage class for tokens
    /// </summary>
    public class Token
    {
        public TokenType Type { get; set; }
        public int Line { get; set; }
        public int Position { get; set; }
        public string Value { get; set; }

        public override string ToString()
        {
            if (Type == TokenType.Integer || Type == TokenType.Identifier)
            {
                return String.Format("{0,-5} {1,-5} {2,-14} {3}", Line, Position, Type.ToString(), Value);
            }
            else if (Type == TokenType.String)
            {
                return String.Format("{0,-5} {1,-5} {2,-14} \"{3}\"", Line, Position, Type.ToString(), Value.Replace("\n", "\\n"));
            }
            return String.Format("{0,-5} {1,-5} {2,-14}", Line, Position, Type.ToString());
        }
    }

    /// <summary>
    /// C# Example of Lexical scanner for Rosetta Compiler
    /// </summary>
    public class LexicalScanner
    {
        // character classes
        private const string _letters = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_";
        private const string _numbers = "0123456789";
        private const string _identifier = _letters + _numbers + "_";
        private const string _whitespace = " \t\n\r";

        // mappings from string keywords to token type
        private Dictionary<string, TokenType> _keywordTokenTypeMap = new Dictionary<string, TokenType>()
        {
            { "if", TokenType.Keyword_if },
            { "else", TokenType.Keyword_else },
            { "while", TokenType.Keyword_while },
            { "print", TokenType.Keyword_print },
            { "putc", TokenType.Keyword_putc }
        };

        // mappings from simple operators to token type
        private Dictionary<string, TokenType> _operatorTokenTypeMap = new Dictionary<string, TokenType>()
        {
            { "+", TokenType.Op_add },
            { "-", TokenType.Op_subtract },
            { "*", TokenType.Op_multiply },
            { "/", TokenType.Op_divide },
            { "%", TokenType.Op_mod },
            { "=", TokenType.Op_assign },
            { "<", TokenType.Op_less },
            { ">", TokenType.Op_greater },
            { "!", TokenType.Op_not },
        };

        private List<string> _keywords;
        private string _operators = "+-*/%=<>!%";

        private string _code;
        private List<Token> tokens = new List<Token>();

        private int _line = 1;
        private int _position = 1;

        public string CurrentCharacter
        {
            get
            {
                try
                {
                    return _code.Substring(0, 1);
                }
                catch (ArgumentOutOfRangeException)
                {
                    return "";
                }
            }
        }

        /// <summary>
        /// Lexical scanner initialiser
        /// </summary>
        /// <param name="code">Code to be tokenised</param>
        public LexicalScanner(string code)
        {
            _code = code;
            _keywords = _keywordTokenTypeMap.Keys.ToList();
        }

        /// <summary>
        /// Advance the cursor forward the given number of characters
        /// </summary>
        /// <param name="characters">Number of characters to advance</param>
        private void advance(int characters = 1)
        {
            try
            {
                // reset position when there is a newline
                if (CurrentCharacter == "\n")
                {
                    _position = 0;
                    _line++;
                }

                _code = _code.Substring(characters, _code.Length - characters);
                _position += characters;
            }
            catch (ArgumentOutOfRangeException)
            {
                _code = "";
            }
        }

        /// <summary>
        /// Outputs error message to the console and exits
        /// </summary>
        /// <param name="message">Error message to display to user</param>
        /// <param name="line">Line error occurred on</param>
        /// <param name="position">Line column that the error occurred at</param>
        public void error(string message, int line, int position)
        {
            // output error to the console and exit
            Console.WriteLine(String.Format("{0} @ {1}:{2}", message, line, position));
            Environment.Exit(1);
        }

        /// <summary>
        /// Pattern matching using first & follow matching
        /// </summary>
        /// <param name="recogniseClass">String of characters that identifies the token type
        /// or the exact match to be made if exact:true</param>
        /// <param name="matchClass">String of characters to match against remaining target characters</param>
        /// <param name="tokenType">Type of token the match represents.</param>
        /// <param name="notNextClass">Optional class of characters that cannot follow the match</param>
        /// <param name="maxLen">Optional maximum length of token value</param>
        /// <param name="exact">Denotes whether recogniseClass represents an exact match or class match.
        /// Default: false</param>
        /// <param name="discard">Denotes whether the token is kept or discarded. Default: false</param>
        /// <param name="offset">Optional line position offset to account for discarded tokens</param>
        /// <returns>Boolean indicating if a match was made</returns>
        public bool match(string recogniseClass, string matchClass, TokenType tokenType,
                          string notNextClass = null, int maxLen = Int32.MaxValue, bool exact = false,
                          bool discard = false, int offset = 0)
        {
            // if we've hit the end of the file, there's no more matching to be done
            if (CurrentCharacter == "")
                return false;

            // store _current_ line and position so that our vectors point at the start
            // of each token
            int line = _line;
            int position = _position;

            // special case exact tokens to avoid needing to worry about backtracking
            if (exact)
            {
                if (_code.StartsWith(recogniseClass))
                {
                    if (!discard)
                        tokens.Add(new Token() { Type = tokenType, Value = recogniseClass, Line = line, Position = position - offset });
                    advance(recogniseClass.Length);
                    return true;
                }
                return false;
            }

            // first match - usually denotes the token type
            if (!recogniseClass.Contains(CurrentCharacter))
                return false;

            string tokenValue = CurrentCharacter;
            advance();

            // follow match while we haven't exceeded maxLen and there are still characters
            // in the code stream
            while ((matchClass ?? "").Contains(CurrentCharacter) && tokenValue.Length <= maxLen && CurrentCharacter != "")
            {
                tokenValue += CurrentCharacter;
                advance();
            }

            // ensure that any incompatible characters are not next to the token
            // e.g. 42fred is invalid, and neither recognised as a number nor an identifier;
            // _letters would be the notNextClass
            if (notNextClass != null && notNextClass.Contains(CurrentCharacter))
                error("Unrecognised character: " + CurrentCharacter, _line, _position);

            // only add tokens to the stack that aren't marked as discard - we don't want
            // things like open and close quotes/comments
            if (!discard)
            {
                Token token = new Token() { Type = tokenType, Value = tokenValue, Line = line, Position = position - offset };
                tokens.Add(token);
            }
            return true;
        }

        /// <summary>
        /// Tokenise the input code
        /// </summary>
        /// <returns>List of Tokens</returns>
        public List<Token> scan()
        {
            while (CurrentCharacter != "")
            {
                // match whitespace
                match(_whitespace, _whitespace, TokenType.None, discard: true);

                // match integers
                match(_numbers, _numbers, TokenType.Integer, notNextClass: _letters);

                // match identifiers and keywords
                if (match(_letters, _identifier, TokenType.Identifier))
                {
                    Token match = tokens.Last();
                    if (_keywords.Contains(match.Value))
                        match.Type = _keywordTokenTypeMap[match.Value];
                }

                // match strings similarly to comments, without allowing newlines -
                // this token doesn't get discarded though
                if (match("\"", null, TokenType.String, discard: true))
                {
                    string value = "";
                    int position = _position;
                    while (!match("\"", null, TokenType.String, discard: true))
                    {
                        // newlines are not allowed in strings
                        if (CurrentCharacter == "\n")
                            error("End-of-line while scanning string literal. Closing string character not found before end-of-line", _line, _position);
                        // end of file reached before finding end of string
                        if (CurrentCharacter == "")
                            error("End-of-file while scanning string literal. Closing string character not found", _line, _position);

                        value += CurrentCharacter;

                        // deal with escape sequences - we only accept newline (\n)
                        if (value.Length >= 2)
                        {
                            string lastCharacters = value.Substring(value.Length - 2, 2);
                            if (lastCharacters[0] == '\\')
                            {
                                if (lastCharacters[1] != 'n')
                                {
                                    error("Unknown escape sequence. ", _line, position);
                                }
                                value = value.Substring(0, value.Length - 2).ToString() + "\n";
                            }
                        }
                        advance();
                    }
                    tokens.Add(new Token() { Type = TokenType.String, Value = value, Line = _line, Position = position - 1 });
                }

                // match character literals
                if (match("'", null, TokenType.Integer, discard: true))
                {
                    int value;
                    int position = _position;

                    value = CurrentCharacter.ToCharArray()[0];
                    advance();

                    // deal with empty literals ''
                    if (value == '\'')
                        error("Empty character literal", _line, _position);

                    // deal with escaped characters, only need to worry about \n and \\ -
                    // throw an error on any other
                    if (value == '\\')
                    {
                        if (CurrentCharacter == "n")
                        {
                            value = '\n';
                        }
                        else if (CurrentCharacter == "\\")
                        {
                            value = '\\';
                        }
                        else
                        {
                            error("Unknown escape sequence. ", _line, _position - 1);
                        }
                        advance();
                    }

                    // if we haven't hit a closing ' here, there are too many characters
                    // in the literal
                    if (!match("'", null, TokenType.Integer, discard: true))
                        error("Multi-character constant", _line, _position);

                    tokens.Add(new Rosetta.Token() { Type = TokenType.Integer, Value = value.ToString(), Line = _line, Position = position - 1 });
                }

                // match comments by checking for the starting token, then advancing
                // until the closing token is matched
                if (match("/*", null, TokenType.None, exact: true, discard: true))
                {
                    while (!match("*/", null, TokenType.None, exact: true, discard: true))
                    {
                        // reached the end of the file without closing the comment!
                        if (CurrentCharacter == "")
                            error("End-of-file in comment. Closing comment characters not found.", _line, _position);
                        advance();
                    }
                    continue;
                }

                // match complex operators
                match("<=", null, TokenType.Op_lessequal, exact: true);
                match(">=", null, TokenType.Op_greaterequal, exact: true);
                match("==", null, TokenType.Op_equal, exact: true);
                match("!=", null, TokenType.Op_notequal, exact: true);
                match("&&", null, TokenType.Op_and, exact: true);
                match("||", null, TokenType.Op_or, exact: true);

                // match simple operators
                if (match(_operators, null, TokenType.None, maxLen: 1))
                {
                    Token match = tokens.Last();
                    match.Type = _operatorTokenTypeMap[match.Value];
                }

                // brackets, braces and separators
                match("(", null, TokenType.LeftParen, exact: true);
                match(")", null, TokenType.RightParen, exact: true);
                match("{", null, TokenType.LeftBrace, exact: true);
                match("}", null, TokenType.RightBrace, exact: true);
                match(";", null, TokenType.Semicolon, exact: true);
                match(",", null, TokenType.Comma, exact: true);
            }

            // end of file token
            tokens.Add(new Rosetta.Token() { Type = TokenType.End_of_input, Line = _line, Position = _position });
            return tokens;
        }

        static void Main(string[] args)
        {
            StreamReader inputFile;

            // if we passed in a filename, read code from that, else
            // read code from stdin
            if (args.Length > 0)
            {
                string path = args[0];
                try
                {
                    inputFile = new StreamReader(path);
                }
                catch (IOException)
                {
                    inputFile = new StreamReader(Console.OpenStandardInput(8192));
                }
            }
            else
            {
                inputFile = new StreamReader(Console.OpenStandardInput(8192));
            }

            string code = inputFile.ReadToEnd();

            // strip Windows line endings out
            code = code.Replace("\r", "");

            LexicalScanner scanner = new LexicalScanner(code);
            List<Token> tokens = scanner.scan();

            foreach (Token token in tokens)
            {
                Console.WriteLine(token.ToString());
            }
        }
    }
}
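Almost all of the work is done by the single match() helper: each token class, from whitespace to operators, is described as a "first" set, an optional "follow" set and an optional forbidden-next set, so scan() ends up being little more than a list of match() calls. The scanner can also be driven from another program rather than through Main: construct it with the source text and call scan(). The snippet below is a minimal sketch of that usage; the ScannerDemo class and its hard-coded sample source are my own illustration, not part of the Rosetta task, and it assumes the listing above is compiled into the same project.

using System;
using System.Collections.Generic;
using Rosetta;

public static class ScannerDemo
{
    public static void Run()
    {
        // a one-line sample exercising keywords, an identifier, an integer,
        // an operator, brackets and a string literal with a \n escape
        string source = "if (count >= 42) { print(\"done\\n\"); }\n";

        LexicalScanner scanner = new LexicalScanner(source);
        List<Token> tokens = scanner.scan();

        // each token prints as line, column, token type and (where relevant) value
        foreach (Token token in tokens)
            Console.WriteLine(token);
    }
}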
To exercise the scanner I used the Rosetta Code test case that contains every lexical token:

/*
  All lexical tokens - not syntactically correct, but that will
  have to wait until syntax analysis
*/
/* Print   */  print    /* Sub     */  -
/* Putc    */  putc     /* Lss     */  <
/* If      */  if       /* Gtr     */  >
/* Else    */  else     /* Leq     */  <=
/* While   */  while    /* Geq     */  >=
/* Lbrace  */  {        /* Eq      */  ==
/* Rbrace  */  }        /* Neq     */  !=
/* Lparen  */  (        /* And     */  &&
/* Rparen  */  )        /* Or      */  ||
/* Uminus  */  -        /* Semi    */  ;
/* Not     */  !        /* Comma   */  ,
/* Mul     */  *        /* Assign  */  =
/* Div     */  /        /* Integer */  42
/* Mod     */  %        /* String  */  "String literal"
/* Add     */  +        /* Ident   */  variable_name
/* character literal */  '\n'
/* character literal */  '\\'
/* character literal */  ' '
Running the scanner over that file produces one token per line, formatted as line, column, token type and, where relevant, value:

F:\Rosetta\Rosetta\obj\Debug>Rosetta.exe f:\test.txt
5     16    Keyword_print
5     40    Op_subtract
6     16    Keyword_putc
6     40    Op_less
7     16    Keyword_if
7     40    Op_greater
8     16    Keyword_else
8     40    Op_lessequal
9     16    Keyword_while
9     40    Op_greaterequal
10    16    LeftBrace
10    40    Op_equal
11    16    RightBrace
11    40    Op_notequal
12    16    LeftParen
12    40    Op_and
13    16    RightParen
13    40    Op_or
14    16    Op_subtract
14    40    Semicolon
15    16    Op_not
15    40    Comma
16    16    Op_multiply
16    40    Op_assign
17    16    Op_divide
17    40    Integer        42
18    16    Op_mod
18    40    String         "String literal"
19    16    Op_add
19    40    Identifier     variable_name
20    26    Integer        10
21    26    Integer        92
22    26    Integer        32
23    1     End_of_input