49. Making my own Teeny Tiny Compiler - Lexer.
I am following this article: Let's make a Teeny Tiny compiler, part 1 by the very amazing Austin Henley. This blog is placeholder for my notes that I am making while I go follow Austin's article.
A compiler basically has the following flow: source code --> lexer (gives tokens) --> parser (gives program tree) --> emitter (gives compiled code)
Let's start with the lexer! Functionality: Given a string of Teeny Tiny code, it will iterate character by character to do two things: decide where each token starts/stops and what type of token it is. If the lexer is unable to do this, then it will report an error for an invalid token.
Steps on building the lexer:
- First we gathered the functions that we need: nextChar, peek, skipWhitespace, skipComment, abort and getToken
- We defined class Token which has two attributes: tokenChar and tokenKind. tokenKind is an enum.
- First we wrote the lexing logic for single operators like +,=, whitespace etc. Then extended this to two char operators like '==' using peek function.
- Then we extended the logic to identify Strings, Comments, Numbers, Identifiers and Keywords
import enum
import sys
class Lexer:
def __init__(self, source):
pass
#Process the next character
def nextChar(self):
self.curPos += 1
if self.curPos >= len(self.source):
self.curChar = '\0' # EOF (End of file)
else:
self.curChar = self.source[self.curPos]
#Return the lookahead character
def peek(self):
if(self.curPos + 1 >= len(self.source)):
return '\0'
else:
return self.source[self.curPos+1]
#Invalid token found, print error message and exit
def abort(self, message):
sys.exit("Lexing error. " + message)
#Skip whitespace except newline which indicates end of line
def skipWhitespace(self):
while self.curChar == ' ' or self.curChar == '\t' or self.curChar == '\r':
self.nextChar()
#Ignore comments
def skipComment(self):
if self.curChar == '#':
while self.curChar != '\n':
self.nextChar()
#Retrun the next token
def getToken(self):
token = None
# Check the first character of this token to see if we can decide what it is.
# If it is a multiple character operator (e.g., !=), number, identifier, or keyword then we will process the rest.
self.skipWhitespace()
self.skipComment()
if self.curChar == '+':
token = Token(self.curChar, TokenType.PLUS)
elif self.curChar == '-':
token = Token(self.curChar, TokenType.MINUS)
elif self.curChar == '*':
token = Token(self.curChar, TokenType.ASTERISK)
elif self.curChar == '/':
token = Token(self.curChar, TokenType.SLASH)
elif self.curChar == '=':
if self.peek() == '=':
lastChar = self.curChar
self.nextChar()
token = Token(lastChar + self.curChar, TokenType.EQEQ)
else:
token = Token(self.curChar, TokenType.EQ)
elif self.curChar == '>':
if self.peek() == '=':
lastChar = self.curChar
self.nextChar()
token = Token(lastChar + self.curChar, TokenType.GTEQ)
else:
token = Token(self.curChar, TokenType.GT)
elif self.curChar == '<':
if self.peek() == '=':
lastChar = self.curChar
self.nextChar()
token = Token(lastChar + self.curChar, TokenType.LTEQ)
else:
token = Token(self.curChar, TokenType.LT)
elif self.curChar == '!':
if self.peek() == '=':
lastChar = self.curChar
self.nextChar()
token = Token(lastChar + self.curChar, TokenType.NOTEQ)
else:
self.abort(f"Expected '!=' got !{self.peek()}")
elif self.curChar == '\n':
token = Token(self.curChar, TokenType.NEWLINE)
elif self.curChar == '\0':
token = Token('', TokenType.EOF)
elif self.curChar == '\"':
self.nextChar()
startPos = self.curPos
while self.curChar != '\"':
if self.curChar == '\r' or self.curChar == '\n' or self.curChar == '\t' or self.curChar == '\\' or self.curChar == '%':
self.abort("Illegal character in string.")
self.nextChar()
tokText = self.source[startPos:self.curPos]
token = Token(tokText, TokenType.STRING)
elif self.curChar.isdigit():
# Leading character is a digit, so this must be a number.
# Get all consecutive digits and decimal if there is one.
startPos = self.curPos
while self.peek().isdigit():
self.nextChar()
if self.peek() == '.': # Decimal!
self.nextChar()
# Must have at least one digit after decimal.
if not self.peek().isdigit():
# Error!
self.abort("Illegal character in number.")
while self.peek().isdigit():
self.nextChar()
tokText = self.source[startPos : self.curPos + 1] # Get the substring.
token = Token(tokText, TokenType.NUMBER)
elif self.curChar.isalpha():
# Leading character is a letter, so this must be an identifier or a keyword.
# Get all consecutive alpha numeric characters.
startPos = self.curPos
while self.peek().isalnum():
self.nextChar()
# Check if the token is in the list of keywords.
tokText = self.source[startPos : self.curPos + 1] # Get the substring.
keyword = Token.checkIfKeyword(tokText)
if keyword == None: # Identifier
token = Token(tokText, TokenType.IDENT)
else: # Keyword
token = Token(tokText, keyword)
else:
# Unknown token!
self.abort("Unkown token: " + self.curChar)
self.nextChar()
return token
#Constructor
def __init__(self, source):
self.source = source + '\n' # Source is basically the Source code to lex as a string. Append a newline to simplify lexing/parsing the last token/statement.
self.curChar = '' # Current character in the string.
self.curPos = -1 # Current position in the string.
self.nextChar()
class Token:
def __init__(self, tokenText, tokenKind):
self.text = tokenText # The token's actual text. Used for identifiers, strings, and numbers.
self.kind = tokenKind # The TokenType that this token is classified as.
@staticmethod
def checkIfKeyword(tokenText):
for kind in TokenType:
# Relies on all keyword enum values being 1XX.
if kind.name == tokenText and kind.value >= 100 and kind.value < 200:
return kind
return None
# TokenType is our enum for all the types of tokens.
class TokenType(enum.Enum):
EOF = -1
NEWLINE = 0
NUMBER = 1
IDENT = 2
STRING = 3
# Keywords.
LABEL = 101
GOTO = 102
PRINT = 103
INPUT = 104
LET = 105
IF = 106
THEN = 107
ENDIF = 108
WHILE = 109
REPEAT = 110
ENDWHILE = 111
# Operators.
EQ = 201
PLUS = 202
MINUS = 203
ASTERISK = 204
SLASH = 205
EQEQ = 206
NOTEQ = 207
LT = 208
LTEQ = 209
GT = 210
GTEQ = 211
With this the lexer is completed!!