teaching machines

CS 330: Lecture 8 – Parsing

February 14, 2018 by . Filed under cs330, lectures, spring 2018.

Dear students,

We started writing our own lexer last time. Today, we’re going to get a parser that tries to make sense of those tokens. But first, we should talk about what kind of programming language we’re writing.

Let’s write a language for drawing things with shapes. Really simple shapes. We’ll need a name for this language, because names come before love, and we must have love to see this thing through to the end.

Maybe our language looks like this:

rectangle 0 0 100 200
circle 50 100 18

It’s pretty light on punctuation. Eventually we’ll work in math, variables, loops, conditionals, and functions. But this is good start.

Let’s make rectangle and circle keywords in our language. Right now the lexer will recognize them as identifiers. We’ll just do a quick check on the identifier that gets slurped up. If it’s a keyword, we emit a special token:

function identifier() {
  while (has(/[A-Za-z]/)) {
    devour();
  }

  if (tokenSoFar == 'rectangle') {
    emit(RECTANGLE);
  } else if (tokenSoFar == 'circle') {
    emit(CIRCLE);
  } else {
    emit(IDENTIFIER);
  }
};

Now it is time to assemble these tokens into a model of our program. That’s the role of the parser. To make a model of our program, we need to think about what a program is. We can describe a program’s anatomy by defining its grammar:

program
  : statement* EOF

statement
  : RECTANGLE expr expr expr expr
  : CIRCLE expr expr expr

expr
  : INTEGER       

The parser’s job is to build up a representation of our program in a language that the computer does know how to execute. It doesn’t know our shaper language, but it does know Javascript! So, we create abstractions for all our program parts:

function Block(statements) {
}

function StatementRectangle(left, top, width, height) {
}

function ExpressionInteger(literal) {
}

We can even write a program directly using these classes:

var program = new Block([
  new StatementRectangle(
    new ExpressionInteger(10),
    new ExpressionInteger(20),
    new ExpressionInteger(100),
    new ExpressionInteger(200)
  )
]);

But this would be like us writing in assembly. We have a higher-level language, which is much less verbose:

rectangle 10 20 100 200

We need to model how these program parts behave when our program is executed. Let’s add an evaluate method to each:

function Block(statements) {
  this.evaluate = function() {
    statements.forEach(statement => statement.evaluate());
  }
}

function StatementRectangle(left, top, width, height) {
  this.evaluate = function() {
    var l = left.evaluate();
    var t = top.evaluate();
    var w = width.evaluate();
    var h = height.evaluate();
    // draw something
  }
}

function ExpressionInteger(literal) {
  this.evaluate = function() {
    return literal;
  }
}

We also need to add a place for our output to show up. We add a canvas element to the HTML:

<canvas id="canvas" width="400" height="300" style="background-color: lightgray;"></canvas>

And grab a reference to it and the drawing context in Javascript:

var canvas = document.getElementById('canvas');
var context = canvas.getContext('2d');

Then we can draw in StatementRectangle with this:

context.fillRect(l, t, w, h);

We are ready know to build the bridge—the translator—between our language and Javascript. Let’s write parse in the same style as our lexer:

function parse(tokens) {
  var i = 0;
  
  function has(tokenType) {
    return i < tokens.length && tokens[i].type == tokenType;
  }

  function devour() {
    var token = tokens[i];
    ++i;
    return token;
  }

  ...
}

We have functions asserting that certain tokens are on the horizon and for advancing past them. Now, the particular elegance of a state machine emerges in the rest of the code. It turns out that we just need to write a function for each of the so-called non-terminals in our grammar:

function program() {
}

function statement() {
}

function expression() {
}

Function program tries to parse a sequence of statements up until EOF:

function program() {
  var statements = [];
  while (!has(EOF)) {
    statements.push(statement());
  }
  return new Block(statements);
}

Function statement tries to parse a rectangle command:

function statement() {
  if (has(RECTANGLE)) {
    consume();
    var l = expression();
    var t = expression();
    var w = expression();
    var h = expression();
    return new StatementRectangle(l, t, w, h);
  } else {
    throw "Unexpected token: " + tokens[i].source;
  }
}

And function expression tries to parse an integer literal:

function expression() {
  if (has(INTEGER)) {
    var e = new ExpressionInteger(parseInt(tokens[i].source));
    devour();
    return e;
  } else {
    throw "Unexpected token: " + tokens[i].source;
  }
}

That’s really the core our parsing. We just need to get the ball rolling at the end of parse:

return program();

Whoever gets the results can choose to evaluate our tree when they are ready. We will do so immediately in our onchange callback:

var lexer = new Lexer(editor.value);
var tokens = lexer.lex();
var parser = new Parser(tokens);
var ast = parser.parse(); // ast -> abstract syntax tree
ast.evaluate();

We should see our program come to life!

If we make an edit and run it again, the canvas still has the old pixels on it. We can clear it with this code:

context.clearRect(0, 0, canvas.width, canvas.height);

Let’s add support for circles. We use this class for our AST:

function StatementCircle(originXExpression, originYExpression, radiusExpression) {
  this.evaluate = function(env) {
    var originX = originXExpression.evaluate(env);
    var originY = originYExpression.evaluate(env);
    var radius = radiusExpression.evaluate(env);
    context.beginPath();
    context.arc(originX, originY, radius, 0, 2 * Math.PI);
    context.fill();
  }
}

And add this logic to our parser’s statement rule:

else if (has(CIRCLE)) {
  devour();
  var ox = expression();
  var oy = expression();
  var r = expression();
  return new StatementCircle(ox, oy, r);
}

If we have time, let’s add variables. We add this model for our AST:

function ExpressionVariableReference(id) {
  this.evaluate = function() {
    ...
  }
}

How do we evaluate a variable? We’ve got to look in up in some sort of table or environment that maps (binds) names to values. We could use some global table, but that would mean all our variables have global scope. Instead, we’re going to send in the table as a parameter to evaluate, which will make sending in various scopes much easier:

function ExpressionVariableReference(id) {
  this.evaluate = function() {
    var value = rhs.evaluate(env);
    env[id] = value;
    return value;
  }
}

We have to change all of our earlier evaluates to accept and pass on this environment recursively.

In our parser’s expression rule, we add:

else if (has(IDENTIFIER)) {
  var id = devour();
  var e = new ExpressionVariableReference(id.source);
  return e;
}

Back in our main, we create our initial environment. Let’s prime it with a variable manually since we don’t have assignment statements yet:

var env = {radius: 50};

Now we can write programs that reference variables.

Here’s your TODO list: new ArrayList().

See you next time!

Sincerely,

P.S. Here’s a haiku just for you:

Programming is math?
Feels more like language to me
Maybe it’s their kid

P.P.S. Here’s the code we wrote together:

lexer.js

var IDENTIFIER = 'IDENTIFIER';
var PLUS = 'PLUS';
var EOF = 'EOF';
var LEFT_PARENTHESIS = 'LEFT_PARENTHESIS';
var RECTANGLE = 'RECTANGLE';
var CIRCLE = 'CIRCLE';
var RIGHT_PARENTHESIS = 'RIGHT_PARENTHESIS';
var DIVIDE = 'DIVIDE';
var MINUS = 'MINUS';
var ASTERISK = 'ASTERISK';
var COMMA = 'COMMA';
var MOD = 'MOD';
var INTEGER = 'INTEGER';

function lex(source) {
  var tokens = [];
  var tokenSoFar = '';
  var i = 0;

  function has(regex) {
    return source.charAt(i).match(regex);
  }

  function devour() {
    tokenSoFar += source.charAt(i);
    i += 1;
  }

  function emit(type) {
    tokens.push({type: type, source: tokenSoFar});
    tokenSoFar = '';
  }

  while (i < source.length) {
    if (has(/\+/)) {
      devour();
      emit(PLUS);
    } else if (has(/%/)) {
      devour();
      emit(MOD);
    } else if (has(/-/)) {
      devour();
      emit(MINUS);
    } else if (has(/\(/)) {
      devour();
      emit(LEFT_PARENTHESIS);
    } else if (has(/\)/)) {
      devour();
      emit(RIGHT_PARENTHESIS);
    } else if (has(/\*/)) {
      devour();
      emit(ASTERISK);
    } else if (has(/\//)) {
      devour();
      if (has(/\//)) {
        while (has(/[^\n]/)) {
          devour();
        }
        tokenSoFar = '';
      } else {
        emit(DIVIDE);
      }
    } else if (has(/\d/)) {
      while (has(/\d/)) {
        devour();
      }
      emit(INTEGER);
    } else if (has(/[a-zA-Z]/)) {
      while (has(/[a-zA-Z0-9]/)) {
        devour();
      }

      if (tokenSoFar == 'rectangle') {
        emit(RECTANGLE);
      } else if (tokenSoFar == 'circle') {
        emit(CIRCLE);
      } else {
        emit(IDENTIFIER);
      }
    } else if (has(/\s/)) {
      devour();
      tokenSoFar = '';
    } else {
      throw 'ack! [' + tokenSoFar + ']';
    }
  }

  emit(EOF);

  return tokens;
}

ast.js

function Block(statements) {
  this.evaluate = function(env) {
    statements.forEach(statement => statement.evaluate(env));
  }
}

function StatementCircle(originXExpression, originYExpression, radiusExpression) {
  this.evaluate = function(env) {
    var originX = originXExpression.evaluate(env); 
    var originY = originYExpression.evaluate(env); 
    var radius = radiusExpression.evaluate(env); 
    context.beginPath();
    context.arc(originX, originY, radius, 0, 2 * Math.PI);
    context.fill();
  }
}

function StatementRectangle(leftExpression, topExpression, widthExpression, heightExpression) {
  this.evaluate = function(env) {
    var l = leftExpression.evaluate(env); 
    var t = topExpression.evaluate(env); 
    var w = widthExpression.evaluate(env); 
    var h = heightExpression.evaluate(env); 
    context.fillRect(l, t, w, h);
  }
}

function ExpressionIntegerLiteral(literal) {
  this.evaluate = function(env) {
    return literal;
  }
}

function ExpressionVariableReference(id) {
  this.evaluate = function(env) {
    return env[id];
  }
}

parser.js

function parse(tokens) {
  var i = 0;

  function has(tokenType) {
    return i < tokens.length && tokens[i].type == tokenType;
  }

  function devour() {
    var token = tokens[i];
    i++;
    return token;
  }

  function program() {
    var statements = [];
    while (!has(EOF)) {
      statements.push(statement());
    }
    return new Block(statements);
  }

  function statement() {
    if (has(RECTANGLE)) {
      devour();
      var leftExpression = expression();
      var topExpression = expression();
      var widthExpression = expression();
      var heightExpression = expression();
      return new StatementRectangle(leftExpression, topExpression, widthExpression, heightExpression);
    } else if (has(CIRCLE)) {
      devour();
      var originXExpression = expression();
      var originYExpression = expression();
      var radiusExpression = expression();
      return new StatementCircle(originXExpression, originYExpression, radiusExpression);
    } else {
      throw 'You messed up big time, foobag. I don\'t know ' + tokens[i].source;
    }
  }

  function expression() {
    if (has(INTEGER)) {
      var token = devour();
      return new ExpressionIntegerLiteral(parseInt(token.source));
    } else if (has(IDENTIFIER)) {
      var token = devour();
      return new ExpressionVariableReference(token.source);
    } else {
      throw 'I expected an expression. That\'s NOT what I found. ' + tokens[i].source;
    }
  }

  return program();
}

index.html

<!DOCTYPE html>
<html>
<head>
  <title>...</title>
  <script src="lexer.js"></script>
  <script src="ast.js"></script>
  <script src="parser.js"></script>
</head>
<body>
  <textarea id="ide" rows="8"></textarea>
  <input type="button" id="run" value="Run">
  <canvas id="canvas" width="400" height="300" style="background-color: lightgray;"></canvas>
  
  <script>
var ide = document.getElementById('ide');
var run = document.getElementById('run');
var canvas = document.getElementById('canvas');
var context = canvas.getContext('2d');

run.onclick = function() {
  context.clearRect(0, 0, canvas.width, canvas.height);

  var source = ide.value;
  var tokens = lex(source);
  var ast = parse(tokens);

  //var program = new Block([
  //  new StatementRectangle(
  //    new ExpressionIntegerLiteral(12),
  //    new ExpressionIntegerLiteral(10),
  //    new ExpressionIntegerLiteral(70),
  //    new ExpressionIntegerLiteral(140)
  //  )
  //]);

  var env = {r: 150};
  ast.evaluate(env);
};

  </script>
</body>
</html>