Implementing a “brainf” compiler

In this example we use libgccjit to construct a compiler for an esoteric programming language that we shall refer to as “brainf”.

The compiler can run the generated code in-process (JIT compilation), or write the generated code as a machine code executable (classic ahead-of-time compilation).

The “brainf” language

brainf scripts operate on an array of bytes, with a notional data pointer within the array.

brainf is hard for humans to read, but it’s trivial to write a parser for it, as there is no lexing; just a stream of bytes. The operations are:

Character

Meaning

>

idx += 1

<

idx -= 1

+

data[idx] += 1

-

data[idx] -= 1

.

output (data[idx])

,

data[idx] = input ()

[

loop until data[idx] == 0

]

end of loop

Anything else

ignored

Unlike the previous example, we’ll implement an ahead-of-time compiler, which reads .bf scripts and outputs executables (though it would be trivial to have it run them JIT-compiled in-process).

Here’s what a simple .bf script looks like:

[
  Emit the uppercase alphabet
]

cell 0 = 26
++++++++++++++++++++++++++

cell 1 = 65
>+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++<

while cell#0 != 0
[
 >
 .      emit cell#1
 +      increment cell@1
 <-     decrement cell@0
]

Note

This example makes use of whitespace and comments for legibility, but could have been written as:

++++++++++++++++++++++++++
>+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++<
[>.+<-]

It’s not a particularly useful language, except for providing compiler-writers with a test case that’s easy to parse.

Converting a brainf script to libgccjit IR

We write simple code to populate a gccjit.Context.


class Paren:
    def __init__(self, b_test, b_body, b_after):
        self.b_test = b_test
        self.b_body = b_body
        self.b_after = b_after

class CompileError(Exception):
    def __init__(self, compiler, msg):
        self.filename = compiler.filename
        self.line = compiler.line
        self.column = compiler.column
        self.msg = msg

    def __str__(self):
        return ("%s:%i:%i: %s"
                % (self.filename, self.line, self.column, self.msg))

class Compiler:
    def __init__(self): #, filename):
        self.ctxt = gccjit.Context()
        if 1:
            self.ctxt.set_int_option(gccjit.IntOption.OPTIMIZATION_LEVEL,
                                     3);
            self.ctxt.set_bool_option(gccjit.BoolOption.DUMP_INITIAL_GIMPLE,
                                      0);
            self.ctxt.set_bool_option(gccjit.BoolOption.DUMP_GENERATED_CODE,
                                      0);
            self.ctxt.set_bool_option(gccjit.BoolOption.DEBUGINFO,
                                      1);
            self.ctxt.set_bool_option(gccjit.BoolOption.DUMP_EVERYTHING,
                                      0);
            self.ctxt.set_bool_option(gccjit.BoolOption.KEEP_INTERMEDIATES,
                                      0);
        self.void_type = self.ctxt.get_type(gccjit.TypeKind.VOID)
        self.int_type = self.ctxt.get_type(gccjit.TypeKind.INT)
        self.byte_type = self.ctxt.get_type(gccjit.TypeKind.UNSIGNED_CHAR)
        self.array_type = self.ctxt.new_array_type(self.byte_type,
                                                   30000)
        self.func_getchar = (
            self.ctxt.new_function(gccjit.FunctionKind.IMPORTED,
                                   self.int_type,
                                   b"getchar", []))
        self.func_putchar = (
            self.ctxt.new_function(gccjit.FunctionKind.IMPORTED,
                                   self.void_type,
                                   b"putchar",
                                   [self.ctxt.new_param(self.int_type,
                                                        b"c")]))
        self.func = self.ctxt.new_function(gccjit.FunctionKind.EXPORTED,
                                           self.void_type, b'func', [])
        self.curblock = self.func.new_block(b"initial")
        self.int_zero = self.ctxt.zero(self.int_type)
        self.int_one = self.ctxt.one(self.int_type)
        self.byte_zero = self.ctxt.zero(self.byte_type)
        self.byte_one = self.ctxt.one(self.byte_type)
        self.data_cells = self.ctxt.new_global(gccjit.GlobalKind.INTERNAL,
                                               self.array_type,
                                               b"data_cells")
        self.idx = self.func.new_local(self.int_type,
                                       b"idx")

        self.open_parens = []

        self.curblock.add_comment(b"idx = 0;")
        self.curblock.add_assignment(self.idx,
                                     self.int_zero)

    def get_current_data(self, loc):
        """Get 'data_cells[idx]' as an lvalue. """
        return self.ctxt.new_array_access(self.data_cells,
                                          self.idx,
                                          loc)


    def current_data_is_zero(self, loc):
        """Get 'data_cells[idx] == 0' as a boolean rvalue."""
        return self.ctxt.new_comparison(gccjit.Comparison.EQ,
                                        self.get_current_data(loc),
                                        self.byte_zero,
                                        loc)

    def compile_char(self, ch):
        """Compile one bf character."""
        loc = self.ctxt.new_location(self.filename,
                                     self.line,
                                     self.column)

        # Turn this on to trace execution, by injecting putchar()
        # of each source char.
        if 0:
            arg = self.ctxt.new_rvalue_from_int (self.int_type,
                                                 ch)
            call = self.ctxt.new_call (self.func_putchar,
                                       [arg],
                                       loc)
            self.curblock.add_eval (call, loc)

        if ch == '>':
            self.curblock.add_comment(b"'>': idx += 1;", loc)
            self.curblock.add_assignment_op(self.idx,
                                            gccjit.BinaryOp.PLUS,
                                            self.int_one,
                                            loc)
        elif ch == '<':
            self.curblock.add_comment(b"'<': idx -= 1;", loc)
            self.curblock.add_assignment_op(self.idx,
                                            gccjit.BinaryOp.MINUS,
                                            self.int_one,
                                            loc)
        elif ch == '+':
            self.curblock.add_comment(b"'+': data[idx] += 1;", loc)
            self.curblock.add_assignment_op(self.get_current_data (loc),
                                            gccjit.BinaryOp.PLUS,
                                            self.byte_one,
                                            loc)
        elif ch == '-':
            self.curblock.add_comment(b"'-': data[idx] -= 1;", loc)
            self.curblock.add_assignment_op(self.get_current_data(loc),
                                            gccjit.BinaryOp.MINUS,
                                            self.byte_one,
                                            loc)
        elif ch == '.':
            arg = self.ctxt.new_cast(self.get_current_data(loc),
                                     self.int_type,
                                     loc)
            call = self.ctxt.new_call(self.func_putchar,
                                      [arg],
                                      loc)
            self.curblock.add_comment(b"'.': putchar ((int)data[idx]);",
                                      loc)
            self.curblock.add_eval(call, loc)
        elif ch == ',':
            call = self.ctxt.new_call(self.func_getchar, [], loc)
            self.curblock.add_comment(b"',': data[idx] = (unsigned char)getchar ();",
                                      loc)
            self.curblock.add_assignment(self.get_current_data(loc),
                                         self.ctxt.new_cast(call,
                                                            self.byte_type,
                                                            loc),
                                         loc)
        elif ch == '[':
            loop_test = self.func.new_block()
            on_zero = self.func.new_block()
            on_non_zero = self.func.new_block()

            self.curblock.end_with_jump(loop_test, loc)

            loop_test.add_comment(b"'['", loc)
            loop_test.end_with_conditional(self.current_data_is_zero(loc),
                                           on_zero,
                                           on_non_zero,
                                           loc)
            self.open_parens.append(Paren(loop_test, on_non_zero, on_zero))
            self.curblock = on_non_zero;
        elif ch == ']':
            self.curblock.add_comment(b"']'", loc)
            if not self.open_parens:
                raise CompileError(self, "mismatching parens")
            paren = self.open_parens.pop()
            self.curblock.end_with_jump(paren.b_test)
            self.curblock = paren.b_after
        elif ch == '\n':
            self.line +=1;
            self.column = 0;

        if ch != '\n':
            self.column += 1


    def parse_into_ctxt(self, filename):
        """
        Parse the given .bf file into the gccjit.Context, containing a
        single "main" function suitable for compiling into an executable.
        """
        self.filename = filename;
        self.line = 1
        self.column = 0
        with open(filename) as f_in:
            for ch in f_in.read():
                self.compile_char(ch)
        self.curblock.end_with_void_return()

    # Compiling to an executable

Compiling a context to a file

In previous examples, we compiled and ran the generated machine code in-process. We can do that:


    def run(self):
        import ctypes
        result = self.ctxt.compile()
        py_func_type = ctypes.CFUNCTYPE(None)
        py_func = py_func_type(result.get_code(b'func'))
        py_func()

but this time we’ll also provide a way to compile the context directly to an executable, using gccjit.Context.compile_to_file().

To do so, we need to export a main function. A helper function for doing so is provided by the JIT API:


def make_main(ctxt):
    """
    Make "main" function:
      int
      main (int argc, char **argv)
      {
      ...
      }
    Return (func, param_argc, param_argv)
    """
    int_type = ctxt.get_type(TypeKind.INT)
    param_argc = ctxt.new_param(int_type, b"argc")
    char_ptr_ptr_type = (
        ctxt.get_type(TypeKind.CHAR).get_pointer().get_pointer())
    param_argv = ctxt.new_param(char_ptr_ptr_type, b"argv")
    func_main = ctxt.new_function(FunctionKind.EXPORTED,
                                  int_type,
                                  b"main",
                                  [param_argc, param_argv])
    return (func_main, param_argc, param_argv)

which we can use (as gccjit.make_main) to compile the function to an executable:


    def compile_to_file(self, output_path):
        # Wrap "func" up in a "main" function
        mainfunc, argv, argv = gccjit.make_main(self.ctxt)
        block = mainfunc.new_block()
        block.add_eval(self.ctxt.new_call(self.func, []))
        block.end_with_return(self.int_zero)
        self.ctxt.compile_to_file(gccjit.OutputKind.EXECUTABLE,
                                  output_path)

Finally, here’s the top-level of the program:


def main(argv):
    from optparse import OptionParser
    parser = OptionParser()
    parser.add_option("-o", "--output", dest="outputfile",
                      help="compile to FILE", metavar="FILE")
    (options, args) = parser.parse_args()
    if len(args) != 1:
        raise ValueError('No input file')
    inputfile = args[0]
    c = Compiler()
    c.parse_into_ctxt(inputfile)
    if options.outputfile:
        c.compile_to_file(options.outputfile)
    else:
        c.run()

if __name__ == '__main__':
    try:
        main(sys.argv)
    except Exception as exc:
        print(exc)
        sys.exit(1)

The overall script examples/bf.py is thus a bf-to-machine-code compiler, which we can use to compile .bf files, either to run in-process,

$ PYTHONPATH=. python examples/bf.py \
     emit-alphabet.bf
ABCDEFGHIJKLMNOPQRSTUVWXYZ

or to compile into machine code executables:

$ PYTHONPATH=. python examples/bf.py \
     emit-alphabet.bf \
     -o a.out

which we can run independently:

$ ./a.out
ABCDEFGHIJKLMNOPQRSTUVWXYZ

Success!

We can also inspect the generated executable using standard tools:

$ objdump -d a.out |less

which shows that libgccjit has managed to optimize the function somewhat (for example, the runs of 26 and 65 increment operations have become integer constants 0x1a and 0x41):

0000000000400620 <main>:
  400620:     80 3d 39 0a 20 00 00    cmpb   $0x0,0x200a39(%rip)        # 601060 <data
  400627:     74 07                   je     400630 <main
  400629:     eb fe                   jmp    400629 <main+0x9>
  40062b:     0f 1f 44 00 00          nopl   0x0(%rax,%rax,1)
  400630:     48 83 ec 08             sub    $0x8,%rsp
  400634:     0f b6 05 26 0a 20 00    movzbl 0x200a26(%rip),%eax        # 601061 <data_cells+0x1>
  40063b:     c6 05 1e 0a 20 00 1a    movb   $0x1a,0x200a1e(%rip)       # 601060 <data_cells>
  400642:     8d 78 41                lea    0x41(%rax),%edi
  400645:     40 88 3d 15 0a 20 00    mov    %dil,0x200a15(%rip)        # 601061 <data_cells+0x1>
  40064c:     0f 1f 40 00             nopl   0x0(%rax)
  400650:     40 0f b6 ff             movzbl %dil,%edi
  400654:     e8 87 fe ff ff          callq  4004e0 <putchar@plt>
  400659:     0f b6 05 01 0a 20 00    movzbl 0x200a01(%rip),%eax        # 601061 <data_cells+0x1>
  400660:     80 2d f9 09 20 00 01    subb   $0x1,0x2009f9(%rip)        # 601060 <data_cells>
  400667:     8d 78 01                lea    0x1(%rax),%edi
  40066a:     40 88 3d f0 09 20 00    mov    %dil,0x2009f0(%rip)        # 601061 <data_cells+0x1>
  400671:     75 dd                   jne    400650 <main+0x30>
  400673:     31 c0                   xor    %eax,%eax
  400675:     48 83 c4 08             add    $0x8,%rsp
  400679:     c3                      retq
  40067a:     66 0f 1f 44 00 00       nopw   0x0(%rax,%rax,1)

We also set up debugging information (via gccjit.Context.new_location() and gccjit.BoolOption.DEBUGINFO), so it’s possible to use gdb to singlestep through the generated binary and inspect the internal state idx and data_cells:

(gdb) break main
Breakpoint 1 at 0x400790
(gdb) run
Starting program: a.out

Breakpoint 1, 0x0000000000400790 in main (argc=1, argv=0x7fffffffe448)
(gdb) stepi
0x0000000000400797 in main (argc=1, argv=0x7fffffffe448)
(gdb) stepi
0x00000000004007a0 in main (argc=1, argv=0x7fffffffe448)
(gdb) stepi
9     >+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++<
(gdb) list
4
5     cell 0 = 26
6     ++++++++++++++++++++++++++
7
8     cell 1 = 65
9     >+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++<
10
11    while cell#0 != 0
12    [
13     >
(gdb) n
6     ++++++++++++++++++++++++++
(gdb) n
9     >+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++<
(gdb) p idx
$1 = 1
(gdb) p data_cells
$2 = "\032", '\000' <repeats 29998 times>
(gdb) p data_cells[0]
$3 = 26 '\032'
(gdb) p data_cells[1]
$4 = 0 '\000'
(gdb) list
4
5     cell 0 = 26
6     ++++++++++++++++++++++++++
7
8     cell 1 = 65
9     >+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++<
10
11    while cell#0 != 0
12    [
13     >

Other forms of ahead-of-time-compilation

The above demonstrates compiling a gccjit.Context directly to an executable. It’s also possible to compile it to an object file, and to a dynamic library. See the documentation of gccjit.Context.compile_to_file() for more information.