Implementing a “brainf” compiler¶
In this example we use libgccjit to construct a compiler for an esoteric programming language that we shall refer to as “brainf”.
The compiler can run the generated code in-process (JIT compilation), or write the generated code as a machine code executable (classic ahead-of-time compilation).
The “brainf” language¶
brainf scripts operate on an array of bytes, with a notional data pointer within the array.
brainf is hard for humans to read, but it’s trivial to write a parser for it, as there is no lexing; just a stream of bytes. The operations are:
Character |
Meaning |
---|---|
|
|
|
|
|
|
|
|
|
|
|
|
|
loop until |
|
end of loop |
Anything else |
ignored |
Unlike the previous example, we’ll implement an ahead-of-time compiler,
which reads .bf
scripts and outputs executables (though it would
be trivial to have it run them JIT-compiled in-process).
Here’s what a simple .bf
script looks like:
[ Emit the uppercase alphabet ] cell 0 = 26 ++++++++++++++++++++++++++ cell 1 = 65 >+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++< while cell#0 != 0 [ > . emit cell#1 + increment cell@1 <- decrement cell@0 ]
Note
This example makes use of whitespace and comments for legibility, but could have been written as:
++++++++++++++++++++++++++
>+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++<
[>.+<-]
It’s not a particularly useful language, except for providing compiler-writers with a test case that’s easy to parse.
Converting a brainf script to libgccjit IR¶
We write simple code to populate a gccjit.Context
.
class Paren: def __init__(self, b_test, b_body, b_after): self.b_test = b_test self.b_body = b_body self.b_after = b_after class CompileError(Exception): def __init__(self, compiler, msg): self.filename = compiler.filename self.line = compiler.line self.column = compiler.column self.msg = msg def __str__(self): return ("%s:%i:%i: %s" % (self.filename, self.line, self.column, self.msg)) class Compiler: def __init__(self): #, filename): self.ctxt = gccjit.Context() if 1: self.ctxt.set_int_option(gccjit.IntOption.OPTIMIZATION_LEVEL, 3); self.ctxt.set_bool_option(gccjit.BoolOption.DUMP_INITIAL_GIMPLE, 0); self.ctxt.set_bool_option(gccjit.BoolOption.DUMP_GENERATED_CODE, 0); self.ctxt.set_bool_option(gccjit.BoolOption.DEBUGINFO, 1); self.ctxt.set_bool_option(gccjit.BoolOption.DUMP_EVERYTHING, 0); self.ctxt.set_bool_option(gccjit.BoolOption.KEEP_INTERMEDIATES, 0); self.void_type = self.ctxt.get_type(gccjit.TypeKind.VOID) self.int_type = self.ctxt.get_type(gccjit.TypeKind.INT) self.byte_type = self.ctxt.get_type(gccjit.TypeKind.UNSIGNED_CHAR) self.array_type = self.ctxt.new_array_type(self.byte_type, 30000) self.func_getchar = ( self.ctxt.new_function(gccjit.FunctionKind.IMPORTED, self.int_type, b"getchar", [])) self.func_putchar = ( self.ctxt.new_function(gccjit.FunctionKind.IMPORTED, self.void_type, b"putchar", [self.ctxt.new_param(self.int_type, b"c")])) self.func = self.ctxt.new_function(gccjit.FunctionKind.EXPORTED, self.void_type, b'func', []) self.curblock = self.func.new_block(b"initial") self.int_zero = self.ctxt.zero(self.int_type) self.int_one = self.ctxt.one(self.int_type) self.byte_zero = self.ctxt.zero(self.byte_type) self.byte_one = self.ctxt.one(self.byte_type) self.data_cells = self.ctxt.new_global(gccjit.GlobalKind.INTERNAL, self.array_type, b"data_cells") self.idx = self.func.new_local(self.int_type, b"idx") self.open_parens = [] self.curblock.add_comment(b"idx = 0;") self.curblock.add_assignment(self.idx, self.int_zero) def get_current_data(self, loc): """Get 'data_cells[idx]' as an lvalue. """ return self.ctxt.new_array_access(self.data_cells, self.idx, loc) def current_data_is_zero(self, loc): """Get 'data_cells[idx] == 0' as a boolean rvalue.""" return self.ctxt.new_comparison(gccjit.Comparison.EQ, self.get_current_data(loc), self.byte_zero, loc) def compile_char(self, ch): """Compile one bf character.""" loc = self.ctxt.new_location(self.filename, self.line, self.column) # Turn this on to trace execution, by injecting putchar() # of each source char. if 0: arg = self.ctxt.new_rvalue_from_int (self.int_type, ch) call = self.ctxt.new_call (self.func_putchar, [arg], loc) self.curblock.add_eval (call, loc) if ch == '>': self.curblock.add_comment(b"'>': idx += 1;", loc) self.curblock.add_assignment_op(self.idx, gccjit.BinaryOp.PLUS, self.int_one, loc) elif ch == '<': self.curblock.add_comment(b"'<': idx -= 1;", loc) self.curblock.add_assignment_op(self.idx, gccjit.BinaryOp.MINUS, self.int_one, loc) elif ch == '+': self.curblock.add_comment(b"'+': data[idx] += 1;", loc) self.curblock.add_assignment_op(self.get_current_data (loc), gccjit.BinaryOp.PLUS, self.byte_one, loc) elif ch == '-': self.curblock.add_comment(b"'-': data[idx] -= 1;", loc) self.curblock.add_assignment_op(self.get_current_data(loc), gccjit.BinaryOp.MINUS, self.byte_one, loc) elif ch == '.': arg = self.ctxt.new_cast(self.get_current_data(loc), self.int_type, loc) call = self.ctxt.new_call(self.func_putchar, [arg], loc) self.curblock.add_comment(b"'.': putchar ((int)data[idx]);", loc) self.curblock.add_eval(call, loc) elif ch == ',': call = self.ctxt.new_call(self.func_getchar, [], loc) self.curblock.add_comment(b"',': data[idx] = (unsigned char)getchar ();", loc) self.curblock.add_assignment(self.get_current_data(loc), self.ctxt.new_cast(call, self.byte_type, loc), loc) elif ch == '[': loop_test = self.func.new_block() on_zero = self.func.new_block() on_non_zero = self.func.new_block() self.curblock.end_with_jump(loop_test, loc) loop_test.add_comment(b"'['", loc) loop_test.end_with_conditional(self.current_data_is_zero(loc), on_zero, on_non_zero, loc) self.open_parens.append(Paren(loop_test, on_non_zero, on_zero)) self.curblock = on_non_zero; elif ch == ']': self.curblock.add_comment(b"']'", loc) if not self.open_parens: raise CompileError(self, "mismatching parens") paren = self.open_parens.pop() self.curblock.end_with_jump(paren.b_test) self.curblock = paren.b_after elif ch == '\n': self.line +=1; self.column = 0; if ch != '\n': self.column += 1 def parse_into_ctxt(self, filename): """ Parse the given .bf file into the gccjit.Context, containing a single "main" function suitable for compiling into an executable. """ self.filename = filename; self.line = 1 self.column = 0 with open(filename) as f_in: for ch in f_in.read(): self.compile_char(ch) self.curblock.end_with_void_return() # Compiling to an executable
Compiling a context to a file¶
In previous examples, we compiled and ran the generated machine code in-process. We can do that:
def run(self): import ctypes result = self.ctxt.compile() py_func_type = ctypes.CFUNCTYPE(None) py_func = py_func_type(result.get_code(b'func')) py_func()
but this time we’ll also provide a way to compile the context directly
to an executable, using gccjit.Context.compile_to_file()
.
To do so, we need to export a main
function. A helper
function for doing so is provided by the JIT API:
def make_main(ctxt): """ Make "main" function: int main (int argc, char **argv) { ... } Return (func, param_argc, param_argv) """ int_type = ctxt.get_type(TypeKind.INT) param_argc = ctxt.new_param(int_type, b"argc") char_ptr_ptr_type = ( ctxt.get_type(TypeKind.CHAR).get_pointer().get_pointer()) param_argv = ctxt.new_param(char_ptr_ptr_type, b"argv") func_main = ctxt.new_function(FunctionKind.EXPORTED, int_type, b"main", [param_argc, param_argv]) return (func_main, param_argc, param_argv)
which we can use (as gccjit.make_main
) to compile the function
to an executable:
def compile_to_file(self, output_path): # Wrap "func" up in a "main" function mainfunc, argv, argv = gccjit.make_main(self.ctxt) block = mainfunc.new_block() block.add_eval(self.ctxt.new_call(self.func, [])) block.end_with_return(self.int_zero) self.ctxt.compile_to_file(gccjit.OutputKind.EXECUTABLE, output_path)
Finally, here’s the top-level of the program:
def main(argv): from optparse import OptionParser parser = OptionParser() parser.add_option("-o", "--output", dest="outputfile", help="compile to FILE", metavar="FILE") (options, args) = parser.parse_args() if len(args) != 1: raise ValueError('No input file') inputfile = args[0] c = Compiler() c.parse_into_ctxt(inputfile) if options.outputfile: c.compile_to_file(options.outputfile) else: c.run() if __name__ == '__main__': try: main(sys.argv) except Exception as exc: print(exc) sys.exit(1)
The overall script examples/bf.py is thus a bf-to-machine-code compiler, which we can use to compile .bf files, either to run in-process,
$ PYTHONPATH=. python examples/bf.py \
emit-alphabet.bf
ABCDEFGHIJKLMNOPQRSTUVWXYZ
or to compile into machine code executables:
$ PYTHONPATH=. python examples/bf.py \
emit-alphabet.bf \
-o a.out
which we can run independently:
$ ./a.out
ABCDEFGHIJKLMNOPQRSTUVWXYZ
Success!
We can also inspect the generated executable using standard tools:
$ objdump -d a.out |less
which shows that libgccjit has managed to optimize the function somewhat (for example, the runs of 26 and 65 increment operations have become integer constants 0x1a and 0x41):
0000000000400620 <main>:
400620: 80 3d 39 0a 20 00 00 cmpb $0x0,0x200a39(%rip) # 601060 <data
400627: 74 07 je 400630 <main
400629: eb fe jmp 400629 <main+0x9>
40062b: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)
400630: 48 83 ec 08 sub $0x8,%rsp
400634: 0f b6 05 26 0a 20 00 movzbl 0x200a26(%rip),%eax # 601061 <data_cells+0x1>
40063b: c6 05 1e 0a 20 00 1a movb $0x1a,0x200a1e(%rip) # 601060 <data_cells>
400642: 8d 78 41 lea 0x41(%rax),%edi
400645: 40 88 3d 15 0a 20 00 mov %dil,0x200a15(%rip) # 601061 <data_cells+0x1>
40064c: 0f 1f 40 00 nopl 0x0(%rax)
400650: 40 0f b6 ff movzbl %dil,%edi
400654: e8 87 fe ff ff callq 4004e0 <putchar@plt>
400659: 0f b6 05 01 0a 20 00 movzbl 0x200a01(%rip),%eax # 601061 <data_cells+0x1>
400660: 80 2d f9 09 20 00 01 subb $0x1,0x2009f9(%rip) # 601060 <data_cells>
400667: 8d 78 01 lea 0x1(%rax),%edi
40066a: 40 88 3d f0 09 20 00 mov %dil,0x2009f0(%rip) # 601061 <data_cells+0x1>
400671: 75 dd jne 400650 <main+0x30>
400673: 31 c0 xor %eax,%eax
400675: 48 83 c4 08 add $0x8,%rsp
400679: c3 retq
40067a: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1)
We also set up debugging information (via
gccjit.Context.new_location()
and
gccjit.BoolOption.DEBUGINFO
), so it’s possible to use gdb
to singlestep through the generated binary and inspect the internal
state idx
and data_cells
:
(gdb) break main
Breakpoint 1 at 0x400790
(gdb) run
Starting program: a.out
Breakpoint 1, 0x0000000000400790 in main (argc=1, argv=0x7fffffffe448)
(gdb) stepi
0x0000000000400797 in main (argc=1, argv=0x7fffffffe448)
(gdb) stepi
0x00000000004007a0 in main (argc=1, argv=0x7fffffffe448)
(gdb) stepi
9 >+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++<
(gdb) list
4
5 cell 0 = 26
6 ++++++++++++++++++++++++++
7
8 cell 1 = 65
9 >+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++<
10
11 while cell#0 != 0
12 [
13 >
(gdb) n
6 ++++++++++++++++++++++++++
(gdb) n
9 >+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++<
(gdb) p idx
$1 = 1
(gdb) p data_cells
$2 = "\032", '\000' <repeats 29998 times>
(gdb) p data_cells[0]
$3 = 26 '\032'
(gdb) p data_cells[1]
$4 = 0 '\000'
(gdb) list
4
5 cell 0 = 26
6 ++++++++++++++++++++++++++
7
8 cell 1 = 65
9 >+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++<
10
11 while cell#0 != 0
12 [
13 >
Other forms of ahead-of-time-compilation¶
The above demonstrates compiling a gccjit.Context
directly
to an executable. It’s also possible to compile it to an object file,
and to a dynamic library. See the documentation of
gccjit.Context.compile_to_file()
for more information.