From 22fadc3ed91cb380f7303e8a83ff5806d4576cb5 Mon Sep 17 00:00:00 2001 From: "Richard W.M. Jones" Date: Tue, 29 May 2012 20:50:42 +0100 Subject: [PATCH 7/8] New ARM backend, written by Benedikt Meurer (PR#5433). Backported from upstream sources to 3.12.1 by RWMJ. Includes svn rev 12548 to fix invalid generation of Thumb-2 branch instruction TBH (upstream PR#5623, RHBZ#821153). --- asmcomp/amd64/selection.ml | 14 +- asmcomp/arm/arch.ml | 152 +++++++- asmcomp/arm/emit.mlp | 857 ++++++++++++++++++++++++++++-------------- asmcomp/arm/proc.ml | 185 ++++++--- asmcomp/arm/reload.ml | 4 +- asmcomp/arm/scheduling.ml | 80 ++-- asmcomp/arm/selection.ml | 343 ++++++++++------- asmcomp/i386/selection.ml | 14 +- asmcomp/power/selection.ml | 2 +- asmcomp/power64/selection.ml | 2 +- asmcomp/selectgen.ml | 13 +- asmcomp/selectgen.mli | 2 +- asmcomp/sparc/selection.ml | 2 +- asmrun/arm.S | 544 ++++++++++++++++----------- asmrun/signals_osdep.h | 2 +- configure | 11 +- 16 files changed, 1485 insertions(+), 742 deletions(-) diff --git a/asmcomp/amd64/selection.ml b/asmcomp/amd64/selection.ml index f0546cf..5d9f6fa 100644 --- a/asmcomp/amd64/selection.ml +++ b/asmcomp/amd64/selection.ml @@ -121,7 +121,7 @@ method is_immediate n = n <= 0x7FFFFFFF && n >= -0x80000000 method is_immediate_natint n = n <= 0x7FFFFFFFn && n >= -0x80000000n -method select_addressing exp = +method select_addressing chunk exp = let (a, d) = select_addr exp in (* PR#4625: displacement must be a signed 32-bit immediate *) if d < -0x8000_0000 || d > 0x7FFF_FFFF @@ -157,7 +157,7 @@ method! select_operation op args = match op with (* Recognize the LEA instruction *) Caddi | Cadda | Csubi | Csuba -> - begin match self#select_addressing (Cop(op, args)) with + begin match self#select_addressing Word (Cop(op, args)) with (Iindexed d, _) -> super#select_operation op args | (Iindexed2 0, _) -> super#select_operation op args | (addr, arg) -> (Ispecific(Ilea addr), [arg]) @@ -191,7 +191,7 @@ method! select_operation op args = begin match args with [loc; Cop(Caddi, [Cop(Cload _, [loc']); Cconst_int n])] when loc = loc' && self#is_immediate n -> - let (addr, arg) = self#select_addressing loc in + let (addr, arg) = self#select_addressing Word loc in (Ispecific(Ioffset_loc(n, addr)), [arg]) | _ -> super#select_operation op args @@ -202,12 +202,12 @@ method! select_operation op args = method select_floatarith commutative regular_op mem_op args = match args with - [arg1; Cop(Cload (Double|Double_u), [loc2])] -> - let (addr, arg2) = self#select_addressing loc2 in + [arg1; Cop(Cload (Double|Double_u as chunk), [loc2])] -> + let (addr, arg2) = self#select_addressing chunk loc2 in (Ispecific(Ifloatarithmem(mem_op, addr)), [arg1; arg2]) - | [Cop(Cload (Double|Double_u), [loc1]); arg2] when commutative -> - let (addr, arg1) = self#select_addressing loc1 in + | [Cop(Cload (Double|Double_u as chunk), [loc1]); arg2] when commutative -> + let (addr, arg1) = self#select_addressing chunk loc1 in (Ispecific(Ifloatarithmem(mem_op, addr)), [arg2; arg1]) | [arg1; arg2] -> diff --git a/asmcomp/arm/arch.ml b/asmcomp/arm/arch.ml index 998fa4b..c4aca8d 100644 --- a/asmcomp/arm/arch.ml +++ b/asmcomp/arm/arch.ml @@ -1,25 +1,98 @@ (***********************************************************************) (* *) -(* Objective Caml *) +(* OCaml *) (* *) -(* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) +(* Benedikt Meurer, University of Siegen *) (* *) -(* Copyright 1998 Institut National de Recherche en Informatique et *) -(* en Automatique. All rights reserved. This file is distributed *) -(* under the terms of the Q Public License version 1.0. *) +(* Copyright 1998 Institut National de Recherche en Informatique *) +(* et en Automatique. Copyright 2012 Benedikt Meurer. All rights *) +(* reserved. This file is distributed under the terms of the Q *) +(* Public License version 1.0. *) (* *) (***********************************************************************) -(* $Id: arch.ml 5304 2002-11-29 15:03:37Z xleroy $ *) +(* $Id$ *) (* Specific operations for the ARM processor *) open Misc open Format +type abi = EABI | EABI_VFP +type arch = ARMv4 | ARMv5 | ARMv5TE | ARMv6 | ARMv6T2 | ARMv7 +type fpu = Soft | VFPv3_D16 | VFPv3 + +let abi = + match Config.system with + "linux_eabi" -> EABI + | "linux_eabihf" -> EABI_VFP + | _ -> assert false + +let string_of_arch = function + ARMv4 -> "armv4" + | ARMv5 -> "armv5" + | ARMv5TE -> "armv5te" + | ARMv6 -> "armv6" + | ARMv6T2 -> "armv6t2" + | ARMv7 -> "armv7" + +let string_of_fpu = function + Soft -> "soft" + | VFPv3_D16 -> "vfpv3-d16" + | VFPv3 -> "vfpv3" + (* Machine-specific command-line options *) -let command_line_options = [] +let (arch, fpu, thumb) = + let (def_arch, def_fpu, def_thumb) = + begin match abi, Config.model with + (* Defaults for architecture, FPU and Thumb *) + EABI, "armv5" -> ARMv5, Soft, false + | EABI, "armv5te" -> ARMv5TE, Soft, false + | EABI, "armv6" -> ARMv6, Soft, false + | EABI, "armv6t2" -> ARMv6T2, Soft, false + | EABI, "armv7" -> ARMv7, Soft, false + | EABI, _ -> ARMv4, Soft, false + | EABI_VFP, _ -> ARMv7, VFPv3_D16, true + end in + (ref def_arch, ref def_fpu, ref def_thumb) + +let pic_code = ref false + +let farch spec = + arch := (match spec with + "armv4" when abi <> EABI_VFP -> ARMv4 + | "armv5" when abi <> EABI_VFP -> ARMv5 + | "armv5te" when abi <> EABI_VFP -> ARMv5TE + | "armv6" when abi <> EABI_VFP -> ARMv6 + | "armv6t2" when abi <> EABI_VFP -> ARMv6T2 + | "armv7" -> ARMv7 + | spec -> raise (Arg.Bad spec)) + +let ffpu spec = + fpu := (match spec with + "soft" when abi <> EABI_VFP -> Soft + | "vfpv3-d16" when abi = EABI_VFP -> VFPv3_D16 + | "vfpv3" when abi = EABI_VFP -> VFPv3 + | spec -> raise (Arg.Bad spec)) + +let command_line_options = + [ "-farch", Arg.String farch, + " Select the ARM target architecture" + ^ " (default: " ^ (string_of_arch !arch) ^ ")"; + "-ffpu", Arg.String ffpu, + " Select the floating-point hardware" + ^ " (default: " ^ (string_of_fpu !fpu) ^ ")"; + "-fPIC", Arg.Set pic_code, + " Generate position-independent machine code"; + "-fno-PIC", Arg.Clear pic_code, + " Generate position-dependent machine code"; + "-fthumb", Arg.Set thumb, + " Enable Thumb/Thumb-2 code generation" + ^ (if !thumb then " (default)" else ""); + "-fno-thumb", Arg.Clear thumb, + " Disable Thumb/Thumb-2 code generation" + ^ (if not !thumb then " (default" else "")] (* Addressing modes *) @@ -37,6 +110,14 @@ type specific_operation = Ishiftarith of arith_operation * int | Ishiftcheckbound of int | Irevsubimm of int + | Imuladd (* multiply and add *) + | Imulsub (* multiply and subtract *) + | Inegmulf (* floating-point negate and multiply *) + | Imuladdf (* floating-point multiply and add *) + | Inegmuladdf (* floating-point negate, multiply and add *) + | Imulsubf (* floating-point multiply and subtract *) + | Inegmulsubf (* floating-point negate, multiply and subtract *) + | Isqrtf (* floating-point square root *) and arith_operation = Ishiftadd @@ -51,6 +132,10 @@ let size_addr = 4 let size_int = 4 let size_float = 8 +(* Behavior of division *) + +let division_crashes_on_overflow = false + (* Operations on addressing modes *) let identity_addressing = Iindexed 0 @@ -84,3 +169,56 @@ let print_specific_operation printreg op ppf arg = fprintf ppf "check %a >> %i > %a" printreg arg.(0) n printreg arg.(1) | Irevsubimm n -> fprintf ppf "%i %s %a" n "-" printreg arg.(0) + | Imuladd -> + fprintf ppf "(%a * %a) + %a" + printreg arg.(0) + printreg arg.(1) + printreg arg.(2) + | Imulsub -> + fprintf ppf "-(%a * %a) + %a" + printreg arg.(0) + printreg arg.(1) + printreg arg.(2) + | Inegmulf -> + fprintf ppf "-f (%a *f %a)" + printreg arg.(0) + printreg arg.(1) + | Imuladdf -> + fprintf ppf "%a +f (%a *f %a)" + printreg arg.(0) + printreg arg.(1) + printreg arg.(2) + | Inegmuladdf -> + fprintf ppf "%a -f (%a *f %a)" + printreg arg.(0) + printreg arg.(1) + printreg arg.(2) + | Imulsubf -> + fprintf ppf "(-f %a) +f (%a *f %a)" + printreg arg.(0) + printreg arg.(1) + printreg arg.(2) + | Inegmulsubf -> + fprintf ppf "(-f %a) -f (%a *f %a)" + printreg arg.(0) + printreg arg.(1) + printreg arg.(2) + | Isqrtf -> + fprintf ppf "sqrtf %a" + printreg arg.(0) + +(* Recognize immediate operands *) + +(* Immediate operands are 8-bit immediate values, zero-extended, + and rotated right by 0 ... 30 bits. + In Thumb/Thumb-2 mode we utilize 26 ... 30. *) + +let is_immediate n = + let n = ref n in + let s = ref 0 in + let m = if !thumb then 24 else 30 in + while (!s <= m && Int32.logand !n 0xffl <> !n) do + n := Int32.logor (Int32.shift_right_logical !n 2) (Int32.shift_left !n 30); + s := !s + 2 + done; + !s <= m diff --git a/asmcomp/arm/emit.mlp b/asmcomp/arm/emit.mlp index a4b2241..f8db396 100644 --- a/asmcomp/arm/emit.mlp +++ b/asmcomp/arm/emit.mlp @@ -1,16 +1,17 @@ (***********************************************************************) (* *) -(* Objective Caml *) +(* OCaml *) (* *) -(* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) +(* Benedikt Meurer, University of Siegen *) (* *) -(* Copyright 1998 Institut National de Recherche en Informatique et *) -(* en Automatique. All rights reserved. This file is distributed *) -(* under the terms of the Q Public License version 1.0. *) +(* Copyright 1998 Institut National de Recherche en Informatique *) +(* et en Automatique. Copyright 2012 Benedikt Meurer. All rights *) +(* reserved. This file is distributed under the terms of the Q *) +(* Public License version 1.0. *) (* *) (***********************************************************************) -(* $Id: emit.mlp 10293 2010-04-22 09:33:18Z xleroy $ *) +(* $Id$ *) (* Emission of ARM assembly code *) @@ -33,16 +34,28 @@ let fastcode_flag = ref true let emit_label lbl = emit_string ".L"; emit_int lbl -(* Output a symbol *) +let emit_data_label lbl = + emit_string ".Ld"; emit_int lbl + +(* Symbols *) let emit_symbol s = Emitaux.emit_symbol '$' s +let emit_call s = + if !Clflags.dlcode || !pic_code + then `bl {emit_symbol s}(PLT)` + else `bl {emit_symbol s}` + +let emit_jump s = + if !Clflags.dlcode || !pic_code + then `b {emit_symbol s}(PLT)` + else `b {emit_symbol s}` + (* Output a pseudo-register *) -let emit_reg r = - match r.loc with - | Reg r -> emit_string (register_name r) +let emit_reg = function + {loc = Reg r} -> emit_string (register_name r) | _ -> fatal_error "Emit_arm.emit_reg" (* Layout of the stack frame *) @@ -53,14 +66,23 @@ let frame_size () = let sz = !stack_offset + 4 * num_stack_slots.(0) + + 8 * num_stack_slots.(1) + + 8 * num_stack_slots.(2) + (if !contains_calls then 4 else 0) in Misc.align sz 8 let slot_offset loc cl = match loc with - Incoming n -> frame_size() + n - | Local n -> !stack_offset + n * 4 - | Outgoing n -> n + Incoming n -> + assert (n >= 0); + frame_size() + n + | Local n -> + if cl = 0 + then !stack_offset + n * 4 + else !stack_offset + num_stack_slots.(0) * 4 + n * 8 + | Outgoing n -> + assert (n >= 0); + n (* Output a stack reference *) @@ -79,20 +101,13 @@ let emit_addressing addr r n = (* Record live pointers at call points *) -type frame_descr = - { fd_lbl: int; (* Return address *) - fd_frame_size: int; (* Size of stack frame *) - fd_live_offset: int list } (* Offsets/regs of live addresses *) - -let frame_descriptors = ref([] : frame_descr list) - -let record_frame live = +let record_frame_label live dbg = let lbl = new_label() in let live_offset = ref [] in Reg.Set.iter (function {typ = Addr; loc = Reg r} -> - live_offset := (r lsl 1) + 1 :: !live_offset + live_offset := ((r lsl 1) + 1) :: !live_offset | {typ = Addr; loc = Stack s} as reg -> live_offset := slot_offset s (register_class reg) :: !live_offset | _ -> ()) @@ -100,18 +115,57 @@ let record_frame live = frame_descriptors := { fd_lbl = lbl; fd_frame_size = frame_size(); - fd_live_offset = !live_offset } :: !frame_descriptors; - `{emit_label lbl}:` - -let emit_frame fd = - ` .word {emit_label fd.fd_lbl} + 4\n`; - ` .short {emit_int fd.fd_frame_size}\n`; - ` .short {emit_int (List.length fd.fd_live_offset)}\n`; - List.iter - (fun n -> - ` .short {emit_int n}\n`) - fd.fd_live_offset; - ` .align 2\n` + fd_live_offset = !live_offset; + fd_debuginfo = dbg } :: !frame_descriptors; + lbl + +let record_frame live dbg = + let lbl = record_frame_label live dbg in `{emit_label lbl}:` + +(* Record calls to the GC -- we've moved them out of the way *) + +type gc_call = + { gc_lbl: label; (* Entry label *) + gc_return_lbl: label; (* Where to branch after GC *) + gc_frame_lbl: label } (* Label of frame descriptor *) + +let call_gc_sites = ref ([] : gc_call list) + +let emit_call_gc gc = + `{emit_label gc.gc_lbl}: {emit_call "caml_call_gc"}\n`; + `{emit_label gc.gc_frame_lbl}: b {emit_label gc.gc_return_lbl}\n` + +(* Record calls to caml_ml_array_bound_error. + In debug mode, we maintain one call to caml_ml_array_bound_error + per bound check site. Otherwise, we can share a single call. *) + +type bound_error_call = + { bd_lbl: label; (* Entry label *) + bd_frame_lbl: label } (* Label of frame descriptor *) + +let bound_error_sites = ref ([] : bound_error_call list) + +let bound_error_label dbg = + if !Clflags.debug || !bound_error_sites = [] then begin + let lbl_bound_error = new_label() in + let lbl_frame = record_frame_label Reg.Set.empty dbg in + bound_error_sites := + { bd_lbl = lbl_bound_error; + bd_frame_lbl = lbl_frame } :: !bound_error_sites; + lbl_bound_error + end else begin + let bd = List.hd !bound_error_sites in bd.bd_lbl + end + +let emit_call_bound_error bd = + `{emit_label bd.bd_lbl}: {emit_call "caml_ml_array_bound_error"}\n`; + `{emit_label bd.bd_frame_lbl}:\n` + +(* Negate a comparison *) + +let negate_integer_comparison = function + Isigned cmp -> Isigned(negate_comparison cmp) + | Iunsigned cmp -> Iunsigned(negate_comparison cmp) (* Names of various instructions *) @@ -121,22 +175,13 @@ let name_for_comparison = function | Iunsigned Ceq -> "eq" | Iunsigned Cne -> "ne" | Iunsigned Cle -> "ls" | Iunsigned Cge -> "cs" | Iunsigned Clt -> "cc" | Iunsigned Cgt -> "hi" -let name_for_float_comparison cmp neg = - match cmp with - Ceq -> if neg then "ne" else "eq" - | Cne -> if neg then "eq" else "ne" - | Cle -> if neg then "hi" else "ls" - | Cge -> if neg then "lt" else "ge" - | Clt -> if neg then "pl" else "mi" - | Cgt -> if neg then "le" else "gt" - let name_for_int_operation = function Iadd -> "add" | Isub -> "sub" | Imul -> "mul" - | Iand -> "and" - | Ior -> "orr" - | Ixor -> "eor" + | Iand -> "and" + | Ior -> "orr" + | Ixor -> "eor" | _ -> assert false let name_for_shift_operation = function @@ -145,60 +190,54 @@ let name_for_shift_operation = function | Iasr -> "asr" | _ -> assert false -let name_for_shift_int_operation = function - Ishiftadd -> "add" - | Ishiftsub -> "sub" - | Ishiftsubrev -> "rsb" - -(* Recognize immediate operands *) - -(* Immediate operands are 8-bit immediate values, zero-extended, and rotated - right by 0, 2, 4, ... 30 bits. - We check only with 8-bit values shifted left 0 to 24 bits. *) - -let rec is_immed n shift = - shift <= 24 && - (Nativeint.logand n (Nativeint.shift_left (Nativeint.of_int 0xFF) shift) = n - || is_immed n (shift + 2)) - -let is_immediate n = is_immed n 0 - (* General functional to decompose a non-immediate integer constant - into 8-bit chunks shifted left 0 ... 24 bits *) + into 8-bit chunks shifted left 0 ... 30 bits. *) let decompose_intconst n fn = let i = ref n in let shift = ref 0 in let ninstr = ref 0 in - while !i <> 0n do - if Nativeint.to_int (Nativeint.shift_right !i !shift) land 3 = 0 then + while !i <> 0l do + if Int32.logand (Int32.shift_right !i !shift) 3l = 0l then shift := !shift + 2 else begin - let mask = Nativeint.shift_left 0xFFn !shift in - let bits = Nativeint.logand !i mask in - fn bits; + let bits = Int32.logand !i (Int32.shift_left 0xffl !shift) in + i := Int32.sub !i bits; shift := !shift + 8; - i := Nativeint.sub !i bits; - incr ninstr + incr ninstr; + fn bits end done; !ninstr (* Load an integer constant into a register *) -let emit_intconst r n = - let nr = Nativeint.lognot n in +let emit_intconst dst n = + let nr = Int32.lognot n in if is_immediate n then begin - ` mov {emit_reg r}, #{emit_nativeint n}\n`; 1 + (* Use movs here to enable 16-bit T1 encoding *) + ` movs {emit_reg dst}, #{emit_int32 n}\n`; 1 end else if is_immediate nr then begin - ` mvn {emit_reg r}, #{emit_nativeint nr}\n`; 1 + ` mvn {emit_reg dst}, #{emit_int32 nr}\n`; 1 + end else if !arch > ARMv6 then begin + let nl = Int32.logand 0xffffl n in + let nh = Int32.logand 0xffffl (Int32.shift_right_logical n 16) in + if nh = 0l then begin + ` movw {emit_reg dst}, #{emit_int32 nl}\n`; 1 + end else if Int32.logand nl 0xffl = nl then begin + ` movs {emit_reg dst}, #{emit_int32 nl}\n`; + ` movt {emit_reg dst}, #{emit_int32 nh}\n`; 2 + end else begin + ` movw {emit_reg dst}, #{emit_int32 nl}\n`; + ` movt {emit_reg dst}, #{emit_int32 nh}\n`; 2 + end end else begin let first = ref true in decompose_intconst n (fun bits -> if !first - then ` mov {emit_reg r}, #{emit_nativeint bits} @ {emit_nativeint n}\n` - else ` add {emit_reg r}, {emit_reg r}, #{emit_nativeint bits}\n`; + then ` mov {emit_reg dst}, #{emit_int32 bits} @ {emit_int32 n}\n` + else ` add {emit_reg dst}, {emit_reg dst}, #{emit_int32 bits}\n`; first := false) end @@ -206,46 +245,105 @@ let emit_intconst r n = let emit_stack_adjustment instr n = if n <= 0 then 0 else - decompose_intconst (Nativeint.of_int n) + decompose_intconst (Int32.of_int n) (fun bits -> - ` {emit_string instr} sp, sp, #{emit_nativeint bits}\n`) + ` {emit_string instr} sp, sp, #{emit_int32 bits}\n`) (* Name of current function *) let function_name = ref "" (* Entry point for tail recursive calls *) let tailrec_entry_point = ref 0 -(* Table of symbols referenced *) -let symbol_constants = (Hashtbl.create 11 : (string, int) Hashtbl.t) -(* Table of floating-point literals *) -let float_constants = (Hashtbl.create 11 : (string, int) Hashtbl.t) -(* Total space (in word) occupied by pending literals *) +(* Pending floating-point literals *) +let float_literals = ref ([] : (string * label) list) +(* Pending relative references to the global offset table *) +let gotrel_literals = ref ([] : (label * label) list) +(* Pending symbol literals *) +let symbol_literals = ref ([] : (string * label) list) +(* Total space (in words) occupied by pending literals *) let num_literals = ref 0 -(* Label a symbol or float constant *) -let label_constant tbl s size = +(* Label a floating-point literal *) +let float_literal f = try - Hashtbl.find tbl s + List.assoc f !float_literals with Not_found -> let lbl = new_label() in - Hashtbl.add tbl s lbl; - num_literals := !num_literals + size; + num_literals := !num_literals + 2; + float_literals := (f, lbl) :: !float_literals; lbl -(* Emit all pending constants *) - -let emit_constants () = - Hashtbl.iter - (fun s lbl -> - `{emit_label lbl}: .word {emit_symbol s}\n`) - symbol_constants; - Hashtbl.iter - (fun s lbl -> - `{emit_label lbl}: .double {emit_string s}\n`) - float_constants; - Hashtbl.clear symbol_constants; - Hashtbl.clear float_constants; +(* Label a GOTREL literal *) +let gotrel_literal l = + let lbl = new_label() in + num_literals := !num_literals + 1; + gotrel_literals := (l, lbl) :: !gotrel_literals; + lbl + +(* Label a symbol literal *) +let symbol_literal s = + try + List.assoc s !symbol_literals + with Not_found -> + let lbl = new_label() in + num_literals := !num_literals + 1; + symbol_literals := (s, lbl) :: !symbol_literals; + lbl + +(* Emit all pending literals *) +let emit_literals() = + if !float_literals <> [] then begin + ` .align 3\n`; + List.iter + (fun (f, lbl) -> + `{emit_label lbl}: .double {emit_string f}\n`) + !float_literals; + float_literals := [] + end; + if !symbol_literals <> [] then begin + let offset = if !thumb then 4 else 8 in + let suffix = if !pic_code then "(GOT)" else "" in + ` .align 2\n`; + List.iter + (fun (l, lbl) -> + `{emit_label lbl}: .word _GLOBAL_OFFSET_TABLE_-({emit_label l}+{emit_int offset})\n`) + !gotrel_literals; + List.iter + (fun (s, lbl) -> + `{emit_label lbl}: .word {emit_symbol s}{emit_string suffix}\n`) + !symbol_literals; + gotrel_literals := []; + symbol_literals := [] + end; num_literals := 0 +(* Emit code to load the address of a symbol *) + +let emit_load_symbol_addr dst s = + if !pic_code then begin + let lbl_pic = new_label() in + let lbl_got = gotrel_literal lbl_pic in + let lbl_sym = symbol_literal s in + (* Both r3 and r12 are marked as clobbered in PIC mode (cf. proc.ml), + so use r12 as temporary scratch register unless the destination is + r12, then we use r3 instead. *) + let tmp = if dst.loc = Reg 8 (*r12*) + then phys_reg 3 (*r3*) + else phys_reg 8 (*r12*) in + ` ldr {emit_reg tmp}, {emit_label lbl_got}\n`; + ` ldr {emit_reg dst}, {emit_label lbl_sym}\n`; + `{emit_label lbl_pic}: add {emit_reg tmp}, pc, {emit_reg tmp}\n`; + ` ldr {emit_reg dst}, [{emit_reg tmp}, {emit_reg dst}] @ {emit_symbol s}\n`; + 4 + end else if !arch > ARMv6 && not !Clflags.dlcode && !fastcode_flag then begin + ` movw {emit_reg dst}, #:lower16:{emit_symbol s}\n`; + ` movt {emit_reg dst}, #:upper16:{emit_symbol s}\n`; + 2 + end else begin + let lbl = symbol_literal s in + ` ldr {emit_reg dst}, {emit_label lbl} @ {emit_symbol s}\n`; + 1 + end + (* Output the assembly code for an instruction *) let emit_instr i = @@ -254,40 +352,76 @@ let emit_instr i = | Lop(Imove | Ispill | Ireload) -> let src = i.arg.(0) and dst = i.res.(0) in if src.loc = dst.loc then 0 else begin - match (src, dst) with - {loc = Reg rs; typ = Int|Addr}, {loc = Reg rd; typ = Int|Addr} -> - ` mov {emit_reg dst}, {emit_reg src}\n`; 1 - | {loc = Reg rs; typ = Int|Addr}, {loc = Stack sd} -> - ` str {emit_reg src}, {emit_stack dst}\n`; 1 - | {loc = Stack ss; typ = Int|Addr}, {loc = Reg rd} -> - ` ldr {emit_reg dst}, {emit_stack src}\n`; 1 + begin match (src, dst) with + {loc = Reg _; typ = Float}, {loc = Reg _} -> + ` fcpyd {emit_reg dst}, {emit_reg src}\n` + | {loc = Reg _}, {loc = Reg _} -> + ` mov {emit_reg dst}, {emit_reg src}\n` + | {loc = Reg _; typ = Float}, _ -> + ` fstd {emit_reg src}, {emit_stack dst}\n` + | {loc = Reg _}, _ -> + ` str {emit_reg src}, {emit_stack dst}\n` + | {typ = Float}, _ -> + ` fldd {emit_reg dst}, {emit_stack src}\n` | _ -> - assert false + ` ldr {emit_reg dst}, {emit_stack src}\n` + end; 1 end | Lop(Iconst_int n) -> - emit_intconst i.res.(0) n - | Lop(Iconst_float s) -> - let bits = Int64.bits_of_float (float_of_string s) in - let high_bits = Int64.to_nativeint (Int64.shift_right_logical bits 32) - and low_bits = Int64.to_nativeint bits in - if is_immediate low_bits && is_immediate high_bits then begin - ` mov {emit_reg i.res.(0)}, #{emit_nativeint low_bits} @ {emit_string s}\n`; - ` mov {emit_reg i.res.(1)}, #{emit_nativeint high_bits}\n`; - 2 + emit_intconst i.res.(0) (Nativeint.to_int32 n) + | Lop(Iconst_float f) when !fpu = Soft -> + ` @ {emit_string f}\n`; + let bits = Int64.bits_of_float (float_of_string f) in + let high_bits = Int64.to_int32 (Int64.shift_right_logical bits 32) + and low_bits = Int64.to_int32 bits in + if is_immediate low_bits || is_immediate high_bits then begin + let ninstr_low = emit_intconst i.res.(0) low_bits + and ninstr_high = emit_intconst i.res.(1) high_bits in + ninstr_low + ninstr_high end else begin - let lbl = label_constant float_constants s 2 in - ` ldr {emit_reg i.res.(0)}, {emit_label lbl} @ {emit_string s}\n`; + let lbl = float_literal f in + ` ldr {emit_reg i.res.(0)}, {emit_label lbl}\n`; ` ldr {emit_reg i.res.(1)}, {emit_label lbl} + 4\n`; 2 end + | Lop(Iconst_float f) -> + let encode imm = + let sg = Int64.to_int (Int64.shift_right_logical imm 63) in + let ex = Int64.to_int (Int64.shift_right_logical imm 52) in + let ex = (ex land 0x7ff) - 1023 in + let mn = Int64.logand imm 0xfffffffffffffL in + if Int64.logand mn 0xffffffffffffL <> 0L || ex < -3 || ex > 4 + then + None + else begin + let mn = Int64.to_int (Int64.shift_right_logical mn 48) in + if mn land 0x0f <> mn then + None + else + let ex = ((ex + 3) land 0x07) lxor 0x04 in + Some((sg lsl 7) lor (ex lsl 4) lor mn) + end in + begin match encode (Int64.bits_of_float (float_of_string f)) with + None -> + let lbl = float_literal f in + ` fldd {emit_reg i.res.(0)}, {emit_label lbl} @ {emit_string f}\n` + | Some imm8 -> + ` fconstd {emit_reg i.res.(0)}, #{emit_int imm8} @ {emit_string f}\n` + end; 1 | Lop(Iconst_symbol s) -> - let lbl = label_constant symbol_constants s 1 in - ` ldr {emit_reg i.res.(0)}, {emit_label lbl} @ {emit_symbol s}\n`; 1 + emit_load_symbol_addr i.res.(0) s | Lop(Icall_ind) -> - ` mov lr, pc\n`; - `{record_frame i.live} bx {emit_reg i.arg.(0)}\n`; 2 + if !arch >= ARMv5 then begin + ` blx {emit_reg i.arg.(0)}\n`; + `{record_frame i.live i.dbg}\n`; 1 + end else begin + ` mov lr, pc\n`; + ` bx {emit_reg i.arg.(0)}\n`; + `{record_frame i.live i.dbg}\n`; 2 + end | Lop(Icall_imm s) -> - `{record_frame i.live} bl {emit_symbol s}\n`; 1 + ` {emit_call s}\n`; + `{record_frame i.live i.dbg}\n`; 1 | Lop(Itailcall_ind) -> let n = frame_size() in if !contains_calls then @@ -303,17 +437,16 @@ let emit_instr i = if !contains_calls then ` ldr lr, [sp, #{emit_int (n-4)}]\n`; let ninstr = emit_stack_adjustment "add" n in - ` b {emit_symbol s}\n`; + ` {emit_jump s}\n`; 2 + ninstr end - | Lop(Iextcall(s, alloc)) -> - if alloc then begin - let lbl = label_constant symbol_constants s 1 in - ` ldr r12, {emit_label lbl} @ {emit_symbol s}\n`; - `{record_frame i.live} bl caml_c_call\n`; 2 - end else begin - ` bl {emit_symbol s}\n`; 1 - end + | Lop(Iextcall(s, false)) -> + ` {emit_call s}\n`; 1 + | Lop(Iextcall(s, true)) -> + let ninstr = emit_load_symbol_addr (phys_reg 7 (* r7 *)) s in + ` {emit_call "caml_c_call"}\n`; + `{record_frame i.live i.dbg}\n`; + 1 + ninstr | Lop(Istackoffset n) -> assert (n mod 8 = 0); let ninstr = @@ -322,16 +455,28 @@ let emit_instr i = else emit_stack_adjustment "add" (-n) in stack_offset := !stack_offset + n; ninstr - | Lop(Iload((Double | Double_u), addr)) -> - let addr' = offset_addressing addr 4 in - if i.res.(0).loc <> i.arg.(0).loc then begin - ` ldr {emit_reg i.res.(0)}, {emit_addressing addr i.arg 0}\n`; - ` ldr {emit_reg i.res.(1)}, {emit_addressing addr' i.arg 0}\n` - end else begin - ` ldr {emit_reg i.res.(1)}, {emit_addressing addr' i.arg 0}\n`; - ` ldr {emit_reg i.res.(0)}, {emit_addressing addr i.arg 0}\n` - end; - 2 + | Lop(Iload(Single, addr)) when !fpu >= VFPv3_D16 -> + ` flds s14, {emit_addressing addr i.arg 0}\n`; + ` fcvtds {emit_reg i.res.(0)}, s14\n`; 2 + | Lop(Iload((Double | Double_u), addr)) when !fpu = Soft -> + (* Use LDM or LDRD if possible *) + begin match i.res.(0), i.res.(1), addr with + {loc = Reg rt}, {loc = Reg rt2}, Iindexed 0 + when rt < rt2 -> + ` ldm {emit_reg i.arg.(0)}, \{{emit_reg i.res.(0)}, {emit_reg i.res.(1)}}\n`; 1 + | {loc = Reg rt}, {loc = Reg rt2}, addr + when !arch >= ARMv5TE && rt mod 2 == 0 && rt2 = rt + 1 -> + ` ldrd {emit_reg i.res.(0)}, {emit_reg i.res.(1)}, {emit_addressing addr i.arg 0}\n`; 1 + | _ -> + let addr' = offset_addressing addr 4 in + if i.res.(0).loc <> i.arg.(0).loc then begin + ` ldr {emit_reg i.res.(0)}, {emit_addressing addr i.arg 0}\n`; + ` ldr {emit_reg i.res.(1)}, {emit_addressing addr' i.arg 0}\n` + end else begin + ` ldr {emit_reg i.res.(1)}, {emit_addressing addr' i.arg 0}\n`; + ` ldr {emit_reg i.res.(0)}, {emit_addressing addr i.arg 0}\n` + end; 2 + end | Lop(Iload(size, addr)) -> let r = i.res.(0) in let instr = @@ -340,65 +485,114 @@ let emit_instr i = | Byte_signed -> "ldrsb" | Sixteen_unsigned -> "ldrh" | Sixteen_signed -> "ldrsh" + | Double + | Double_u -> "fldd" | _ (* 32-bit quantities *) -> "ldr" in - ` {emit_string instr} {emit_reg r}, {emit_addressing addr i.arg 0}\n`; - 1 - | Lop(Istore((Double | Double_u), addr)) -> - let addr' = offset_addressing addr 4 in - ` str {emit_reg i.arg.(0)}, {emit_addressing addr i.arg 2}\n`; - ` str {emit_reg i.arg.(1)}, {emit_addressing addr' i.arg 2}\n`; - 2 + ` {emit_string instr} {emit_reg r}, {emit_addressing addr i.arg 0}\n`; 1 + | Lop(Istore(Single, addr)) when !fpu >= VFPv3_D16 -> + ` fcvtsd s14, {emit_reg i.arg.(0)}\n`; + ` fsts s14, {emit_addressing addr i.arg 1}\n`; 2 + | Lop(Istore((Double | Double_u), addr)) when !fpu = Soft -> + (* Use STM or STRD if possible *) + begin match i.arg.(0), i.arg.(1), addr with + {loc = Reg rt}, {loc = Reg rt2}, Iindexed 0 + when rt < rt2 -> + ` stm {emit_reg i.arg.(2)}, \{{emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}}\n`; 1 + | {loc = Reg rt}, {loc = Reg rt2}, addr + when !arch >= ARMv5TE && rt mod 2 == 0 && rt2 = rt + 1 -> + ` strd {emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}, {emit_addressing addr i.arg 2}\n`; 1 + | _ -> + let addr' = offset_addressing addr 4 in + ` str {emit_reg i.arg.(0)}, {emit_addressing addr i.arg 2}\n`; + ` str {emit_reg i.arg.(1)}, {emit_addressing addr' i.arg 2}\n`; 2 + end | Lop(Istore(size, addr)) -> let r = i.arg.(0) in let instr = match size with - Byte_unsigned | Byte_signed -> "strb" - | Sixteen_unsigned | Sixteen_signed -> "strh" + Byte_unsigned + | Byte_signed -> "strb" + | Sixteen_unsigned + | Sixteen_signed -> "strh" + | Double + | Double_u -> "fstd" | _ (* 32-bit quantities *) -> "str" in - ` {emit_string instr} {emit_reg r}, {emit_addressing addr i.arg 1}\n`; - 1 + ` {emit_string instr} {emit_reg r}, {emit_addressing addr i.arg 1}\n`; 1 | Lop(Ialloc n) -> + let lbl_frame = record_frame_label i.live i.dbg in if !fastcode_flag then begin - let ni = emit_intconst (phys_reg 8 (*r12*)) (Nativeint.of_int n) in - ` sub alloc_ptr, alloc_ptr, r12\n`; + let lbl_redo = new_label() in + `{emit_label lbl_redo}:`; + let ninstr = decompose_intconst + (Int32.of_int n) + (fun i -> + ` sub alloc_ptr, alloc_ptr, #{emit_int32 i}\n`) in ` cmp alloc_ptr, alloc_limit\n`; - `{record_frame i.live} blcc caml_call_gc\n`; ` add {emit_reg i.res.(0)}, alloc_ptr, #4\n`; - 4 + ni - end else if n = 8 || n = 12 || n = 16 then begin - `{record_frame i.live} bl caml_alloc{emit_int ((n-4)/4)}\n`; - ` add {emit_reg i.res.(0)}, alloc_ptr, #4\n`; 2 + let lbl_call_gc = new_label() in + ` bcc {emit_label lbl_call_gc}\n`; + call_gc_sites := + { gc_lbl = lbl_call_gc; + gc_return_lbl = lbl_redo; + gc_frame_lbl = lbl_frame } :: !call_gc_sites; + 3 + ninstr end else begin - let ni = emit_intconst (phys_reg 8 (*r12*)) (Nativeint.of_int n) in - `{record_frame i.live} bl caml_allocN\n`; - ` add {emit_reg i.res.(0)}, alloc_ptr, #4\n`; - 2 + ni + let ninstr = + begin match n with + 8 -> ` {emit_call "caml_alloc1"}\n`; 1 + | 12 -> ` {emit_call "caml_alloc2"}\n`; 1 + | 16 -> ` {emit_call "caml_alloc3"}\n`; 1 + | _ -> let ninstr = emit_intconst (phys_reg 7) (Int32.of_int n) in + ` {emit_call "caml_allocN"}\n`; 1 + ninstr + end in + `{emit_label lbl_frame}: add {emit_reg i.res.(0)}, alloc_ptr, #4\n`; + 1 + ninstr end | Lop(Iintop(Ilsl | Ilsr | Iasr as op)) -> let shift = name_for_shift_operation op in ` mov {emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, {emit_string shift} {emit_reg i.arg.(1)}\n`; 1 | Lop(Iintop(Icomp cmp)) -> - let comp = name_for_comparison cmp in + let compthen = name_for_comparison cmp in + let compelse = name_for_comparison (negate_integer_comparison cmp) in ` cmp {emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}\n`; - ` mov {emit_reg i.res.(0)}, #0\n`; - ` mov{emit_string comp} {emit_reg i.res.(0)}, #1\n`; 3 - | Lop(Iintop(Icheckbound)) -> + ` ite {emit_string compthen}\n`; + ` mov{emit_string compthen} {emit_reg i.res.(0)}, #1\n`; + ` mov{emit_string compelse} {emit_reg i.res.(0)}, #0\n`; 4 + | Lop(Iintop_imm(Icomp cmp, n)) -> + let compthen = name_for_comparison cmp in + let compelse = name_for_comparison (negate_integer_comparison cmp) in + ` cmp {emit_reg i.arg.(0)}, #{emit_int n}\n`; + ` ite {emit_string compthen}\n`; + ` mov{emit_string compthen} {emit_reg i.res.(0)}, #1\n`; + ` mov{emit_string compelse} {emit_reg i.res.(0)}, #0\n`; 4 + | Lop(Iintop Icheckbound) -> + let lbl = bound_error_label i.dbg in ` cmp {emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}\n`; - ` blls caml_ml_array_bound_error\n`; 2 + ` bls {emit_label lbl}\n`; 2 + | Lop(Iintop_imm(Icheckbound, n)) -> + let lbl = bound_error_label i.dbg in + ` cmp {emit_reg i.arg.(0)}, #{emit_int n}\n`; + ` bls {emit_label lbl}\n`; 2 + | Lop(Ispecific(Ishiftcheckbound shift)) -> + let lbl = bound_error_label i.dbg in + ` cmp {emit_reg i.arg.(1)}, {emit_reg i.arg.(0)}, lsr #{emit_int shift}\n`; + ` bcs {emit_label lbl}\n`; 2 | Lop(Iintop op) -> let instr = name_for_int_operation op in - ` {emit_string instr} {emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}\n`; 1 + ` {emit_string instr} {emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}\n`; 1 | Lop(Iintop_imm(Idiv, n)) -> (* n is a power of 2 *) let l = Misc.log2 n in let r = i.res.(0) in ` movs {emit_reg r}, {emit_reg i.arg.(0)}\n`; - if n <= 256 then + if n <= 256 then begin + ` it lt\n`; ` addlt {emit_reg r}, {emit_reg r}, #{emit_int (n-1)}\n` - else begin + end else begin + ` itt lt\n`; ` addlt {emit_reg r}, {emit_reg r}, #{emit_int n}\n`; ` sublt {emit_reg r}, {emit_reg r}, #1\n` end; - ` mov {emit_reg r}, {emit_reg r}, asr #{emit_int l}\n`; 4 + ` mov {emit_reg r}, {emit_reg r}, asr #{emit_int l}\n`; 5 | Lop(Iintop_imm(Imod, n)) -> (* n is a power of 2 *) let l = Misc.log2 n in let a = i.arg.(0) in @@ -409,40 +603,71 @@ let emit_instr i = ` mov {emit_reg r}, {emit_reg r}, lsr #{emit_int (32-l)}\n`; ` bpl {emit_label lbl}\n`; ` cmp {emit_reg r}, #0\n`; + ` it ne\n`; ` subne {emit_reg r}, {emit_reg r}, #{emit_int n}\n`; - `{emit_label lbl}:\n`; 6 + `{emit_label lbl}:\n`; 7 | Lop(Iintop_imm((Ilsl | Ilsr | Iasr as op), n)) -> let shift = name_for_shift_operation op in ` mov {emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, {emit_string shift} #{emit_int n}\n`; 1 - | Lop(Iintop_imm(Icomp cmp, n)) -> - let comp = name_for_comparison cmp in - ` cmp {emit_reg i.arg.(0)}, #{emit_int n}\n`; - ` mov {emit_reg i.res.(0)}, #0\n`; - ` mov{emit_string comp} {emit_reg i.res.(0)}, #1\n`; 3 - | Lop(Iintop_imm(Icheckbound, n)) -> - ` cmp {emit_reg i.arg.(0)}, #{emit_int n}\n`; - ` blls caml_ml_array_bound_error\n`; 2 | Lop(Iintop_imm(op, n)) -> let instr = name_for_int_operation op in - ` {emit_string instr} {emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, #{emit_int n}\n`; 1 - | Lop(Inegf) -> (* argument and result in (r0, r1) *) - ` eor r1, r1, #0x80000000\n`; 1 - | Lop(Iabsf) -> (* argument and result in (r0, r1) *) - ` bic r1, r1, #0x80000000\n`; 1 - | Lop(Ifloatofint | Iintoffloat | Iaddf | Isubf | Imulf | Idivf) -> - assert false + ` {emit_string instr} {emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, #{emit_int n}\n`; 1 + | Lop(Iabsf | Inegf as op) when !fpu = Soft -> + let instr = (match op with + Iabsf -> "bic" + | Inegf -> "eor" + | _ -> assert false) in + ` {emit_string instr} {emit_reg i.res.(1)}, {emit_reg i.arg.(1)}, #0x80000000\n`; 1 + | Lop(Iabsf | Inegf | Ispecific Isqrtf as op) -> + let instr = (match op with + Iabsf -> "fabsd" + | Inegf -> "fnegd" + | Ispecific Isqrtf -> "fsqrtd" + | _ -> assert false) in + ` {emit_string instr} {emit_reg i.res.(0)}, {emit_reg i.arg.(0)}\n`; 1 + | Lop(Ifloatofint) -> + ` fmsr s14, {emit_reg i.arg.(0)}\n`; + ` fsitod {emit_reg i.res.(0)}, s14\n`; 2 + | Lop(Iintoffloat) -> + ` ftosizd s14, {emit_reg i.arg.(0)}\n`; + ` fmrs {emit_reg i.res.(0)}, s14\n`; 2 + | Lop(Iaddf | Isubf | Imulf | Idivf | Ispecific Inegmulf as op) -> + let instr = (match op with + Iaddf -> "faddd" + | Isubf -> "fsubd" + | Imulf -> "fmuld" + | Idivf -> "fdivd" + | Ispecific Inegmulf -> "fnmuld" + | _ -> assert false) in + ` {emit_string instr} {emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}\n`; + 1 + | Lop(Ispecific(Imuladdf | Inegmuladdf | Imulsubf | Inegmulsubf as op)) -> + let instr = (match op with + Imuladdf -> "fmacd" + | Inegmuladdf -> "fnmacd" + | Imulsubf -> "fmscd" + | Inegmulsubf -> "fnmscd" + | _ -> assert false) in + ` {emit_string instr} {emit_reg i.res.(0)}, {emit_reg i.arg.(1)}, {emit_reg i.arg.(2)}\n`; + 1 | Lop(Ispecific(Ishiftarith(op, shift))) -> - let instr = name_for_shift_int_operation op in - ` {emit_string instr} {emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}`; + let instr = (match op with + Ishiftadd -> "add" + | Ishiftsub -> "sub" + | Ishiftsubrev -> "rsb") in + ` {emit_string instr} {emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}`; if shift >= 0 then `, lsl #{emit_int shift}\n` else `, asr #{emit_int (-shift)}\n`; 1 - | Lop(Ispecific(Ishiftcheckbound shift)) -> - ` cmp {emit_reg i.arg.(1)}, {emit_reg i.arg.(0)}, lsr #{emit_int shift}\n`; - ` blcs caml_ml_array_bound_error\n`; 2 | Lop(Ispecific(Irevsubimm n)) -> ` rsb {emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, #{emit_int n}\n`; 1 + | Lop(Ispecific(Imuladd | Imulsub as op)) -> + let instr = (match op with + Imuladd -> "mla" + | Imulsub -> "mls" + | _ -> assert false) in + ` {emit_string instr} {emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}, {emit_reg i.arg.(2)}\n`; 1 | Lreloadretaddr -> let n = frame_size() in ` ldr lr, [sp, #{emit_int(n-4)}]\n`; 1 @@ -458,29 +683,41 @@ let emit_instr i = begin match tst with Itruetest -> ` cmp {emit_reg i.arg.(0)}, #0\n`; - ` bne {emit_label lbl}\n` + ` bne {emit_label lbl}\n`; 2 | Ifalsetest -> ` cmp {emit_reg i.arg.(0)}, #0\n`; - ` beq {emit_label lbl}\n` + ` beq {emit_label lbl}\n`; 2 | Iinttest cmp -> ` cmp {emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}\n`; let comp = name_for_comparison cmp in - ` b{emit_string comp} {emit_label lbl}\n` + ` b{emit_string comp} {emit_label lbl}\n`; 2 | Iinttest_imm(cmp, n) -> ` cmp {emit_reg i.arg.(0)}, #{emit_int n}\n`; let comp = name_for_comparison cmp in - ` b{emit_string comp} {emit_label lbl}\n` + ` b{emit_string comp} {emit_label lbl}\n`; 2 | Ifloattest(cmp, neg) -> - assert false + let comp = (match (cmp, neg) with + (Ceq, false) | (Cne, true) -> "eq" + | (Cne, false) | (Ceq, true) -> "ne" + | (Clt, false) -> "cc" + | (Clt, true) -> "cs" + | (Cle, false) -> "ls" + | (Cle, true) -> "hi" + | (Cgt, false) -> "gt" + | (Cgt, true) -> "le" + | (Cge, false) -> "ge" + | (Cge, true) -> "lt") in + ` fcmpd {emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}\n`; + ` fmstat\n`; + ` b{emit_string comp} {emit_label lbl}\n`; 3 | Ioddtest -> ` tst {emit_reg i.arg.(0)}, #1\n`; - ` bne {emit_label lbl}\n` + ` bne {emit_label lbl}\n`; 2 | Ieventest -> ` tst {emit_reg i.arg.(0)}, #1\n`; - ` beq {emit_label lbl}\n` - end; - 2 - | Lcondbranch3(lbl0, lbl1, lbl2) -> + ` beq {emit_label lbl}\n`; 2 + end + | Lcondbranch3(lbl0, lbl1, lbl2) -> ` cmp {emit_reg i.arg.(0)}, #1\n`; begin match lbl0 with None -> () @@ -495,108 +732,144 @@ let emit_instr i = | Some lbl -> ` bgt {emit_label lbl}\n` end; 4 - | Lswitch jumptbl -> - ` ldr pc, [pc, {emit_reg i.arg.(0)}, lsl #2]\n`; - ` mov r0, r0\n`; (* nop *) - for i = 0 to Array.length jumptbl - 1 do - ` .word {emit_label jumptbl.(i)}\n` - done; + | Lswitch jumptbl -> + if !arch > ARMv6 && !thumb then begin + (* The Thumb-2 TBH instruction supports only forward branches, + so we need to generate appropriate trampolines for all labels + that appear before this switch instruction (PR#5623) *) + let tramtbl = Array.copy jumptbl in + ` tbh [pc, {emit_reg i.arg.(0)}, lsl #1]\n`; + for j = 0 to Array.length tramtbl - 1 do + let rec label i = + match i.desc with + Lend -> new_label() + | Llabel lbl when lbl = tramtbl.(j) -> lbl + | _ -> label i.next in + tramtbl.(j) <- label i.next; + ` .short ({emit_label tramtbl.(j)}-.)/2+{emit_int j}\n` + done; + (* Generate the necessary trampolines *) + for j = 0 to Array.length tramtbl - 1 do + if tramtbl.(j) <> jumptbl.(j) then + `{emit_label tramtbl.(j)}: b {emit_label jumptbl.(j)}\n` + done + end else if not !pic_code then begin + ` ldr pc, [pc, {emit_reg i.arg.(0)}, lsl #2]\n`; + ` nop\n`; + for j = 0 to Array.length jumptbl - 1 do + ` .word {emit_label jumptbl.(j)}\n` + done + end else begin + (* Slightly slower, but position-independent *) + ` add pc, pc, {emit_reg i.arg.(0)}, lsl #2\n`; + ` nop\n`; + for j = 0 to Array.length jumptbl - 1 do + ` b {emit_label jumptbl.(j)}\n` + done + end; 2 + Array.length jumptbl | Lsetuptrap lbl -> ` bl {emit_label lbl}\n`; 1 | Lpushtrap -> stack_offset := !stack_offset + 8; - ` stmfd sp!, \{trap_ptr, lr}\n`; + ` push \{trap_ptr, lr}\n`; ` mov trap_ptr, sp\n`; 2 | Lpoptrap -> - ` ldmfd sp!, \{trap_ptr, lr}\n`; + ` pop \{trap_ptr, lr}\n`; stack_offset := !stack_offset - 8; 1 | Lraise -> - ` mov sp, trap_ptr\n`; - ` ldmfd sp!, \{trap_ptr, pc}\n`; 2 + if !Clflags.debug then begin + ` {emit_call "caml_raise_exn"}\n`; + `{record_frame Reg.Set.empty i.dbg}\n`; 1 + end else begin + ` mov sp, trap_ptr\n`; + ` pop \{trap_ptr, pc}\n`; 2 + end (* Emission of an instruction sequence *) -let no_fallthrough = function - Lop(Itailcall_ind | Itailcall_imm _) -> true - | Lreturn -> true - | Lbranch _ -> true - | Lswitch _ -> true - | Lraise -> true - | _ -> false - let rec emit_all ninstr i = if i.desc = Lend then () else begin let n = emit_instr i in let ninstr' = ninstr + n in - let limit = 511 - !num_literals in - if ninstr' >= limit - 64 && no_fallthrough i.desc then begin - emit_constants(); + (* fldd can address up to +/-1KB, ldr can address up to +/-4KB *) + let limit = (if !fpu >= VFPv3_D16 && !float_literals <> [] + then 127 + else 511) in + let limit = limit - !num_literals in + if ninstr' >= limit - 64 && not(has_fallthrough i.desc) then begin + emit_literals(); emit_all 0 i.next - end else - if ninstr' >= limit then begin + end else if !num_literals != 0 && ninstr' >= limit then begin let lbl = new_label() in ` b {emit_label lbl}\n`; - emit_constants(); + emit_literals(); `{emit_label lbl}:\n`; emit_all 0 i.next end else emit_all ninstr' i.next end +(* Emission of the profiling prelude *) + +let emit_profile() = + match Config.system with + "linux_eabi" | "linux_eabihf" -> + ` push \{lr}\n`; + ` {emit_call "__gnu_mcount_nc"}\n` + | _ -> () + (* Emission of a function declaration *) let fundecl fundecl = function_name := fundecl.fun_name; fastcode_flag := fundecl.fun_fast; tailrec_entry_point := new_label(); + float_literals := []; + gotrel_literals := []; + symbol_literals := []; stack_offset := 0; - Hashtbl.clear symbol_constants; - Hashtbl.clear float_constants; + call_gc_sites := []; + bound_error_sites := []; ` .text\n`; ` .align 2\n`; - ` .global {emit_symbol fundecl.fun_name}\n`; + ` .globl {emit_symbol fundecl.fun_name}\n`; + if !arch > ARMv6 && !thumb then + ` .thumb\n` + else + ` .arm\n`; ` .type {emit_symbol fundecl.fun_name}, %function\n`; `{emit_symbol fundecl.fun_name}:\n`; + if !Clflags.gprofile then emit_profile(); let n = frame_size() in ignore(emit_stack_adjustment "sub" n); if !contains_calls then ` str lr, [sp, #{emit_int(n - 4)}]\n`; `{emit_label !tailrec_entry_point}:\n`; emit_all 0 fundecl.fun_body; - emit_constants() + emit_literals(); + List.iter emit_call_gc !call_gc_sites; + List.iter emit_call_bound_error !bound_error_sites; + ` .type {emit_symbol fundecl.fun_name}, %function\n`; + ` .size {emit_symbol fundecl.fun_name}, .-{emit_symbol fundecl.fun_name}\n` (* Emission of data *) let emit_item = function - Cglobal_symbol s -> - ` .global {emit_symbol s}\n`; - | Cdefine_symbol s -> - `{emit_symbol s}:\n` - | Cdefine_label lbl -> - `{emit_label (100000 + lbl)}:\n` - | Cint8 n -> - ` .byte {emit_int n}\n` - | Cint16 n -> - ` .short {emit_int n}\n` - | Cint32 n -> - ` .word {emit_nativeint n}\n` - | Cint n -> - ` .word {emit_nativeint n}\n` - | Csingle f -> - emit_float32_directive ".long" f - | Cdouble f -> - emit_float64_split_directive ".long" f - | Csymbol_address s -> - ` .word {emit_symbol s}\n` - | Clabel_address lbl -> - ` .word {emit_label (100000 + lbl)}\n` - | Cstring s -> - emit_string_directive " .ascii " s - | Cskip n -> - if n > 0 then ` .space {emit_int n}\n` - | Calign n -> - ` .align {emit_int(Misc.log2 n)}\n` + Cglobal_symbol s -> ` .globl {emit_symbol s}\n`; + | Cdefine_symbol s -> `{emit_symbol s}:\n` + | Cdefine_label lbl -> `{emit_data_label lbl}:\n` + | Cint8 n -> ` .byte {emit_int n}\n` + | Cint16 n -> ` .short {emit_int n}\n` + | Cint32 n -> ` .long {emit_int32 (Nativeint.to_int32 n)}\n` + | Cint n -> ` .long {emit_int32 (Nativeint.to_int32 n)}\n` + | Csingle f -> ` .single {emit_string f}\n` + | Cdouble f -> ` .double {emit_string f}\n` + | Csymbol_address s -> ` .word {emit_symbol s}\n` + | Clabel_address lbl -> ` .word {emit_data_label lbl}\n` + | Cstring s -> emit_string_directive " .ascii " s + | Cskip n -> if n > 0 then ` .space {emit_int n}\n` + | Calign n -> ` .align {emit_int(Misc.log2 n)}\n` let data l = ` .data\n`; @@ -605,32 +878,62 @@ let data l = (* Beginning / end of an assembly file *) let begin_assembly() = - `trap_ptr .req r11\n`; - `alloc_ptr .req r8\n`; - `alloc_limit .req r10\n`; + ` .syntax unified\n`; + begin match !arch with + | ARMv4 -> ` .arch armv4t\n` + | ARMv5 -> ` .arch armv5t\n` + | ARMv5TE -> ` .arch armv5te\n` + | ARMv6 -> ` .arch armv6\n` + | ARMv6T2 -> ` .arch armv6t2\n` + | ARMv7 -> ` .arch armv7-a\n` + end; + begin match !fpu with + Soft -> ` .fpu softvfp\n` + | VFPv3_D16 -> ` .fpu vfpv3-d16\n` + | VFPv3 -> ` .fpu vfpv3\n` + end; + `trap_ptr .req r8\n`; + `alloc_ptr .req r10\n`; + `alloc_limit .req r11\n`; let lbl_begin = Compilenv.make_symbol (Some "data_begin") in ` .data\n`; - ` .global {emit_symbol lbl_begin}\n`; + ` .globl {emit_symbol lbl_begin}\n`; `{emit_symbol lbl_begin}:\n`; let lbl_begin = Compilenv.make_symbol (Some "code_begin") in ` .text\n`; - ` .global {emit_symbol lbl_begin}\n`; + ` .globl {emit_symbol lbl_begin}\n`; `{emit_symbol lbl_begin}:\n` let end_assembly () = let lbl_end = Compilenv.make_symbol (Some "code_end") in ` .text\n`; - ` .global {emit_symbol lbl_end}\n`; + ` .globl {emit_symbol lbl_end}\n`; `{emit_symbol lbl_end}:\n`; let lbl_end = Compilenv.make_symbol (Some "data_end") in ` .data\n`; - ` .global {emit_symbol lbl_end}\n`; + ` .globl {emit_symbol lbl_end}\n`; `{emit_symbol lbl_end}:\n`; - ` .word 0\n`; + ` .long 0\n`; let lbl = Compilenv.make_symbol (Some "frametable") in - ` .data\n`; - ` .global {emit_symbol lbl}\n`; + ` .globl {emit_symbol lbl}\n`; `{emit_symbol lbl}:\n`; - ` .word {emit_int (List.length !frame_descriptors)}\n`; - List.iter emit_frame !frame_descriptors; - frame_descriptors := [] + emit_frames + { efa_label = (fun lbl -> + ` .type {emit_label lbl}, %function\n`; + ` .word {emit_label lbl}\n`); + efa_16 = (fun n -> ` .short {emit_int n}\n`); + efa_32 = (fun n -> ` .long {emit_int32 n}\n`); + efa_word = (fun n -> ` .word {emit_int n}\n`); + efa_align = (fun n -> ` .align {emit_int(Misc.log2 n)}\n`); + efa_label_rel = (fun lbl ofs -> + ` .word {emit_label lbl} - . + {emit_int32 ofs}\n`); + efa_def_label = (fun lbl -> `{emit_label lbl}:\n`); + efa_string = (fun s -> emit_string_directive " .asciz " s) }; + ` .type {emit_symbol lbl}, %object\n`; + ` .size {emit_symbol lbl}, .-{emit_symbol lbl}\n`; + begin match Config.system with + "linux_eabihf" | "linux_eabi" -> + (* Mark stack as non-executable *) + ` .section .note.GNU-stack,\"\",%progbits\n` + | _ -> () + end diff --git a/asmcomp/arm/proc.ml b/asmcomp/arm/proc.ml index e56ac6e..aed2b01 100644 --- a/asmcomp/arm/proc.ml +++ b/asmcomp/arm/proc.ml @@ -1,16 +1,17 @@ (***********************************************************************) (* *) -(* Objective Caml *) +(* OCaml *) (* *) -(* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) +(* Benedikt Meurer, University of Siegen *) (* *) -(* Copyright 1998 Institut National de Recherche en Informatique et *) -(* en Automatique. All rights reserved. This file is distributed *) -(* under the terms of the Q Public License version 1.0. *) +(* Copyright 1998 Institut National de Recherche en Informatique *) +(* et en Automatique. Copyright 2012 Benedikt Meurer. All rights *) +(* reserved. This file is distributed under the terms of the Q *) +(* Public License version 1.0. *) (* *) (***********************************************************************) -(* $Id: proc.ml 9252 2009-05-04 13:46:46Z xleroy $ *) +(* $Id$ *) (* Description of the ARM processor *) @@ -26,32 +27,56 @@ let word_addressed = false (* Registers available for register allocation *) -(* Register map: - r0 - r3 general purpose (not preserved by C) - r4 - r7 general purpose (preserved) - r8 allocation pointer (preserved) - r9 platform register, usually reserved - r10 allocation limit (preserved) - r11 trap pointer (preserved) - r12 general purpose (not preserved by C) - r13 stack pointer - r14 return address - r15 program counter +(* Integer register map: + r0 - r3 general purpose (not preserved) + r4 - r7 general purpose (preserved) + r8 trap pointer (preserved) + r9 platform register, usually reserved + r10 allocation pointer (preserved) + r11 allocation limit (preserved) + r12 intra-procedural scratch register (not preserved) + r13 stack pointer + r14 return address + r15 program counter + Floatinng-point register map (VFPv3): + d0 - d7 general purpose (not preserved) + d8 - d15 general purpose (preserved) + d16 - d31 generat purpose (not preserved), VFPv3 only *) -let int_reg_name = [| - "r0"; "r1"; "r2"; "r3"; "r4"; "r5"; "r6"; "r7"; "r12" -|] +let int_reg_name = + [| "r0"; "r1"; "r2"; "r3"; "r4"; "r5"; "r6"; "r7"; "r12" |] + +let float_reg_name = + [| "d0"; "d1"; "d2"; "d3"; "d4"; "d5"; "d6"; "d7"; + "d8"; "d9"; "d10"; "d11"; "d12"; "d13"; "d14"; "d15"; + "d16"; "d17"; "d18"; "d19"; "d20"; "d21"; "d22"; "d23"; + "d24"; "d25"; "d26"; "d27"; "d28"; "d29"; "d30"; "d31" |] + +(* We have three register classes: + 0 for integer registers + 1 for VFPv3-D16 + 2 for VFPv3 + This way we can choose between VFPv3-D16 and VFPv3 + at (ocamlopt) runtime using command line switches. +*) -let num_register_classes = 1 +let num_register_classes = 3 -let register_class r = assert (r.typ <> Float); 0 +let register_class r = + match (r.typ, !fpu) with + (Int | Addr), _ -> 0 + | Float, VFPv3_D16 -> 1 + | Float, _ -> 2 -let num_available_registers = [| 9 |] +let num_available_registers = + [| 9; 16; 32 |] -let first_available_register = [| 0 |] +let first_available_register = + [| 0; 100; 100 |] -let register_name r = int_reg_name.(r) +let register_name r = + if r < 100 then int_reg_name.(r) else float_reg_name.(r - 100) let rotate_registers = true @@ -59,25 +84,34 @@ let rotate_registers = true let hard_int_reg = let v = Array.create 9 Reg.dummy in - for i = 0 to 8 do v.(i) <- Reg.at_location Int (Reg i) done; + for i = 0 to 8 do + v.(i) <- Reg.at_location Int (Reg i) + done; + v + +let hard_float_reg = + let v = Array.create 32 Reg.dummy in + for i = 0 to 31 do + v.(i) <- Reg.at_location Float (Reg(100 + i)) + done; v -let all_phys_regs = hard_int_reg +let all_phys_regs = + Array.append hard_int_reg hard_float_reg -let phys_reg n = all_phys_regs.(n) +let phys_reg n = + if n < 100 then hard_int_reg.(n) else hard_float_reg.(n - 100) let stack_slot slot ty = - assert (ty <> Float); Reg.at_location ty (Stack slot) (* Calling conventions *) -(* XXX float types have already been expanded into pairs of integers. - So we cannot align these floats. See if that causes a problem. *) - -let calling_conventions first_int last_int make_stack arg = +let calling_conventions + first_int last_int first_float last_float make_stack arg = let loc = Array.create (Array.length arg) Reg.dummy in let int = ref first_int in + let float = ref first_float in let ofs = ref 0 in for i = 0 to Array.length arg - 1 do match arg.(i).typ with @@ -90,37 +124,86 @@ let calling_conventions first_int last_int make_stack arg = ofs := !ofs + size_int end | Float -> - assert false + assert (abi = EABI_VFP); + assert (!fpu >= VFPv3_D16); + if !float <= last_float then begin + loc.(i) <- phys_reg !float; + incr float + end else begin + ofs := Misc.align !ofs size_float; + loc.(i) <- stack_slot (make_stack !ofs) Float; + ofs := !ofs + size_float + end done; - (loc, Misc.align !ofs 8) + (loc, Misc.align !ofs 8) (* keep stack 8-aligned *) let incoming ofs = Incoming ofs let outgoing ofs = Outgoing ofs let not_supported ofs = fatal_error "Proc.loc_results: cannot call" +(* OCaml calling convention: + first integer args in r0...r7 + first float args in d0...d15 (EABI+VFP) + remaining args on stack. + Return values in r0...r7 or d0...d15. *) + let loc_arguments arg = - calling_conventions 0 7 outgoing arg + calling_conventions 0 7 100 115 outgoing arg let loc_parameters arg = - let (loc, ofs) = calling_conventions 0 7 incoming arg in loc + let (loc, _) = calling_conventions 0 7 100 115 incoming arg in loc let loc_results res = - let (loc, ofs) = calling_conventions 0 7 not_supported res in loc + let (loc, _) = calling_conventions 0 7 100 115 not_supported res in loc + +(* C calling convention: + first integer args in r0...r3 + first float args in d0...d7 (EABI+VFP) + remaining args on stack. + Return values in r0...r1 or d0. *) let loc_external_arguments arg = - calling_conventions 0 3 outgoing arg + calling_conventions 0 3 100 107 outgoing arg let loc_external_results res = - let (loc, ofs) = calling_conventions 0 1 not_supported res in loc + let (loc, _) = calling_conventions 0 1 100 100 not_supported res in loc let loc_exn_bucket = phys_reg 0 (* Registers destroyed by operations *) -let destroyed_at_c_call = (* r4-r7 preserved *) - Array.of_list(List.map phys_reg [0;1;2;3;8]) +let destroyed_at_alloc = (* r0-r6, d0-d15 preserved *) + Array.of_list (List.map + phys_reg + [7;8; + 116;116;118;119;120;121;122;123; + 124;125;126;127;128;129;130;131]) + +let destroyed_at_c_call = + Array.of_list (List.map + phys_reg + (match abi with + EABI -> (* r4-r7 preserved *) + [0;1;2;3;8; + 100;101;102;103;104;105;106;107; + 108;109;110;111;112;113;114;115; + 116;116;118;119;120;121;122;123; + 124;125;126;127;128;129;130;131] + | EABI_VFP -> (* r4-r7, d8-d15 preserved *) + [0;1;2;3;8; + 100;101;102;103;104;105;106;107; + 116;116;118;119;120;121;122;123; + 124;125;126;127;128;129;130;131])) let destroyed_at_oper = function - Iop(Icall_ind | Icall_imm _ | Iextcall(_, true)) -> all_phys_regs - | Iop(Iextcall(_, false)) -> destroyed_at_c_call - | Iop(Ialloc(_)) -> [|phys_reg 8|] (* r12 destroyed *) + Iop(Icall_ind | Icall_imm _ ) + | Iop(Iextcall(_, true)) -> + all_phys_regs + | Iop(Iextcall(_, false)) -> + destroyed_at_c_call + | Iop(Ialloc n) -> + destroyed_at_alloc + | Iop(Iconst_symbol _) when !pic_code -> + [|phys_reg 3; phys_reg 8|] (* r3 and r12 destroyed *) + | Iop(Iintoffloat | Ifloatofint | Iload(Single, _) | Istore(Single, _)) -> + [|phys_reg 107|] (* d7 (s14-s15) destroyed *) | _ -> [||] let destroyed_at_raise = all_phys_regs @@ -128,15 +211,16 @@ let destroyed_at_raise = all_phys_regs (* Maximal register pressure *) let safe_register_pressure = function - Iextcall(_, _) -> 4 + Iextcall(_, _) -> 5 | _ -> 9 + let max_register_pressure = function - Iextcall(_, _) -> [| 4 |] - | _ -> [| 9 |] + Iextcall(_, _) -> [| 5; 9; 9 |] + | _ -> [| 9; 16; 32 |] (* Layout of the stack *) -let num_stack_slots = [| 0 |] +let num_stack_slots = [| 0; 0; 0 |] let contains_calls = ref false (* Calling the assembler *) @@ -144,6 +228,3 @@ let contains_calls = ref false let assemble_file infile outfile = Ccomp.command (Config.asm ^ " -o " ^ Filename.quote outfile ^ " " ^ Filename.quote infile) - -open Clflags;; -open Config;; diff --git a/asmcomp/arm/reload.ml b/asmcomp/arm/reload.ml index 0917438..c5b137a 100644 --- a/asmcomp/arm/reload.ml +++ b/asmcomp/arm/reload.ml @@ -1,6 +1,6 @@ (***********************************************************************) (* *) -(* Objective Caml *) +(* OCaml *) (* *) (* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) (* *) @@ -10,7 +10,7 @@ (* *) (***********************************************************************) -(* $Id: reload.ml 2553 1999-11-17 18:59:06Z xleroy $ *) +(* $Id$ *) (* Reloading for the ARM *) diff --git a/asmcomp/arm/scheduling.ml b/asmcomp/arm/scheduling.ml index 930e1bc..4b47733 100644 --- a/asmcomp/arm/scheduling.ml +++ b/asmcomp/arm/scheduling.ml @@ -1,51 +1,79 @@ (***********************************************************************) (* *) -(* Objective Caml *) +(* OCaml *) (* *) -(* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) +(* Benedikt Meurer, University of Siegen *) (* *) -(* Copyright 1996 Institut National de Recherche en Informatique et *) -(* en Automatique. All rights reserved. This file is distributed *) -(* under the terms of the Q Public License version 1.0. *) +(* Copyright 1998 Institut National de Recherche en Informatique *) +(* et en Automatique. Copyright 2012 Benedikt Meurer. All rights *) +(* reserved. This file is distributed under the terms of the Q *) +(* Public License version 1.0. *) (* *) (***********************************************************************) -(* $Id: scheduling.ml 2553 1999-11-17 18:59:06Z xleroy $ *) +(* $Id$ *) +open Arch open Mach -(* Instruction scheduling for the Sparc *) +(* Instruction scheduling for the ARM *) -class scheduler = object +class scheduler = object(self) -inherit Schedgen.scheduler_generic +inherit Schedgen.scheduler_generic as super -(* Scheduling -- based roughly on the Strong ARM *) +(* Scheduling -- based roughly on the ARM11 (ARMv6) *) method oper_latency = function - Ireload -> 2 - | Iload(_, _) -> 2 - | Iconst_symbol _ -> 2 (* turned into a load *) - | Iconst_float _ -> 2 (* turned into a load *) - | Iintop(Imul) -> 3 - | Iintop_imm(Imul, _) -> 3 - (* No data available for floatops, let's make educated guesses *) - | Iaddf -> 3 - | Isubf -> 3 - | Imulf -> 5 - | Idivf -> 15 + (* Loads have a latency of two cycles in general *) + Iconst_symbol _ + | Iconst_float _ + | Iload(_, _) + | Ireload + | Ifloatofint (* mcr/mrc count as memory access *) + | Iintoffloat -> 2 + (* Multiplys have a latency of two cycles *) + | Iintop Imul + | Ispecific(Imuladd | Imulsub) -> 2 + (* VFP instructions *) + | Iaddf + | Isubf + | Idivf + | Imulf | Ispecific Inegmulf + | Ispecific(Imuladdf | Inegmuladdf | Imulsubf | Inegmulsubf) + | Ispecific Isqrtf + | Inegf | Iabsf when !fpu >= VFPv3_D16 -> 2 + (* Everything else *) | _ -> 1 -(* Issue cycles. Rough approximations *) +method! is_checkbound = function + Ispecific(Ishiftcheckbound _) -> true + | op -> super#is_checkbound op + +(* Issue cycles. Rough approximations *) method oper_issue_cycles = function Ialloc _ -> 4 - | Iintop(Icomp _) -> 3 - | Iintop(Icheckbound) -> 2 - | Iintop_imm(Idiv, _) -> 4 - | Iintop_imm(Imod, _) -> 6 + | Iintop(Ilsl | Ilsr | Iasr) -> 2 + | Iintop(Icomp _) | Iintop_imm(Icomp _, _) -> 3 + | Iintop(Icheckbound) | Iintop_imm(Icheckbound, _) -> 2 + | Ispecific(Ishiftcheckbound _) -> 3 + | Iintop_imm(Idiv, _) -> 4 + | Iintop_imm(Imod, _) -> 6 + | Iintop Imul + | Ispecific(Imuladd | Imulsub) -> 2 + (* VFP instructions *) + | Iaddf + | Isubf -> 7 + | Imulf + | Ispecific Inegmulf -> 9 + | Ispecific(Imuladdf | Inegmuladdf | Imulsubf | Inegmulsubf) -> 17 + | Idivf + | Ispecific Isqrtf -> 27 + | Inegf | Iabsf | Iconst_float _ when !fpu >= VFPv3_D16 -> 4 + (* Everything else *) | _ -> 1 end diff --git a/asmcomp/arm/selection.ml b/asmcomp/arm/selection.ml index f09d146..94d0367 100644 --- a/asmcomp/arm/selection.ml +++ b/asmcomp/arm/selection.ml @@ -1,54 +1,77 @@ (***********************************************************************) (* *) -(* Objective Caml *) +(* OCaml *) (* *) -(* Xavier Leroy, projet Cristal, INRIA Rocquencourt *) +(* Benedikt Meurer, University of Siegen *) (* *) -(* Copyright 1998 Institut National de Recherche en Informatique et *) -(* en Automatique. All rights reserved. This file is distributed *) -(* under the terms of the Q Public License version 1.0. *) +(* Copyright 1998 Institut National de Recherche en Informatique *) +(* et en Automatique. Copyright 2012 Benedikt Meurer. All rights *) +(* reserved. This file is distributed under the terms of the Q *) +(* Public License version 1.0. *) (* *) (***********************************************************************) -(* $Id: selection.ml 10295 2010-04-22 12:39:40Z xleroy $ *) +(* $Id$ *) (* Instruction selection for the ARM processor *) -open Misc -open Cmm -open Reg open Arch -open Proc +open Cmm open Mach +open Misc +open Proc +open Reg -(* Immediate operands are 8-bit immediate values, zero-extended, and rotated - right by 0, 2, 4, ... 30 bits. - To avoid problems with Caml's 31-bit arithmetic, - we check only with 8-bit values shifted left 0 to 22 bits. *) - -let rec is_immed n shift = - if shift > 22 then false - else if n land (0xFF lsl shift) = n then true - else is_immed n (shift + 2) +let is_offset chunk n = + match chunk with + (* VFPv3 load/store have -1020 to 1020 *) + Single | Double | Double_u + when !fpu >= VFPv3_D16 -> + n >= -1020 && n <= 1020 + (* ARM load/store byte/word have -4095 to 4095 *) + | Byte_unsigned | Byte_signed + | Thirtytwo_unsigned | Thirtytwo_signed + | Word | Single + when not !thumb -> + n >= -4095 && n <= 4095 + (* Thumb-2 load/store have -255 to 4095 *) + | _ when !arch > ARMv6 && !thumb -> + n >= -255 && n <= 4095 + (* Everything else has -255 to 255 *) + | _ -> + n >= -255 && n <= 255 -(* We have 12-bit + sign byte offsets for word accesses, - 8-bit + sign word offsets for float accesses, - and 8-bit + sign byte offsets for bytes and shorts. - Use lowest common denominator. *) +let is_intconst = function + Cconst_int _ -> true + | _ -> false -let is_offset n = n < 256 && n > -256 +(* Special constraints on operand and result registers *) -let is_intconst = function Cconst_int n -> true | _ -> false +exception Use_default -(* Soft emulation of float comparisons *) +let r1 = phys_reg 1 -let float_comparison_function = function - | Ceq -> "__eqdf2" - | Cne -> "__nedf2" - | Clt -> "__ltdf2" - | Cle -> "__ledf2" - | Cgt -> "__gtdf2" - | Cge -> "__gedf2" +let pseudoregs_for_operation op arg res = + match op with + (* For mul rd,rm,rs and mla rd,rm,rs,ra (pre-ARMv6) the registers rm + and rd must be different. We deal with this by pretending that rm + is also a result of the mul / mla operation. *) + Iintop Imul | Ispecific Imuladd when !arch < ARMv6 -> + (arg, [| res.(0); arg.(0) |]) + (* Soft-float Iabsf and Inegf: arg.(0) and res.(0) must be the same *) + | Iabsf | Inegf when !fpu = Soft -> + ([|res.(0); arg.(1)|], res) + (* VFPv3 Imuladdf...Inegmulsubf: arg.(0) and res.(0) must be the same *) + | Ispecific(Imuladdf | Inegmuladdf | Imulsubf | Inegmulsubf) -> + let arg' = Array.copy arg in + arg'.(0) <- res.(0); + (arg', res) + (* We use __aeabi_idivmod for Cmodi only, and hence we care only + for the remainder in r1, so fix up the destination register. *) + | Iextcall("__aeabi_idivmod", false) -> + (arg, [|r1|]) + (* Other instructions are regular *) + | _ -> raise Use_default (* Instruction selection *) class selector = object(self) @@ -56,23 +79,32 @@ class selector = object(self) inherit Selectgen.selector_generic as super method! regs_for tyv = - (* Expand floats into pairs of integer registers *) - let nty = Array.length tyv in - let rec expand i = - if i >= nty then [] else begin - match tyv.(i) with - | Float -> Int :: Int :: expand (i+1) - | ty -> ty :: expand (i+1) - end in - Reg.createv (Array.of_list (expand 0)) + Reg.createv (if !fpu = Soft then begin + (* Expand floats into pairs of integer registers *) + let rec expand = function + [] -> [] + | Float :: tyl -> Int :: Int :: expand tyl + | ty :: tyl -> ty :: expand tyl in + Array.of_list (expand (Array.to_list tyv)) + end else begin + tyv + end) method is_immediate n = - n land 0xFF = n || is_immed n 2 + is_immediate (Int32.of_int n) + +method! is_simple_expr = function + (* inlined floating-point ops are simple if their arguments are *) + | Cop(Cextcall("sqrt", _, _, _), args) when !fpu >= VFPv3_D16 -> + List.for_all self#is_simple_expr args + | e -> super#is_simple_expr e -method select_addressing = function - Cop(Cadda, [arg; Cconst_int n]) when is_offset n -> +method select_addressing chunk = function + | Cop(Cadda, [arg; Cconst_int n]) + when is_offset chunk n -> (Iindexed n, arg) - | Cop(Cadda, [arg1; Cop(Caddi, [arg2; Cconst_int n])]) when is_offset n -> + | Cop(Cadda, [arg1; Cop(Caddi, [arg2; Cconst_int n])]) + when is_offset chunk n -> (Iindexed n, Cop(Cadda, [arg1; arg2])) | arg -> (Iindexed 0, arg) @@ -91,109 +123,146 @@ method select_shift_arith op shiftop shiftrevop args = | [Cop(Casr, [arg1; Cconst_int n]); arg2] when n > 0 && n < 32 && not(is_intconst arg1) -> (Ispecific(Ishiftarith(shiftrevop, -n)), [arg2; arg1]) - | _ -> - super#select_operation op args + | args -> + begin match super#select_operation op args with + (* Recognize multiply and add *) + (Iintop Iadd, [Cop(Cmuli, args); arg3]) + | (Iintop Iadd, [arg3; Cop(Cmuli, args)]) as op_args -> + begin match self#select_operation Cmuli args with + (Iintop Imul, [arg1; arg2]) -> + (Ispecific Imuladd, [arg1; arg2; arg3]) + | _ -> op_args + end + (* Recognize multiply and subtract *) + | (Iintop Isub, [arg3; Cop(Cmuli, args)]) as op_args + when !arch > ARMv6 -> + begin match self#select_operation Cmuli args with + (Iintop Imul, [arg1; arg2]) -> + (Ispecific Imulsub, [arg1; arg2; arg3]) + | _ -> op_args + end + | op_args -> op_args + end method! select_operation op args = - match op with - Cadda | Caddi -> - begin match args with - [arg1; Cconst_int n] when n < 0 && self#is_immediate (-n) -> - (Iintop_imm(Isub, -n), [arg1]) - | _ -> - self#select_shift_arith op Ishiftadd Ishiftadd args - end - | Csuba | Csubi -> - begin match args with - [arg1; Cconst_int n] when n < 0 && self#is_immediate (-n) -> - (Iintop_imm(Iadd, -n), [arg1]) - | [Cconst_int n; arg2] when self#is_immediate n -> - (Ispecific(Irevsubimm n), [arg2]) - | _ -> - self#select_shift_arith op Ishiftsub Ishiftsubrev args - end - | Cmuli -> (* no multiply immediate *) + match (op, args) with + (* Recognize special shift arithmetic *) + ((Cadda | Caddi), [arg; Cconst_int n]) + when n < 0 && self#is_immediate (-n) -> + (Iintop_imm(Isub, -n), [arg]) + | ((Cadda | Caddi as op), args) -> + self#select_shift_arith op Ishiftadd Ishiftadd args + | ((Csuba | Csubi), [arg; Cconst_int n]) + when n < 0 && self#is_immediate (-n) -> + (Iintop_imm(Iadd, -n), [arg]) + | ((Csuba | Csubi), [Cconst_int n; arg]) + when self#is_immediate n -> + (Ispecific(Irevsubimm n), [arg]) + | ((Csuba | Csubi as op), args) -> + self#select_shift_arith op Ishiftsub Ishiftsubrev args + | (Ccheckbound _, [Cop(Clsr, [arg1; Cconst_int n]); arg2]) + when n > 0 && n < 32 && not(is_intconst arg2) -> + (Ispecific(Ishiftcheckbound n), [arg1; arg2]) + (* ARM does not support immediate operands for multiplication *) + | (Cmuli, args) -> (Iintop Imul, args) - | Cdivi -> - begin match args with - [arg1; Cconst_int n] when n = 1 lsl (Misc.log2 n) -> - (Iintop_imm(Idiv, n), [arg1]) - | _ -> - (Iextcall("__divsi3", false), args) - end - | Cmodi -> - begin match args with - [arg1; Cconst_int n] when n = 1 lsl (Misc.log2 n) -> - (Iintop_imm(Imod, n), [arg1]) - | _ -> - (Iextcall("__modsi3", false), args) - end - | Ccheckbound _ -> - begin match args with - [Cop(Clsr, [arg1; Cconst_int n]); arg2] - when n > 0 && n < 32 && not(is_intconst arg2) -> - (Ispecific(Ishiftcheckbound n), [arg1; arg2]) - | _ -> - super#select_operation op args - end - (* Turn floating-point operations into library function calls *) - | Caddf -> (Iextcall("__adddf3", false), args) - | Csubf -> (Iextcall("__subdf3", false), args) - | Cmulf -> (Iextcall("__muldf3", false), args) - | Cdivf -> (Iextcall("__divdf3", false), args) - | Cfloatofint -> (Iextcall("__floatsidf", false), args) - | Cintoffloat -> (Iextcall("__fixdfsi", false), args) - | Ccmpf comp -> - (Iintop_imm(Icomp(Isigned comp), 0), - [Cop(Cextcall(float_comparison_function comp, - typ_int, false, Debuginfo.none), - args)]) + (* Turn integer division/modulus into runtime ABI calls *) + | (Cdivi, [arg; Cconst_int n]) + when n = 1 lsl Misc.log2 n -> + (Iintop_imm(Idiv, n), [arg]) + | (Cdivi, args) -> + (Iextcall("__aeabi_idiv", false), args) + | (Cmodi, [arg; Cconst_int n]) + when n = 1 lsl Misc.log2 n -> + (Iintop_imm(Imod, n), [arg]) + | (Cmodi, args) -> + (* See above for fix up of return register *) + (Iextcall("__aeabi_idivmod", false), args) + (* Turn floating-point operations into runtime ABI calls for softfp *) + | (op, args) when !fpu = Soft -> self#select_operation_softfp op args + (* Select operations for VFPv3 *) + | (op, args) -> self#select_operation_vfpv3 op args + +method private select_operation_softfp op args = + match (op, args) with + (* Turn floating-point operations into runtime ABI calls *) + | (Caddf, args) -> (Iextcall("__aeabi_dadd", false), args) + | (Csubf, args) -> (Iextcall("__aeabi_dsub", false), args) + | (Cmulf, args) -> (Iextcall("__aeabi_dmul", false), args) + | (Cdivf, args) -> (Iextcall("__aeabi_ddiv", false), args) + | (Cfloatofint, args) -> (Iextcall("__aeabi_i2d", false), args) + | (Cintoffloat, args) -> (Iextcall("__aeabi_d2iz", false), args) + | (Ccmpf comp, args) -> + let func = (match comp with + Cne (* there's no __aeabi_dcmpne *) + | Ceq -> "__aeabi_dcmpeq" + | Clt -> "__aeabi_dcmplt" + | Cle -> "__aeabi_dcmple" + | Cgt -> "__aeabi_dcmpgt" + | Cge -> "__aeabi_dcmpge") in + let comp = (match comp with + Cne -> Ceq (* eq 0 => false *) + | _ -> Cne (* ne 0 => true *)) in + (Iintop_imm(Icomp(Iunsigned comp), 0), + [Cop(Cextcall(func, typ_int, false, Debuginfo.none), args)]) (* Add coercions around loads and stores of 32-bit floats *) - | Cload Single -> - (Iextcall("__extendsfdf2", false), [Cop(Cload Word, args)]) - | Cstore Single -> - begin match args with - | [arg1; arg2] -> - let arg2' = - Cop(Cextcall("__truncdfsf2", typ_int, false, Debuginfo.none), - [arg2]) in - self#select_operation (Cstore Word) [arg1; arg2'] - | _ -> assert false - end + | (Cload Single, args) -> + (Iextcall("__aeabi_f2d", false), [Cop(Cload Word, args)]) + | (Cstore Single, [arg1; arg2]) -> + let arg2' = + Cop(Cextcall("__aeabi_d2f", typ_int, false, Debuginfo.none), + [arg2]) in + self#select_operation (Cstore Word) [arg1; arg2'] (* Other operations are regular *) - | _ -> super#select_operation op args + | (op, args) -> super#select_operation op args + +method private select_operation_vfpv3 op args = + match (op, args) with + (* Recognize floating-point negate and multiply *) + (Cnegf, [Cop(Cmulf, args)]) -> + (Ispecific Inegmulf, args) + (* Recognize floating-point multiply and add *) + | (Caddf, [arg; Cop(Cmulf, args)]) + | (Caddf, [Cop(Cmulf, args); arg]) -> + (Ispecific Imuladdf, arg :: args) + (* Recognize floating-point negate, multiply and subtract *) + | (Csubf, [Cop(Cnegf, [arg]); Cop(Cmulf, args)]) + | (Csubf, [Cop(Cnegf, [Cop(Cmulf, args)]); arg]) -> + (Ispecific Inegmulsubf, arg :: args) + (* Recognize floating-point negate, multiply and add *) + | (Csubf, [arg; Cop(Cmulf, args)]) -> + (Ispecific Inegmuladdf, arg :: args) + (* Recognize multiply and subtract *) + | (Csubf, [Cop(Cmulf, args); arg]) -> + (Ispecific Imulsubf, arg :: args) + (* Recognize floating-point square root *) + | (Cextcall("sqrt", _, false, _), args) -> + (Ispecific Isqrtf, args) + (* Other operations are regular *) + | (op, args) -> super#select_operation op args method! select_condition = function - | Cop(Ccmpf cmp, args) -> - (Iinttest_imm(Isigned cmp, 0), - Cop(Cextcall(float_comparison_function cmp, - typ_int, false, Debuginfo.none), - args)) + (* Turn floating-point comparisons into runtime ABI calls *) + Cop(Ccmpf _ as op, args) when !fpu = Soft -> + begin match self#select_operation_softfp op args with + (Iintop_imm(Icomp(Iunsigned Ceq), 0), [arg]) -> (Ifalsetest, arg) + | (Iintop_imm(Icomp(Iunsigned Cne), 0), [arg]) -> (Itruetest, arg) + | _ -> assert false + end | expr -> super#select_condition expr -(* Deal with some register irregularities: - -1- In mul rd, rm, rs, the registers rm and rd must be different. - We deal with this by pretending that rm is also a result of the mul - operation. - -2- For Inegf and Iabsf, force arguments and results in (r0, r1); - this simplifies code generation later. -*) +(* Deal with some register constraints *) method! insert_op_debug op dbg rs rd = - match op with - | Iintop(Imul) -> - self#insert_debug (Iop op) dbg rs [| rd.(0); rs.(0) |]; rd - | Iabsf | Inegf -> - let r = [| phys_reg 0; phys_reg 1 |] in - self#insert_moves rs r; - self#insert_debug (Iop op) dbg r r; - self#insert_moves r rd; - rd - | _ -> - super#insert_op_debug op dbg rs rd + try + let (rsrc, rdst) = pseudoregs_for_operation op rs rd in + self#insert_moves rs rsrc; + self#insert_debug (Iop op) dbg rsrc rdst; + self#insert_moves rdst rd; + rd + with Use_default -> + super#insert_op_debug op dbg rs rd end diff --git a/asmcomp/i386/selection.ml b/asmcomp/i386/selection.ml index 1700bf3..827a63d 100644 --- a/asmcomp/i386/selection.ml +++ b/asmcomp/i386/selection.ml @@ -168,7 +168,7 @@ method! is_simple_expr e = | _ -> super#is_simple_expr e -method select_addressing exp = +method select_addressing chunk exp = match select_addr exp with (Asymbol s, d) -> (Ibased(s, d), Ctuple []) @@ -200,7 +200,7 @@ method! select_operation op args = match op with (* Recognize the LEA instruction *) Caddi | Cadda | Csubi | Csuba -> - begin match self#select_addressing (Cop(op, args)) with + begin match self#select_addressing Word (Cop(op, args)) with (Iindexed d, _) -> super#select_operation op args | (Iindexed2 0, _) -> super#select_operation op args | (addr, arg) -> (Ispecific(Ilea addr), [arg]) @@ -233,7 +233,7 @@ method! select_operation op args = begin match args with [loc; Cop(Caddi, [Cop(Cload _, [loc']); Cconst_int n])] when loc = loc' -> - let (addr, arg) = self#select_addressing loc in + let (addr, arg) = self#select_addressing Word loc in (Ispecific(Ioffset_loc(n, addr)), [arg]) | _ -> super#select_operation op args @@ -250,11 +250,11 @@ method! select_operation op args = method select_floatarith regular_op reversed_op mem_op mem_rev_op args = match args with [arg1; Cop(Cload chunk, [loc2])] -> - let (addr, arg2) = self#select_addressing loc2 in + let (addr, arg2) = self#select_addressing chunk loc2 in (Ispecific(Ifloatarithmem(chunk_double chunk, mem_op, addr)), [arg1; arg2]) | [Cop(Cload chunk, [loc1]); arg2] -> - let (addr, arg1) = self#select_addressing loc1 in + let (addr, arg1) = self#select_addressing chunk loc1 in (Ispecific(Ifloatarithmem(chunk_double chunk, mem_rev_op, addr)), [arg2; arg1]) | [arg1; arg2] -> @@ -295,10 +295,10 @@ method select_push exp = | Cconst_natpointer n -> (Ispecific(Ipush_int n), Ctuple []) | Cconst_symbol s -> (Ispecific(Ipush_symbol s), Ctuple []) | Cop(Cload Word, [loc]) -> - let (addr, arg) = self#select_addressing loc in + let (addr, arg) = self#select_addressing Word loc in (Ispecific(Ipush_load addr), arg) | Cop(Cload Double_u, [loc]) -> - let (addr, arg) = self#select_addressing loc in + let (addr, arg) = self#select_addressing Double_u loc in (Ispecific(Ipush_load_float addr), arg) | _ -> (Ispecific(Ipush), exp) diff --git a/asmcomp/power/selection.ml b/asmcomp/power/selection.ml index ed15efb..0532d6b 100644 --- a/asmcomp/power/selection.ml +++ b/asmcomp/power/selection.ml @@ -52,7 +52,7 @@ inherit Selectgen.selector_generic as super method is_immediate n = (n <= 32767) && (n >= -32768) -method select_addressing exp = +method select_addressing chunk exp = match select_addr exp with (Asymbol s, d) -> (Ibased(s, d), Ctuple []) diff --git a/asmcomp/power64/selection.ml b/asmcomp/power64/selection.ml index 7b8e2a4..d2325e1 100644 --- a/asmcomp/power64/selection.ml +++ b/asmcomp/power64/selection.ml @@ -52,7 +52,7 @@ inherit Selectgen.selector_generic as super method is_immediate n = (n <= 32767) && (n >= -32768) -method select_addressing exp = +method select_addressing chunk exp = match select_addr exp with (Asymbol s, d) -> (Ibased(s, d), Ctuple []) diff --git a/asmcomp/selectgen.ml b/asmcomp/selectgen.ml index 2fc40f7..0bc9efb 100644 --- a/asmcomp/selectgen.ml +++ b/asmcomp/selectgen.ml @@ -204,7 +204,7 @@ method virtual is_immediate : int -> bool (* Selection of addressing modes *) method virtual select_addressing : - Cmm.expression -> Arch.addressing_mode * Cmm.expression + Cmm.memory_chunk -> Cmm.expression -> Arch.addressing_mode * Cmm.expression (* Default instruction selection for stores (of words) *) @@ -219,10 +219,10 @@ method select_operation op args = | (Capply(ty, dbg), _) -> (Icall_ind, args) | (Cextcall(s, ty, alloc, dbg), _) -> (Iextcall(s, alloc), args) | (Cload chunk, [arg]) -> - let (addr, eloc) = self#select_addressing arg in + let (addr, eloc) = self#select_addressing chunk arg in (Iload(chunk, addr), [eloc]) | (Cstore chunk, [arg1; arg2]) -> - let (addr, eloc) = self#select_addressing arg1 in + let (addr, eloc) = self#select_addressing chunk arg1 in if chunk = Word then begin let (op, newarg2) = self#select_store addr arg2 in (op, [newarg2; eloc]) @@ -366,7 +366,7 @@ method insert_move src dst = self#insert (Iop Imove) [|src|] [|dst|] method insert_moves src dst = - for i = 0 to Array.length src - 1 do + for i = 0 to min (Array.length src) (Array.length dst) - 1 do self#insert_move src.(i) dst.(i) done @@ -490,9 +490,8 @@ method emit_expr env exp = let (loc_arg, stack_ofs) = self#emit_extcall_args env new_args in let rd = self#regs_for ty in - let loc_res = Proc.loc_external_results rd in - self#insert_debug (Iop(Iextcall(lbl, alloc))) dbg - loc_arg loc_res; + let loc_res = self#insert_op_debug (Iextcall(lbl, alloc)) dbg + loc_arg (Proc.loc_external_results rd) in self#insert_move_results loc_res rd stack_ofs; Some rd | Ialloc _ -> diff --git a/asmcomp/selectgen.mli b/asmcomp/selectgen.mli index ae53cda..69dae6d 100644 --- a/asmcomp/selectgen.mli +++ b/asmcomp/selectgen.mli @@ -26,7 +26,7 @@ class virtual selector_generic : object (* Must be defined to indicate whether a constant is a suitable immediate operand to arithmetic instructions *) method virtual select_addressing : - Cmm.expression -> Arch.addressing_mode * Cmm.expression + Cmm.memory_chunk -> Cmm.expression -> Arch.addressing_mode * Cmm.expression (* Must be defined to select addressing modes *) method is_simple_expr: Cmm.expression -> bool (* Can be overridden to reflect special extcalls known to be pure *) diff --git a/asmcomp/sparc/selection.ml b/asmcomp/sparc/selection.ml index 82758dc..c1f30fd 100644 --- a/asmcomp/sparc/selection.ml +++ b/asmcomp/sparc/selection.ml @@ -26,7 +26,7 @@ inherit Selectgen.selector_generic as super method is_immediate n = (n <= 4095) && (n >= -4096) -method select_addressing = function +method select_addressing chunk = function Cconst_symbol s -> (Ibased(s, 0), Ctuple []) | Cop(Cadda, [Cconst_symbol s; Cconst_int n]) -> diff --git a/asmrun/arm.S b/asmrun/arm.S index 1313e9c..6482956 100644 --- a/asmrun/arm.S +++ b/asmrun/arm.S @@ -1,286 +1,411 @@ /***********************************************************************/ /* */ -/* Objective Caml */ +/* OCaml */ /* */ -/* Xavier Leroy, projet Cristal, INRIA Rocquencourt */ +/* Benedikt Meurer, University of Siegen */ /* */ -/* Copyright 1998 Institut National de Recherche en Informatique et */ -/* en Automatique. All rights reserved. This file is distributed */ -/* under the terms of the GNU Library General Public License, with */ -/* the special exception on linking described in file ../LICENSE. */ +/* Copyright 1998 Institut National de Recherche en Informatique */ +/* et en Automatique. Copyright 2012 Benedikt Meurer. All rights */ +/* reserved. This file is distributed under the terms of the GNU */ +/* Library General Public License, with the special exception on */ +/* linking described in file ../LICENSE. */ /* */ /***********************************************************************/ -/* $Id: arm.S 9252 2009-05-04 13:46:46Z xleroy $ */ +/* $Id$ */ /* Asm part of the runtime system, ARM processor */ +/* Must be preprocessed by cpp */ -trap_ptr .req r11 -alloc_ptr .req r8 -alloc_limit .req r10 - + .syntax unified .text +#if defined(SYS_linux_eabihf) + .arch armv7-a + .fpu vfpv3-d16 + .thumb +#elif defined(SYS_linux_eabi) + .arch armv4t + .arm + + /* Compatibility macros */ + .macro blx reg + mov lr, pc + bx \reg + .endm + .macro cbz reg, lbl + cmp \reg, #0 + beq \lbl + .endm + .macro vpop regs + .endm + .macro vpush regs + .endm +#endif + +trap_ptr .req r8 +alloc_ptr .req r10 +alloc_limit .req r11 + +/* Support for profiling with gprof */ + +#if defined(PROFILING) && (defined(SYS_linux_eabihf) || defined(SYS_linux_eabi)) +#define PROFILE \ + push {lr}; \ + bl __gnu_mcount_nc +#else +#define PROFILE +#endif /* Allocation functions and GC interface */ - .globl caml_call_gc + .globl caml_system__code_begin +caml_system__code_begin: + + .align 2 + .globl caml_call_gc .type caml_call_gc, %function caml_call_gc: - /* Record return address and desired size */ - /* Can use alloc_limit as a temporary since it will be reloaded by - invoke_gc */ - ldr alloc_limit, .Lcaml_last_return_address - str lr, [alloc_limit, #0] - ldr alloc_limit, .Lcaml_requested_size - str r12, [alloc_limit, #0] - /* Branch to shared GC code */ - bl .Linvoke_gc - /* Finish allocation */ - ldr r12, .Lcaml_requested_size - ldr r12, [r12, #0] - sub alloc_ptr, alloc_ptr, r12 + PROFILE + /* Record return address */ + ldr r12, =caml_last_return_address + str lr, [r12] +.Lcaml_call_gc: + /* Record lowest stack address */ + ldr r12, =caml_bottom_of_stack + str sp, [r12] + /* Save caller floating-point registers on the stack */ + vpush {d0-d7} + /* Save integer registers and return address on the stack */ + push {r0-r7,r12,lr} + /* Store pointer to saved integer registers in caml_gc_regs */ + ldr r12, =caml_gc_regs + str sp, [r12] + /* Save current allocation pointer for debugging purposes */ + ldr alloc_limit, =caml_young_ptr + str alloc_ptr, [alloc_limit] + /* Save trap pointer in case an exception is raised during GC */ + ldr r12, =caml_exception_pointer + str trap_ptr, [r12] + /* Call the garbage collector */ + bl caml_garbage_collection + /* Restore integer registers and return address from the stack */ + pop {r0-r7,r12,lr} + /* Restore floating-point registers from the stack */ + vpop {d0-d7} + /* Reload new allocation pointer and limit */ + /* alloc_limit still points to caml_young_ptr */ + ldr r12, =caml_young_limit + ldr alloc_ptr, [alloc_limit] + ldr alloc_limit, [r12] + /* Return to caller */ bx lr + .type caml_call_gc, %function + .size caml_call_gc, .-caml_call_gc - .globl caml_alloc1 + .align 2 + .globl caml_alloc1 .type caml_alloc1, %function caml_alloc1: - sub alloc_ptr, alloc_ptr, #8 + PROFILE +.Lcaml_alloc1: + sub alloc_ptr, alloc_ptr, 8 cmp alloc_ptr, alloc_limit - movcs pc, lr /* Return if alloc_ptr >= alloc_limit */ - /* Record return address */ - ldr r12, .Lcaml_last_return_address - str lr, [r12, #0] - /* Invoke GC */ - bl .Linvoke_gc + bcc 1f + bx lr +1: /* Record return address */ + ldr r7, =caml_last_return_address + str lr, [r7] + /* Call GC (preserves r7) */ + bl .Lcaml_call_gc + /* Restore return address */ + ldr lr, [r7] /* Try again */ - b caml_alloc1 + b .Lcaml_alloc1 + .type caml_alloc1, %function + .size caml_alloc1, .-caml_alloc1 - .globl caml_alloc2 + .align 2 + .globl caml_alloc2 .type caml_alloc2, %function caml_alloc2: - sub alloc_ptr, alloc_ptr, #12 + PROFILE +.Lcaml_alloc2: + sub alloc_ptr, alloc_ptr, 12 cmp alloc_ptr, alloc_limit - movcs pc, lr /* Return if alloc_ptr >= alloc_limit */ - /* Record return address */ - ldr r12, .Lcaml_last_return_address - str lr, [r12, #0] - /* Invoke GC */ - bl .Linvoke_gc + bcc 1f + bx lr +1: /* Record return address */ + ldr r7, =caml_last_return_address + str lr, [r7] + /* Call GC (preserves r7) */ + bl .Lcaml_call_gc + /* Restore return address */ + ldr lr, [r7] /* Try again */ - b caml_alloc2 + b .Lcaml_alloc2 + .type caml_alloc2, %function + .size caml_alloc2, .-caml_alloc2 - .globl caml_alloc3 + .align 2 + .globl caml_alloc3 .type caml_alloc3, %function caml_alloc3: - sub alloc_ptr, alloc_ptr, #16 + PROFILE +.Lcaml_alloc3: + sub alloc_ptr, alloc_ptr, 16 cmp alloc_ptr, alloc_limit - movcs pc, lr /* Return if alloc_ptr >= alloc_limit */ - /* Record return address */ - ldr r12, .Lcaml_last_return_address - str lr, [r12, #0] - /* Invoke GC */ - bl .Linvoke_gc + bcc 1f + bx lr +1: /* Record return address */ + ldr r7, =caml_last_return_address + str lr, [r7] + /* Call GC (preserves r7) */ + bl .Lcaml_call_gc + /* Restore return address */ + ldr lr, [r7] /* Try again */ - b caml_alloc3 + b .Lcaml_alloc3 + .type caml_alloc3, %function + .size caml_alloc3, .-caml_alloc3 - .globl caml_allocN + .align 2 + .globl caml_allocN .type caml_allocN, %function caml_allocN: - sub alloc_ptr, alloc_ptr, r12 + PROFILE +.Lcaml_allocN: + sub alloc_ptr, alloc_ptr, r7 cmp alloc_ptr, alloc_limit - movcs pc, lr /* Return if alloc_ptr >= alloc_limit */ - /* Record return address and desired size */ - /* Can use alloc_limit as a temporary since it will be reloaded by - invoke_gc */ - ldr alloc_limit, .Lcaml_last_return_address - str lr, [alloc_limit, #0] - ldr alloc_limit, .Lcaml_requested_size - str r12, [alloc_limit, #0] - /* Invoke GC */ - bl .Linvoke_gc + bcc 1f + bx lr +1: /* Record return address */ + ldr r12, =caml_last_return_address + str lr, [r12] + /* Call GC (preserves r7) */ + bl .Lcaml_call_gc + /* Restore return address */ + ldr r12, =caml_last_return_address + ldr lr, [r12] /* Try again */ - ldr r12, .Lcaml_requested_size - ldr r12, [r12, #0] - b caml_allocN - -/* Shared code to invoke the GC */ -.Linvoke_gc: - /* Record lowest stack address */ - ldr r12, .Lcaml_bottom_of_stack - str sp, [r12, #0] - /* Save integer registers and return address on stack */ - stmfd sp!, {r0,r1,r2,r3,r4,r5,r6,r7,r12,lr} - /* Store pointer to saved integer registers in caml_gc_regs */ - ldr r12, .Lcaml_gc_regs - str sp, [r12, #0] - /* Save current allocation pointer for debugging purposes */ - ldr r12, .Lcaml_young_ptr - str alloc_ptr, [r12, #0] - /* Save trap pointer in case an exception is raised during GC */ - ldr r12, .Lcaml_exception_pointer - str trap_ptr, [r12, #0] - /* Call the garbage collector */ - bl caml_garbage_collection - /* Restore the registers from the stack */ - ldmfd sp!, {r0,r1,r2,r3,r4,r5,r6,r7,r12} - /* Reload return address */ - ldr r12, .Lcaml_last_return_address - ldr lr, [r12, #0] - /* Reload new allocation pointer and allocation limit */ - ldr r12, .Lcaml_young_ptr - ldr alloc_ptr, [r12, #0] - ldr r12, .Lcaml_young_limit - ldr alloc_limit, [r12, #0] - /* Return to caller */ - ldr r12, [sp], #4 - bx r12 + b .Lcaml_allocN + .type caml_allocN, %function + .size caml_allocN, .-caml_allocN -/* Call a C function from Caml */ -/* Function to call is in r12 */ +/* Call a C function from OCaml */ +/* Function to call is in r7 */ - .globl caml_c_call + .align 2 + .globl caml_c_call .type caml_c_call, %function caml_c_call: + PROFILE + /* Record lowest stack address and return address */ + ldr r5, =caml_last_return_address + ldr r6, =caml_bottom_of_stack + str lr, [r5] + str sp, [r6] /* Preserve return address in callee-save register r4 */ mov r4, lr - /* Record lowest stack address and return address */ - ldr r5, .Lcaml_last_return_address - ldr r6, .Lcaml_bottom_of_stack - str lr, [r5, #0] - str sp, [r6, #0] - /* Make the exception handler and alloc ptr available to the C code */ - ldr r6, .Lcaml_young_ptr - ldr r7, .Lcaml_exception_pointer - str alloc_ptr, [r6, #0] - str trap_ptr, [r7, #0] + /* Make the exception handler alloc ptr available to the C code */ + ldr r5, =caml_young_ptr + ldr r6, =caml_exception_pointer + str alloc_ptr, [r5] + str trap_ptr, [r6] /* Call the function */ - mov lr, pc - bx r12 + blx r7 /* Reload alloc ptr and alloc limit */ - ldr r5, .Lcaml_young_limit - ldr alloc_ptr, [r6, #0] /* r6 still points to caml_young_ptr */ - ldr alloc_limit, [r5, #0] + ldr r6, =caml_young_limit + ldr alloc_ptr, [r5] /* r5 still points to caml_young_ptr */ + ldr alloc_limit, [r6] /* Return */ bx r4 + .type caml_c_call, %function + .size caml_c_call, .-caml_c_call -/* Start the Caml program */ +/* Start the OCaml program */ - .globl caml_start_program + .align 2 + .globl caml_start_program .type caml_start_program, %function caml_start_program: - ldr r12, .Lcaml_program + PROFILE + ldr r12, =caml_program /* Code shared with caml_callback* */ -/* Address of Caml code to call is in r12 */ -/* Arguments to the Caml code are in r0...r3 */ +/* Address of OCaml code to call is in r12 */ +/* Arguments to the OCaml code are in r0...r3 */ .Ljump_to_caml: /* Save return address and callee-save registers */ - stmfd sp!, {r4,r5,r6,r7,r8,r10,r11,lr} /* 8-alignment */ + vpush {d8-d15} + push {r4-r8,r10,r11,lr} /* 8-byte alignment */ /* Setup a callback link on the stack */ - sub sp, sp, #4*4 /* 8-alignment */ - ldr r4, .Lcaml_bottom_of_stack - ldr r4, [r4, #0] - str r4, [sp, #0] - ldr r4, .Lcaml_last_return_address - ldr r4, [r4, #0] - str r4, [sp, #4] - ldr r4, .Lcaml_gc_regs - ldr r4, [r4, #0] - str r4, [sp, #8] - /* Setup a trap frame to catch exceptions escaping the Caml code */ - sub sp, sp, #4*2 - ldr r4, .Lcaml_exception_pointer - ldr r4, [r4, #0] - str r4, [sp, #0] - ldr r4, .LLtrap_handler - str r4, [sp, #4] + sub sp, sp, 4*4 /* 8-byte alignment */ + ldr r4, =caml_bottom_of_stack + ldr r5, =caml_last_return_address + ldr r6, =caml_gc_regs + ldr r4, [r4] + ldr r5, [r5] + ldr r6, [r6] + str r4, [sp, 0] + str r5, [sp, 4] + str r6, [sp, 8] + /* Setup a trap frame to catch exceptions escaping the OCaml code */ + sub sp, sp, 2*4 + ldr r6, =caml_exception_pointer + ldr r5, =.Ltrap_handler + ldr r4, [r6] + str r4, [sp, 0] + str r5, [sp, 4] mov trap_ptr, sp /* Reload allocation pointers */ - ldr r4, .Lcaml_young_ptr - ldr alloc_ptr, [r4, #0] - ldr r4, .Lcaml_young_limit - ldr alloc_limit, [r4, #0] - /* Call the Caml code */ - mov lr, pc - bx r12 + ldr r4, =caml_young_ptr + ldr alloc_ptr, [r4] + ldr r4, =caml_young_limit + ldr alloc_limit, [r4] + /* Call the OCaml code */ + blx r12 .Lcaml_retaddr: /* Pop the trap frame, restoring caml_exception_pointer */ - ldr r4, .Lcaml_exception_pointer - ldr r5, [sp, #0] - str r5, [r4, #0] - add sp, sp, #2 * 4 + ldr r4, =caml_exception_pointer + ldr r5, [sp, 0] + str r5, [r4] + add sp, sp, 2*4 /* Pop the callback link, restoring the global variables */ .Lreturn_result: - ldr r4, .Lcaml_bottom_of_stack - ldr r5, [sp, #0] - str r5, [r4, #0] - ldr r4, .Lcaml_last_return_address - ldr r5, [sp, #4] - str r5, [r4, #0] - ldr r4, .Lcaml_gc_regs - ldr r5, [sp, #8] - str r5, [r4, #0] - add sp, sp, #4*4 + ldr r4, =caml_bottom_of_stack + ldr r5, [sp, 0] + str r5, [r4] + ldr r4, =caml_last_return_address + ldr r5, [sp, 4] + str r5, [r4] + ldr r4, =caml_gc_regs + ldr r5, [sp, 8] + str r5, [r4] + add sp, sp, 4*4 /* Update allocation pointer */ - ldr r4, .Lcaml_young_ptr - str alloc_ptr, [r4, #0] + ldr r4, =caml_young_ptr + str alloc_ptr, [r4] /* Reload callee-save registers and return */ - ldmfd sp!, {r4,r5,r6,r7,r8,r10,r11,lr} - bx lr + pop {r4-r8,r10,r11,lr} + vpop {d8-d15} + bx lr + .type .Lcaml_retaddr, %function + .size .Lcaml_retaddr, .-.Lcaml_retaddr + .type caml_start_program, %function + .size caml_start_program, .-caml_start_program + +/* The trap handler */ - /* The trap handler */ + .align 2 .Ltrap_handler: /* Save exception pointer */ - ldr r4, .Lcaml_exception_pointer - str trap_ptr, [r4, #0] + ldr r12, =caml_exception_pointer + str trap_ptr, [r12] /* Encode exception bucket as an exception result */ - orr r0, r0, #2 + orr r0, r0, 2 /* Return it */ b .Lreturn_result + .type .Ltrap_handler, %function + .size .Ltrap_handler, .-.Ltrap_handler + +/* Raise an exception from OCaml */ + + .align 2 + .globl caml_raise_exn +caml_raise_exn: + PROFILE + /* Test if backtrace is active */ + ldr r1, =caml_backtrace_active + ldr r1, [r1] + cbz r1, 1f + /* Preserve exception bucket in callee-save register r4 */ + mov r4, r0 + /* Stash the backtrace */ + mov r1, lr /* arg2: pc of raise */ + mov r2, sp /* arg3: sp of raise */ + mov r3, trap_ptr /* arg4: sp of handler */ + bl caml_stash_backtrace + /* Restore exception bucket */ + mov r0, r4 +1: /* Cut stack at current trap handler */ + mov sp, trap_ptr + /* Pop previous handler and addr of trap, and jump to it */ + pop {trap_ptr, pc} + .type caml_raise_exn, %function + .size caml_raise_exn, .-caml_raise_exn /* Raise an exception from C */ - .globl caml_raise_exception + .align 2 + .globl caml_raise_exception .type caml_raise_exception, %function caml_raise_exception: - /* Reload Caml allocation pointers */ - ldr r12, .Lcaml_young_ptr - ldr alloc_ptr, [r12, #0] - ldr r12, .Lcaml_young_limit - ldr alloc_limit, [r12, #0] - /* Cut stack at current trap handler */ - ldr r12, .Lcaml_exception_pointer - ldr sp, [r12, #0] + PROFILE + /* Reload trap ptr, alloc ptr and alloc limit */ + ldr trap_ptr, =caml_exception_pointer + ldr alloc_ptr, =caml_young_ptr + ldr alloc_limit, =caml_young_limit + ldr trap_ptr, [trap_ptr] + ldr alloc_ptr, [alloc_ptr] + ldr alloc_limit, [alloc_limit] + /* Test if backtrace is active */ + ldr r1, =caml_backtrace_active + ldr r1, [r1] + cbz r1, 1f + /* Preserve exception bucket in callee-save register r4 */ + mov r4, r0 + ldr r1, =caml_last_return_address /* arg2: pc of raise */ + ldr r1, [r1] + ldr r2, =caml_bottom_of_stack /* arg3: sp of raise */ + ldr r2, [r2] + mov r3, trap_ptr /* arg4: sp of handler */ + bl caml_stash_backtrace + /* Restore exception bucket */ + mov r0, r4 +1: /* Cut stack at current trap handler */ + mov sp, trap_ptr /* Pop previous handler and addr of trap, and jump to it */ - ldmfd sp!, {trap_ptr, pc} + pop {trap_ptr, pc} + .type caml_raise_exception, %function + .size caml_raise_exception, .-caml_raise_exception -/* Callback from C to Caml */ +/* Callback from C to OCaml */ - .globl caml_callback_exn + .align 2 + .globl caml_callback_exn .type caml_callback_exn, %function caml_callback_exn: + PROFILE /* Initial shuffling of arguments (r0 = closure, r1 = first arg) */ mov r12, r0 - mov r0, r1 /* r0 = first arg */ - mov r1, r12 /* r1 = closure environment */ - ldr r12, [r12, #0] /* code pointer */ + mov r0, r1 /* r0 = first arg */ + mov r1, r12 /* r1 = closure environment */ + ldr r12, [r12] /* code pointer */ b .Ljump_to_caml + .type caml_callback_exn, %function + .size caml_callback_exn, .-caml_callback_exn - .globl caml_callback2_exn + .align 2 + .globl caml_callback2_exn .type caml_callback2_exn, %function caml_callback2_exn: + PROFILE /* Initial shuffling of arguments (r0 = closure, r1 = arg1, r2 = arg2) */ mov r12, r0 - mov r0, r1 /* r0 = first arg */ - mov r1, r2 /* r1 = second arg */ - mov r2, r12 /* r2 = closure environment */ - ldr r12, .Lcaml_apply2 + mov r0, r1 /* r0 = first arg */ + mov r1, r2 /* r1 = second arg */ + mov r2, r12 /* r2 = closure environment */ + ldr r12, =caml_apply2 b .Ljump_to_caml + .type caml_callback2_exn, %function + .size caml_callback2_exn, .-caml_callback2_exn - .globl caml_callback3_exn + .align 2 + .globl caml_callback3_exn .type caml_callback3_exn, %function caml_callback3_exn: + PROFILE /* Initial shuffling of arguments */ /* (r0 = closure, r1 = arg1, r2 = arg2, r3 = arg3) */ mov r12, r0 @@ -288,43 +413,36 @@ caml_callback3_exn: mov r1, r2 /* r1 = second arg */ mov r2, r3 /* r2 = third arg */ mov r3, r12 /* r3 = closure environment */ - ldr r12, .Lcaml_apply3 + ldr r12, =caml_apply3 b .Ljump_to_caml + .type caml_callback3_exn, %function + .size caml_callback3_exn, .-caml_callback3_exn - .globl caml_ml_array_bound_error + .align 2 + .globl caml_ml_array_bound_error .type caml_ml_array_bound_error, %function caml_ml_array_bound_error: - /* Load address of [caml_array_bound_error] in r12 */ - ldr r12, .Lcaml_array_bound_error + PROFILE + /* Load address of [caml_array_bound_error] in r7 */ + ldr r7, =caml_array_bound_error /* Call that function */ b caml_c_call + .type caml_ml_array_bound_error, %function + .size caml_ml_array_bound_error, .-caml_ml_array_bound_error -/* Global references */ - -.Lcaml_last_return_address: .word caml_last_return_address -.Lcaml_bottom_of_stack: .word caml_bottom_of_stack -.Lcaml_gc_regs: .word caml_gc_regs -.Lcaml_young_ptr: .word caml_young_ptr -.Lcaml_young_limit: .word caml_young_limit -.Lcaml_exception_pointer: .word caml_exception_pointer -.Lcaml_program: .word caml_program -.LLtrap_handler: .word .Ltrap_handler -.Lcaml_apply2: .word caml_apply2 -.Lcaml_apply3: .word caml_apply3 -.Lcaml_array_bound_error: .word caml_array_bound_error -.Lcaml_requested_size: .word caml_requested_size - - .data -caml_requested_size: - .word 0 + .globl caml_system__code_end +caml_system__code_end: /* GC roots for callback */ .data - .globl caml_system__frametable + .align 2 + .globl caml_system__frametable caml_system__frametable: .word 1 /* one descriptor */ .word .Lcaml_retaddr /* return address into callback */ .short -1 /* negative frame size => use callback link */ .short 0 /* no roots */ .align 2 + .type caml_system__frametable, %object + .size caml_system__frametable, .-caml_system__frametable diff --git a/asmrun/signals_osdep.h b/asmrun/signals_osdep.h index 1e91327..732f3a0 100644 --- a/asmrun/signals_osdep.h +++ b/asmrun/signals_osdep.h @@ -78,7 +78,7 @@ /****************** ARM, Linux */ -#elif defined(TARGET_arm) && defined (SYS_linux) +#elif defined(TARGET_arm) && (defined (SYS_linux_eabi) || defined(SYS_linux_eabihf)) #include diff --git a/configure b/configure index 6ed0a9c..4e07c92 100755 --- a/configure +++ b/configure @@ -636,6 +636,7 @@ if test $withsharedlibs = "yes"; then i[345]86-*-netbsd*) natdynlink=true;; x86_64-*-netbsd*) natdynlink=true;; i386-*-gnu0.3) natdynlink=true;; + arm*-*-linux*) natdynlink=true;; esac fi @@ -691,8 +692,13 @@ case "$host" in powerpc-*-rhapsody*) arch=power; model=ppc; system=rhapsody;; powerpc-*-darwin*) arch=power; system=rhapsody if $arch64; then model=ppc64; else model=ppc; fi;; - arm*-*-linux*) arch=arm; system=linux;; - arm*-*-gnu*) arch=arm; system=gnu;; + arm*-*-linux-gnueabihf) arch=arm; system=linux_eabihf;; + armv7*-*-linux-gnueabi) arch=arm; model=armv7; system=linux_eabi;; + armv6t2*-*-linux-gnueabi) arch=arm; model=armv6t2; system=linux_eabi;; + armv6*-*-linux-gnueabi) arch=arm; model=armv6; system=linux_eabi;; + armv5te*-*-linux-gnueabi) arch=arm; model=armv5te; system=linux_eabi;; + armv5*-*-linux-gnueabi) arch=arm; model=armv5; system=linux_eabi;; + arm*-*-linux-gnueabi) arch=arm; system=linux_eabi;; ia64-*-linux*) arch=ia64; system=linux;; ia64-*-gnu*) arch=ia64; system=gnu;; ia64-*-freebsd*) arch=ia64; system=freebsd;; @@ -804,6 +810,7 @@ case "$arch,$model,$system" in case "$nativecc" in gcc*) ;; *) cc_profile='-xpg';; esac;; amd64,*,linux) profiling='prof';; amd64,*,gnu) profiling='prof';; + arm,*,linux*) profiling='prof';; *) profiling='noprof';; esac -- 1.7.10.1