From 22fadc3ed91cb380f7303e8a83ff5806d4576cb5 Mon Sep 17 00:00:00 2001
From: "Richard W.M. Jones" <rjones@redhat.com>
Date: Tue, 29 May 2012 20:50:42 +0100
Subject: [PATCH 7/8] New ARM backend, written by Benedikt Meurer (PR#5433).

Backported from upstream sources to 3.12.1 by RWMJ.

Includes svn rev 12548 to fix invalid generation of Thumb-2 branch
instruction TBH (upstream PR#5623, RHBZ#821153).
---
 asmcomp/amd64/selection.ml   |   14 +-
 asmcomp/arm/arch.ml          |  152 +++++++-
 asmcomp/arm/emit.mlp         |  857 ++++++++++++++++++++++++++++--------------
 asmcomp/arm/proc.ml          |  185 ++++++---
 asmcomp/arm/reload.ml        |    4 +-
 asmcomp/arm/scheduling.ml    |   80 ++--
 asmcomp/arm/selection.ml     |  343 ++++++++++-------
 asmcomp/i386/selection.ml    |   14 +-
 asmcomp/power/selection.ml   |    2 +-
 asmcomp/power64/selection.ml |    2 +-
 asmcomp/selectgen.ml         |   13 +-
 asmcomp/selectgen.mli        |    2 +-
 asmcomp/sparc/selection.ml   |    2 +-
 asmrun/arm.S                 |  544 ++++++++++++++++-----------
 asmrun/signals_osdep.h       |    2 +-
 configure                    |   11 +-
 16 files changed, 1485 insertions(+), 742 deletions(-)

diff --git a/asmcomp/amd64/selection.ml b/asmcomp/amd64/selection.ml
index f0546cf..5d9f6fa 100644
--- a/asmcomp/amd64/selection.ml
+++ b/asmcomp/amd64/selection.ml
@@ -121,7 +121,7 @@ method is_immediate n = n <= 0x7FFFFFFF && n >= -0x80000000
 
 method is_immediate_natint n = n <= 0x7FFFFFFFn && n >= -0x80000000n
 
-method select_addressing exp =
+method select_addressing chunk exp =
   let (a, d) = select_addr exp in
   (* PR#4625: displacement must be a signed 32-bit immediate *)
   if d < -0x8000_0000 || d > 0x7FFF_FFFF
@@ -157,7 +157,7 @@ method! select_operation op args =
   match op with
   (* Recognize the LEA instruction *)
     Caddi | Cadda | Csubi | Csuba ->
-      begin match self#select_addressing (Cop(op, args)) with
+      begin match self#select_addressing Word (Cop(op, args)) with
         (Iindexed d, _) -> super#select_operation op args
       | (Iindexed2 0, _) -> super#select_operation op args
       | (addr, arg) -> (Ispecific(Ilea addr), [arg])
@@ -191,7 +191,7 @@ method! select_operation op args =
       begin match args with
         [loc; Cop(Caddi, [Cop(Cload _, [loc']); Cconst_int n])]
         when loc = loc' && self#is_immediate n ->
-          let (addr, arg) = self#select_addressing loc in
+          let (addr, arg) = self#select_addressing Word loc in
           (Ispecific(Ioffset_loc(n, addr)), [arg])
       | _ ->
           super#select_operation op args
@@ -202,12 +202,12 @@ method! select_operation op args =
 
 method select_floatarith commutative regular_op mem_op args =
   match args with
-    [arg1; Cop(Cload (Double|Double_u), [loc2])] ->
-      let (addr, arg2) = self#select_addressing loc2 in
+    [arg1; Cop(Cload (Double|Double_u as chunk), [loc2])] ->
+      let (addr, arg2) = self#select_addressing chunk loc2 in
       (Ispecific(Ifloatarithmem(mem_op, addr)),
                  [arg1; arg2])
-  | [Cop(Cload (Double|Double_u), [loc1]); arg2] when commutative ->
-      let (addr, arg1) = self#select_addressing loc1 in
+  | [Cop(Cload (Double|Double_u as chunk), [loc1]); arg2] when commutative ->
+      let (addr, arg1) = self#select_addressing chunk loc1 in
       (Ispecific(Ifloatarithmem(mem_op, addr)),
                  [arg2; arg1])
   | [arg1; arg2] ->
diff --git a/asmcomp/arm/arch.ml b/asmcomp/arm/arch.ml
index 998fa4b..c4aca8d 100644
--- a/asmcomp/arm/arch.ml
+++ b/asmcomp/arm/arch.ml
@@ -1,25 +1,98 @@
 (***********************************************************************)
 (*                                                                     *)
-(*                           Objective Caml                            *)
+(*                                OCaml                                *)
 (*                                                                     *)
-(*            Xavier Leroy, projet Cristal, INRIA Rocquencourt         *)
+(*                  Benedikt Meurer, University of Siegen              *)
 (*                                                                     *)
-(*  Copyright 1998 Institut National de Recherche en Informatique et   *)
-(*  en Automatique.  All rights reserved.  This file is distributed    *)
-(*  under the terms of the Q Public License version 1.0.               *)
+(*    Copyright 1998 Institut National de Recherche en Informatique    *)
+(*    et en Automatique. Copyright 2012 Benedikt Meurer. All rights    *)
+(*    reserved.  This file is distributed  under the terms of the Q    *)
+(*    Public License version 1.0.                                      *)
 (*                                                                     *)
 (***********************************************************************)
 
-(* $Id: arch.ml 5304 2002-11-29 15:03:37Z xleroy $ *)
+(* $Id$ *)
 
 (* Specific operations for the ARM processor *)
 
 open Misc
 open Format
 
+type abi = EABI | EABI_VFP
+type arch = ARMv4 | ARMv5 | ARMv5TE | ARMv6 | ARMv6T2 | ARMv7
+type fpu = Soft | VFPv3_D16 | VFPv3
+
+let abi =
+  match Config.system with
+    "linux_eabi"   -> EABI
+  | "linux_eabihf" -> EABI_VFP
+  | _ -> assert false
+
+let string_of_arch = function
+    ARMv4   -> "armv4"
+  | ARMv5   -> "armv5"
+  | ARMv5TE -> "armv5te"
+  | ARMv6   -> "armv6"
+  | ARMv6T2 -> "armv6t2"
+  | ARMv7   -> "armv7"
+
+let string_of_fpu = function
+    Soft      -> "soft"
+  | VFPv3_D16 -> "vfpv3-d16"
+  | VFPv3     -> "vfpv3"
+
 (* Machine-specific command-line options *)
 
-let command_line_options = []
+let (arch, fpu, thumb) =
+  let (def_arch, def_fpu, def_thumb) =
+    begin match abi, Config.model with
+    (* Defaults for architecture, FPU and Thumb *)
+      EABI, "armv5"   -> ARMv5,   Soft,      false
+    | EABI, "armv5te" -> ARMv5TE, Soft,      false
+    | EABI, "armv6"   -> ARMv6,   Soft,      false
+    | EABI, "armv6t2" -> ARMv6T2, Soft,      false
+    | EABI, "armv7"   -> ARMv7,   Soft,      false
+    | EABI, _         -> ARMv4,   Soft,      false
+    | EABI_VFP, _     -> ARMv7,   VFPv3_D16, true
+    end in
+  (ref def_arch, ref def_fpu, ref def_thumb)
+
+let pic_code = ref false
+
+let farch spec =
+  arch := (match spec with
+             "armv4" when abi <> EABI_VFP   -> ARMv4
+           | "armv5" when abi <> EABI_VFP   -> ARMv5
+           | "armv5te" when abi <> EABI_VFP -> ARMv5TE
+           | "armv6" when abi <> EABI_VFP   -> ARMv6
+           | "armv6t2" when abi <> EABI_VFP -> ARMv6T2
+           | "armv7"                        -> ARMv7
+           | spec -> raise (Arg.Bad spec))
+
+let ffpu spec =
+  fpu := (match spec with
+            "soft" when abi <> EABI_VFP     -> Soft
+          | "vfpv3-d16" when abi = EABI_VFP -> VFPv3_D16
+          | "vfpv3" when abi = EABI_VFP     -> VFPv3
+          | spec -> raise (Arg.Bad spec))
+
+let command_line_options =
+  [ "-farch", Arg.String farch,
+      "<arch>  Select the ARM target architecture"
+      ^ " (default: " ^ (string_of_arch !arch) ^ ")";
+    "-ffpu", Arg.String ffpu,
+      "<fpu>  Select the floating-point hardware"
+      ^ " (default: " ^ (string_of_fpu !fpu) ^ ")";
+    "-fPIC", Arg.Set pic_code,
+      " Generate position-independent machine code";
+    "-fno-PIC", Arg.Clear pic_code,
+      " Generate position-dependent machine code";
+    "-fthumb", Arg.Set thumb,
+      " Enable Thumb/Thumb-2 code generation"
+      ^ (if !thumb then " (default)" else "");
+    "-fno-thumb", Arg.Clear thumb,
+      " Disable Thumb/Thumb-2 code generation"
+      ^ (if not !thumb then " (default" else "")]
 
 (* Addressing modes *)
 
@@ -37,6 +110,14 @@ type specific_operation =
     Ishiftarith of arith_operation * int
   | Ishiftcheckbound of int
   | Irevsubimm of int
+  | Imuladd     (* multiply and add *)
+  | Imulsub     (* multiply and subtract *)
+  | Inegmulf    (* floating-point negate and multiply *)
+  | Imuladdf    (* floating-point multiply and add *)
+  | Inegmuladdf (* floating-point negate, multiply and add *)
+  | Imulsubf    (* floating-point multiply and subtract *)
+  | Inegmulsubf (* floating-point negate, multiply and subtract *)
+  | Isqrtf      (* floating-point square root *)
 
 and arith_operation =
     Ishiftadd
@@ -51,6 +132,10 @@ let size_addr = 4
 let size_int = 4
 let size_float = 8
 
+(* Behavior of division *)
+
+let division_crashes_on_overflow = false
+
 (* Operations on addressing modes *)
 
 let identity_addressing = Iindexed 0
@@ -84,3 +169,56 @@ let print_specific_operation printreg op ppf arg =
       fprintf ppf "check %a >> %i > %a" printreg arg.(0) n printreg arg.(1)
   | Irevsubimm n ->
       fprintf ppf "%i %s %a" n "-" printreg arg.(0)
+  | Imuladd ->
+      fprintf ppf "(%a * %a) + %a"
+        printreg arg.(0)
+        printreg arg.(1)
+        printreg arg.(2)
+  | Imulsub ->
+      fprintf ppf "-(%a * %a) + %a"
+        printreg arg.(0)
+        printreg arg.(1)
+        printreg arg.(2)
+  | Inegmulf ->
+      fprintf ppf "-f (%a *f %a)"
+        printreg arg.(0)
+        printreg arg.(1)
+  | Imuladdf ->
+      fprintf ppf "%a +f (%a *f %a)"
+        printreg arg.(0)
+        printreg arg.(1)
+        printreg arg.(2)
+  | Inegmuladdf ->
+      fprintf ppf "%a -f (%a *f %a)"
+        printreg arg.(0)
+        printreg arg.(1)
+        printreg arg.(2)
+  | Imulsubf ->
+      fprintf ppf "(-f %a) +f (%a *f %a)"
+        printreg arg.(0)
+        printreg arg.(1)
+        printreg arg.(2)
+  | Inegmulsubf ->
+      fprintf ppf "(-f %a) -f (%a *f %a)"
+        printreg arg.(0)
+        printreg arg.(1)
+        printreg arg.(2)
+  | Isqrtf ->
+      fprintf ppf "sqrtf %a"
+        printreg arg.(0)
+
+(* Recognize immediate operands *)
+
+(* Immediate operands are 8-bit immediate values, zero-extended,
+   and rotated right by 0 ... 30 bits.
+   In Thumb/Thumb-2 mode we utilize 26 ... 30. *)
+
+let is_immediate n =
+  let n = ref n in
+  let s = ref 0 in
+  let m = if !thumb then 24 else 30 in
+  while (!s <= m && Int32.logand !n 0xffl <> !n) do
+    n := Int32.logor (Int32.shift_right_logical !n 2) (Int32.shift_left !n 30);
+    s := !s + 2
+  done;
+  !s <= m
diff --git a/asmcomp/arm/emit.mlp b/asmcomp/arm/emit.mlp
index a4b2241..f8db396 100644
--- a/asmcomp/arm/emit.mlp
+++ b/asmcomp/arm/emit.mlp
@@ -1,16 +1,17 @@
 (***********************************************************************)
 (*                                                                     *)
-(*                           Objective Caml                            *)
+(*                                OCaml                                *)
 (*                                                                     *)
-(*            Xavier Leroy, projet Cristal, INRIA Rocquencourt         *)
+(*                  Benedikt Meurer, University of Siegen              *)
 (*                                                                     *)
-(*  Copyright 1998 Institut National de Recherche en Informatique et   *)
-(*  en Automatique.  All rights reserved.  This file is distributed    *)
-(*  under the terms of the Q Public License version 1.0.               *)
+(*    Copyright 1998 Institut National de Recherche en Informatique    *)
+(*    et en Automatique. Copyright 2012 Benedikt Meurer. All rights    *)
+(*    reserved.  This file is distributed  under the terms of the Q    *)
+(*    Public License version 1.0.                                      *)
 (*                                                                     *)
 (***********************************************************************)
 
-(* $Id: emit.mlp 10293 2010-04-22 09:33:18Z xleroy $ *)
+(* $Id$ *)
 
 (* Emission of ARM assembly code *)
 
@@ -33,16 +34,28 @@ let fastcode_flag = ref true
 let emit_label lbl =
   emit_string ".L"; emit_int lbl
 
-(* Output a symbol *)
+let emit_data_label lbl =
+  emit_string ".Ld"; emit_int lbl
+
+(* Symbols *)
 
 let emit_symbol s =
   Emitaux.emit_symbol '$' s
 
+let emit_call s =
+  if !Clflags.dlcode || !pic_code
+  then `bl	{emit_symbol s}(PLT)`
+  else `bl	{emit_symbol s}`
+
+let emit_jump s =
+  if !Clflags.dlcode || !pic_code
+  then `b	{emit_symbol s}(PLT)`
+  else `b	{emit_symbol s}`
+
 (* Output a pseudo-register *)
 
-let emit_reg r =
-  match r.loc with
-  | Reg r -> emit_string (register_name r)
+let emit_reg = function
+    {loc = Reg r} -> emit_string (register_name r)
   | _ -> fatal_error "Emit_arm.emit_reg"
 
 (* Layout of the stack frame *)
@@ -53,14 +66,23 @@ let frame_size () =
   let sz =
     !stack_offset +
     4 * num_stack_slots.(0) +
+    8 * num_stack_slots.(1) +
+    8 * num_stack_slots.(2) +
     (if !contains_calls then 4 else 0)
   in Misc.align sz 8
 
 let slot_offset loc cl =
   match loc with
-    Incoming n -> frame_size() + n
-  | Local n -> !stack_offset + n * 4
-  | Outgoing n -> n
+    Incoming n ->
+      assert (n >= 0);
+      frame_size() + n
+  | Local n ->
+      if cl = 0
+      then !stack_offset + n * 4
+      else !stack_offset + num_stack_slots.(0) * 4 + n * 8
+  | Outgoing n ->
+      assert (n >= 0);
+      n
 
 (* Output a stack reference *)
 
@@ -79,20 +101,13 @@ let emit_addressing addr r n =
 
 (* Record live pointers at call points *)
 
-type frame_descr =
-  { fd_lbl: int;                        (* Return address *)
-    fd_frame_size: int;                 (* Size of stack frame *)
-    fd_live_offset: int list }          (* Offsets/regs of live addresses *)
-
-let frame_descriptors = ref([] : frame_descr list)
-
-let record_frame live =
+let record_frame_label live dbg =
   let lbl = new_label() in
   let live_offset = ref [] in
   Reg.Set.iter
     (function
         {typ = Addr; loc = Reg r} ->
-          live_offset := (r lsl 1) + 1 :: !live_offset
+          live_offset := ((r lsl 1) + 1) :: !live_offset
       | {typ = Addr; loc = Stack s} as reg ->
           live_offset := slot_offset s (register_class reg) :: !live_offset
       | _ -> ())
@@ -100,18 +115,57 @@ let record_frame live =
   frame_descriptors :=
     { fd_lbl = lbl;
       fd_frame_size = frame_size();
-      fd_live_offset = !live_offset } :: !frame_descriptors;
-  `{emit_label lbl}:`
-
-let emit_frame fd =
-  `	.word	{emit_label fd.fd_lbl} + 4\n`;
-  `	.short	{emit_int fd.fd_frame_size}\n`;
-  `	.short	{emit_int (List.length fd.fd_live_offset)}\n`;
-  List.iter
-    (fun n ->
-      `	.short	{emit_int n}\n`)
-    fd.fd_live_offset;
-  `	.align	2\n`
+      fd_live_offset = !live_offset;
+      fd_debuginfo = dbg } :: !frame_descriptors;
+  lbl
+
+let record_frame live dbg =
+  let lbl = record_frame_label live dbg in `{emit_label lbl}:`
+
+(* Record calls to the GC -- we've moved them out of the way *)
+
+type gc_call =
+  { gc_lbl: label;                      (* Entry label *)
+    gc_return_lbl: label;               (* Where to branch after GC *)
+    gc_frame_lbl: label }               (* Label of frame descriptor *)
+
+let call_gc_sites = ref ([] : gc_call list)
+
+let emit_call_gc gc =
+  `{emit_label gc.gc_lbl}:	{emit_call "caml_call_gc"}\n`;
+  `{emit_label gc.gc_frame_lbl}:	b	{emit_label gc.gc_return_lbl}\n`
+
+(* Record calls to caml_ml_array_bound_error.
+   In debug mode, we maintain one call to caml_ml_array_bound_error
+   per bound check site. Otherwise, we can share a single call. *)
+
+type bound_error_call =
+  { bd_lbl: label;                    (* Entry label *)
+    bd_frame_lbl: label }             (* Label of frame descriptor *)
+
+let bound_error_sites = ref ([] : bound_error_call list)
+
+let bound_error_label dbg =
+  if !Clflags.debug || !bound_error_sites = [] then begin
+    let lbl_bound_error = new_label() in
+    let lbl_frame = record_frame_label Reg.Set.empty dbg in
+    bound_error_sites :=
+      { bd_lbl = lbl_bound_error;
+        bd_frame_lbl = lbl_frame } :: !bound_error_sites;
+    lbl_bound_error
+  end else begin
+    let bd = List.hd !bound_error_sites in bd.bd_lbl
+  end
+
+let emit_call_bound_error bd =
+  `{emit_label bd.bd_lbl}:	{emit_call "caml_ml_array_bound_error"}\n`;
+  `{emit_label bd.bd_frame_lbl}:\n`
+
+(* Negate a comparison *)
+
+let negate_integer_comparison = function
+    Isigned cmp   -> Isigned(negate_comparison cmp)
+  | Iunsigned cmp -> Iunsigned(negate_comparison cmp)
 
 (* Names of various instructions *)
 
@@ -121,22 +175,13 @@ let name_for_comparison = function
   | Iunsigned Ceq -> "eq" | Iunsigned Cne -> "ne" | Iunsigned Cle -> "ls"
   | Iunsigned Cge -> "cs" | Iunsigned Clt -> "cc" | Iunsigned Cgt -> "hi"
 
-let name_for_float_comparison cmp neg =
-  match cmp with
-    Ceq -> if neg then "ne" else "eq"
-  | Cne -> if neg then "eq" else "ne"
-  | Cle -> if neg then "hi" else "ls"
-  | Cge -> if neg then "lt" else "ge"
-  | Clt -> if neg then "pl" else "mi"
-  | Cgt -> if neg then "le" else "gt"
-
 let name_for_int_operation = function
     Iadd -> "add"
   | Isub -> "sub"
   | Imul -> "mul"
-  | Iand  -> "and"
-  | Ior   -> "orr"
-  | Ixor  -> "eor"
+  | Iand -> "and"
+  | Ior  -> "orr"
+  | Ixor -> "eor"
   | _ -> assert false
 
 let name_for_shift_operation = function
@@ -145,60 +190,54 @@ let name_for_shift_operation = function
   | Iasr -> "asr"
   | _ -> assert false
 
-let name_for_shift_int_operation = function
-    Ishiftadd -> "add"
-  | Ishiftsub -> "sub"
-  | Ishiftsubrev -> "rsb"
-
-(* Recognize immediate operands *)
-
-(* Immediate operands are 8-bit immediate values, zero-extended, and rotated
-   right by 0, 2, 4, ... 30 bits.
-   We check only with 8-bit values shifted left 0 to 24 bits. *)
-
-let rec is_immed n shift =
-  shift <= 24 &&
-  (Nativeint.logand n (Nativeint.shift_left (Nativeint.of_int 0xFF) shift) = n
-   || is_immed n (shift + 2))
-
-let is_immediate n = is_immed n 0
-
 (* General functional to decompose a non-immediate integer constant
-   into 8-bit chunks shifted left 0 ... 24 bits *)
+   into 8-bit chunks shifted left 0 ... 30 bits. *)
 
 let decompose_intconst n fn =
   let i = ref n in
   let shift = ref 0 in
   let ninstr = ref 0 in
-  while !i <> 0n do
-    if Nativeint.to_int (Nativeint.shift_right !i !shift) land 3 = 0 then
+  while !i <> 0l do
+    if Int32.logand (Int32.shift_right !i !shift) 3l = 0l then
       shift := !shift + 2
     else begin
-      let mask = Nativeint.shift_left 0xFFn !shift in
-      let bits = Nativeint.logand !i mask in
-      fn bits;
+      let bits = Int32.logand !i (Int32.shift_left 0xffl !shift) in
+      i := Int32.sub !i bits;
       shift := !shift + 8;
-      i := Nativeint.sub !i bits;
-      incr ninstr
+      incr ninstr;
+      fn bits
     end
   done;
   !ninstr
 
 (* Load an integer constant into a register *)
 
-let emit_intconst r n =
-  let nr = Nativeint.lognot n in
+let emit_intconst dst n =
+  let nr = Int32.lognot n in
   if is_immediate n then begin
-    `	mov	{emit_reg r}, #{emit_nativeint n}\n`; 1
+    (* Use movs here to enable 16-bit T1 encoding *)
+    `	movs	{emit_reg dst}, #{emit_int32 n}\n`; 1
   end else if is_immediate nr then begin
-    `	mvn	{emit_reg r}, #{emit_nativeint nr}\n`; 1
+    `	mvn	{emit_reg dst}, #{emit_int32 nr}\n`; 1
+  end else if !arch > ARMv6 then begin
+    let nl = Int32.logand 0xffffl n in
+    let nh = Int32.logand 0xffffl (Int32.shift_right_logical n 16) in
+    if nh = 0l then begin
+      `	movw	{emit_reg dst}, #{emit_int32 nl}\n`; 1
+    end else if Int32.logand nl 0xffl = nl then begin
+      `	movs	{emit_reg dst}, #{emit_int32 nl}\n`;
+      `	movt	{emit_reg dst}, #{emit_int32 nh}\n`; 2
+    end else begin
+      `	movw	{emit_reg dst}, #{emit_int32 nl}\n`;
+      `	movt	{emit_reg dst}, #{emit_int32 nh}\n`; 2
+    end
   end else begin
     let first = ref true in
     decompose_intconst n
       (fun bits ->
         if !first
-        then `	mov	{emit_reg r}, #{emit_nativeint bits} @ {emit_nativeint n}\n`
-        else `	add	{emit_reg r}, {emit_reg r}, #{emit_nativeint bits}\n`;
+        then `	mov	{emit_reg dst}, #{emit_int32 bits} @ {emit_int32 n}\n`
+        else `	add	{emit_reg dst}, {emit_reg dst}, #{emit_int32 bits}\n`;
         first := false)
   end
 
@@ -206,46 +245,105 @@ let emit_intconst r n =
 
 let emit_stack_adjustment instr n =
   if n <= 0 then 0 else
-    decompose_intconst (Nativeint.of_int n)
+    decompose_intconst (Int32.of_int n)
       (fun bits ->
-        `	{emit_string instr}	sp, sp, #{emit_nativeint bits}\n`)
+        `	{emit_string instr}	sp, sp, #{emit_int32 bits}\n`)
 
 (* Name of current function *)
 let function_name = ref ""
 (* Entry point for tail recursive calls *)
 let tailrec_entry_point = ref 0
-(* Table of symbols referenced *)
-let symbol_constants = (Hashtbl.create 11 : (string, int) Hashtbl.t)
-(* Table of floating-point literals *)
-let float_constants = (Hashtbl.create 11 : (string, int) Hashtbl.t)
-(* Total space (in word) occupied by pending literals *)
+(* Pending floating-point literals *)
+let float_literals = ref ([] : (string * label) list)
+(* Pending relative references to the global offset table *)
+let gotrel_literals = ref ([] : (label * label) list)
+(* Pending symbol literals *)
+let symbol_literals = ref ([] : (string * label) list)
+(* Total space (in words) occupied by pending literals *)
 let num_literals = ref 0
 
-(* Label a symbol or float constant *)
-let label_constant tbl s size =
+(* Label a floating-point literal *)
+let float_literal f =
   try
-    Hashtbl.find tbl s
+    List.assoc f !float_literals
   with Not_found ->
     let lbl = new_label() in
-    Hashtbl.add tbl s lbl;
-    num_literals := !num_literals + size;
+    num_literals := !num_literals + 2;
+    float_literals := (f, lbl) :: !float_literals;
     lbl
 
-(* Emit all pending constants *)
-
-let emit_constants () =
-  Hashtbl.iter
-    (fun s lbl ->
-      `{emit_label lbl}:        .word   {emit_symbol s}\n`)
-    symbol_constants;
-  Hashtbl.iter
-    (fun s lbl ->
-      `{emit_label lbl}:        .double {emit_string s}\n`)
-    float_constants;
-  Hashtbl.clear symbol_constants;
-  Hashtbl.clear float_constants;
+(* Label a GOTREL literal *)
+let gotrel_literal l =
+  let lbl = new_label() in
+  num_literals := !num_literals + 1;
+  gotrel_literals := (l, lbl) :: !gotrel_literals;
+  lbl
+
+(* Label a symbol literal *)
+let symbol_literal s =
+  try
+    List.assoc s !symbol_literals
+  with Not_found ->
+    let lbl = new_label() in
+    num_literals := !num_literals + 1;
+    symbol_literals := (s, lbl) :: !symbol_literals;
+    lbl
+
+(* Emit all pending literals *)
+let emit_literals() =
+  if !float_literals <> [] then begin
+    `	.align	3\n`;
+    List.iter
+      (fun (f, lbl) ->
+        `{emit_label lbl}:	.double	{emit_string f}\n`)
+      !float_literals;
+    float_literals := []
+  end;
+  if !symbol_literals <> [] then begin
+    let offset = if !thumb then 4 else 8 in
+    let suffix = if !pic_code then "(GOT)" else "" in
+    `	.align	2\n`;
+    List.iter
+      (fun (l, lbl) ->
+        `{emit_label lbl}:	.word	_GLOBAL_OFFSET_TABLE_-({emit_label l}+{emit_int offset})\n`)
+      !gotrel_literals;
+    List.iter
+      (fun (s, lbl) ->
+        `{emit_label lbl}:	.word	{emit_symbol s}{emit_string suffix}\n`)
+      !symbol_literals;
+    gotrel_literals := [];
+    symbol_literals := []
+  end;
   num_literals := 0
 
+(* Emit code to load the address of a symbol *)
+
+let emit_load_symbol_addr dst s =
+  if !pic_code then begin
+    let lbl_pic = new_label() in
+    let lbl_got = gotrel_literal lbl_pic in
+    let lbl_sym = symbol_literal s in
+    (* Both r3 and r12 are marked as clobbered in PIC mode (cf. proc.ml),
+       so use r12 as temporary scratch register unless the destination is
+       r12, then we use r3 instead. *)
+    let tmp = if dst.loc = Reg 8 (*r12*)
+              then phys_reg 3 (*r3*)
+              else phys_reg 8 (*r12*) in
+    `	ldr	{emit_reg tmp}, {emit_label lbl_got}\n`;
+    `	ldr	{emit_reg dst}, {emit_label lbl_sym}\n`;
+    `{emit_label lbl_pic}:	add	{emit_reg tmp}, pc, {emit_reg tmp}\n`;
+    `	ldr	{emit_reg dst}, [{emit_reg tmp}, {emit_reg dst}] @ {emit_symbol s}\n`;
+    4
+  end else if !arch > ARMv6 && not !Clflags.dlcode && !fastcode_flag then begin
+    `	movw	{emit_reg dst}, #:lower16:{emit_symbol s}\n`;
+    `	movt	{emit_reg dst}, #:upper16:{emit_symbol s}\n`;
+    2
+  end else begin
+    let lbl = symbol_literal s in
+    `	ldr	{emit_reg dst}, {emit_label lbl} @ {emit_symbol s}\n`;
+    1
+  end
+
 (* Output the assembly code for an instruction *)
 
 let emit_instr i =
@@ -254,40 +352,76 @@ let emit_instr i =
     | Lop(Imove | Ispill | Ireload) ->
         let src = i.arg.(0) and dst = i.res.(0) in
         if src.loc = dst.loc then 0 else begin
-          match (src, dst) with
-            {loc = Reg rs; typ = Int|Addr}, {loc = Reg rd; typ = Int|Addr} ->
-              `	mov	{emit_reg dst}, {emit_reg src}\n`; 1
-          | {loc = Reg rs; typ = Int|Addr}, {loc = Stack sd} ->
-              `	str	{emit_reg src}, {emit_stack dst}\n`; 1
-          | {loc = Stack ss; typ = Int|Addr}, {loc = Reg rd} ->
-              `	ldr	{emit_reg dst}, {emit_stack src}\n`; 1
+          begin match (src, dst) with
+            {loc = Reg _; typ = Float}, {loc = Reg _} ->
+              `	fcpyd	{emit_reg dst}, {emit_reg src}\n`
+          | {loc = Reg _}, {loc = Reg _} ->
+              `	mov	{emit_reg dst}, {emit_reg src}\n`
+          | {loc = Reg _; typ = Float}, _ ->
+              `	fstd	{emit_reg src}, {emit_stack dst}\n`
+          | {loc = Reg _}, _ ->
+              `	str	{emit_reg src}, {emit_stack dst}\n`
+          | {typ = Float}, _ ->
+              `	fldd	{emit_reg dst}, {emit_stack src}\n`
           | _ ->
-              assert false
+              `	ldr	{emit_reg dst}, {emit_stack src}\n`
+          end; 1
         end
     | Lop(Iconst_int n) ->
-        emit_intconst i.res.(0) n
-    | Lop(Iconst_float s) ->
-        let bits = Int64.bits_of_float (float_of_string s) in
-        let high_bits = Int64.to_nativeint (Int64.shift_right_logical bits 32)
-        and low_bits = Int64.to_nativeint bits in
-        if is_immediate low_bits && is_immediate high_bits then begin
-          `	mov	{emit_reg i.res.(0)}, #{emit_nativeint low_bits} @ {emit_string s}\n`;
-          `	mov	{emit_reg i.res.(1)}, #{emit_nativeint high_bits}\n`;
-          2
+        emit_intconst i.res.(0) (Nativeint.to_int32 n)
+    | Lop(Iconst_float f) when !fpu = Soft ->
+        `	@ {emit_string f}\n`;
+        let bits = Int64.bits_of_float (float_of_string f) in
+        let high_bits = Int64.to_int32 (Int64.shift_right_logical bits 32)
+        and low_bits = Int64.to_int32 bits in
+        if is_immediate low_bits || is_immediate high_bits then begin
+          let ninstr_low = emit_intconst i.res.(0) low_bits
+          and ninstr_high = emit_intconst i.res.(1) high_bits in
+          ninstr_low + ninstr_high
         end else begin
-          let lbl = label_constant float_constants s 2 in
-          `	ldr	{emit_reg i.res.(0)}, {emit_label lbl} @ {emit_string s}\n`;
+          let lbl = float_literal f in
+          `	ldr	{emit_reg i.res.(0)}, {emit_label lbl}\n`;
           `	ldr	{emit_reg i.res.(1)}, {emit_label lbl} + 4\n`;
           2
         end
+    | Lop(Iconst_float f) ->
+        let encode imm =
+          let sg = Int64.to_int (Int64.shift_right_logical imm 63) in
+          let ex = Int64.to_int (Int64.shift_right_logical imm 52) in
+          let ex = (ex land 0x7ff) - 1023 in
+          let mn = Int64.logand imm 0xfffffffffffffL in
+          if Int64.logand mn 0xffffffffffffL <> 0L || ex < -3 || ex > 4
+          then
+            None
+          else begin
+            let mn = Int64.to_int (Int64.shift_right_logical mn 48) in
+            if mn land 0x0f <> mn then
+              None
+            else
+              let ex = ((ex + 3) land 0x07) lxor 0x04 in
+              Some((sg lsl 7) lor (ex lsl 4) lor mn)
+          end in
+        begin match encode (Int64.bits_of_float (float_of_string f)) with
+          None ->
+            let lbl = float_literal f in
+            `	fldd	{emit_reg i.res.(0)}, {emit_label lbl} @ {emit_string f}\n`
+        | Some imm8 ->
+            `	fconstd	{emit_reg i.res.(0)}, #{emit_int imm8} @ {emit_string f}\n`
+        end; 1
     | Lop(Iconst_symbol s) ->
-        let lbl = label_constant symbol_constants s 1 in
-        `	ldr	{emit_reg i.res.(0)}, {emit_label lbl} @ {emit_symbol s}\n`; 1
+        emit_load_symbol_addr i.res.(0) s
     | Lop(Icall_ind) ->
-        `	mov	lr, pc\n`;
-        `{record_frame i.live}  bx	{emit_reg i.arg.(0)}\n`; 2
+        if !arch >= ARMv5 then begin
+          `	blx	{emit_reg i.arg.(0)}\n`;
+          `{record_frame i.live i.dbg}\n`; 1
+        end else begin
+          `	mov	lr, pc\n`;
+          `	bx	{emit_reg i.arg.(0)}\n`;
+          `{record_frame i.live i.dbg}\n`; 2
+        end
     | Lop(Icall_imm s) ->
-        `{record_frame i.live}  bl      {emit_symbol s}\n`; 1
+        `	{emit_call s}\n`;
+        `{record_frame i.live i.dbg}\n`; 1
     | Lop(Itailcall_ind) ->
         let n = frame_size() in
         if !contains_calls then
@@ -303,17 +437,16 @@ let emit_instr i =
           if !contains_calls then
             `	ldr	lr, [sp, #{emit_int (n-4)}]\n`;
           let ninstr = emit_stack_adjustment "add" n in
-          `	b	{emit_symbol s}\n`;
+          `	{emit_jump s}\n`;
           2 + ninstr
         end
-    | Lop(Iextcall(s, alloc)) ->
-        if alloc then begin
-          let lbl = label_constant symbol_constants s 1 in
-          `	ldr	r12, {emit_label lbl} @ {emit_symbol s}\n`;
-          `{record_frame i.live}	bl	caml_c_call\n`; 2
-        end else begin
-          `	bl	{emit_symbol s}\n`; 1
-        end
+    | Lop(Iextcall(s, false)) ->
+        `	{emit_call s}\n`; 1
+    | Lop(Iextcall(s, true)) ->
+        let ninstr = emit_load_symbol_addr (phys_reg 7 (* r7 *)) s in
+        `	{emit_call "caml_c_call"}\n`;
+        `{record_frame i.live i.dbg}\n`;
+        1 + ninstr
     | Lop(Istackoffset n) ->
         assert (n mod 8 = 0);
         let ninstr =
@@ -322,16 +455,28 @@ let emit_instr i =
           else emit_stack_adjustment "add" (-n) in
         stack_offset := !stack_offset + n;
         ninstr
-    | Lop(Iload((Double | Double_u), addr)) ->
-        let addr' = offset_addressing addr 4 in
-        if i.res.(0).loc <> i.arg.(0).loc then begin
-          `	ldr	{emit_reg i.res.(0)}, {emit_addressing addr i.arg 0}\n`;
-          `	ldr	{emit_reg i.res.(1)}, {emit_addressing addr' i.arg 0}\n`
-        end else begin
-          `	ldr	{emit_reg i.res.(1)}, {emit_addressing addr' i.arg 0}\n`;
-          `	ldr	{emit_reg i.res.(0)}, {emit_addressing addr i.arg 0}\n`
-        end;
-        2
+    | Lop(Iload(Single, addr)) when !fpu >= VFPv3_D16 ->
+        `	flds	s14, {emit_addressing addr i.arg 0}\n`;
+        `	fcvtds	{emit_reg i.res.(0)}, s14\n`; 2
+    | Lop(Iload((Double | Double_u), addr)) when !fpu = Soft ->
+        (* Use LDM or LDRD if possible *)
+        begin match i.res.(0), i.res.(1), addr with
+          {loc = Reg rt}, {loc = Reg rt2}, Iindexed 0
+          when rt < rt2 ->
+            `	ldm	{emit_reg i.arg.(0)}, \{{emit_reg i.res.(0)}, {emit_reg i.res.(1)}}\n`; 1
+        | {loc = Reg rt}, {loc = Reg rt2}, addr
+          when !arch >= ARMv5TE && rt mod 2 == 0 && rt2 = rt + 1 ->
+            `	ldrd	{emit_reg i.res.(0)}, {emit_reg i.res.(1)}, {emit_addressing addr i.arg 0}\n`; 1
+        | _ ->
+            let addr' = offset_addressing addr 4 in
+            if i.res.(0).loc <> i.arg.(0).loc then begin
+              `	ldr	{emit_reg i.res.(0)}, {emit_addressing addr i.arg 0}\n`;
+              `	ldr	{emit_reg i.res.(1)}, {emit_addressing addr' i.arg 0}\n`
+            end else begin
+              `	ldr	{emit_reg i.res.(1)}, {emit_addressing addr' i.arg 0}\n`;
+              `	ldr	{emit_reg i.res.(0)}, {emit_addressing addr i.arg 0}\n`
+            end; 2
+        end
     | Lop(Iload(size, addr)) ->
         let r = i.res.(0) in
         let instr =
@@ -340,65 +485,114 @@ let emit_instr i =
           | Byte_signed -> "ldrsb"
           | Sixteen_unsigned -> "ldrh"
           | Sixteen_signed -> "ldrsh"
+          | Double
+          | Double_u -> "fldd"
           | _ (* 32-bit quantities *) -> "ldr" in
-        `	{emit_string	instr}     {emit_reg r}, {emit_addressing addr i.arg 0}\n`;
-        1
-    | Lop(Istore((Double | Double_u), addr)) ->
-        let addr' = offset_addressing addr 4 in
-        `	str	{emit_reg i.arg.(0)}, {emit_addressing addr i.arg 2}\n`;
-        `	str	{emit_reg i.arg.(1)}, {emit_addressing addr' i.arg 2}\n`;
-        2
+        `	{emit_string instr}	{emit_reg r}, {emit_addressing addr i.arg 0}\n`; 1
+    | Lop(Istore(Single, addr)) when !fpu >= VFPv3_D16 ->
+        `	fcvtsd	s14, {emit_reg i.arg.(0)}\n`;
+        `	fsts	s14, {emit_addressing addr i.arg 1}\n`; 2
+    | Lop(Istore((Double | Double_u), addr)) when !fpu = Soft ->
+        (* Use STM or STRD if possible *)
+        begin match i.arg.(0), i.arg.(1), addr with
+          {loc = Reg rt}, {loc = Reg rt2}, Iindexed 0
+          when rt < rt2 ->
+            `	stm	{emit_reg i.arg.(2)}, \{{emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}}\n`; 1
+        | {loc = Reg rt}, {loc = Reg rt2}, addr
+          when !arch >= ARMv5TE && rt mod 2 == 0 && rt2 = rt + 1 ->
+            `	strd	{emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}, {emit_addressing addr i.arg 2}\n`; 1
+        | _ ->
+            let addr' = offset_addressing addr 4 in
+            `	str	{emit_reg i.arg.(0)}, {emit_addressing addr i.arg 2}\n`;
+            `	str	{emit_reg i.arg.(1)}, {emit_addressing addr' i.arg 2}\n`; 2
+        end
     | Lop(Istore(size, addr)) ->
         let r = i.arg.(0) in
         let instr =
           match size with
-            Byte_unsigned | Byte_signed -> "strb"
-          | Sixteen_unsigned | Sixteen_signed -> "strh"
+            Byte_unsigned
+          | Byte_signed -> "strb"
+          | Sixteen_unsigned
+          | Sixteen_signed -> "strh"
+          | Double
+          | Double_u -> "fstd"
           | _ (* 32-bit quantities *) -> "str" in
-        `	{emit_string	instr}     {emit_reg r}, {emit_addressing addr i.arg 1}\n`;
-        1
+        `	{emit_string instr}	{emit_reg r}, {emit_addressing addr i.arg 1}\n`; 1
     | Lop(Ialloc n) ->
+        let lbl_frame = record_frame_label i.live i.dbg in
         if !fastcode_flag then begin
-          let ni = emit_intconst (phys_reg 8 (*r12*)) (Nativeint.of_int n) in
-          `	sub	alloc_ptr, alloc_ptr, r12\n`;
+          let lbl_redo = new_label() in
+          `{emit_label lbl_redo}:`;
+          let ninstr = decompose_intconst
+                         (Int32.of_int n)
+                         (fun i ->
+                           `	sub	alloc_ptr, alloc_ptr, #{emit_int32 i}\n`) in
           `	cmp	alloc_ptr, alloc_limit\n`;
-          `{record_frame i.live}	blcc    caml_call_gc\n`;
           `	add	{emit_reg i.res.(0)}, alloc_ptr, #4\n`;
-          4 + ni
-        end else if n = 8 || n = 12 || n = 16 then begin
-          `{record_frame i.live}	bl	caml_alloc{emit_int ((n-4)/4)}\n`;
-          `	add	{emit_reg i.res.(0)}, alloc_ptr, #4\n`; 2
+          let lbl_call_gc = new_label() in
+          `	bcc	{emit_label lbl_call_gc}\n`;
+          call_gc_sites :=
+            { gc_lbl = lbl_call_gc;
+              gc_return_lbl = lbl_redo;
+              gc_frame_lbl = lbl_frame } :: !call_gc_sites;
+          3 + ninstr
         end else begin
-          let ni = emit_intconst (phys_reg 8 (*r12*)) (Nativeint.of_int n) in
-          `{record_frame i.live}	bl	caml_allocN\n`;
-          `	add	{emit_reg i.res.(0)}, alloc_ptr, #4\n`;
-          2 + ni
+          let ninstr =
+            begin match n with
+               8 -> `	{emit_call "caml_alloc1"}\n`; 1
+            | 12 -> `	{emit_call "caml_alloc2"}\n`; 1
+            | 16 -> `	{emit_call "caml_alloc3"}\n`; 1
+            |  _ -> let ninstr = emit_intconst (phys_reg 7) (Int32.of_int n) in
+                    `	{emit_call "caml_allocN"}\n`; 1 + ninstr
+            end in
+          `{emit_label lbl_frame}:	add	{emit_reg i.res.(0)}, alloc_ptr, #4\n`;
+          1 + ninstr
         end
     | Lop(Iintop(Ilsl | Ilsr | Iasr as op)) ->
         let shift = name_for_shift_operation op in
         `	mov	{emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, {emit_string shift} {emit_reg i.arg.(1)}\n`; 1
     | Lop(Iintop(Icomp cmp)) ->
-        let comp = name_for_comparison cmp in
+        let compthen = name_for_comparison cmp in
+        let compelse = name_for_comparison (negate_integer_comparison cmp) in
         `	cmp	{emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}\n`;
-        `	mov	{emit_reg i.res.(0)}, #0\n`;
-        `	mov{emit_string	comp}   {emit_reg i.res.(0)}, #1\n`; 3
-    | Lop(Iintop(Icheckbound)) ->
+        `	ite	{emit_string compthen}\n`;
+        `	mov{emit_string	compthen}	{emit_reg i.res.(0)}, #1\n`;
+        `	mov{emit_string compelse}	{emit_reg i.res.(0)}, #0\n`; 4
+    | Lop(Iintop_imm(Icomp cmp, n)) ->
+        let compthen = name_for_comparison cmp in
+        let compelse = name_for_comparison (negate_integer_comparison cmp) in
+        `	cmp	{emit_reg i.arg.(0)}, #{emit_int n}\n`;
+        `	ite	{emit_string compthen}\n`;
+        `	mov{emit_string	compthen}	{emit_reg i.res.(0)}, #1\n`;
+        `	mov{emit_string compelse}	{emit_reg i.res.(0)}, #0\n`; 4
+    | Lop(Iintop Icheckbound) ->
+        let lbl = bound_error_label i.dbg in
         `	cmp	{emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}\n`;
-        `	blls	caml_ml_array_bound_error\n`; 2
+        `	bls	{emit_label lbl}\n`; 2
+    | Lop(Iintop_imm(Icheckbound, n)) ->
+        let lbl = bound_error_label i.dbg in
+        `	cmp	{emit_reg i.arg.(0)}, #{emit_int n}\n`;
+        `	bls	{emit_label lbl}\n`; 2
+    | Lop(Ispecific(Ishiftcheckbound shift)) ->
+        let lbl = bound_error_label i.dbg in
+        `	cmp	{emit_reg i.arg.(1)}, {emit_reg i.arg.(0)}, lsr #{emit_int shift}\n`;
+        `	bcs	{emit_label lbl}\n`; 2
     | Lop(Iintop op) ->
         let instr = name_for_int_operation op in
-        `	{emit_string	instr}     {emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}\n`; 1
+        `	{emit_string instr}     {emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}\n`; 1
     | Lop(Iintop_imm(Idiv, n)) -> (* n is a power of 2 *)
         let l = Misc.log2 n in
         let r = i.res.(0) in
         `	movs	{emit_reg r}, {emit_reg i.arg.(0)}\n`;
-        if n <= 256 then
+        if n <= 256 then begin
+          `	it	lt\n`;
           `	addlt	{emit_reg r}, {emit_reg r}, #{emit_int (n-1)}\n`
-        else begin
+        end else begin
+          `	itt	lt\n`;
           `	addlt	{emit_reg r}, {emit_reg r}, #{emit_int n}\n`;
           `	sublt	{emit_reg r}, {emit_reg r}, #1\n`
         end;
-        `	mov	{emit_reg r}, {emit_reg r}, asr #{emit_int l}\n`; 4
+        `	mov	{emit_reg r}, {emit_reg r}, asr #{emit_int l}\n`; 5
     | Lop(Iintop_imm(Imod, n)) -> (* n is a power of 2 *)
         let l = Misc.log2 n in
         let a = i.arg.(0) in
@@ -409,40 +603,71 @@ let emit_instr i =
         `	mov	{emit_reg r}, {emit_reg r}, lsr #{emit_int (32-l)}\n`;
         `	bpl	{emit_label lbl}\n`;
         `	cmp	{emit_reg r}, #0\n`;
+        `	it	ne\n`;
         `	subne	{emit_reg r}, {emit_reg r}, #{emit_int n}\n`;
-        `{emit_label lbl}:\n`; 6
+        `{emit_label lbl}:\n`; 7
     | Lop(Iintop_imm((Ilsl | Ilsr | Iasr as op), n)) ->
         let shift = name_for_shift_operation op in
         `	mov	{emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, {emit_string shift} #{emit_int n}\n`; 1
-    | Lop(Iintop_imm(Icomp cmp, n)) ->
-        let comp = name_for_comparison cmp in
-        `	cmp	{emit_reg i.arg.(0)}, #{emit_int n}\n`;
-        `	mov	{emit_reg i.res.(0)}, #0\n`;
-        `	mov{emit_string	comp}   {emit_reg i.res.(0)}, #1\n`; 3
-    | Lop(Iintop_imm(Icheckbound, n)) ->
-        `	cmp	{emit_reg i.arg.(0)}, #{emit_int n}\n`;
-        `	blls	caml_ml_array_bound_error\n`; 2
     | Lop(Iintop_imm(op, n)) ->
         let instr = name_for_int_operation op in
-        `	{emit_string	instr}     {emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, #{emit_int n}\n`; 1
-    | Lop(Inegf) -> (* argument and result in (r0, r1) *)
-        `	eor     r1, r1, #0x80000000\n`; 1
-    | Lop(Iabsf) -> (* argument and result in (r0, r1) *)
-        `	bic     r1, r1, #0x80000000\n`; 1
-    | Lop(Ifloatofint | Iintoffloat | Iaddf | Isubf | Imulf | Idivf) ->
-        assert false
+        `	{emit_string instr}     {emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, #{emit_int n}\n`; 1
+    | Lop(Iabsf | Inegf as op) when !fpu = Soft ->
+        let instr = (match op with
+                       Iabsf -> "bic"
+                     | Inegf -> "eor"
+                     | _     -> assert false) in
+        `	{emit_string instr}	{emit_reg i.res.(1)}, {emit_reg i.arg.(1)}, #0x80000000\n`; 1
+    | Lop(Iabsf | Inegf | Ispecific Isqrtf as op) ->
+        let instr = (match op with
+                       Iabsf            -> "fabsd"
+                     | Inegf            -> "fnegd"
+                     | Ispecific Isqrtf -> "fsqrtd"
+                     | _                -> assert false) in
+        `	{emit_string instr}	{emit_reg i.res.(0)}, {emit_reg i.arg.(0)}\n`; 1
+    | Lop(Ifloatofint) ->
+        `	fmsr	s14, {emit_reg i.arg.(0)}\n`;
+        `	fsitod	{emit_reg i.res.(0)}, s14\n`; 2
+    | Lop(Iintoffloat) ->
+        `	ftosizd	s14, {emit_reg i.arg.(0)}\n`;
+        `	fmrs	{emit_reg i.res.(0)}, s14\n`; 2
+    | Lop(Iaddf | Isubf | Imulf | Idivf | Ispecific Inegmulf as op) ->
+        let instr = (match op with
+                       Iaddf              -> "faddd"
+                     | Isubf              -> "fsubd"
+                     | Imulf              -> "fmuld"
+                     | Idivf              -> "fdivd"
+                     | Ispecific Inegmulf -> "fnmuld"
+                     | _                  -> assert false) in
+        `	{emit_string instr}	{emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}\n`;
+        1
+    | Lop(Ispecific(Imuladdf | Inegmuladdf | Imulsubf | Inegmulsubf as op)) ->
+        let instr = (match op with
+                       Imuladdf    -> "fmacd"
+                     | Inegmuladdf -> "fnmacd"
+                     | Imulsubf    -> "fmscd"
+                     | Inegmulsubf -> "fnmscd"
+                     | _ -> assert false) in
+        `	{emit_string instr}	{emit_reg i.res.(0)}, {emit_reg i.arg.(1)}, {emit_reg i.arg.(2)}\n`;
+        1
     | Lop(Ispecific(Ishiftarith(op, shift))) ->
-        let instr = name_for_shift_int_operation op in
-        `	{emit_string	instr}     {emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}`;
+        let instr = (match op with
+                       Ishiftadd    -> "add"
+                     | Ishiftsub    -> "sub"
+                     | Ishiftsubrev -> "rsb") in
+        `	{emit_string instr}	{emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}`;
         if shift >= 0
         then `, lsl #{emit_int shift}\n`
         else `, asr #{emit_int (-shift)}\n`;
         1
-    | Lop(Ispecific(Ishiftcheckbound shift)) ->
-        `	cmp	{emit_reg i.arg.(1)}, {emit_reg i.arg.(0)}, lsr #{emit_int shift}\n`;
-        `	blcs	caml_ml_array_bound_error\n`; 2
     | Lop(Ispecific(Irevsubimm n)) ->
         `	rsb	{emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, #{emit_int n}\n`; 1
+    | Lop(Ispecific(Imuladd | Imulsub as op)) ->
+        let instr = (match op with
+                       Imuladd -> "mla"
+                     | Imulsub -> "mls"
+                     | _ -> assert false) in
+        `	{emit_string instr}	{emit_reg i.res.(0)}, {emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}, {emit_reg i.arg.(2)}\n`; 1
     | Lreloadretaddr ->
         let n = frame_size() in
         `	ldr	lr, [sp, #{emit_int(n-4)}]\n`; 1
@@ -458,29 +683,41 @@ let emit_instr i =
         begin match tst with
           Itruetest ->
             `	cmp	{emit_reg i.arg.(0)}, #0\n`;
-            `	bne	{emit_label lbl}\n`
+            `	bne	{emit_label lbl}\n`; 2
         | Ifalsetest ->
             `	cmp	{emit_reg i.arg.(0)}, #0\n`;
-            `	beq	{emit_label lbl}\n`
+            `	beq	{emit_label lbl}\n`; 2
         | Iinttest cmp ->
             `	cmp	{emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}\n`;
             let comp = name_for_comparison cmp in
-            `	b{emit_string comp}	{emit_label lbl}\n`
+            `	b{emit_string comp}	{emit_label lbl}\n`; 2
         | Iinttest_imm(cmp, n) ->
             `	cmp	{emit_reg i.arg.(0)}, #{emit_int n}\n`;
             let comp = name_for_comparison cmp in
-            `	b{emit_string comp}	{emit_label lbl}\n`
+            `	b{emit_string comp}	{emit_label lbl}\n`; 2
         | Ifloattest(cmp, neg) ->
-            assert false
+            let comp = (match (cmp, neg) with
+                          (Ceq, false) | (Cne, true) -> "eq"
+                        | (Cne, false) | (Ceq, true) -> "ne"
+                        | (Clt, false) -> "cc"
+                        | (Clt, true)  -> "cs"
+                        | (Cle, false) -> "ls"
+                        | (Cle, true)  -> "hi"
+                        | (Cgt, false) -> "gt"
+                        | (Cgt, true)  -> "le"
+                        | (Cge, false) -> "ge"
+                        | (Cge, true)  -> "lt") in
+            `	fcmpd	{emit_reg i.arg.(0)}, {emit_reg i.arg.(1)}\n`;
+            `	fmstat\n`;
+            `	b{emit_string comp}	{emit_label lbl}\n`; 3
         | Ioddtest ->
             `	tst	{emit_reg i.arg.(0)}, #1\n`;
-            `	bne	{emit_label lbl}\n`
+            `	bne	{emit_label lbl}\n`; 2
         | Ieventest ->
             `	tst	{emit_reg i.arg.(0)}, #1\n`;
-            `	beq	{emit_label lbl}\n`
-        end;
-        2
-  | Lcondbranch3(lbl0, lbl1, lbl2) ->
+            `	beq	{emit_label lbl}\n`; 2
+        end
+    | Lcondbranch3(lbl0, lbl1, lbl2) ->
         `	cmp	{emit_reg i.arg.(0)}, #1\n`;
         begin match lbl0 with
           None -> ()
@@ -495,108 +732,144 @@ let emit_instr i =
         | Some lbl -> `	bgt	{emit_label lbl}\n`
         end;
         4
-  | Lswitch jumptbl ->
-        `	ldr	pc, [pc, {emit_reg i.arg.(0)}, lsl #2]\n`;
-        `	mov	r0, r0\n`;      (* nop *)
-        for i = 0 to Array.length jumptbl - 1 do
-          `	.word	{emit_label jumptbl.(i)}\n`
-        done;
+    | Lswitch jumptbl ->
+        if !arch > ARMv6 && !thumb then begin
+          (* The Thumb-2 TBH instruction supports only forward branches,
+             so we need to generate appropriate trampolines for all labels
+             that appear before this switch instruction (PR#5623) *)
+          let tramtbl = Array.copy jumptbl in
+          `	tbh	[pc, {emit_reg i.arg.(0)}, lsl #1]\n`;
+          for j = 0 to Array.length tramtbl - 1 do
+            let rec label i =
+              match i.desc with
+                Lend -> new_label()
+              | Llabel lbl when lbl = tramtbl.(j) -> lbl
+              | _ -> label i.next in
+            tramtbl.(j) <- label i.next;
+            `	.short	({emit_label tramtbl.(j)}-.)/2+{emit_int j}\n`
+          done;
+          (* Generate the necessary trampolines *)
+          for j = 0 to Array.length tramtbl - 1 do
+            if tramtbl.(j) <> jumptbl.(j) then
+              `{emit_label tramtbl.(j)}:	b	{emit_label jumptbl.(j)}\n`
+          done
+        end else if not !pic_code then begin
+          `	ldr	pc, [pc, {emit_reg i.arg.(0)}, lsl #2]\n`;
+          `	nop\n`;
+          for j = 0 to Array.length jumptbl - 1 do
+            `	.word	{emit_label jumptbl.(j)}\n`
+          done
+        end else begin
+          (* Slightly slower, but position-independent *)
+          `	add	pc, pc, {emit_reg i.arg.(0)}, lsl #2\n`;
+          `	nop\n`;
+          for j = 0 to Array.length jumptbl - 1 do
+            `	b	{emit_label jumptbl.(j)}\n`
+          done
+        end;
         2 + Array.length jumptbl
     | Lsetuptrap lbl ->
         `	bl	{emit_label lbl}\n`; 1
     | Lpushtrap ->
         stack_offset := !stack_offset + 8;
-        `	stmfd	sp!, \{trap_ptr, lr}\n`;
+        `	push	\{trap_ptr, lr}\n`;
         `	mov	trap_ptr, sp\n`; 2
     | Lpoptrap ->
-        `	ldmfd	sp!, \{trap_ptr, lr}\n`;
+        `	pop	\{trap_ptr, lr}\n`;
         stack_offset := !stack_offset - 8; 1
     | Lraise ->
-        `	mov	sp, trap_ptr\n`;
-        `	ldmfd	sp!, \{trap_ptr, pc}\n`; 2
+        if !Clflags.debug then begin
+          `	{emit_call "caml_raise_exn"}\n`;
+          `{record_frame Reg.Set.empty i.dbg}\n`; 1
+        end else begin
+          `	mov	sp, trap_ptr\n`;
+          `	pop	\{trap_ptr, pc}\n`; 2
+        end
 
 (* Emission of an instruction sequence *)
 
-let no_fallthrough = function
-    Lop(Itailcall_ind | Itailcall_imm _) -> true
-  | Lreturn -> true
-  | Lbranch _ -> true
-  | Lswitch _ -> true
-  | Lraise -> true
-  | _ -> false
-
 let rec emit_all ninstr i =
   if i.desc = Lend then () else begin
     let n = emit_instr i in
     let ninstr' = ninstr + n in
-    let limit = 511 - !num_literals in
-    if ninstr' >= limit - 64 && no_fallthrough i.desc then begin
-      emit_constants();
+    (* fldd can address up to +/-1KB, ldr can address up to +/-4KB *)
+    let limit = (if !fpu >= VFPv3_D16 && !float_literals <> []
+                 then 127
+                 else 511) in
+    let limit = limit - !num_literals in
+    if ninstr' >= limit - 64 && not(has_fallthrough i.desc) then begin
+      emit_literals();
       emit_all 0 i.next
-    end else
-    if ninstr' >= limit then begin
+    end else if !num_literals != 0 && ninstr' >= limit then begin
       let lbl = new_label() in
       `	b	{emit_label lbl}\n`;
-      emit_constants();
+      emit_literals();
       `{emit_label lbl}:\n`;
       emit_all 0 i.next
     end else
       emit_all ninstr' i.next
   end
 
+(* Emission of the profiling prelude *)
+
+let emit_profile() =
+  match Config.system with
+    "linux_eabi" | "linux_eabihf" ->
+      `	push	\{lr}\n`;
+      `	{emit_call "__gnu_mcount_nc"}\n`
+  | _ -> ()
+
 (* Emission of a function declaration *)
 
 let fundecl fundecl =
   function_name := fundecl.fun_name;
   fastcode_flag := fundecl.fun_fast;
   tailrec_entry_point := new_label();
+  float_literals := [];
+  gotrel_literals := [];
+  symbol_literals := [];
   stack_offset := 0;
-  Hashtbl.clear symbol_constants;
-  Hashtbl.clear float_constants;
+  call_gc_sites := [];
+  bound_error_sites := [];
   `	.text\n`;
   `	.align	2\n`;
-  `	.global	{emit_symbol fundecl.fun_name}\n`;
+  `	.globl	{emit_symbol fundecl.fun_name}\n`;
+  if !arch > ARMv6 && !thumb then
+    `	.thumb\n`
+  else
+    `	.arm\n`;
   `	.type	{emit_symbol fundecl.fun_name}, %function\n`;
   `{emit_symbol fundecl.fun_name}:\n`;
+  if !Clflags.gprofile then emit_profile();
   let n = frame_size() in
   ignore(emit_stack_adjustment "sub" n);
   if !contains_calls then
     `	str	lr, [sp, #{emit_int(n - 4)}]\n`;
   `{emit_label !tailrec_entry_point}:\n`;
   emit_all 0 fundecl.fun_body;
-  emit_constants()
+  emit_literals();
+  List.iter emit_call_gc !call_gc_sites;
+  List.iter emit_call_bound_error !bound_error_sites;
+  `	.type	{emit_symbol fundecl.fun_name}, %function\n`;
+  `	.size	{emit_symbol fundecl.fun_name}, .-{emit_symbol fundecl.fun_name}\n`
 
 (* Emission of data *)
 
 let emit_item = function
-    Cglobal_symbol s ->
-      `	.global	{emit_symbol s}\n`;
-  | Cdefine_symbol s ->
-      `{emit_symbol s}:\n`
-  | Cdefine_label lbl ->
-      `{emit_label (100000 + lbl)}:\n`
-  | Cint8 n ->
-      `	.byte	{emit_int n}\n`
-  | Cint16 n ->
-      `	.short	{emit_int n}\n`
-  | Cint32 n ->
-      `	.word	{emit_nativeint	n}\n`
-  | Cint n ->
-      `	.word	{emit_nativeint n}\n`
-  | Csingle f ->
-      emit_float32_directive ".long" f
-  | Cdouble f ->
-      emit_float64_split_directive ".long" f
-  | Csymbol_address s ->
-      `	.word	{emit_symbol s}\n`
-  | Clabel_address lbl ->
-      `	.word	{emit_label (100000 + lbl)}\n`
-  | Cstring s ->
-      emit_string_directive "	.ascii  " s
-  | Cskip n ->
-      if n > 0 then `	.space	{emit_int n}\n`
-  | Calign n ->
-      `	.align	{emit_int(Misc.log2 n)}\n`
+    Cglobal_symbol s -> `	.globl	{emit_symbol s}\n`;
+  | Cdefine_symbol s -> `{emit_symbol s}:\n`
+  | Cdefine_label lbl -> `{emit_data_label lbl}:\n`
+  | Cint8 n -> `	.byte	{emit_int n}\n`
+  | Cint16 n -> `	.short	{emit_int n}\n`
+  | Cint32 n -> `	.long	{emit_int32 (Nativeint.to_int32 n)}\n`
+  | Cint n -> `	.long	{emit_int32 (Nativeint.to_int32 n)}\n`
+  | Csingle f -> `	.single	{emit_string f}\n`
+  | Cdouble f -> `	.double	{emit_string f}\n`
+  | Csymbol_address s -> `	.word	{emit_symbol s}\n`
+  | Clabel_address lbl -> `	.word	{emit_data_label lbl}\n`
+  | Cstring s -> emit_string_directive "	.ascii  " s
+  | Cskip n -> if n > 0 then `	.space	{emit_int n}\n`
+  | Calign n -> `	.align	{emit_int(Misc.log2 n)}\n`
 
 let data l =
   `	.data\n`;
@@ -605,32 +878,62 @@ let data l =
 (* Beginning / end of an assembly file *)
 
 let begin_assembly() =
-  `trap_ptr     .req    r11\n`;
-  `alloc_ptr    .req    r8\n`;
-  `alloc_limit  .req    r10\n`;
+  `	.syntax	unified\n`;
+  begin match !arch with
+  | ARMv4   -> `	.arch	armv4t\n`
+  | ARMv5   -> `	.arch	armv5t\n`
+  | ARMv5TE -> `	.arch	armv5te\n`
+  | ARMv6   -> `	.arch	armv6\n`
+  | ARMv6T2 -> `	.arch	armv6t2\n`
+  | ARMv7   -> `	.arch	armv7-a\n`
+  end;
+  begin match !fpu with
+    Soft      -> `	.fpu	softvfp\n`
+  | VFPv3_D16 -> `	.fpu	vfpv3-d16\n`
+  | VFPv3     -> `	.fpu	vfpv3\n`
+  end;
+  `trap_ptr	.req	r8\n`;
+  `alloc_ptr	.req	r10\n`;
+  `alloc_limit	.req	r11\n`;
   let lbl_begin = Compilenv.make_symbol (Some "data_begin") in
   `	.data\n`;
-  `	.global	{emit_symbol lbl_begin}\n`;
+  `	.globl	{emit_symbol lbl_begin}\n`;
   `{emit_symbol lbl_begin}:\n`;
   let lbl_begin = Compilenv.make_symbol (Some "code_begin") in
   `	.text\n`;
-  `	.global	{emit_symbol lbl_begin}\n`;
+  `	.globl	{emit_symbol lbl_begin}\n`;
   `{emit_symbol lbl_begin}:\n`
 
 let end_assembly () =
   let lbl_end = Compilenv.make_symbol (Some "code_end") in
   `	.text\n`;
-  `	.global	{emit_symbol lbl_end}\n`;
+  `	.globl	{emit_symbol lbl_end}\n`;
   `{emit_symbol lbl_end}:\n`;
   let lbl_end = Compilenv.make_symbol (Some "data_end") in
   `	.data\n`;
-  `	.global	{emit_symbol lbl_end}\n`;
+  `	.globl	{emit_symbol lbl_end}\n`;
   `{emit_symbol lbl_end}:\n`;
-  `	.word	0\n`;
+  `	.long	0\n`;
   let lbl = Compilenv.make_symbol (Some "frametable") in
-  `	.data\n`;
-  `	.global	{emit_symbol lbl}\n`;
+  `	.globl	{emit_symbol lbl}\n`;
   `{emit_symbol lbl}:\n`;
-  `	.word	{emit_int (List.length !frame_descriptors)}\n`;
-  List.iter emit_frame !frame_descriptors;
-  frame_descriptors := []
+  emit_frames
+    { efa_label = (fun lbl ->
+                       `	.type	{emit_label lbl}, %function\n`;
+                       `	.word	{emit_label lbl}\n`);
+      efa_16 = (fun n -> `	.short	{emit_int n}\n`);
+      efa_32 = (fun n -> `	.long	{emit_int32 n}\n`);
+      efa_word = (fun n -> `	.word	{emit_int n}\n`);
+      efa_align = (fun n -> `	.align	{emit_int(Misc.log2 n)}\n`);
+      efa_label_rel = (fun lbl ofs ->
+                           `	.word	{emit_label lbl} - . + {emit_int32 ofs}\n`);
+      efa_def_label = (fun lbl -> `{emit_label lbl}:\n`);
+      efa_string = (fun s -> emit_string_directive "	.asciz	" s) };
+  `	.type	{emit_symbol lbl}, %object\n`;
+  `	.size	{emit_symbol lbl}, .-{emit_symbol lbl}\n`;
+  begin match Config.system with
+    "linux_eabihf" | "linux_eabi" ->
+      (* Mark stack as non-executable *)
+      `	.section	.note.GNU-stack,\"\",%progbits\n`
+  | _ -> ()
+  end
diff --git a/asmcomp/arm/proc.ml b/asmcomp/arm/proc.ml
index e56ac6e..aed2b01 100644
--- a/asmcomp/arm/proc.ml
+++ b/asmcomp/arm/proc.ml
@@ -1,16 +1,17 @@
 (***********************************************************************)
 (*                                                                     *)
-(*                           Objective Caml                            *)
+(*                                OCaml                                *)
 (*                                                                     *)
-(*            Xavier Leroy, projet Cristal, INRIA Rocquencourt         *)
+(*                  Benedikt Meurer, University of Siegen              *)
 (*                                                                     *)
-(*  Copyright 1998 Institut National de Recherche en Informatique et   *)
-(*  en Automatique.  All rights reserved.  This file is distributed    *)
-(*  under the terms of the Q Public License version 1.0.               *)
+(*    Copyright 1998 Institut National de Recherche en Informatique    *)
+(*    et en Automatique. Copyright 2012 Benedikt Meurer. All rights    *)
+(*    reserved.  This file is distributed  under the terms of the Q    *)
+(*    Public License version 1.0.                                      *)
 (*                                                                     *)
 (***********************************************************************)
 
-(* $Id: proc.ml 9252 2009-05-04 13:46:46Z xleroy $ *)
+(* $Id$ *)
 
 (* Description of the ARM processor *)
 
@@ -26,32 +27,56 @@ let word_addressed = false
 
 (* Registers available for register allocation *)
 
-(* Register map:
-    r0 - r3                     general purpose (not preserved by C)
-    r4 - r7                     general purpose (preserved)
-    r8                          allocation pointer (preserved)
-    r9                          platform register, usually reserved
-    r10                         allocation limit (preserved)
-    r11                         trap pointer (preserved)
-    r12                         general purpose (not preserved by C)
-    r13                         stack pointer
-    r14                         return address
-    r15                         program counter
+(* Integer register map:
+    r0 - r3               general purpose (not preserved)
+    r4 - r7               general purpose (preserved)
+    r8                    trap pointer (preserved)
+    r9                    platform register, usually reserved
+    r10                   allocation pointer (preserved)
+    r11                   allocation limit (preserved)
+    r12                   intra-procedural scratch register (not preserved)
+    r13                   stack pointer
+    r14                   return address
+    r15                   program counter
+   Floatinng-point register map (VFPv3):
+    d0 - d7               general purpose (not preserved)
+    d8 - d15              general purpose (preserved)
+    d16 - d31             generat purpose (not preserved), VFPv3 only
 *)
 
-let int_reg_name = [|
-  "r0"; "r1"; "r2"; "r3"; "r4"; "r5"; "r6"; "r7"; "r12"
-|]
+let int_reg_name =
+  [| "r0"; "r1"; "r2"; "r3"; "r4"; "r5"; "r6"; "r7"; "r12" |]
+
+let float_reg_name =
+  [| "d0";  "d1";  "d2";  "d3";  "d4";  "d5";  "d6";  "d7";
+     "d8";  "d9";  "d10"; "d11"; "d12"; "d13"; "d14"; "d15";
+     "d16"; "d17"; "d18"; "d19"; "d20"; "d21"; "d22"; "d23";
+     "d24"; "d25"; "d26"; "d27"; "d28"; "d29"; "d30"; "d31" |]
+
+(* We have three register classes:
+    0 for integer registers
+    1 for VFPv3-D16
+    2 for VFPv3
+   This way we can choose between VFPv3-D16 and VFPv3
+   at (ocamlopt) runtime using command line switches.
+*)
 
-let num_register_classes = 1
+let num_register_classes = 3
 
-let register_class r = assert (r.typ <> Float); 0
+let register_class r =
+  match (r.typ, !fpu) with
+    (Int | Addr), _  -> 0
+  | Float, VFPv3_D16 -> 1
+  | Float, _         -> 2
 
-let num_available_registers = [| 9 |]
+let num_available_registers =
+  [| 9; 16; 32 |]
 
-let first_available_register = [| 0 |]
+let first_available_register =
+  [| 0; 100; 100 |]
 
-let register_name r = int_reg_name.(r)
+let register_name r =
+  if r < 100 then int_reg_name.(r) else float_reg_name.(r - 100)
 
 let rotate_registers = true
 
@@ -59,25 +84,34 @@ let rotate_registers = true
 
 let hard_int_reg =
   let v = Array.create 9 Reg.dummy in
-  for i = 0 to 8 do v.(i) <- Reg.at_location Int (Reg i) done;
+  for i = 0 to 8 do
+    v.(i) <- Reg.at_location Int (Reg i)
+  done;
+  v
+
+let hard_float_reg =
+  let v = Array.create 32 Reg.dummy in
+  for i = 0 to 31 do
+    v.(i) <- Reg.at_location Float (Reg(100 + i))
+  done;
   v
 
-let all_phys_regs = hard_int_reg
+let all_phys_regs =
+  Array.append hard_int_reg hard_float_reg
 
-let phys_reg n = all_phys_regs.(n)
+let phys_reg n =
+  if n < 100 then hard_int_reg.(n) else hard_float_reg.(n - 100)
 
 let stack_slot slot ty =
-  assert (ty <> Float);
   Reg.at_location ty (Stack slot)
 
 (* Calling conventions *)
 
-(* XXX float types have already been expanded into pairs of integers.
-   So we cannot align these floats.  See if that causes a problem. *)
-
-let calling_conventions first_int last_int make_stack arg =
+let calling_conventions
+    first_int last_int first_float last_float make_stack arg =
   let loc = Array.create (Array.length arg) Reg.dummy in
   let int = ref first_int in
+  let float = ref first_float in
   let ofs = ref 0 in
   for i = 0 to Array.length arg - 1 do
     match arg.(i).typ with
@@ -90,37 +124,86 @@ let calling_conventions first_int last_int make_stack arg =
           ofs := !ofs + size_int
         end
     | Float ->
-        assert false
+        assert (abi = EABI_VFP);
+        assert (!fpu >= VFPv3_D16);
+        if !float <= last_float then begin
+          loc.(i) <- phys_reg !float;
+          incr float
+        end else begin
+          ofs := Misc.align !ofs size_float;
+          loc.(i) <- stack_slot (make_stack !ofs) Float;
+          ofs := !ofs + size_float
+        end
   done;
-  (loc, Misc.align !ofs 8)
+  (loc, Misc.align !ofs 8)  (* keep stack 8-aligned *)
 
 let incoming ofs = Incoming ofs
 let outgoing ofs = Outgoing ofs
 let not_supported ofs = fatal_error "Proc.loc_results: cannot call"
 
+(* OCaml calling convention:
+     first integer args in r0...r7
+     first float args in d0...d15 (EABI+VFP)
+     remaining args on stack.
+   Return values in r0...r7 or d0...d15. *)
+
 let loc_arguments arg =
-  calling_conventions 0 7 outgoing arg
+  calling_conventions 0 7 100 115 outgoing arg
 let loc_parameters arg =
-  let (loc, ofs) = calling_conventions 0 7 incoming arg in loc
+  let (loc, _) = calling_conventions 0 7 100 115 incoming arg in loc
 let loc_results res =
-  let (loc, ofs) = calling_conventions 0 7 not_supported res in loc
+  let (loc, _) = calling_conventions 0 7 100 115 not_supported res in loc
+
+(* C calling convention:
+     first integer args in r0...r3
+     first float args in d0...d7 (EABI+VFP)
+     remaining args on stack.
+   Return values in r0...r1 or d0. *)
 
 let loc_external_arguments arg =
-  calling_conventions 0 3 outgoing arg
+  calling_conventions 0 3 100 107 outgoing arg
 let loc_external_results res =
-  let (loc, ofs) = calling_conventions 0 1 not_supported res in loc
+  let (loc, _) = calling_conventions 0 1 100 100 not_supported res in loc
 
 let loc_exn_bucket = phys_reg 0
 
 (* Registers destroyed by operations *)
 
-let destroyed_at_c_call =               (* r4-r7 preserved *)
-  Array.of_list(List.map phys_reg [0;1;2;3;8])
+let destroyed_at_alloc =            (* r0-r6, d0-d15 preserved *)
+  Array.of_list (List.map
+                   phys_reg
+                   [7;8;
+                    116;116;118;119;120;121;122;123;
+                    124;125;126;127;128;129;130;131])
+
+let destroyed_at_c_call =
+  Array.of_list (List.map
+                   phys_reg
+                   (match abi with
+                      EABI ->       (* r4-r7 preserved *)
+                        [0;1;2;3;8;
+                         100;101;102;103;104;105;106;107;
+                         108;109;110;111;112;113;114;115;
+                         116;116;118;119;120;121;122;123;
+                         124;125;126;127;128;129;130;131]
+                    | EABI_VFP ->   (* r4-r7, d8-d15 preserved *)
+                        [0;1;2;3;8;
+                         100;101;102;103;104;105;106;107;
+                         116;116;118;119;120;121;122;123;
+                         124;125;126;127;128;129;130;131]))
 
 let destroyed_at_oper = function
-    Iop(Icall_ind | Icall_imm _ | Iextcall(_, true)) -> all_phys_regs
-  | Iop(Iextcall(_, false)) -> destroyed_at_c_call
-  | Iop(Ialloc(_)) -> [|phys_reg 8|]    (* r12 destroyed *)
+    Iop(Icall_ind | Icall_imm _ )
+  | Iop(Iextcall(_, true)) ->
+      all_phys_regs
+  | Iop(Iextcall(_, false)) ->
+      destroyed_at_c_call
+  | Iop(Ialloc n) ->
+      destroyed_at_alloc
+  | Iop(Iconst_symbol _) when !pic_code ->
+      [|phys_reg 3; phys_reg 8|]  (* r3 and r12 destroyed *)
+  | Iop(Iintoffloat | Ifloatofint | Iload(Single, _) | Istore(Single, _)) ->
+      [|phys_reg 107|]            (* d7 (s14-s15) destroyed *)
   | _ -> [||]
 
 let destroyed_at_raise = all_phys_regs
@@ -128,15 +211,16 @@ let destroyed_at_raise = all_phys_regs
 (* Maximal register pressure *)
 
 let safe_register_pressure = function
-    Iextcall(_, _) -> 4
+    Iextcall(_, _) -> 5
   | _ -> 9
+
 let max_register_pressure = function
-    Iextcall(_, _) -> [| 4 |]
-  | _ -> [| 9 |]
+    Iextcall(_, _) -> [| 5; 9; 9 |]
+  | _ -> [| 9; 16; 32 |]
 
 (* Layout of the stack *)
 
-let num_stack_slots = [| 0 |]
+let num_stack_slots = [| 0; 0; 0 |]
 let contains_calls = ref false
 
 (* Calling the assembler *)
@@ -144,6 +228,3 @@ let contains_calls = ref false
 let assemble_file infile outfile =
   Ccomp.command (Config.asm ^ " -o " ^
                  Filename.quote outfile ^ " " ^ Filename.quote infile)
-
-open Clflags;;
-open Config;;
diff --git a/asmcomp/arm/reload.ml b/asmcomp/arm/reload.ml
index 0917438..c5b137a 100644
--- a/asmcomp/arm/reload.ml
+++ b/asmcomp/arm/reload.ml
@@ -1,6 +1,6 @@
 (***********************************************************************)
 (*                                                                     *)
-(*                           Objective Caml                            *)
+(*                                OCaml                                *)
 (*                                                                     *)
 (*            Xavier Leroy, projet Cristal, INRIA Rocquencourt         *)
 (*                                                                     *)
@@ -10,7 +10,7 @@
 (*                                                                     *)
 (***********************************************************************)
 
-(* $Id: reload.ml 2553 1999-11-17 18:59:06Z xleroy $ *)
+(* $Id$ *)
 
 (* Reloading for the ARM *)
 
diff --git a/asmcomp/arm/scheduling.ml b/asmcomp/arm/scheduling.ml
index 930e1bc..4b47733 100644
--- a/asmcomp/arm/scheduling.ml
+++ b/asmcomp/arm/scheduling.ml
@@ -1,51 +1,79 @@
 (***********************************************************************)
 (*                                                                     *)
-(*                           Objective Caml                            *)
+(*                                OCaml                                *)
 (*                                                                     *)
-(*            Xavier Leroy, projet Cristal, INRIA Rocquencourt         *)
+(*                  Benedikt Meurer, University of Siegen              *)
 (*                                                                     *)
-(*  Copyright 1996 Institut National de Recherche en Informatique et   *)
-(*  en Automatique.  All rights reserved.  This file is distributed    *)
-(*  under the terms of the Q Public License version 1.0.               *)
+(*    Copyright 1998 Institut National de Recherche en Informatique    *)
+(*    et en Automatique. Copyright 2012 Benedikt Meurer. All rights    *)
+(*    reserved.  This file is distributed  under the terms of the Q    *)
+(*    Public License version 1.0.                                      *)
 (*                                                                     *)
 (***********************************************************************)
 
-(* $Id: scheduling.ml 2553 1999-11-17 18:59:06Z xleroy $ *)
+(* $Id$ *)
 
+open Arch
 open Mach
 
-(* Instruction scheduling for the Sparc *)
+(* Instruction scheduling for the ARM *)
 
-class scheduler = object
+class scheduler = object(self)
 
-inherit Schedgen.scheduler_generic
+inherit Schedgen.scheduler_generic as super
 
-(* Scheduling -- based roughly on the Strong ARM *)
+(* Scheduling -- based roughly on the ARM11 (ARMv6) *)
 
 method oper_latency = function
-    Ireload -> 2
-  | Iload(_, _) -> 2
-  | Iconst_symbol _ -> 2                (* turned into a load *)
-  | Iconst_float _ -> 2                 (* turned into a load *)
-  | Iintop(Imul) -> 3
-  | Iintop_imm(Imul, _) -> 3
-  (* No data available for floatops, let's make educated guesses *)
-  | Iaddf -> 3
-  | Isubf -> 3
-  | Imulf -> 5
-  | Idivf -> 15
+  (* Loads have a latency of two cycles in general *)
+    Iconst_symbol _
+  | Iconst_float _
+  | Iload(_, _)
+  | Ireload
+  | Ifloatofint       (* mcr/mrc count as memory access *)
+  | Iintoffloat -> 2
+  (* Multiplys have a latency of two cycles *)
+  | Iintop Imul
+  | Ispecific(Imuladd | Imulsub) -> 2
+  (* VFP instructions *)
+  | Iaddf
+  | Isubf
+  | Idivf
+  | Imulf | Ispecific Inegmulf
+  | Ispecific(Imuladdf | Inegmuladdf | Imulsubf | Inegmulsubf)
+  | Ispecific Isqrtf
+  | Inegf | Iabsf when !fpu >= VFPv3_D16 -> 2
+  (* Everything else *)
   | _ -> 1
 
-(* Issue cycles.  Rough approximations *)
+method! is_checkbound = function
+    Ispecific(Ishiftcheckbound _) -> true
+  | op -> super#is_checkbound op
+
+(* Issue cycles. Rough approximations *)
 
 method oper_issue_cycles = function
     Ialloc _ -> 4
-  | Iintop(Icomp _) -> 3
-  | Iintop(Icheckbound) -> 2
-  | Iintop_imm(Idiv, _) -> 4
-  | Iintop_imm(Imod, _) -> 6
+  | Iintop(Ilsl | Ilsr | Iasr) -> 2
+  | Iintop(Icomp _)
   | Iintop_imm(Icomp _, _) -> 3
+  | Iintop(Icheckbound)
   | Iintop_imm(Icheckbound, _) -> 2
+  | Ispecific(Ishiftcheckbound _) -> 3
+  | Iintop_imm(Idiv, _) -> 4
+  | Iintop_imm(Imod, _) -> 6
+  | Iintop Imul
+  | Ispecific(Imuladd | Imulsub) -> 2
+  (* VFP instructions *)
+  | Iaddf
+  | Isubf -> 7
+  | Imulf
+  | Ispecific Inegmulf -> 9
+  | Ispecific(Imuladdf | Inegmuladdf | Imulsubf | Inegmulsubf) -> 17
+  | Idivf
+  | Ispecific Isqrtf -> 27
+  | Inegf | Iabsf | Iconst_float _ when !fpu >= VFPv3_D16 -> 4
+  (* Everything else *)
   | _ -> 1
 
 end
diff --git a/asmcomp/arm/selection.ml b/asmcomp/arm/selection.ml
index f09d146..94d0367 100644
--- a/asmcomp/arm/selection.ml
+++ b/asmcomp/arm/selection.ml
@@ -1,54 +1,77 @@
 (***********************************************************************)
 (*                                                                     *)
-(*                           Objective Caml                            *)
+(*                                OCaml                                *)
 (*                                                                     *)
-(*            Xavier Leroy, projet Cristal, INRIA Rocquencourt         *)
+(*                  Benedikt Meurer, University of Siegen              *)
 (*                                                                     *)
-(*  Copyright 1998 Institut National de Recherche en Informatique et   *)
-(*  en Automatique.  All rights reserved.  This file is distributed    *)
-(*  under the terms of the Q Public License version 1.0.               *)
+(*    Copyright 1998 Institut National de Recherche en Informatique    *)
+(*    et en Automatique. Copyright 2012 Benedikt Meurer. All rights    *)
+(*    reserved.  This file is distributed  under the terms of the Q    *)
+(*    Public License version 1.0.                                      *)
 (*                                                                     *)
 (***********************************************************************)
 
-(* $Id: selection.ml 10295 2010-04-22 12:39:40Z xleroy $ *)
+(* $Id$ *)
 
 (* Instruction selection for the ARM processor *)
 
-open Misc
-open Cmm
-open Reg
 open Arch
-open Proc
+open Cmm
 open Mach
+open Misc
+open Proc
+open Reg
 
-(* Immediate operands are 8-bit immediate values, zero-extended, and rotated
-   right by 0, 2, 4, ... 30 bits.
-   To avoid problems with Caml's 31-bit arithmetic,
-   we check only with 8-bit values shifted left 0 to 22 bits. *)
-
-let rec is_immed n shift =
-  if shift > 22 then false
-  else if n land (0xFF lsl shift) = n then true
-  else is_immed n (shift + 2)
+let is_offset chunk n =
+  match chunk with
+  (* VFPv3 load/store have -1020 to 1020 *)
+    Single | Double | Double_u
+    when !fpu >= VFPv3_D16 ->
+      n >= -1020 && n <= 1020
+  (* ARM load/store byte/word have -4095 to 4095 *)
+  | Byte_unsigned | Byte_signed
+  | Thirtytwo_unsigned | Thirtytwo_signed
+  | Word | Single
+    when not !thumb ->
+      n >= -4095 && n <= 4095
+  (* Thumb-2 load/store have -255 to 4095 *)
+  | _ when !arch > ARMv6 && !thumb ->
+      n >= -255 && n <= 4095
+  (* Everything else has -255 to 255 *)
+  | _ ->
+      n >= -255 && n <= 255
 
-(* We have 12-bit + sign byte offsets for word accesses,
-   8-bit + sign word offsets for float accesses,
-   and 8-bit + sign byte offsets for bytes and shorts.
-   Use lowest common denominator. *)
+let is_intconst = function
+    Cconst_int _ -> true
+  | _ -> false
 
-let is_offset n = n < 256 && n > -256
+(* Special constraints on operand and result registers *)
 
-let is_intconst = function Cconst_int n -> true | _ -> false
+exception Use_default
 
-(* Soft emulation of float comparisons *)
+let r1 = phys_reg 1
 
-let float_comparison_function = function
-  | Ceq -> "__eqdf2"
-  | Cne -> "__nedf2"
-  | Clt -> "__ltdf2"
-  | Cle -> "__ledf2"
-  | Cgt -> "__gtdf2"
-  | Cge -> "__gedf2"
+let pseudoregs_for_operation op arg res =
+  match op with
+  (* For mul rd,rm,rs and mla rd,rm,rs,ra (pre-ARMv6) the registers rm
+     and rd must be different. We deal with this by pretending that rm
+     is also a result of the mul / mla operation. *)
+    Iintop Imul | Ispecific Imuladd when !arch < ARMv6 ->
+      (arg, [| res.(0); arg.(0) |])
+  (* Soft-float Iabsf and Inegf: arg.(0) and res.(0) must be the same *)
+  | Iabsf | Inegf when !fpu = Soft ->
+      ([|res.(0); arg.(1)|], res)
+  (* VFPv3 Imuladdf...Inegmulsubf: arg.(0) and res.(0) must be the same *)
+  | Ispecific(Imuladdf | Inegmuladdf | Imulsubf | Inegmulsubf) ->
+      let arg' = Array.copy arg in
+      arg'.(0) <- res.(0);
+      (arg', res)
+  (* We use __aeabi_idivmod for Cmodi only, and hence we care only
+     for the remainder in r1, so fix up the destination register. *)
+  | Iextcall("__aeabi_idivmod", false) ->
+      (arg, [|r1|])
+  (* Other instructions are regular *)
+  | _ -> raise Use_default
 
 (* Instruction selection *)
 class selector = object(self)
@@ -56,23 +79,32 @@ class selector = object(self)
 inherit Selectgen.selector_generic as super
 
 method! regs_for tyv =
-  (* Expand floats into pairs of integer registers *)
-  let nty = Array.length tyv in
-  let rec expand i =
-    if i >= nty then [] else begin
-      match tyv.(i) with
-      | Float -> Int :: Int :: expand (i+1)
-      | ty -> ty :: expand (i+1)
-    end in
-  Reg.createv (Array.of_list (expand 0))
+  Reg.createv (if !fpu = Soft then begin
+                 (* Expand floats into pairs of integer registers *)
+                 let rec expand = function
+                   [] -> []
+                 | Float :: tyl -> Int :: Int :: expand tyl
+                 | ty :: tyl -> ty :: expand tyl in
+                 Array.of_list (expand (Array.to_list tyv))
+               end else begin
+                 tyv
+               end)
 
 method is_immediate n =
-  n land 0xFF = n || is_immed n 2
+  is_immediate (Int32.of_int n)
+
+method! is_simple_expr = function
+  (* inlined floating-point ops are simple if their arguments are *)
+  | Cop(Cextcall("sqrt", _, _, _), args) when !fpu >= VFPv3_D16 ->
+      List.for_all self#is_simple_expr args
+  | e -> super#is_simple_expr e
 
-method select_addressing = function
-    Cop(Cadda, [arg; Cconst_int n]) when is_offset n ->
+method select_addressing chunk = function
+  | Cop(Cadda, [arg; Cconst_int n])
+    when is_offset chunk n ->
       (Iindexed n, arg)
-  | Cop(Cadda, [arg1; Cop(Caddi, [arg2; Cconst_int n])]) when is_offset n ->
+  | Cop(Cadda, [arg1; Cop(Caddi, [arg2; Cconst_int n])])
+    when is_offset chunk n ->
       (Iindexed n, Cop(Cadda, [arg1; arg2]))
   | arg ->
       (Iindexed 0, arg)
@@ -91,109 +123,146 @@ method select_shift_arith op shiftop shiftrevop args =
   | [Cop(Casr, [arg1; Cconst_int n]); arg2]
     when n > 0 && n < 32 && not(is_intconst arg1) ->
       (Ispecific(Ishiftarith(shiftrevop, -n)), [arg2; arg1])
-  | _ ->
-      super#select_operation op args
+  | args ->
+      begin match super#select_operation op args with
+      (* Recognize multiply and add *)
+        (Iintop Iadd, [Cop(Cmuli, args); arg3])
+      | (Iintop Iadd, [arg3; Cop(Cmuli, args)]) as op_args ->
+          begin match self#select_operation Cmuli args with
+            (Iintop Imul, [arg1; arg2]) ->
+              (Ispecific Imuladd, [arg1; arg2; arg3])
+          | _ -> op_args
+          end
+      (* Recognize multiply and subtract *)
+      | (Iintop Isub, [arg3; Cop(Cmuli, args)]) as op_args
+        when !arch > ARMv6 ->
+          begin match self#select_operation Cmuli args with
+            (Iintop Imul, [arg1; arg2]) ->
+              (Ispecific Imulsub, [arg1; arg2; arg3])
+          | _ -> op_args
+          end
+      | op_args -> op_args
+      end
 
 method! select_operation op args =
-  match op with
-    Cadda | Caddi ->
-      begin match args with
-        [arg1; Cconst_int n] when n < 0 && self#is_immediate (-n) ->
-          (Iintop_imm(Isub, -n), [arg1])
-      | _ ->
-          self#select_shift_arith op Ishiftadd Ishiftadd args
-      end
-  | Csuba | Csubi ->
-      begin match args with
-        [arg1; Cconst_int n] when n < 0 && self#is_immediate (-n) ->
-          (Iintop_imm(Iadd, -n), [arg1])
-      | [Cconst_int n; arg2] when self#is_immediate n ->
-          (Ispecific(Irevsubimm n), [arg2])
-      | _ ->
-          self#select_shift_arith op Ishiftsub Ishiftsubrev args
-      end
-  | Cmuli ->                    (* no multiply immediate *)
+  match (op, args) with
+  (* Recognize special shift arithmetic *)
+    ((Cadda | Caddi), [arg; Cconst_int n])
+    when n < 0 && self#is_immediate (-n) ->
+      (Iintop_imm(Isub, -n), [arg])
+  | ((Cadda | Caddi as op), args) ->
+      self#select_shift_arith op Ishiftadd Ishiftadd args
+  | ((Csuba | Csubi), [arg; Cconst_int n])
+    when n < 0 && self#is_immediate (-n) ->
+      (Iintop_imm(Iadd, -n), [arg])
+  | ((Csuba | Csubi), [Cconst_int n; arg])
+    when self#is_immediate n ->
+      (Ispecific(Irevsubimm n), [arg])
+  | ((Csuba | Csubi as op), args) ->
+      self#select_shift_arith op Ishiftsub Ishiftsubrev args
+  | (Ccheckbound _, [Cop(Clsr, [arg1; Cconst_int n]); arg2])
+    when n > 0 && n < 32 && not(is_intconst arg2) ->
+      (Ispecific(Ishiftcheckbound n), [arg1; arg2])
+  (* ARM does not support immediate operands for multiplication *)
+  | (Cmuli, args) ->
       (Iintop Imul, args)
-  | Cdivi ->
-      begin match args with
-        [arg1; Cconst_int n] when n = 1 lsl (Misc.log2 n) ->
-          (Iintop_imm(Idiv, n), [arg1])
-      | _ ->
-          (Iextcall("__divsi3", false), args)
-      end
-  | Cmodi ->
-      begin match args with
-        [arg1; Cconst_int n] when n = 1 lsl (Misc.log2 n) ->
-          (Iintop_imm(Imod, n), [arg1])
-      | _ ->
-          (Iextcall("__modsi3", false), args)
-      end
-  | Ccheckbound _ ->
-      begin match args with
-        [Cop(Clsr, [arg1; Cconst_int n]); arg2]
-        when n > 0 && n < 32 && not(is_intconst arg2) ->
-          (Ispecific(Ishiftcheckbound n), [arg1; arg2])
-      | _ ->
-        super#select_operation op args
-      end
-  (* Turn floating-point operations into library function calls *)
-  | Caddf -> (Iextcall("__adddf3", false), args)
-  | Csubf -> (Iextcall("__subdf3", false), args)
-  | Cmulf -> (Iextcall("__muldf3", false), args)
-  | Cdivf -> (Iextcall("__divdf3", false), args)
-  | Cfloatofint -> (Iextcall("__floatsidf", false), args)
-  | Cintoffloat -> (Iextcall("__fixdfsi", false), args)
-  | Ccmpf comp ->
-      (Iintop_imm(Icomp(Isigned comp), 0),
-       [Cop(Cextcall(float_comparison_function comp,
-                     typ_int, false, Debuginfo.none),
-            args)])
+  (* Turn integer division/modulus into runtime ABI calls *)
+  | (Cdivi, [arg; Cconst_int n])
+    when n = 1 lsl Misc.log2 n ->
+      (Iintop_imm(Idiv, n), [arg])
+  | (Cdivi, args) ->
+      (Iextcall("__aeabi_idiv", false), args)
+  | (Cmodi, [arg; Cconst_int n])
+    when n = 1 lsl Misc.log2 n ->
+      (Iintop_imm(Imod, n), [arg])
+  | (Cmodi, args) ->
+      (* See above for fix up of return register *)
+      (Iextcall("__aeabi_idivmod", false), args)
+  (* Turn floating-point operations into runtime ABI calls for softfp *)
+  | (op, args) when !fpu = Soft -> self#select_operation_softfp op args
+  (* Select operations for VFPv3 *)
+  | (op, args) -> self#select_operation_vfpv3 op args
+
+method private select_operation_softfp op args =
+  match (op, args) with
+  (* Turn floating-point operations into runtime ABI calls *)
+  | (Caddf, args) -> (Iextcall("__aeabi_dadd", false), args)
+  | (Csubf, args) -> (Iextcall("__aeabi_dsub", false), args)
+  | (Cmulf, args) -> (Iextcall("__aeabi_dmul", false), args)
+  | (Cdivf, args) -> (Iextcall("__aeabi_ddiv", false), args)
+  | (Cfloatofint, args) -> (Iextcall("__aeabi_i2d", false), args)
+  | (Cintoffloat, args) -> (Iextcall("__aeabi_d2iz", false), args)
+  | (Ccmpf comp, args) ->
+      let func = (match comp with
+                    Cne    (* there's no __aeabi_dcmpne *)
+                  | Ceq -> "__aeabi_dcmpeq"
+                  | Clt -> "__aeabi_dcmplt"
+                  | Cle -> "__aeabi_dcmple"
+                  | Cgt -> "__aeabi_dcmpgt"
+                  | Cge -> "__aeabi_dcmpge") in
+      let comp = (match comp with
+                    Cne -> Ceq (* eq 0 => false *)
+                  | _   -> Cne (* ne 0 => true *)) in
+      (Iintop_imm(Icomp(Iunsigned comp), 0),
+       [Cop(Cextcall(func, typ_int, false, Debuginfo.none), args)])
   (* Add coercions around loads and stores of 32-bit floats *)
-  | Cload Single ->
-      (Iextcall("__extendsfdf2", false), [Cop(Cload Word, args)])
-  | Cstore Single ->
-      begin match args with
-      | [arg1; arg2] ->
-          let arg2' =
-            Cop(Cextcall("__truncdfsf2", typ_int, false, Debuginfo.none),
-                [arg2]) in
-          self#select_operation (Cstore Word) [arg1; arg2']
-      | _ -> assert false
-      end
+  | (Cload Single, args) ->
+      (Iextcall("__aeabi_f2d", false), [Cop(Cload Word, args)])
+  | (Cstore Single, [arg1; arg2]) ->
+      let arg2' =
+        Cop(Cextcall("__aeabi_d2f", typ_int, false, Debuginfo.none),
+            [arg2]) in
+      self#select_operation (Cstore Word) [arg1; arg2']
   (* Other operations are regular *)
-  | _ -> super#select_operation op args
+  | (op, args) -> super#select_operation op args
+
+method private select_operation_vfpv3 op args =
+  match (op, args) with
+  (* Recognize floating-point negate and multiply *)
+    (Cnegf, [Cop(Cmulf, args)]) ->
+      (Ispecific Inegmulf, args)
+  (* Recognize floating-point multiply and add *)
+  | (Caddf, [arg; Cop(Cmulf, args)])
+  | (Caddf, [Cop(Cmulf, args); arg]) ->
+      (Ispecific Imuladdf, arg :: args)
+  (* Recognize floating-point negate, multiply and subtract *)
+  | (Csubf, [Cop(Cnegf, [arg]); Cop(Cmulf, args)])
+  | (Csubf, [Cop(Cnegf, [Cop(Cmulf, args)]); arg]) ->
+      (Ispecific Inegmulsubf, arg :: args)
+  (* Recognize floating-point negate, multiply and add *)
+  | (Csubf, [arg; Cop(Cmulf, args)]) ->
+      (Ispecific Inegmuladdf, arg :: args)
+  (* Recognize multiply and subtract *)
+  | (Csubf, [Cop(Cmulf, args); arg]) ->
+      (Ispecific Imulsubf, arg :: args)
+  (* Recognize floating-point square root *)
+  | (Cextcall("sqrt", _, false, _), args) ->
+      (Ispecific Isqrtf, args)
+  (* Other operations are regular *)
+  | (op, args) -> super#select_operation op args
 
 method! select_condition = function
-  | Cop(Ccmpf cmp, args) ->
-      (Iinttest_imm(Isigned cmp, 0),
-       Cop(Cextcall(float_comparison_function cmp,
-                    typ_int, false, Debuginfo.none),
-           args))
+  (* Turn floating-point comparisons into runtime ABI calls *)
+    Cop(Ccmpf _ as op, args) when !fpu = Soft ->
+      begin match self#select_operation_softfp op args with
+        (Iintop_imm(Icomp(Iunsigned Ceq), 0), [arg]) -> (Ifalsetest, arg)
+      | (Iintop_imm(Icomp(Iunsigned Cne), 0), [arg]) -> (Itruetest, arg)
+      | _ -> assert false
+      end
   | expr ->
       super#select_condition expr
 
-(* Deal with some register irregularities:
-
-1- In mul rd, rm, rs,  the registers rm and rd must be different.
-   We deal with this by pretending that rm is also a result of the mul
-   operation.
-
-2- For Inegf and Iabsf, force arguments and results in (r0, r1);
-   this simplifies code generation later.
-*)
+(* Deal with some register constraints *)
 
 method! insert_op_debug op dbg rs rd =
-  match op with
-  | Iintop(Imul) ->
-      self#insert_debug (Iop op) dbg rs [| rd.(0); rs.(0) |]; rd
-  | Iabsf | Inegf ->
-      let r = [| phys_reg 0; phys_reg 1 |] in
-      self#insert_moves rs r;
-      self#insert_debug (Iop op) dbg r r;
-      self#insert_moves r rd;
-      rd
-  | _ ->
-      super#insert_op_debug op dbg rs rd
+  try
+    let (rsrc, rdst) = pseudoregs_for_operation op rs rd in
+    self#insert_moves rs rsrc;
+    self#insert_debug (Iop op) dbg rsrc rdst;
+    self#insert_moves rdst rd;
+    rd
+  with Use_default ->
+    super#insert_op_debug op dbg rs rd
 
 end
 
diff --git a/asmcomp/i386/selection.ml b/asmcomp/i386/selection.ml
index 1700bf3..827a63d 100644
--- a/asmcomp/i386/selection.ml
+++ b/asmcomp/i386/selection.ml
@@ -168,7 +168,7 @@ method! is_simple_expr e =
   | _ ->
       super#is_simple_expr e
 
-method select_addressing exp =
+method select_addressing chunk exp =
   match select_addr exp with
     (Asymbol s, d) ->
       (Ibased(s, d), Ctuple [])
@@ -200,7 +200,7 @@ method! select_operation op args =
   match op with
   (* Recognize the LEA instruction *)
     Caddi | Cadda | Csubi | Csuba ->
-      begin match self#select_addressing (Cop(op, args)) with
+      begin match self#select_addressing Word (Cop(op, args)) with
         (Iindexed d, _) -> super#select_operation op args
       | (Iindexed2 0, _) -> super#select_operation op args
       | (addr, arg) -> (Ispecific(Ilea addr), [arg])
@@ -233,7 +233,7 @@ method! select_operation op args =
       begin match args with
         [loc; Cop(Caddi, [Cop(Cload _, [loc']); Cconst_int n])]
         when loc = loc' ->
-          let (addr, arg) = self#select_addressing loc in
+          let (addr, arg) = self#select_addressing Word loc in
           (Ispecific(Ioffset_loc(n, addr)), [arg])
       | _ ->
           super#select_operation op args
@@ -250,11 +250,11 @@ method! select_operation op args =
 method select_floatarith regular_op reversed_op mem_op mem_rev_op args =
   match args with
     [arg1; Cop(Cload chunk, [loc2])] ->
-      let (addr, arg2) = self#select_addressing loc2 in
+      let (addr, arg2) = self#select_addressing chunk loc2 in
       (Ispecific(Ifloatarithmem(chunk_double chunk, mem_op, addr)),
                  [arg1; arg2])
   | [Cop(Cload chunk, [loc1]); arg2] ->
-      let (addr, arg1) = self#select_addressing loc1 in
+      let (addr, arg1) = self#select_addressing chunk loc1 in
       (Ispecific(Ifloatarithmem(chunk_double chunk, mem_rev_op, addr)),
                  [arg2; arg1])
   | [arg1; arg2] ->
@@ -295,10 +295,10 @@ method select_push exp =
   | Cconst_natpointer n -> (Ispecific(Ipush_int n), Ctuple [])
   | Cconst_symbol s -> (Ispecific(Ipush_symbol s), Ctuple [])
   | Cop(Cload Word, [loc]) ->
-      let (addr, arg) = self#select_addressing loc in
+      let (addr, arg) = self#select_addressing Word loc in
       (Ispecific(Ipush_load addr), arg)
   | Cop(Cload Double_u, [loc]) ->
-      let (addr, arg) = self#select_addressing loc in
+      let (addr, arg) = self#select_addressing Double_u loc in
       (Ispecific(Ipush_load_float addr), arg)
   | _ -> (Ispecific(Ipush), exp)
 
diff --git a/asmcomp/power/selection.ml b/asmcomp/power/selection.ml
index ed15efb..0532d6b 100644
--- a/asmcomp/power/selection.ml
+++ b/asmcomp/power/selection.ml
@@ -52,7 +52,7 @@ inherit Selectgen.selector_generic as super
 
 method is_immediate n = (n <= 32767) && (n >= -32768)
 
-method select_addressing exp =
+method select_addressing chunk exp =
   match select_addr exp with
     (Asymbol s, d) ->
       (Ibased(s, d), Ctuple [])
diff --git a/asmcomp/power64/selection.ml b/asmcomp/power64/selection.ml
index 7b8e2a4..d2325e1 100644
--- a/asmcomp/power64/selection.ml
+++ b/asmcomp/power64/selection.ml
@@ -52,7 +52,7 @@ inherit Selectgen.selector_generic as super
 
 method is_immediate n = (n <= 32767) && (n >= -32768)
 
-method select_addressing exp =
+method select_addressing chunk exp =
   match select_addr exp with
     (Asymbol s, d) ->
       (Ibased(s, d), Ctuple [])
diff --git a/asmcomp/selectgen.ml b/asmcomp/selectgen.ml
index 2fc40f7..0bc9efb 100644
--- a/asmcomp/selectgen.ml
+++ b/asmcomp/selectgen.ml
@@ -204,7 +204,7 @@ method virtual is_immediate : int -> bool
 (* Selection of addressing modes *)
 
 method virtual select_addressing :
-  Cmm.expression -> Arch.addressing_mode * Cmm.expression
+  Cmm.memory_chunk -> Cmm.expression -> Arch.addressing_mode * Cmm.expression
 
 (* Default instruction selection for stores (of words) *)
 
@@ -219,10 +219,10 @@ method select_operation op args =
   | (Capply(ty, dbg), _) -> (Icall_ind, args)
   | (Cextcall(s, ty, alloc, dbg), _) -> (Iextcall(s, alloc), args)
   | (Cload chunk, [arg]) ->
-      let (addr, eloc) = self#select_addressing arg in
+      let (addr, eloc) = self#select_addressing chunk arg in
       (Iload(chunk, addr), [eloc])
   | (Cstore chunk, [arg1; arg2]) ->
-      let (addr, eloc) = self#select_addressing arg1 in
+      let (addr, eloc) = self#select_addressing chunk arg1 in
       if chunk = Word then begin
         let (op, newarg2) = self#select_store addr arg2 in
         (op, [newarg2; eloc])
@@ -366,7 +366,7 @@ method insert_move src dst =
     self#insert (Iop Imove) [|src|] [|dst|]
 
 method insert_moves src dst =
-  for i = 0 to Array.length src - 1 do
+  for i = 0 to min (Array.length src) (Array.length dst) - 1 do
     self#insert_move src.(i) dst.(i)
   done
 
@@ -490,9 +490,8 @@ method emit_expr env exp =
               let (loc_arg, stack_ofs) =
                 self#emit_extcall_args env new_args in
               let rd = self#regs_for ty in
-              let loc_res = Proc.loc_external_results rd in
-              self#insert_debug (Iop(Iextcall(lbl, alloc))) dbg
-                             loc_arg loc_res;
+              let loc_res = self#insert_op_debug (Iextcall(lbl, alloc)) dbg
+                                    loc_arg (Proc.loc_external_results rd) in
               self#insert_move_results loc_res rd stack_ofs;
               Some rd
           | Ialloc _ ->
diff --git a/asmcomp/selectgen.mli b/asmcomp/selectgen.mli
index ae53cda..69dae6d 100644
--- a/asmcomp/selectgen.mli
+++ b/asmcomp/selectgen.mli
@@ -26,7 +26,7 @@ class virtual selector_generic : object
     (* Must be defined to indicate whether a constant is a suitable
        immediate operand to arithmetic instructions *)
   method virtual select_addressing :
-    Cmm.expression -> Arch.addressing_mode * Cmm.expression
+    Cmm.memory_chunk -> Cmm.expression -> Arch.addressing_mode * Cmm.expression
     (* Must be defined to select addressing modes *)
   method is_simple_expr: Cmm.expression -> bool
     (* Can be overridden to reflect special extcalls known to be pure *)
diff --git a/asmcomp/sparc/selection.ml b/asmcomp/sparc/selection.ml
index 82758dc..c1f30fd 100644
--- a/asmcomp/sparc/selection.ml
+++ b/asmcomp/sparc/selection.ml
@@ -26,7 +26,7 @@ inherit Selectgen.selector_generic as super
 
 method is_immediate n = (n <= 4095) && (n >= -4096)
 
-method select_addressing = function
+method select_addressing chunk = function
     Cconst_symbol s ->
       (Ibased(s, 0), Ctuple [])
   | Cop(Cadda, [Cconst_symbol s; Cconst_int n]) ->
diff --git a/asmrun/arm.S b/asmrun/arm.S
index 1313e9c..6482956 100644
--- a/asmrun/arm.S
+++ b/asmrun/arm.S
@@ -1,286 +1,411 @@
 /***********************************************************************/
 /*                                                                     */
-/*                           Objective Caml                            */
+/*                                OCaml                                */
 /*                                                                     */
-/*            Xavier Leroy, projet Cristal, INRIA Rocquencourt         */
+/*                  Benedikt Meurer, University of Siegen              */
 /*                                                                     */
-/*  Copyright 1998 Institut National de Recherche en Informatique et   */
-/*  en Automatique.  All rights reserved.  This file is distributed    */
-/*  under the terms of the GNU Library General Public License, with    */
-/*  the special exception on linking described in file ../LICENSE.     */
+/*    Copyright 1998 Institut National de Recherche en Informatique    */
+/*    et en Automatique. Copyright 2012 Benedikt Meurer. All rights    */
+/*    reserved. This file is distributed under the terms of the GNU    */
+/*    Library General Public License, with the special exception on    */
+/*    linking described in file ../LICENSE.                            */
 /*                                                                     */
 /***********************************************************************/
 
-/* $Id: arm.S 9252 2009-05-04 13:46:46Z xleroy $ */
+/* $Id$ */
 
 /* Asm part of the runtime system, ARM processor */
+/* Must be preprocessed by cpp */
 
-trap_ptr        .req    r11
-alloc_ptr       .req    r8
-alloc_limit     .req    r10
-
+        .syntax unified
         .text
+#if defined(SYS_linux_eabihf)
+        .arch   armv7-a
+        .fpu    vfpv3-d16
+        .thumb
+#elif defined(SYS_linux_eabi)
+        .arch   armv4t
+        .arm
+
+    /* Compatibility macros */
+        .macro  blx reg
+        mov     lr, pc
+        bx      \reg
+        .endm
+        .macro  cbz reg, lbl
+        cmp     \reg, #0
+        beq     \lbl
+        .endm
+        .macro  vpop regs
+        .endm
+        .macro  vpush regs
+        .endm
+#endif
+
+trap_ptr        .req    r8
+alloc_ptr       .req    r10
+alloc_limit     .req    r11
+
+/* Support for profiling with gprof */
+
+#if defined(PROFILING) && (defined(SYS_linux_eabihf) || defined(SYS_linux_eabi))
+#define PROFILE \
+        push    {lr}; \
+        bl      __gnu_mcount_nc
+#else
+#define PROFILE
+#endif
 
 /* Allocation functions and GC interface */
 
-        .globl caml_call_gc
+        .globl  caml_system__code_begin
+caml_system__code_begin:
+        
+        .align  2
+        .globl  caml_call_gc
         .type caml_call_gc, %function
 caml_call_gc:
-    /* Record return address and desired size */
-    /* Can use alloc_limit as a temporary since it will be reloaded by
-       invoke_gc */
-        ldr     alloc_limit, .Lcaml_last_return_address
-        str     lr, [alloc_limit, #0]
-        ldr     alloc_limit, .Lcaml_requested_size
-        str     r12, [alloc_limit, #0]
-    /* Branch to shared GC code */
-        bl      .Linvoke_gc
-    /* Finish allocation */
-	ldr	r12, .Lcaml_requested_size
-	ldr	r12, [r12, #0]
-	sub	alloc_ptr, alloc_ptr, r12
+        PROFILE
+    /* Record return address */
+        ldr     r12, =caml_last_return_address
+        str     lr, [r12]
+.Lcaml_call_gc:
+    /* Record lowest stack address */
+        ldr     r12, =caml_bottom_of_stack
+        str     sp, [r12]
+    /* Save caller floating-point registers on the stack */
+        vpush   {d0-d7}
+    /* Save integer registers and return address on the stack */
+        push    {r0-r7,r12,lr}
+    /* Store pointer to saved integer registers in caml_gc_regs */
+        ldr     r12, =caml_gc_regs
+        str     sp, [r12]
+    /* Save current allocation pointer for debugging purposes */
+        ldr     alloc_limit, =caml_young_ptr
+        str     alloc_ptr, [alloc_limit]
+    /* Save trap pointer in case an exception is raised during GC */
+        ldr     r12, =caml_exception_pointer
+        str     trap_ptr, [r12]
+    /* Call the garbage collector */
+        bl      caml_garbage_collection
+    /* Restore integer registers and return address from the stack */
+        pop     {r0-r7,r12,lr}
+    /* Restore floating-point registers from the stack */
+        vpop    {d0-d7}
+    /* Reload new allocation pointer and limit */
+    /* alloc_limit still points to caml_young_ptr */
+        ldr     r12, =caml_young_limit
+        ldr     alloc_ptr, [alloc_limit]
+        ldr     alloc_limit, [r12]
+    /* Return to caller */
         bx      lr
+        .type   caml_call_gc, %function
+        .size   caml_call_gc, .-caml_call_gc
 
-        .globl caml_alloc1
+        .align  2
+        .globl  caml_alloc1
         .type caml_alloc1, %function
 caml_alloc1:
-        sub     alloc_ptr, alloc_ptr, #8
+        PROFILE
+.Lcaml_alloc1:
+        sub     alloc_ptr, alloc_ptr, 8
         cmp     alloc_ptr, alloc_limit
-        movcs   pc, lr                /* Return if alloc_ptr >= alloc_limit */
-    /* Record return address */
-        ldr     r12, .Lcaml_last_return_address
-        str     lr, [r12, #0]
-    /* Invoke GC */
-        bl      .Linvoke_gc
+        bcc     1f
+        bx      lr
+1:  /* Record return address */
+        ldr     r7, =caml_last_return_address
+        str     lr, [r7]
+    /* Call GC (preserves r7) */
+        bl      .Lcaml_call_gc
+    /* Restore return address */
+        ldr     lr, [r7]
     /* Try again */
-        b       caml_alloc1
+        b       .Lcaml_alloc1
+        .type   caml_alloc1, %function
+        .size   caml_alloc1, .-caml_alloc1
 
-        .globl caml_alloc2
+        .align  2
+        .globl  caml_alloc2
         .type caml_alloc2, %function
 caml_alloc2:
-        sub     alloc_ptr, alloc_ptr, #12
+        PROFILE
+.Lcaml_alloc2:
+        sub     alloc_ptr, alloc_ptr, 12
         cmp     alloc_ptr, alloc_limit
-        movcs   pc, lr                /* Return if alloc_ptr >= alloc_limit */
-    /* Record return address */
-        ldr     r12, .Lcaml_last_return_address
-        str     lr, [r12, #0]
-    /* Invoke GC */
-        bl      .Linvoke_gc
+        bcc     1f
+        bx      lr
+1:  /* Record return address */
+        ldr     r7, =caml_last_return_address
+        str     lr, [r7]
+    /* Call GC (preserves r7) */
+        bl      .Lcaml_call_gc
+    /* Restore return address */
+        ldr     lr, [r7]
     /* Try again */
-        b       caml_alloc2
+        b       .Lcaml_alloc2
+        .type   caml_alloc2, %function
+        .size   caml_alloc2, .-caml_alloc2
 
-        .globl caml_alloc3
+        .align  2
+        .globl  caml_alloc3
         .type caml_alloc3, %function
 caml_alloc3:
-        sub     alloc_ptr, alloc_ptr, #16
+        PROFILE
+.Lcaml_alloc3:
+        sub     alloc_ptr, alloc_ptr, 16
         cmp     alloc_ptr, alloc_limit
-        movcs   pc, lr                /* Return if alloc_ptr >= alloc_limit */
-    /* Record return address */
-        ldr     r12, .Lcaml_last_return_address
-        str     lr, [r12, #0]
-    /* Invoke GC */
-        bl      .Linvoke_gc
+        bcc     1f
+        bx      lr
+1:  /* Record return address */
+        ldr     r7, =caml_last_return_address
+        str     lr, [r7]
+    /* Call GC (preserves r7) */
+        bl      .Lcaml_call_gc
+    /* Restore return address */
+        ldr     lr, [r7]
     /* Try again */
-        b       caml_alloc3
+        b       .Lcaml_alloc3
+        .type   caml_alloc3, %function
+        .size   caml_alloc3, .-caml_alloc3
 
-        .globl caml_allocN
+        .align  2
+        .globl  caml_allocN
         .type caml_allocN, %function
 caml_allocN:
-        sub     alloc_ptr, alloc_ptr, r12
+        PROFILE
+.Lcaml_allocN:
+        sub     alloc_ptr, alloc_ptr, r7
         cmp     alloc_ptr, alloc_limit
-        movcs   pc, lr                /* Return if alloc_ptr >= alloc_limit */
-    /* Record return address and desired size */
-    /* Can use alloc_limit as a temporary since it will be reloaded by
-       invoke_gc */
-        ldr     alloc_limit, .Lcaml_last_return_address
-        str     lr, [alloc_limit, #0]
-        ldr     alloc_limit, .Lcaml_requested_size
-        str     r12, [alloc_limit, #0]
-    /* Invoke GC */
-        bl      .Linvoke_gc
+        bcc     1f
+        bx      lr
+1:  /* Record return address */
+        ldr     r12, =caml_last_return_address
+        str     lr, [r12]
+    /* Call GC (preserves r7) */
+        bl      .Lcaml_call_gc
+    /* Restore return address */
+        ldr     r12, =caml_last_return_address
+        ldr     lr, [r12]
     /* Try again */
-	ldr	r12, .Lcaml_requested_size
-	ldr	r12, [r12, #0]
-        b       caml_allocN
-
-/* Shared code to invoke the GC */
-.Linvoke_gc:
-    /* Record lowest stack address */
-        ldr     r12, .Lcaml_bottom_of_stack
-        str     sp, [r12, #0]
-    /* Save integer registers and return address on stack */
-        stmfd   sp!, {r0,r1,r2,r3,r4,r5,r6,r7,r12,lr}
-    /* Store pointer to saved integer registers in caml_gc_regs */
-        ldr     r12, .Lcaml_gc_regs
-        str     sp, [r12, #0]
-    /* Save current allocation pointer for debugging purposes */
-        ldr     r12, .Lcaml_young_ptr
-        str     alloc_ptr, [r12, #0]
-    /* Save trap pointer in case an exception is raised during GC */
-        ldr     r12, .Lcaml_exception_pointer
-        str     trap_ptr, [r12, #0]
-    /* Call the garbage collector */
-        bl      caml_garbage_collection
-    /* Restore the registers from the stack */
-        ldmfd   sp!, {r0,r1,r2,r3,r4,r5,r6,r7,r12}
-    /* Reload return address */
-        ldr     r12, .Lcaml_last_return_address
-        ldr     lr, [r12, #0]
-    /* Reload new allocation pointer and allocation limit */
-        ldr     r12, .Lcaml_young_ptr
-        ldr     alloc_ptr, [r12, #0]
-        ldr     r12, .Lcaml_young_limit
-        ldr     alloc_limit, [r12, #0]
-    /* Return to caller */
-        ldr     r12, [sp], #4
-        bx      r12
+        b       .Lcaml_allocN
+        .type   caml_allocN, %function
+        .size   caml_allocN, .-caml_allocN
 
-/* Call a C function from Caml */
-/* Function to call is in r12 */
+/* Call a C function from OCaml */
+/* Function to call is in r7 */
 
-        .globl caml_c_call
+        .align  2
+        .globl  caml_c_call
         .type caml_c_call, %function
 caml_c_call:
+        PROFILE
+    /* Record lowest stack address and return address */
+        ldr     r5, =caml_last_return_address
+        ldr     r6, =caml_bottom_of_stack
+        str     lr, [r5]
+        str     sp, [r6]
     /* Preserve return address in callee-save register r4 */
         mov     r4, lr
-    /* Record lowest stack address and return address */
-        ldr     r5, .Lcaml_last_return_address
-        ldr     r6, .Lcaml_bottom_of_stack
-        str     lr, [r5, #0]
-        str     sp, [r6, #0]
-    /* Make the exception handler and alloc ptr available to the C code */
-        ldr     r6, .Lcaml_young_ptr
-        ldr     r7, .Lcaml_exception_pointer
-        str     alloc_ptr, [r6, #0]
-        str     trap_ptr, [r7, #0]
+    /* Make the exception handler alloc ptr available to the C code */
+        ldr     r5, =caml_young_ptr
+        ldr     r6, =caml_exception_pointer
+        str     alloc_ptr, [r5]
+        str     trap_ptr, [r6]
     /* Call the function */
-        mov     lr, pc
-        bx      r12
+        blx     r7
     /* Reload alloc ptr and alloc limit */
-        ldr     r5, .Lcaml_young_limit
-        ldr     alloc_ptr, [r6, #0]    /* r6 still points to caml_young_ptr */
-        ldr     alloc_limit, [r5, #0]
+        ldr     r6, =caml_young_limit
+        ldr     alloc_ptr, [r5]         /* r5 still points to caml_young_ptr */
+        ldr     alloc_limit, [r6]
     /* Return */
         bx      r4
+        .type   caml_c_call, %function
+        .size   caml_c_call, .-caml_c_call
 
-/* Start the Caml program */
+/* Start the OCaml program */
 
-        .globl caml_start_program
+        .align  2
+        .globl  caml_start_program
         .type caml_start_program, %function
 caml_start_program:
-        ldr     r12, .Lcaml_program
+        PROFILE
+        ldr     r12, =caml_program
 
 /* Code shared with caml_callback* */
-/* Address of Caml code to call is in r12 */
-/* Arguments to the Caml code are in r0...r3 */
+/* Address of OCaml code to call is in r12 */
+/* Arguments to the OCaml code are in r0...r3 */
 
 .Ljump_to_caml:
     /* Save return address and callee-save registers */
-        stmfd   sp!, {r4,r5,r6,r7,r8,r10,r11,lr} /* 8-alignment */
+        vpush   {d8-d15}
+        push    {r4-r8,r10,r11,lr}              /* 8-byte alignment */
     /* Setup a callback link on the stack */
-        sub     sp, sp, #4*4                    /* 8-alignment */
-        ldr     r4, .Lcaml_bottom_of_stack
-        ldr     r4, [r4, #0]
-        str     r4, [sp, #0]
-        ldr     r4, .Lcaml_last_return_address
-        ldr     r4, [r4, #0]
-        str     r4, [sp, #4]
-        ldr     r4, .Lcaml_gc_regs
-        ldr     r4, [r4, #0]
-        str     r4, [sp, #8]
-    /* Setup a trap frame to catch exceptions escaping the Caml code */
-        sub     sp, sp, #4*2
-        ldr     r4, .Lcaml_exception_pointer
-        ldr     r4, [r4, #0]
-        str     r4, [sp, #0]
-        ldr     r4, .LLtrap_handler
-        str     r4, [sp, #4]
+        sub     sp, sp, 4*4                     /* 8-byte alignment */
+        ldr     r4, =caml_bottom_of_stack
+        ldr     r5, =caml_last_return_address
+        ldr     r6, =caml_gc_regs
+        ldr     r4, [r4]
+        ldr     r5, [r5]
+        ldr     r6, [r6]
+        str     r4, [sp, 0]
+        str     r5, [sp, 4]
+        str     r6, [sp, 8]
+    /* Setup a trap frame to catch exceptions escaping the OCaml code */
+        sub     sp, sp, 2*4
+        ldr     r6, =caml_exception_pointer
+        ldr     r5, =.Ltrap_handler
+        ldr     r4, [r6]
+        str     r4, [sp, 0]
+        str     r5, [sp, 4]
         mov     trap_ptr, sp
     /* Reload allocation pointers */
-        ldr     r4, .Lcaml_young_ptr
-        ldr     alloc_ptr, [r4, #0]
-        ldr     r4, .Lcaml_young_limit
-        ldr     alloc_limit, [r4, #0]
-    /* Call the Caml code */
-        mov     lr, pc
-        bx      r12
+        ldr     r4, =caml_young_ptr
+        ldr     alloc_ptr, [r4]
+        ldr     r4, =caml_young_limit
+        ldr     alloc_limit, [r4]
+    /* Call the OCaml code */
+        blx     r12
 .Lcaml_retaddr:
     /* Pop the trap frame, restoring caml_exception_pointer */
-        ldr     r4, .Lcaml_exception_pointer
-        ldr     r5, [sp, #0]
-        str     r5, [r4, #0]
-        add     sp, sp, #2 * 4
+        ldr     r4, =caml_exception_pointer
+        ldr     r5, [sp, 0]
+        str     r5, [r4]
+        add     sp, sp, 2*4
     /* Pop the callback link, restoring the global variables */
 .Lreturn_result:
-        ldr     r4, .Lcaml_bottom_of_stack
-        ldr     r5, [sp, #0]
-        str     r5, [r4, #0]
-        ldr     r4, .Lcaml_last_return_address
-        ldr     r5, [sp, #4]
-        str     r5, [r4, #0]
-        ldr     r4, .Lcaml_gc_regs
-        ldr     r5, [sp, #8]
-        str     r5, [r4, #0]
-        add     sp, sp, #4*4
+        ldr     r4, =caml_bottom_of_stack
+        ldr     r5, [sp, 0]
+        str     r5, [r4]
+        ldr     r4, =caml_last_return_address
+        ldr     r5, [sp, 4]
+        str     r5, [r4]
+        ldr     r4, =caml_gc_regs
+        ldr     r5, [sp, 8]
+        str     r5, [r4]
+        add     sp, sp, 4*4
     /* Update allocation pointer */
-        ldr     r4, .Lcaml_young_ptr
-        str     alloc_ptr, [r4, #0]
+        ldr     r4, =caml_young_ptr
+        str     alloc_ptr, [r4]
     /* Reload callee-save registers and return */
-        ldmfd   sp!, {r4,r5,r6,r7,r8,r10,r11,lr}
-	bx	lr
+        pop     {r4-r8,r10,r11,lr}
+        vpop    {d8-d15}
+        bx      lr
+        .type   .Lcaml_retaddr, %function
+        .size   .Lcaml_retaddr, .-.Lcaml_retaddr
+        .type   caml_start_program, %function
+        .size   caml_start_program, .-caml_start_program
+
+/* The trap handler */
 
-    /* The trap handler */
+        .align  2
 .Ltrap_handler:
     /* Save exception pointer */
-        ldr     r4, .Lcaml_exception_pointer
-        str     trap_ptr, [r4, #0]
+        ldr     r12, =caml_exception_pointer
+        str     trap_ptr, [r12]
     /* Encode exception bucket as an exception result */
-        orr     r0, r0, #2
+        orr     r0, r0, 2
     /* Return it */
         b       .Lreturn_result
+        .type   .Ltrap_handler, %function
+        .size   .Ltrap_handler, .-.Ltrap_handler
+
+/* Raise an exception from OCaml */
+
+        .align  2
+        .globl  caml_raise_exn
+caml_raise_exn:
+        PROFILE
+    /* Test if backtrace is active */
+        ldr     r1, =caml_backtrace_active
+        ldr     r1, [r1]
+        cbz     r1, 1f
+    /* Preserve exception bucket in callee-save register r4 */
+        mov     r4, r0
+    /* Stash the backtrace */
+        mov     r1, lr                          /* arg2: pc of raise */
+        mov     r2, sp                          /* arg3: sp of raise */
+        mov     r3, trap_ptr                    /* arg4: sp of handler */
+        bl      caml_stash_backtrace
+    /* Restore exception bucket */
+        mov     r0, r4
+1:  /* Cut stack at current trap handler */
+        mov     sp, trap_ptr
+    /* Pop previous handler and addr of trap, and jump to it */
+        pop     {trap_ptr, pc}
+        .type   caml_raise_exn, %function
+        .size   caml_raise_exn, .-caml_raise_exn
 
 /* Raise an exception from C */
 
-        .globl caml_raise_exception
+        .align  2
+        .globl  caml_raise_exception
         .type caml_raise_exception, %function
 caml_raise_exception:
-    /* Reload Caml allocation pointers */
-        ldr     r12, .Lcaml_young_ptr
-        ldr     alloc_ptr, [r12, #0]
-        ldr     r12, .Lcaml_young_limit
-        ldr     alloc_limit, [r12, #0]
-    /* Cut stack at current trap handler */
-        ldr     r12, .Lcaml_exception_pointer
-        ldr     sp, [r12, #0]
+        PROFILE
+    /* Reload trap ptr, alloc ptr and alloc limit */
+        ldr     trap_ptr, =caml_exception_pointer
+        ldr     alloc_ptr, =caml_young_ptr
+        ldr     alloc_limit, =caml_young_limit
+        ldr     trap_ptr, [trap_ptr]
+        ldr     alloc_ptr, [alloc_ptr]
+        ldr     alloc_limit, [alloc_limit]
+    /* Test if backtrace is active */
+        ldr     r1, =caml_backtrace_active
+        ldr     r1, [r1]
+        cbz     r1, 1f
+    /* Preserve exception bucket in callee-save register r4 */
+        mov     r4, r0
+        ldr     r1, =caml_last_return_address   /* arg2: pc of raise */
+        ldr     r1, [r1]
+        ldr     r2, =caml_bottom_of_stack       /* arg3: sp of raise */
+        ldr     r2, [r2]
+        mov     r3, trap_ptr                    /* arg4: sp of handler */
+        bl      caml_stash_backtrace
+    /* Restore exception bucket */
+        mov     r0, r4
+1:  /* Cut stack at current trap handler */
+        mov     sp, trap_ptr
     /* Pop previous handler and addr of trap, and jump to it */
-        ldmfd   sp!, {trap_ptr, pc}
+        pop     {trap_ptr, pc}
+        .type   caml_raise_exception, %function
+        .size   caml_raise_exception, .-caml_raise_exception
 
-/* Callback from C to Caml */
+/* Callback from C to OCaml */
 
-        .globl caml_callback_exn
+        .align  2
+        .globl  caml_callback_exn
         .type caml_callback_exn, %function
 caml_callback_exn:
+        PROFILE
     /* Initial shuffling of arguments (r0 = closure, r1 = first arg) */
         mov     r12, r0
-        mov     r0, r1            /* r0 = first arg */
-        mov     r1, r12           /* r1 = closure environment */
-        ldr     r12, [r12, #0]    /* code pointer */
+        mov     r0, r1          /* r0 = first arg */
+        mov     r1, r12         /* r1 = closure environment */
+        ldr     r12, [r12]      /* code pointer */
         b       .Ljump_to_caml
+        .type   caml_callback_exn, %function
+        .size   caml_callback_exn, .-caml_callback_exn
 
-        .globl caml_callback2_exn
+        .align  2
+        .globl  caml_callback2_exn
         .type caml_callback2_exn, %function
 caml_callback2_exn:
+        PROFILE
     /* Initial shuffling of arguments (r0 = closure, r1 = arg1, r2 = arg2) */
         mov     r12, r0
-        mov     r0, r1           /* r0 = first arg */
-        mov     r1, r2           /* r1 = second arg */
-        mov     r2, r12          /* r2 = closure environment */
-        ldr     r12, .Lcaml_apply2
+        mov     r0, r1          /* r0 = first arg */
+        mov     r1, r2          /* r1 = second arg */
+        mov     r2, r12         /* r2 = closure environment */
+        ldr     r12, =caml_apply2
         b       .Ljump_to_caml
+        .type   caml_callback2_exn, %function
+        .size   caml_callback2_exn, .-caml_callback2_exn
 
-        .globl caml_callback3_exn
+        .align  2
+        .globl  caml_callback3_exn
         .type caml_callback3_exn, %function
 caml_callback3_exn:
+        PROFILE
     /* Initial shuffling of arguments */
     /* (r0 = closure, r1 = arg1, r2 = arg2, r3 = arg3) */
         mov     r12, r0
@@ -288,43 +413,36 @@ caml_callback3_exn:
         mov     r1, r2          /* r1 = second arg */
         mov     r2, r3          /* r2 = third arg */
         mov     r3, r12         /* r3 = closure environment */
-        ldr     r12, .Lcaml_apply3
+        ldr     r12, =caml_apply3
         b       .Ljump_to_caml
+        .type   caml_callback3_exn, %function
+        .size   caml_callback3_exn, .-caml_callback3_exn
 
-        .globl caml_ml_array_bound_error
+        .align  2
+        .globl  caml_ml_array_bound_error
         .type caml_ml_array_bound_error, %function
 caml_ml_array_bound_error:
-    /* Load address of [caml_array_bound_error] in r12 */
-        ldr     r12, .Lcaml_array_bound_error
+        PROFILE
+    /* Load address of [caml_array_bound_error] in r7 */
+        ldr     r7, =caml_array_bound_error
     /* Call that function */
         b       caml_c_call
+        .type   caml_ml_array_bound_error, %function
+        .size   caml_ml_array_bound_error, .-caml_ml_array_bound_error
 
-/* Global references */
-
-.Lcaml_last_return_address:     .word caml_last_return_address
-.Lcaml_bottom_of_stack:         .word caml_bottom_of_stack
-.Lcaml_gc_regs:                 .word caml_gc_regs
-.Lcaml_young_ptr:               .word caml_young_ptr
-.Lcaml_young_limit:             .word caml_young_limit
-.Lcaml_exception_pointer:       .word caml_exception_pointer
-.Lcaml_program:                 .word caml_program
-.LLtrap_handler:                .word .Ltrap_handler
-.Lcaml_apply2:                  .word caml_apply2
-.Lcaml_apply3:                  .word caml_apply3
-.Lcaml_array_bound_error:       .word caml_array_bound_error
-.Lcaml_requested_size:          .word caml_requested_size
-
-	.data
-caml_requested_size:
-	.word	0
+        .globl  caml_system__code_end
+caml_system__code_end:
 
 /* GC roots for callback */
 
         .data
-        .globl caml_system__frametable
+        .align  2
+        .globl  caml_system__frametable
 caml_system__frametable:
         .word   1               /* one descriptor */
         .word   .Lcaml_retaddr  /* return address into callback */
         .short  -1              /* negative frame size => use callback link */
         .short  0               /* no roots */
         .align  2
+        .type   caml_system__frametable, %object
+        .size   caml_system__frametable, .-caml_system__frametable
diff --git a/asmrun/signals_osdep.h b/asmrun/signals_osdep.h
index 1e91327..732f3a0 100644
--- a/asmrun/signals_osdep.h
+++ b/asmrun/signals_osdep.h
@@ -78,7 +78,7 @@
 
 /****************** ARM, Linux */
 
-#elif defined(TARGET_arm) && defined (SYS_linux)
+#elif defined(TARGET_arm) && (defined (SYS_linux_eabi) || defined(SYS_linux_eabihf))
 
   #include <sys/ucontext.h>
 
diff --git a/configure b/configure
index 6ed0a9c..4e07c92 100755
--- a/configure
+++ b/configure
@@ -636,6 +636,7 @@ if test $withsharedlibs = "yes"; then
     i[345]86-*-netbsd*)           natdynlink=true;;
     x86_64-*-netbsd*)             natdynlink=true;;
     i386-*-gnu0.3)                natdynlink=true;;
+    arm*-*-linux*)                natdynlink=true;;
   esac
 fi
 
@@ -691,8 +692,13 @@ case "$host" in
   powerpc-*-rhapsody*)          arch=power; model=ppc; system=rhapsody;;
   powerpc-*-darwin*)            arch=power; system=rhapsody
                                 if $arch64; then model=ppc64; else model=ppc; fi;;
-  arm*-*-linux*)                arch=arm; system=linux;;
-  arm*-*-gnu*)                  arch=arm; system=gnu;;
+  arm*-*-linux-gnueabihf)       arch=arm; system=linux_eabihf;;
+  armv7*-*-linux-gnueabi)       arch=arm; model=armv7; system=linux_eabi;;
+  armv6t2*-*-linux-gnueabi)     arch=arm; model=armv6t2; system=linux_eabi;;
+  armv6*-*-linux-gnueabi)       arch=arm; model=armv6; system=linux_eabi;;
+  armv5te*-*-linux-gnueabi)     arch=arm; model=armv5te; system=linux_eabi;;
+  armv5*-*-linux-gnueabi)       arch=arm; model=armv5; system=linux_eabi;;
+  arm*-*-linux-gnueabi)         arch=arm; system=linux_eabi;;
   ia64-*-linux*)                arch=ia64; system=linux;;
   ia64-*-gnu*)                  arch=ia64; system=gnu;;
   ia64-*-freebsd*)              arch=ia64; system=freebsd;;
@@ -804,6 +810,7 @@ case "$arch,$model,$system" in
     case "$nativecc" in gcc*) ;; *) cc_profile='-xpg';; esac;;
   amd64,*,linux) profiling='prof';;
   amd64,*,gnu) profiling='prof';;
+  arm,*,linux*) profiling='prof';;
   *) profiling='noprof';;
 esac
 
-- 
1.7.10.1