ocaml-flambda · gretay-js · Jan 8, 2025 · Jan 8, 2025 · Jan 8, 2025 · Jan 8, 2025
diff --git a/backend/amd64/CSE.ml b/backend/amd64/CSE.ml
@@ -21,6 +21,12 @@ open Arch
 open Mach
 open CSE_utils
 
+(* Map a SIMD operation class to the class used by [class_of_operation]. *)
+let of_simd_class : Simd.operation_class -> _ = function
+  | Pure -> Op_pure
+  | Load { is_mutable } ->
+    if is_mutable then Op_load Mutable else Op_load Immutable
+
 class cse = object
 
 inherit CSEgen.cse_generic as super
@@ -37,9 +43,9 @@ method! class_of_operation op =
     | Irdtsc | Irdpmc
     | Ilfence | Isfence | Imfence -> Op_other
     | Isimd op ->
-      begin match Simd.class_of_operation op with
-      | Pure -> Op_pure
-      end
+      of_simd_class (Simd.class_of_operation op)
+    | Isimd_mem (op,_addr) ->
+      of_simd_class (Simd.Mem.class_of_operation op)
     | Ipause
     | Icldemote _
     | Iprefetch _ -> Op_other
@@ -81,9 +87,9 @@ class cfg_cse = object
     | Irdtsc | Irdpmc
     | Ilfence | Isfence | Imfence -> Op_other
     | Isimd op ->
-      begin match Simd.class_of_operation op with
-      | Pure -> Op_pure
-      end
+      of_simd_class (Simd.class_of_operation op)
+    | Isimd_mem (op,_addr) ->
+      of_simd_class (Simd.Mem.class_of_operation op)
     | Ipause
     | Icldemote _
     | Iprefetch _ -> Op_other

diff --git a/backend/amd64/arch.ml b/backend/amd64/arch.ml
@@ -153,6 +153,9 @@ type specific_operation =
   | Imfence                            (* memory fence *)
   | Ipause                             (* hint for spin-wait loops *)
   | Isimd of Simd.operation            (* SIMD instruction set operations *)
+  | Isimd_mem of Simd.Mem.operation * addressing_mode
+                                       (* SIMD instruction set operations
+                                          with memory args *)
   | Icldemote of addressing_mode       (* hint to demote a cacheline to L3 *)
   | Iprefetch of                       (* memory prefetching hint *)
       { is_write: bool;
@@ -273,6 +276,8 @@ let print_specific_operation printreg op ppf arg =
       fprintf ppf "rdpmc %a" printreg arg.(0)
   | Isimd simd ->
       Simd.print_operation printreg simd ppf arg
+  | Isimd_mem (simd, addr) ->
+      Simd.Mem.print_operation printreg (print_addressing printreg addr) simd ppf arg
   | Ipause ->
       fprintf ppf "pause"
   | Icldemote _ ->
@@ -299,13 +304,14 @@ let operation_is_pure = function
   | Istore_int (_, _, _) | Ioffset_loc (_, _)
   | Icldemote _ | Iprefetch _ -> false
   | Isimd op -> Simd.is_pure op
+  | Isimd_mem (op, _addr) -> Simd.Mem.is_pure op
 
 (* Specific operations that can raise *)
 (* Keep in sync with [Vectorize_specific] *)
 let operation_can_raise = function
   | Ilea _ | Ibswap _ | Isextend32 | Izextend32
   | Ifloatarithmem _
-  | Irdtsc | Irdpmc | Ipause | Isimd _
+  | Irdtsc | Irdpmc | Ipause | Isimd _ | Isimd_mem _
   | Ilfence | Isfence | Imfence
   | Istore_int (_, _, _) | Ioffset_loc (_, _)
   | Icldemote _ | Iprefetch _ -> false
@@ -314,7 +320,7 @@ let operation_can_raise = function
 let operation_allocates = function
   | Ilea _ | Ibswap _ | Isextend32 | Izextend32
   | Ifloatarithmem _
-  | Irdtsc | Irdpmc | Ipause | Isimd _
+  | Irdtsc | Irdpmc | Ipause | Isimd _ | Isimd_mem _
   | Ilfence | Isfence | Imfence
   | Istore_int (_, _, _) | Ioffset_loc (_, _)
   | Icldemote _ | Iprefetch _ -> false
@@ -405,9 +411,11 @@ let equal_specific_operation left right =
     && equal_addressing_mode left_addr right_addr
   | Isimd l, Isimd r ->
     Simd.equal_operation l r
+  | Isimd_mem (l,al), Isimd_mem (r,ar) ->
+    Simd.Mem.equal_operation l r && equal_addressing_mode al ar
   | (Ilea _ | Istore_int _ | Ioffset_loc _ | Ifloatarithmem _ | Ibswap _ |
      Isextend32 | Izextend32 | Irdtsc | Irdpmc | Ilfence | Isfence | Imfence |
-     Ipause | Isimd _ | Icldemote _ | Iprefetch _), _ ->
+     Ipause | Isimd _ | Isimd_mem _ | Icldemote _ | Iprefetch _), _ ->
     false
 
 (* addressing mode functions *)
@@ -512,7 +520,9 @@ let isomorphic_specific_operation op1 op2 =
     && equal_addressing_mode_without_displ left_addr right_addr
   | Isimd l, Isimd r ->
     Simd.equal_operation l r
+  | Isimd_mem (l,al), Isimd_mem (r,ar) ->
+    Simd.Mem.equal_operation l r && equal_addressing_mode_without_displ al ar
   | (Ilea _ | Istore_int _ | Ioffset_loc _ | Ifloatarithmem _ | Ibswap _ |
      Isextend32 | Izextend32 | Irdtsc | Irdpmc | Ilfence | Isfence | Imfence |
-     Ipause | Isimd _ | Icldemote _ | Iprefetch _), _ ->
+     Ipause | Isimd _ | Isimd_mem _ | Icldemote _ | Iprefetch _), _ ->
     false
diff --git a/backend/amd64/arch.mli b/backend/amd64/arch.mli
@@ -86,6 +86,9 @@ type specific_operation =
   | Imfence                            (* memory fence *)
   | Ipause                             (* hint for spin-wait loops *)
   | Isimd of Simd.operation            (* SIMD instruction set operations *)
+  | Isimd_mem of Simd.Mem.operation * addressing_mode
+                                       (* SIMD instruction set operations
+                                          with memory args *)
   | Icldemote of addressing_mode       (* hint to demote a cacheline to L3 *)
   | Iprefetch of                       (* memory prefetching hint *)
       { is_write: bool;

diff --git a/backend/amd64/cfg_selection.ml b/backend/amd64/cfg_selection.ml
@@ -86,7 +86,14 @@ let pseudoregs_for_operation op arg res =
        edx (high) and eax (low). Make it simple and force the argument in rcx,
        and rax and rdx clobbered *)
     [| rcx |], res
-  | Specific (Isimd op) -> Simd_selection.pseudoregs_for_operation op arg res
+  | Specific (Isimd op) ->
+    Simd_selection.pseudoregs_for_operation
+      (Simd_proc.register_behavior op)
+      arg res
+  | Specific (Isimd_mem (op, _addr)) ->
+    Simd_selection.pseudoregs_for_operation
+      (Simd_proc.Mem.register_behavior op)
+      arg res
   | Csel _ ->
     (* last arg must be the same as res.(0) *)
     let len = Array.length arg in

diff --git a/backend/amd64/emit.mlp b/backend/amd64/emit.mlp
@@ -54,7 +54,7 @@ let float_reg_name = Array.init 16 (fun i -> XMM i)
 let register_name typ r =
   match (typ : machtype_component) with
   | Int | Val | Addr -> Reg64 (int_reg_name.(r))
-  | Float | Float32 | Vec128 -> Regf (float_reg_name.(r - 100))
+  | Float | Float32 | Vec128 | Valx2 -> Regf (float_reg_name.(r - 100))
 
 let phys_rax = phys_reg Int 0
 let phys_rdx = phys_reg Int 4
@@ -293,6 +293,7 @@ let emit_Llabel fallthrough lbl section_name =
 let x86_data_type_for_stack_slot : machtype_component -> data_type = function
   | Float -> REAL8
   | Vec128 -> VEC128
+  | Valx2 -> VEC128
   | Int | Addr | Val -> QWORD
   | Float32 -> REAL4
 
@@ -364,13 +365,23 @@ let record_frame_label live dbg =
   let live_offset = ref [] in
   Reg.Set.iter
     (function
-      | {typ = Val; loc = Reg r} ->
+      | {typ = Val; loc = Reg r} as reg ->
+          assert (Proc.gc_regs_offset reg = r);
           live_offset := ((r lsl 1) + 1) :: !live_offset
       | {typ = Val; loc = Stack s} as reg ->
           live_offset := slot_offset s (stack_slot_class reg.typ) :: !live_offset
+      | {typ = Valx2; loc = Reg r} as reg ->
+          let n = Proc.gc_regs_offset reg in
+          let encode n = ((n lsl 1) + 1) in
+          live_offset := encode n :: encode (n + 1) :: !live_offset
+      | {typ = Valx2; loc = Stack s} as reg ->
+          let n = slot_offset s (stack_slot_class reg.typ)  in
+          live_offset := n :: n + Arch.size_addr :: !live_offset
       | {typ = Addr} as r ->
           Misc.fatal_error ("bad GC root " ^ Reg.name r)
-      | _ -> ()
+      | { typ = (Val | Valx2); loc = Unknown ; } as r ->
+        Misc.fatal_error ("Unknown location " ^ Reg.name r)
+      | { typ = Int | Float | Float32 | Vec128; _ } -> ()
     )
     live;
   record_frame_descr ~label:lbl ~frame_size:(frame_size())
@@ -801,15 +812,15 @@ let move (src : Reg.t) (dst : Reg.t) =
   begin match src.typ, src.loc, dst.typ, dst.loc with
   | Float, Reg _, Float, Reg _
   | Float32, Reg _, Float32, Reg _
-  | Vec128, _, Vec128, _ (* Vec128 stack slots are always aligned. *) ->
+  | (Vec128 | Valx2), _, (Vec128 | Valx2), _ (* Vec128 stack slots are always aligned. *) ->
     if distinct then I.movapd (reg src) (reg dst)
   | Float, _, Float, _ ->
     if distinct then I.movsd (reg src) (reg dst)
   | Float32, _, Float32, _ ->
     if distinct then I.movss (reg src) (reg dst)
   | (Int | Val | Addr), _, (Int | Val | Addr), _ ->
     if distinct then I.mov (reg src) (reg dst)
-  | (Float | Float32 | Vec128 | Int | Val | Addr), _, _, _ ->
+  | (Float | Float32 | Vec128 | Int | Val | Addr | Valx2), _, _, _ ->
     Misc.fatal_errorf
       "Illegal move between registers of differing types (%a to %a)\n"
       Printreg.reg src Printreg.reg dst
@@ -823,7 +834,7 @@ let stack_to_stack_move (src : Reg.t) (dst : Reg.t) =
       (* Not calling move because r15 is not in int_reg_name. *)
       I.mov (reg src) r15;
       I.mov r15 (reg dst)
-    | Float | Addr | Vec128 | Float32 ->
+    | Float | Addr | Vec128 | Valx2 | Float32 ->
       Misc.fatal_errorf
         "Unexpected register type for stack to stack move: from %s to %s\n"
         (Reg.name src) (Reg.name dst)
@@ -1035,8 +1046,8 @@ let emit_static_cast (cast : Cmm.static_cast) i =
        CR mslater: (SIMD) don't load 32 bits once we have unboxed int16/int8 *)
     I.movd (arg32 i 0) (res i 0)
 
-let emit_simd_instr op i =
-  (match Simd_proc.register_behavior op with
+let check_simd_instr (register_behavior : Simd_proc.register_behavior) i =
+  (match register_behavior with
   | R_to_fst ->
     assert (Reg.same_loc i.arg.(0) i.res.(0));
     assert (Reg.is_reg i.arg.(0))
@@ -1076,6 +1087,23 @@ let emit_simd_instr op i =
     assert (Reg.is_reg i.arg.(0));
     assert (Reg.same_loc i.res.(0) (phys_xmm0v ()))
   );
+  ()
+
+let emit_simd_instr_with_memory_arg op i addressing_mode =
+  check_simd_instr (Simd_proc.Mem.register_behavior op) i;
+  let mem = addressing addressing_mode VEC128 i 1 in (* memory operand *)
+  match (op : Simd.Mem.operation) with
+  | SSE Add_f32 -> I.addps mem (res i 0)
+  | SSE Sub_f32 -> I.subps mem (res i 0)
+  | SSE Mul_f32 -> I.mulps mem (res i 0)
+  | SSE Div_f32 -> I.divps mem (res i 0)
+  | SSE2 Add_f64 -> I.addpd mem (res i 0)
+  | SSE2 Sub_f64 -> I.subpd mem (res i 0)
+  | SSE2 Mul_f64 -> I.mulpd mem (res i 0)
+  | SSE2 Div_f64 -> I.divpd mem (res i 0)
+
+let emit_simd_instr op i =
+  check_simd_instr (Simd_proc.register_behavior op) i;
   match (op : Simd.operation) with
   | CLMUL (Clmul_64 n) -> I.pclmulqdq (X86_dsl.int n) (arg i 1) (res i 0)
   | BMI2 Extract_64 -> I.pext (arg i 1) (arg i 0) (res i 0)
@@ -1715,6 +1743,8 @@ let emit_instr ~first ~fallthrough i =
     I.mfence ()
   | Lop (Specific (Isimd op)) ->
     emit_simd_instr op i
+  | Lop (Specific (Isimd_mem (op, addressing_mode))) ->
+    emit_simd_instr_with_memory_arg op i addressing_mode
   | Lop (Static_cast cast) ->
     emit_static_cast cast i
   | Lop (Reinterpret_cast cast) ->
@@ -2144,7 +2174,7 @@ let size_of_regs regs =
       | Float | Float32 ->
         (* Float32 slots still take up a full word *)
         acc + size_float
-      | Vec128 -> acc + size_vec128)
+      | Vec128 | Valx2 -> acc + size_vec128)
     regs 0
 
 let stack_locations ~offset regs =
@@ -2154,7 +2184,7 @@ let stack_locations ~offset regs =
       | Float | Float32 ->
         (* Float32 slots still take up a full word *)
         size_float
-      | Vec128 -> size_vec128 in
+      | Vec128 | Valx2 -> size_vec128 in
     next, (make_stack_loc n r ~offset :: offsets)) regs (0, []) in
   locs |> Array.of_list
 
@@ -2242,6 +2272,7 @@ let emit_probe_handler_wrapper p =
         (match r.typ with
         | Val -> k::acc
         | Int | Float | Vec128 | Float32 -> acc
+        | Valx2 -> k::k+Arch.size_addr::acc
         | Addr -> Misc.fatal_error ("bad GC root " ^ Reg.name r))
       | _ -> assert false)
     saved_live