Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ignore - for testing only (vectorizer on by default) #3459

Draft
wants to merge 26 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
ba86bbf
Cleanup size_component
gretay-js Jan 8, 2025
a7b4f08
Add tests
gretay-js Jan 8, 2025
aea3fe3
Add function [DLL.for_all_i]
gretay-js Jan 8, 2025
3adc20f
Runtime: make types explicit when reading [gc_regs].
gretay-js Jan 8, 2025
4984208
Do not allow naked pointers (remove configure option)
gretay-js Jan 8, 2025
5e1ee40
Add [dump-vectorize] to OCAMLPARAM for debugging
gretay-js Jan 8, 2025
8632bd8
Add [Printreg.reglist] for debugging
gretay-js Jan 8, 2025
afac808
Improve debug printouts
gretay-js Jan 8, 2025
2c23188
Refactor [Block.find_last_instruction], cache [Computation.last_pos]
gretay-js Jan 8, 2025
7efdbfa
Improve heuristics in [Computation.select_and_join] using [last_pos]
gretay-js Jan 8, 2025
fe8b458
Fix bug: use address arg of the first instruction in a group
gretay-js Jan 8, 2025
3e729c4
Rename New to New_vec128 to make the type clear
gretay-js Jan 8, 2025
cd3aeae
Add [Valx2] to [Cmm.machtype_component]
gretay-js Jan 8, 2025
1cdf09c
Vectorizer generates [Valx2]
gretay-js Jan 8, 2025
666674b
Record live offsets of [Valx2] in the frametable
gretay-js Jan 8, 2025
f7d96e5
Propagate alignment of memory accesses to simd_selection
gretay-js Jan 8, 2025
f8d6c3d
Fix bug: 128-bit vectorized constant high/low correctly ordered
gretay-js Jan 8, 2025
e5b4033
Vectorize [Specific.Istore_int] for array initialization
gretay-js Jan 8, 2025
3acc38e
Add [Isimd_mem] to [Arch.Specific] and emit [addpd] with memory arg
gretay-js Jan 8, 2025
9a9a52e
Vectorize [Ifloatarithmem]
gretay-js Jan 8, 2025
35e2c04
baseline: -regalloc cfg -cfg-cse-vectorize -cfgzero-alloc-checker
gretay-js Jan 10, 2025
d9a38eb
enable vectorizer
gretay-js Jan 10, 2025
b2510b7
Revert "Do not allow naked pointers (remove configure option)"
gretay-js Jan 10, 2025
aba576b
Fix live offset for Valx2, it's in bytes
gretay-js Jan 13, 2025
1f92f77
all_deps is reflexive
gretay-js Jan 13, 2025
488af6b
Merge remote-tracking branch 'upstream-flambda-backend/main' into tes…
gretay-js Jan 14, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 12 additions & 6 deletions backend/amd64/CSE.ml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,12 @@ open Arch
open Mach
open CSE_utils

(* Translate the operation class reported by [Simd] into the operation class
   used by CSE: pure operations stay pure, and loads become [Op_load] carrying
   the matching mutability flag. *)
let of_simd_class (cl : Simd.operation_class) =
  match cl with
  | Pure -> Op_pure
  | Load { is_mutable } ->
    (* The constructor argument type of [Op_load] disambiguates
       [Mutable]/[Immutable], exactly as in the two-arm formulation. *)
    Op_load (if is_mutable then Mutable else Immutable)

class cse = object

inherit CSEgen.cse_generic as super
Expand All @@ -37,9 +43,9 @@ method! class_of_operation op =
| Irdtsc | Irdpmc
| Ilfence | Isfence | Imfence -> Op_other
| Isimd op ->
begin match Simd.class_of_operation op with
| Pure -> Op_pure
end
of_simd_class (Simd.class_of_operation op)
| Isimd_mem (op,_addr) ->
of_simd_class (Simd.Mem.class_of_operation op)
| Ipause
| Icldemote _
| Iprefetch _ -> Op_other
Expand Down Expand Up @@ -81,9 +87,9 @@ class cfg_cse = object
| Irdtsc | Irdpmc
| Ilfence | Isfence | Imfence -> Op_other
| Isimd op ->
begin match Simd.class_of_operation op with
| Pure -> Op_pure
end
of_simd_class (Simd.class_of_operation op)
| Isimd_mem (op,_addr) ->
of_simd_class (Simd.Mem.class_of_operation op)
| Ipause
| Icldemote _
| Iprefetch _ -> Op_other
Expand Down
18 changes: 14 additions & 4 deletions backend/amd64/arch.ml
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,9 @@ type specific_operation =
| Imfence (* memory fence *)
| Ipause (* hint for spin-wait loops *)
| Isimd of Simd.operation (* SIMD instruction set operations *)
| Isimd_mem of Simd.Mem.operation * addressing_mode
(* SIMD instruction set operations
with memory args *)
| Icldemote of addressing_mode (* hint to demote a cacheline to L3 *)
| Iprefetch of (* memory prefetching hint *)
{ is_write: bool;
Expand Down Expand Up @@ -273,6 +276,8 @@ let print_specific_operation printreg op ppf arg =
fprintf ppf "rdpmc %a" printreg arg.(0)
| Isimd simd ->
Simd.print_operation printreg simd ppf arg
| Isimd_mem (simd, addr) ->
Simd.Mem.print_operation printreg (print_addressing printreg addr) simd ppf arg
| Ipause ->
fprintf ppf "pause"
| Icldemote _ ->
Expand All @@ -299,13 +304,14 @@ let operation_is_pure = function
| Istore_int (_, _, _) | Ioffset_loc (_, _)
| Icldemote _ | Iprefetch _ -> false
| Isimd op -> Simd.is_pure op
| Isimd_mem (op, _addr) -> Simd.Mem.is_pure op

(* Specific operations that can raise *)
(* Keep in sync with [Vectorize_specific] *)
let operation_can_raise = function
| Ilea _ | Ibswap _ | Isextend32 | Izextend32
| Ifloatarithmem _
| Irdtsc | Irdpmc | Ipause | Isimd _
| Irdtsc | Irdpmc | Ipause | Isimd _ | Isimd_mem _
| Ilfence | Isfence | Imfence
| Istore_int (_, _, _) | Ioffset_loc (_, _)
| Icldemote _ | Iprefetch _ -> false
Expand All @@ -314,7 +320,7 @@ let operation_can_raise = function
let operation_allocates = function
| Ilea _ | Ibswap _ | Isextend32 | Izextend32
| Ifloatarithmem _
| Irdtsc | Irdpmc | Ipause | Isimd _
| Irdtsc | Irdpmc | Ipause | Isimd _ | Isimd_mem _
| Ilfence | Isfence | Imfence
| Istore_int (_, _, _) | Ioffset_loc (_, _)
| Icldemote _ | Iprefetch _ -> false
Expand Down Expand Up @@ -405,9 +411,11 @@ let equal_specific_operation left right =
&& equal_addressing_mode left_addr right_addr
| Isimd l, Isimd r ->
Simd.equal_operation l r
| Isimd_mem (l,al), Isimd_mem (r,ar) ->
Simd.Mem.equal_operation l r && equal_addressing_mode al ar
| (Ilea _ | Istore_int _ | Ioffset_loc _ | Ifloatarithmem _ | Ibswap _ |
Isextend32 | Izextend32 | Irdtsc | Irdpmc | Ilfence | Isfence | Imfence |
Ipause | Isimd _ | Icldemote _ | Iprefetch _), _ ->
Ipause | Isimd _ | Isimd_mem _ | Icldemote _ | Iprefetch _), _ ->
false

(* addressing mode functions *)
Expand Down Expand Up @@ -512,7 +520,9 @@ let isomorphic_specific_operation op1 op2 =
&& equal_addressing_mode_without_displ left_addr right_addr
| Isimd l, Isimd r ->
Simd.equal_operation l r
| Isimd_mem (l,al), Isimd_mem (r,ar) ->
Simd.Mem.equal_operation l r && equal_addressing_mode_without_displ al ar
| (Ilea _ | Istore_int _ | Ioffset_loc _ | Ifloatarithmem _ | Ibswap _ |
Isextend32 | Izextend32 | Irdtsc | Irdpmc | Ilfence | Isfence | Imfence |
Ipause | Isimd _ | Icldemote _ | Iprefetch _), _ ->
Ipause | Isimd _ | Isimd_mem _ | Icldemote _ | Iprefetch _), _ ->
false
3 changes: 3 additions & 0 deletions backend/amd64/arch.mli
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,9 @@ type specific_operation =
| Imfence (* memory fence *)
| Ipause (* hint for spin-wait loops *)
| Isimd of Simd.operation (* SIMD instruction set operations *)
| Isimd_mem of Simd.Mem.operation * addressing_mode
(* SIMD instruction set operations
with memory args *)
| Icldemote of addressing_mode (* hint to demote a cacheline to L3 *)
| Iprefetch of (* memory prefetching hint *)
{ is_write: bool;
Expand Down
9 changes: 8 additions & 1 deletion backend/amd64/cfg_selection.ml
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,14 @@ let pseudoregs_for_operation op arg res =
edx (high) and eax (low). Make it simple and force the argument in rcx,
and rax and rdx clobbered *)
[| rcx |], res
| Specific (Isimd op) -> Simd_selection.pseudoregs_for_operation op arg res
| Specific (Isimd op) ->
Simd_selection.pseudoregs_for_operation
(Simd_proc.register_behavior op)
arg res
| Specific (Isimd_mem (op, _addr)) ->
Simd_selection.pseudoregs_for_operation
(Simd_proc.Mem.register_behavior op)
arg res
| Csel _ ->
(* last arg must be the same as res.(0) *)
let len = Array.length arg in
Expand Down
51 changes: 41 additions & 10 deletions backend/amd64/emit.mlp
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ let float_reg_name = Array.init 16 (fun i -> XMM i)
let register_name typ r =
match (typ : machtype_component) with
| Int | Val | Addr -> Reg64 (int_reg_name.(r))
| Float | Float32 | Vec128 -> Regf (float_reg_name.(r - 100))
| Float | Float32 | Vec128 | Valx2 -> Regf (float_reg_name.(r - 100))

let phys_rax = phys_reg Int 0
let phys_rdx = phys_reg Int 4
Expand Down Expand Up @@ -293,6 +293,7 @@ let emit_Llabel fallthrough lbl section_name =
(* Map the machine type of a register to the x86 operand data type used for
   its stack slot. Both 128-bit kinds ([Vec128] and [Valx2]) map to [VEC128];
   integer-like kinds map to [QWORD]. *)
let x86_data_type_for_stack_slot (typ : machtype_component) : data_type =
  match typ with
  | Int | Addr | Val -> QWORD
  | Float -> REAL8
  | Float32 -> REAL4
  | Vec128 | Valx2 -> VEC128

Expand Down Expand Up @@ -364,13 +365,23 @@ let record_frame_label live dbg =
let live_offset = ref [] in
Reg.Set.iter
(function
| {typ = Val; loc = Reg r} ->
| {typ = Val; loc = Reg r} as reg ->
assert (Proc.gc_regs_offset reg = r);
live_offset := ((r lsl 1) + 1) :: !live_offset
| {typ = Val; loc = Stack s} as reg ->
live_offset := slot_offset s (stack_slot_class reg.typ) :: !live_offset
| {typ = Valx2; loc = Reg r} as reg ->
let n = Proc.gc_regs_offset reg in
let encode n = ((n lsl 1) + 1) in
live_offset := encode n :: encode (n + 1) :: !live_offset
| {typ = Valx2; loc = Stack s} as reg ->
let n = slot_offset s (stack_slot_class reg.typ) in
live_offset := n :: n + Arch.size_addr :: !live_offset
| {typ = Addr} as r ->
Misc.fatal_error ("bad GC root " ^ Reg.name r)
| _ -> ()
| { typ = (Val | Valx2); loc = Unknown ; } as r ->
Misc.fatal_error ("Unknown location " ^ Reg.name r)
| { typ = Int | Float | Float32 | Vec128; _ } -> ()
)
live;
record_frame_descr ~label:lbl ~frame_size:(frame_size())
Expand Down Expand Up @@ -801,15 +812,15 @@ let move (src : Reg.t) (dst : Reg.t) =
begin match src.typ, src.loc, dst.typ, dst.loc with
| Float, Reg _, Float, Reg _
| Float32, Reg _, Float32, Reg _
| Vec128, _, Vec128, _ (* Vec128 stack slots are always aligned. *) ->
| (Vec128 | Valx2), _, (Vec128 | Valx2), _ (* Vec128 stack slots are always aligned. *) ->
if distinct then I.movapd (reg src) (reg dst)
| Float, _, Float, _ ->
if distinct then I.movsd (reg src) (reg dst)
| Float32, _, Float32, _ ->
if distinct then I.movss (reg src) (reg dst)
| (Int | Val | Addr), _, (Int | Val | Addr), _ ->
if distinct then I.mov (reg src) (reg dst)
| (Float | Float32 | Vec128 | Int | Val | Addr), _, _, _ ->
| (Float | Float32 | Vec128 | Int | Val | Addr | Valx2), _, _, _ ->
Misc.fatal_errorf
"Illegal move between registers of differing types (%a to %a)\n"
Printreg.reg src Printreg.reg dst
Expand All @@ -823,7 +834,7 @@ let stack_to_stack_move (src : Reg.t) (dst : Reg.t) =
(* Not calling move because r15 is not in int_reg_name. *)
I.mov (reg src) r15;
I.mov r15 (reg dst)
| Float | Addr | Vec128 | Float32 ->
| Float | Addr | Vec128 | Valx2 | Float32 ->
Misc.fatal_errorf
"Unexpected register type for stack to stack move: from %s to %s\n"
(Reg.name src) (Reg.name dst)
Expand Down Expand Up @@ -1035,8 +1046,8 @@ let emit_static_cast (cast : Cmm.static_cast) i =
CR mslater: (SIMD) don't load 32 bits once we have unboxed int16/int8 *)
I.movd (arg32 i 0) (res i 0)

let emit_simd_instr op i =
(match Simd_proc.register_behavior op with
let check_simd_instr (register_behavior : Simd_proc.register_behavior) i =
(match register_behavior with
| R_to_fst ->
assert (Reg.same_loc i.arg.(0) i.res.(0));
assert (Reg.is_reg i.arg.(0))
Expand Down Expand Up @@ -1076,6 +1087,23 @@ let emit_simd_instr op i =
assert (Reg.is_reg i.arg.(0));
assert (Reg.same_loc i.res.(0) (phys_xmm0v ()))
);
()

(* Emit a packed SIMD arithmetic instruction where one operand is read from
   memory.

   The register constraints of [op] are first validated with
   [check_simd_instr]. The memory operand is then built from
   [addressing_mode] with data type [VEC128] (NOTE(review): the trailing [1]
   is presumably the index of the first address argument in [i.arg] -
   confirm against [addressing]). Finally the selected instruction is
   emitted with the memory operand first and result 0 second. *)
let emit_simd_instr_with_memory_arg op i addressing_mode =
  check_simd_instr (Simd_proc.Mem.register_behavior op) i;
  let mem_operand = addressing addressing_mode VEC128 i 1 in
  let dst = res i 0 in
  (* Select the emitter; every case takes the same two operands. *)
  let emit =
    match (op : Simd.Mem.operation) with
    | SSE2 Add_f64 -> I.addpd
    | SSE2 Sub_f64 -> I.subpd
    | SSE2 Mul_f64 -> I.mulpd
    | SSE2 Div_f64 -> I.divpd
    | SSE Add_f32 -> I.addps
    | SSE Sub_f32 -> I.subps
    | SSE Mul_f32 -> I.mulps
    | SSE Div_f32 -> I.divps
  in
  emit mem_operand dst

let emit_simd_instr op i =
check_simd_instr (Simd_proc.register_behavior op) i;
match (op : Simd.operation) with
| CLMUL (Clmul_64 n) -> I.pclmulqdq (X86_dsl.int n) (arg i 1) (res i 0)
| BMI2 Extract_64 -> I.pext (arg i 1) (arg i 0) (res i 0)
Expand Down Expand Up @@ -1715,6 +1743,8 @@ let emit_instr ~first ~fallthrough i =
I.mfence ()
| Lop (Specific (Isimd op)) ->
emit_simd_instr op i
| Lop (Specific (Isimd_mem (op, addressing_mode))) ->
emit_simd_instr_with_memory_arg op i addressing_mode
| Lop (Static_cast cast) ->
emit_static_cast cast i
| Lop (Reinterpret_cast cast) ->
Expand Down Expand Up @@ -2144,7 +2174,7 @@ let size_of_regs regs =
| Float | Float32 ->
(* Float32 slots still take up a full word *)
acc + size_float
| Vec128 -> acc + size_vec128)
| Vec128 | Valx2 -> acc + size_vec128)
regs 0

let stack_locations ~offset regs =
Expand All @@ -2154,7 +2184,7 @@ let stack_locations ~offset regs =
| Float | Float32 ->
(* Float32 slots still take up a full word *)
size_float
| Vec128 -> size_vec128 in
| Vec128 | Valx2 -> size_vec128 in
next, (make_stack_loc n r ~offset :: offsets)) regs (0, []) in
locs |> Array.of_list

Expand Down Expand Up @@ -2242,6 +2272,7 @@ let emit_probe_handler_wrapper p =
(match r.typ with
| Val -> k::acc
| Int | Float | Vec128 | Float32 -> acc
| Valx2 -> k::k+Arch.size_addr::acc
| Addr -> Misc.fatal_error ("bad GC root " ^ Reg.name r))
| _ -> assert false)
saved_live
Expand Down
Loading
Loading