From 8987c688b885ea7e4bc442f30d210d53e8685f91 Mon Sep 17 00:00:00 2001 From: Krste Asanovic Date: Fri, 1 Feb 2019 02:18:49 -0800 Subject: [PATCH] Checkpoint. --- inst-table.adoc | 243 +++++++++++++++++++++++++---------------------- v-spec.adoc | 198 ++++++++++++++++++-------------------- valu-format.adoc | 14 +++ vamo-format.adoc | 7 ++ vcfg-format.adoc | 9 ++ 5 files changed, 254 insertions(+), 217 deletions(-) create mode 100644 valu-format.adoc create mode 100644 vamo-format.adoc create mode 100644 vcfg-format.adoc diff --git a/inst-table.adoc b/inst-table.adoc index c01243ac..45291a1b 100644 --- a/inst-table.adoc +++ b/inst-table.adoc @@ -1,99 +1,14 @@ -.Vector Unit-Stride Load/Store Instruction Listing -[width="100%",cols="^3,^3,^3,^4,^4,^3,^3,^3,^7,<10"] -|======================== -|31 30 |29 27 |26 25 |24 20 |19 15 |14 12 |11 10 |9 7 |6 0 |Opcode - -|off[1:0]|000|vm|00000|rs1|000 2+|vd|0000111|VLBU.V -|off[1:0]|000|vm|00000|rs1|101 2+|vd|0000111|VLHU.V -|off[1:0]|000|vm|00000|rs1|110 2+|vd|0000111|VLWU.V -|off[1:0]|000|vm|00000|rs1|111 2+|vd|0000111|VLE.V -|off[1:0]|100|vm|00000|rs1|000 2+|vd|0000111|VLB.V -|off[1:0]|100|vm|00000|rs1|101 2+|vd|0000111|VLH.V -|off[1:0]|100|vm|00000|rs1|110 2+|vd|0000111|VLW.V -2+|vs3|vm|00000|rs1|000|off[1:0]|000|0100111|VSB.V -2+|vs3|vm|00000|rs1|101|off[1:0]|000|0100111|VSH.V -2+|vs3|vm|00000|rs1|110|off[1:0]|000|0100111|VSW.V -2+|vs3|vm|00000|rs1|111|off[1:0]|000|0100111|VSE.V -|======================== - - -.Vector Unit-Stride Fault-First Load Instruction Listing -[width="100%",cols="^3,^3,^3,^4,^4,^3,^3,^3,^7,<10"] -|======================== -|31 30 |29 27 |26 25 |24 20 |19 15 |14 12 |11 10 |9 7 |6 0 |Opcode - -|off[1:0]|000|vm|10000|rs1|000 2+|vd|0000111|VLBUFF.V -|off[1:0]|000|vm|10000|rs1|101 2+|vd|0000111|VLHUFF.V -|off[1:0]|000|vm|10000|rs1|110 2+|vd|0000111|VLWUFF.V -|off[1:0]|000|vm|10000|rs1|111 2+|vd|0000111|VLEFF.V -|off[1:0]|100|vm|10000|rs1|000 2+|vd|0000111|VLBFF.V -|off[1:0]|100|vm|10000|rs1|101 
2+|vd|0000111|VLHFF.V -|off[1:0]|100|vm|10000|rs1|110 2+|vd|0000111|VLWFF.V - -|======================== - -.Vector Strided Load/Store Instruction Listing -[width="100%",cols="^3,^3,^3,^4,^4,^3,^3,^3,^7,<10"] -|======================== -|31 30 |29 27 |26 25 |24 20 |19 15 |14 12 |11 10 |9 7 |6 0 |Opcode - -|off[1:0]|010|vm|rs2|rs1|000 2+|vd|0000111|VLSBU.V -|off[1:0]|010|vm|rs2|rs1|101 2+|vd|0000111|VLSHU.V -|off[1:0]|010|vm|rs2|rs1|110 2+|vd|0000111|VLSWU.V -|off[1:0]|010|vm|rs2|rs1|111 2+|vd|0000111|VLSE.V -|off[1:0]|110|vm|rs2|rs1|000 2+|vd|0000111|VLSB.V -|off[1:0]|110|vm|rs2|rs1|101 2+|vd|0000111|VLSH.V -|off[1:0]|110|vm|rs2|rs1|110 2+|vd|0000111|VLSW.V -2+|vs3|vm|rs2|rs1|000|off[1:0]|010|0100111|VSSB.V -2+|vs3|vm|rs2|rs1|101|off[1:0]|010|0100111|VSSH.V -2+|vs3|vm|rs2|rs1|110|off[1:0]|010|0100111|VSSW.V -2+|vs3|vm|rs2|rs1|111|off[1:0]|010|0100111|VSSE.V - -|======================== - - -.Vector Indexed Load/Store Instruction Listing -[width="100%",cols="^3,^3,^3,^4,^4,^3,^3,^3,^7,<10"] -|======================== -|31 30 |29 27 |26 25 |24 20 |19 15 |14 12 |11 10 |9 7 |6 0 |Opcode - -|off[1:0]|011|vm|vs2|rs1|000 2+|vd|0000111|VLXBU.V -|off[1:0]|011|vm|vs2|rs1|101 2+|vd|0000111|VLXHU.V -|off[1:0]|011|vm|vs2|rs1|110 2+|vd|0000111|VLXWU.V -|off[1:0]|011|vm|vs2|rs1|111 2+|vd|0000111|VLXE.V -|off[1:0]|111|vm|vs2|rs1|000 2+|vd|0000111|VLXB.V -|off[1:0]|111|vm|vs2|rs1|101 2+|vd|0000111|VLXH.V -|off[1:0]|111|vm|vs2|rs1|110 2+|vd|0000111|VLXW.V -2+|vs3|vm|vs2|rs1|000|off[1:0]|011|0100111|VSXB.V -2+|vs3|vm|vs2|rs1|101|off[1:0]|011|0100111|VSXH.V -2+|vs3|vm|vs2|rs1|110|off[1:0]|011|0100111|VSXW.V -2+|vs3|vm|vs2|rs1|111|off[1:0]|011|0100111|VSXE.V -2+|vs3|vm|vs2|rs1|000|off[1:0]|111|0100111|VSUXB.V -2+|vs3|vm|vs2|rs1|101|off[1:0]|111|0100111|VSUXH.V -2+|vs3|vm|vs2|rs1|110|off[1:0]|111|0100111|VSUXW.V -2+|vs3|vm|vs2|rs1|111|off[1:0]|111|0100111|VSUXE.V - -|======================== - - -.Vector Table -[width="100%",cols="^3,^3,^3,^4,^4,^3,^3,^3,^7,<10"] 
-|======================== -2+|31 27|26 25 |24 20 |19 15 |14 12 2+|11 7 |6 0 |Opcode - -2+|00000|vm |vs2 |vs1 |001 2+|vd |1010111|VADD.VV -2+|00000|vm |vs2 |vs1 |000 2+|vd |1010111|VADD.VS -2+|00000|vm |vs2 |rs1 |010 2+|vd |1010111|VADD.VX -2+|00000|vm |vs2 |simm[4:0]|011 2+|vd |1010111|VADD.VI -|======================== - +[source] +---- Integer Integer FP -OPIVV V OPMVV V OPFVV V +funct3 funct3 funct3 +OPIVV V OPMVV V OPFVV V OPIVX X OPMVX X OPFVF F OPIVI I +funct6 funct6 funct6 000000 VXI vadd 000000 V vredsum 000000 VF vfadd 000001 VX vsub 000001 V vredand 000001 VF vfsub 000010 000010 V vredor 000010 V vfredsum @@ -136,32 +51,35 @@ OPIVI I 100101 VXI vsle 100101 100101 F vfgt 100110 XI vsgtu 100110 100110 100111 XI vsgt 100111 100111 F vfgte -101000 VXI vadc 101000 X vext.x.v 101000 V vfdot +101000 VXI vadc 101000 X vext.x.v 101000 101001 VXI vsbc 101001 X vins.v.x 101001 101010 101010 V vmv.x.s 101010 V vfmv.f.s 101011 101011 X vmv.s.x 101011 F vfmv.s.f -101100 VX vmulhu 101100 101100 -101101 VX vmul 101101 101101 -101110 VX vmulhsu 101110 101110 -101111 VX vmulh 101111 101111 - -110000 110000 VX vdivu 110000 VF vfmadd -110001 VX vmadd 110001 VX vdiv 110001 VF vfnmadd -110010 110010 VX vremu 110010 VF vfmsub -110011 VX vmsub 110011 VX vrem 110011 VF vfnmsub -110100 110100 110100 VF vfmacc -110101 VX vmacc 110101 V vdot 110101 VF vfnmacc -110110 110110 110110 VF vfmsac -110111 VX vmsac 110111 110111 VF vfnmsac -111000 VX vwmulu 111000 111000 VF vfwmul -111001 111001 111001 +101100 101100 101100 +101101 101101 101101 +101110 101110 101110 +101111 101111 101111 + +110000 VX vmulhu 110000 110000 VF vfmadd +110001 VX vmul 110001 VX vmadd 110001 VF vfnmadd +110010 VX vmulhsu 110010 110010 VF vfmsub +110011 VX vmulh 110011 VX vmsub 110011 VF vfnmsub +110100 VX vdivu 110100 110100 VF vfmacc +110101 VX vdiv 110101 VX vmacc 110101 VF vfnmacc +110110 VX vremu 110110 110110 VF vfmsac +110111 VX vrem 110111 VX vmsac 110111 VF vfnmsac +111000 VX vwmulu 111000 V vdotu 
111000 VF vfwmul +111001 111001 V vdot 111001 V vfdot 111010 VX vwmulsu 111010 111010 -111011 VX vwmul 111011 111011 V vfwdot -111100 VX vwmaccu 111100 V vwdotu 111100 VF vfwmacc -111101 VX vwmacc 111101 V vwdot 111101 VF vfwnmacc -111110 VX vwmsacu 111110 V vw4dotu 111110 VF vfwmsac -111111 VX vwmsac 111111 V vw4dot 111111 VF vfwnmsac - +111011 VX vwmul 111011 111011 +111100 VX vwmaccu 111100 111100 VF vfwmacc +111101 VX vwmacc 111101 111101 VF vfwnmacc +111110 VX vwmsacu 111110 111110 VF vfwmsac +111111 VX vwmsac 111111 111111 VF vfwnmsac +---- + +[source] +---- VFUNARY0 encoding space rs1 single-width converts @@ -183,12 +101,18 @@ VFUNARY0 encoding space 10010 vfncvt.f.xu.v 10011 vfncvt.f.x.v 10100 vfncvt.f.f.v +---- +[source] +---- VFUNARY1 encoding space rs1 00000 vfsqrt.v 10000 vfclass.v +---- +[source] +---- VMUNARY0 encoding space rs1 00001 vmsbf @@ -196,8 +120,91 @@ VMUNARY0 encoding space 00011 vmsif 10000 vmiota 10001 vid +---- +//// +.Vector Unit-Stride Load/Store Instruction Listing +[width="100%",cols="^3,^3,^3,^4,^4,^3,^3,^3,^7,<10"] +|======================== +|31 30 |29 27 |26 25 |24 20 |19 15 |14 12 |11 10 |9 7 |6 0 |Opcode + +|off[1:0]|000|vm|00000|rs1|000 2+|vd|0000111|VLBU.V +|off[1:0]|000|vm|00000|rs1|101 2+|vd|0000111|VLHU.V +|off[1:0]|000|vm|00000|rs1|110 2+|vd|0000111|VLWU.V +|off[1:0]|000|vm|00000|rs1|111 2+|vd|0000111|VLE.V +|off[1:0]|100|vm|00000|rs1|000 2+|vd|0000111|VLB.V +|off[1:0]|100|vm|00000|rs1|101 2+|vd|0000111|VLH.V +|off[1:0]|100|vm|00000|rs1|110 2+|vd|0000111|VLW.V +2+|vs3|vm|00000|rs1|000|off[1:0]|000|0100111|VSB.V +2+|vs3|vm|00000|rs1|101|off[1:0]|000|0100111|VSH.V +2+|vs3|vm|00000|rs1|110|off[1:0]|000|0100111|VSW.V +2+|vs3|vm|00000|rs1|111|off[1:0]|000|0100111|VSE.V +|======================== + + +.Vector Unit-Stride Fault-First Load Instruction Listing +[width="100%",cols="^3,^3,^3,^4,^4,^3,^3,^3,^7,<10"] +|======================== +|31 30 |29 27 |26 25 |24 20 |19 15 |14 12 |11 10 |9 7 |6 0 |Opcode + 
+|off[1:0]|000|vm|10000|rs1|000 2+|vd|0000111|VLBUFF.V +|off[1:0]|000|vm|10000|rs1|101 2+|vd|0000111|VLHUFF.V +|off[1:0]|000|vm|10000|rs1|110 2+|vd|0000111|VLWUFF.V +|off[1:0]|000|vm|10000|rs1|111 2+|vd|0000111|VLEFF.V +|off[1:0]|100|vm|10000|rs1|000 2+|vd|0000111|VLBFF.V +|off[1:0]|100|vm|10000|rs1|101 2+|vd|0000111|VLHFF.V +|off[1:0]|100|vm|10000|rs1|110 2+|vd|0000111|VLWFF.V + +|======================== + +.Vector Strided Load/Store Instruction Listing +[width="100%",cols="^3,^3,^3,^4,^4,^3,^3,^3,^7,<10"] +|======================== +|31 30 |29 27 |26 25 |24 20 |19 15 |14 12 |11 10 |9 7 |6 0 |Opcode + +|off[1:0]|010|vm|rs2|rs1|000 2+|vd|0000111|VLSBU.V +|off[1:0]|010|vm|rs2|rs1|101 2+|vd|0000111|VLSHU.V +|off[1:0]|010|vm|rs2|rs1|110 2+|vd|0000111|VLSWU.V +|off[1:0]|010|vm|rs2|rs1|111 2+|vd|0000111|VLSE.V +|off[1:0]|110|vm|rs2|rs1|000 2+|vd|0000111|VLSB.V +|off[1:0]|110|vm|rs2|rs1|101 2+|vd|0000111|VLSH.V +|off[1:0]|110|vm|rs2|rs1|110 2+|vd|0000111|VLSW.V +2+|vs3|vm|rs2|rs1|000|off[1:0]|010|0100111|VSSB.V +2+|vs3|vm|rs2|rs1|101|off[1:0]|010|0100111|VSSH.V +2+|vs3|vm|rs2|rs1|110|off[1:0]|010|0100111|VSSW.V +2+|vs3|vm|rs2|rs1|111|off[1:0]|010|0100111|VSSE.V + +|======================== + + +.Vector Indexed Load/Store Instruction Listing +[width="100%",cols="^3,^3,^3,^4,^4,^3,^3,^3,^7,<10"] +|======================== +|31 30 |29 27 |26 25 |24 20 |19 15 |14 12 |11 10 |9 7 |6 0 |Opcode + +|off[1:0]|011|vm|vs2|rs1|000 2+|vd|0000111|VLXBU.V +|off[1:0]|011|vm|vs2|rs1|101 2+|vd|0000111|VLXHU.V +|off[1:0]|011|vm|vs2|rs1|110 2+|vd|0000111|VLXWU.V +|off[1:0]|011|vm|vs2|rs1|111 2+|vd|0000111|VLXE.V +|off[1:0]|111|vm|vs2|rs1|000 2+|vd|0000111|VLXB.V +|off[1:0]|111|vm|vs2|rs1|101 2+|vd|0000111|VLXH.V +|off[1:0]|111|vm|vs2|rs1|110 2+|vd|0000111|VLXW.V +2+|vs3|vm|vs2|rs1|000|off[1:0]|011|0100111|VSXB.V +2+|vs3|vm|vs2|rs1|101|off[1:0]|011|0100111|VSXH.V +2+|vs3|vm|vs2|rs1|110|off[1:0]|011|0100111|VSXW.V +2+|vs3|vm|vs2|rs1|111|off[1:0]|011|0100111|VSXE.V 
+2+|vs3|vm|vs2|rs1|000|off[1:0]|111|0100111|VSUXB.V +2+|vs3|vm|vs2|rs1|101|off[1:0]|111|0100111|VSUXH.V +2+|vs3|vm|vs2|rs1|110|off[1:0]|111|0100111|VSUXW.V +2+|vs3|vm|vs2|rs1|111|off[1:0]|111|0100111|VSUXE.V + +|======================== +//// + + + +//// X vsgteu X vsgte @@ -207,6 +214,17 @@ vx4muladd vx4mulsub +.Vector Table +[width="100%",cols="^3,^3,^3,^4,^4,^3,^3,^3,^7,<10"] +|======================== +2+|31 27|26 25 |24 20 |19 15 |14 12 2+|11 7 |6 0 |Opcode + +2+|00000|vm |vs2 |vs1 |001 2+|vd |1010111|VADD.VV +2+|00000|vm |vs2 |vs1 |000 2+|vd |1010111|VADD.VS +2+|00000|vm |vs2 |rs1 |010 2+|vd |1010111|VADD.VX +2+|00000|vm |vs2 |simm[4:0]|011 2+|vd |1010111|VADD.VI +|======================== + .Vector Table [width="100%",cols="^3,^3,^3,^4,^4,^3,^3,^3,^7,<10"] |======================== @@ -398,3 +416,4 @@ vx4mulsub 2+|vs3|vm|vs2|vs1|110 2+|vd|1001111|VMSUBW.VVS |======================== +//// \ No newline at end of file diff --git a/v-spec.adoc b/v-spec.adoc index c9132731..46269c8e 100644 --- a/v-spec.adoc +++ b/v-spec.adoc @@ -1030,18 +1030,14 @@ XLEN-1:9 Reserved (write 0) vlmul2 # Vlmul x2 vlmul4 # Vlmul x4 vlmul8 # Vlmul x8 +---- +//// vlmul1max # Vlmul x1 max vlmul2max # Vlmul x2 max vlmul4max # Vlmul x4 max vlmul8max # Vlmul x8 max ----- - -NOTE: The immediate argument `vtypei` can be a compressed form of the -full vtype setting, capturing the most common use cases. For the base -proposed here, it is assumed that at least four bits of immediate are -available to write all standard values of `vtype` (`vsew[2:0]` and -`vlmul[1:0]`). +//// The `vtype` setting must be supported by the implementation, and the `vsetvl{i}` instructions will raise an illegal instruction exception @@ -1095,6 +1091,7 @@ This allows software to avoid needing to explicitly calculate a running maximum of vector lengths observed during a stripmined loop. 
-- +//// === Rules for `vlmul` Due to the striping of elements across multiple vector registers, @@ -1121,6 +1118,7 @@ Software sets `vlmulmax` according to how many register groups are used by the code. Software must not use any vector register numbers that would be illegal if the largest value of `vlmulmax` is chosen for `vlmul`. +//// === `vsetvl` Instruction @@ -1144,7 +1142,7 @@ throughput on mixed-width operations in a single loop. # Loop using only widest elements: loop: - vsetvli a3, a0, vsew32,vlmul8max # Use only 32-bit elements + vsetvli a3, a0, vsew32,vlmul8 # Use only 32-bit elements vlh.v v8, (a1) # Sign-extend 16b load values to 32b elements sll t1, a3, 1 add a1, a1, t1 # Bump pointer @@ -1179,6 +1177,7 @@ machines where 16b widening multiplies are faster than 32b integer multiplies, and where 16b vector load can run faster due to the narrower writes to the vector regfile. +//// NOTE: The `vlmul8max` version is used in the single-width loop to reduce LMUL for shorter application vector lengths. The mixed-width loop has to use a fixed SEW=16/LMULMAX=4 and the SEW=32/LMULMAX=8 @@ -1190,6 +1189,7 @@ within the vector registers would not line up correctly. NOTE: It should be possible to specify a different form of vlmulmax that specifies the range of element widths to be used in overall loop to set LMUL correctly. TBD. +//// == Vector Loads and Stores @@ -1394,64 +1394,6 @@ address of the vector memory access. vse.v vs3, offset(rs1), vm # SEW store ---- -[[sec-aos]] -=== Vector Array-of-Structures Load/Store Instructions - -NOTE: This is being considered as a possible extension. - -These instructions move a packed contiguous array of structures into -multiple destination vector registers. The low three bits of `*umop` -indicate the number of memory fields in a structure from 1-8. The -width encoding gives the size of the memory fields, which must be -homogeneous. 
- -NOTE: Array-of-structs memory instruction with structure size of 1 is -identical to a unit-stride instruction with LMUL=1. - -NOTE: Software can further unpack fields within a structure into -separate vector registers. - -The assembler prefix `vlaos`/`vsaos` is used for loads and stores -respectively. - -[source,asm] ----- - vlaos2b.v vd, offset(rs1), vm # Load vector of 2*1-byte structure into destination vd, vd+1 - vlaos3b.v vd, offset(rs1), vm # Load vector of 3*1-byte structure into destination vd, vd+1 - vlaos7w.v vd, offset(rs1), vm # Load vector of 7*4-byte structure into destination vd, vd+1, ... vd+6 - vlaos8e.v vd, offset(rs1), vm # Load vector of 8*SEW-byte structure into destination vd, vd+1, .. vd+7 - - vsaos3b.v vs3, offset(rs1), vm ----- - -[source] ----- - # Example 1 - # Memory structure holds packed RGB pixels (24-bit data structure, 8bpp) - vlaos3b.v v8, (a0), vm - # v8 holds the red pixels - # v9 holds the green pixels - # v10 holds the blue pixels - - # Example 2 - # Memory structure holds complex values, 32b for real and 32b for imaginary - vlaos2w.v v8, (a0), vm - # v8 holds real - # v9 holds imaginary ----- - -For loads, the `vd` register is the first register in the group to be -loaded. For stores, the `vs3` register is the first register to be -stored. - -LMUL is ignored and these instructions load each vector register as if -LMUL=1. - -The `vl` register gives the number of structures to move, which is -equal to the number of elements tranferred to each vector register. - -If a trap is taken, `vstart` is in units of structures. - === Vector Strided Instructions [source,asm] @@ -1568,6 +1510,66 @@ security. It is possible that security mitigations can be implemented to allow first-fault variants of non-contiguous accesses in future vector extensions. +[[sec-aos]] +=== Vector Array-of-Structures Load/Store Instructions + +NOTE: This is being considered as a possible extension. 
+ +These instructions move a packed contiguous array of structures into +multiple destination vector registers. The low three bits of `*umop` +indicate the number of memory fields in a structure from 1-8. The +width encoding gives the size of the memory fields, which must be +homogeneous. + +NOTE: An array-of-structs memory instruction with structure size of 1 +is identical to a unit-stride instruction with LMUL=1. + +NOTE: Software can further unpack fields within a structure into +separate vector registers. + +The assembler prefix `vlaos`/`vsaos` is used for loads and stores +respectively. + +[source,asm] +---- + vlaos2b.v vd, offset(rs1), vm # Load vector of 2*1-byte structure into destination vd, vd+1 + vlaos3b.v vd, offset(rs1), vm # Load vector of 3*1-byte structure into destination vd, vd+1, vd+2 + vlaos7w.v vd, offset(rs1), vm # Load vector of 7*4-byte structure into destination vd, vd+1, ... vd+6 + vlaos8e.v vd, offset(rs1), vm # Load vector of 8*SEW-byte structure into destination vd, vd+1, ... vd+7 + + vsaos3b.v vs3, offset(rs1), vm +---- + +[source] +---- + # Example 1 + # Memory structure holds packed RGB pixels (24-bit data structure, 8bpp) + vlaos3b.v v8, (a0), vm + # v8 holds the red pixels + # v9 holds the green pixels + # v10 holds the blue pixels + + # Example 2 + # Memory structure holds complex values, 32b for real and 32b for imaginary + vlaos2w.v v8, (a0), vm + # v8 holds real + # v9 holds imaginary +---- + +For loads, the `vd` register is the first register in the group to be +loaded. For stores, the `vs3` register is the first register to be +stored. + +LMUL is ignored and these instructions load each vector register as if +LMUL=1. + +The `vl` register gives the number of structures to move, which is +equal to the number of elements transferred to each vector register. + +If a trap is taken, `vstart` is in units of structures. + +There are also first-fault versions of these instructions. 
+ == Vector AMO Operations NOTE: Profiles will dictate whether vector AMO operations are @@ -1582,13 +1584,10 @@ Vector AMO operations are encoded using the unused width encodings under the standard AMO major opcode. Each active element performs an atomic read-modify-write of a single memory location. +include::vamo-format.adoc[] + [source] ---- -Format for Vector AMO Instructions -31 27 26 25 24 20 19 15 14 12 11 7 6 0 - amoop |wd| vm | vs2 | rs1 | width | vs3/vd |0101111| VAMO* - 5 1 1 5 5 3 5 7 - vs2[4:0] specifies v register holding address vs3/vd[4:0] specifies v register holding source operand and destination @@ -3835,60 +3834,49 @@ interpreted as relative to the enclosing element only. Index values NOTE: Implementations can provide fast implementations of register gathers constrained within a single element width. -=== Vector Single-Width Integer Dot-Product Instructions +=== Vector Integer Dot-Product Instruction The dot-product reduction `vdot.vv` performs an element-wise -multiplication between the source vectors then accumulates the result -into the destination vector register. Note the assembler syntax uses -a `.vv` suffix since both inputs are vectors of elements. +multiplication between the source sub-elements then accumulates the +result into the full-width destination vector element. Note the +assembler syntax uses a `.vv` suffix since both inputs are vectors of +elements. [source] ---- - # Dot product - vdot.vv vd, vs2, vs1, vm # vd[0] += sum_i( vs2[i] * vs1[i] ) ----- +# Signed dot-product +vdot.vv vd, vs2, vs1, vm # Vector-vector -=== Vector Widening Integer Dot-Product Instructions - -The `vwdotu.vv` instruction calculates an unsigned dot-product of -the input vectors and stores the result in a double-width element 0 of -the vector destination register. - -The dot-product instruction can perform an unordered summation, but -the result must correspond to some sequential ordering of the -individual add operations. 
+# Unsigned dot-product +vdotu.vv vd, vs2, vs1, vm # Vector-vector +---- [source] ---- - # Unsigned dot product into double-width result - vwdotu.vv vd, vs2, vs1, vm # 2*SEW = sum_i(SEW[i]*SEW[i]) + # Dot product, SEW=32, EDIV=1 + vdot.vv vd, vs2, vs1, vm # vd[i][31:0] += vs2[i][31:0] * vs1[i][31:0] + + # Dot product, SEW=32, EDIV=2 + vdot.vv vd, vs2, vs1, vm # vd[i][31:0] += vs2[i][31:16] * vs1[i][31:16] + + vs2[i][15:0] * vs1[i][15:0] - # Signed dot product into double-width result - vwdot.vv vd, vs2, vs1, vm # 2*SEW = sum_i(SEW[i]*SEW[i]) ---- -The `vw4dotu.vv` and `vw4dot.vv` instructions uses a quad-width -vector element for the accumulator value in the dot-product. +=== Vector Floating-Point Dot Product Instruction [source] ---- - # Unsigned dot product. - vw4dotu.vv vd, vs2, vs1, vm # 4*SEW += sum(SEW*SEW) - - # Signed dot product. - vw4dot.vv vd, vs2, vs1, vm # 4*SEW += sum(SEW*SEW) +# Signed dot-product +vfdot.vv vd, vs2, vs1, vm # Vector-vector ---- -=== Vector Floating-Point Dot Product - [source] ---- - # Dot product. - vfdot.v vd, vs2, vs1, vm # vd[0] += sum_i(vs2[i] * vs1[i]) - - # Widening dot product. - vfwdot.v vd, vs2, vs1, vm # 2*SEW += sum(SEW*SEW) + # Dot product. 
SEW=32, EDIV=2 + vfdot.vv vd, vs2, vs1, vm # vd[i][31:0] += vs2[i][31:16] * vs1[i][31:16] + vs2[i][15:0] * vs1[i][15:0] ---- + == Vector Instruction Listing include::inst-table.adoc[] diff --git a/valu-format.adoc b/valu-format.adoc new file mode 100644 index 00000000..5476c5ad --- /dev/null +++ b/valu-format.adoc @@ -0,0 +1,14 @@ +[source] +---- +Formats for Vector Arithmetic Instructions under OP-V major opcode + +31 26 25 24 20 19 15 14 12 11 7 6 0 + funct6 | vm | vs2 | vs1 | 0 0 0 | vd |1010111| OP-V (OPIVV) + funct6 | vm | vs2 | vs1 | 0 0 1 | vd |1010111| OP-V (OPFVV) + funct6 | vm | vs2 | vs1 | 0 1 0 | vd/rd |1010111| OP-V (OPMVV) + funct6 | vm | vs2 | simm5 | 0 1 1 | vd |1010111| OP-V (OPIVI) + funct6 | vm | vs2 | rs1 | 1 0 0 | vd |1010111| OP-V (OPIVX) + funct6 | vm | vs2 | rs1 | 1 0 1 | vd |1010111| OP-V (OPFVF) + funct6 | vm | vs2 | rs1 | 1 1 0 | vd/rd |1010111| OP-V (OPMVX) + 6 1 5 5 3 5 7 +---- diff --git a/vamo-format.adoc b/vamo-format.adoc new file mode 100644 index 00000000..325a6c95 --- /dev/null +++ b/vamo-format.adoc @@ -0,0 +1,7 @@ +[source] +---- +Format for Vector AMO Instructions under AMO major opcode +31 27 26 25 24 20 19 15 14 12 11 7 6 0 + amoop |wd| vm | vs2 | rs1 | width | vs3/vd |0101111| VAMO* + 5 1 1 5 5 3 5 7 +---- diff --git a/vcfg-format.adoc b/vcfg-format.adoc new file mode 100644 index 00000000..e06a7a11 --- /dev/null +++ b/vcfg-format.adoc @@ -0,0 +1,9 @@ +[source] +---- +Formats for Vector Configuration Instructions + + 31 30 25 24 20 19 15 14 12 11 7 6 0 + 0 | zimm[10:0] | rs1 | 1 1 1 | rd |1010111| vsetvli + 1 | 000000 | rs2 | rs1 | 1 1 1 | rd |1010111| vsetvl + 1 6 5 5 3 5 7 +----