From 8987c688b885ea7e4bc442f30d210d53e8685f91 Mon Sep 17 00:00:00 2001
From: Krste Asanovic <krste@eecs.berkeley.edu>
Date: Fri, 1 Feb 2019 02:18:49 -0800
Subject: [PATCH] Checkpoint.

---
 inst-table.adoc  | 243 +++++++++++++++++++++++++----------------------
 v-spec.adoc      | 198 ++++++++++++++++++--------------------
 valu-format.adoc |  14 +++
 vamo-format.adoc |   7 ++
 vcfg-format.adoc |   9 ++
 5 files changed, 254 insertions(+), 217 deletions(-)
 create mode 100644 valu-format.adoc
 create mode 100644 vamo-format.adoc
 create mode 100644 vcfg-format.adoc

diff --git a/inst-table.adoc b/inst-table.adoc
index c01243ac..45291a1b 100644
--- a/inst-table.adoc
+++ b/inst-table.adoc
@@ -1,99 +1,14 @@
-.Vector Unit-Stride Load/Store Instruction Listing
-[width="100%",cols="^3,^3,^3,^4,^4,^3,^3,^3,^7,<10"]
-|========================
-|31 30 |29 27 |26 25 |24  20 |19    15 |14  12 |11 10 |9 7 |6  0 |Opcode
-
-|off[1:0]|000|vm|00000|rs1|000 2+|vd|0000111|VLBU.V
-|off[1:0]|000|vm|00000|rs1|101 2+|vd|0000111|VLHU.V
-|off[1:0]|000|vm|00000|rs1|110 2+|vd|0000111|VLWU.V
-|off[1:0]|000|vm|00000|rs1|111 2+|vd|0000111|VLE.V
-|off[1:0]|100|vm|00000|rs1|000 2+|vd|0000111|VLB.V
-|off[1:0]|100|vm|00000|rs1|101 2+|vd|0000111|VLH.V
-|off[1:0]|100|vm|00000|rs1|110 2+|vd|0000111|VLW.V
-2+|vs3|vm|00000|rs1|000|off[1:0]|000|0100111|VSB.V
-2+|vs3|vm|00000|rs1|101|off[1:0]|000|0100111|VSH.V
-2+|vs3|vm|00000|rs1|110|off[1:0]|000|0100111|VSW.V
-2+|vs3|vm|00000|rs1|111|off[1:0]|000|0100111|VSE.V
-|========================
-
-
-.Vector Unit-Stride Fault-First Load Instruction Listing
-[width="100%",cols="^3,^3,^3,^4,^4,^3,^3,^3,^7,<10"]
-|========================
-|31 30 |29 27 |26 25 |24  20 |19    15 |14  12 |11 10 |9 7 |6  0 |Opcode
-
-|off[1:0]|000|vm|10000|rs1|000 2+|vd|0000111|VLBUFF.V
-|off[1:0]|000|vm|10000|rs1|101 2+|vd|0000111|VLHUFF.V
-|off[1:0]|000|vm|10000|rs1|110 2+|vd|0000111|VLWUFF.V
-|off[1:0]|000|vm|10000|rs1|111 2+|vd|0000111|VLEFF.V
-|off[1:0]|100|vm|10000|rs1|000 2+|vd|0000111|VLBFF.V
-|off[1:0]|100|vm|10000|rs1|101 2+|vd|0000111|VLHFF.V
-|off[1:0]|100|vm|10000|rs1|110 2+|vd|0000111|VLWFF.V
-
-|========================
-
-.Vector Strided Load/Store Instruction Listing
-[width="100%",cols="^3,^3,^3,^4,^4,^3,^3,^3,^7,<10"]
-|========================
-|31 30 |29 27 |26 25 |24  20 |19    15 |14  12 |11 10 |9 7 |6  0 |Opcode
-
-|off[1:0]|010|vm|rs2|rs1|000 2+|vd|0000111|VLSBU.V
-|off[1:0]|010|vm|rs2|rs1|101 2+|vd|0000111|VLSHU.V
-|off[1:0]|010|vm|rs2|rs1|110 2+|vd|0000111|VLSWU.V
-|off[1:0]|010|vm|rs2|rs1|111 2+|vd|0000111|VLSE.V
-|off[1:0]|110|vm|rs2|rs1|000 2+|vd|0000111|VLSB.V
-|off[1:0]|110|vm|rs2|rs1|101 2+|vd|0000111|VLSH.V
-|off[1:0]|110|vm|rs2|rs1|110 2+|vd|0000111|VLSW.V
-2+|vs3|vm|rs2|rs1|000|off[1:0]|010|0100111|VSSB.V
-2+|vs3|vm|rs2|rs1|101|off[1:0]|010|0100111|VSSH.V
-2+|vs3|vm|rs2|rs1|110|off[1:0]|010|0100111|VSSW.V
-2+|vs3|vm|rs2|rs1|111|off[1:0]|010|0100111|VSSE.V
-
-|========================
-
-
-.Vector Indexed Load/Store Instruction Listing
-[width="100%",cols="^3,^3,^3,^4,^4,^3,^3,^3,^7,<10"]
-|========================
-|31 30 |29 27 |26 25 |24  20 |19    15 |14  12 |11 10 |9 7 |6  0 |Opcode
-
-|off[1:0]|011|vm|vs2|rs1|000 2+|vd|0000111|VLXBU.V
-|off[1:0]|011|vm|vs2|rs1|101 2+|vd|0000111|VLXHU.V
-|off[1:0]|011|vm|vs2|rs1|110 2+|vd|0000111|VLXWU.V
-|off[1:0]|011|vm|vs2|rs1|111 2+|vd|0000111|VLXE.V
-|off[1:0]|111|vm|vs2|rs1|000 2+|vd|0000111|VLXB.V
-|off[1:0]|111|vm|vs2|rs1|101 2+|vd|0000111|VLXH.V
-|off[1:0]|111|vm|vs2|rs1|110 2+|vd|0000111|VLXW.V                    
-2+|vs3|vm|vs2|rs1|000|off[1:0]|011|0100111|VSXB.V
-2+|vs3|vm|vs2|rs1|101|off[1:0]|011|0100111|VSXH.V
-2+|vs3|vm|vs2|rs1|110|off[1:0]|011|0100111|VSXW.V
-2+|vs3|vm|vs2|rs1|111|off[1:0]|011|0100111|VSXE.V
-2+|vs3|vm|vs2|rs1|000|off[1:0]|111|0100111|VSUXB.V
-2+|vs3|vm|vs2|rs1|101|off[1:0]|111|0100111|VSUXH.V
-2+|vs3|vm|vs2|rs1|110|off[1:0]|111|0100111|VSUXW.V
-2+|vs3|vm|vs2|rs1|111|off[1:0]|111|0100111|VSUXE.V
-
-|========================
-
-
-.Vector Table
-[width="100%",cols="^3,^3,^3,^4,^4,^3,^3,^3,^7,<10"]
-|========================
-2+|31 27|26 25 |24   20 |19    15 |14  12 2+|11  7 |6  0   |Opcode
-
-2+|00000|vm    |vs2     |vs1      |001    2+|vd    |1010111|VADD.VV
-2+|00000|vm    |vs2     |vs1      |000    2+|vd    |1010111|VADD.VS
-2+|00000|vm    |vs2     |rs1      |010    2+|vd    |1010111|VADD.VX
-2+|00000|vm    |vs2     |simm[4:0]|011    2+|vd    |1010111|VADD.VI
-|========================
-
+[source]
+----
 
 Integer               Integer               FP
 
-OPIVV  V              OPMVV  V              OPFVV  V
+funct3                funct3                funct3
+OPIVV  V              OPMVV  V              OPFVV  V     
 OPIVX   X             OPMVX   X             OPFVF   F
 OPIVI    I
 
+funct6                funct6                funct6
 000000 VXI vadd       000000 V  vredsum     000000 VF vfadd
 000001 VX  vsub       000001 V  vredand     000001 VF vfsub
 000010                000010 V  vredor      000010 V  vfredsum
@@ -136,32 +51,35 @@ OPIVI    I
 100101 VXI vsle       100101                100101  F vfgt
 100110  XI vsgtu      100110                100110
 100111  XI vsgt       100111                100111  F vfgte
-101000 VXI vadc       101000  X vext.x.v    101000 V  vfdot
+101000 VXI vadc       101000  X vext.x.v    101000
 101001 VXI vsbc       101001  X vins.v.x    101001
 101010                101010 V   vmv.x.s    101010 V  vfmv.f.s
 101011                101011  X  vmv.s.x    101011  F vfmv.s.f          
-101100 VX  vmulhu     101100                101100                 
-101101 VX  vmul       101101                101101
-101110 VX  vmulhsu    101110                101110                          
-101111 VX  vmulh      101111                101111
-
-110000                110000 VX vdivu       110000 VF vfmadd
-110001 VX  vmadd      110001 VX vdiv        110001 VF vfnmadd
-110010                110010 VX vremu       110010 VF vfmsub
-110011 VX  vmsub      110011 VX vrem        110011 VF vfnmsub
-110100                110100                110100 VF vfmacc
-110101 VX  vmacc      110101 V  vdot        110101 VF vfnmacc
-110110                110110                110110 VF vfmsac
-110111 VX  vmsac      110111                110111 VF vfnmsac
-111000 VX  vwmulu     111000                111000 VF vfwmul
-111001                111001                111001
+101100                101100                101100                 
+101101                101101                101101
+101110                101110                101110                          
+101111                101111                101111
+
+110000 VX  vmulhu     110000                110000 VF vfmadd
+110001 VX  vmul       110001 VX vmadd       110001 VF vfnmadd
+110010 VX  vmulhsu    110010                110010 VF vfmsub
+110011 VX  vmulh      110011 VX vmsub       110011 VF vfnmsub
+110100 VX  vdivu      110100                110100 VF vfmacc
+110101 VX  vdiv       110101 VX vmacc       110101 VF vfnmacc
+110110 VX  vremu      110110                110110 VF vfmsac
+110111 VX  vrem       110111 VX vmsac       110111 VF vfnmsac
+111000 VX  vwmulu     111000 V  vdotu       111000 VF vfwmul
+111001                111001 V  vdot        111001 V  vfdot
 111010 VX  vwmulsu    111010                111010
-111011 VX  vwmul      111011                111011 V  vfwdot
-111100 VX  vwmaccu    111100 V  vwdotu      111100 VF vfwmacc
-111101 VX  vwmacc     111101 V  vwdot       111101 VF vfwnmacc
-111110 VX  vwmsacu    111110 V  vw4dotu     111110 VF vfwmsac
-111111 VX  vwmsac     111111 V  vw4dot      111111 VF vfwnmsac
-
+111011 VX  vwmul      111011                111011 
+111100 VX  vwmaccu    111100                111100 VF vfwmacc
+111101 VX  vwmacc     111101                111101 VF vfwnmacc
+111110 VX  vwmsacu    111110                111110 VF vfwmsac
+111111 VX  vwmsac     111111                111111 VF vfwnmsac
+----
+
+[source]
+----
 VFUNARY0 encoding space
  rs1
  single-width converts
@@ -183,12 +101,18 @@ VFUNARY0 encoding space
 10010 vfncvt.f.xu.v
 10011 vfncvt.f.x.v
 10100 vfncvt.f.f.v
+----
 
+[source]
+----
 VFUNARY1 encoding space
  rs1
 00000 vfsqrt.v
 10000 vfclass.v
+----
 
+[source]
+----
 VMUNARY0 encoding space
  rs1
 00001 vmsbf
@@ -196,8 +120,91 @@ VMUNARY0 encoding space
 00011 vmsif
 10000 vmiota
 10001 vid   
+----
 
 
+////
+.Vector Unit-Stride Load/Store Instruction Listing
+[width="100%",cols="^3,^3,^3,^4,^4,^3,^3,^3,^7,<10"]
+|========================
+|31 30 |29 27 |26 25 |24  20 |19    15 |14  12 |11 10 |9 7 |6  0 |Opcode
+
+|off[1:0]|000|vm|00000|rs1|000 2+|vd|0000111|VLBU.V
+|off[1:0]|000|vm|00000|rs1|101 2+|vd|0000111|VLHU.V
+|off[1:0]|000|vm|00000|rs1|110 2+|vd|0000111|VLWU.V
+|off[1:0]|000|vm|00000|rs1|111 2+|vd|0000111|VLE.V
+|off[1:0]|100|vm|00000|rs1|000 2+|vd|0000111|VLB.V
+|off[1:0]|100|vm|00000|rs1|101 2+|vd|0000111|VLH.V
+|off[1:0]|100|vm|00000|rs1|110 2+|vd|0000111|VLW.V
+2+|vs3|vm|00000|rs1|000|off[1:0]|000|0100111|VSB.V
+2+|vs3|vm|00000|rs1|101|off[1:0]|000|0100111|VSH.V
+2+|vs3|vm|00000|rs1|110|off[1:0]|000|0100111|VSW.V
+2+|vs3|vm|00000|rs1|111|off[1:0]|000|0100111|VSE.V
+|========================
+
+
+.Vector Unit-Stride Fault-First Load Instruction Listing
+[width="100%",cols="^3,^3,^3,^4,^4,^3,^3,^3,^7,<10"]
+|========================
+|31 30 |29 27 |26 25 |24  20 |19    15 |14  12 |11 10 |9 7 |6  0 |Opcode
+
+|off[1:0]|000|vm|10000|rs1|000 2+|vd|0000111|VLBUFF.V
+|off[1:0]|000|vm|10000|rs1|101 2+|vd|0000111|VLHUFF.V
+|off[1:0]|000|vm|10000|rs1|110 2+|vd|0000111|VLWUFF.V
+|off[1:0]|000|vm|10000|rs1|111 2+|vd|0000111|VLEFF.V
+|off[1:0]|100|vm|10000|rs1|000 2+|vd|0000111|VLBFF.V
+|off[1:0]|100|vm|10000|rs1|101 2+|vd|0000111|VLHFF.V
+|off[1:0]|100|vm|10000|rs1|110 2+|vd|0000111|VLWFF.V
+
+|========================
+
+.Vector Strided Load/Store Instruction Listing
+[width="100%",cols="^3,^3,^3,^4,^4,^3,^3,^3,^7,<10"]
+|========================
+|31 30 |29 27 |26 25 |24  20 |19    15 |14  12 |11 10 |9 7 |6  0 |Opcode
+
+|off[1:0]|010|vm|rs2|rs1|000 2+|vd|0000111|VLSBU.V
+|off[1:0]|010|vm|rs2|rs1|101 2+|vd|0000111|VLSHU.V
+|off[1:0]|010|vm|rs2|rs1|110 2+|vd|0000111|VLSWU.V
+|off[1:0]|010|vm|rs2|rs1|111 2+|vd|0000111|VLSE.V
+|off[1:0]|110|vm|rs2|rs1|000 2+|vd|0000111|VLSB.V
+|off[1:0]|110|vm|rs2|rs1|101 2+|vd|0000111|VLSH.V
+|off[1:0]|110|vm|rs2|rs1|110 2+|vd|0000111|VLSW.V
+2+|vs3|vm|rs2|rs1|000|off[1:0]|010|0100111|VSSB.V
+2+|vs3|vm|rs2|rs1|101|off[1:0]|010|0100111|VSSH.V
+2+|vs3|vm|rs2|rs1|110|off[1:0]|010|0100111|VSSW.V
+2+|vs3|vm|rs2|rs1|111|off[1:0]|010|0100111|VSSE.V
+
+|========================
+
+
+.Vector Indexed Load/Store Instruction Listing
+[width="100%",cols="^3,^3,^3,^4,^4,^3,^3,^3,^7,<10"]
+|========================
+|31 30 |29 27 |26 25 |24  20 |19    15 |14  12 |11 10 |9 7 |6  0 |Opcode
+
+|off[1:0]|011|vm|vs2|rs1|000 2+|vd|0000111|VLXBU.V
+|off[1:0]|011|vm|vs2|rs1|101 2+|vd|0000111|VLXHU.V
+|off[1:0]|011|vm|vs2|rs1|110 2+|vd|0000111|VLXWU.V
+|off[1:0]|011|vm|vs2|rs1|111 2+|vd|0000111|VLXE.V
+|off[1:0]|111|vm|vs2|rs1|000 2+|vd|0000111|VLXB.V
+|off[1:0]|111|vm|vs2|rs1|101 2+|vd|0000111|VLXH.V
+|off[1:0]|111|vm|vs2|rs1|110 2+|vd|0000111|VLXW.V                    
+2+|vs3|vm|vs2|rs1|000|off[1:0]|011|0100111|VSXB.V
+2+|vs3|vm|vs2|rs1|101|off[1:0]|011|0100111|VSXH.V
+2+|vs3|vm|vs2|rs1|110|off[1:0]|011|0100111|VSXW.V
+2+|vs3|vm|vs2|rs1|111|off[1:0]|011|0100111|VSXE.V
+2+|vs3|vm|vs2|rs1|000|off[1:0]|111|0100111|VSUXB.V
+2+|vs3|vm|vs2|rs1|101|off[1:0]|111|0100111|VSUXH.V
+2+|vs3|vm|vs2|rs1|110|off[1:0]|111|0100111|VSUXW.V
+2+|vs3|vm|vs2|rs1|111|off[1:0]|111|0100111|VSUXE.V
+
+|========================
+////
+
+
+
+////
 
 X  vsgteu
   X  vsgte
@@ -207,6 +214,17 @@ vx4muladd
 vx4mulsub
 
 
+.Vector Table
+[width="100%",cols="^3,^3,^3,^4,^4,^3,^3,^3,^7,<10"]
+|========================
+2+|31 27|26 25 |24   20 |19    15 |14  12 2+|11  7 |6  0   |Opcode
+
+2+|00000|vm    |vs2     |vs1      |001    2+|vd    |1010111|VADD.VV
+2+|00000|vm    |vs2     |vs1      |000    2+|vd    |1010111|VADD.VS
+2+|00000|vm    |vs2     |rs1      |010    2+|vd    |1010111|VADD.VX
+2+|00000|vm    |vs2     |simm[4:0]|011    2+|vd    |1010111|VADD.VI
+|========================
+
 .Vector Table
 [width="100%",cols="^3,^3,^3,^4,^4,^3,^3,^3,^7,<10"]
 |========================
@@ -398,3 +416,4 @@ vx4mulsub
 2+|vs3|vm|vs2|vs1|110 2+|vd|1001111|VMSUBW.VVS
 
 |========================
+////
\ No newline at end of file
diff --git a/v-spec.adoc b/v-spec.adoc
index c9132731..46269c8e 100644
--- a/v-spec.adoc
+++ b/v-spec.adoc
@@ -1030,18 +1030,14 @@ XLEN-1:9    Reserved (write 0)
  vlmul2   # Vlmul x2
  vlmul4   # Vlmul x4
  vlmul8   # Vlmul x8
+----
 
+////
  vlmul1max   # Vlmul x1 max
  vlmul2max   # Vlmul x2 max
  vlmul4max   # Vlmul x4 max
  vlmul8max   # Vlmul x8 max
-----
-
-NOTE: The immediate argument `vtypei` can be a compressed form of the
-full vtype setting, capturing the most common use cases.  For the base
-proposed here, it is assumed that at least four bits of immediate are
-available to write all standard values of `vtype` (`vsew[2:0]` and
-`vlmul[1:0]`).
+////
 
 The `vtype` setting must be supported by the implementation, and the
  `vsetvl{i}` instructions will raise an illegal instruction exception
@@ -1095,6 +1091,7 @@ This allows software to avoid needing to explicitly calculate a running
 maximum of vector lengths observed during a stripmined loop.
 --
 
+////
 === Rules for `vlmul`
 
 Due to the striping of elements across multiple vector registers,
@@ -1121,6 +1118,7 @@ Software sets `vlmulmax` according to how many register groups are
 used by the code.  Software must not use any vector register numbers
 that would be illegal if the largest value of `vlmulmax` is chosen for
 `vlmul`.
+////
 
 === `vsetvl` Instruction
 
@@ -1144,7 +1142,7 @@ throughput on mixed-width operations in a single loop.
 # Loop using only widest elements:
 
 loop:
-    vsetvli a3, a0, vsew32,vlmul8max   # Use only 32-bit elements
+    vsetvli a3, a0, vsew32,vlmul8   # Use only 32-bit elements
     vlh.v v8, (a1)          # Sign-extend 16b load values to 32b elements
       sll t1, a3, 1
       add a1, a1, t1        # Bump pointer
@@ -1179,6 +1177,7 @@ machines where 16b widening multiplies are faster than 32b integer
 multiplies, and where 16b vector load can run faster due to the
 narrower writes to the vector regfile.
 
+////
 NOTE: The `vlmul8max` version is used in the single-width loop to
 reduce LMUL for shorter application vector lengths.  The mixed-width
 loop has to use a fixed SEW=16/LMULMAX=4 and the SEW=32/LMULMAX=8
@@ -1190,6 +1189,7 @@ within the vector registers would not line up correctly.
 NOTE: It should be possible to specify a different form of vlmulmax
 that specifies the range of element widths to be used in overall loop
 to set LMUL correctly. TBD.
+////
 
 == Vector Loads and Stores
 
@@ -1394,64 +1394,6 @@ address of the vector memory access.
     vse.v     vs3, offset(rs1), vm  # SEW store
 ----
 
-[[sec-aos]]
-=== Vector Array-of-Structures Load/Store Instructions
-
-NOTE: This is being considered as a possible extension.
-
-These instructions move a packed contiguous array of structures into
-multiple destination vector registers.  The low three bits of `*umop`
-indicate the number of memory fields in a structure from 1-8. The
-width encoding gives the size of the memory fields, which must be
-homogeneous.
-
-NOTE: Array-of-structs memory instruction with structure size of 1 is
-identical to a unit-stride instruction with LMUL=1.
-
-NOTE: Software can further unpack fields within a structure into
-separate vector registers.
-
-The assembler prefix `vlaos`/`vsaos` is used for loads and stores
-respectively. 
-
-[source,asm]
-----
-    vlaos2b.v vd, offset(rs1), vm   # Load vector of 2*1-byte structure into destination vd, vd+1
-    vlaos3b.v vd, offset(rs1), vm   # Load vector of 3*1-byte structure into destination vd, vd+1
-    vlaos7w.v vd, offset(rs1), vm   # Load vector of 7*4-byte structure into destination vd, vd+1, ... vd+6
-    vlaos8e.v vd, offset(rs1), vm   # Load vector of 8*SEW-byte structure into destination vd, vd+1, .. vd+7
-
-    vsaos3b.v vs3, offset(rs1), vm
-----
-
-[source]
-----
-    # Example 1
-    # Memory structure holds packed RGB pixels (24-bit data structure, 8bpp)
-    vlaos3b.v v8, (a0), vm
-    # v8 holds the red pixels
-    # v9 holds the green pixels
-    # v10 holds the blue pixels
-
-    # Example 2
-    # Memory structure holds complex values, 32b for real and 32b for imaginary
-    vlaos2w.v v8, (a0), vm
-    # v8 holds real
-    # v9 holds imaginary
-----
-
-For loads, the `vd` register is the first register in the group to be
-loaded. For stores, the `vs3` register is the first register to be
-stored.
-
-LMUL is ignored and these instructions load each vector register as if
-LMUL=1.
-
-The `vl` register gives the number of structures to move, which is
-equal to the number of elements tranferred to each vector register.
-
-If a trap is taken, `vstart` is in units of structures.
-
 === Vector Strided Instructions
 
 [source,asm]
@@ -1568,6 +1510,66 @@ security.  It is possible that security mitigations can be
 implemented to allow first-fault variants of non-contiguous accesses
 in future vector extensions.
 
+[[sec-aos]]
+=== Vector Array-of-Structures Load/Store Instructions
+
+NOTE: This is being considered as a possible extension.
+
+These instructions move a packed contiguous array of structures into
+multiple destination vector registers.  The low three bits of `*umop`
+indicate the number of memory fields in a structure from 1-8. The
+width encoding gives the size of the memory fields, which must be
+homogeneous.
+
+NOTE: An array-of-structs memory instruction with structure size of 1
+is identical to a unit-stride instruction with LMUL=1.
+
+NOTE: Software can further unpack fields within a structure into
+separate vector registers.
+
+The assembler prefix `vlaos`/`vsaos` is used for loads and stores
+respectively. 
+
+[source,asm]
+----
+    vlaos2b.v vd, offset(rs1), vm   # Load vector of 2*1-byte structure into destination vd, vd+1
+    vlaos3b.v vd, offset(rs1), vm   # Load vector of 3*1-byte structure into destination vd, vd+1, vd+2
+    vlaos7w.v vd, offset(rs1), vm   # Load vector of 7*4-byte structure into destination vd, vd+1, ... vd+6
+    vlaos8e.v vd, offset(rs1), vm   # Load vector of 8*SEW-bit structure into destination vd, vd+1, ... vd+7
+
+    vsaos3b.v vs3, offset(rs1), vm
+----
+
+[source]
+----
+    # Example 1
+    # Memory structure holds packed RGB pixels (24-bit data structure, 8bpp)
+    vlaos3b.v v8, (a0), vm
+    # v8 holds the red pixels
+    # v9 holds the green pixels
+    # v10 holds the blue pixels
+
+    # Example 2
+    # Memory structure holds complex values, 32b for real and 32b for imaginary
+    vlaos2w.v v8, (a0), vm
+    # v8 holds real
+    # v9 holds imaginary
+----
+
+For loads, the `vd` register is the first register in the group to be
+loaded. For stores, the `vs3` register is the first register to be
+stored.
+
+LMUL is ignored and these instructions load each vector register as if
+LMUL=1.
+
+The `vl` register gives the number of structures to move, which is
+equal to the number of elements transferred to each vector register.
+
+If a trap is taken, `vstart` is in units of structures.
+
+There are also first-fault versions of these instructions.
+
 == Vector AMO Operations
 
 NOTE: Profiles will dictate whether vector AMO operations are
@@ -1582,13 +1584,10 @@ Vector AMO operations are encoded using the unused width encodings
 under the standard AMO major opcode.  Each active element performs an
 atomic read-modify-write of a single memory location.
 
+include::vamo-format.adoc[]
+
 [source]
 ----
-Format for Vector AMO Instructions
-31    27 26  25  24      20 19       15 14   12 11      7 6     0
- amoop  |wd| vm |   vs2    |    rs1    | width | vs3/vd  |0101111| VAMO*
-   5      1   1      5           5         3        5        7
-
 vs2[4:0] specifies v register holding address
 vs3/vd[4:0] specifies v register holding source operand and destination
 
@@ -3835,60 +3834,49 @@ interpreted as relative to the enclosing element only.  Index values
 NOTE: Implementations can provide fast implementations of register
 gathers constrained within a single element width.
 
-=== Vector Single-Width Integer Dot-Product Instructions
+=== Vector Integer Dot-Product Instruction
 
 The dot-product reduction `vdot.vv` performs an element-wise
-multiplication between the source vectors then accumulates the result
-into the destination vector register.  Note the assembler syntax uses
-a `.vv` suffix since both inputs are vectors of elements.
+multiplication between the source sub-elements then accumulates the
+result into the full-width destination vector element.  Note the
+assembler syntax uses a `.vv` suffix since both inputs are vectors of
+elements.
 
 [source]
 ----
-    # Dot product
-    vdot.vv  vd, vs2, vs1, vm   # vd[0] += sum_i( vs2[i] * vs1[i] )
-----
+# Signed dot-product
+vdot.vv vd, vs2, vs1, vm   # Vector-vector
 
-=== Vector Widening Integer Dot-Product Instructions
-
-The `vwdotu.vv` instruction calculates an unsigned dot-product of
-the input vectors and stores the result in a double-width element 0 of
-the vector destination register.
-
-The dot-product instruction can perform an unordered summation, but
-the result must correspond to some sequential ordering of the
-individual add operations.
+# Unsigned dot-product
+vdotu.vv vd, vs2, vs1, vm  # Vector-vector
+----
 
 [source]
 ----
-    # Unsigned dot product into double-width result
-    vwdotu.vv  vd, vs2, vs1, vm   # 2*SEW = sum_i(SEW[i]*SEW[i])
+  # Dot product, SEW=32, EDIV=1
+  vdot.vv  vd, vs2, vs1, vm   # vd[i][31:0] += vs2[i][31:0] * vs1[i][31:0]
+
+  # Dot product, SEW=32, EDIV=2
+  vdot.vv vd, vs2, vs1, vm # vd[i][31:0] += vs2[i][31:16] * vs1[i][31:16]
+                                            + vs2[i][15:0] * vs1[i][15:0]
 
-    # Signed dot product into double-width result
-    vwdot.vv  vd, vs2, vs1, vm   # 2*SEW = sum_i(SEW[i]*SEW[i])
 ----
 
-The `vw4dotu.vv` and `vw4dot.vv` instructions uses a quad-width
-vector element for the accumulator value in the dot-product.
+=== Vector Floating-Point Dot-Product Instruction
 
 [source]
 ----
-    # Unsigned dot product.
-    vw4dotu.vv  vd, vs2, vs1, vm   # 4*SEW += sum(SEW*SEW)
-
-    # Signed dot product.
-    vw4dot.vv  vd, vs2, vs1, vm   # 4*SEW += sum(SEW*SEW)
+# Floating-point dot-product
+vfdot.vv vd, vs2, vs1, vm   # Vector-vector
 ----
 
-=== Vector Floating-Point Dot Product
-
 [source]
 ----
-    # Dot product.
-    vfdot.v  vd, vs2, vs1, vm # vd[0] += sum_i(vs2[i] * vs1[i])
-
-    # Widening dot product.
-    vfwdot.v vd, vs2, vs1, vm  # 2*SEW += sum(SEW*SEW)
+  # Dot product, SEW=32, EDIV=2
+  vfdot.vv vd, vs2, vs1, vm # vd[i][31:0] += vs2[i][31:16] * vs1[i][31:16]
+                                             + vs2[i][15:0] * vs1[i][15:0]
 ----
+
 == Vector Instruction Listing
 
 include::inst-table.adoc[]
diff --git a/valu-format.adoc b/valu-format.adoc
new file mode 100644
index 00000000..5476c5ad
--- /dev/null
+++ b/valu-format.adoc
@@ -0,0 +1,14 @@
+[source]
+----
+Formats for Vector Arithmetic Instructions under OP-V major opcode
+
+31       26  25   24      20 19      15 14   12 11      7 6     0
+  funct6   | vm  |   vs2    |    vs1   | 0 0 0 |    vd   |1010111| OP-V (OPIVV)
+  funct6   | vm  |   vs2    |    vs1   | 0 0 1 |    vd   |1010111| OP-V (OPFVV)
+  funct6   | vm  |   vs2    |    vs1   | 0 1 0 |  vd/rd  |1010111| OP-V (OPMVV)
+  funct6   | vm  |   vs2    |   simm5  | 0 1 1 |    vd   |1010111| OP-V (OPIVI)
+  funct6   | vm  |   vs2    |    rs1   | 1 0 0 |    vd   |1010111| OP-V (OPIVX)
+  funct6   | vm  |   vs2    |    rs1   | 1 0 1 |    vd   |1010111| OP-V (OPFVF)
+  funct6   | vm  |   vs2    |    rs1   | 1 1 0 |  vd/rd  |1010111| OP-V (OPMVX)
+     6        1        5          5        3        5        7
+----
diff --git a/vamo-format.adoc b/vamo-format.adoc
new file mode 100644
index 00000000..325a6c95
--- /dev/null
+++ b/vamo-format.adoc
@@ -0,0 +1,7 @@
+[source]
+----
+Format for Vector AMO Instructions under AMO major opcode
+31    27 26  25  24      20 19       15 14   12 11      7 6     0
+ amoop  |wd| vm |   vs2    |    rs1    | width | vs3/vd  |0101111| VAMO*
+   5      1   1      5           5         3        5        7
+----
diff --git a/vcfg-format.adoc b/vcfg-format.adoc
new file mode 100644
index 00000000..e06a7a11
--- /dev/null
+++ b/vcfg-format.adoc
@@ -0,0 +1,9 @@
+[source]                                                     
+----                                                         
+Formats for Vector Configuration Instructions                
+                                                             
+ 31 30         25 24      20 19      15 14   12 11      7 6     0
+ 0 |        zimm[10:0]      |    rs1   | 1 1 1 |    rd   |1010111| vsetvli
+ 1 |   000000    |   rs2    |    rs1   | 1 1 1 |    rd   |1010111| vsetvl
+ 1        6            5          5        3        5        7
+----