added pytests

ROCm · Jan 21, 2025 · 5bc3ae7 · 5bc3ae7
1 parent 3dd6e22
commit 5bc3ae7
Show file tree

Hide file tree

Showing 2 changed files with 126 additions and 0 deletions.
diff --git a/tensilelite/Tensile/Tests/common/gemm/dtvA_swizzleA.yaml b/tensilelite/Tensile/Tests/common/gemm/swizzleA.yaml
diff --git a/tensilelite/Tensile/Tests/common/gemm/swizzleB.yaml b/tensilelite/Tensile/Tests/common/gemm/swizzleB.yaml
@@ -0,0 +1,126 @@
+TestParameters:  # per-test marks; in this file every mark is a skip-<gfx target> entry
+  marks: [skip-gfx900, skip-gfx906, skip-gfx908, skip-gfx90a, skip-gfx1010, skip-gfx1011, skip-gfx1012, skip-gfx1030, skip-gfx1100, skip-gfx1101, skip-gfx1102, skip-gfx1200, skip-gfx1201] # skip on every listed target: not supported by those architectures
+
+GlobalParameters:  # file-wide settings, declared outside any individual benchmark problem
+  NumElementsToValidate: -1  # NOTE(review): -1 presumably means "validate every element" — confirm against Tensile docs
+  MinimumRequiredVersion: 4.14.0
+  PrintLevel: 1
+  #PrintSolutionRejectionReason: True
+  Device: 0
+  CMakeBuildType: Release
+  KernelTime: True
+  MaxWorkspaceSize: 13421772800  # = 12.5 * 1024^3; presumably a byte count (12.5 GiB) — TODO confirm unit
+  DataInitTypeA: 13  # DataInitType* values are numeric codes; their meanings are not defined in this file — see Tensile docs
+  DataInitTypeB: 12
+  DataInitTypeAlpha: 1
+  DataInitTypeBeta: 1
+  DataInitTypeBias: 13
+  DataInitTypeScaleAlphaVec: 12
+  BoundsCheck: 2
+  #MaxFileName: 256
+
+BenchmarkProblems:  # list of benchmark problems; a single entry is present in this hunk
+  ########################################
+  # HHS TN DTVB + SWIZZLED_B + BIAS + Activation + SAV
+  ########################################
+  -
+    - # ProblemType
+      OperationType: GEMM
+      DataType: h  # DataType/DestDataType/ComputeDataType = h/h/s — presumably the "HHS" of the banner above
+      DestDataType: h
+      ComputeDataType: s
+      HighPrecisionAccumulate: True
+      TransposeA: 1  # TransposeA set, TransposeB clear — presumably the "TN" of the banner
+      TransposeB: 0
+      SwizzleTensorB: True  # the "SWIZZLED_B" of the banner
+      UseBeta: True
+      Batched: True
+      UseBias: 1  # the "BIAS" of the banner
+      Activation: True  # the "Activation" of the banner; concrete values are listed under ActivationArgs
+      BiasDataTypeList: ['h']  # same single type as BiasTypeArgs under BenchmarkFinalParameters
+      UseScaleAlphaVec: 1  # presumably the "SAV" of the banner
+    - # BenchmarkProblemSizeGroup - Standard - All problems
+      InitialSolutionParameters:  # no entries (empty value)
+      BenchmarkCommonParameters:
+        - KernelLanguage: ["Assembly"]
+      ForkParameters:  # NOTE(review): presumably every combination of the value lists below is generated — confirm against Tensile docs
+        - MatrixInstruction:  # in every row below, the MT comment equals (v[0]*v[5]*v[7]) x (v[1]*v[6]*v[8]), v being the 9-entry row (0-based)
+          - [16, 16, 16, 1,  1,   1, 1,  1,4 ] # MT = 16x64
+          - [16, 16, 16, 1,  1,   1, 2,  1,4 ] # MT = 16x128
+          - [16, 16, 16, 1,  1,   1, 4,  1,4 ] # MT = 16x256
+          - [16, 16, 16, 1,  1,   1, 8,  1,4 ] # MT = 16x512
+          - [16, 16, 16, 1,  1,   1, 5,  1,2 ] # MT = 16x160
+
+          - [16, 16, 16, 1,  1,   8,  1,  1,4 ] # MT = 128x64
+          - [16, 16, 16, 1,  1,   16, 1,  1,4 ] # MT = 256x64
+          - [16, 16, 16, 1,  1,   8,  2,  1,4 ] # MT = 128x128
+          - [16, 16, 16, 1,  1,   16, 2,  1,4 ] # MT = 256x128
+          - [16, 16, 16, 1,  1,   8,  4,  1,4 ] # MT = 128x256
+          - [16, 16, 16, 1,  1,   16, 4,  1,4 ] # MT = 256x256
+          - [16, 16, 16, 1,  1,   8,  8,  1,4 ] # MT = 128x512
+          - [16, 16, 16, 1,  1,   16, 8,  1,4 ] # MT = 256x512
+          - [16, 16, 16, 1,  1,   8,  5,  1,2 ] # MT = 128x160
+
+          - [16, 16, 16, 1,  1,   4, 2,   2,2 ] # MT = 128x64
+          - [16, 16, 16, 1,  1,   8, 2,   2,2 ] # MT = 256x64
+          - [16, 16, 16, 1,  1,   4, 4,   2,2 ] # MT = 128x128
+          - [16, 16, 16, 1,  1,   8, 4,   2,2 ] # MT = 256x128
+          - [16, 16, 16, 1,  1,   4, 8,   2,2 ] # MT = 128x256
+          - [16, 16, 16, 1,  1,   8, 8,   2,2 ] # MT = 256x256
+          - [16, 16, 16, 1,  1,   4, 16,  2,2 ] # MT = 128x512
+          - [16, 16, 16, 1,  1,   8, 16,  2,2 ] # MT = 256x512
+          - [16, 16, 16, 1,  1,   4, 5,   2,2 ] # MT = 128x160
+
+          - [16, 16, 16, 1,  1,   2, 4,   4,1 ] # MT = 128x64
+          - [16, 16, 16, 1,  1,   4, 4,   4,1 ] # MT = 256x64
+          - [16, 16, 16, 1,  1,   2, 8,   4,1 ] # MT = 128x128
+          - [16, 16, 16, 1,  1,   4, 8,   4,1 ] # MT = 256x128
+          - [16, 16, 16, 1,  1,   2, 16,  4,1 ] # MT = 128x256
+          - [16, 16, 16, 1,  1,   4, 16,  4,1 ] # MT = 256x256
+          - [16, 16, 16, 1,  1,   2, 10,  4,1 ] # MT = 128x160
+        - AssertFree1ElementMultiple: [16]  # the second entry of every Exact size in this file (160, 1600, 512) is a multiple of 16
+        - AssertSummationElementMultiple: [32]  # the fourth entry of every Exact size in this file is a multiple of 32
+        - GlobalReadVectorWidthA: [-1]
+        - GlobalReadVectorWidthB: [8]
+        - PrefetchGlobalRead: [1,2]
+        - PrefetchLocalRead: [1,2,4]
+        - ClusterLocalRead: [1]
+        - NumElementsPerBatchStore: [0]
+        - DepthU: [32,64,128]
+        - VectorWidthA: [-1]
+        - VectorWidthB: [1]
+        - LocalWritePerMfma: [-1]
+        - StaggerU: [4]
+        - StaggerUStride: [256]
+        - StaggerUMapping: [0]
+        - WorkGroupMappingXCC: [8]
+        - ScheduleIterAlg: [3]
+        - LdsBlockSizePerPadA: [-1]
+        - LdsBlockSizePerPadB: [-1]
+        - StorePriorityOpt: [0]
+        - VectorStore: [-1]
+        - StoreSyncOpt: [0]
+        - LdsPadA: [-1]
+        - LdsPadB: [-1]
+        - 1LDSBuffer: [1]
+        - GlobalSplitU: [1,3]
+        - GlobalSplitUAlgorithm: ["MultipleBuffer", "MultipleBufferSingleKernel"]
+        - LocalReadVectorWidth: [2,4,8]
+        - DirectToVgprB: [1]  # presumably the "DTVB" of the banner comment above the ProblemType
+        - UseSgprForGRO: [0,1]
+      BenchmarkJoinParameters:  # no entries (empty value)
+      BenchmarkFinalParameters:
+        - ProblemSizes:  # nine Exact sizes: three shapes, each with its last entry at -32 / +0 / +32 around 256 or 1024; presumably [M, N, batch, K] — TODO confirm order
+          - Exact: [256, 160,   1, 224]
+          - Exact: [256, 160,   1, 256]
+          - Exact: [256, 160,   1, 288]
+          - Exact: [512, 1600,  1, 992]
+          - Exact: [512, 1600,  1, 1024]
+          - Exact: [512, 1600,  1, 1056]
+          - Exact: [256, 512,   1, 224]
+          - Exact: [256, 512,   1, 256]
+          - Exact: [256, 512,   1, 288]
+        - BiasTypeArgs: ['h']  # same single type as BiasDataTypeList in the ProblemType
+        - ActivationArgs:  # two entries: none and relu
+          - [Enum: none]
+          - [Enum: relu]