From e9b654b213f1790bfb6c48cdc2aea0992803c652 Mon Sep 17 00:00:00 2001 From: Timur Magomedov Date: Sun, 27 Aug 2017 02:03:29 +0300 Subject: [PATCH 1/2] Removed deprecated unused tilemul fetching flags TILEMUL_SKEWS, TILEMUL_OPTIMIZE_COORD_CALC --- src/library/blas/gens/blas_kgen.h | 13 ++++--------- src/library/blas/gens/gemv.c | 1 - src/library/blas/gens/symv.c | 1 - 3 files changed, 4 insertions(+), 11 deletions(-) diff --git a/src/library/blas/gens/blas_kgen.h b/src/library/blas/gens/blas_kgen.h index b48545e3..90882694 100644 --- a/src/library/blas/gens/blas_kgen.h +++ b/src/library/blas/gens/blas_kgen.h @@ -140,24 +140,19 @@ typedef enum TileMulFlags { TILEMUL_GLOBAL_CYCLIC = TILEMUL_GLOBAL_CYCLIC_A | TILEMUL_GLOBAL_CYCLIC_B | TILEMUL_GLOBAL_CYCLIC_K, - // Deprecated - TILEMUL_SKEWS = TILEMUL_SKEW_A | TILEMUL_SKEW_B | TILEMUL_SKEW_K, - /** Optimize coordinates calculations by storing coordinates values */ - // Deprecated - TILEMUL_OPTIMIZE_COORD_CALC = 0x4000, /** Use bwidth0 stride */ - TILEMUL_BW_STRIDE = 0x8000, + TILEMUL_BW_STRIDE = 0x4000, /** Optimize coordinates calculations by using vectors * and pointer increments */ // Deprecated - TILEMUL_OPTIMIZE_VEC_COORDS = 0x10000, + TILEMUL_OPTIMIZE_VEC_COORDS = 0x8000, /** Do not increment K*/ - TILEMUL_NOT_INC_K = 0x20000, + TILEMUL_NOT_INC_K = 0x10000, /** * Use variants with explicit vectorization. Useful on platforms with * true SIMD. 
*/ - TILEMUL_FORCE_VECTORIZATION = 0x40000 + TILEMUL_FORCE_VECTORIZATION = 0x20000 } TileMulFlags; diff --git a/src/library/blas/gens/gemv.c b/src/library/blas/gens/gemv.c index 9835482f..497e6662 100644 --- a/src/library/blas/gens/gemv.c +++ b/src/library/blas/gens/gemv.c @@ -297,7 +297,6 @@ generator( kgenAddBlankLine(ctx); } - mulOpts.flags |= TILEMUL_OPTIMIZE_COORD_CALC; if (tailM) { mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_A; } diff --git a/src/library/blas/gens/symv.c b/src/library/blas/gens/symv.c index 47c8f1d2..7eed79c4 100644 --- a/src/library/blas/gens/symv.c +++ b/src/library/blas/gens/symv.c @@ -477,7 +477,6 @@ generator( kgenAddBlankLine(ctx); } - mulOpts.flags |= TILEMUL_OPTIMIZE_COORD_CALC; if (tailM) { vnames->sizeM = "N"; } From 4571e307d7c9f0b324029ff8a6e96255beb647ef Mon Sep 17 00:00:00 2001 From: Timur Magomedov Date: Sun, 27 Aug 2017 02:57:00 +0300 Subject: [PATCH 2/2] Fixed tilemul generator debugging tool; it now works at least on global matrices --- src/library/blas/gens/blas_kgen.h | 14 +--- src/library/blas/gens/gemm.c | 2 - src/library/blas/gens/tests/t_tilemul.c | 97 ++++++------------------- 3 files changed, 24 insertions(+), 89 deletions(-) diff --git a/src/library/blas/gens/blas_kgen.h b/src/library/blas/gens/blas_kgen.h index 90882694..8ea7f05f 100644 --- a/src/library/blas/gens/blas_kgen.h +++ b/src/library/blas/gens/blas_kgen.h @@ -116,7 +116,7 @@ typedef enum TileMulFlags { itself */ /** - * Deprecated. Use the repsective mode being a part of FetchAddr mode. + * Deprecated. Use the respective mode being a part of FetchAddr mode.
* He is left just for backward compatibility to don't break the working * code and will be removed soon */ @@ -142,17 +142,13 @@ typedef enum TileMulFlags { TILEMUL_GLOBAL_CYCLIC_K, /** Use bwidth0 stride */ TILEMUL_BW_STRIDE = 0x4000, - /** Optimize coordinates calculations by using vectors - * and pointer increments */ - // Deprecated - TILEMUL_OPTIMIZE_VEC_COORDS = 0x8000, /** Do not increment K*/ - TILEMUL_NOT_INC_K = 0x10000, + TILEMUL_NOT_INC_K = 0x8000, /** * Use variants with explicit vectorization. Useful on platforms with * true SIMD. */ - TILEMUL_FORCE_VECTORIZATION = 0x20000 + TILEMUL_FORCE_VECTORIZATION = 0x10000 } TileMulFlags; @@ -253,10 +249,6 @@ typedef struct KernelVarNames { const char *lda; /**< Leading dimension of matrix A */ const char *ldb; /**< Leading dimension of matrix B */ const char *ldc; /**< Leading dimension of matrix C, in vectors */ - const char *vectCoordA; /**< Vector containing indexes of tile a elements - in matrix A */ - const char *vectCoordB; /**< Vector containing indexes of tile b elements - in matrix B*/ const char *startM; const char *startN; const char *startK; diff --git a/src/library/blas/gens/gemm.c b/src/library/blas/gens/gemm.c index efa2375f..14456c99 100644 --- a/src/library/blas/gens/gemm.c +++ b/src/library/blas/gens/gemm.c @@ -746,8 +746,6 @@ subgGen( vnames->alpha = "alpha"; vnames->beta = "beta"; - vnames->vectCoordA = "vca"; - vnames->vectCoordB = "vcb"; vnames->k = exprK.buf; subgroupsA = (unsigned int)(gset.subdims[0].y/gset.subdims[1].y); diff --git a/src/library/blas/gens/tests/t_tilemul.c b/src/library/blas/gens/tests/t_tilemul.c index 4b4dd803..c951ac35 100644 --- a/src/library/blas/gens/tests/t_tilemul.c +++ b/src/library/blas/gens/tests/t_tilemul.c @@ -70,7 +70,8 @@ typedef union FType { static void printUsage(const char *programName, int exitCode) { - printf( "USAGE: %s [options] \n" + printf( "%s - tiles multiplier generator testing and debugging tool.\n" + "USAGE: %s [options] \n" " --help, -h 
Print this help message.\n" " --device, -d OpenCL device used. can " "be \"gpu\" or \"cpu\". Default is \"gpu\".\n" @@ -102,8 +103,8 @@ printUsage(const char *programName, int exitCode) "with one generator function call for both fetching and " "multiplication. Separate generators functions are used by " "default.\n" - " M N K Size of block.\n", - programName); + " M N K Sizes of blocks multiplied, 4 4 4 for example.\n", + programName, programName); exit(exitCode); } @@ -158,14 +159,15 @@ genTest( { char s[1024]; Kstring kstr; - char *tName, tVect[64], *ptrName; + char *tName, *ptrName; + char nameA[64], nameB[64]; KernelVarNames *vnames = &gset->varNames; DataType dtype = gset->kextra->dtype; const SubproblemDim *subdims = gset->subdims; unsigned int vecLen = gset->kextra->vecLen; size_t m, n, k; unsigned int i, j; - bool tra, trb, localA, localB, vecCoords; + bool tra, trb, localA, localB; int ret; TileMulFlags flags = mulOpts->flags; FetchOpts fetchOpts; @@ -179,14 +181,6 @@ genTest( localA = (mulOpts->memA == CLMEM_LOCAL_MEMORY); localB = (mulOpts->memB == CLMEM_LOCAL_MEMORY); - vecCoords = ((flags & TILEMUL_OPTIMIZE_VEC_COORDS) != 0); - - tVect[0] = '\0'; - - if (vecCoords && vecLen != 1) { - sprintf(tVect, "%u", vecLen); - } - switch (dtype) { case TYPE_FLOAT: tName = "float"; @@ -208,15 +202,14 @@ genTest( return; } - if (vecCoords) { - //Do not use GPtrs in fetching - vnames->A = "A"; - vnames->B = "B"; - } - else { - vnames->A = localA ? "LAptr" : "((GPtr)A)"; - vnames->B = localB ? "LBptr" : "((GPtr)B)"; + { + const char *typePtrName; + getVectorTypeName(dtype, vecLen, NULL, &typePtrName); + sprintf(nameA, localA ? "LAptr.%s" : "((GPtr)A).%s", typePtrName); + sprintf(nameB, localB ? 
"LBptr.%s" : "((GPtr)B).%s", typePtrName); } + vnames->A = nameA; + vnames->B = nameB; if (!localA) { vnames->lda = "lda"; @@ -243,9 +236,9 @@ genTest( kgenAddStmt(ctx, s); sprintf(s," %s alpha,\n", tName); kgenAddStmt(ctx, s); - sprintf(s," __global %s%s *A,\n", tName, tVect); + sprintf(s," __global %s *A,\n", tName); kgenAddStmt(ctx, s); - sprintf(s," __global %s%s *B,\n", tName, tVect); + sprintf(s," __global %s *B,\n", tName); kgenAddStmt(ctx, s); kgenAddStmt(ctx, " uint M,\n" " uint N,\n" @@ -291,67 +284,15 @@ genTest( initDefaultTiles(gset, CLBLAS_GEMM, TILE_PACKED, PRIV_STORAGE_ARRAY); declareTileStorages(ctx, gset); - if (vecCoords) { - size_t ha, hb; - char *str; - - ha = tra ? k : m; - hb = trb ? n : k; - - if (ha > 1) { - str = s; - str += sprintf(str, "uint%lu ca = {0", ha); - for (i = 1; i < ha; i++) { - str += sprintf(str, ", %s * %u / %u", vnames->lda, i, vecLen); - } - str += sprintf(str, "};\n"); - kgenAddStmt(ctx, s); - } - else { - kgenAddStmt(ctx, "uint ca = 0;\n"); - } - vnames->vectCoordA = "ca"; - - if (hb > 1) { - str = s; - str += sprintf(str, "uint%lu cb = {0", hb); - for (i = 1; i < hb; i++) { - str += sprintf(str, ", %s * %u / %u", vnames->ldb, i, vecLen); - } - str += sprintf(str, "};\n"); - kgenAddStmt(ctx, s); - } - else { - kgenAddStmt(ctx, "uint cb = 0;\n"); - } - vnames->vectCoordB = "cb"; - -// uint4 ca = {0, vecLDA, vecLDA * 2, vecLDA * 3}; -// uint4 cb = {0, vecLDB, vecLDB * 2, vecLDB * 3}; - } - kgenAddBlankLine(ctx); sprintf(s, "for (int it = 0; it < iter; it++)"); kgenBeginBranch(ctx, s); - if (!(localA && localB)) { - kgenAddStmt(ctx, "uint k = 0;\n"); - } + kgenAddStmt(ctx, "uint k = 0;\n"); genZeroTile(ctx, &gset->tileCY); - if (vecCoords) { - char *coordsA[2] = {"workItemM", "k"}; - char *coordsB[2] = {"k", "workItemN"}; - sprintf(s, "A += %s * (lda / %u) + %s / %u;\n", - coordsA[tra], vecLen, coordsA[1 - tra], vecLen); - kgenAddStmt(ctx, s); - sprintf(s, "B += %s * (ldb / %u) + %s / %u;\n", - coordsB[trb], vecLen, 
coordsB[1 - trb], vecLen); - kgenAddStmt(ctx, s); - } - sprintf(s, "for (int k0 = 0; k0 < K; k0 += %lu)", subdims[0].bwidth); kgenBeginBranch(ctx, s); @@ -1062,6 +1003,10 @@ int main(int argc, char *argv[]) blockN = atoi(argv[optind + 1]); blockK = atoi(argv[optind + 2]); + kextra.vecLenA = kextra.vecLen; + kextra.vecLenB = kextra.vecLen; + kextra.vecLenC = kextra.vecLen; + if ((mulOpts.memA == CLMEM_LOCAL_MEMORY || mulOpts.memB == CLMEM_LOCAL_MEMORY) && ((mulOpts.flags & TILEMUL_GLOBAL_CYCLIC) != 0)) {