diff --git a/src/plugins/intel_cpu/src/nodes/reduce.cpp b/src/plugins/intel_cpu/src/nodes/reduce.cpp index e50f8f413915a5..54e4e2e6fd5a8b 100644 --- a/src/plugins/intel_cpu/src/nodes/reduce.cpp +++ b/src/plugins/intel_cpu/src/nodes/reduce.cpp @@ -150,9 +150,7 @@ struct jit_uni_reduce_kernel_f32 : public jit_uni_reduce_kernel, public jit_gene data_type::f32); } - if (mayiuse(avx512_core)) { - uni_vcvtneps2bf16 = std::make_shared(this, isa); - } + uni_vcvtneps2bf16 = std::make_shared(this, isa); this->preamble(); @@ -188,9 +186,7 @@ struct jit_uni_reduce_kernel_f32 : public jit_uni_reduce_kernel, public jit_gene this->postamble(); - if (mayiuse(avx512_core)) { - uni_vcvtneps2bf16->emit_data(); - } + uni_vcvtneps2bf16->emit_data(); if (jcp_.reduce_mode == Algorithm::ReduceAnd || jcp_.reduce_mode == Algorithm::ReduceL1 || jcp_.reduce_mode == Algorithm::ReduceMax || jcp_.reduce_mode == Algorithm::ReduceMin || @@ -1017,9 +1013,15 @@ struct jit_uni_reduce_kernel_f32 : public jit_uni_reduce_kernel, public jit_gene uni_vmovups(op, vmm_dst); break; case memory::data_type::bf16: - uni_vcvtneps2bf16->emit_code({static_cast(vmm_dst.getIdx())}, - {static_cast(ymm_dst.getIdx())}); - vmovdqu16(op, ymm_dst); + if (isa == cpu::x64::avx512_core) { + uni_vcvtneps2bf16->emit_code({static_cast(vmm_dst.getIdx())}, + {static_cast(ymm_dst.getIdx())}); + vmovdqu16(op, ymm_dst); + } else { + uni_vcvtneps2bf16->emit_code({static_cast(vmm_dst.getIdx())}, + {static_cast(xmm_dst.getIdx())}); + uni_vmovdqu(op, xmm_dst); + } break; case memory::data_type::f16: vcvtps2ph(op, vmm_dst, 0x4); @@ -1253,9 +1255,7 @@ struct jit_uni_reduce_post_kernel_f32 : public jit_uni_reduce_post_kernel, publi data_type::f32); } - if (mayiuse(avx512_core)) { - uni_vcvtneps2bf16 = std::make_shared(this, isa); - } + uni_vcvtneps2bf16 = std::make_shared(this, isa); this->preamble(); @@ -1312,9 +1312,7 @@ struct jit_uni_reduce_post_kernel_f32 : public jit_uni_reduce_post_kernel, publi this->postamble(); - if (mayiuse(avx512_core)) { - uni_vcvtneps2bf16->emit_data(); - } + uni_vcvtneps2bf16->emit_data(); if (jcp_.reduce_mode == Algorithm::ReduceLogSum || jcp_.reduce_mode == Algorithm::ReduceLogSumExp) { log_injector->prepare_table(); @@ -1770,9 +1768,15 @@ struct jit_uni_reduce_post_kernel_f32 : public jit_uni_reduce_post_kernel, publi uni_vmovups(op, vmm_dst); break; case memory::data_type::bf16: - uni_vcvtneps2bf16->emit_code({static_cast(vmm_dst.getIdx())}, - {static_cast(ymm_dst.getIdx())}); - vmovdqu16(op, ymm_dst); + if (isa == cpu::x64::avx512_core) { + uni_vcvtneps2bf16->emit_code({static_cast(vmm_dst.getIdx())}, + {static_cast(ymm_dst.getIdx())}); + vmovdqu16(op, ymm_dst); + } else { + uni_vcvtneps2bf16->emit_code({static_cast(vmm_dst.getIdx())}, + {static_cast(xmm_dst.getIdx())}); + uni_vmovdqu(op, xmm_dst); + } break; case memory::data_type::f16: vcvtps2ph(op, vmm_dst, 0x4);