From 6251e7ea0a75a98955dc2963757e738e69f7a621 Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Mon, 4 Jul 2022 09:55:13 +0200 Subject: [PATCH 1/2] zstd: Improve decoder memcopy Improve memcopy for small matches. Up to 30% increased throughput, depending on input. ``` benchmark old MB/s new MB/s speedup Benchmark_seqdec_execute/n-12286-lits-13914-prev-9869-1990358-3296656-win-4194304.blk-32 1284.77 1525.03 1.19x Benchmark_seqdec_execute/n-12485-lits-6960-prev-976039-2250252-2463561-win-4194304.blk-32 1107.87 1614.28 1.46x Benchmark_seqdec_execute/n-14746-lits-14461-prev-209-8-1379909-win-4194304.blk-32 3947.25 4100.49 1.04x Benchmark_seqdec_execute/n-1525-lits-1498-prev-2009476-797934-2994405-win-4194304.blk-32 10281.12 10316.14 1.00x Benchmark_seqdec_execute/n-3478-lits-3628-prev-895243-2104056-2119329-win-4194304.blk-32 8115.99 8829.85 1.09x Benchmark_seqdec_execute/n-8422-lits-5840-prev-168095-2298675-433830-win-4194304.blk-32 1578.08 2290.47 1.45x Benchmark_seqdec_execute/n-1000-lits-1057-prev-21887-92-217-win-8388608.blk-32 17079.65 16716.41 0.98x Benchmark_seqdec_execute/n-15134-lits-20798-prev-4882976-4884216-4474622-win-8388608.blk-32 2020.09 2166.56 1.07x Benchmark_seqdec_execute/n-2-lits-0-prev-620601-689171-848-win-8388608.blk-32 35781.31 35745.53 1.00x Benchmark_seqdec_execute/n-90-lits-67-prev-19498-23-19710-win-8388608.blk-32 33125.43 32785.93 0.99x Benchmark_seqdec_execute/n-931-lits-1179-prev-36502-1526-1518-win-8388608.blk-32 19394.38 19643.49 1.01x Benchmark_seqdec_execute/n-2898-lits-4062-prev-335-386-751-win-8388608.blk-32 10494.30 10653.09 1.02x Benchmark_seqdec_execute/n-4056-lits-12419-prev-10792-66-309849-win-8388608.blk-32 7425.77 7506.51 1.01x Benchmark_seqdec_execute/n-8028-lits-4568-prev-917-65-920-win-8388608.blk-32 2855.17 3396.09 1.19x benchmark old MB/s new MB/s speedup BenchmarkDecoder_DecoderSmall/kppkn.gtb.zst-32 537.74 651.27 1.21x BenchmarkDecoder_DecoderSmall/geo.protodata.zst-32 1500.59 1610.11 1.07x 
BenchmarkDecoder_DecoderSmall/plrabn12.txt.zst-32 410.13 505.82 1.23x BenchmarkDecoder_DecoderSmall/lcet10.txt.zst-32 467.83 601.25 1.29x BenchmarkDecoder_DecoderSmall/asyoulik.txt.zst-32 434.53 530.71 1.22x BenchmarkDecoder_DecoderSmall/alice29.txt.zst-32 433.95 544.87 1.26x BenchmarkDecoder_DecoderSmall/html_x_4.zst-32 2860.31 3189.40 1.12x BenchmarkDecoder_DecoderSmall/paper-100k.pdf.zst-32 5336.43 5437.24 1.02x BenchmarkDecoder_DecoderSmall/fireworks.jpeg.zst-32 12327.10 12350.86 1.00x BenchmarkDecoder_DecoderSmall/urls.10K.zst-32 660.52 774.52 1.17x BenchmarkDecoder_DecoderSmall/html.zst-32 1076.67 1284.53 1.19x BenchmarkDecoder_DecoderSmall/comp-data.bin.zst-32 569.30 576.15 1.01x BenchmarkDecoder_DecodeAll/kppkn.gtb.zst-32 812.16 813.72 1.00x BenchmarkDecoder_DecodeAll/geo.protodata.zst-32 1943.14 1933.04 0.99x BenchmarkDecoder_DecodeAll/plrabn12.txt.zst-32 712.27 715.46 1.00x BenchmarkDecoder_DecodeAll/lcet10.txt.zst-32 688.23 775.97 1.13x BenchmarkDecoder_DecodeAll/asyoulik.txt.zst-32 702.87 700.17 1.00x BenchmarkDecoder_DecodeAll/alice29.txt.zst-32 717.44 720.89 1.00x BenchmarkDecoder_DecodeAll/html_x_4.zst-32 1960.55 1968.90 1.00x BenchmarkDecoder_DecodeAll/paper-100k.pdf.zst-32 5981.50 6169.12 1.03x BenchmarkDecoder_DecodeAll/fireworks.jpeg.zst-32 13140.18 13145.86 1.00x BenchmarkDecoder_DecodeAll/urls.10K.zst-32 983.71 988.16 1.00x BenchmarkDecoder_DecodeAll/html.zst-32 1624.80 1624.92 1.00x BenchmarkDecoder_DecodeAll/comp-data.bin.zst-32 569.84 570.96 1.00x BenchmarkDecoder_DecodeAllFiles/.tracker-unpacked.bin/fastest-32 504.31 622.83 1.24x BenchmarkDecoder_DecodeAllFiles/.tracker-unpacked.bin/default-32 564.68 717.57 1.27x BenchmarkDecoder_DecodeAllFiles/.tracker-unpacked.bin/better-32 615.18 766.33 1.25x BenchmarkDecoder_DecodeAllFiles/.tracker-unpacked.bin/best-32 786.17 857.17 1.09x BenchmarkDecoder_DecodeAllFiles/.tracker.bin/fastest-32 12860.99 12870.57 1.00x BenchmarkDecoder_DecodeAllFiles/.tracker.bin/default-32 619.06 617.54 1.00x 
BenchmarkDecoder_DecodeAllFiles/.tracker.bin/better-32 630.33 625.20 0.99x BenchmarkDecoder_DecodeAllFiles/.tracker.bin/best-32 609.12 612.50 1.01x BenchmarkDecoder_DecodeAllFiles/Mark.Twain-Tom.Sawyer.txt/fastest-32 658.22 659.45 1.00x BenchmarkDecoder_DecodeAllFiles/Mark.Twain-Tom.Sawyer.txt/default-32 723.60 729.95 1.01x BenchmarkDecoder_DecodeAllFiles/Mark.Twain-Tom.Sawyer.txt/better-32 735.73 737.52 1.00x BenchmarkDecoder_DecodeAllFiles/Mark.Twain-Tom.Sawyer.txt/best-32 745.43 749.55 1.01x BenchmarkDecoder_DecodeAllFiles/e.txt/fastest-32 12801.86 12967.61 1.01x BenchmarkDecoder_DecodeAllFiles/e.txt/default-32 680.29 677.69 1.00x BenchmarkDecoder_DecodeAllFiles/e.txt/better-32 739.23 733.45 0.99x BenchmarkDecoder_DecodeAllFiles/e.txt/best-32 820.16 825.62 1.01x BenchmarkDecoder_DecodeAllFiles/fse-artifact3.bin/fastest-32 1186.63 1194.87 1.01x BenchmarkDecoder_DecodeAllFiles/fse-artifact3.bin/default-32 1384.74 1412.45 1.02x BenchmarkDecoder_DecodeAllFiles/fse-artifact3.bin/better-32 1104.17 1107.00 1.00x BenchmarkDecoder_DecodeAllFiles/fse-artifact3.bin/best-32 409.59 409.27 1.00x BenchmarkDecoder_DecodeAllFiles/gettysburg.txt/fastest-32 392.32 391.89 1.00x BenchmarkDecoder_DecodeAllFiles/gettysburg.txt/default-32 296.47 296.65 1.00x BenchmarkDecoder_DecodeAllFiles/gettysburg.txt/better-32 296.52 296.68 1.00x BenchmarkDecoder_DecodeAllFiles/gettysburg.txt/best-32 299.85 295.83 0.99x BenchmarkDecoder_DecodeAllFiles/html.txt/fastest-32 988.75 996.39 1.01x BenchmarkDecoder_DecodeAllFiles/html.txt/default-32 987.11 989.51 1.00x BenchmarkDecoder_DecodeAllFiles/html.txt/better-32 1027.64 1038.21 1.01x BenchmarkDecoder_DecodeAllFiles/html.txt/best-32 973.41 989.86 1.02x BenchmarkDecoder_DecodeAllFiles/pi.txt/fastest-32 12976.96 13045.11 1.01x BenchmarkDecoder_DecodeAllFiles/pi.txt/default-32 678.88 674.53 0.99x BenchmarkDecoder_DecodeAllFiles/pi.txt/better-32 746.38 747.36 1.00x BenchmarkDecoder_DecodeAllFiles/pi.txt/best-32 823.52 827.84 1.01x 
BenchmarkDecoder_DecodeAllFiles/pngdata.bin/fastest-32 2115.58 2121.84 1.00x BenchmarkDecoder_DecodeAllFiles/pngdata.bin/default-32 1767.98 1779.35 1.01x BenchmarkDecoder_DecodeAllFiles/pngdata.bin/better-32 2306.86 2328.47 1.01x BenchmarkDecoder_DecodeAllFiles/pngdata.bin/best-32 1660.52 1684.65 1.01x BenchmarkDecoder_DecodeAllFiles/sharnd.out/fastest-32 13027.08 12999.49 1.00x BenchmarkDecoder_DecodeAllFiles/sharnd.out/default-32 13054.18 13084.25 1.00x BenchmarkDecoder_DecodeAllFiles/sharnd.out/better-32 13067.23 13099.47 1.00x BenchmarkDecoder_DecodeAllFiles/sharnd.out/best-32 13079.77 13104.13 1.00x BenchmarkDecoder_DecodeAllFilesP/.tracker-unpacked.bin/fastest-32 10354.84 11838.70 1.14x BenchmarkDecoder_DecodeAllFilesP/.tracker-unpacked.bin/default-32 11557.12 13404.78 1.16x BenchmarkDecoder_DecodeAllFilesP/.tracker-unpacked.bin/better-32 12644.67 14519.37 1.15x BenchmarkDecoder_DecodeAllFilesP/.tracker-unpacked.bin/best-32 15934.00 17312.77 1.09x BenchmarkDecoder_DecodeAllFilesP/.tracker.bin/fastest-32 35354.57 34836.95 0.99x BenchmarkDecoder_DecodeAllFilesP/.tracker.bin/default-32 11392.27 11275.11 0.99x BenchmarkDecoder_DecodeAllFilesP/.tracker.bin/better-32 11793.77 11771.24 1.00x BenchmarkDecoder_DecodeAllFilesP/.tracker.bin/best-32 11203.91 11142.52 0.99x BenchmarkDecoder_DecodeAllFilesP/Mark.Twain-Tom.Sawyer.txt/fastest-32 12089.54 11983.77 0.99x BenchmarkDecoder_DecodeAllFilesP/Mark.Twain-Tom.Sawyer.txt/default-32 12604.67 12514.75 0.99x BenchmarkDecoder_DecodeAllFilesP/Mark.Twain-Tom.Sawyer.txt/better-32 13265.79 13152.64 0.99x BenchmarkDecoder_DecodeAllFilesP/Mark.Twain-Tom.Sawyer.txt/best-32 13078.85 12983.91 0.99x BenchmarkDecoder_DecodeAllFilesP/e.txt/fastest-32 52477.17 52657.54 1.00x BenchmarkDecoder_DecodeAllFilesP/e.txt/default-32 11947.06 11809.75 0.99x BenchmarkDecoder_DecodeAllFilesP/e.txt/better-32 13184.17 13140.65 1.00x BenchmarkDecoder_DecodeAllFilesP/e.txt/best-32 14630.26 14718.01 1.01x 
BenchmarkDecoder_DecodeAllFilesP/fse-artifact3.bin/fastest-32 3013.25 3088.05 1.02x BenchmarkDecoder_DecodeAllFilesP/fse-artifact3.bin/default-32 3125.61 3091.48 0.99x BenchmarkDecoder_DecodeAllFilesP/fse-artifact3.bin/better-32 3181.68 3034.74 0.95x BenchmarkDecoder_DecodeAllFilesP/fse-artifact3.bin/best-32 3351.22 3526.91 1.05x BenchmarkDecoder_DecodeAllFilesP/gettysburg.txt/fastest-32 1188.15 1136.88 0.96x BenchmarkDecoder_DecodeAllFilesP/gettysburg.txt/default-32 1215.39 1193.99 0.98x BenchmarkDecoder_DecodeAllFilesP/gettysburg.txt/better-32 1219.20 1206.23 0.99x BenchmarkDecoder_DecodeAllFilesP/gettysburg.txt/best-32 1216.72 1200.26 0.99x BenchmarkDecoder_DecodeAllFilesP/html.txt/fastest-32 16901.32 17076.26 1.01x BenchmarkDecoder_DecodeAllFilesP/html.txt/default-32 16819.66 16892.32 1.00x BenchmarkDecoder_DecodeAllFilesP/html.txt/better-32 17805.12 17873.77 1.00x BenchmarkDecoder_DecodeAllFilesP/html.txt/best-32 16916.87 17184.02 1.02x BenchmarkDecoder_DecodeAllFilesP/pi.txt/fastest-32 52314.15 51687.88 0.99x BenchmarkDecoder_DecodeAllFilesP/pi.txt/default-32 11878.94 11778.57 0.99x BenchmarkDecoder_DecodeAllFilesP/pi.txt/better-32 13303.16 13162.44 0.99x BenchmarkDecoder_DecodeAllFilesP/pi.txt/best-32 14622.76 14717.80 1.01x BenchmarkDecoder_DecodeAllFilesP/pngdata.bin/fastest-32 34134.48 37031.10 1.08x BenchmarkDecoder_DecodeAllFilesP/pngdata.bin/default-32 33589.32 35277.28 1.05x BenchmarkDecoder_DecodeAllFilesP/pngdata.bin/better-32 43754.89 44761.13 1.02x BenchmarkDecoder_DecodeAllFilesP/pngdata.bin/best-32 32422.22 34107.42 1.05x BenchmarkDecoder_DecodeAllFilesP/sharnd.out/fastest-32 52706.00 52396.81 0.99x BenchmarkDecoder_DecodeAllFilesP/sharnd.out/default-32 52527.76 52048.36 0.99x BenchmarkDecoder_DecodeAllFilesP/sharnd.out/better-32 52177.25 52688.64 1.01x BenchmarkDecoder_DecodeAllFilesP/sharnd.out/best-32 52443.28 52799.86 1.01x BenchmarkDecoder_DecodeAllParallel/kppkn.gtb.zst-32 13992.47 13994.15 1.00x 
BenchmarkDecoder_DecodeAllParallel/geo.protodata.zst-32 34107.95 34221.23 1.00x BenchmarkDecoder_DecodeAllParallel/plrabn12.txt.zst-32 12012.34 11976.30 1.00x BenchmarkDecoder_DecodeAllParallel/lcet10.txt.zst-32 12630.22 13384.70 1.06x BenchmarkDecoder_DecodeAllParallel/asyoulik.txt.zst-32 12327.02 12251.04 0.99x BenchmarkDecoder_DecodeAllParallel/alice29.txt.zst-32 11932.73 11896.92 1.00x BenchmarkDecoder_DecodeAllParallel/html_x_4.zst-32 31233.38 36258.56 1.16x BenchmarkDecoder_DecodeAllParallel/paper-100k.pdf.zst-32 97435.31 100317.73 1.03x BenchmarkDecoder_DecodeAllParallel/fireworks.jpeg.zst-32 62247.22 62306.36 1.00x BenchmarkDecoder_DecodeAllParallel/urls.10K.zst-32 18659.58 18592.14 1.00x BenchmarkDecoder_DecodeAllParallel/html.zst-32 28464.78 28519.30 1.00x BenchmarkDecoder_DecodeAllParallel/comp-data.bin.zst-32 3114.03 3297.01 1.06x BenchmarkDecoderSilesia/multithreaded-writer-32 1099.69 1104.92 1.00x BenchmarkDecoderSilesia/multithreaded-writer-himem-32 1093.10 1102.98 1.01x BenchmarkDecoderSilesia/singlethreaded-writer-32 803.85 818.55 1.02x BenchmarkDecoderSilesia/singlethreaded-writerto-32 812.83 828.19 1.02x BenchmarkDecoderSilesia/singlethreaded-himem-32 813.14 828.32 1.02x BenchmarkDecoderEnwik9/multithreaded-writer-32 877.55 996.49 1.14x BenchmarkDecoderEnwik9/multithreaded-writer-himem-32 961.20 1036.76 1.08x BenchmarkDecoderEnwik9/singlethreaded-writer-32 632.07 631.96 1.00x BenchmarkDecoderEnwik9/singlethreaded-writerto-32 634.62 634.52 1.00x BenchmarkDecoderEnwik9/singlethreaded-himem-32 763.68 758.40 0.99x BenchmarkDecoderWithCustomFiles/github-june-2days-2019.json.zst/multithreaded-writer-32 1626.86 1730.88 1.06x BenchmarkDecoderWithCustomFiles/github-june-2days-2019.json.zst/multithreaded-writer-himem-32 2299.80 2375.04 1.03x BenchmarkDecoderWithCustomFiles/github-june-2days-2019.json.zst/singlethreaded-writer-32 1221.34 1221.43 1.00x BenchmarkDecoderWithCustomFiles/github-june-2days-2019.json.zst/singlethreaded-writerto-32 1236.18 1237.97 
1.00x BenchmarkDecoderWithCustomFiles/github-june-2days-2019.json.zst/singlethreaded-himem-32 1749.21 1754.96 1.00x BenchmarkDecoderWithCustomFiles/github-ranks-backup.bin.zst/multithreaded-writer-32 839.51 933.63 1.11x BenchmarkDecoderWithCustomFiles/github-ranks-backup.bin.zst/multithreaded-writer-himem-32 1055.54 1100.37 1.04x BenchmarkDecoderWithCustomFiles/github-ranks-backup.bin.zst/singlethreaded-writer-32 574.91 613.88 1.07x BenchmarkDecoderWithCustomFiles/github-ranks-backup.bin.zst/singlethreaded-writerto-32 579.19 618.72 1.07x BenchmarkDecoderWithCustomFiles/github-ranks-backup.bin.zst/singlethreaded-himem-32 780.67 867.96 1.11x ``` --- zstd/_generate/gen.go | 107 ++-- zstd/seqdec_amd64.s | 1164 +++++++++++++++++++++++------------------ 2 files changed, 698 insertions(+), 573 deletions(-) diff --git a/zstd/_generate/gen.go b/zstd/_generate/gen.go index 96671d414d..b1fa6c124a 100644 --- a/zstd/_generate/gen.go +++ b/zstd/_generate/gen.go @@ -1132,7 +1132,7 @@ func (e executeSimple) executeSingleTriple(c *executeSingleTripleContext, handle JZ(LabelRef("check_offset")) // TODO: Investigate if it is possible to consistently overallocate literals. if e.safeMem { - e.copyMemoryPrecise("1", c.literals, c.outBase, ll) + e.copyMemoryPrecise("1", c.literals, c.outBase, ll, 1) } else { e.copyMemoryND("1", c.literals, c.outBase, ll) ADDQ(ll, c.literals) @@ -1201,7 +1201,8 @@ func (e executeSimple) executeSingleTriple(c *executeSingleTripleContext, handle continue } */ - e.copyMemoryPrecise("4", ptr, c.outBase, ml) + // We know ml will be at least 3 + e.copyMemoryPrecise("4", ptr, c.outBase, ml, 3) ADDQ(ml, c.outPosition) // Note: for the current go tests this branch is taken in 99.53% cases, // this is why we repeat a little code here. 
@@ -1217,7 +1218,7 @@ func (e executeSimple) executeSingleTriple(c *executeSingleTripleContext, handle seq.ml -= v } */ - e.copyMemoryPrecise("5", ptr, c.outBase, v) + e.copyMemoryPrecise("5", ptr, c.outBase, v, 1) ADDQ(v, c.outPosition) SUBQ(v, ml) // fallback to the next block @@ -1251,7 +1252,7 @@ func (e executeSimple) executeSingleTriple(c *executeSingleTripleContext, handle { ADDQ(ml, c.outPosition) if e.safeMem { - e.copyMemoryPrecise("2", src, c.outBase, ml) + e.copyMemoryPrecise("2", src, c.outBase, ml, 1) } else { dst := GP64() MOVQ(c.outBase, dst) @@ -1311,7 +1312,7 @@ func (e executeSimple) copyMemoryND(suffix string, src, dst, length reg.GPVirtua // copyMemoryPrecise will copy memory in blocks of 16 bytes, // without overreading. It adds length to src and dst, // preserving length. -func (e executeSimple) copyMemoryPrecise(suffix string, src, dst, length reg.GPVirtual) { +func (e executeSimple) copyMemoryPrecise(suffix string, src, dst, length reg.GPVirtual, minLength int) { n := GP64() MOVQ(length, n) SUBQ(U8(16), n) @@ -1346,53 +1347,57 @@ func (e executeSimple) copyMemoryPrecise(suffix string, src, dst, length reg.GPV } Label("copy_" + suffix + "_small") - ofs := GP64() - s := Mem{Base: src, Index: ofs, Scale: 1} - d := Mem{Base: dst, Index: ofs, Scale: 1} - - tmp := GP64() - XORQ(ofs, ofs) - - Label("copy_" + suffix + "_byte") - TESTQ(U32(0x1), length) - JZ(LabelRef("copy_" + suffix + "_word")) - - // copy one byte if length & 0x01 != 0 - MOVB(s, tmp.As8()) - MOVB(tmp.As8(), d) - ADDQ(U8(1), ofs) - - Label("copy_" + suffix + "_word") - TESTQ(U32(0x2), length) - JZ(LabelRef("copy_" + suffix + "_dword")) - - // copy two bytes if length & 0x02 != 0 - MOVW(s, tmp.As16()) - MOVW(tmp.As16(), d) - ADDQ(U8(2), ofs) - - Label("copy_" + suffix + "_dword") - TESTQ(U32(0x4), length) - JZ(LabelRef("copy_" + suffix + "_qword")) - - // copy four bytes if length & 0x04 != 0 - MOVL(s, tmp.As32()) - MOVL(tmp.As32(), d) - ADDQ(U8(4), ofs) - - Label("copy_" + suffix + 
"_qword") - TESTQ(U32(0x8), length) - JZ(LabelRef("copy_" + suffix + "_add")) - - // copy eight bytes if length & 0x08 != 0 - MOVQ(s, tmp) - MOVQ(tmp, d) - ADDQ(U8(8), ofs) - - Label("copy_" + suffix + "_add") - ADDQ(length, dst) - ADDQ(length, src) + { + name := "copy_" + suffix + "_" + end := LabelRef("copy_" + suffix + "_end") + CMPQ(length, U8(3)) + JE(LabelRef(name + "move_3")) + if minLength < 3 { + JB(LabelRef(name + "move_1or2")) + } + CMPQ(length, U8(8)) + JB(LabelRef(name + "move_4through7")) + JMP(LabelRef(name + "move_8through16")) + AX, CX := GP64(), GP64() + + if minLength < 3 { + Label(name + "move_1or2") + MOVB(Mem{Base: src}, AX.As8()) + MOVB(Mem{Base: src, Disp: -1, Index: length, Scale: 1}, CX.As8()) + MOVB(AX.As8(), Mem{Base: dst}) + MOVB(CX.As8(), Mem{Base: dst, Disp: -1, Index: length, Scale: 1}) + ADDQ(length, src) + ADDQ(length, dst) + JMP(end) + } + Label(name + "move_3") + MOVW(Mem{Base: src}, AX.As16()) + MOVB(Mem{Base: src, Disp: 2}, CX.As8()) + MOVW(AX.As16(), Mem{Base: dst}) + MOVB(CX.As8(), Mem{Base: dst, Disp: 2}) + ADDQ(length, src) + ADDQ(length, dst) + JMP(end) + + Label(name + "move_4through7") + MOVL(Mem{Base: src}, AX.As32()) + MOVL(Mem{Base: src, Disp: -4, Index: length, Scale: 1}, CX.As32()) + MOVL(AX.As32(), Mem{Base: dst}) + MOVL(CX.As32(), Mem{Base: dst, Disp: -4, Index: length, Scale: 1}) + ADDQ(length, src) + ADDQ(length, dst) + JMP(end) + + Label(name + "move_8through16") + MOVQ(Mem{Base: src}, AX) + MOVQ(Mem{Base: src, Disp: -8, Index: length, Scale: 1}, CX) + MOVQ(AX, Mem{Base: dst}) + MOVQ(CX, Mem{Base: dst, Disp: -8, Index: length, Scale: 1}) + ADDQ(length, src) + ADDQ(length, dst) + JMP(end) + } Label("copy_" + suffix + "_end") } diff --git a/zstd/seqdec_amd64.s b/zstd/seqdec_amd64.s index 9d76f0580f..1a86f13577 100644 --- a/zstd/seqdec_amd64.s +++ b/zstd/seqdec_amd64.s @@ -1206,37 +1206,37 @@ copy_4_loop: JMP copy_4_end copy_4_small: - XORQ R11, R11 - TESTQ $0x00000001, R13 - JZ copy_4_word - MOVB (R14)(R11*1), 
R12 - MOVB R12, (BX)(R11*1) - ADDQ $0x01, R11 - -copy_4_word: - TESTQ $0x00000002, R13 - JZ copy_4_dword - MOVW (R14)(R11*1), R12 - MOVW R12, (BX)(R11*1) - ADDQ $0x02, R11 - -copy_4_dword: - TESTQ $0x00000004, R13 - JZ copy_4_qword - MOVL (R14)(R11*1), R12 - MOVL R12, (BX)(R11*1) - ADDQ $0x04, R11 - -copy_4_qword: - TESTQ $0x00000008, R13 - JZ copy_4_add - MOVQ (R14)(R11*1), R12 - MOVQ R12, (BX)(R11*1) - ADDQ $0x08, R11 - -copy_4_add: + CMPQ R13, $0x03 + JE copy_4_move_3 + CMPQ R13, $0x08 + JB copy_4_move_4through7 + JMP copy_4_move_8through16 + +copy_4_move_3: + MOVW (R14), R11 + MOVB 2(R14), R12 + MOVW R11, (BX) + MOVB R12, 2(BX) + ADDQ R13, R14 ADDQ R13, BX + JMP copy_4_end + +copy_4_move_4through7: + MOVL (R14), R11 + MOVL -4(R14)(R13*1), R12 + MOVL R11, (BX) + MOVL R12, -4(BX)(R13*1) + ADDQ R13, R14 + ADDQ R13, BX + JMP copy_4_end + +copy_4_move_8through16: + MOVQ (R14), R11 + MOVQ -8(R14)(R13*1), R12 + MOVQ R11, (BX) + MOVQ R12, -8(BX)(R13*1) ADDQ R13, R14 + ADDQ R13, BX copy_4_end: ADDQ R13, DI @@ -1265,37 +1265,47 @@ copy_5_loop: JMP copy_5_end copy_5_small: - XORQ R15, R15 - TESTQ $0x00000001, R11 - JZ copy_5_word - MOVB (R14)(R15*1), BP - MOVB BP, (BX)(R15*1) - ADDQ $0x01, R15 - -copy_5_word: - TESTQ $0x00000002, R11 - JZ copy_5_dword - MOVW (R14)(R15*1), BP - MOVW BP, (BX)(R15*1) - ADDQ $0x02, R15 - -copy_5_dword: - TESTQ $0x00000004, R11 - JZ copy_5_qword - MOVL (R14)(R15*1), BP - MOVL BP, (BX)(R15*1) - ADDQ $0x04, R15 - -copy_5_qword: - TESTQ $0x00000008, R11 - JZ copy_5_add - MOVQ (R14)(R15*1), BP - MOVQ BP, (BX)(R15*1) - ADDQ $0x08, R15 - -copy_5_add: + CMPQ R11, $0x03 + JE copy_5_move_3 + JB copy_5_move_1or2 + CMPQ R11, $0x08 + JB copy_5_move_4through7 + JMP copy_5_move_8through16 + +copy_5_move_1or2: + MOVB (R14), R15 + MOVB -1(R14)(R11*1), BP + MOVB R15, (BX) + MOVB BP, -1(BX)(R11*1) + ADDQ R11, R14 + ADDQ R11, BX + JMP copy_5_end + +copy_5_move_3: + MOVW (R14), R15 + MOVB 2(R14), BP + MOVW R15, (BX) + MOVB BP, 2(BX) + ADDQ R11, R14 ADDQ R11, BX + 
JMP copy_5_end + +copy_5_move_4through7: + MOVL (R14), R15 + MOVL -4(R14)(R11*1), BP + MOVL R15, (BX) + MOVL BP, -4(BX)(R11*1) ADDQ R11, R14 + ADDQ R11, BX + JMP copy_5_end + +copy_5_move_8through16: + MOVQ (R14), R15 + MOVQ -8(R14)(R11*1), BP + MOVQ R15, (BX) + MOVQ BP, -8(BX)(R11*1) + ADDQ R11, R14 + ADDQ R11, BX copy_5_end: ADDQ R11, DI @@ -1426,37 +1436,47 @@ copy_1_loop: JMP copy_1_end copy_1_small: - XORQ R14, R14 - TESTQ $0x00000001, R11 - JZ copy_1_word - MOVB (SI)(R14*1), R15 - MOVB R15, (BX)(R14*1) - ADDQ $0x01, R14 - -copy_1_word: - TESTQ $0x00000002, R11 - JZ copy_1_dword - MOVW (SI)(R14*1), R15 - MOVW R15, (BX)(R14*1) - ADDQ $0x02, R14 - -copy_1_dword: - TESTQ $0x00000004, R11 - JZ copy_1_qword - MOVL (SI)(R14*1), R15 - MOVL R15, (BX)(R14*1) - ADDQ $0x04, R14 - -copy_1_qword: - TESTQ $0x00000008, R11 - JZ copy_1_add - MOVQ (SI)(R14*1), R15 - MOVQ R15, (BX)(R14*1) - ADDQ $0x08, R14 - -copy_1_add: + CMPQ R11, $0x03 + JE copy_1_move_3 + JB copy_1_move_1or2 + CMPQ R11, $0x08 + JB copy_1_move_4through7 + JMP copy_1_move_8through16 + +copy_1_move_1or2: + MOVB (SI), R14 + MOVB -1(SI)(R11*1), R15 + MOVB R14, (BX) + MOVB R15, -1(BX)(R11*1) + ADDQ R11, SI ADDQ R11, BX + JMP copy_1_end + +copy_1_move_3: + MOVW (SI), R14 + MOVB 2(SI), R15 + MOVW R14, (BX) + MOVB R15, 2(BX) ADDQ R11, SI + ADDQ R11, BX + JMP copy_1_end + +copy_1_move_4through7: + MOVL (SI), R14 + MOVL -4(SI)(R11*1), R15 + MOVL R14, (BX) + MOVL R15, -4(BX)(R11*1) + ADDQ R11, SI + ADDQ R11, BX + JMP copy_1_end + +copy_1_move_8through16: + MOVQ (SI), R14 + MOVQ -8(SI)(R11*1), R15 + MOVQ R14, (BX) + MOVQ R15, -8(BX)(R11*1) + ADDQ R11, SI + ADDQ R11, BX copy_1_end: ADDQ R11, DI @@ -1495,37 +1515,37 @@ copy_4_loop: JMP copy_4_end copy_4_small: - XORQ R11, R11 - TESTQ $0x00000001, R13 - JZ copy_4_word - MOVB (R14)(R11*1), R12 - MOVB R12, (BX)(R11*1) - ADDQ $0x01, R11 - -copy_4_word: - TESTQ $0x00000002, R13 - JZ copy_4_dword - MOVW (R14)(R11*1), R12 - MOVW R12, (BX)(R11*1) - ADDQ $0x02, R11 - 
-copy_4_dword: - TESTQ $0x00000004, R13 - JZ copy_4_qword - MOVL (R14)(R11*1), R12 - MOVL R12, (BX)(R11*1) - ADDQ $0x04, R11 - -copy_4_qword: - TESTQ $0x00000008, R13 - JZ copy_4_add - MOVQ (R14)(R11*1), R12 - MOVQ R12, (BX)(R11*1) - ADDQ $0x08, R11 - -copy_4_add: + CMPQ R13, $0x03 + JE copy_4_move_3 + CMPQ R13, $0x08 + JB copy_4_move_4through7 + JMP copy_4_move_8through16 + +copy_4_move_3: + MOVW (R14), R11 + MOVB 2(R14), R12 + MOVW R11, (BX) + MOVB R12, 2(BX) + ADDQ R13, R14 ADDQ R13, BX + JMP copy_4_end + +copy_4_move_4through7: + MOVL (R14), R11 + MOVL -4(R14)(R13*1), R12 + MOVL R11, (BX) + MOVL R12, -4(BX)(R13*1) ADDQ R13, R14 + ADDQ R13, BX + JMP copy_4_end + +copy_4_move_8through16: + MOVQ (R14), R11 + MOVQ -8(R14)(R13*1), R12 + MOVQ R11, (BX) + MOVQ R12, -8(BX)(R13*1) + ADDQ R13, R14 + ADDQ R13, BX copy_4_end: ADDQ R13, DI @@ -1554,37 +1574,47 @@ copy_5_loop: JMP copy_5_end copy_5_small: - XORQ R15, R15 - TESTQ $0x00000001, R11 - JZ copy_5_word - MOVB (R14)(R15*1), BP - MOVB BP, (BX)(R15*1) - ADDQ $0x01, R15 - -copy_5_word: - TESTQ $0x00000002, R11 - JZ copy_5_dword - MOVW (R14)(R15*1), BP - MOVW BP, (BX)(R15*1) - ADDQ $0x02, R15 - -copy_5_dword: - TESTQ $0x00000004, R11 - JZ copy_5_qword - MOVL (R14)(R15*1), BP - MOVL BP, (BX)(R15*1) - ADDQ $0x04, R15 - -copy_5_qword: - TESTQ $0x00000008, R11 - JZ copy_5_add - MOVQ (R14)(R15*1), BP - MOVQ BP, (BX)(R15*1) - ADDQ $0x08, R15 - -copy_5_add: + CMPQ R11, $0x03 + JE copy_5_move_3 + JB copy_5_move_1or2 + CMPQ R11, $0x08 + JB copy_5_move_4through7 + JMP copy_5_move_8through16 + +copy_5_move_1or2: + MOVB (R14), R15 + MOVB -1(R14)(R11*1), BP + MOVB R15, (BX) + MOVB BP, -1(BX)(R11*1) + ADDQ R11, R14 ADDQ R11, BX + JMP copy_5_end + +copy_5_move_3: + MOVW (R14), R15 + MOVB 2(R14), BP + MOVW R15, (BX) + MOVB BP, 2(BX) ADDQ R11, R14 + ADDQ R11, BX + JMP copy_5_end + +copy_5_move_4through7: + MOVL (R14), R15 + MOVL -4(R14)(R11*1), BP + MOVL R15, (BX) + MOVL BP, -4(BX)(R11*1) + ADDQ R11, R14 + ADDQ R11, BX + JMP copy_5_end 
+ +copy_5_move_8through16: + MOVQ (R14), R15 + MOVQ -8(R14)(R11*1), BP + MOVQ R15, (BX) + MOVQ BP, -8(BX)(R11*1) + ADDQ R11, R14 + ADDQ R11, BX copy_5_end: ADDQ R11, DI @@ -1621,37 +1651,47 @@ copy_2_loop: JMP copy_2_end copy_2_small: - XORQ R12, R12 - TESTQ $0x00000001, R13 - JZ copy_2_word - MOVB (R11)(R12*1), R14 - MOVB R14, (BX)(R12*1) - ADDQ $0x01, R12 - -copy_2_word: - TESTQ $0x00000002, R13 - JZ copy_2_dword - MOVW (R11)(R12*1), R14 - MOVW R14, (BX)(R12*1) - ADDQ $0x02, R12 - -copy_2_dword: - TESTQ $0x00000004, R13 - JZ copy_2_qword - MOVL (R11)(R12*1), R14 - MOVL R14, (BX)(R12*1) - ADDQ $0x04, R12 - -copy_2_qword: - TESTQ $0x00000008, R13 - JZ copy_2_add - MOVQ (R11)(R12*1), R14 - MOVQ R14, (BX)(R12*1) - ADDQ $0x08, R12 - -copy_2_add: + CMPQ R13, $0x03 + JE copy_2_move_3 + JB copy_2_move_1or2 + CMPQ R13, $0x08 + JB copy_2_move_4through7 + JMP copy_2_move_8through16 + +copy_2_move_1or2: + MOVB (R11), R12 + MOVB -1(R11)(R13*1), R14 + MOVB R12, (BX) + MOVB R14, -1(BX)(R13*1) + ADDQ R13, R11 + ADDQ R13, BX + JMP copy_2_end + +copy_2_move_3: + MOVW (R11), R12 + MOVB 2(R11), R14 + MOVW R12, (BX) + MOVB R14, 2(BX) + ADDQ R13, R11 ADDQ R13, BX + JMP copy_2_end + +copy_2_move_4through7: + MOVL (R11), R12 + MOVL -4(R11)(R13*1), R14 + MOVL R12, (BX) + MOVL R14, -4(BX)(R13*1) ADDQ R13, R11 + ADDQ R13, BX + JMP copy_2_end + +copy_2_move_8through16: + MOVQ (R11), R12 + MOVQ -8(R11)(R13*1), R14 + MOVQ R12, (BX) + MOVQ R14, -8(BX)(R13*1) + ADDQ R13, R11 + ADDQ R13, BX copy_2_end: JMP handle_loop @@ -2036,37 +2076,37 @@ copy_4_loop: JMP copy_4_end copy_4_small: - XORQ AX, AX - TESTQ $0x00000001, R13 - JZ copy_4_word - MOVB (R14)(AX*1), CL - MOVB CL, (R10)(AX*1) - ADDQ $0x01, AX - -copy_4_word: - TESTQ $0x00000002, R13 - JZ copy_4_dword - MOVW (R14)(AX*1), CX - MOVW CX, (R10)(AX*1) - ADDQ $0x02, AX - -copy_4_dword: - TESTQ $0x00000004, R13 - JZ copy_4_qword - MOVL (R14)(AX*1), CX - MOVL CX, (R10)(AX*1) - ADDQ $0x04, AX - -copy_4_qword: - TESTQ $0x00000008, R13 - JZ 
copy_4_add - MOVQ (R14)(AX*1), CX - MOVQ CX, (R10)(AX*1) - ADDQ $0x08, AX - -copy_4_add: + CMPQ R13, $0x03 + JE copy_4_move_3 + CMPQ R13, $0x08 + JB copy_4_move_4through7 + JMP copy_4_move_8through16 + +copy_4_move_3: + MOVW (R14), AX + MOVB 2(R14), CL + MOVW AX, (R10) + MOVB CL, 2(R10) + ADDQ R13, R14 ADDQ R13, R10 + JMP copy_4_end + +copy_4_move_4through7: + MOVL (R14), AX + MOVL -4(R14)(R13*1), CX + MOVL AX, (R10) + MOVL CX, -4(R10)(R13*1) ADDQ R13, R14 + ADDQ R13, R10 + JMP copy_4_end + +copy_4_move_8through16: + MOVQ (R14), AX + MOVQ -8(R14)(R13*1), CX + MOVQ AX, (R10) + MOVQ CX, -8(R10)(R13*1) + ADDQ R13, R14 + ADDQ R13, R10 copy_4_end: ADDQ R13, R12 @@ -2092,37 +2132,47 @@ copy_5_loop: JMP copy_5_end copy_5_small: - XORQ R15, R15 - TESTQ $0x00000001, AX - JZ copy_5_word - MOVB (R14)(R15*1), BP - MOVB BP, (R10)(R15*1) - ADDQ $0x01, R15 - -copy_5_word: - TESTQ $0x00000002, AX - JZ copy_5_dword - MOVW (R14)(R15*1), BP - MOVW BP, (R10)(R15*1) - ADDQ $0x02, R15 - -copy_5_dword: - TESTQ $0x00000004, AX - JZ copy_5_qword - MOVL (R14)(R15*1), BP - MOVL BP, (R10)(R15*1) - ADDQ $0x04, R15 - -copy_5_qword: - TESTQ $0x00000008, AX - JZ copy_5_add - MOVQ (R14)(R15*1), BP - MOVQ BP, (R10)(R15*1) - ADDQ $0x08, R15 - -copy_5_add: + CMPQ AX, $0x03 + JE copy_5_move_3 + JB copy_5_move_1or2 + CMPQ AX, $0x08 + JB copy_5_move_4through7 + JMP copy_5_move_8through16 + +copy_5_move_1or2: + MOVB (R14), R15 + MOVB -1(R14)(AX*1), BP + MOVB R15, (R10) + MOVB BP, -1(R10)(AX*1) + ADDQ AX, R14 ADDQ AX, R10 + JMP copy_5_end + +copy_5_move_3: + MOVW (R14), R15 + MOVB 2(R14), BP + MOVW R15, (R10) + MOVB BP, 2(R10) + ADDQ AX, R14 + ADDQ AX, R10 + JMP copy_5_end + +copy_5_move_4through7: + MOVL (R14), R15 + MOVL -4(R14)(AX*1), BP + MOVL R15, (R10) + MOVL BP, -4(R10)(AX*1) + ADDQ AX, R14 + ADDQ AX, R10 + JMP copy_5_end + +copy_5_move_8through16: + MOVQ (R14), R15 + MOVQ -8(R14)(AX*1), BP + MOVQ R15, (R10) + MOVQ BP, -8(R10)(AX*1) ADDQ AX, R14 + ADDQ AX, R10 copy_5_end: ADDQ AX, R12 @@ -2535,37 
+2585,37 @@ copy_4_loop: JMP copy_4_end copy_4_small: - XORQ CX, CX - TESTQ $0x00000001, R13 - JZ copy_4_word - MOVB (R14)(CX*1), R12 - MOVB R12, (R9)(CX*1) - ADDQ $0x01, CX - -copy_4_word: - TESTQ $0x00000002, R13 - JZ copy_4_dword - MOVW (R14)(CX*1), R12 - MOVW R12, (R9)(CX*1) - ADDQ $0x02, CX - -copy_4_dword: - TESTQ $0x00000004, R13 - JZ copy_4_qword - MOVL (R14)(CX*1), R12 - MOVL R12, (R9)(CX*1) - ADDQ $0x04, CX - -copy_4_qword: - TESTQ $0x00000008, R13 - JZ copy_4_add - MOVQ (R14)(CX*1), R12 - MOVQ R12, (R9)(CX*1) - ADDQ $0x08, CX - -copy_4_add: + CMPQ R13, $0x03 + JE copy_4_move_3 + CMPQ R13, $0x08 + JB copy_4_move_4through7 + JMP copy_4_move_8through16 + +copy_4_move_3: + MOVW (R14), CX + MOVB 2(R14), R12 + MOVW CX, (R9) + MOVB R12, 2(R9) + ADDQ R13, R14 ADDQ R13, R9 + JMP copy_4_end + +copy_4_move_4through7: + MOVL (R14), CX + MOVL -4(R14)(R13*1), R12 + MOVL CX, (R9) + MOVL R12, -4(R9)(R13*1) ADDQ R13, R14 + ADDQ R13, R9 + JMP copy_4_end + +copy_4_move_8through16: + MOVQ (R14), CX + MOVQ -8(R14)(R13*1), R12 + MOVQ CX, (R9) + MOVQ R12, -8(R9)(R13*1) + ADDQ R13, R14 + ADDQ R13, R9 copy_4_end: ADDQ R13, R11 @@ -2591,37 +2641,47 @@ copy_5_loop: JMP copy_5_end copy_5_small: - XORQ R15, R15 - TESTQ $0x00000001, CX - JZ copy_5_word - MOVB (R14)(R15*1), BP - MOVB BP, (R9)(R15*1) - ADDQ $0x01, R15 - -copy_5_word: - TESTQ $0x00000002, CX - JZ copy_5_dword - MOVW (R14)(R15*1), BP - MOVW BP, (R9)(R15*1) - ADDQ $0x02, R15 - -copy_5_dword: - TESTQ $0x00000004, CX - JZ copy_5_qword - MOVL (R14)(R15*1), BP - MOVL BP, (R9)(R15*1) - ADDQ $0x04, R15 - -copy_5_qword: - TESTQ $0x00000008, CX - JZ copy_5_add - MOVQ (R14)(R15*1), BP - MOVQ BP, (R9)(R15*1) - ADDQ $0x08, R15 - -copy_5_add: + CMPQ CX, $0x03 + JE copy_5_move_3 + JB copy_5_move_1or2 + CMPQ CX, $0x08 + JB copy_5_move_4through7 + JMP copy_5_move_8through16 + +copy_5_move_1or2: + MOVB (R14), R15 + MOVB -1(R14)(CX*1), BP + MOVB R15, (R9) + MOVB BP, -1(R9)(CX*1) + ADDQ CX, R14 ADDQ CX, R9 + JMP copy_5_end + 
+copy_5_move_3: + MOVW (R14), R15 + MOVB 2(R14), BP + MOVW R15, (R9) + MOVB BP, 2(R9) ADDQ CX, R14 + ADDQ CX, R9 + JMP copy_5_end + +copy_5_move_4through7: + MOVL (R14), R15 + MOVL -4(R14)(CX*1), BP + MOVL R15, (R9) + MOVL BP, -4(R9)(CX*1) + ADDQ CX, R14 + ADDQ CX, R9 + JMP copy_5_end + +copy_5_move_8through16: + MOVQ (R14), R15 + MOVQ -8(R14)(CX*1), BP + MOVQ R15, (R9) + MOVQ BP, -8(R9)(CX*1) + ADDQ CX, R14 + ADDQ CX, R9 copy_5_end: ADDQ CX, R11 @@ -3032,37 +3092,47 @@ copy_1_loop: JMP copy_1_end copy_1_small: - XORQ R14, R14 - TESTQ $0x00000001, AX - JZ copy_1_word - MOVB (R11)(R14*1), R15 - MOVB R15, (R10)(R14*1) - ADDQ $0x01, R14 - -copy_1_word: - TESTQ $0x00000002, AX - JZ copy_1_dword - MOVW (R11)(R14*1), R15 - MOVW R15, (R10)(R14*1) - ADDQ $0x02, R14 - -copy_1_dword: - TESTQ $0x00000004, AX - JZ copy_1_qword - MOVL (R11)(R14*1), R15 - MOVL R15, (R10)(R14*1) - ADDQ $0x04, R14 - -copy_1_qword: - TESTQ $0x00000008, AX - JZ copy_1_add - MOVQ (R11)(R14*1), R15 - MOVQ R15, (R10)(R14*1) - ADDQ $0x08, R14 - -copy_1_add: + CMPQ AX, $0x03 + JE copy_1_move_3 + JB copy_1_move_1or2 + CMPQ AX, $0x08 + JB copy_1_move_4through7 + JMP copy_1_move_8through16 + +copy_1_move_1or2: + MOVB (R11), R14 + MOVB -1(R11)(AX*1), R15 + MOVB R14, (R10) + MOVB R15, -1(R10)(AX*1) + ADDQ AX, R11 ADDQ AX, R10 + JMP copy_1_end + +copy_1_move_3: + MOVW (R11), R14 + MOVB 2(R11), R15 + MOVW R14, (R10) + MOVB R15, 2(R10) ADDQ AX, R11 + ADDQ AX, R10 + JMP copy_1_end + +copy_1_move_4through7: + MOVL (R11), R14 + MOVL -4(R11)(AX*1), R15 + MOVL R14, (R10) + MOVL R15, -4(R10)(AX*1) + ADDQ AX, R11 + ADDQ AX, R10 + JMP copy_1_end + +copy_1_move_8through16: + MOVQ (R11), R14 + MOVQ -8(R11)(AX*1), R15 + MOVQ R14, (R10) + MOVQ R15, -8(R10)(AX*1) + ADDQ AX, R11 + ADDQ AX, R10 copy_1_end: ADDQ AX, R12 @@ -3102,37 +3172,37 @@ copy_4_loop: JMP copy_4_end copy_4_small: - XORQ AX, AX - TESTQ $0x00000001, R13 - JZ copy_4_word - MOVB (R14)(AX*1), CL - MOVB CL, (R10)(AX*1) - ADDQ $0x01, AX - -copy_4_word: - TESTQ 
$0x00000002, R13 - JZ copy_4_dword - MOVW (R14)(AX*1), CX - MOVW CX, (R10)(AX*1) - ADDQ $0x02, AX - -copy_4_dword: - TESTQ $0x00000004, R13 - JZ copy_4_qword - MOVL (R14)(AX*1), CX - MOVL CX, (R10)(AX*1) - ADDQ $0x04, AX - -copy_4_qword: - TESTQ $0x00000008, R13 - JZ copy_4_add - MOVQ (R14)(AX*1), CX - MOVQ CX, (R10)(AX*1) - ADDQ $0x08, AX - -copy_4_add: + CMPQ R13, $0x03 + JE copy_4_move_3 + CMPQ R13, $0x08 + JB copy_4_move_4through7 + JMP copy_4_move_8through16 + +copy_4_move_3: + MOVW (R14), AX + MOVB 2(R14), CL + MOVW AX, (R10) + MOVB CL, 2(R10) + ADDQ R13, R14 ADDQ R13, R10 + JMP copy_4_end + +copy_4_move_4through7: + MOVL (R14), AX + MOVL -4(R14)(R13*1), CX + MOVL AX, (R10) + MOVL CX, -4(R10)(R13*1) + ADDQ R13, R14 + ADDQ R13, R10 + JMP copy_4_end + +copy_4_move_8through16: + MOVQ (R14), AX + MOVQ -8(R14)(R13*1), CX + MOVQ AX, (R10) + MOVQ CX, -8(R10)(R13*1) ADDQ R13, R14 + ADDQ R13, R10 copy_4_end: ADDQ R13, R12 @@ -3158,37 +3228,47 @@ copy_5_loop: JMP copy_5_end copy_5_small: - XORQ R15, R15 - TESTQ $0x00000001, AX - JZ copy_5_word - MOVB (R14)(R15*1), BP - MOVB BP, (R10)(R15*1) - ADDQ $0x01, R15 - -copy_5_word: - TESTQ $0x00000002, AX - JZ copy_5_dword - MOVW (R14)(R15*1), BP - MOVW BP, (R10)(R15*1) - ADDQ $0x02, R15 - -copy_5_dword: - TESTQ $0x00000004, AX - JZ copy_5_qword - MOVL (R14)(R15*1), BP - MOVL BP, (R10)(R15*1) - ADDQ $0x04, R15 - -copy_5_qword: - TESTQ $0x00000008, AX - JZ copy_5_add - MOVQ (R14)(R15*1), BP - MOVQ BP, (R10)(R15*1) - ADDQ $0x08, R15 - -copy_5_add: + CMPQ AX, $0x03 + JE copy_5_move_3 + JB copy_5_move_1or2 + CMPQ AX, $0x08 + JB copy_5_move_4through7 + JMP copy_5_move_8through16 + +copy_5_move_1or2: + MOVB (R14), R15 + MOVB -1(R14)(AX*1), BP + MOVB R15, (R10) + MOVB BP, -1(R10)(AX*1) + ADDQ AX, R14 + ADDQ AX, R10 + JMP copy_5_end + +copy_5_move_3: + MOVW (R14), R15 + MOVB 2(R14), BP + MOVW R15, (R10) + MOVB BP, 2(R10) + ADDQ AX, R14 ADDQ AX, R10 + JMP copy_5_end + +copy_5_move_4through7: + MOVL (R14), R15 + MOVL -4(R14)(AX*1), BP + 
MOVL R15, (R10) + MOVL BP, -4(R10)(AX*1) ADDQ AX, R14 + ADDQ AX, R10 + JMP copy_5_end + +copy_5_move_8through16: + MOVQ (R14), R15 + MOVQ -8(R14)(AX*1), BP + MOVQ R15, (R10) + MOVQ BP, -8(R10)(AX*1) + ADDQ AX, R14 + ADDQ AX, R10 copy_5_end: ADDQ AX, R12 @@ -3225,37 +3305,47 @@ copy_2_loop: JMP copy_2_end copy_2_small: - XORQ CX, CX - TESTQ $0x00000001, R13 - JZ copy_2_word - MOVB (AX)(CX*1), R14 - MOVB R14, (R10)(CX*1) - ADDQ $0x01, CX - -copy_2_word: - TESTQ $0x00000002, R13 - JZ copy_2_dword - MOVW (AX)(CX*1), R14 - MOVW R14, (R10)(CX*1) - ADDQ $0x02, CX - -copy_2_dword: - TESTQ $0x00000004, R13 - JZ copy_2_qword - MOVL (AX)(CX*1), R14 - MOVL R14, (R10)(CX*1) - ADDQ $0x04, CX - -copy_2_qword: - TESTQ $0x00000008, R13 - JZ copy_2_add - MOVQ (AX)(CX*1), R14 - MOVQ R14, (R10)(CX*1) - ADDQ $0x08, CX - -copy_2_add: + CMPQ R13, $0x03 + JE copy_2_move_3 + JB copy_2_move_1or2 + CMPQ R13, $0x08 + JB copy_2_move_4through7 + JMP copy_2_move_8through16 + +copy_2_move_1or2: + MOVB (AX), CL + MOVB -1(AX)(R13*1), R14 + MOVB CL, (R10) + MOVB R14, -1(R10)(R13*1) + ADDQ R13, AX + ADDQ R13, R10 + JMP copy_2_end + +copy_2_move_3: + MOVW (AX), CX + MOVB 2(AX), R14 + MOVW CX, (R10) + MOVB R14, 2(R10) + ADDQ R13, AX + ADDQ R13, R10 + JMP copy_2_end + +copy_2_move_4through7: + MOVL (AX), CX + MOVL -4(AX)(R13*1), R14 + MOVL CX, (R10) + MOVL R14, -4(R10)(R13*1) + ADDQ R13, AX ADDQ R13, R10 + JMP copy_2_end + +copy_2_move_8through16: + MOVQ (AX), CX + MOVQ -8(AX)(R13*1), R14 + MOVQ CX, (R10) + MOVQ R14, -8(R10)(R13*1) ADDQ R13, AX + ADDQ R13, R10 copy_2_end: JMP handle_loop @@ -3613,37 +3703,47 @@ copy_1_loop: JMP copy_1_end copy_1_small: - XORQ R14, R14 - TESTQ $0x00000001, CX - JZ copy_1_word - MOVB (R10)(R14*1), R15 - MOVB R15, (R9)(R14*1) - ADDQ $0x01, R14 - -copy_1_word: - TESTQ $0x00000002, CX - JZ copy_1_dword - MOVW (R10)(R14*1), R15 - MOVW R15, (R9)(R14*1) - ADDQ $0x02, R14 - -copy_1_dword: - TESTQ $0x00000004, CX - JZ copy_1_qword - MOVL (R10)(R14*1), R15 - MOVL R15, (R9)(R14*1) 
- ADDQ $0x04, R14 - -copy_1_qword: - TESTQ $0x00000008, CX - JZ copy_1_add - MOVQ (R10)(R14*1), R15 - MOVQ R15, (R9)(R14*1) - ADDQ $0x08, R14 - -copy_1_add: + CMPQ CX, $0x03 + JE copy_1_move_3 + JB copy_1_move_1or2 + CMPQ CX, $0x08 + JB copy_1_move_4through7 + JMP copy_1_move_8through16 + +copy_1_move_1or2: + MOVB (R10), R14 + MOVB -1(R10)(CX*1), R15 + MOVB R14, (R9) + MOVB R15, -1(R9)(CX*1) + ADDQ CX, R10 + ADDQ CX, R9 + JMP copy_1_end + +copy_1_move_3: + MOVW (R10), R14 + MOVB 2(R10), R15 + MOVW R14, (R9) + MOVB R15, 2(R9) + ADDQ CX, R10 ADDQ CX, R9 + JMP copy_1_end + +copy_1_move_4through7: + MOVL (R10), R14 + MOVL -4(R10)(CX*1), R15 + MOVL R14, (R9) + MOVL R15, -4(R9)(CX*1) ADDQ CX, R10 + ADDQ CX, R9 + JMP copy_1_end + +copy_1_move_8through16: + MOVQ (R10), R14 + MOVQ -8(R10)(CX*1), R15 + MOVQ R14, (R9) + MOVQ R15, -8(R9)(CX*1) + ADDQ CX, R10 + ADDQ CX, R9 copy_1_end: ADDQ CX, R11 @@ -3683,37 +3783,37 @@ copy_4_loop: JMP copy_4_end copy_4_small: - XORQ CX, CX - TESTQ $0x00000001, R13 - JZ copy_4_word - MOVB (R14)(CX*1), R12 - MOVB R12, (R9)(CX*1) - ADDQ $0x01, CX - -copy_4_word: - TESTQ $0x00000002, R13 - JZ copy_4_dword - MOVW (R14)(CX*1), R12 - MOVW R12, (R9)(CX*1) - ADDQ $0x02, CX - -copy_4_dword: - TESTQ $0x00000004, R13 - JZ copy_4_qword - MOVL (R14)(CX*1), R12 - MOVL R12, (R9)(CX*1) - ADDQ $0x04, CX - -copy_4_qword: - TESTQ $0x00000008, R13 - JZ copy_4_add - MOVQ (R14)(CX*1), R12 - MOVQ R12, (R9)(CX*1) - ADDQ $0x08, CX - -copy_4_add: + CMPQ R13, $0x03 + JE copy_4_move_3 + CMPQ R13, $0x08 + JB copy_4_move_4through7 + JMP copy_4_move_8through16 + +copy_4_move_3: + MOVW (R14), CX + MOVB 2(R14), R12 + MOVW CX, (R9) + MOVB R12, 2(R9) + ADDQ R13, R14 ADDQ R13, R9 + JMP copy_4_end + +copy_4_move_4through7: + MOVL (R14), CX + MOVL -4(R14)(R13*1), R12 + MOVL CX, (R9) + MOVL R12, -4(R9)(R13*1) ADDQ R13, R14 + ADDQ R13, R9 + JMP copy_4_end + +copy_4_move_8through16: + MOVQ (R14), CX + MOVQ -8(R14)(R13*1), R12 + MOVQ CX, (R9) + MOVQ R12, -8(R9)(R13*1) + ADDQ R13, R14 
+ ADDQ R13, R9 copy_4_end: ADDQ R13, R11 @@ -3739,37 +3839,47 @@ copy_5_loop: JMP copy_5_end copy_5_small: - XORQ R15, R15 - TESTQ $0x00000001, CX - JZ copy_5_word - MOVB (R14)(R15*1), BP - MOVB BP, (R9)(R15*1) - ADDQ $0x01, R15 - -copy_5_word: - TESTQ $0x00000002, CX - JZ copy_5_dword - MOVW (R14)(R15*1), BP - MOVW BP, (R9)(R15*1) - ADDQ $0x02, R15 - -copy_5_dword: - TESTQ $0x00000004, CX - JZ copy_5_qword - MOVL (R14)(R15*1), BP - MOVL BP, (R9)(R15*1) - ADDQ $0x04, R15 - -copy_5_qword: - TESTQ $0x00000008, CX - JZ copy_5_add - MOVQ (R14)(R15*1), BP - MOVQ BP, (R9)(R15*1) - ADDQ $0x08, R15 - -copy_5_add: + CMPQ CX, $0x03 + JE copy_5_move_3 + JB copy_5_move_1or2 + CMPQ CX, $0x08 + JB copy_5_move_4through7 + JMP copy_5_move_8through16 + +copy_5_move_1or2: + MOVB (R14), R15 + MOVB -1(R14)(CX*1), BP + MOVB R15, (R9) + MOVB BP, -1(R9)(CX*1) + ADDQ CX, R14 ADDQ CX, R9 + JMP copy_5_end + +copy_5_move_3: + MOVW (R14), R15 + MOVB 2(R14), BP + MOVW R15, (R9) + MOVB BP, 2(R9) + ADDQ CX, R14 + ADDQ CX, R9 + JMP copy_5_end + +copy_5_move_4through7: + MOVL (R14), R15 + MOVL -4(R14)(CX*1), BP + MOVL R15, (R9) + MOVL BP, -4(R9)(CX*1) + ADDQ CX, R14 + ADDQ CX, R9 + JMP copy_5_end + +copy_5_move_8through16: + MOVQ (R14), R15 + MOVQ -8(R14)(CX*1), BP + MOVQ R15, (R9) + MOVQ BP, -8(R9)(CX*1) ADDQ CX, R14 + ADDQ CX, R9 copy_5_end: ADDQ CX, R11 @@ -3806,37 +3916,47 @@ copy_2_loop: JMP copy_2_end copy_2_small: - XORQ R12, R12 - TESTQ $0x00000001, R13 - JZ copy_2_word - MOVB (CX)(R12*1), R14 - MOVB R14, (R9)(R12*1) - ADDQ $0x01, R12 - -copy_2_word: - TESTQ $0x00000002, R13 - JZ copy_2_dword - MOVW (CX)(R12*1), R14 - MOVW R14, (R9)(R12*1) - ADDQ $0x02, R12 - -copy_2_dword: - TESTQ $0x00000004, R13 - JZ copy_2_qword - MOVL (CX)(R12*1), R14 - MOVL R14, (R9)(R12*1) - ADDQ $0x04, R12 - -copy_2_qword: - TESTQ $0x00000008, R13 - JZ copy_2_add - MOVQ (CX)(R12*1), R14 - MOVQ R14, (R9)(R12*1) - ADDQ $0x08, R12 - -copy_2_add: + CMPQ R13, $0x03 + JE copy_2_move_3 + JB copy_2_move_1or2 + CMPQ R13, 
$0x08 + JB copy_2_move_4through7 + JMP copy_2_move_8through16 + +copy_2_move_1or2: + MOVB (CX), R12 + MOVB -1(CX)(R13*1), R14 + MOVB R12, (R9) + MOVB R14, -1(R9)(R13*1) + ADDQ R13, CX ADDQ R13, R9 + JMP copy_2_end + +copy_2_move_3: + MOVW (CX), R12 + MOVB 2(CX), R14 + MOVW R12, (R9) + MOVB R14, 2(R9) + ADDQ R13, CX + ADDQ R13, R9 + JMP copy_2_end + +copy_2_move_4through7: + MOVL (CX), R12 + MOVL -4(CX)(R13*1), R14 + MOVL R12, (R9) + MOVL R14, -4(R9)(R13*1) ADDQ R13, CX + ADDQ R13, R9 + JMP copy_2_end + +copy_2_move_8through16: + MOVQ (CX), R12 + MOVQ -8(CX)(R13*1), R14 + MOVQ R12, (R9) + MOVQ R14, -8(R9)(R13*1) + ADDQ R13, CX + ADDQ R13, R9 copy_2_end: JMP handle_loop From 5e0adf7fc7b002236048a02d9b76409f029b427f Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Mon, 4 Jul 2022 10:23:30 +0200 Subject: [PATCH 2/2] Remove unneeded zero check. --- zstd/_generate/gen.go | 19 +++++++++++------ zstd/seqdec_amd64.s | 48 ++++++++++++++++--------------------------- 2 files changed, 31 insertions(+), 36 deletions(-) diff --git a/zstd/_generate/gen.go b/zstd/_generate/gen.go index b1fa6c124a..70e1b3a1d4 100644 --- a/zstd/_generate/gen.go +++ b/zstd/_generate/gen.go @@ -1194,14 +1194,14 @@ func (e executeSimple) executeSingleTriple(c *executeSingleTripleContext, handle } SUBQ(v, ptr) // ptr := &hist[len(hist) - v] CMPQ(ml, v) - JGE(LabelRef("copy_all_from_history")) + JG(LabelRef("copy_all_from_history")) /* if ml <= v { copy(out[outPosition:], hist[start:start+seq.ml]) t += seq.ml continue } */ - // We know ml will be 4 + // We know ml will be at least 3, since we didn't copy anything yet. 
e.copyMemoryPrecise("4", ptr, c.outBase, ml, 3) ADDQ(ml, c.outPosition) // Note: for the current go tests this branch is taken in 99.53% cases, @@ -1221,15 +1221,13 @@ func (e executeSimple) executeSingleTriple(c *executeSingleTripleContext, handle e.copyMemoryPrecise("5", ptr, c.outBase, v, 1) ADDQ(v, c.outPosition) SUBQ(v, ml) - // fallback to the next block + // ml cannot be 0, since we only jump here if ml > v. + // Copy rest from current block. } Comment("Copy match from the current buffer") Label("copy_match") { - TESTQ(ml, ml) - JZ(LabelRef("handle_loop")) - src := GP64() MOVQ(c.outBase, src) SUBQ(mo, src) // src = &s.out[t - mo] @@ -1313,6 +1311,15 @@ func (e executeSimple) copyMemoryND(suffix string, src, dst, length reg.GPVirtua // without overreading. It adds length to src and dst, // preserving length. func (e executeSimple) copyMemoryPrecise(suffix string, src, dst, length reg.GPVirtual, minLength int) { + assert(func(ok LabelRef) { + // if length >= minLength, ok + CMPQ(length, U8(minLength)) + JAE(ok) + }) + if minLength == 0 { + TESTQ(length, length) + JZ(LabelRef("copy_" + suffix + "_end")) + } n := GP64() MOVQ(length, n) SUBQ(U8(16), n) diff --git a/zstd/seqdec_amd64.s b/zstd/seqdec_amd64.s index 1a86f13577..147e116180 100644 --- a/zstd/seqdec_amd64.s +++ b/zstd/seqdec_amd64.s @@ -1187,7 +1187,7 @@ check_offset: MOVQ R9, R14 SUBQ R11, R14 CMPQ R13, R11 - JGE copy_all_from_history + JG copy_all_from_history MOVQ R13, R11 SUBQ $0x10, R11 JB copy_4_small @@ -1313,10 +1313,8 @@ copy_5_end: // Copy match from the current buffer copy_match: - TESTQ R13, R13 - JZ handle_loop - MOVQ BX, R11 - SUBQ R12, R11 + MOVQ BX, R11 + SUBQ R12, R11 // ml <= mo CMPQ R13, R12 @@ -1496,7 +1494,7 @@ check_offset: MOVQ R9, R14 SUBQ R11, R14 CMPQ R13, R11 - JGE copy_all_from_history + JG copy_all_from_history MOVQ R13, R11 SUBQ $0x10, R11 JB copy_4_small @@ -1622,10 +1620,8 @@ copy_5_end: // Copy match from the current buffer copy_match: - TESTQ R13, R13 - JZ handle_loop -
MOVQ BX, R11 - SUBQ R12, R11 + MOVQ BX, R11 + SUBQ R12, R11 // ml <= mo CMPQ R13, R12 @@ -2057,7 +2053,7 @@ check_offset: MOVQ 48(SP), R14 SUBQ AX, R14 CMPQ R13, AX - JGE copy_all_from_history + JG copy_all_from_history MOVQ R13, AX SUBQ $0x10, AX JB copy_4_small @@ -2180,10 +2176,8 @@ copy_5_end: // Copy match from the current buffer copy_match: - TESTQ R13, R13 - JZ handle_loop - MOVQ R10, AX - SUBQ CX, AX + MOVQ R10, AX + SUBQ CX, AX // ml <= mo CMPQ R13, CX @@ -2566,7 +2560,7 @@ check_offset: MOVQ 48(SP), R14 SUBQ CX, R14 CMPQ R13, CX - JGE copy_all_from_history + JG copy_all_from_history MOVQ R13, CX SUBQ $0x10, CX JB copy_4_small @@ -2689,10 +2683,8 @@ copy_5_end: // Copy match from the current buffer copy_match: - TESTQ R13, R13 - JZ handle_loop - MOVQ R9, CX - SUBQ R12, CX + MOVQ R9, CX + SUBQ R12, CX // ml <= mo CMPQ R13, R12 @@ -3153,7 +3145,7 @@ check_offset: MOVQ 48(SP), R14 SUBQ AX, R14 CMPQ R13, AX - JGE copy_all_from_history + JG copy_all_from_history MOVQ R13, AX SUBQ $0x10, AX JB copy_4_small @@ -3276,10 +3268,8 @@ copy_5_end: // Copy match from the current buffer copy_match: - TESTQ R13, R13 - JZ handle_loop - MOVQ R10, AX - SUBQ CX, AX + MOVQ R10, AX + SUBQ CX, AX // ml <= mo CMPQ R13, CX @@ -3764,7 +3754,7 @@ check_offset: MOVQ 48(SP), R14 SUBQ CX, R14 CMPQ R13, CX - JGE copy_all_from_history + JG copy_all_from_history MOVQ R13, CX SUBQ $0x10, CX JB copy_4_small @@ -3887,10 +3877,8 @@ copy_5_end: // Copy match from the current buffer copy_match: - TESTQ R13, R13 - JZ handle_loop - MOVQ R9, CX - SUBQ R12, CX + MOVQ R9, CX + SUBQ R12, CX // ml <= mo CMPQ R13, R12