From 2432b6f48b3e777fd8bbed54a0d0b776bd08c599 Mon Sep 17 00:00:00 2001 From: Annabelle Huo Date: Fri, 6 Sep 2024 16:15:55 -0400 Subject: [PATCH] Optimize arraycopy with constant copy size Optimize arraycopy sequence if copy size is constant and less than or equal to 64 bytes. This feature can be disabled by env variable TR_DisableArrayCopyInlineSmallSizeConstantCopySize. Create generateMemoryCopyInstructions that loads and stores memory based on specified register size. This method is used by multiple arraycopy inline methods to reduce the repetition of the same code. Update generateArrayElement{Load|Store} to include vector registers and instructions. Signed-off-by: Annabelle Huo --- compiler/x/codegen/OMRTreeEvaluator.cpp | 551 ++++++++++++++++-------- 1 file changed, 375 insertions(+), 176 deletions(-) diff --git a/compiler/x/codegen/OMRTreeEvaluator.cpp b/compiler/x/codegen/OMRTreeEvaluator.cpp index fdbffde3892..ba1cc239572 100644 --- a/compiler/x/codegen/OMRTreeEvaluator.cpp +++ b/compiler/x/codegen/OMRTreeEvaluator.cpp @@ -1704,6 +1704,133 @@ static void arrayCopy64BitPrimitiveOnIA32(TR::Node* node, TR::Register* dstReg, cg->stopUsingRegister(scratch); } + +/** \brief +* Generate instructions to copy regSize to 2*regSize bytes: load the first and last regSize bytes, then store both +* +* \param node +* The tree node +* +* \param dstReg +* The destination address register +* +* \param srcReg +* The source address register +* +* \param sizeReg +* The register holding the total size to be copied, in bytes; expected to be between regSize and 2*regSize +* +* \param tmpReg1 +* The temporary register that must support the register size (regSize); receives the last regSize bytes +* +* \param tmpReg2 +* The temporary register that must support the register size (regSize); receives the first regSize bytes +* +* \param regSize +* The size of the register to use, in bytes +* +* \param cg +* The code generator +*/ +static void generateMemoryCopyInstructions(TR::Node* node, + TR::Register* dstReg, + TR::Register* srcReg, + TR::Register* sizeReg, + TR::Register* tmpReg1, + TR::Register* tmpReg2, + uint8_t regSize, + TR::CodeGenerator* cg) + { + 
TR::InstOpCode::Mnemonic loadOpCode; + TR::InstOpCode::Mnemonic storeOpCode; + + bool supported = false; // set once the (regSize, register kind) pair maps to a load/store opcode + + switch (regSize) + { + case 1: + if ((tmpReg1->getKind() == TR_GPR) && (tmpReg2->getKind() == TR_GPR)) + { + loadOpCode = TR::InstOpCode::L1RegMem; + storeOpCode = TR::InstOpCode::S1MemReg; + supported = true; + } + break; + case 2: + if ((tmpReg1->getKind() == TR_GPR) && (tmpReg2->getKind() == TR_GPR)) + { + loadOpCode = TR::InstOpCode::L2RegMem; + storeOpCode = TR::InstOpCode::S2MemReg; + supported = true; + } + break; + case 4: + if ((tmpReg1->getKind() == TR_GPR) && (tmpReg2->getKind() == TR_GPR)) + { + loadOpCode = TR::InstOpCode::L4RegMem; + storeOpCode = TR::InstOpCode::S4MemReg; + supported = true; + } + else if ((tmpReg1->getKind() == TR_FPR) && (tmpReg2->getKind() == TR_FPR)) + { + loadOpCode = TR::InstOpCode::MOVDRegMem; + storeOpCode = TR::InstOpCode::MOVDMemReg; + supported = true; + } + break; + case 8: + if ((tmpReg1->getKind() == TR_GPR) && (tmpReg2->getKind() == TR_GPR)) + { + loadOpCode = TR::InstOpCode::L8RegMem; + storeOpCode = TR::InstOpCode::S8MemReg; + supported = true; + } + else if ((tmpReg1->getKind() == TR_FPR) && (tmpReg2->getKind() == TR_FPR)) + { + loadOpCode = TR::InstOpCode::MOVQRegMem; + storeOpCode = TR::InstOpCode::MOVQMemReg; + supported = true; + } + break; + case 16: + if (((tmpReg1->getKind() == TR_FPR) && (tmpReg2->getKind() == TR_FPR)) || + ((tmpReg1->getKind() == TR_VRF) && (tmpReg2->getKind() == TR_VRF))) + { + loadOpCode = TR::InstOpCode::MOVDQURegMem; + storeOpCode = TR::InstOpCode::MOVDQUMemReg; + supported = true; + } + break; + case 32: + if (((tmpReg1->getKind() == TR_FPR) && (tmpReg2->getKind() == TR_FPR)) || + ((tmpReg1->getKind() == TR_VRF) && (tmpReg2->getKind() == TR_VRF))) + { + loadOpCode = TR::InstOpCode::VMOVDQUYmmMem; + storeOpCode = TR::InstOpCode::VMOVDQUMemYmm; + supported = true; + } + break; + case 64: + if ((tmpReg1->getKind() == TR_VRF) && (tmpReg2->getKind() == TR_VRF)) + { + loadOpCode = 
TR::InstOpCode::VMOVDQUZmmMem; + storeOpCode = TR::InstOpCode::VMOVDQUMemZmm; + supported = true; + } + break; + default: + break; + } + + TR_ASSERT_FATAL(supported, "%s: Unsupported tmpReg1 %d tmpReg2 %d regSize %u", __FUNCTION__, tmpReg1->getKind(), tmpReg2->getKind(), regSize); + + int32_t index = 0 - regSize; // negative offset used with sizeReg to address the last regSize bytes + generateRegMemInstruction(loadOpCode, node, tmpReg1, generateX86MemoryReference(srcReg, sizeReg, 0, index, cg), cg); + generateRegMemInstruction(loadOpCode, node, tmpReg2, generateX86MemoryReference(srcReg, 0, cg), cg); + generateMemRegInstruction(storeOpCode, node, generateX86MemoryReference(dstReg, sizeReg, 0, index, cg), tmpReg1, cg); + generateMemRegInstruction(storeOpCode, node, generateX86MemoryReference(dstReg, 0, cg), tmpReg2, cg); + } + void OMR::X86::TreeEvaluator::arrayCopy64BitPrimitiveInlineSmallSizeWithoutREPMOVSImplRoot16(TR::Node *node, TR::Register *dstReg, TR::Register *srcReg, @@ -1780,10 +1907,7 @@ void OMR::X86::TreeEvaluator::arrayCopy64BitPrimitiveInlineSmallSizeWithoutREPMO generateLabelInstruction(TR::InstOpCode::JE4, node, mainEndLabel, cg); // 8 or 16 Bytes - generateRegMemInstruction(TR::InstOpCode::L8RegMem, node, tmpReg1, generateX86MemoryReference(srcReg, sizeReg, 0, -8, cg), cg); - generateRegMemInstruction(TR::InstOpCode::L8RegMem, node, tmpReg2, generateX86MemoryReference(srcReg, 0, cg), cg); - generateMemRegInstruction(TR::InstOpCode::S8MemReg, node, generateX86MemoryReference(dstReg, sizeReg, 0, -8, cg), tmpReg1, cg); - generateMemRegInstruction(TR::InstOpCode::S8MemReg, node, generateX86MemoryReference(dstReg, 0, cg), tmpReg2, cg); + generateMemoryCopyInstructions(node, dstReg, srcReg, sizeReg, tmpReg1, tmpReg2, 8, cg); generateLabelInstruction(TR::InstOpCode::JMP4, node, mainEndLabel, cg); // --------------------------------- @@ -1793,10 +1917,7 @@ void OMR::X86::TreeEvaluator::arrayCopy64BitPrimitiveInlineSmallSizeWithoutREPMO generateLabelInstruction(TR::InstOpCode::JA4, node, copyLabel1, cg); // 24 or 32 Bytes - 
generateRegMemInstruction(TR::InstOpCode::MOVDQURegMem, node, tmpXmmYmmReg1, generateX86MemoryReference(srcReg, sizeReg, 0, -16, cg), cg); - generateRegMemInstruction(TR::InstOpCode::MOVDQURegMem, node, tmpXmmYmmReg2, generateX86MemoryReference(srcReg, 0, cg), cg); - generateMemRegInstruction(TR::InstOpCode::MOVDQUMemReg, node, generateX86MemoryReference(dstReg, sizeReg, 0, -16, cg), tmpXmmYmmReg1, cg); - generateMemRegInstruction(TR::InstOpCode::MOVDQUMemReg, node, generateX86MemoryReference(dstReg, 0, cg), tmpXmmYmmReg2, cg); + generateMemoryCopyInstructions(node, dstReg, srcReg, sizeReg, tmpXmmYmmReg1, tmpXmmYmmReg2, 16, cg); generateLabelInstruction(TR::InstOpCode::JMP4, node, mainEndLabel, cg); if (repMovsThresholdBytes == 32) @@ -1809,10 +1930,7 @@ void OMR::X86::TreeEvaluator::arrayCopy64BitPrimitiveInlineSmallSizeWithoutREPMO generateLabelInstruction(TR::InstOpCode::JA4, node, copyLabel2, cg); // 40-64 Bytes - generateRegMemInstruction(TR::InstOpCode::VMOVDQUYmmMem, node, tmpXmmYmmReg1, generateX86MemoryReference(srcReg, sizeReg, 0, -32, cg), cg); - generateRegMemInstruction(TR::InstOpCode::VMOVDQUYmmMem, node, tmpXmmYmmReg2, generateX86MemoryReference(srcReg, 0, cg), cg); - generateMemRegInstruction(TR::InstOpCode::VMOVDQUMemYmm, node, generateX86MemoryReference(dstReg, sizeReg, 0, -32, cg), tmpXmmYmmReg1, cg); - generateMemRegInstruction(TR::InstOpCode::VMOVDQUMemYmm, node, generateX86MemoryReference(dstReg, 0, cg), tmpXmmYmmReg2, cg); + generateMemoryCopyInstructions(node, dstReg, srcReg, sizeReg, tmpXmmYmmReg1, tmpXmmYmmReg2, 32, cg); generateLabelInstruction(TR::InstOpCode::JMP4, node, mainEndLabel, cg); if (repMovsThresholdBytes == 64) @@ -1824,10 +1942,7 @@ void OMR::X86::TreeEvaluator::arrayCopy64BitPrimitiveInlineSmallSizeWithoutREPMO generateLabelInstruction(TR::InstOpCode::JA4, node, repMovsLabel, cg); // 72-128 Bytes - generateRegMemInstruction(TR::InstOpCode::VMOVDQUZmmMem, node, tmpXmmYmmReg1, generateX86MemoryReference(srcReg, sizeReg, 0, 
-64, cg), cg); - generateRegMemInstruction(TR::InstOpCode::VMOVDQUZmmMem, node, tmpXmmYmmReg2, generateX86MemoryReference(srcReg, 0, cg), cg); - generateMemRegInstruction(TR::InstOpCode::VMOVDQUMemZmm, node, generateX86MemoryReference(dstReg, sizeReg, 0, -64, cg), tmpXmmYmmReg1, cg); - generateMemRegInstruction(TR::InstOpCode::VMOVDQUMemZmm, node, generateX86MemoryReference(dstReg, 0, cg), tmpXmmYmmReg2, cg); + generateMemoryCopyInstructions(node, dstReg, srcReg, sizeReg, tmpXmmYmmReg1, tmpXmmYmmReg2, 64, cg); generateLabelInstruction(TR::InstOpCode::JMP4, node, mainEndLabel, cg); } @@ -1917,20 +2032,14 @@ void OMR::X86::TreeEvaluator::arrayCopy32BitPrimitiveInlineSmallSizeWithoutREPMO generateLabelInstruction(TR::InstOpCode::JE4, node, mainEndLabel, cg); // 4 or 8 Bytes - generateRegMemInstruction(TR::InstOpCode::L4RegMem, node, tmpReg1, generateX86MemoryReference(srcReg, sizeReg, 0, -4, cg), cg); - generateRegMemInstruction(TR::InstOpCode::L4RegMem, node, tmpReg2, generateX86MemoryReference(srcReg, 0, cg), cg); - generateMemRegInstruction(TR::InstOpCode::S4MemReg, node, generateX86MemoryReference(dstReg, sizeReg, 0, -4, cg), tmpReg1, cg); - generateMemRegInstruction(TR::InstOpCode::S4MemReg, node, generateX86MemoryReference(dstReg, 0, cg), tmpReg2, cg); + generateMemoryCopyInstructions(node, dstReg, srcReg, sizeReg, tmpReg1, tmpReg2, 4, cg); generateLabelInstruction(TR::InstOpCode::JMP4, node, mainEndLabel, cg); // --------------------------------- generateLabelInstruction(TR::InstOpCode::label, node, copy12RMoreBytesLabel, cg); // 12 or 16 Bytes - generateRegMemInstruction(TR::InstOpCode::L8RegMem, node, tmpReg1, generateX86MemoryReference(srcReg, sizeReg, 0, -8, cg), cg); - generateRegMemInstruction(TR::InstOpCode::L8RegMem, node, tmpReg2, generateX86MemoryReference(srcReg, 0, cg), cg); - generateMemRegInstruction(TR::InstOpCode::S8MemReg, node, generateX86MemoryReference(dstReg, sizeReg, 0, -8, cg), tmpReg1, cg); - 
generateMemRegInstruction(TR::InstOpCode::S8MemReg, node, generateX86MemoryReference(dstReg, 0, cg), tmpReg2, cg); + generateMemoryCopyInstructions(node, dstReg, srcReg, sizeReg, tmpReg1, tmpReg2, 8, cg); generateLabelInstruction(TR::InstOpCode::JMP4, node, mainEndLabel, cg); // --------------------------------- @@ -1940,10 +2049,7 @@ void OMR::X86::TreeEvaluator::arrayCopy32BitPrimitiveInlineSmallSizeWithoutREPMO generateLabelInstruction(TR::InstOpCode::JA4, node, copyLabel1, cg); // 20-32 Bytes - generateRegMemInstruction(TR::InstOpCode::MOVDQURegMem, node, tmpXmmYmmReg1, generateX86MemoryReference(srcReg, sizeReg, 0, -16, cg), cg); - generateRegMemInstruction(TR::InstOpCode::MOVDQURegMem, node, tmpXmmYmmReg2, generateX86MemoryReference(srcReg, 0, cg), cg); - generateMemRegInstruction(TR::InstOpCode::MOVDQUMemReg, node, generateX86MemoryReference(dstReg, sizeReg, 0, -16, cg), tmpXmmYmmReg1, cg); - generateMemRegInstruction(TR::InstOpCode::MOVDQUMemReg, node, generateX86MemoryReference(dstReg, 0, cg), tmpXmmYmmReg2, cg); + generateMemoryCopyInstructions(node, dstReg, srcReg, sizeReg, tmpXmmYmmReg1, tmpXmmYmmReg2, 16, cg); generateLabelInstruction(TR::InstOpCode::JMP4, node, mainEndLabel, cg); if (repMovsThresholdBytes == 32) @@ -1956,10 +2062,7 @@ void OMR::X86::TreeEvaluator::arrayCopy32BitPrimitiveInlineSmallSizeWithoutREPMO generateLabelInstruction(TR::InstOpCode::JA4, node, copyLabel2, cg); // 36-64 Bytes - generateRegMemInstruction(TR::InstOpCode::VMOVDQUYmmMem, node, tmpXmmYmmReg1, generateX86MemoryReference(srcReg, sizeReg, 0, -32, cg), cg); - generateRegMemInstruction(TR::InstOpCode::VMOVDQUYmmMem, node, tmpXmmYmmReg2, generateX86MemoryReference(srcReg, 0, cg), cg); - generateMemRegInstruction(TR::InstOpCode::VMOVDQUMemYmm, node, generateX86MemoryReference(dstReg, sizeReg, 0, -32, cg), tmpXmmYmmReg1, cg); - generateMemRegInstruction(TR::InstOpCode::VMOVDQUMemYmm, node, generateX86MemoryReference(dstReg, 0, cg), tmpXmmYmmReg2, cg); + 
generateMemoryCopyInstructions(node, dstReg, srcReg, sizeReg, tmpXmmYmmReg1, tmpXmmYmmReg2, 32, cg); generateLabelInstruction(TR::InstOpCode::JMP4, node, mainEndLabel, cg); if (repMovsThresholdBytes == 64) @@ -1971,10 +2074,7 @@ void OMR::X86::TreeEvaluator::arrayCopy32BitPrimitiveInlineSmallSizeWithoutREPMO generateLabelInstruction(TR::InstOpCode::JA4, node, repMovsLabel, cg); // 68-128 Bytes - generateRegMemInstruction(TR::InstOpCode::VMOVDQUZmmMem, node, tmpXmmYmmReg1, generateX86MemoryReference(srcReg, sizeReg, 0, -64, cg), cg); - generateRegMemInstruction(TR::InstOpCode::VMOVDQUZmmMem, node, tmpXmmYmmReg2, generateX86MemoryReference(srcReg, 0, cg), cg); - generateMemRegInstruction(TR::InstOpCode::VMOVDQUMemZmm, node, generateX86MemoryReference(dstReg, sizeReg, 0, -64, cg), tmpXmmYmmReg1, cg); - generateMemRegInstruction(TR::InstOpCode::VMOVDQUMemZmm, node, generateX86MemoryReference(dstReg, 0, cg), tmpXmmYmmReg2, cg); + generateMemoryCopyInstructions(node, dstReg, srcReg, sizeReg, tmpXmmYmmReg1, tmpXmmYmmReg2, 64, cg); generateLabelInstruction(TR::InstOpCode::JMP4, node, mainEndLabel, cg); } @@ -2156,20 +2256,14 @@ static void arrayCopy16BitPrimitiveInlineSmallSizeWithoutREPMOVSImplRoot16(TR::N generateLabelInstruction(TR::InstOpCode::JA4, node, copy10ORMoreBytesLabel, cg); // 4, 6, 8 Bytes - generateRegMemInstruction(TR::InstOpCode::L4RegMem, node, tmpReg1, generateX86MemoryReference(srcReg, sizeReg, 0, -4, cg), cg); - generateRegMemInstruction(TR::InstOpCode::L4RegMem, node, tmpReg2, generateX86MemoryReference(srcReg, 0, cg), cg); - generateMemRegInstruction(TR::InstOpCode::S4MemReg, node, generateX86MemoryReference(dstReg, sizeReg, 0, -4, cg), tmpReg1, cg); - generateMemRegInstruction(TR::InstOpCode::S4MemReg, node, generateX86MemoryReference(dstReg, 0, cg), tmpReg2, cg); + generateMemoryCopyInstructions(node, dstReg, srcReg, sizeReg, tmpReg1, tmpReg2, 4, cg); generateLabelInstruction(TR::InstOpCode::JMP4, node, mainEndLabel, cg); // 
--------------------------------- generateLabelInstruction(TR::InstOpCode::label, node, copy10ORMoreBytesLabel, cg); // 10-16 Bytes - generateRegMemInstruction(TR::InstOpCode::L8RegMem, node, tmpReg1, generateX86MemoryReference(srcReg, sizeReg, 0, -8, cg), cg); - generateRegMemInstruction(TR::InstOpCode::L8RegMem, node, tmpReg2, generateX86MemoryReference(srcReg, 0, cg), cg); - generateMemRegInstruction(TR::InstOpCode::S8MemReg, node, generateX86MemoryReference(dstReg, sizeReg, 0, -8, cg), tmpReg1, cg); - generateMemRegInstruction(TR::InstOpCode::S8MemReg, node, generateX86MemoryReference(dstReg, 0, cg), tmpReg2, cg); + generateMemoryCopyInstructions(node, dstReg, srcReg, sizeReg, tmpReg1, tmpReg2, 8, cg); generateLabelInstruction(TR::InstOpCode::JMP4, node, mainEndLabel, cg); // --------------------------------- @@ -2179,10 +2273,7 @@ static void arrayCopy16BitPrimitiveInlineSmallSizeWithoutREPMOVSImplRoot16(TR::N generateLabelInstruction(TR::InstOpCode::JA4, node, copyLabel, cg); // 18-32 Bytes - generateRegMemInstruction(TR::InstOpCode::MOVDQURegMem, node, tmpXmmYmmReg1, generateX86MemoryReference(srcReg, sizeReg, 0, -16, cg), cg); - generateRegMemInstruction(TR::InstOpCode::MOVDQURegMem, node, tmpXmmYmmReg2, generateX86MemoryReference(srcReg, 0, cg), cg); - generateMemRegInstruction(TR::InstOpCode::MOVDQUMemReg, node, generateX86MemoryReference(dstReg, sizeReg, 0, -16, cg), tmpXmmYmmReg1, cg); - generateMemRegInstruction(TR::InstOpCode::MOVDQUMemReg, node, generateX86MemoryReference(dstReg, 0, cg), tmpXmmYmmReg2, cg); + generateMemoryCopyInstructions(node, dstReg, srcReg, sizeReg, tmpXmmYmmReg1, tmpXmmYmmReg2, 16, cg); generateLabelInstruction(TR::InstOpCode::JMP4, node, mainEndLabel, cg); if (repMovsThresholdBytes == 32) @@ -2194,10 +2285,7 @@ static void arrayCopy16BitPrimitiveInlineSmallSizeWithoutREPMOVSImplRoot16(TR::N generateLabelInstruction(TR::InstOpCode::JA4, node, repMovsLabel, cg); // 34-64 Bytes - 
generateRegMemInstruction(TR::InstOpCode::VMOVDQUYmmMem, node, tmpXmmYmmReg1, generateX86MemoryReference(srcReg, sizeReg, 0, -32, cg), cg); - generateRegMemInstruction(TR::InstOpCode::VMOVDQUYmmMem, node, tmpXmmYmmReg2, generateX86MemoryReference(srcReg, 0, cg), cg); - generateMemRegInstruction(TR::InstOpCode::VMOVDQUMemYmm, node, generateX86MemoryReference(dstReg, sizeReg, 0, -32, cg), tmpXmmYmmReg1, cg); - generateMemRegInstruction(TR::InstOpCode::VMOVDQUMemYmm, node, generateX86MemoryReference(dstReg, 0, cg), tmpXmmYmmReg2, cg); + generateMemoryCopyInstructions(node, dstReg, srcReg, sizeReg, tmpXmmYmmReg1, tmpXmmYmmReg2, 32, cg); generateLabelInstruction(TR::InstOpCode::JMP4, node, mainEndLabel, cg); } @@ -2296,10 +2384,7 @@ static void arrayCopy8BitPrimitiveInlineSmallSizeWithoutREPMOVSImplRoot8(TR::Nod generateLabelInstruction(TR::InstOpCode::JE4, node, mainEndLabel, cg); // 1-2 Bytes - generateRegMemInstruction(TR::InstOpCode::L1RegMem, node, tmpReg1, generateX86MemoryReference(srcReg, sizeReg, 0, -1, cg), cg); - generateRegMemInstruction(TR::InstOpCode::L1RegMem, node, tmpReg2, generateX86MemoryReference(srcReg, 0, cg), cg); - generateMemRegInstruction(TR::InstOpCode::S1MemReg, node, generateX86MemoryReference(dstReg, sizeReg, 0, -1, cg), tmpReg1, cg); - generateMemRegInstruction(TR::InstOpCode::S1MemReg, node, generateX86MemoryReference(dstReg, 0, cg), tmpReg2, cg); + generateMemoryCopyInstructions(node, dstReg, srcReg, sizeReg, tmpReg1, tmpReg2, 1, cg); generateLabelInstruction(TR::InstOpCode::JMP4, node, mainEndLabel, cg); // --------------------------------- @@ -2308,20 +2393,14 @@ static void arrayCopy8BitPrimitiveInlineSmallSizeWithoutREPMOVSImplRoot8(TR::Nod generateLabelInstruction(TR::InstOpCode::JA4, node, copy5ORMoreBytesLabel, cg); // 3-4 Bytes - generateRegMemInstruction(TR::InstOpCode::L2RegMem, node, tmpReg1, generateX86MemoryReference(srcReg, sizeReg, 0, -2, cg), cg); - generateRegMemInstruction(TR::InstOpCode::L2RegMem, node, tmpReg2, 
generateX86MemoryReference(srcReg, 0, cg), cg); - generateMemRegInstruction(TR::InstOpCode::S2MemReg, node, generateX86MemoryReference(dstReg, sizeReg, 0, -2, cg), tmpReg1, cg); - generateMemRegInstruction(TR::InstOpCode::S2MemReg, node, generateX86MemoryReference(dstReg, 0, cg), tmpReg2, cg); + generateMemoryCopyInstructions(node, dstReg, srcReg, sizeReg, tmpReg1, tmpReg2, 2, cg); generateLabelInstruction(TR::InstOpCode::JMP4, node, mainEndLabel, cg); // --------------------------------- generateLabelInstruction(TR::InstOpCode::label, node, copy5ORMoreBytesLabel, cg); // 5-8 Bytes - generateRegMemInstruction(TR::InstOpCode::L4RegMem, node, tmpReg1, generateX86MemoryReference(srcReg, sizeReg, 0, -4, cg), cg); - generateRegMemInstruction(TR::InstOpCode::L4RegMem, node, tmpReg2, generateX86MemoryReference(srcReg, 0, cg), cg); - generateMemRegInstruction(TR::InstOpCode::S4MemReg, node, generateX86MemoryReference(dstReg, sizeReg, 0, -4, cg), tmpReg1, cg); - generateMemRegInstruction(TR::InstOpCode::S4MemReg, node, generateX86MemoryReference(dstReg, 0, cg), tmpReg2, cg); + generateMemoryCopyInstructions(node, dstReg, srcReg, sizeReg, tmpReg1, tmpReg2, 4, cg); generateLabelInstruction(TR::InstOpCode::JMP4, node, mainEndLabel, cg); // --------------------------------- @@ -2330,10 +2409,7 @@ static void arrayCopy8BitPrimitiveInlineSmallSizeWithoutREPMOVSImplRoot8(TR::Nod generateLabelInstruction(TR::InstOpCode::JA4, node, copy17ORMoreBytesLabel, cg); // 9-16 Bytes - generateRegMemInstruction(TR::InstOpCode::L8RegMem, node, tmpReg1, generateX86MemoryReference(srcReg, sizeReg, 0, -8, cg), cg); - generateRegMemInstruction(TR::InstOpCode::L8RegMem, node, tmpReg2, generateX86MemoryReference(srcReg, 0, cg), cg); - generateMemRegInstruction(TR::InstOpCode::S8MemReg, node, generateX86MemoryReference(dstReg, sizeReg, 0, -8, cg), tmpReg1, cg); - generateMemRegInstruction(TR::InstOpCode::S8MemReg, node, generateX86MemoryReference(dstReg, 0, cg), tmpReg2, cg); + 
generateMemoryCopyInstructions(node, dstReg, srcReg, sizeReg, tmpReg1, tmpReg2, 8, cg); generateLabelInstruction(TR::InstOpCode::JMP4, node, mainEndLabel, cg); // --------------------------------- @@ -2343,10 +2419,7 @@ static void arrayCopy8BitPrimitiveInlineSmallSizeWithoutREPMOVSImplRoot8(TR::Nod generateLabelInstruction(TR::InstOpCode::JA4, node, copyLabel, cg); // 17-32 Bytes - generateRegMemInstruction(TR::InstOpCode::MOVDQURegMem, node, tmpXmmYmmReg1, generateX86MemoryReference(srcReg, sizeReg, 0, -16, cg), cg); - generateRegMemInstruction(TR::InstOpCode::MOVDQURegMem, node, tmpXmmYmmReg2, generateX86MemoryReference(srcReg, 0, cg), cg); - generateMemRegInstruction(TR::InstOpCode::MOVDQUMemReg, node, generateX86MemoryReference(dstReg, sizeReg, 0, -16, cg), tmpXmmYmmReg1, cg); - generateMemRegInstruction(TR::InstOpCode::MOVDQUMemReg, node, generateX86MemoryReference(dstReg, 0, cg), tmpXmmYmmReg2, cg); + generateMemoryCopyInstructions(node, dstReg, srcReg, sizeReg, tmpXmmYmmReg1, tmpXmmYmmReg2, 16, cg); generateLabelInstruction(TR::InstOpCode::JMP4, node, mainEndLabel, cg); if (repMovsThresholdBytes == 32) @@ -2358,10 +2431,7 @@ static void arrayCopy8BitPrimitiveInlineSmallSizeWithoutREPMOVSImplRoot8(TR::Nod generateLabelInstruction(TR::InstOpCode::JA4, node, repMovsLabel, cg); // 33-64 Bytes - generateRegMemInstruction(TR::InstOpCode::VMOVDQUYmmMem, node, tmpXmmYmmReg1, generateX86MemoryReference(srcReg, sizeReg, 0, -32, cg), cg); - generateRegMemInstruction(TR::InstOpCode::VMOVDQUYmmMem, node, tmpXmmYmmReg2, generateX86MemoryReference(srcReg, 0, cg), cg); - generateMemRegInstruction(TR::InstOpCode::VMOVDQUMemYmm, node, generateX86MemoryReference(dstReg, sizeReg, 0, -32, cg), tmpXmmYmmReg1, cg); - generateMemRegInstruction(TR::InstOpCode::VMOVDQUMemYmm, node, generateX86MemoryReference(dstReg, 0, cg), tmpXmmYmmReg2, cg); + generateMemoryCopyInstructions(node, dstReg, srcReg, sizeReg, tmpXmmYmmReg1, tmpXmmYmmReg2, 32, cg); 
generateLabelInstruction(TR::InstOpCode::JMP4, node, mainEndLabel, cg); } @@ -2664,13 +2734,15 @@ static void arrayCopyPrimitiveInlineSmallSizeWithoutREPMOVS(TR::Node* node, static bool enablePrimitiveArrayCopyInlineSmallSizeWithoutREPMOVS(uint8_t elementSize, TR::CodeGenerator* cg, int32_t& threshold) { + if (!cg->comp()->target().cpu.supportsAVX() || !cg->comp()->target().is64Bit()) + return false; + static bool disable8BitPrimitiveArrayCopyInlineSmallSizeWithoutREPMOVS = feGetEnv("TR_Disable8BitPrimitiveArrayCopyInlineSmallSizeWithoutREPMOVS") != NULL; static bool disable16BitPrimitiveArrayCopyInlineSmallSizeWithoutREPMOVS = feGetEnv("TR_Disable16BitPrimitiveArrayCopyInlineSmallSizeWithoutREPMOVS") != NULL; static bool disable32BitPrimitiveArrayCopyInlineSmallSizeWithoutREPMOVS = feGetEnv("TR_Disable32BitPrimitiveArrayCopyInlineSmallSizeWithoutREPMOVS") != NULL; static bool disable64BitPrimitiveArrayCopyInlineSmallSizeWithoutREPMOVS = feGetEnv("TR_Disable64BitPrimitiveArrayCopyInlineSmallSizeWithoutREPMOVS") != NULL; bool disableEnhancement = false; - bool result = false; threshold = 32; @@ -2726,100 +2798,7 @@ static bool enablePrimitiveArrayCopyInlineSmallSizeWithoutREPMOVS(uint8_t elemen break; } - result = (!disableEnhancement && - cg->comp()->target().cpu.supportsAVX() && - cg->comp()->target().is64Bit()) ? true : false; - - return result; - } - -/** \brief -* Generate instructions to do array copy. 
-* -* \param node -* The tree node -* -* \param elementSize -* The size of an element, in bytes -* -* \param dstReg -* The destination array address register -* -* \param srcReg -* The source array address register -* -* \param sizeReg -* The register holding the total size of elements to be copied, in bytes -* -* \param cg -* The code generator -*/ -static void arrayCopyDefault(TR::Node* node, uint8_t elementSize, TR::Register* dstReg, TR::Register* srcReg, TR::Register* sizeReg, TR::CodeGenerator* cg) - { - int32_t repMovsThresholdBytes = 0; - if (enablePrimitiveArrayCopyInlineSmallSizeWithoutREPMOVS(elementSize, cg, repMovsThresholdBytes)) - { - arrayCopyPrimitiveInlineSmallSizeWithoutREPMOVS(node, dstReg, srcReg, sizeReg, cg, elementSize, repMovsThresholdBytes); - return; - } - - TR::RegisterDependencyConditions* dependencies = generateRegisterDependencyConditions((uint8_t)3, (uint8_t)3, cg); - dependencies->addPreCondition(srcReg, TR::RealRegister::esi, cg); - dependencies->addPreCondition(dstReg, TR::RealRegister::edi, cg); - dependencies->addPreCondition(sizeReg, TR::RealRegister::ecx, cg); - dependencies->addPostCondition(srcReg, TR::RealRegister::esi, cg); - dependencies->addPostCondition(dstReg, TR::RealRegister::edi, cg); - dependencies->addPostCondition(sizeReg, TR::RealRegister::ecx, cg); - - TR::InstOpCode::Mnemonic repmovs; - switch (elementSize) - { - case 8: - repmovs = TR::InstOpCode::REPMOVSQ; - break; - case 4: - repmovs = TR::InstOpCode::REPMOVSD; - break; - case 2: - repmovs = TR::InstOpCode::REPMOVSW; - break; - default: - repmovs = TR::InstOpCode::REPMOVSB; - break; - } - if (node->isForwardArrayCopy()) - { - generateRepMovsInstruction(repmovs, node, sizeReg, dependencies, cg); - } - else // decide direction during runtime - { - TR::LabelSymbol* mainBegLabel = generateLabelSymbol(cg); - TR::LabelSymbol* mainEndLabel = generateLabelSymbol(cg); - TR::LabelSymbol* backwardLabel = generateLabelSymbol(cg); - 
mainBegLabel->setStartInternalControlFlow(); - mainEndLabel->setEndInternalControlFlow(); - - generateLabelInstruction(TR::InstOpCode::label, node, mainBegLabel, cg); - - generateRegRegInstruction(TR::InstOpCode::SUBRegReg(), node, dstReg, srcReg, cg); // dst = dst - src - generateRegRegInstruction(TR::InstOpCode::CMPRegReg(), node, dstReg, sizeReg, cg); // cmp dst, size - generateRegMemInstruction(TR::InstOpCode::LEARegMem(), node, dstReg, generateX86MemoryReference(dstReg, srcReg, 0, cg), cg); // dst = dst + src - generateLabelInstruction(TR::InstOpCode::JB4, node, backwardLabel, cg); // jb, skip backward copy setup - generateRepMovsInstruction(repmovs, node, sizeReg, NULL, cg); - - { - TR_OutlinedInstructionsGenerator og(backwardLabel, node, cg); - generateRegMemInstruction(TR::InstOpCode::LEARegMem(), node, srcReg, generateX86MemoryReference(srcReg, sizeReg, 0, -(intptr_t)elementSize, cg), cg); - generateRegMemInstruction(TR::InstOpCode::LEARegMem(), node, dstReg, generateX86MemoryReference(dstReg, sizeReg, 0, -(intptr_t)elementSize, cg), cg); - generateInstruction(TR::InstOpCode::STD, node, cg); - generateRepMovsInstruction(repmovs, node, sizeReg, NULL, cg); - generateInstruction(TR::InstOpCode::CLD, node, cg); - generateLabelInstruction(TR::InstOpCode::JMP4, node, mainEndLabel, cg); - og.endOutlinedInstructionSequence(); - } - - generateLabelInstruction(TR::InstOpCode::label, node, mainEndLabel, dependencies, cg); - } + return disableEnhancement ? 
false : true; } /** \brief @@ -2847,6 +2826,24 @@ static void arrayCopyDefault(TR::Node* node, uint8_t elementSize, TR::Register* static void generateArrayElementStore(TR::Node* node, TR::Register* addressReg, int32_t index, TR::Register* valueReg, uint8_t size, TR::CodeGenerator* cg) { TR::InstOpCode::Mnemonic storeOpcode; - if (valueReg->getKind() == TR_FPR) + if (valueReg->getKind() == TR_VRF) + { + switch (size) + { + case 16: + storeOpcode = TR::InstOpCode::MOVDQUMemReg; + break; + case 32: + storeOpcode = TR::InstOpCode::VMOVDQUMemYmm; + break; + case 64: + storeOpcode = TR::InstOpCode::VMOVDQUMemZmm; + break; + default: + TR_ASSERT(0, "%s: Unsupported size: %u for TR_VRF registers\n", __FUNCTION__, size); + break; + } + } + else if (valueReg->getKind() == TR_FPR) { switch (size) @@ -2861,7 +2858,7 @@ static void generateArrayElementStore(TR::Node* node, TR::Register* addressReg, storeOpcode = TR::InstOpCode::MOVDQUMemReg; break; default: - TR_ASSERT(0, "Unsupported size in generateArrayElementStore, size: %d", size); + TR_ASSERT(0, "%s: Unsupported size: %u for TR_FPR registers\n", __FUNCTION__, size); break; } } @@ -2882,7 +2879,7 @@ static void generateArrayElementStore(TR::Node* node, TR::Register* addressReg, storeOpcode = TR::InstOpCode::S8MemReg; break; default: - TR_ASSERT(0, "Unsupported size in generateArrayElementStore, size: %d", size); + TR_ASSERT(0, "%s: Unsupported size: %u for TR_GPR registers\n", __FUNCTION__, size); break; } @@ -2918,7 +2915,25 @@ static void generateArrayElementStore(TR::Node* node, TR::Register* addressReg, static void generateArrayElementLoad(TR::Node* node, TR::Register* valueReg, uint8_t size, TR::Register* addressReg, int32_t index, TR::CodeGenerator* cg) { TR::InstOpCode::Mnemonic loadOpCode; - if (valueReg->getKind() == TR_FPR) + if (valueReg->getKind() == TR_VRF) + { + switch (size) + { + case 16: + loadOpCode = TR::InstOpCode::MOVDQURegMem; + break; + case 32: + loadOpCode = TR::InstOpCode::VMOVDQUYmmMem; + break; + case 64: + loadOpCode = 
TR::InstOpCode::VMOVDQUZmmMem; + break; + default: + TR_ASSERT(0, "%s: Unsupported size %u for TR_VRF registers\n", __FUNCTION__, size); + break; + } + } + else if (valueReg->getKind() == TR_FPR) { switch (size) { @@ -2932,7 +2947,7 @@ static void generateArrayElementLoad(TR::Node* node, TR::Register* valueReg, uin loadOpCode = TR::InstOpCode::MOVDQURegMem; break; default: - TR_ASSERT(0, "Unsupported size in generateArrayElementLoad, size: %d", size); + TR_ASSERT(0, "%s: Unsupported size %u for TR_FPR registers\n", __FUNCTION__, size); break; } } @@ -2953,7 +2968,7 @@ static void generateArrayElementLoad(TR::Node* node, TR::Register* valueReg, uin loadOpCode = TR::InstOpCode::L8RegMem; break; default: - TR_ASSERT(0, "Unsupported size in generateArrayElementLoad, size: %d", size); + TR_ASSERT(0, "%s: Unsupported size %u for TR_GPR registers\n", __FUNCTION__, size); break; } @@ -2965,6 +2980,190 @@ static void generateArrayElementLoad(TR::Node* node, TR::Register* valueReg, uin generateRegMemInstruction(loadOpCode, node, valueReg, generateX86MemoryReference(addressReg, index, cg), cg); } +/** \brief +* Generate instructions to do array copy with copy size as a constant +* +* \note +* The caller is expected to have checked for AVX support and that copySize is at most 64 bytes +* +* \param node +* The tree node +* +* \param dstReg +* The destination array address register +* +* \param srcReg +* The source array address register +* +* \param sizeReg +* The register holding the total size of elements to be copied, in bytes +* +* \param copySize +* The constant copy size, in bytes +* +* \param cg +* The code generator +*/ +static void arrayCopyPrimitiveInlineSmallSizeConstantCopySize(TR::Node* node, + TR::Register* dstReg, + TR::Register* srcReg, + TR::Register* sizeReg, + uint32_t copySize, + TR::CodeGenerator* cg) + { + if (cg->comp()->getOption(TR_TraceCG)) + { + traceMsg(cg->comp(), "%s: node n%dn srcReg %s dstReg %s sizeReg %s copySize %u\n", __FUNCTION__, node->getGlobalIndex(), + 
cg->comp()->getDebug()->getName(srcReg), cg->comp()->getDebug()->getName(dstReg), cg->comp()->getDebug()->getName(sizeReg), copySize); + } + + if (copySize == 0) + return; + + TR::Register* tmpReg1 = cg->allocateRegister(TR_GPR); + TR::Register* tmpReg2 = cg->allocateRegister(TR_GPR); + TR::Register* tmpVRFReg1 = cg->allocateRegister(TR_VRF); + TR::Register* tmpVRFReg2 = cg->allocateRegister(TR_VRF); + + if ((copySize == 1) || (copySize == 2) || (copySize == 4) || (copySize == 8) || (copySize == 16) || (copySize == 32) + || ((copySize == 64) && cg->comp()->target().cpu.supportsFeature(OMR_FEATURE_X86_AVX512F))) + { + TR::Register* valueReg = (copySize >= 16) ? tmpVRFReg1 : tmpReg1; + generateArrayElementLoad(node, valueReg, copySize, srcReg, 0 /* index */, cg); + generateArrayElementStore(node, dstReg, 0 /* index */, valueReg, copySize, cg); + } + else + { + uint8_t regSize = 0; + TR::Register* t1 = tmpReg1; + TR::Register* t2 = tmpReg2; + + if (copySize < 8) // 3, 5, 6, 7 + { + regSize = (copySize < 4) ? 2 : 4; + } + else if (copySize < 16) // 9-15 (8 is handled by the single load/store above) + { + regSize = 8; + } + else // 17-64, excluding the sizes handled by the single load/store above + { + regSize = (copySize < 32) ? 16 : 32; + t1 = tmpVRFReg1; + t2 = tmpVRFReg2; + } + generateMemoryCopyInstructions(node, dstReg, srcReg, sizeReg, t1, t2, regSize, cg); + } + + cg->stopUsingRegister(tmpReg1); + cg->stopUsingRegister(tmpReg2); + cg->stopUsingRegister(tmpVRFReg1); + cg->stopUsingRegister(tmpVRFReg2); + } + +/** \brief +* Generate instructions to do array copy. 
+* +* Uses the inlined small-size sequences when enablePrimitiveArrayCopyInlineSmallSizeWithoutREPMOVS +* allows it, with a dedicated path for constant copy sizes of at most 64 bytes unless +* TR_DisableArrayCopyInlineSmallSizeConstantCopySize is set. Otherwise emits REP MOVS, +* choosing the copy direction at runtime unless the node is a forward array copy. +* +* \param node +* The tree node +* +* \param elementSize +* The size of an element, in bytes +* +* \param dstReg +* The destination array address register +* +* \param srcReg +* The source array address register +* +* \param sizeReg +* The register holding the total size of elements to be copied, in bytes +* +* \param cg +* The code generator +*/ +static void arrayCopyDefault(TR::Node* node, uint8_t elementSize, TR::Register* dstReg, TR::Register* srcReg, TR::Register* sizeReg, TR::CodeGenerator* cg) + { + int32_t repMovsThresholdBytes = 0; + if (enablePrimitiveArrayCopyInlineSmallSizeWithoutREPMOVS(elementSize, cg, repMovsThresholdBytes)) + { + static bool disableArrayCopyInlineSmallSizeConstantCopySize = feGetEnv("TR_DisableArrayCopyInlineSmallSizeConstantCopySize") != NULL; + TR::Node* sizeNode = node->getChild(2); // the size of memory to copy, in bytes + + if (sizeNode->getOpCode().isLoadConst() && !disableArrayCopyInlineSmallSizeConstantCopySize) + { + uint32_t copySize = static_cast<uint32_t>(TR::TreeEvaluator::integerConstNodeValue(sizeNode, cg)); + + if (copySize <= 64) + { + arrayCopyPrimitiveInlineSmallSizeConstantCopySize(node, dstReg, srcReg, sizeReg, copySize, cg); + return; + } + } + + arrayCopyPrimitiveInlineSmallSizeWithoutREPMOVS(node, dstReg, srcReg, sizeReg, cg, elementSize, repMovsThresholdBytes); + return; + } + + TR::RegisterDependencyConditions* dependencies = generateRegisterDependencyConditions((uint8_t)3, (uint8_t)3, cg); + dependencies->addPreCondition(srcReg, TR::RealRegister::esi, cg); + dependencies->addPreCondition(dstReg, TR::RealRegister::edi, cg); + dependencies->addPreCondition(sizeReg, TR::RealRegister::ecx, cg); + dependencies->addPostCondition(srcReg, TR::RealRegister::esi, cg); + dependencies->addPostCondition(dstReg, TR::RealRegister::edi, cg); + dependencies->addPostCondition(sizeReg, TR::RealRegister::ecx, cg); + + TR::InstOpCode::Mnemonic repmovs; + switch (elementSize) + { + case 8: + repmovs = TR::InstOpCode::REPMOVSQ; +
break; + case 4: + repmovs = TR::InstOpCode::REPMOVSD; + break; + case 2: + repmovs = TR::InstOpCode::REPMOVSW; + break; + default: + repmovs = TR::InstOpCode::REPMOVSB; + break; + } + if (node->isForwardArrayCopy()) + { + generateRepMovsInstruction(repmovs, node, sizeReg, dependencies, cg); + } + else // decide direction during runtime + { + TR::LabelSymbol* mainBegLabel = generateLabelSymbol(cg); + TR::LabelSymbol* mainEndLabel = generateLabelSymbol(cg); + TR::LabelSymbol* backwardLabel = generateLabelSymbol(cg); + mainBegLabel->setStartInternalControlFlow(); + mainEndLabel->setEndInternalControlFlow(); + + generateLabelInstruction(TR::InstOpCode::label, node, mainBegLabel, cg); + + generateRegRegInstruction(TR::InstOpCode::SUBRegReg(), node, dstReg, srcReg, cg); // dst = dst - src + generateRegRegInstruction(TR::InstOpCode::CMPRegReg(), node, dstReg, sizeReg, cg); // cmp dst, size + generateRegMemInstruction(TR::InstOpCode::LEARegMem(), node, dstReg, generateX86MemoryReference(dstReg, srcReg, 0, cg), cg); // dst = dst + src + generateLabelInstruction(TR::InstOpCode::JB4, node, backwardLabel, cg); // jb: (dst - src) < size unsigned, go to the outlined backward copy + generateRepMovsInstruction(repmovs, node, sizeReg, NULL, cg); + + { + TR_OutlinedInstructionsGenerator og(backwardLabel, node, cg); + generateRegMemInstruction(TR::InstOpCode::LEARegMem(), node, srcReg, generateX86MemoryReference(srcReg, sizeReg, 0, -(intptr_t)elementSize, cg), cg); + generateRegMemInstruction(TR::InstOpCode::LEARegMem(), node, dstReg, generateX86MemoryReference(dstReg, sizeReg, 0, -(intptr_t)elementSize, cg), cg); + generateInstruction(TR::InstOpCode::STD, node, cg); + generateRepMovsInstruction(repmovs, node, sizeReg, NULL, cg); + generateInstruction(TR::InstOpCode::CLD, node, cg); + generateLabelInstruction(TR::InstOpCode::JMP4, node, mainEndLabel, cg); + og.endOutlinedInstructionSequence(); + } + + generateLabelInstruction(TR::InstOpCode::label, node, mainEndLabel, dependencies, cg); + } + } + /** \brief * Generate
instructions to do arraycopy for a short constant array. We try to copy as many elements as we can every time. *