|
| 1 | +/*========================== begin_copyright_notice ============================ |
| 2 | + |
| 3 | +Copyright (C) 2025 Intel Corporation |
| 4 | + |
| 5 | +SPDX-License-Identifier: MIT |
| 6 | + |
| 7 | +============================= end_copyright_notice ===========================*/ |
| 8 | + |
| 9 | +/*========================== begin_copyright_notice ============================ |
| 10 | + |
| 11 | +Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 12 | +See https://llvm.org/LICENSE.txt for license information. |
| 13 | +SPDX-License-Identifier: Apache-2.0 with LLVM-exception |
| 14 | + |
| 15 | +============================= end_copyright_notice ===========================*/ |
| 16 | + |
| 17 | +From c5c679933c462f28fac7358841a23ee32c292a47 Mon Sep 17 00:00:00 2001 |
| 18 | +From: peterbell10 < [email protected]> |
| 19 | +Date: Wed, 20 Nov 2024 21:06:57 +0000 |
| 20 | +Subject: [PATCH] [InstCombine] Only fold extract element to trunc if vector |
| 21 | + `hasOneUse` (#115627) |
| 22 | + |
| 23 | +This fixes a missed optimization caused by the `foldBitcastExtElt` |
| 24 | +pattern interfering with other combine patterns. In the case I was |
| 25 | +hitting, we have IR that combines two vectors into a new larger vector |
| 26 | +by extracting elements and inserting them into the new vector. |
| 27 | + |
| 28 | +```llvm |
| 29 | +define <4 x half> @bitcast_extract_insert_to_shuffle(i32 %a, i32 %b) { |
| 30 | + %avec = bitcast i32 %a to <2 x half> |
| 31 | + %a0 = extractelement <2 x half> %avec, i32 0 |
| 32 | + %a1 = extractelement <2 x half> %avec, i32 1 |
| 33 | + %bvec = bitcast i32 %b to <2 x half> |
| 34 | + %b0 = extractelement <2 x half> %bvec, i32 0 |
| 35 | + %b1 = extractelement <2 x half> %bvec, i32 1 |
| 36 | + %ins0 = insertelement <4 x half> undef, half %a0, i32 0 |
| 37 | + %ins1 = insertelement <4 x half> %ins0, half %a1, i32 1 |
| 38 | + %ins2 = insertelement <4 x half> %ins1, half %b0, i32 2 |
| 39 | + %ins3 = insertelement <4 x half> %ins2, half %b1, i32 3 |
| 40 | + ret <4 x half> %ins3 |
| 41 | +} |
| 42 | +``` |
| 43 | + |
| 44 | +With the current behavior, `InstCombine` converts each vector extract |
| 45 | +sequence to |
| 46 | + |
| 47 | +```llvm |
| 48 | + %tmp = trunc i32 %a to i16 |
| 49 | + %a0 = bitcast i16 %tmp to half |
| 50 | + %a1 = extractelement <2 x half> %avec, i32 1 |
| 51 | +``` |
| 52 | + |
| 53 | +where the extraction of `%a0` is now done by truncating the original |
| 54 | +integer. While on it's own this is fairly reasonable, in this case it |
| 55 | +also blocks the pattern which converts `extractelement` - |
| 56 | +`insertelement` into shuffles which gives the overall simpler result: |
| 57 | + |
| 58 | +```llvm |
| 59 | +define <4 x half> @bitcast_extract_insert_to_shuffle(i32 %a, i32 %b) { |
| 60 | + %avec = bitcast i32 %a to <2 x half> |
| 61 | + %bvec = bitcast i32 %b to <2 x half> |
| 62 | + %ins3 = shufflevector <2 x half> %avec, <2 x half> %bvec, <4 x i32> <i32 0, i32 1, i32 2, i32 3> |
| 63 | + ret <4 x half> %ins3 |
| 64 | +} |
| 65 | +``` |
| 66 | + |
| 67 | +In this PR I fix the conflict by obeying the `hasOneUse` check even if |
| 68 | +there is no shift instruction required. In these cases we can't remove |
| 69 | +the vector completely, so the pattern has less benefit anyway. |
| 70 | + |
| 71 | +Also fwiw, I think dropping the `hasOneUse` check for the 0th element |
| 72 | +might have been a mistake in the first place. Looking at |
| 73 | +https://github.com/llvm/llvm-project/commit/535c5d56a7bc9966036a11362d8984983a4bf090 |
| 74 | +the commit message only mentions loosening the `isDesirableIntType` |
| 75 | +requirement and doesn't mention changing the `hasOneUse` check at all. |
| 76 | +--- |
| 77 | + llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp | 6 +++--- |
| 78 | + 1 file changed, 3 insertions(+), 3 deletions(-) |
| 79 | + |
| 80 | +diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp |
| 81 | +index 61e62adbe327..d3b30848ab8b 100644 |
| 82 | +--- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp |
| 83 | ++++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp |
| 84 | +@@ -202,9 +202,9 @@ Instruction *InstCombinerImpl::foldBitcastExtElt(ExtractElementInst &Ext) { |
| 85 | + if (IsBigEndian) |
| 86 | + ExtIndexC = NumElts.getKnownMinValue() - 1 - ExtIndexC; |
| 87 | + unsigned ShiftAmountC = ExtIndexC * DestWidth; |
| 88 | +- if (!ShiftAmountC || |
| 89 | +- (isDesirableIntType(X->getType()->getPrimitiveSizeInBits()) && |
| 90 | +- Ext.getVectorOperand()->hasOneUse())) { |
| 91 | ++ if ((!ShiftAmountC || |
| 92 | ++ isDesirableIntType(X->getType()->getPrimitiveSizeInBits())) && |
| 93 | ++ Ext.getVectorOperand()->hasOneUse()) { |
| 94 | + if (ShiftAmountC) |
| 95 | + X = Builder.CreateLShr(X, ShiftAmountC, "extelt.offset"); |
| 96 | + if (DestTy->isFloatingPointTy()) { |
| 97 | +-- |
| 98 | +2.43.0 |
| 99 | + |
0 commit comments