mirror of
https://github.com/RPCS3/llvm-mirror.git
synced 2025-03-04 18:38:37 +00:00

As discussed in D76983, that patch can turn a chain of insert/extract with scalar trunc ops into bitcast+extract and existing instcombine vector transforms end up creating a shuffle out of that (see the PhaseOrdering test for an example). Currently, that process requires at least this sequence: -instcombine -early-cse -instcombine. Before D76983, the sequence of insert/extract would reach the SLP vectorizer and become a vector trunc there. Based on a small sampling of public targets/types, converting the shuffle to a trunc is better for codegen in most cases (and a regression of that form is the reason this was noticed). The trunc is clearly better for IR-level analysis as well. This means that we can induce "spontaneous vectorization" without invoking any explicit vectorizer passes (at least a vector cast op may be created out of scalar casts), but that seems to be the right choice given that we started with a chain of insert/extract, and the backend would expand back to that chain if a target does not support the op. Differential Revision: https://reviews.llvm.org/D77299
24 lines
967 B
LLVM
24 lines
967 B
LLVM
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -O2 -S -data-layout="e" < %s | FileCheck %s --check-prefixes=ANY,OLDPM
; RUN: opt -passes='default<O2>' -S -data-layout="e" < %s | FileCheck %s --check-prefixes=ANY,NEWPM

; Phase-ordering test: the input below is a fully scalarized truncation of a
; <4 x i32> vector to <4 x i16>. Each lane i of %x is extracted, truncated
; from i32 to i16, and inserted into lane i of the result vector.
;
; The check lines expect the optimizer to collapse the whole chain into one
; vector 'trunc' that is returned directly.
define <4 x i16> @truncate(<4 x i32> %x) {
; ANY-LABEL: @truncate(
; ANY-NEXT:    [[V3:%.*]] = trunc <4 x i32> [[X:%.*]] to <4 x i16>
; ANY-NEXT:    ret <4 x i16> [[V3]]
;
  ; Lane 0: the insert chain starts from an undef vector, so every lane of the
  ; result is defined only by the inserts that follow.
  %x0 = extractelement <4 x i32> %x, i32 0
  %t0 = trunc i32 %x0 to i16
  %v0 = insertelement <4 x i16> undef, i16 %t0, i32 0

  ; Lane 1: builds on the partial result %v0.
  %x1 = extractelement <4 x i32> %x, i32 1
  %t1 = trunc i32 %x1 to i16
  %v1 = insertelement <4 x i16> %v0, i16 %t1, i32 1

  ; Lane 2: builds on the partial result %v1.
  %x2 = extractelement <4 x i32> %x, i32 2
  %t2 = trunc i32 %x2 to i16
  %v2 = insertelement <4 x i16> %v1, i16 %t2, i32 2

  ; Lane 3: completes the vector; %v3 has all four lanes populated.
  %x3 = extractelement <4 x i32> %x, i32 3
  %t3 = trunc i32 %x3 to i16
  %v3 = insertelement <4 x i16> %v2, i16 %t3, i32 3

  ret <4 x i16> %v3
}