diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td index 94f9ee6f953..a391a890d7a 100644 --- a/lib/Target/Mips/MipsInstrInfo.td +++ b/lib/Target/Mips/MipsInstrInfo.td @@ -186,6 +186,8 @@ def InMicroMips : Predicate<"Subtarget.inMicroMipsMode()">, AssemblerPredicate<"FeatureMicroMips">; def NotInMicroMips : Predicate<"!Subtarget.inMicroMipsMode()">, AssemblerPredicate<"!FeatureMicroMips">; +def IsLE : Predicate<"Subtarget.isLittle()">; +def IsBE : Predicate<"!Subtarget.isLittle()">; class MipsPat : Pat { let Predicates = [HasStdEnc]; diff --git a/lib/Target/Mips/MipsMSAInstrInfo.td b/lib/Target/Mips/MipsMSAInstrInfo.td index 24d2c91422b..8383646dfe0 100644 --- a/lib/Target/Mips/MipsMSAInstrInfo.td +++ b/lib/Target/Mips/MipsMSAInstrInfo.td @@ -2536,8 +2536,8 @@ def XOR_V : XOR_V_ENC, XOR_V_DESC, Requires<[HasMSA]>; def XORI_B : XORI_B_ENC, XORI_B_DESC, Requires<[HasMSA]>; // Patterns. -class MSAPat : - Pat, Requires<[pred]>; +class MSAPat pred = [HasMSA]> : + Pat, Requires; def LD_FH : MSAPat<(v8f16 (load addr:$addr)), (LD_H addr:$addr)>; @@ -2552,3 +2552,150 @@ def ST_FW : MSAPat<(store (v4f32 MSA128W:$ws), addr:$addr), (ST_W MSA128W:$ws, addr:$addr)>; def ST_FD : MSAPat<(store (v2f64 MSA128D:$ws), addr:$addr), (ST_D MSA128D:$ws, addr:$addr)>; + +class MSABitconvertPat preds = [HasMSA]> : + MSAPat<(DstVT (bitconvert SrcVT:$src)), + (COPY_TO_REGCLASS SrcVT:$src, DstRC), preds>; + +// These are endian-independent because the element size doesn't change +def : MSABitconvertPat; +def : MSABitconvertPat; +def : MSABitconvertPat; +def : MSABitconvertPat; +def : MSABitconvertPat; +def : MSABitconvertPat; + +// Little endian bitcasts are always no-ops +def : MSABitconvertPat; +def : MSABitconvertPat; +def : MSABitconvertPat; +def : MSABitconvertPat; +def : MSABitconvertPat; +def : MSABitconvertPat; + +def : MSABitconvertPat; +def : MSABitconvertPat; +def : MSABitconvertPat; +def : MSABitconvertPat; +def : MSABitconvertPat; + +def : 
MSABitconvertPat; +def : MSABitconvertPat; +def : MSABitconvertPat; +def : MSABitconvertPat; +def : MSABitconvertPat; + +def : MSABitconvertPat; +def : MSABitconvertPat; +def : MSABitconvertPat; +def : MSABitconvertPat; +def : MSABitconvertPat; + +def : MSABitconvertPat; +def : MSABitconvertPat; +def : MSABitconvertPat; +def : MSABitconvertPat; +def : MSABitconvertPat; + +def : MSABitconvertPat; +def : MSABitconvertPat; +def : MSABitconvertPat; +def : MSABitconvertPat; +def : MSABitconvertPat; + +// Big endian bitcasts expand to shuffle instructions. +// This is because bitcast is defined to be a store/load sequence and the +// vector store/load instructions are mixed-endian with respect to the vector +// as a whole (little endian with respect to element order, but big endian +// elements). + +class MSABitconvertReverseQuartersPat : + MSAPat<(DstVT (bitconvert SrcVT:$src)), + (COPY_TO_REGCLASS (Insn (COPY_TO_REGCLASS SrcVT:$src, ViaRC), 27), + DstRC), + [HasMSA, IsBE]>; + +class MSABitconvertReverseHalvesPat : + MSAPat<(DstVT (bitconvert SrcVT:$src)), + (COPY_TO_REGCLASS (Insn (COPY_TO_REGCLASS SrcVT:$src, ViaRC), 177), + DstRC), + [HasMSA, IsBE]>; + +class MSABitconvertReverseBInHPat : + MSABitconvertReverseHalvesPat; + +class MSABitconvertReverseBInWPat : + MSABitconvertReverseQuartersPat; + +class MSABitconvertReverseBInDPat : + MSAPat<(DstVT (bitconvert SrcVT:$src)), + (COPY_TO_REGCLASS + (SHF_W + (COPY_TO_REGCLASS + (SHF_B (COPY_TO_REGCLASS SrcVT:$src, MSA128B), 27), + MSA128W), 177), + DstRC), + [HasMSA, IsBE]>; + +class MSABitconvertReverseHInWPat : + MSABitconvertReverseHalvesPat; + +class MSABitconvertReverseHInDPat : + MSABitconvertReverseQuartersPat; + +class MSABitconvertReverseWInDPat : + MSABitconvertReverseHalvesPat; + +def : MSABitconvertReverseBInHPat; +def : MSABitconvertReverseBInHPat; +def : MSABitconvertReverseBInWPat; +def : MSABitconvertReverseBInWPat; +def : MSABitconvertReverseBInDPat; +def : MSABitconvertReverseBInDPat; + +def : 
MSABitconvertReverseBInHPat; +def : MSABitconvertReverseHInWPat; +def : MSABitconvertReverseHInWPat; +def : MSABitconvertReverseHInDPat; +def : MSABitconvertReverseHInDPat; + +def : MSABitconvertReverseBInHPat; +def : MSABitconvertReverseHInWPat; +def : MSABitconvertReverseHInWPat; +def : MSABitconvertReverseHInDPat; +def : MSABitconvertReverseHInDPat; + +def : MSABitconvertReverseBInWPat; +def : MSABitconvertReverseHInWPat; +def : MSABitconvertReverseHInWPat; +def : MSABitconvertReverseWInDPat; +def : MSABitconvertReverseWInDPat; + +def : MSABitconvertReverseBInWPat; +def : MSABitconvertReverseHInWPat; +def : MSABitconvertReverseHInWPat; +def : MSABitconvertReverseWInDPat; +def : MSABitconvertReverseWInDPat; + +def : MSABitconvertReverseBInDPat; +def : MSABitconvertReverseHInDPat; +def : MSABitconvertReverseHInDPat; +def : MSABitconvertReverseWInDPat; +def : MSABitconvertReverseWInDPat; + +def : MSABitconvertReverseBInDPat; +def : MSABitconvertReverseHInDPat; +def : MSABitconvertReverseHInDPat; +def : MSABitconvertReverseWInDPat; +def : MSABitconvertReverseWInDPat; diff --git a/test/CodeGen/Mips/msa/bitcast.ll b/test/CodeGen/Mips/msa/bitcast.ll new file mode 100644 index 00000000000..fccdbf6df21 --- /dev/null +++ b/test/CodeGen/Mips/msa/bitcast.ll @@ -0,0 +1,1208 @@ +; RUN: llc -march=mipsel -mattr=+msa < %s | FileCheck -check-prefix=LITENDIAN %s +; RUN: llc -march=mips -mattr=+msa < %s | FileCheck -check-prefix=BIGENDIAN %s + +define void @v16i8_to_v16i8(<16 x i8>* %src, <16 x i8>* %dst) nounwind { +entry: + %0 = load volatile <16 x i8>* %src + %1 = tail call <16 x i8> @llvm.mips.addv.b(<16 x i8> %0, <16 x i8> %0) + %2 = bitcast <16 x i8> %1 to <16 x i8> + %3 = tail call <16 x i8> @llvm.mips.addv.b(<16 x i8> %2, <16 x i8> %2) + store <16 x i8> %3, <16 x i8>* %dst + ret void +} + +; LITENDIAN: v16i8_to_v16i8: +; LITENDIAN: ld.b [[R1:\$w[0-9]+]], +; LITENDIAN: addv.b [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; LITENDIAN: addv.b [[R3:\$w[0-9]+]], [[R2]], [[R2]] +; 
LITENDIAN: st.b [[R3]], +; LITENDIAN: .size v16i8_to_v16i8 + +; BIGENDIAN: v16i8_to_v16i8: +; BIGENDIAN: ld.b [[R1:\$w[0-9]+]], +; BIGENDIAN: addv.b [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; BIGENDIAN: addv.b [[R3:\$w[0-9]+]], [[R2]], [[R2]] +; BIGENDIAN: st.b [[R3]], +; BIGENDIAN: .size v16i8_to_v16i8 + +define void @v16i8_to_v8i16(<16 x i8>* %src, <8 x i16>* %dst) nounwind { +entry: + %0 = load volatile <16 x i8>* %src + %1 = tail call <16 x i8> @llvm.mips.addv.b(<16 x i8> %0, <16 x i8> %0) + %2 = bitcast <16 x i8> %1 to <8 x i16> + %3 = tail call <8 x i16> @llvm.mips.addv.h(<8 x i16> %2, <8 x i16> %2) + store <8 x i16> %3, <8 x i16>* %dst + ret void +} + +; LITENDIAN: v16i8_to_v8i16: +; LITENDIAN: ld.b [[R1:\$w[0-9]+]], +; LITENDIAN: addv.b [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; LITENDIAN: addv.h [[R3:\$w[0-9]+]], [[R2]], [[R2]] +; LITENDIAN: st.h [[R3]], +; LITENDIAN: .size v16i8_to_v8i16 + +; BIGENDIAN: v16i8_to_v8i16: +; BIGENDIAN: ld.b [[R1:\$w[0-9]+]], +; BIGENDIAN: addv.b [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; BIGENDIAN: shf.b [[R3:\$w[0-9]+]], [[R2]], 177 +; BIGENDIAN: addv.h [[R4:\$w[0-9]+]], [[R3]], [[R3]] +; BIGENDIAN: st.h [[R4]], +; BIGENDIAN: .size v16i8_to_v8i16 + +; We can't prevent the (store (bitcast X), Y) DAG Combine here because there +; are no operations for v8f16 to put in the way. 
+define void @v16i8_to_v8f16(<16 x i8>* %src, <8 x half>* %dst) nounwind { +entry: + %0 = load volatile <16 x i8>* %src + %1 = tail call <16 x i8> @llvm.mips.addv.b(<16 x i8> %0, <16 x i8> %0) + %2 = bitcast <16 x i8> %1 to <8 x half> + store <8 x half> %2, <8 x half>* %dst + ret void +} + +; LITENDIAN: v16i8_to_v8f16: +; LITENDIAN: ld.b [[R1:\$w[0-9]+]], +; LITENDIAN: addv.b [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; LITENDIAN: st.b [[R2]], +; LITENDIAN: .size v16i8_to_v8f16 + +; BIGENDIAN: v16i8_to_v8f16: +; BIGENDIAN: ld.b [[R1:\$w[0-9]+]], +; BIGENDIAN: addv.b [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; BIGENDIAN: st.b [[R2]], +; BIGENDIAN: .size v16i8_to_v8f16 + +define void @v16i8_to_v4i32(<16 x i8>* %src, <4 x i32>* %dst) nounwind { +entry: + %0 = load volatile <16 x i8>* %src + %1 = tail call <16 x i8> @llvm.mips.addv.b(<16 x i8> %0, <16 x i8> %0) + %2 = bitcast <16 x i8> %1 to <4 x i32> + %3 = tail call <4 x i32> @llvm.mips.addv.w(<4 x i32> %2, <4 x i32> %2) + store <4 x i32> %3, <4 x i32>* %dst + ret void +} + +; LITENDIAN: v16i8_to_v4i32: +; LITENDIAN: ld.b [[R1:\$w[0-9]+]], +; LITENDIAN: addv.b [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; LITENDIAN: addv.w [[R3:\$w[0-9]+]], [[R2]], [[R2]] +; LITENDIAN: st.w [[R3]], +; LITENDIAN: .size v16i8_to_v4i32 + +; BIGENDIAN: v16i8_to_v4i32: +; BIGENDIAN: ld.b [[R1:\$w[0-9]+]], +; BIGENDIAN: addv.b [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; BIGENDIAN: shf.b [[R3:\$w[0-9]+]], [[R2]], 27 +; BIGENDIAN: addv.w [[R4:\$w[0-9]+]], [[R3]], [[R3]] +; BIGENDIAN: st.w [[R4]], +; BIGENDIAN: .size v16i8_to_v4i32 + +define void @v16i8_to_v4f32(<16 x i8>* %src, <4 x float>* %dst) nounwind { +entry: + %0 = load volatile <16 x i8>* %src + %1 = tail call <16 x i8> @llvm.mips.addv.b(<16 x i8> %0, <16 x i8> %0) + %2 = bitcast <16 x i8> %1 to <4 x float> + %3 = tail call <4 x float> @llvm.mips.fadd.w(<4 x float> %2, <4 x float> %2) + store <4 x float> %3, <4 x float>* %dst + ret void +} + +; LITENDIAN: v16i8_to_v4f32: +; LITENDIAN: ld.b [[R1:\$w[0-9]+]], +; 
LITENDIAN: addv.b [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; LITENDIAN: fadd.w [[R3:\$w[0-9]+]], [[R2]], [[R2]] +; LITENDIAN: st.w [[R3]], +; LITENDIAN: .size v16i8_to_v4f32 + +; BIGENDIAN: v16i8_to_v4f32: +; BIGENDIAN: ld.b [[R1:\$w[0-9]+]], +; BIGENDIAN: addv.b [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; BIGENDIAN: shf.b [[R3:\$w[0-9]+]], [[R2]], 27 +; BIGENDIAN: fadd.w [[R4:\$w[0-9]+]], [[R3]], [[R3]] +; BIGENDIAN: st.w [[R4]], +; BIGENDIAN: .size v16i8_to_v4f32 + +define void @v16i8_to_v2i64(<16 x i8>* %src, <2 x i64>* %dst) nounwind { +entry: + %0 = load volatile <16 x i8>* %src + %1 = tail call <16 x i8> @llvm.mips.addv.b(<16 x i8> %0, <16 x i8> %0) + %2 = bitcast <16 x i8> %1 to <2 x i64> + %3 = tail call <2 x i64> @llvm.mips.addv.d(<2 x i64> %2, <2 x i64> %2) + store <2 x i64> %3, <2 x i64>* %dst + ret void +} + +; LITENDIAN: v16i8_to_v2i64: +; LITENDIAN: ld.b [[R1:\$w[0-9]+]], +; LITENDIAN: addv.b [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; LITENDIAN: addv.d [[R3:\$w[0-9]+]], [[R2]], [[R2]] +; LITENDIAN: st.d [[R3]], +; LITENDIAN: .size v16i8_to_v2i64 + +; BIGENDIAN: v16i8_to_v2i64: +; BIGENDIAN: ld.b [[R1:\$w[0-9]+]], +; BIGENDIAN: addv.b [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; BIGENDIAN: shf.b [[R3:\$w[0-9]+]], [[R2]], 27 +; BIGENDIAN: shf.w [[R4:\$w[0-9]+]], [[R3]], 177 +; BIGENDIAN: addv.d [[R5:\$w[0-9]+]], [[R4]], [[R4]] +; BIGENDIAN: st.d [[R5]], +; BIGENDIAN: .size v16i8_to_v2i64 + +define void @v16i8_to_v2f64(<16 x i8>* %src, <2 x double>* %dst) nounwind { +entry: + %0 = load volatile <16 x i8>* %src + %1 = tail call <16 x i8> @llvm.mips.addv.b(<16 x i8> %0, <16 x i8> %0) + %2 = bitcast <16 x i8> %1 to <2 x double> + %3 = tail call <2 x double> @llvm.mips.fadd.d(<2 x double> %2, <2 x double> %2) + store <2 x double> %3, <2 x double>* %dst + ret void +} + +; LITENDIAN: v16i8_to_v2f64: +; LITENDIAN: ld.b [[R1:\$w[0-9]+]], +; LITENDIAN: addv.b [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; LITENDIAN: fadd.d [[R3:\$w[0-9]+]], [[R2]], [[R2]] +; LITENDIAN: st.d [[R3]], +; LITENDIAN: .size 
v16i8_to_v2f64 + +; BIGENDIAN: v16i8_to_v2f64: +; BIGENDIAN: ld.b [[R1:\$w[0-9]+]], +; BIGENDIAN: addv.b [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; BIGENDIAN: shf.b [[R3:\$w[0-9]+]], [[R2]], 27 +; BIGENDIAN: shf.w [[R4:\$w[0-9]+]], [[R3]], 177 +; BIGENDIAN: fadd.d [[R5:\$w[0-9]+]], [[R4]], [[R4]] +; BIGENDIAN: st.d [[R5]], +; BIGENDIAN: .size v16i8_to_v2f64 + +define void @v8i16_to_v16i8(<8 x i16>* %src, <16 x i8>* %dst) nounwind { +entry: + %0 = load volatile <8 x i16>* %src + %1 = tail call <8 x i16> @llvm.mips.addv.h(<8 x i16> %0, <8 x i16> %0) + %2 = bitcast <8 x i16> %1 to <16 x i8> + %3 = tail call <16 x i8> @llvm.mips.addv.b(<16 x i8> %2, <16 x i8> %2) + store <16 x i8> %3, <16 x i8>* %dst + ret void +} + +; LITENDIAN: v8i16_to_v16i8: +; LITENDIAN: ld.h [[R1:\$w[0-9]+]], +; LITENDIAN: addv.h [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; LITENDIAN: addv.b [[R3:\$w[0-9]+]], [[R2]], [[R2]] +; LITENDIAN: st.b [[R3]], +; LITENDIAN: .size v8i16_to_v16i8 + +; BIGENDIAN: v8i16_to_v16i8: +; BIGENDIAN: ld.h [[R1:\$w[0-9]+]], +; BIGENDIAN: addv.h [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; BIGENDIAN: shf.b [[R3:\$w[0-9]+]], [[R2]], 177 +; BIGENDIAN: addv.b [[R4:\$w[0-9]+]], [[R3]], [[R3]] +; BIGENDIAN: st.b [[R4]], +; BIGENDIAN: .size v8i16_to_v16i8 + +define void @v8i16_to_v8i16(<8 x i16>* %src, <8 x i16>* %dst) nounwind { +entry: + %0 = load volatile <8 x i16>* %src + %1 = tail call <8 x i16> @llvm.mips.addv.h(<8 x i16> %0, <8 x i16> %0) + %2 = bitcast <8 x i16> %1 to <8 x i16> + %3 = tail call <8 x i16> @llvm.mips.addv.h(<8 x i16> %2, <8 x i16> %2) + store <8 x i16> %3, <8 x i16>* %dst + ret void +} + +; LITENDIAN: v8i16_to_v8i16: +; LITENDIAN: ld.h [[R1:\$w[0-9]+]], +; LITENDIAN: addv.h [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; LITENDIAN: addv.h [[R3:\$w[0-9]+]], [[R2]], [[R2]] +; LITENDIAN: st.h [[R3]], +; LITENDIAN: .size v8i16_to_v8i16 + +; BIGENDIAN: v8i16_to_v8i16: +; BIGENDIAN: ld.h [[R1:\$w[0-9]+]], +; BIGENDIAN: addv.h [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; BIGENDIAN: addv.h 
[[R3:\$w[0-9]+]], [[R2]], [[R2]] +; BIGENDIAN: st.h [[R3]], +; BIGENDIAN: .size v8i16_to_v8i16 + +; We can't prevent the (store (bitcast X), Y) DAG Combine here because there +; are no operations for v8f16 to put in the way. +define void @v8i16_to_v8f16(<8 x i16>* %src, <8 x half>* %dst) nounwind { +entry: + %0 = load volatile <8 x i16>* %src + %1 = tail call <8 x i16> @llvm.mips.addv.h(<8 x i16> %0, <8 x i16> %0) + %2 = bitcast <8 x i16> %1 to <8 x half> + store <8 x half> %2, <8 x half>* %dst + ret void +} + +; LITENDIAN: v8i16_to_v8f16: +; LITENDIAN: ld.h [[R1:\$w[0-9]+]], +; LITENDIAN: addv.h [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; LITENDIAN: st.h [[R2]], +; LITENDIAN: .size v8i16_to_v8f16 + +; BIGENDIAN: v8i16_to_v8f16: +; BIGENDIAN: ld.h [[R1:\$w[0-9]+]], +; BIGENDIAN: addv.h [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; BIGENDIAN: st.h [[R2]], +; BIGENDIAN: .size v8i16_to_v8f16 + +define void @v8i16_to_v4i32(<8 x i16>* %src, <4 x i32>* %dst) nounwind { +entry: + %0 = load volatile <8 x i16>* %src + %1 = tail call <8 x i16> @llvm.mips.addv.h(<8 x i16> %0, <8 x i16> %0) + %2 = bitcast <8 x i16> %1 to <4 x i32> + %3 = tail call <4 x i32> @llvm.mips.addv.w(<4 x i32> %2, <4 x i32> %2) + store <4 x i32> %3, <4 x i32>* %dst + ret void +} + +; LITENDIAN: v8i16_to_v4i32: +; LITENDIAN: ld.h [[R1:\$w[0-9]+]], +; LITENDIAN: addv.h [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; LITENDIAN: addv.w [[R3:\$w[0-9]+]], [[R2]], [[R2]] +; LITENDIAN: st.w [[R3]], +; LITENDIAN: .size v8i16_to_v4i32 + +; BIGENDIAN: v8i16_to_v4i32: +; BIGENDIAN: ld.h [[R1:\$w[0-9]+]], +; BIGENDIAN: addv.h [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; BIGENDIAN: shf.h [[R3:\$w[0-9]+]], [[R2]], 177 +; BIGENDIAN: addv.w [[R4:\$w[0-9]+]], [[R3]], [[R3]] +; BIGENDIAN: st.w [[R4]], +; BIGENDIAN: .size v8i16_to_v4i32 + +define void @v8i16_to_v4f32(<8 x i16>* %src, <4 x float>* %dst) nounwind { +entry: + %0 = load volatile <8 x i16>* %src + %1 = tail call <8 x i16> @llvm.mips.addv.h(<8 x i16> %0, <8 x i16> %0) + %2 = bitcast <8 x i16> %1 to 
<4 x float> + %3 = tail call <4 x float> @llvm.mips.fadd.w(<4 x float> %2, <4 x float> %2) + store <4 x float> %3, <4 x float>* %dst + ret void +} + +; LITENDIAN: v8i16_to_v4f32: +; LITENDIAN: ld.h [[R1:\$w[0-9]+]], +; LITENDIAN: addv.h [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; LITENDIAN: fadd.w [[R3:\$w[0-9]+]], [[R2]], [[R2]] +; LITENDIAN: st.w [[R3]], +; LITENDIAN: .size v8i16_to_v4f32 + +; BIGENDIAN: v8i16_to_v4f32: +; BIGENDIAN: ld.h [[R1:\$w[0-9]+]], +; BIGENDIAN: addv.h [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; BIGENDIAN: shf.h [[R3:\$w[0-9]+]], [[R2]], 177 +; BIGENDIAN: fadd.w [[R4:\$w[0-9]+]], [[R3]], [[R3]] +; BIGENDIAN: st.w [[R4]], +; BIGENDIAN: .size v8i16_to_v4f32 + +define void @v8i16_to_v2i64(<8 x i16>* %src, <2 x i64>* %dst) nounwind { +entry: + %0 = load volatile <8 x i16>* %src + %1 = tail call <8 x i16> @llvm.mips.addv.h(<8 x i16> %0, <8 x i16> %0) + %2 = bitcast <8 x i16> %1 to <2 x i64> + %3 = tail call <2 x i64> @llvm.mips.addv.d(<2 x i64> %2, <2 x i64> %2) + store <2 x i64> %3, <2 x i64>* %dst + ret void +} + +; LITENDIAN: v8i16_to_v2i64: +; LITENDIAN: ld.h [[R1:\$w[0-9]+]], +; LITENDIAN: addv.h [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; LITENDIAN: addv.d [[R3:\$w[0-9]+]], [[R2]], [[R2]] +; LITENDIAN: st.d [[R3]], +; LITENDIAN: .size v8i16_to_v2i64 + +; BIGENDIAN: v8i16_to_v2i64: +; BIGENDIAN: ld.h [[R1:\$w[0-9]+]], +; BIGENDIAN: addv.h [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; BIGENDIAN: shf.h [[R3:\$w[0-9]+]], [[R2]], 27 +; BIGENDIAN: addv.d [[R4:\$w[0-9]+]], [[R3]], [[R3]] +; BIGENDIAN: st.d [[R4]], +; BIGENDIAN: .size v8i16_to_v2i64 + +define void @v8i16_to_v2f64(<8 x i16>* %src, <2 x double>* %dst) nounwind { +entry: + %0 = load volatile <8 x i16>* %src + %1 = tail call <8 x i16> @llvm.mips.addv.h(<8 x i16> %0, <8 x i16> %0) + %2 = bitcast <8 x i16> %1 to <2 x double> + %3 = tail call <2 x double> @llvm.mips.fadd.d(<2 x double> %2, <2 x double> %2) + store <2 x double> %3, <2 x double>* %dst + ret void +} + +; LITENDIAN: v8i16_to_v2f64: +; LITENDIAN: ld.h 
[[R1:\$w[0-9]+]], +; LITENDIAN: addv.h [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; LITENDIAN: fadd.d [[R3:\$w[0-9]+]], [[R2]], [[R2]] +; LITENDIAN: st.d [[R3]], +; LITENDIAN: .size v8i16_to_v2f64 + +; BIGENDIAN: v8i16_to_v2f64: +; BIGENDIAN: ld.h [[R1:\$w[0-9]+]], +; BIGENDIAN: addv.h [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; BIGENDIAN: shf.h [[R3:\$w[0-9]+]], [[R2]], 27 +; BIGENDIAN: fadd.d [[R4:\$w[0-9]+]], [[R3]], [[R3]] +; BIGENDIAN: st.d [[R4]], +; BIGENDIAN: .size v8i16_to_v2f64 + +;---- +; We can't prevent the (bitcast (load X)) DAG Combine here because there +; are no operations for v8f16 to put in the way. +define void @v8f16_to_v16i8(<8 x half>* %src, <16 x i8>* %dst) nounwind { +entry: + %0 = load volatile <8 x half>* %src + %1 = bitcast <8 x half> %0 to <16 x i8> + %2 = tail call <16 x i8> @llvm.mips.addv.b(<16 x i8> %1, <16 x i8> %1) + store <16 x i8> %2, <16 x i8>* %dst + ret void +} + +; LITENDIAN: v8f16_to_v16i8: +; LITENDIAN: ld.h [[R1:\$w[0-9]+]], +; LITENDIAN: addv.b [[R3:\$w[0-9]+]], [[R1]], [[R1]] +; LITENDIAN: st.b [[R3]], +; LITENDIAN: .size v8f16_to_v16i8 + +; BIGENDIAN: v8f16_to_v16i8: +; BIGENDIAN: ld.h [[R1:\$w[0-9]+]], +; BIGENDIAN: shf.b [[R3:\$w[0-9]+]], [[R1]], 177 +; BIGENDIAN: addv.b [[R4:\$w[0-9]+]], [[R3]], [[R3]] +; BIGENDIAN: st.b [[R4]], +; BIGENDIAN: .size v8f16_to_v16i8 + +; We can't prevent the (bitcast (load X)) DAG Combine here because there +; are no operations for v8f16 to put in the way. 
+define void @v8f16_to_v8i16(<8 x half>* %src, <8 x i16>* %dst) nounwind { +entry: + %0 = load volatile <8 x half>* %src + %1 = bitcast <8 x half> %0 to <8 x i16> + %2 = tail call <8 x i16> @llvm.mips.addv.h(<8 x i16> %1, <8 x i16> %1) + store <8 x i16> %2, <8 x i16>* %dst + ret void +} + +; LITENDIAN: v8f16_to_v8i16: +; LITENDIAN: ld.h [[R1:\$w[0-9]+]], +; LITENDIAN: addv.h [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; LITENDIAN: st.h [[R2]], +; LITENDIAN: .size v8f16_to_v8i16 + +; BIGENDIAN: v8f16_to_v8i16: +; BIGENDIAN: ld.h [[R1:\$w[0-9]+]], +; BIGENDIAN: addv.h [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; BIGENDIAN: st.h [[R2]], +; BIGENDIAN: .size v8f16_to_v8i16 + +; We can't prevent the (bitcast (load X)) DAG Combine here because there +; are no operations for v8f16 to put in the way. +; We can't prevent the (store (bitcast X), Y) DAG Combine here because there +; are no operations for v8f16 to put in the way. +define void @v8f16_to_v8f16(<8 x half>* %src, <8 x half>* %dst) nounwind { +entry: + %0 = load volatile <8 x half>* %src + %1 = bitcast <8 x half> %0 to <8 x half> + store <8 x half> %1, <8 x half>* %dst + ret void +} + +; LITENDIAN: v8f16_to_v8f16: +; LITENDIAN: ld.h [[R1:\$w[0-9]+]], +; LITENDIAN: st.h [[R1]], +; LITENDIAN: .size v8f16_to_v8f16 + +; BIGENDIAN: v8f16_to_v8f16: +; BIGENDIAN: ld.h [[R1:\$w[0-9]+]], +; BIGENDIAN: st.h [[R1]], +; BIGENDIAN: .size v8f16_to_v8f16 + +; We can't prevent the (bitcast (load X)) DAG Combine here because there +; are no operations for v8f16 to put in the way. 
+define void @v8f16_to_v4i32(<8 x half>* %src, <4 x i32>* %dst) nounwind { +entry: + %0 = load volatile <8 x half>* %src + %1 = bitcast <8 x half> %0 to <4 x i32> + %2 = tail call <4 x i32> @llvm.mips.addv.w(<4 x i32> %1, <4 x i32> %1) + store <4 x i32> %2, <4 x i32>* %dst + ret void +} + +; LITENDIAN: v8f16_to_v4i32: +; LITENDIAN: ld.h [[R1:\$w[0-9]+]], +; LITENDIAN: addv.w [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; LITENDIAN: st.w [[R2]], +; LITENDIAN: .size v8f16_to_v4i32 + +; BIGENDIAN: v8f16_to_v4i32: +; BIGENDIAN: ld.h [[R1:\$w[0-9]+]], +; BIGENDIAN: shf.h [[R2:\$w[0-9]+]], [[R1]], 177 +; BIGENDIAN: addv.w [[R3:\$w[0-9]+]], [[R2]], [[R2]] +; BIGENDIAN: st.w [[R3]], +; BIGENDIAN: .size v8f16_to_v4i32 + +; We can't prevent the (bitcast (load X)) DAG Combine here because there +; are no operations for v8f16 to put in the way. +define void @v8f16_to_v4f32(<8 x half>* %src, <4 x float>* %dst) nounwind { +entry: + %0 = load volatile <8 x half>* %src + %1 = bitcast <8 x half> %0 to <4 x float> + %2 = tail call <4 x float> @llvm.mips.fadd.w(<4 x float> %1, <4 x float> %1) + store <4 x float> %2, <4 x float>* %dst + ret void +} + +; LITENDIAN: v8f16_to_v4f32: +; LITENDIAN: ld.h [[R1:\$w[0-9]+]], +; LITENDIAN: fadd.w [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; LITENDIAN: st.w [[R2]], +; LITENDIAN: .size v8f16_to_v4f32 + +; BIGENDIAN: v8f16_to_v4f32: +; BIGENDIAN: ld.h [[R1:\$w[0-9]+]], +; BIGENDIAN: shf.h [[R2:\$w[0-9]+]], [[R1]], 177 +; BIGENDIAN: fadd.w [[R3:\$w[0-9]+]], [[R2]], [[R2]] +; BIGENDIAN: st.w [[R3]], +; BIGENDIAN: .size v8f16_to_v4f32 + +; We can't prevent the (bitcast (load X)) DAG Combine here because there +; are no operations for v8f16 to put in the way. 
+define void @v8f16_to_v2i64(<8 x half>* %src, <2 x i64>* %dst) nounwind { +entry: + %0 = load volatile <8 x half>* %src + %1 = bitcast <8 x half> %0 to <2 x i64> + %2 = tail call <2 x i64> @llvm.mips.addv.d(<2 x i64> %1, <2 x i64> %1) + store <2 x i64> %2, <2 x i64>* %dst + ret void +} + +; LITENDIAN: v8f16_to_v2i64: +; LITENDIAN: ld.h [[R1:\$w[0-9]+]], +; LITENDIAN: addv.d [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; LITENDIAN: st.d [[R2]], +; LITENDIAN: .size v8f16_to_v2i64 + +; BIGENDIAN: v8f16_to_v2i64: +; BIGENDIAN: ld.h [[R1:\$w[0-9]+]], +; BIGENDIAN: shf.h [[R2:\$w[0-9]+]], [[R1]], 27 +; BIGENDIAN: addv.d [[R3:\$w[0-9]+]], [[R2]], [[R2]] +; BIGENDIAN: st.d [[R3]], +; BIGENDIAN: .size v8f16_to_v2i64 + +; We can't prevent the (bitcast (load X)) DAG Combine here because there +; are no operations for v8f16 to put in the way. +define void @v8f16_to_v2f64(<8 x half>* %src, <2 x double>* %dst) nounwind { +entry: + %0 = load volatile <8 x half>* %src + %1 = bitcast <8 x half> %0 to <2 x double> + %2 = tail call <2 x double> @llvm.mips.fadd.d(<2 x double> %1, <2 x double> %1) + store <2 x double> %2, <2 x double>* %dst + ret void +} + +; LITENDIAN: v8f16_to_v2f64: +; LITENDIAN: ld.h [[R1:\$w[0-9]+]], +; LITENDIAN: fadd.d [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; LITENDIAN: st.d [[R2]], +; LITENDIAN: .size v8f16_to_v2f64 + +; BIGENDIAN: v8f16_to_v2f64: +; BIGENDIAN: ld.h [[R1:\$w[0-9]+]], +; BIGENDIAN: shf.h [[R2:\$w[0-9]+]], [[R1]], 27 +; BIGENDIAN: fadd.d [[R3:\$w[0-9]+]], [[R2]], [[R2]] +; BIGENDIAN: st.d [[R3]], +; BIGENDIAN: .size v8f16_to_v2f64 +;---- + +define void @v4i32_to_v16i8(<4 x i32>* %src, <16 x i8>* %dst) nounwind { +entry: + %0 = load volatile <4 x i32>* %src + %1 = tail call <4 x i32> @llvm.mips.addv.w(<4 x i32> %0, <4 x i32> %0) + %2 = bitcast <4 x i32> %1 to <16 x i8> + %3 = tail call <16 x i8> @llvm.mips.addv.b(<16 x i8> %2, <16 x i8> %2) + store <16 x i8> %3, <16 x i8>* %dst + ret void +} + +; LITENDIAN: v4i32_to_v16i8: +; LITENDIAN: ld.w [[R1:\$w[0-9]+]], +; 
LITENDIAN: addv.w [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; LITENDIAN: addv.b [[R3:\$w[0-9]+]], [[R2]], [[R2]] +; LITENDIAN: st.b [[R3]], +; LITENDIAN: .size v4i32_to_v16i8 + +; BIGENDIAN: v4i32_to_v16i8: +; BIGENDIAN: ld.w [[R1:\$w[0-9]+]], +; BIGENDIAN: addv.w [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; BIGENDIAN: shf.b [[R3:\$w[0-9]+]], [[R2]], 27 +; BIGENDIAN: addv.b [[R4:\$w[0-9]+]], [[R3]], [[R3]] +; BIGENDIAN: st.b [[R4]], +; BIGENDIAN: .size v4i32_to_v16i8 + +define void @v4i32_to_v8i16(<4 x i32>* %src, <8 x i16>* %dst) nounwind { +entry: + %0 = load volatile <4 x i32>* %src + %1 = tail call <4 x i32> @llvm.mips.addv.w(<4 x i32> %0, <4 x i32> %0) + %2 = bitcast <4 x i32> %1 to <8 x i16> + %3 = tail call <8 x i16> @llvm.mips.addv.h(<8 x i16> %2, <8 x i16> %2) + store <8 x i16> %3, <8 x i16>* %dst + ret void +} + +; LITENDIAN: v4i32_to_v8i16: +; LITENDIAN: ld.w [[R1:\$w[0-9]+]], +; LITENDIAN: addv.w [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; LITENDIAN: addv.h [[R3:\$w[0-9]+]], [[R2]], [[R2]] +; LITENDIAN: st.h [[R3]], +; LITENDIAN: .size v4i32_to_v8i16 + +; BIGENDIAN: v4i32_to_v8i16: +; BIGENDIAN: ld.w [[R1:\$w[0-9]+]], +; BIGENDIAN: addv.w [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; BIGENDIAN: shf.h [[R3:\$w[0-9]+]], [[R2]], 177 +; BIGENDIAN: addv.h [[R4:\$w[0-9]+]], [[R3]], [[R3]] +; BIGENDIAN: st.h [[R4]], +; BIGENDIAN: .size v4i32_to_v8i16 + +; We can't prevent the (store (bitcast X), Y) DAG Combine here because there +; are no operations for v8f16 to put in the way. 
+define void @v4i32_to_v8f16(<4 x i32>* %src, <8 x half>* %dst) nounwind { +entry: + %0 = load volatile <4 x i32>* %src + %1 = tail call <4 x i32> @llvm.mips.addv.w(<4 x i32> %0, <4 x i32> %0) + %2 = bitcast <4 x i32> %1 to <8 x half> + store <8 x half> %2, <8 x half>* %dst + ret void +} + +; LITENDIAN: v4i32_to_v8f16: +; LITENDIAN: ld.w [[R1:\$w[0-9]+]], +; LITENDIAN: addv.w [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; LITENDIAN: st.w [[R2]], +; LITENDIAN: .size v4i32_to_v8f16 + +; BIGENDIAN: v4i32_to_v8f16: +; BIGENDIAN: ld.w [[R1:\$w[0-9]+]], +; BIGENDIAN: addv.w [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; BIGENDIAN: st.w [[R2]], +; BIGENDIAN: .size v4i32_to_v8f16 + +define void @v4i32_to_v4i32(<4 x i32>* %src, <4 x i32>* %dst) nounwind { +entry: + %0 = load volatile <4 x i32>* %src + %1 = tail call <4 x i32> @llvm.mips.addv.w(<4 x i32> %0, <4 x i32> %0) + %2 = bitcast <4 x i32> %1 to <4 x i32> + %3 = tail call <4 x i32> @llvm.mips.addv.w(<4 x i32> %2, <4 x i32> %2) + store <4 x i32> %3, <4 x i32>* %dst + ret void +} + +; LITENDIAN: v4i32_to_v4i32: +; LITENDIAN: ld.w [[R1:\$w[0-9]+]], +; LITENDIAN: addv.w [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; LITENDIAN: addv.w [[R3:\$w[0-9]+]], [[R2]], [[R2]] +; LITENDIAN: st.w [[R3]], +; LITENDIAN: .size v4i32_to_v4i32 + +; BIGENDIAN: v4i32_to_v4i32: +; BIGENDIAN: ld.w [[R1:\$w[0-9]+]], +; BIGENDIAN: addv.w [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; BIGENDIAN: addv.w [[R3:\$w[0-9]+]], [[R2]], [[R2]] +; BIGENDIAN: st.w [[R3]], +; BIGENDIAN: .size v4i32_to_v4i32 + +define void @v4i32_to_v4f32(<4 x i32>* %src, <4 x float>* %dst) nounwind { +entry: + %0 = load volatile <4 x i32>* %src + %1 = tail call <4 x i32> @llvm.mips.addv.w(<4 x i32> %0, <4 x i32> %0) + %2 = bitcast <4 x i32> %1 to <4 x float> + %3 = tail call <4 x float> @llvm.mips.fadd.w(<4 x float> %2, <4 x float> %2) + store <4 x float> %3, <4 x float>* %dst + ret void +} + +; LITENDIAN: v4i32_to_v4f32: +; LITENDIAN: ld.w [[R1:\$w[0-9]+]], +; LITENDIAN: addv.w [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; 
LITENDIAN: fadd.w [[R3:\$w[0-9]+]], [[R2]], [[R2]] +; LITENDIAN: st.w [[R3]], +; LITENDIAN: .size v4i32_to_v4f32 + +; BIGENDIAN: v4i32_to_v4f32: +; BIGENDIAN: ld.w [[R1:\$w[0-9]+]], +; BIGENDIAN: addv.w [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; BIGENDIAN: fadd.w [[R3:\$w[0-9]+]], [[R2]], [[R2]] +; BIGENDIAN: st.w [[R3]], +; BIGENDIAN: .size v4i32_to_v4f32 + +define void @v4i32_to_v2i64(<4 x i32>* %src, <2 x i64>* %dst) nounwind { +entry: + %0 = load volatile <4 x i32>* %src + %1 = tail call <4 x i32> @llvm.mips.addv.w(<4 x i32> %0, <4 x i32> %0) + %2 = bitcast <4 x i32> %1 to <2 x i64> + %3 = tail call <2 x i64> @llvm.mips.addv.d(<2 x i64> %2, <2 x i64> %2) + store <2 x i64> %3, <2 x i64>* %dst + ret void +} + +; LITENDIAN: v4i32_to_v2i64: +; LITENDIAN: ld.w [[R1:\$w[0-9]+]], +; LITENDIAN: addv.w [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; LITENDIAN: addv.d [[R3:\$w[0-9]+]], [[R2]], [[R2]] +; LITENDIAN: st.d [[R3]], +; LITENDIAN: .size v4i32_to_v2i64 + +; BIGENDIAN: v4i32_to_v2i64: +; BIGENDIAN: ld.w [[R1:\$w[0-9]+]], +; BIGENDIAN: addv.w [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; BIGENDIAN: shf.w [[R3:\$w[0-9]+]], [[R2]], 177 +; BIGENDIAN: addv.d [[R4:\$w[0-9]+]], [[R3]], [[R3]] +; BIGENDIAN: st.d [[R4]], +; BIGENDIAN: .size v4i32_to_v2i64 + +define void @v4i32_to_v2f64(<4 x i32>* %src, <2 x double>* %dst) nounwind { +entry: + %0 = load volatile <4 x i32>* %src + %1 = tail call <4 x i32> @llvm.mips.addv.w(<4 x i32> %0, <4 x i32> %0) + %2 = bitcast <4 x i32> %1 to <2 x double> + %3 = tail call <2 x double> @llvm.mips.fadd.d(<2 x double> %2, <2 x double> %2) + store <2 x double> %3, <2 x double>* %dst + ret void +} + +; LITENDIAN: v4i32_to_v2f64: +; LITENDIAN: ld.w [[R1:\$w[0-9]+]], +; LITENDIAN: addv.w [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; LITENDIAN: fadd.d [[R3:\$w[0-9]+]], [[R2]], [[R2]] +; LITENDIAN: st.d [[R3]], +; LITENDIAN: .size v4i32_to_v2f64 + +; BIGENDIAN: v4i32_to_v2f64: +; BIGENDIAN: ld.w [[R1:\$w[0-9]+]], +; BIGENDIAN: addv.w [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; BIGENDIAN: 
shf.w [[R3:\$w[0-9]+]], [[R2]], 177 +; BIGENDIAN: fadd.d [[R4:\$w[0-9]+]], [[R3]], [[R3]] +; BIGENDIAN: st.d [[R4]], +; BIGENDIAN: .size v4i32_to_v2f64 + +define void @v4f32_to_v16i8(<4 x float>* %src, <16 x i8>* %dst) nounwind { +entry: + %0 = load volatile <4 x float>* %src + %1 = tail call <4 x float> @llvm.mips.fadd.w(<4 x float> %0, <4 x float> %0) + %2 = bitcast <4 x float> %1 to <16 x i8> + %3 = tail call <16 x i8> @llvm.mips.addv.b(<16 x i8> %2, <16 x i8> %2) + store <16 x i8> %3, <16 x i8>* %dst + ret void +} + +; LITENDIAN: v4f32_to_v16i8: +; LITENDIAN: ld.w [[R1:\$w[0-9]+]], +; LITENDIAN: fadd.w [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; LITENDIAN: addv.b [[R3:\$w[0-9]+]], [[R2]], [[R2]] +; LITENDIAN: st.b [[R3]], +; LITENDIAN: .size v4f32_to_v16i8 + +; BIGENDIAN: v4f32_to_v16i8: +; BIGENDIAN: ld.w [[R1:\$w[0-9]+]], +; BIGENDIAN: fadd.w [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; BIGENDIAN: shf.b [[R3:\$w[0-9]+]], [[R2]], 27 +; BIGENDIAN: addv.b [[R4:\$w[0-9]+]], [[R3]], [[R3]] +; BIGENDIAN: st.b [[R4]], +; BIGENDIAN: .size v4f32_to_v16i8 + +define void @v4f32_to_v8i16(<4 x float>* %src, <8 x i16>* %dst) nounwind { +entry: + %0 = load volatile <4 x float>* %src + %1 = tail call <4 x float> @llvm.mips.fadd.w(<4 x float> %0, <4 x float> %0) + %2 = bitcast <4 x float> %1 to <8 x i16> + %3 = tail call <8 x i16> @llvm.mips.addv.h(<8 x i16> %2, <8 x i16> %2) + store <8 x i16> %3, <8 x i16>* %dst + ret void +} + +; LITENDIAN: v4f32_to_v8i16: +; LITENDIAN: ld.w [[R1:\$w[0-9]+]], +; LITENDIAN: fadd.w [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; LITENDIAN: addv.h [[R3:\$w[0-9]+]], [[R2]], [[R2]] +; LITENDIAN: st.h [[R3]], +; LITENDIAN: .size v4f32_to_v8i16 + +; BIGENDIAN: v4f32_to_v8i16: +; BIGENDIAN: ld.w [[R1:\$w[0-9]+]], +; BIGENDIAN: fadd.w [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; BIGENDIAN: shf.h [[R3:\$w[0-9]+]], [[R2]], 177 +; BIGENDIAN: addv.h [[R4:\$w[0-9]+]], [[R3]], [[R3]] +; BIGENDIAN: st.h [[R4]], +; BIGENDIAN: .size v4f32_to_v8i16 + +; We can't prevent the (store (bitcast X), Y) 
DAG Combine here because there +; are no operations for v8f16 to put in the way. +define void @v4f32_to_v8f16(<4 x float>* %src, <8 x half>* %dst) nounwind { +entry: + %0 = load volatile <4 x float>* %src + %1 = tail call <4 x float> @llvm.mips.fadd.w(<4 x float> %0, <4 x float> %0) + %2 = bitcast <4 x float> %1 to <8 x half> + store <8 x half> %2, <8 x half>* %dst + ret void +} + +; LITENDIAN: v4f32_to_v8f16: +; LITENDIAN: ld.w [[R1:\$w[0-9]+]], +; LITENDIAN: fadd.w [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; LITENDIAN: st.w [[R2]], +; LITENDIAN: .size v4f32_to_v8f16 + +; BIGENDIAN: v4f32_to_v8f16: +; BIGENDIAN: ld.w [[R1:\$w[0-9]+]], +; BIGENDIAN: fadd.w [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; BIGENDIAN: st.w [[R2]], +; BIGENDIAN: .size v4f32_to_v8f16 + +define void @v4f32_to_v4i32(<4 x float>* %src, <4 x i32>* %dst) nounwind { +entry: + %0 = load volatile <4 x float>* %src + %1 = tail call <4 x float> @llvm.mips.fadd.w(<4 x float> %0, <4 x float> %0) + %2 = bitcast <4 x float> %1 to <4 x i32> + %3 = tail call <4 x i32> @llvm.mips.addv.w(<4 x i32> %2, <4 x i32> %2) + store <4 x i32> %3, <4 x i32>* %dst + ret void +} + +; LITENDIAN: v4f32_to_v4i32: +; LITENDIAN: ld.w [[R1:\$w[0-9]+]], +; LITENDIAN: fadd.w [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; LITENDIAN: addv.w [[R3:\$w[0-9]+]], [[R2]], [[R2]] +; LITENDIAN: st.w [[R3]], +; LITENDIAN: .size v4f32_to_v4i32 + +; BIGENDIAN: v4f32_to_v4i32: +; BIGENDIAN: ld.w [[R1:\$w[0-9]+]], +; BIGENDIAN: fadd.w [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; BIGENDIAN: addv.w [[R3:\$w[0-9]+]], [[R2]], [[R2]] +; BIGENDIAN: st.w [[R3]], +; BIGENDIAN: .size v4f32_to_v4i32 + +define void @v4f32_to_v4f32(<4 x float>* %src, <4 x float>* %dst) nounwind { +entry: + %0 = load volatile <4 x float>* %src + %1 = tail call <4 x float> @llvm.mips.fadd.w(<4 x float> %0, <4 x float> %0) + %2 = bitcast <4 x float> %1 to <4 x float> + %3 = tail call <4 x float> @llvm.mips.fadd.w(<4 x float> %2, <4 x float> %2) + store <4 x float> %3, <4 x float>* %dst + ret void +} + +; 
LITENDIAN: v4f32_to_v4f32: +; LITENDIAN: ld.w [[R1:\$w[0-9]+]], +; LITENDIAN: fadd.w [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; LITENDIAN: fadd.w [[R3:\$w[0-9]+]], [[R2]], [[R2]] +; LITENDIAN: st.w [[R3]], +; LITENDIAN: .size v4f32_to_v4f32 + +; BIGENDIAN: v4f32_to_v4f32: +; BIGENDIAN: ld.w [[R1:\$w[0-9]+]], +; BIGENDIAN: fadd.w [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; BIGENDIAN: fadd.w [[R3:\$w[0-9]+]], [[R2]], [[R2]] +; BIGENDIAN: st.w [[R3]], +; BIGENDIAN: .size v4f32_to_v4f32 + +define void @v4f32_to_v2i64(<4 x float>* %src, <2 x i64>* %dst) nounwind { +entry: + %0 = load volatile <4 x float>* %src + %1 = tail call <4 x float> @llvm.mips.fadd.w(<4 x float> %0, <4 x float> %0) + %2 = bitcast <4 x float> %1 to <2 x i64> + %3 = tail call <2 x i64> @llvm.mips.addv.d(<2 x i64> %2, <2 x i64> %2) + store <2 x i64> %3, <2 x i64>* %dst + ret void +} + +; LITENDIAN: v4f32_to_v2i64: +; LITENDIAN: ld.w [[R1:\$w[0-9]+]], +; LITENDIAN: fadd.w [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; LITENDIAN: addv.d [[R3:\$w[0-9]+]], [[R2]], [[R2]] +; LITENDIAN: st.d [[R3]], +; LITENDIAN: .size v4f32_to_v2i64 + +; BIGENDIAN: v4f32_to_v2i64: +; BIGENDIAN: ld.w [[R1:\$w[0-9]+]], +; BIGENDIAN: fadd.w [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; BIGENDIAN: shf.w [[R3:\$w[0-9]+]], [[R2]], 177 +; BIGENDIAN: addv.d [[R4:\$w[0-9]+]], [[R3]], [[R3]] +; BIGENDIAN: st.d [[R4]], +; BIGENDIAN: .size v4f32_to_v2i64 + +define void @v4f32_to_v2f64(<4 x float>* %src, <2 x double>* %dst) nounwind { +entry: + %0 = load volatile <4 x float>* %src + %1 = tail call <4 x float> @llvm.mips.fadd.w(<4 x float> %0, <4 x float> %0) + %2 = bitcast <4 x float> %1 to <2 x double> + %3 = tail call <2 x double> @llvm.mips.fadd.d(<2 x double> %2, <2 x double> %2) + store <2 x double> %3, <2 x double>* %dst + ret void +} + +; LITENDIAN: v4f32_to_v2f64: +; LITENDIAN: ld.w [[R1:\$w[0-9]+]], +; LITENDIAN: fadd.w [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; LITENDIAN: fadd.d [[R3:\$w[0-9]+]], [[R2]], [[R2]] +; LITENDIAN: st.d [[R3]], +; LITENDIAN: .size 
v4f32_to_v2f64 + +; BIGENDIAN: v4f32_to_v2f64: +; BIGENDIAN: ld.w [[R1:\$w[0-9]+]], +; BIGENDIAN: fadd.w [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; BIGENDIAN: shf.w [[R3:\$w[0-9]+]], [[R2]], 177 +; BIGENDIAN: fadd.d [[R4:\$w[0-9]+]], [[R3]], [[R3]] +; BIGENDIAN: st.d [[R4]], +; BIGENDIAN: .size v4f32_to_v2f64 + +define void @v2i64_to_v16i8(<2 x i64>* %src, <16 x i8>* %dst) nounwind { +entry: + %0 = load volatile <2 x i64>* %src + %1 = tail call <2 x i64> @llvm.mips.addv.d(<2 x i64> %0, <2 x i64> %0) + %2 = bitcast <2 x i64> %1 to <16 x i8> + %3 = tail call <16 x i8> @llvm.mips.addv.b(<16 x i8> %2, <16 x i8> %2) + store <16 x i8> %3, <16 x i8>* %dst + ret void +} + +; LITENDIAN: v2i64_to_v16i8: +; LITENDIAN: ld.d [[R1:\$w[0-9]+]], +; LITENDIAN: addv.d [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; LITENDIAN: addv.b [[R3:\$w[0-9]+]], [[R2]], [[R2]] +; LITENDIAN: st.b [[R3]], +; LITENDIAN: .size v2i64_to_v16i8 + +; BIGENDIAN: v2i64_to_v16i8: +; BIGENDIAN: ld.d [[R1:\$w[0-9]+]], +; BIGENDIAN: addv.d [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; BIGENDIAN: shf.b [[R3:\$w[0-9]+]], [[R2]], 27 +; BIGENDIAN: shf.w [[R4:\$w[0-9]+]], [[R3]], 177 +; BIGENDIAN: addv.b [[R5:\$w[0-9]+]], [[R4]], [[R4]] +; BIGENDIAN: st.b [[R5]], +; BIGENDIAN: .size v2i64_to_v16i8 + +define void @v2i64_to_v8i16(<2 x i64>* %src, <8 x i16>* %dst) nounwind { +entry: + %0 = load volatile <2 x i64>* %src + %1 = tail call <2 x i64> @llvm.mips.addv.d(<2 x i64> %0, <2 x i64> %0) + %2 = bitcast <2 x i64> %1 to <8 x i16> + %3 = tail call <8 x i16> @llvm.mips.addv.h(<8 x i16> %2, <8 x i16> %2) + store <8 x i16> %3, <8 x i16>* %dst + ret void +} + +; LITENDIAN: v2i64_to_v8i16: +; LITENDIAN: ld.d [[R1:\$w[0-9]+]], +; LITENDIAN: addv.d [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; LITENDIAN: addv.h [[R3:\$w[0-9]+]], [[R2]], [[R2]] +; LITENDIAN: st.h [[R3]], +; LITENDIAN: .size v2i64_to_v8i16 + +; BIGENDIAN: v2i64_to_v8i16: +; BIGENDIAN: ld.d [[R1:\$w[0-9]+]], +; BIGENDIAN: addv.d [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; BIGENDIAN: shf.h 
[[R3:\$w[0-9]+]], [[R2]], 27 +; BIGENDIAN: addv.h [[R4:\$w[0-9]+]], [[R3]], [[R3]] +; BIGENDIAN: st.h [[R4]], +; BIGENDIAN: .size v2i64_to_v8i16 + +; We can't prevent the (store (bitcast X), Y) DAG Combine here because there +; are no operations for v8f16 to put in the way. +define void @v2i64_to_v8f16(<2 x i64>* %src, <8 x half>* %dst) nounwind { +entry: + %0 = load volatile <2 x i64>* %src + %1 = tail call <2 x i64> @llvm.mips.addv.d(<2 x i64> %0, <2 x i64> %0) + %2 = bitcast <2 x i64> %1 to <8 x half> + store <8 x half> %2, <8 x half>* %dst + ret void +} + +; LITENDIAN: v2i64_to_v8f16: +; LITENDIAN: ld.d [[R1:\$w[0-9]+]], +; LITENDIAN: addv.d [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; LITENDIAN: st.d [[R2]], +; LITENDIAN: .size v2i64_to_v8f16 + +; BIGENDIAN: v2i64_to_v8f16: +; BIGENDIAN: ld.d [[R1:\$w[0-9]+]], +; BIGENDIAN: addv.d [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; BIGENDIAN: st.d [[R2]], +; BIGENDIAN: .size v2i64_to_v8f16 + +define void @v2i64_to_v4i32(<2 x i64>* %src, <4 x i32>* %dst) nounwind { +entry: + %0 = load volatile <2 x i64>* %src + %1 = tail call <2 x i64> @llvm.mips.addv.d(<2 x i64> %0, <2 x i64> %0) + %2 = bitcast <2 x i64> %1 to <4 x i32> + %3 = tail call <4 x i32> @llvm.mips.addv.w(<4 x i32> %2, <4 x i32> %2) + store <4 x i32> %3, <4 x i32>* %dst + ret void +} + +; LITENDIAN: v2i64_to_v4i32: +; LITENDIAN: ld.d [[R1:\$w[0-9]+]], +; LITENDIAN: addv.d [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; LITENDIAN: addv.w [[R3:\$w[0-9]+]], [[R2]], [[R2]] +; LITENDIAN: st.w [[R3]], +; LITENDIAN: .size v2i64_to_v4i32 + +; BIGENDIAN: v2i64_to_v4i32: +; BIGENDIAN: ld.d [[R1:\$w[0-9]+]], +; BIGENDIAN: addv.d [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; BIGENDIAN: shf.w [[R3:\$w[0-9]+]], [[R2]], 177 +; BIGENDIAN: addv.w [[R4:\$w[0-9]+]], [[R3]], [[R3]] +; BIGENDIAN: st.w [[R4]], +; BIGENDIAN: .size v2i64_to_v4i32 + +define void @v2i64_to_v4f32(<2 x i64>* %src, <4 x float>* %dst) nounwind { +entry: + %0 = load volatile <2 x i64>* %src + %1 = tail call <2 x i64> @llvm.mips.addv.d(<2 x i64> 
%0, <2 x i64> %0) + %2 = bitcast <2 x i64> %1 to <4 x float> + %3 = tail call <4 x float> @llvm.mips.fadd.w(<4 x float> %2, <4 x float> %2) + store <4 x float> %3, <4 x float>* %dst + ret void +} + +; LITENDIAN: v2i64_to_v4f32: +; LITENDIAN: ld.d [[R1:\$w[0-9]+]], +; LITENDIAN: addv.d [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; LITENDIAN: fadd.w [[R3:\$w[0-9]+]], [[R2]], [[R2]] +; LITENDIAN: st.w [[R3]], +; LITENDIAN: .size v2i64_to_v4f32 + +; BIGENDIAN: v2i64_to_v4f32: +; BIGENDIAN: ld.d [[R1:\$w[0-9]+]], +; BIGENDIAN: addv.d [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; BIGENDIAN: shf.w [[R3:\$w[0-9]+]], [[R2]], 177 +; BIGENDIAN: fadd.w [[R4:\$w[0-9]+]], [[R3]], [[R3]] +; BIGENDIAN: st.w [[R4]], +; BIGENDIAN: .size v2i64_to_v4f32 + +define void @v2i64_to_v2i64(<2 x i64>* %src, <2 x i64>* %dst) nounwind { +entry: + %0 = load volatile <2 x i64>* %src + %1 = tail call <2 x i64> @llvm.mips.addv.d(<2 x i64> %0, <2 x i64> %0) + %2 = bitcast <2 x i64> %1 to <2 x i64> + %3 = tail call <2 x i64> @llvm.mips.addv.d(<2 x i64> %2, <2 x i64> %2) + store <2 x i64> %3, <2 x i64>* %dst + ret void +} + +; LITENDIAN: v2i64_to_v2i64: +; LITENDIAN: ld.d [[R1:\$w[0-9]+]], +; LITENDIAN: addv.d [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; LITENDIAN: addv.d [[R3:\$w[0-9]+]], [[R2]], [[R2]] +; LITENDIAN: st.d [[R3]], +; LITENDIAN: .size v2i64_to_v2i64 + +; BIGENDIAN: v2i64_to_v2i64: +; BIGENDIAN: ld.d [[R1:\$w[0-9]+]], +; BIGENDIAN: addv.d [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; BIGENDIAN: addv.d [[R3:\$w[0-9]+]], [[R2]], [[R2]] +; BIGENDIAN: st.d [[R3]], +; BIGENDIAN: .size v2i64_to_v2i64 + +define void @v2i64_to_v2f64(<2 x i64>* %src, <2 x double>* %dst) nounwind { +entry: + %0 = load volatile <2 x i64>* %src + %1 = tail call <2 x i64> @llvm.mips.addv.d(<2 x i64> %0, <2 x i64> %0) + %2 = bitcast <2 x i64> %1 to <2 x double> + %3 = tail call <2 x double> @llvm.mips.fadd.d(<2 x double> %2, <2 x double> %2) + store <2 x double> %3, <2 x double>* %dst + ret void +} + +; LITENDIAN: v2i64_to_v2f64: +; LITENDIAN: ld.d 
[[R1:\$w[0-9]+]], +; LITENDIAN: addv.d [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; LITENDIAN: fadd.d [[R3:\$w[0-9]+]], [[R2]], [[R2]] +; LITENDIAN: st.d [[R3]], +; LITENDIAN: .size v2i64_to_v2f64 + +; BIGENDIAN: v2i64_to_v2f64: +; BIGENDIAN: ld.d [[R1:\$w[0-9]+]], +; BIGENDIAN: addv.d [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; BIGENDIAN: fadd.d [[R3:\$w[0-9]+]], [[R2]], [[R2]] +; BIGENDIAN: st.d [[R3]], +; BIGENDIAN: .size v2i64_to_v2f64 + +define void @v2f64_to_v16i8(<2 x double>* %src, <16 x i8>* %dst) nounwind { +entry: + %0 = load volatile <2 x double>* %src + %1 = tail call <2 x double> @llvm.mips.fadd.d(<2 x double> %0, <2 x double> %0) + %2 = bitcast <2 x double> %1 to <16 x i8> + %3 = tail call <16 x i8> @llvm.mips.addv.b(<16 x i8> %2, <16 x i8> %2) + store <16 x i8> %3, <16 x i8>* %dst + ret void +} + +; LITENDIAN: v2f64_to_v16i8: +; LITENDIAN: ld.d [[R1:\$w[0-9]+]], +; LITENDIAN: fadd.d [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; LITENDIAN: addv.b [[R3:\$w[0-9]+]], [[R2]], [[R2]] +; LITENDIAN: st.b [[R3]], +; LITENDIAN: .size v2f64_to_v16i8 + +; BIGENDIAN: v2f64_to_v16i8: +; BIGENDIAN: ld.d [[R1:\$w[0-9]+]], +; BIGENDIAN: fadd.d [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; BIGENDIAN: shf.b [[R3:\$w[0-9]+]], [[R2]], 27 +; BIGENDIAN: shf.w [[R4:\$w[0-9]+]], [[R3]], 177 +; BIGENDIAN: addv.b [[R5:\$w[0-9]+]], [[R4]], [[R4]] +; BIGENDIAN: st.b [[R5]], +; BIGENDIAN: .size v2f64_to_v16i8 + +define void @v2f64_to_v8i16(<2 x double>* %src, <8 x i16>* %dst) nounwind { +entry: + %0 = load volatile <2 x double>* %src + %1 = tail call <2 x double> @llvm.mips.fadd.d(<2 x double> %0, <2 x double> %0) + %2 = bitcast <2 x double> %1 to <8 x i16> + %3 = tail call <8 x i16> @llvm.mips.addv.h(<8 x i16> %2, <8 x i16> %2) + store <8 x i16> %3, <8 x i16>* %dst + ret void +} + +; LITENDIAN: v2f64_to_v8i16: +; LITENDIAN: ld.d [[R1:\$w[0-9]+]], +; LITENDIAN: fadd.d [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; LITENDIAN: addv.h [[R3:\$w[0-9]+]], [[R2]], [[R2]] +; LITENDIAN: st.h [[R3]], +; LITENDIAN: .size v2f64_to_v8i16 
+ +; BIGENDIAN: v2f64_to_v8i16: +; BIGENDIAN: ld.d [[R1:\$w[0-9]+]], +; BIGENDIAN: fadd.d [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; BIGENDIAN: shf.h [[R3:\$w[0-9]+]], [[R2]], 27 +; BIGENDIAN: addv.h [[R4:\$w[0-9]+]], [[R3]], [[R3]] +; BIGENDIAN: st.h [[R4]], +; BIGENDIAN: .size v2f64_to_v8i16 + +; We can't prevent the (store (bitcast X), Y) DAG Combine here because there +; are no operations for v8f16 to put in the way. +define void @v2f64_to_v8f16(<2 x double>* %src, <8 x half>* %dst) nounwind { +entry: + %0 = load volatile <2 x double>* %src + %1 = tail call <2 x double> @llvm.mips.fadd.d(<2 x double> %0, <2 x double> %0) + %2 = bitcast <2 x double> %1 to <8 x half> + store <8 x half> %2, <8 x half>* %dst + ret void +} + +; LITENDIAN: v2f64_to_v8f16: +; LITENDIAN: ld.d [[R1:\$w[0-9]+]], +; LITENDIAN: fadd.d [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; LITENDIAN: st.d [[R2]], +; LITENDIAN: .size v2f64_to_v8f16 + +; BIGENDIAN: v2f64_to_v8f16: +; BIGENDIAN: ld.d [[R1:\$w[0-9]+]], +; BIGENDIAN: fadd.d [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; BIGENDIAN: st.d [[R2]], +; BIGENDIAN: .size v2f64_to_v8f16 + +define void @v2f64_to_v4i32(<2 x double>* %src, <4 x i32>* %dst) nounwind { +entry: + %0 = load volatile <2 x double>* %src + %1 = tail call <2 x double> @llvm.mips.fadd.d(<2 x double> %0, <2 x double> %0) + %2 = bitcast <2 x double> %1 to <4 x i32> + %3 = tail call <4 x i32> @llvm.mips.addv.w(<4 x i32> %2, <4 x i32> %2) + store <4 x i32> %3, <4 x i32>* %dst + ret void +} + +; LITENDIAN: v2f64_to_v4i32: +; LITENDIAN: ld.d [[R1:\$w[0-9]+]], +; LITENDIAN: fadd.d [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; LITENDIAN: addv.w [[R3:\$w[0-9]+]], [[R2]], [[R2]] +; LITENDIAN: st.w [[R3]], +; LITENDIAN: .size v2f64_to_v4i32 + +; BIGENDIAN: v2f64_to_v4i32: +; BIGENDIAN: ld.d [[R1:\$w[0-9]+]], +; BIGENDIAN: fadd.d [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; BIGENDIAN: shf.w [[R3:\$w[0-9]+]], [[R2]], 177 +; BIGENDIAN: addv.w [[R4:\$w[0-9]+]], [[R3]], [[R3]] +; BIGENDIAN: st.w [[R4]], +; BIGENDIAN: .size 
v2f64_to_v4i32 + +define void @v2f64_to_v4f32(<2 x double>* %src, <4 x float>* %dst) nounwind { +entry: + %0 = load volatile <2 x double>* %src + %1 = tail call <2 x double> @llvm.mips.fadd.d(<2 x double> %0, <2 x double> %0) + %2 = bitcast <2 x double> %1 to <4 x float> + %3 = tail call <4 x float> @llvm.mips.fadd.w(<4 x float> %2, <4 x float> %2) + store <4 x float> %3, <4 x float>* %dst + ret void +} + +; LITENDIAN: v2f64_to_v4f32: +; LITENDIAN: ld.d [[R1:\$w[0-9]+]], +; LITENDIAN: fadd.d [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; LITENDIAN: fadd.w [[R3:\$w[0-9]+]], [[R2]], [[R2]] +; LITENDIAN: st.w [[R3]], +; LITENDIAN: .size v2f64_to_v4f32 + +; BIGENDIAN: v2f64_to_v4f32: +; BIGENDIAN: ld.d [[R1:\$w[0-9]+]], +; BIGENDIAN: fadd.d [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; BIGENDIAN: shf.w [[R3:\$w[0-9]+]], [[R2]], 177 +; BIGENDIAN: fadd.w [[R4:\$w[0-9]+]], [[R3]], [[R3]] +; BIGENDIAN: st.w [[R4]], +; BIGENDIAN: .size v2f64_to_v4f32 + +define void @v2f64_to_v2i64(<2 x double>* %src, <2 x i64>* %dst) nounwind { +entry: + %0 = load volatile <2 x double>* %src + %1 = tail call <2 x double> @llvm.mips.fadd.d(<2 x double> %0, <2 x double> %0) + %2 = bitcast <2 x double> %1 to <2 x i64> + %3 = tail call <2 x i64> @llvm.mips.addv.d(<2 x i64> %2, <2 x i64> %2) + store <2 x i64> %3, <2 x i64>* %dst + ret void +} + +; LITENDIAN: v2f64_to_v2i64: +; LITENDIAN: ld.d [[R1:\$w[0-9]+]], +; LITENDIAN: fadd.d [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; LITENDIAN: addv.d [[R3:\$w[0-9]+]], [[R2]], [[R2]] +; LITENDIAN: st.d [[R3]], +; LITENDIAN: .size v2f64_to_v2i64 + +; BIGENDIAN: v2f64_to_v2i64: +; BIGENDIAN: ld.d [[R1:\$w[0-9]+]], +; BIGENDIAN: fadd.d [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; BIGENDIAN: addv.d [[R3:\$w[0-9]+]], [[R2]], [[R2]] +; BIGENDIAN: st.d [[R3]], +; BIGENDIAN: .size v2f64_to_v2i64 + +define void @v2f64_to_v2f64(<2 x double>* %src, <2 x double>* %dst) nounwind { +entry: + %0 = load volatile <2 x double>* %src + %1 = tail call <2 x double> @llvm.mips.fadd.d(<2 x double> %0, <2 x double> 
%0) + %2 = bitcast <2 x double> %1 to <2 x double> + %3 = tail call <2 x double> @llvm.mips.fadd.d(<2 x double> %2, <2 x double> %2) + store <2 x double> %3, <2 x double>* %dst + ret void +} + +; LITENDIAN: v2f64_to_v2f64: +; LITENDIAN: ld.d [[R1:\$w[0-9]+]], +; LITENDIAN: fadd.d [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; LITENDIAN: fadd.d [[R3:\$w[0-9]+]], [[R2]], [[R2]] +; LITENDIAN: st.d [[R3]], +; LITENDIAN: .size v2f64_to_v2f64 + +; BIGENDIAN: v2f64_to_v2f64: +; BIGENDIAN: ld.d [[R1:\$w[0-9]+]], +; BIGENDIAN: fadd.d [[R2:\$w[0-9]+]], [[R1]], [[R1]] +; BIGENDIAN: fadd.d [[R3:\$w[0-9]+]], [[R2]], [[R2]] +; BIGENDIAN: st.d [[R3]], +; BIGENDIAN: .size v2f64_to_v2f64 + +declare <16 x i8> @llvm.mips.addv.b(<16 x i8>, <16 x i8>) nounwind +declare <8 x i16> @llvm.mips.addv.h(<8 x i16>, <8 x i16>) nounwind +declare <4 x i32> @llvm.mips.addv.w(<4 x i32>, <4 x i32>) nounwind +declare <2 x i64> @llvm.mips.addv.d(<2 x i64>, <2 x i64>) nounwind +declare <4 x float> @llvm.mips.fadd.w(<4 x float>, <4 x float>) nounwind +declare <2 x double> @llvm.mips.fadd.d(<2 x double>, <2 x double>) nounwind