[GlobalISel][AArch64] Add isel support for FP16 vector @llvm.ceil

This patch adds support for vector @llvm.ceil intrinsics when full 16-bit
floating-point support isn't available.

To do this, the patch...

- Implements basic isel for G_UNMERGE_VALUES
- Teaches the legalizer about 16-bit floats
- Teaches AArch64RegisterBankInfo to respect floating-point registers on
  G_BUILD_VECTOR and G_UNMERGE_VALUES
- Teaches selectCopy about 16-bit floating-point vectors

It also adds

- A legalizer test for the 16-bit vector ceil which verifies that we create a
  G_UNMERGE_VALUES and G_BUILD_VECTOR when full fp16 isn't supported
- An instruction selection test which makes sure we can select G_FCEIL (to
  FRINTP) when full fp16 is supported
- A test for selecting G_UNMERGE_VALUES

And updates arm64-vfloatintrinsics.ll to show that ceil on the new vector
types works as expected.
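
For intuition: without full FP16, each fp16 lane is legalized by extending to
single precision, taking the ceiling there, and truncating back. A scalar C++
sketch of that per-lane chain (illustrative only; _Float16 availability
depends on the compiler, and the backend actually works on s16/s32 virtual
registers rather than C types):

  #include <cmath>

  // Per-lane fallback when FRINTP can't operate on fp16 directly: widen,
  // round up, narrow. This mirrors the G_FPEXT / G_FCEIL / G_FPTRUNC chain
  // the legalizer emits for every unmerged element.
  _Float16 CeilF16ViaF32(_Float16 X) {
    float Wide = static_cast<float>(X);       // G_FPEXT  (s16 -> s32)
    float RoundedUp = std::ceil(Wide);        // G_FCEIL  (on s32)
    return static_cast<_Float16>(RoundedUp);  // G_FPTRUNC (s32 -> s16)
  }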

https://reviews.llvm.org/D56682

llvm-svn: 352113
Jessica Paquette 2019-01-24 22:00:41 +00:00
parent 38ebaf7d5d
commit 245047dfe8
8 changed files with 790 additions and 95 deletions

View File

@@ -1289,7 +1289,8 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
case TargetOpcode::G_FABS:
case TargetOpcode::G_FDIV:
case TargetOpcode::G_FREM:
case TargetOpcode::G_FMA: {
case TargetOpcode::G_FMA:
case TargetOpcode::G_FCEIL: {
unsigned NarrowSize = NarrowTy.getSizeInBits();
unsigned DstReg = MI.getOperand(0).getReg();
unsigned Flags = MI.getFlags();

View File

@@ -73,6 +73,7 @@ private:
MachineRegisterInfo &MRI) const;
bool selectBuildVector(MachineInstr &I, MachineRegisterInfo &MRI) const;
bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI) const;
bool selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI) const;
ComplexRendererFns selectArithImmed(MachineOperand &Root) const;
@@ -176,6 +177,70 @@ getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB,
return nullptr;
}
/// Given a register bank and a size in bits, return the smallest register
/// class that can represent that combination.
const TargetRegisterClass *getMinClassForRegBank(const RegisterBank &RB,
unsigned SizeInBits,
bool GetAllRegSet = false) {
unsigned RegBankID = RB.getID();
if (RegBankID == AArch64::GPRRegBankID) {
if (SizeInBits <= 32)
return GetAllRegSet ? &AArch64::GPR32allRegClass
: &AArch64::GPR32RegClass;
if (SizeInBits == 64)
return GetAllRegSet ? &AArch64::GPR64allRegClass
: &AArch64::GPR64RegClass;
}
if (RegBankID == AArch64::FPRRegBankID) {
switch (SizeInBits) {
default:
return nullptr;
case 8:
return &AArch64::FPR8RegClass;
case 16:
return &AArch64::FPR16RegClass;
case 32:
return &AArch64::FPR32RegClass;
case 64:
return &AArch64::FPR64RegClass;
case 128:
return &AArch64::FPR128RegClass;
}
}
return nullptr;
}
/// Finds the correct subregister to use for a given register class \p RC,
/// returning it in \p SubReg. Returns false if no suitable subregister exists.
static bool getSubRegForClass(const TargetRegisterClass *RC,
const TargetRegisterInfo &TRI, unsigned &SubReg) {
switch (TRI.getRegSizeInBits(*RC)) {
case 8:
SubReg = AArch64::bsub;
break;
case 16:
SubReg = AArch64::hsub;
break;
case 32:
if (RC == &AArch64::GPR32RegClass)
SubReg = AArch64::sub_32;
else
SubReg = AArch64::ssub;
break;
case 64:
SubReg = AArch64::dsub;
break;
default:
LLVM_DEBUG(
dbgs() << "Couldn't find appropriate subregister for register class.");
return false;
}
return true;
}
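// To illustrate how the two helpers above compose, here is a hypothetical
// fragment modeled on the cross-bank path in selectCopy further down
// (assuming TRI names the usual TargetRegisterInfo reference):
//
//   // Copying a 32-bit FPR value into a 16-bit FPR destination.
//   const TargetRegisterClass *DstRC =
//       getMinClassForRegBank(AArch64::FPRRegBank, 16);     // FPR16
//   const TargetRegisterClass *SubregRC =
//       getMinClassForRegBank(AArch64::FPRRegBank, 32);     // FPR32
//   unsigned SubReg = 0;
//   if (DstRC && SubregRC && getSubRegForClass(DstRC, TRI, SubReg)) {
//     // SubReg is now AArch64::hsub: the 16-bit destination is read from
//     // the low half of the wider 32-bit value via a subregister copy.
//   }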
/// Check whether \p I is a currently unsupported binary operation:
/// - it has an unsized type
/// - an operand is not a vreg
@@ -331,20 +396,66 @@ static unsigned selectLoadStoreUIOp(unsigned GenericOpc, unsigned RegBankID,
return GenericOpc;
}
static bool selectFP16CopyFromGPR32(MachineInstr &I, const TargetInstrInfo &TII,
MachineRegisterInfo &MRI, unsigned SrcReg) {
// Copies from gpr32 to fpr16 need to use a sub-register copy.
unsigned CopyReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass);
BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::COPY))
.addDef(CopyReg)
.addUse(SrcReg);
unsigned SubRegCopy = MRI.createVirtualRegister(&AArch64::FPR16RegClass);
BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(TargetOpcode::COPY))
.addDef(SubRegCopy)
.addUse(CopyReg, 0, AArch64::hsub);
/// Helper function that verifies that we have a valid copy at the end of
/// selectCopy. Asserts that the source and destination have the expected
/// sizes and then returns true.
static bool isValidCopy(const MachineInstr &I, const RegisterBank &DstBank,
const MachineRegisterInfo &MRI,
const TargetRegisterInfo &TRI,
const RegisterBankInfo &RBI) {
const unsigned DstReg = I.getOperand(0).getReg();
const unsigned SrcReg = I.getOperand(1).getReg();
const unsigned DstSize = RBI.getSizeInBits(DstReg, MRI, TRI);
const unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI);
// Make sure the sizes of the source and destination line up.
assert(
(DstSize == SrcSize ||
// Copies are a means to set up initial types, the number of
// bits may not exactly match.
(TargetRegisterInfo::isPhysicalRegister(SrcReg) && DstSize <= SrcSize) ||
// Copies are a means to copy bits around; as long as we stay
// on the same register class, that's fine. Otherwise, we
// need some SUBREG_TO_REG or AND & co.
(((DstSize + 31) / 32 == (SrcSize + 31) / 32) && DstSize > SrcSize)) &&
"Copy with different width?!");
// Check the size of the destination.
assert((DstSize <= 64 || DstBank.getID() == AArch64::FPRRegBankID) &&
"GPRs cannot get more than 64-bit width values");
return true;
}
/// Helper function for selectCopy. Inserts a subregister copy from
/// \p *From to \p *To, linking it up to \p I.
///
/// e.g, given I = "Dst = COPY SrcReg", we'll transform that into
///
/// CopyReg (From class) = COPY SrcReg
/// SubRegCopy (To class) = COPY CopyReg:SubReg
/// Dst = COPY SubRegCopy
static bool selectSubregisterCopy(MachineInstr &I, const TargetInstrInfo &TII,
MachineRegisterInfo &MRI,
const RegisterBankInfo &RBI, unsigned SrcReg,
const TargetRegisterClass *From,
const TargetRegisterClass *To,
unsigned SubReg) {
unsigned CopyReg = MRI.createVirtualRegister(From);
BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::COPY), CopyReg)
.addUse(SrcReg);
unsigned SubRegCopy = MRI.createVirtualRegister(To);
BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(TargetOpcode::COPY),
SubRegCopy)
.addUse(CopyReg, 0, SubReg);
MachineOperand &RegOp = I.getOperand(1);
RegOp.setReg(SubRegCopy);
// It's possible that the destination register won't be constrained. Make
// sure that it is.
if (!TargetRegisterInfo::isPhysicalRegister(I.getOperand(0).getReg()))
RBI.constrainGenericRegister(I.getOperand(0).getReg(), *To, MRI);
return true;
}
@@ -354,84 +465,110 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
unsigned DstReg = I.getOperand(0).getReg();
unsigned SrcReg = I.getOperand(1).getReg();
if (TargetRegisterInfo::isPhysicalRegister(DstReg)) {
if (TRI.getRegClass(AArch64::FPR16RegClassID)->contains(DstReg) &&
!TargetRegisterInfo::isPhysicalRegister(SrcReg)) {
const RegisterBank &RegBank = *RBI.getRegBank(SrcReg, MRI, TRI);
const TargetRegisterClass *SrcRC = getRegClassForTypeOnBank(
MRI.getType(SrcReg), RegBank, RBI, /* GetAllRegSet */ true);
if (SrcRC == &AArch64::GPR32allRegClass)
return selectFP16CopyFromGPR32(I, TII, MRI, SrcReg);
}
assert(I.isCopy() && "Generic operators do not allow physical registers");
return true;
}
const RegisterBank &RegBank = *RBI.getRegBank(DstReg, MRI, TRI);
const unsigned DstSize = MRI.getType(DstReg).getSizeInBits();
(void)DstSize;
const unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI);
(void)SrcSize;
assert((!TargetRegisterInfo::isPhysicalRegister(SrcReg) || I.isCopy()) &&
"No phys reg on generic operators");
assert(
(DstSize == SrcSize ||
// Copies are a means to set up initial types, the number of
// bits may not exactly match.
(TargetRegisterInfo::isPhysicalRegister(SrcReg) &&
DstSize <= RBI.getSizeInBits(SrcReg, MRI, TRI)) ||
// Copies are a means to copy bits around; as long as we stay
// on the same register class, that's fine. Otherwise, we
// need some SUBREG_TO_REG or AND & co.
(((DstSize + 31) / 32 == (SrcSize + 31) / 32) && DstSize > SrcSize)) &&
"Copy with different width?!");
assert((DstSize <= 64 || RegBank.getID() == AArch64::FPRRegBankID) &&
"GPRs cannot get more than 64-bit width values");
const TargetRegisterClass *RC = getRegClassForTypeOnBank(
MRI.getType(DstReg), RegBank, RBI, /* GetAllRegSet */ true);
if (!RC) {
LLVM_DEBUG(dbgs() << "Unexpected bitcast size " << DstSize << '\n');
const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI);
const TargetRegisterClass *DstRC = getMinClassForRegBank(
DstRegBank, RBI.getSizeInBits(DstReg, MRI, TRI), true);
if (!DstRC) {
LLVM_DEBUG(dbgs() << "Unexpected dest size "
<< RBI.getSizeInBits(DstReg, MRI, TRI) << '\n');
return false;
}
if (!TargetRegisterInfo::isPhysicalRegister(SrcReg)) {
const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(SrcReg);
const TargetRegisterClass *SrcRC =
RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
const RegisterBank *RB = nullptr;
// A couple helpers below, for making sure that the copy we produce is valid.
// Set to true if we insert a SUBREG_TO_REG. If we do this, then we don't want
// to verify that the src and dst are the same size, since that's handled by
// the SUBREG_TO_REG.
bool KnownValid = false;
// Returns true, or asserts if something we don't expect happens. The copy is
// verified by the assert on isValidCopy(), so the check only runs in builds
// with assertions enabled.
auto CheckCopy = [&I, &DstRegBank, &MRI, &TRI, &RBI, &KnownValid]() {
// If we have a bitcast or something, we can't have physical registers.
assert(
(I.isCopy() ||
(!TargetRegisterInfo::isPhysicalRegister(I.getOperand(0).getReg()) &&
!TargetRegisterInfo::isPhysicalRegister(I.getOperand(1).getReg()))) &&
"No phys reg on generic operator!");
assert(KnownValid || isValidCopy(I, DstRegBank, MRI, TRI, RBI));
return true;
};
// Is this a copy? If so, then we may need to insert a subregister copy, or
// a SUBREG_TO_REG.
if (I.isCopy()) {
// Yes. Check if there's anything to fix up.
const TargetRegisterClass *SrcRC = getMinClassForRegBank(
SrcRegBank, RBI.getSizeInBits(SrcReg, MRI, TRI), true);
if (!SrcRC) {
RB = RegClassOrBank.get<const RegisterBank *>();
SrcRC = getRegClassForTypeOnBank(MRI.getType(SrcReg), *RB, RBI, true);
LLVM_DEBUG(dbgs() << "Couldn't determine source register class\n");
return false;
}
// Copies from fpr16 to gpr32 need to use SUBREG_TO_REG.
if (RC == &AArch64::GPR32allRegClass && SrcRC == &AArch64::FPR16RegClass) {
unsigned PromoteReg = MRI.createVirtualRegister(&AArch64::FPR32RegClass);
BuildMI(*I.getParent(), I, I.getDebugLoc(),
TII.get(AArch64::SUBREG_TO_REG))
.addDef(PromoteReg)
.addImm(0)
.addUse(SrcReg)
.addImm(AArch64::hsub);
MachineOperand &RegOp = I.getOperand(1);
RegOp.setReg(PromoteReg);
} else if (RC == &AArch64::FPR16RegClass &&
SrcRC == &AArch64::GPR32allRegClass) {
selectFP16CopyFromGPR32(I, TII, MRI, SrcReg);
// Is this a cross-bank copy?
if (DstRegBank.getID() != SrcRegBank.getID()) {
// If we're doing a cross-bank copy on different-sized registers, we need
// to do a bit more work.
unsigned SrcSize = TRI.getRegSizeInBits(*SrcRC);
unsigned DstSize = TRI.getRegSizeInBits(*DstRC);
if (SrcSize > DstSize) {
// We're doing a cross-bank copy into a smaller register. We need a
// subregister copy. First, get a register class that's on the same bank
// as the destination, but the same size as the source.
const TargetRegisterClass *SubregRC =
getMinClassForRegBank(DstRegBank, SrcSize, true);
assert(SubregRC && "Didn't get a register class for subreg?");
// Get the appropriate subregister for the destination.
unsigned SubReg = 0;
if (!getSubRegForClass(DstRC, TRI, SubReg)) {
LLVM_DEBUG(dbgs() << "Couldn't determine subregister for copy.\n");
return false;
}
// Now, insert a subregister copy using the new register class.
selectSubregisterCopy(I, TII, MRI, RBI, SrcReg, SubregRC, DstRC,
SubReg);
return CheckCopy();
} else if (DstRegBank.getID() == AArch64::GPRRegBankID && DstSize == 32 &&
SrcSize == 16) {
// Special case for FPR16 to GPR32.
// FIXME: This can probably be generalized like the above case.
unsigned PromoteReg =
MRI.createVirtualRegister(&AArch64::FPR32RegClass);
BuildMI(*I.getParent(), I, I.getDebugLoc(),
TII.get(AArch64::SUBREG_TO_REG), PromoteReg)
.addImm(0)
.addUse(SrcReg)
.addImm(AArch64::hsub);
MachineOperand &RegOp = I.getOperand(1);
RegOp.setReg(PromoteReg);
// Promise that the copy is implicitly validated by the SUBREG_TO_REG.
KnownValid = true;
}
}
// If the destination is a physical register, then there's nothing to
// change, so we're done.
if (TargetRegisterInfo::isPhysicalRegister(DstReg))
return CheckCopy();
}
// No need to constrain SrcReg. It will get constrained when
// we hit another of its uses or defs.
// Copies do not have constraints.
if (!RBI.constrainGenericRegister(DstReg, *RC, MRI)) {
// No need to constrain SrcReg. It will get constrained when we hit another
// of its uses or defs. Copies do not have constraints.
if (!RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
<< " operand\n");
return false;
}
I.setDesc(TII.get(AArch64::COPY));
return true;
return CheckCopy();
}
static unsigned selectFPConvOpc(unsigned GenericOpc, LLT DstTy, LLT SrcTy) {
@@ -1555,6 +1692,8 @@ bool AArch64InstructionSelector::select(MachineInstr &I,
return selectBuildVector(I, MRI);
case TargetOpcode::G_MERGE_VALUES:
return selectMergeValues(I, MRI);
case TargetOpcode::G_UNMERGE_VALUES:
return selectUnmergeValues(I, MRI);
}
return false;
@@ -1583,6 +1722,8 @@ bool AArch64InstructionSelector::emitScalarToVector(
};
switch (DstTy.getElementType().getSizeInBits()) {
case 16:
return BuildFn(AArch64::hsub);
case 32:
return BuildFn(AArch64::ssub);
case 64:
@@ -1638,6 +1779,137 @@ bool AArch64InstructionSelector::selectMergeValues(
return true;
}
bool AArch64InstructionSelector::selectUnmergeValues(
MachineInstr &I, MachineRegisterInfo &MRI) const {
assert(I.getOpcode() == TargetOpcode::G_UNMERGE_VALUES &&
"unexpected opcode");
// TODO: Handle unmerging into GPRs and from scalars to scalars.
if (RBI.getRegBank(I.getOperand(0).getReg(), MRI, TRI)->getID() !=
AArch64::FPRRegBankID ||
RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI)->getID() !=
AArch64::FPRRegBankID) {
LLVM_DEBUG(dbgs() << "Unmerging vector-to-gpr and scalar-to-scalar "
"currently unsupported.\n");
return false;
}
// The last operand is the vector source register, and every other operand is
// a register to unpack into.
unsigned NumElts = I.getNumOperands() - 1;
unsigned SrcReg = I.getOperand(NumElts).getReg();
const LLT NarrowTy = MRI.getType(I.getOperand(0).getReg());
const LLT WideTy = MRI.getType(SrcReg);
assert(WideTy.isVector() && "can only unmerge from vector types!");
assert(WideTy.getSizeInBits() > NarrowTy.getSizeInBits() &&
"source register size too small!");
// TODO: Handle unmerging into scalars.
if (!NarrowTy.isScalar()) {
LLVM_DEBUG(dbgs() << "Vector-to-vector unmerges not supported yet.\n");
return false;
}
// Choose a lane copy opcode and subregister based off of the size of the
// vector's elements.
unsigned CopyOpc = 0;
unsigned ExtractSubReg = 0;
switch (NarrowTy.getSizeInBits()) {
case 16:
CopyOpc = AArch64::CPYi16;
ExtractSubReg = AArch64::hsub;
break;
case 32:
CopyOpc = AArch64::CPYi32;
ExtractSubReg = AArch64::ssub;
break;
case 64:
CopyOpc = AArch64::CPYi64;
ExtractSubReg = AArch64::dsub;
break;
default:
// Unknown size, bail out.
LLVM_DEBUG(dbgs() << "NarrowTy had unsupported size.\n");
return false;
}
// Set up for the lane copies.
MachineBasicBlock &MBB = *I.getParent();
// Stores the registers we'll be copying from.
SmallVector<unsigned, 4> InsertRegs;
// We'll use the first register twice, so we only need NumElts-1 registers.
unsigned NumInsertRegs = NumElts - 1;
// If our elements fit into exactly 128 bits, then we can copy from the source
// directly. Otherwise, we need to do a bit of setup with some subregister
// inserts.
if (NarrowTy.getSizeInBits() * NumElts == 128) {
InsertRegs = SmallVector<unsigned, 4>(NumInsertRegs, SrcReg);
} else {
// No. We have to perform subregister inserts. For each insert, create an
// implicit def and a subregister insert, and save the register we create.
for (unsigned Idx = 0; Idx < NumInsertRegs; ++Idx) {
unsigned ImpDefReg = MRI.createVirtualRegister(&AArch64::FPR128RegClass);
MachineInstr &ImpDefMI =
*BuildMI(MBB, I, I.getDebugLoc(), TII.get(TargetOpcode::IMPLICIT_DEF),
ImpDefReg);
// Now, create the subregister insert from SrcReg.
unsigned InsertReg = MRI.createVirtualRegister(&AArch64::FPR128RegClass);
MachineInstr &InsMI =
*BuildMI(MBB, I, I.getDebugLoc(),
TII.get(TargetOpcode::INSERT_SUBREG), InsertReg)
.addUse(ImpDefReg)
.addUse(SrcReg)
.addImm(AArch64::dsub);
constrainSelectedInstRegOperands(ImpDefMI, TII, TRI, RBI);
constrainSelectedInstRegOperands(InsMI, TII, TRI, RBI);
// Save the register so that we can copy from it after.
InsertRegs.push_back(InsertReg);
}
}
// Now that we've created any necessary subregister inserts, we can
// create the copies.
//
// Perform the first copy separately as a subregister copy.
unsigned CopyTo = I.getOperand(0).getReg();
MachineInstr &FirstCopy =
*BuildMI(MBB, I, I.getDebugLoc(), TII.get(TargetOpcode::COPY), CopyTo)
.addUse(InsertRegs[0], 0, ExtractSubReg);
constrainSelectedInstRegOperands(FirstCopy, TII, TRI, RBI);
// Now, perform the remaining copies as vector lane copies.
unsigned LaneIdx = 1;
for (unsigned InsReg : InsertRegs) {
unsigned CopyTo = I.getOperand(LaneIdx).getReg();
MachineInstr &CopyInst =
*BuildMI(MBB, I, I.getDebugLoc(), TII.get(CopyOpc), CopyTo)
.addUse(InsReg)
.addImm(LaneIdx);
constrainSelectedInstRegOperands(CopyInst, TII, TRI, RBI);
++LaneIdx;
}
// Separately constrain the first copy's destination. Because of the
// limitation in constrainOperandRegClass, we can't guarantee that this will
// actually be constrained. So, do it ourselves using the second operand.
const TargetRegisterClass *RC =
MRI.getRegClassOrNull(I.getOperand(1).getReg());
if (!RC) {
LLVM_DEBUG(dbgs() << "Couldn't constrain copy destination.\n");
return false;
}
RBI.constrainGenericRegister(CopyTo, *RC, MRI);
I.eraseFromParent();
return true;
}
bool AArch64InstructionSelector::selectBuildVector(
MachineInstr &I, MachineRegisterInfo &MRI) const {
assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
@@ -1646,7 +1918,7 @@ bool AArch64InstructionSelector::selectBuildVector(
const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
const LLT EltTy = MRI.getType(I.getOperand(1).getReg());
unsigned EltSize = EltTy.getSizeInBits();
if (EltSize < 32 || EltSize > 64)
if (EltSize < 16 || EltSize > 64)
return false; // Don't support all element types yet.
const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI);
unsigned Opc;
@@ -1660,7 +1932,10 @@ bool AArch64InstructionSelector::selectBuildVector(
SubregIdx = AArch64::dsub;
}
} else {
if (EltSize == 32) {
if (EltSize == 16) {
Opc = AArch64::INSvi16lane;
SubregIdx = AArch64::hsub;
} else if (EltSize == 32) {
Opc = AArch64::INSvi32lane;
SubregIdx = AArch64::ssub;
} else {
@@ -1669,21 +1944,24 @@ bool AArch64InstructionSelector::selectBuildVector(
}
}
if (EltSize * DstTy.getNumElements() != 128)
return false; // Don't handle unpacked vectors yet.
unsigned DstVec = 0;
const TargetRegisterClass *DstRC = getRegClassForTypeOnBank(
DstTy, RBI.getRegBank(AArch64::FPRRegBankID), RBI);
emitScalarToVector(DstVec, DstTy, DstRC, I.getOperand(1).getReg(),
*I.getParent(), I.getIterator(), MRI);
for (unsigned i = 2, e = DstTy.getSizeInBits() / EltSize + 1; i < e; ++i) {
const TargetRegisterClass *DstRC = &AArch64::FPR128RegClass;
if (!emitScalarToVector(DstVec, DstTy, DstRC, I.getOperand(1).getReg(),
*I.getParent(), I.getIterator(), MRI))
return false;
unsigned DstSize = DstTy.getSizeInBits();
// Keep track of the last MI we inserted. Later on, we might be able to save
// a copy using it.
MachineInstr *PrevMI = nullptr;
for (unsigned i = 2, e = DstSize / EltSize + 1; i < e; ++i) {
unsigned InsDef;
// For the last insert re-use the dst reg of the G_BUILD_VECTOR.
if (i + 1 < e)
InsDef = MRI.createVirtualRegister(DstRC);
else
InsDef = I.getOperand(0).getReg();
// Note that if we don't do a subregister copy, we end up making one more
// of these than we need.
InsDef = MRI.createVirtualRegister(DstRC);
unsigned LaneIdx = i - 1;
if (RB.getID() == AArch64::FPRRegBankID) {
unsigned ImpDef = MRI.createVirtualRegister(DstRC);
@@ -1708,6 +1986,7 @@ bool AArch64InstructionSelector::selectBuildVector(
constrainSelectedInstRegOperands(InsSubMI, TII, TRI, RBI);
constrainSelectedInstRegOperands(InsEltMI, TII, TRI, RBI);
DstVec = InsDef;
PrevMI = &InsEltMI;
} else {
MachineInstr &InsMI =
*BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(Opc))
@@ -1717,8 +1996,53 @@ bool AArch64InstructionSelector::selectBuildVector(
.addUse(I.getOperand(i).getReg());
constrainSelectedInstRegOperands(InsMI, TII, TRI, RBI);
DstVec = InsDef;
PrevMI = &InsMI;
}
}
// If DstTy's size in bits is less than 128, then emit a subregister copy
// from DstVec to the last register we've defined.
if (DstSize < 128) {
unsigned SubReg = 0;
// Helper lambda to decide on a register class and subregister for the
// subregister copy.
auto GetRegInfoForCopy = [&SubReg,
&DstSize]() -> const TargetRegisterClass * {
switch (DstSize) {
default:
LLVM_DEBUG(dbgs() << "Unknown destination size (" << DstSize << ")\n");
return nullptr;
case 32:
SubReg = AArch64::ssub;
return &AArch64::FPR32RegClass;
case 64:
SubReg = AArch64::dsub;
return &AArch64::FPR64RegClass;
}
};
const TargetRegisterClass *RC = GetRegInfoForCopy();
if (!RC)
return false;
unsigned Reg = MRI.createVirtualRegister(RC);
unsigned DstReg = I.getOperand(0).getReg();
BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(TargetOpcode::COPY),
DstReg)
.addUse(DstVec, 0, SubReg);
MachineOperand &RegOp = I.getOperand(1);
RegOp.setReg(Reg);
RBI.constrainGenericRegister(DstReg, *RC, MRI);
} else {
// We don't need a subregister copy. Save a copy by re-using the
// destination register on the final insert.
assert(PrevMI && "PrevMI was null?");
PrevMI->getOperand(0).setReg(I.getOperand(0).getReg());
constrainSelectedInstRegOperands(*PrevMI, TII, TRI, RBI);
}
I.eraseFromParent();
return true;
}

View File

@@ -124,6 +124,15 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
getActionDefinitionsBuilder({G_FREM, G_FPOW}).libcallFor({s32, s64});
getActionDefinitionsBuilder(G_FCEIL)
// If we don't have full FP16 support, then scalarize the elements of
// vectors containing fp16 types.
.fewerElementsIf(
[=, &ST](const LegalityQuery &Query) {
const auto &Ty = Query.Types[0];
return Ty.isVector() && Ty.getElementType() == s16 &&
!ST.hasFullFP16();
},
[=](const LegalityQuery &Query) { return std::make_pair(0, s16); })
// If we don't have full FP16 support, then widen s16 to s32 if we
// encounter it.
.widenScalarIf(
@@ -131,7 +140,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
return Query.Types[0] == s16 && !ST.hasFullFP16();
},
[=](const LegalityQuery &Query) { return std::make_pair(0, s32); })
.legalFor({s16, s32, s64, v2s32, v4s32, v2s64});
.legalFor({s16, s32, s64, v2s32, v4s32, v2s64, v2s16, v4s16, v8s16});
getActionDefinitionsBuilder(G_INSERT)
.unsupportedIf([=](const LegalityQuery &Query) {
@@ -435,7 +444,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
});
getActionDefinitionsBuilder(G_BUILD_VECTOR)
.legalFor({{v4s32, s32}, {v2s64, s64}})
.legalFor({{v4s16, s16}, {v8s16, s16}, {v4s32, s32}, {v2s64, s64}})
.clampNumElements(0, v4s32, v4s32)
.clampNumElements(0, v2s64, v2s64)

View File

@@ -635,6 +635,62 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpRegBankIdx[0] = PMI_FirstFPR;
break;
}
break;
case TargetOpcode::G_UNMERGE_VALUES: {
// If the first operand belongs to an FPR register bank, then make sure that
// we preserve that.
if (OpRegBankIdx[0] != PMI_FirstGPR)
break;
// Helper lambda that returns true if MI has floating point constraints.
auto HasFPConstraints = [&TRI, &MRI, this](MachineInstr &MI) {
unsigned Op = MI.getOpcode();
// Do we have an explicit floating point instruction?
if (isPreISelGenericFloatingPointOpcode(Op))
return true;
// No. Check if we have a copy-like instruction. If we do, then we could
// still be fed by floating point instructions.
if (Op != TargetOpcode::COPY && !MI.isPHI())
return false;
// MI is copy-like. Return true if it's using an FPR.
return getRegBank(MI.getOperand(0).getReg(), MRI, TRI) ==
&AArch64::FPRRegBank;
};
if (any_of(MRI.use_instructions(MI.getOperand(0).getReg()),
[&](MachineInstr &MI) { return HasFPConstraints(MI); })) {
// Set the register bank of every operand to FPR.
for (unsigned Idx = 0, NumOperands = MI.getNumOperands();
Idx < NumOperands; ++Idx)
OpRegBankIdx[Idx] = PMI_FirstFPR;
}
break;
}
case TargetOpcode::G_BUILD_VECTOR:
// If the first source operand belongs to an FPR register bank, then make
// sure that we preserve that.
if (OpRegBankIdx[1] != PMI_FirstGPR)
break;
unsigned VReg = MI.getOperand(1).getReg();
if (!VReg)
break;
// Get the instruction that defined the source operand reg, and check if
// it's a floating point operation.
MachineInstr *DefMI = MRI.getVRegDef(VReg);
unsigned DefOpc = DefMI->getOpcode();
if (isPreISelGenericFloatingPointOpcode(DefOpc)) {
// Have a floating point op.
// Make sure every operand gets mapped to an FPR register class.
unsigned NumOperands = MI.getNumOperands();
for (unsigned Idx = 0; Idx < NumOperands; ++Idx)
OpRegBankIdx[Idx] = PMI_FirstFPR;
}
break;
}
// Finally construct the computed mapping.

View File

@@ -0,0 +1,86 @@
# RUN: llc -mtriple=arm64-unknown-unknown -global-isel -O0 -mattr=-fullfp16 -run-pass=legalizer %s -o - | FileCheck %s
--- |
define <8 x half> @test_v8f16.ceil(<8 x half> %a) {
ret <8 x half> %a
}
define <4 x half> @test_v4f16.ceil(<4 x half> %a) {
ret <4 x half> %a
}
...
---
name: test_v8f16.ceil
alignment: 2
tracksRegLiveness: true
registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
body: |
bb.1 (%ir-block.0):
liveins: $q0
; CHECK-LABEL: name: test_v8f16.ceil
%0:_(<8 x s16>) = COPY $q0
; CHECK: %{{[0-9]+}}:_(s16), %{{[0-9]+}}:_(s16), %{{[0-9]+}}:_(s16), %{{[0-9]+}}:_(s16), %{{[0-9]+}}:_(s16), %{{[0-9]+}}:_(s16), %{{[0-9]+}}:_(s16), %{{[0-9]+}}:_(s16) = G_UNMERGE_VALUES %{{[0-9]+}}(<8 x s16>)
; CHECK: %{{[0-9]+}}:_(s32) = G_FPEXT %{{[0-9]+}}(s16)
; CHECK: %{{[0-9]+}}:_(s32) = G_FCEIL %{{[0-9]+}}
; CHECK: %{{[0-9]+}}:_(s16) = G_FPTRUNC %{{[0-9]+}}(s32)
; CHECK: %{{[0-9]+}}:_(s32) = G_FPEXT %{{[0-9]+}}(s16)
; CHECK: %{{[0-9]+}}:_(s32) = G_FCEIL %{{[0-9]+}}
; CHECK: %{{[0-9]+}}:_(s16) = G_FPTRUNC %{{[0-9]+}}(s32)
; CHECK: %{{[0-9]+}}:_(s32) = G_FPEXT %{{[0-9]+}}(s16)
; CHECK: %{{[0-9]+}}:_(s32) = G_FCEIL %{{[0-9]+}}
; CHECK: %{{[0-9]+}}:_(s16) = G_FPTRUNC %{{[0-9]+}}(s32)
; CHECK: %{{[0-9]+}}:_(s32) = G_FPEXT %{{[0-9]+}}(s16)
; CHECK: %{{[0-9]+}}:_(s32) = G_FCEIL %{{[0-9]+}}
; CHECK: %{{[0-9]+}}:_(s16) = G_FPTRUNC %{{[0-9]+}}(s32)
; CHECK: %{{[0-9]+}}:_(s32) = G_FPEXT %{{[0-9]+}}(s16)
; CHECK: %{{[0-9]+}}:_(s32) = G_FCEIL %{{[0-9]+}}
; CHECK: %{{[0-9]+}}:_(s16) = G_FPTRUNC %{{[0-9]+}}(s32)
; CHECK: %{{[0-9]+}}:_(s32) = G_FPEXT %{{[0-9]+}}(s16)
; CHECK: %{{[0-9]+}}:_(s32) = G_FCEIL %{{[0-9]+}}
; CHECK: %{{[0-9]+}}:_(s16) = G_FPTRUNC %{{[0-9]+}}(s32)
; CHECK: %{{[0-9]+}}:_(s32) = G_FPEXT %{{[0-9]+}}(s16)
; CHECK: %{{[0-9]+}}:_(s32) = G_FCEIL %{{[0-9]+}}
; CHECK: %{{[0-9]+}}:_(s16) = G_FPTRUNC %{{[0-9]+}}(s32)
; CHECK: %{{[0-9]+}}:_(s32) = G_FPEXT %{{[0-9]+}}(s16)
; CHECK: %{{[0-9]+}}:_(s32) = G_FCEIL %{{[0-9]+}}
; CHECK: %{{[0-9]+}}:_(s16) = G_FPTRUNC %{{[0-9]+}}(s32)
; CHECK: %{{[0-9]+}}:_(<8 x s16>) = G_BUILD_VECTOR %{{[0-9]+}}(s16), %{{[0-9]+}}(s16), %{{[0-9]+}}(s16), %{{[0-9]+}}(s16), %{{[0-9]+}}(s16), %{{[0-9]+}}(s16), %{{[0-9]+}}(s16), %{{[0-9]+}}(s16)
%1:_(<8 x s16>) = G_FCEIL %0
$q0 = COPY %1(<8 x s16>)
RET_ReallyLR implicit $q0
...
---
name: test_v4f16.ceil
alignment: 2
tracksRegLiveness: true
registers:
- { id: 0, class: _ }
- { id: 1, class: _ }
body: |
bb.1 (%ir-block.0):
liveins: $d0
; CHECK-LABEL: name: test_v4f16.ceil
%0:_(<4 x s16>) = COPY $d0
; CHECK: %{{[0-9]+}}:_(s16), %{{[0-9]+}}:_(s16), %{{[0-9]+}}:_(s16), %{{[0-9]+}}:_(s16) = G_UNMERGE_VALUES %{{[0-9]+}}(<4 x s16>)
; CHECK: %{{[0-9]+}}:_(s32) = G_FPEXT %{{[0-9]+}}(s16)
; CHECK: %{{[0-9]+}}:_(s32) = G_FCEIL %{{[0-9]+}}
; CHECK: %{{[0-9]+}}:_(s16) = G_FPTRUNC %{{[0-9]+}}(s32)
; CHECK: %{{[0-9]+}}:_(s32) = G_FPEXT %{{[0-9]+}}(s16)
; CHECK: %{{[0-9]+}}:_(s32) = G_FCEIL %{{[0-9]+}}
; CHECK: %{{[0-9]+}}:_(s16) = G_FPTRUNC %{{[0-9]+}}(s32)
; CHECK: %{{[0-9]+}}:_(s32) = G_FPEXT %{{[0-9]+}}(s16)
; CHECK: %{{[0-9]+}}:_(s32) = G_FCEIL %{{[0-9]+}}
; CHECK: %{{[0-9]+}}:_(s16) = G_FPTRUNC %{{[0-9]+}}(s32)
; CHECK: %{{[0-9]+}}:_(s32) = G_FPEXT %{{[0-9]+}}(s16)
; CHECK: %{{[0-9]+}}:_(s32) = G_FCEIL %{{[0-9]+}}
; CHECK: %{{[0-9]+}}:_(s16) = G_FPTRUNC %{{[0-9]+}}(s32)
; CHECK: %{{[0-9]+}}:_(<4 x s16>) = G_BUILD_VECTOR %{{[0-9]+}}(s16), %{{[0-9]+}}(s16), %{{[0-9]+}}(s16), %{{[0-9]+}}(s16)
%1:_(<4 x s16>) = G_FCEIL %0
$d0 = COPY %1(<4 x s16>)
RET_ReallyLR implicit $d0
...

View File

@@ -1,5 +1,6 @@
# RUN: llc -verify-machineinstrs -mtriple aarch64--- \
# RUN: -run-pass=instruction-select -global-isel %s -o - | FileCheck %s
# RUN: -run-pass=instruction-select -mattr=+fullfp16 -global-isel %s -o - \
# RUN: | FileCheck %s
...
---
name: ceil_float
@@ -91,3 +92,39 @@ body: |
$q0 = COPY %1(<2 x s64>)
...
---
name: ceil_v4f16
legalized: true
regBankSelected: true
tracksRegLiveness: true
registers:
- { id: 0, class: fpr }
- { id: 1, class: fpr }
body: |
bb.0:
; CHECK-LABEL: name: ceil_v4f16
; CHECK: %{{[0-9]+}}:fpr64 = FRINTPv4f16 %{{[0-9]+}}
liveins: $d0
%0:fpr(<4 x s16>) = COPY $d0
%1:fpr(<4 x s16>) = G_FCEIL %0
$d0 = COPY %1(<4 x s16>)
...
---
name: ceil_v8f16
legalized: true
regBankSelected: true
tracksRegLiveness: true
registers:
- { id: 0, class: fpr }
- { id: 1, class: fpr }
body: |
bb.0:
; CHECK-LABEL: name: ceil_v8f16
; CHECK: %{{[0-9]+}}:fpr128 = FRINTPv8f16 %{{[0-9]+}}
liveins: $q0
%0:fpr(<8 x s16>) = COPY $q0
%1:fpr(<8 x s16>) = G_FCEIL %0
$q0 = COPY %1(<8 x s16>)
...

View File

@@ -0,0 +1,154 @@
# RUN: llc -O0 -mattr=-fullfp16 -mtriple=aarch64-- \
# RUN: -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s
--- |
define <2 x double> @test_v2s64_unmerge(<2 x double> %a) {
ret <2 x double> %a
}
define <4 x float> @test_v4s32_unmerge(<4 x float> %a) {
ret <4 x float> %a
}
define <4 x half> @test_v4s16_unmerge(<4 x half> %a) {
ret <4 x half> %a
}
define <8 x half> @test_v8s16_unmerge(<8 x half> %a) {
ret <8 x half> %a
}
...
---
name: test_v2s64_unmerge
alignment: 2
legalized: true
regBankSelected: true
tracksRegLiveness: true
registers:
- { id: 0, class: fpr }
- { id: 1, class: fpr }
- { id: 2, class: fpr }
- { id: 3, class: fpr }
body: |
bb.1 (%ir-block.0):
liveins: $q0
; CHECK-LABEL: name: test_v2s64_unmerge
%0:fpr(<2 x s64>) = COPY $q0
; Since 2 * 64 = 128, we can just directly copy.
; CHECK: %2:fpr64 = COPY %0.dsub
; CHECK: %3:fpr64 = CPYi64 %0, 1
%2:fpr(s64), %3:fpr(s64) = G_UNMERGE_VALUES %0(<2 x s64>)
%1:fpr(<2 x s64>) = G_BUILD_VECTOR %2(s64), %3(s64)
$q0 = COPY %1(<2 x s64>)
RET_ReallyLR implicit $q0
...
---
name: test_v4s32_unmerge
alignment: 2
legalized: true
regBankSelected: true
tracksRegLiveness: true
registers:
- { id: 0, class: fpr }
- { id: 1, class: fpr }
- { id: 2, class: fpr }
- { id: 3, class: fpr }
- { id: 4, class: fpr }
- { id: 5, class: fpr }
body: |
bb.1 (%ir-block.0):
liveins: $q0
; CHECK-LABEL: name: test_v4s32_unmerge
%0:fpr(<4 x s32>) = COPY $q0
; Since 4 * 32 = 128, we can just directly copy.
; CHECK: %2:fpr32 = COPY %0.ssub
; CHECK: %3:fpr32 = CPYi32 %0, 1
; CHECK: %4:fpr32 = CPYi32 %0, 2
; CHECK: %5:fpr32 = CPYi32 %0, 3
%2:fpr(s32), %3:fpr(s32), %4:fpr(s32), %5:fpr(s32) = G_UNMERGE_VALUES %0(<4 x s32>)
%1:fpr(<4 x s32>) = G_BUILD_VECTOR %2(s32), %3(s32), %4(s32), %5(s32)
$q0 = COPY %1(<4 x s32>)
RET_ReallyLR implicit $q0
...
---
name: test_v4s16_unmerge
alignment: 2
legalized: true
regBankSelected: true
tracksRegLiveness: true
registers:
- { id: 0, class: fpr }
- { id: 1, class: fpr }
- { id: 2, class: fpr }
- { id: 3, class: fpr }
- { id: 4, class: fpr }
- { id: 5, class: fpr }
body: |
bb.1 (%ir-block.0):
liveins: $d0
; CHECK-LABEL: name: test_v4s16_unmerge
%0:fpr(<4 x s16>) = COPY $d0
; Since 4 * 16 != 128, we need to widen using implicit defs.
; Note that we expect to reuse one of the INSERT_SUBREG results, as CPYi16
; expects a lane > 0.
; CHECK-DAG: [[IMPDEF1:%[0-9]+]]:fpr128 = IMPLICIT_DEF
; CHECK-NEXT: [[INS_SHARED:%[0-9]+]]:fpr128 = INSERT_SUBREG [[IMPDEF1]], %0, %subreg.dsub
; CHECK: [[IMPDEF2:%[0-9]+]]:fpr128 = IMPLICIT_DEF
; CHECK-NEXT: [[INS2:%[0-9]+]]:fpr128 = INSERT_SUBREG [[IMPDEF2]], %0, %subreg.dsub
; CHECK: [[IMPDEF3:%[0-9]+]]:fpr128 = IMPLICIT_DEF
; CHECK-NEXT: [[INS3:%[0-9]+]]:fpr128 = INSERT_SUBREG [[IMPDEF3]], %0, %subreg.dsub
; CHECK: %2:fpr16 = COPY [[INS_SHARED]].hsub
; CHECK: %3:fpr16 = CPYi16 [[INS_SHARED]], 1
; CHECK: %4:fpr16 = CPYi16 [[INS2]], 2
; CHECK: %5:fpr16 = CPYi16 [[INS3]], 3
%2:fpr(s16), %3:fpr(s16), %4:fpr(s16), %5:fpr(s16) = G_UNMERGE_VALUES %0(<4 x s16>)
%1:fpr(<4 x s16>) = G_BUILD_VECTOR %2(s16), %3(s16), %4(s16), %5(s16)
$d0 = COPY %1(<4 x s16>)
RET_ReallyLR implicit $d0
...
---
name: test_v8s16_unmerge
alignment: 2
legalized: true
regBankSelected: true
tracksRegLiveness: true
registers:
- { id: 0, class: fpr }
- { id: 1, class: fpr }
- { id: 2, class: fpr }
- { id: 3, class: fpr }
- { id: 4, class: fpr }
- { id: 5, class: fpr }
- { id: 6, class: fpr }
- { id: 7, class: fpr }
- { id: 8, class: fpr }
- { id: 9, class: fpr }
body: |
bb.1 (%ir-block.0):
liveins: $q0
; CHECK-LABEL: name: test_v8s16_unmerge
%0:fpr(<8 x s16>) = COPY $q0
; Since 8 * 16 = 128, we can just directly copy.
; CHECK: %2:fpr16 = COPY %0.hsub
; CHECK: %3:fpr16 = CPYi16 %0, 1
; CHECK: %4:fpr16 = CPYi16 %0, 2
; CHECK: %5:fpr16 = CPYi16 %0, 3
; CHECK: %6:fpr16 = CPYi16 %0, 4
; CHECK: %7:fpr16 = CPYi16 %0, 5
; CHECK: %8:fpr16 = CPYi16 %0, 6
; CHECK: %9:fpr16 = CPYi16 %0, 7
%2:fpr(s16), %3:fpr(s16), %4:fpr(s16), %5:fpr(s16), %6:fpr(s16), %7:fpr(s16), %8:fpr(s16), %9:fpr(s16) = G_UNMERGE_VALUES %0(<8 x s16>)
%1:fpr(<8 x s16>) = G_BUILD_VECTOR %2:fpr(s16), %3:fpr(s16), %4:fpr(s16), %5:fpr(s16), %6:fpr(s16), %7:fpr(s16), %8:fpr(s16), %9:fpr(s16)
$q0 = COPY %1(<8 x s16>)
RET_ReallyLR implicit $q0
...

View File

@@ -3,6 +3,13 @@
; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -mattr=+fullfp16 \
; RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FP16
; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -mattr=-fullfp16 \
; RUN: -global-isel -global-isel-abort=2 -pass-remarks-missed=gisel* \
; RUN: 2>&1 | FileCheck %s --check-prefixes=GISEL,GISEL-NOFP16,FALLBACK
; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -mattr=+fullfp16 \
; RUN: -global-isel -global-isel-abort=2 -pass-remarks-missed=gisel* \
; RUN: 2>&1 | FileCheck %s --check-prefixes=GISEL,GISEL-FP16,FALLBACK
;;; Half vectors
%v4f16 = type <4 x half>
@@ -111,6 +118,12 @@ define %v4f16 @test_v4f16.ceil(%v4f16 %a) {
; CHECK-FP16-NOT: fcvt
; CHECK-FP16: frintp.4h
; CHECK-FP16-NEXT: ret
; FALLBACK-NOT: remark{{.*}}test_v4f16.ceil:
; GISEL-LABEL: test_v4f16.ceil:
; GISEL-NOFP16-COUNT-4: frintp s{{[0-9]+}}, s{{[0-9]+}}
; GISEL-FP16-NOT: fcvt
; GISEL-FP16: frintp.4h
; GISEL-FP16-NEXT: ret
%1 = call %v4f16 @llvm.ceil.v4f16(%v4f16 %a)
ret %v4f16 %1
}
@@ -268,6 +281,12 @@ define %v8f16 @test_v8f16.ceil(%v8f16 %a) {
; CHECK-FP16-NOT: fcvt
; CHECK-FP16: frintp.8h
; CHECK-FP16-NEXT: ret
; FALLBACK-NOT: remark{{.*}}test_v8f16.ceil:
; GISEL-LABEL: test_v8f16.ceil:
; GISEL-NOFP16-COUNT-8: frintp s{{[0-9]+}}, s{{[0-9]+}}
; GISEL-FP16-NOT: fcvt
; GISEL-FP16: frintp.8h
; GISEL-FP16-NEXT: ret
%1 = call %v8f16 @llvm.ceil.v8f16(%v8f16 %a)
ret %v8f16 %1
}
@@ -400,8 +419,11 @@ define %v2f32 @test_v2f32.floor(%v2f32 %a) {
ret %v2f32 %1
}
; CHECK-LABEL: test_v2f32.ceil:
; FALLBACK-NOT: remark{{.*}}test_v2f32.ceil
; GISEL-LABEL: test_v2f32.ceil:
define %v2f32 @test_v2f32.ceil(%v2f32 %a) {
; CHECK: frintp.2s
; GISEL: frintp.2s
%1 = call %v2f32 @llvm.ceil.v2f32(%v2f32 %a)
ret %v2f32 %1
}
@@ -525,8 +547,11 @@ define %v4f32 @test_v4f32.floor(%v4f32 %a) {
ret %v4f32 %1
}
; CHECK: test_v4f32.ceil:
; FALLBACK-NOT: remark{{.*}}test_v4f32.ceil
; GISEL-LABEL: test_v4f32.ceil:
define %v4f32 @test_v4f32.ceil(%v4f32 %a) {
; CHECK: frintp.4s
; GISEL: frintp.4s
%1 = call %v4f32 @llvm.ceil.v4f32(%v4f32 %a)
ret %v4f32 %1
}
@@ -649,8 +674,11 @@ define %v2f64 @test_v2f64.floor(%v2f64 %a) {
ret %v2f64 %1
}
; CHECK: test_v2f64.ceil:
; FALLBACK-NOT: remark{{.*}}test_v2f64.ceil
; GISEL-LABEL: test_v2f64.ceil:
define %v2f64 @test_v2f64.ceil(%v2f64 %a) {
; CHECK: frintp.2d
; GISEL: frintp.2d
%1 = call %v2f64 @llvm.ceil.v2f64(%v2f64 %a)
ret %v2f64 %1
}