[LoongArch] Define ual feature and override allowsMisalignedMemoryAccesses

Some CPUs do not allow unaligned memory accesses, e.g. the 2k1000la,
which uses the la264 core; on it, a misaligned access triggers an
exception.

In this patch, a backend feature called `ual` is defined to describe
whether the CPU supports unaligned memory accesses. This feature can
be toggled by the clang options `-m[no-]unaligned-access` or their
aliases `-m[no-]strict-align`. When the feature is enabled,
`allowsMisalignedMemoryAccesses` sets the speed number to 1 and
returns true, which allows codegen to emit unaligned memory access
instructions.
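
To illustrate the effect (a hypothetical user-level example, not part
of this patch; the expected instruction sequences are taken from the
ual.ll test added below):

  // misaligned.cpp - hypothetical illustration only.
  // Build: clang++ --target=loongarch64 -O2 [-mno-unaligned-access] misaligned.cpp
  #include <cstdint>
  #include <cstring>

  // Load a 32-bit value through a pointer that is only 2-byte aligned;
  // clang lowers the memcpy to a `load i32, align 2` in IR.
  int32_t load32(const uint16_t *p) {
    int32_t v;
    std::memcpy(&v, p, sizeof(v));
    return v;
  }
  // With +ual (default for generic-la64): a single ld.w.
  // With -ual: two half-word loads combined with slli.d and or.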

The clang options `-m[no-]unaligned-access` are moved from
`m_arm_Features_Group` to `m_Group` because more than one target now
uses them, and a test is added to show that they remain unused on a
target that does not support them. In addition, to stay compatible
with GCC, a new alias `-mno-strict-align` is added, which is
equivalent to `-munaligned-access`.
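
For example (illustrative command lines, echoing the RUN lines of the
new driver tests below; the cc1 feature strings match those tests):

  clang --target=loongarch64 -mno-unaligned-access c.c  # cc1: "-target-feature" "-ual"
  clang --target=loongarch64 -mno-strict-align c.c      # cc1: "-target-feature" "+ual"
  clang --target=x86_64 -mstrict-align c.c              # warning: argument unused during compilation

When a flag and its negation are both given, the last one on the
command line wins.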

The feature name `ual` is consistent with the Linux kernel [1] and
with the output of `lscpu` and `/proc/cpuinfo` [2].

There is also an `LLT` variant of `allowsMisalignedMemoryAccesses`,
but it currently appears to be used only by GlobalISel, which
LoongArch does not support yet, so that variant is not implemented in
this patch.
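
If GlobalISel support is added later, that variant could presumably
be overridden in the same way. A sketch only, mirroring the EVT
implementation in this patch (deliberately not part of this commit):

  // Hypothetical future LLT override, not included in this patch.
  bool LoongArchTargetLowering::allowsMisalignedMemoryAccesses(
      LLT Ty, unsigned AddrSpace, Align Alignment,
      MachineMemOperand::Flags Flags, unsigned *Fast) const {
    if (!Subtarget.hasUAL())
      return false;
    if (Fast)
      *Fast = 1; // TODO: same placeholder speed number as the EVT variant.
    return true;
  }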

[1]: https://github.com/torvalds/linux/blob/master/arch/loongarch/include/asm/cpu.h#L77
[2]: https://github.com/torvalds/linux/blob/master/arch/loongarch/kernel/proc.c#L75

Reviewed By: xen0n

Differential Revision: https://reviews.llvm.org/D149946
commit 47601815ec (parent 8dd28c5682)
Author: Weining Lu
Date:   2023-06-07 11:20:30 +08:00

21 changed files with 314 additions and 82 deletions


@@ -3697,12 +3697,14 @@ def mrvv_vector_bits_EQ : Joined<["-"], "mrvv-vector-bits=">, Group<m_Group>,
            "to use the value implied by -march/-mcpu. Value will be reflected "
            "in __riscv_v_fixed_vlen preprocessor define (RISC-V only)">;
-def munaligned_access : Flag<["-"], "munaligned-access">, Group<m_arm_Features_Group>,
-  HelpText<"Allow memory accesses to be unaligned (AArch32/AArch64 only)">;
-def mno_unaligned_access : Flag<["-"], "mno-unaligned-access">, Group<m_arm_Features_Group>,
-  HelpText<"Force all memory accesses to be aligned (AArch32/AArch64 only)">;
+def munaligned_access : Flag<["-"], "munaligned-access">, Group<m_Group>,
+  HelpText<"Allow memory accesses to be unaligned (AArch32/AArch64/LoongArch only)">;
+def mno_unaligned_access : Flag<["-"], "mno-unaligned-access">, Group<m_Group>,
+  HelpText<"Force all memory accesses to be aligned (AArch32/AArch64/LoongArch only)">;
 def mstrict_align : Flag<["-"], "mstrict-align">, Alias<mno_unaligned_access>, Flags<[CC1Option,HelpHidden]>,
   HelpText<"Force all memory accesses to be aligned (same as mno-unaligned-access)">;
+def mno_strict_align : Flag<["-"], "mno-strict-align">, Alias<munaligned_access>, Flags<[CC1Option,HelpHidden]>,
+  HelpText<"Allow memory accesses to be unaligned (same as munaligned-access)">;
 def mno_thumb : Flag<["-"], "mno-thumb">, Group<m_arm_Features_Group>;
 def mrestrict_it: Flag<["-"], "mrestrict-it">, Group<m_arm_Features_Group>,
   HelpText<"Disallow generation of complex IT blocks.">;


@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//

 #include "LoongArch.h"
 #include "ToolChains/CommonArgs.h"
+#include "clang/Basic/DiagnosticDriver.h"
 #include "clang/Driver/Driver.h"
 #include "clang/Driver/DriverDiagnostic.h"
@@ -133,4 +134,9 @@ void loongarch::getLoongArchTargetFeatures(const Driver &D,
       D.Diag(diag::err_drv_loongarch_invalid_mfpu_EQ) << FPU;
     }
   }
+
+  // Select the `ual` feature determined by -m[no-]unaligned-access
+  // or the alias -m[no-]strict-align.
+  AddTargetFeature(Args, Features, options::OPT_munaligned_access,
+                   options::OPT_mno_unaligned_access, "ual");
 }


@@ -2,7 +2,7 @@
 // RUN: %clang --target=loongarch64 -S -emit-llvm %s -o - | FileCheck %s --check-prefix=LA64

 // LA32: "target-features"="+32bit"
-// LA64: "target-features"="+64bit,+d,+f"
+// LA64: "target-features"="+64bit,+d,+f,+ual"

 int foo(void) {
   return 3;

@@ -8,17 +8,17 @@
 // RUN:   FileCheck %s --check-prefix=IR-LA464

 // CC1-LOONGARCH64-NOT: "-target-feature"
-// CC1-LOONGARCH64: "-target-feature" "+64bit" "-target-feature" "+f" "-target-feature" "+d"
+// CC1-LOONGARCH64: "-target-feature" "+64bit" "-target-feature" "+f" "-target-feature" "+d" "-target-feature" "+ual"
 // CC1-LOONGARCH64-NOT: "-target-feature"
 // CC1-LOONGARCH64: "-target-abi" "lp64d"

 // CC1-LA464-NOT: "-target-feature"
-// CC1-LA464: "-target-feature" "+64bit" "-target-feature" "+f" "-target-feature" "+d" "-target-feature" "+lsx" "-target-feature" "+lasx"
+// CC1-LA464: "-target-feature" "+64bit" "-target-feature" "+f" "-target-feature" "+d" "-target-feature" "+lsx" "-target-feature" "+lasx" "-target-feature" "+ual"
 // CC1-LA464-NOT: "-target-feature"
 // CC1-LA464: "-target-abi" "lp64d"

-// IR-LOONGARCH64: attributes #[[#]] ={{.*}}"target-features"="+64bit,+d,+f"
-// IR-LA464: attributes #[[#]] ={{.*}}"target-features"="+64bit,+d,+f,+lasx,+lsx"
+// IR-LOONGARCH64: attributes #[[#]] ={{.*}}"target-features"="+64bit,+d,+f,+ual"
+// IR-LA464: attributes #[[#]] ={{.*}}"target-features"="+64bit,+d,+f,+lasx,+lsx,+ual"

 int foo(void) {
   return 3;


@@ -8,12 +8,10 @@
 // WARN: warning: argument unused during compilation: '-mfpu=0'
 // WARN: warning: argument unused during compilation: '-mabi=lp64s'

-// CC1-NOT: "-target-feature"
-// CC1: "-target-feature" "+64bit" "-target-feature" "+f" "-target-feature" "+d"
-// CC1-NOT: "-target-feature"
+// CC1: "-target-feature" "+f"{{.*}} "-target-feature" "+d"
 // CC1: "-target-abi" "lp64d"

-// IR: attributes #[[#]] ={{.*}}"target-features"="+64bit,+d,+f"
+// IR: attributes #[[#]] ={{.*}}"target-features"="{{(.*,)?}}+d,{{(.*,)?}}+f{{(,.*)?}}"

 int foo(void) {
   return 3;


@@ -16,24 +16,18 @@
 // RUN: %clang --target=loongarch64 -mfpu=none -S -emit-llvm %s -o - | \
 // RUN:   FileCheck %s --check-prefix=IR-FPU0

-// CC1-FPU64-NOT: "-target-feature"
-// CC1-FPU64: "-target-feature" "+64bit" "-target-feature" "+f" "-target-feature" "+d"
-// CC1-FPU64-NOT: "-target-feature"
+// CC1-FPU64: "-target-feature" "+f"{{.*}} "-target-feature" "+d"
 // CC1-FPU64: "-target-abi" "lp64d"

-// CC1-FPU32-NOT: "-target-feature"
-// CC1-FPU32: "-target-feature" "+64bit" "-target-feature" "+f" "-target-feature" "-d"
-// CC1-FPU32-NOT: "-target-feature"
+// CC1-FPU32: "-target-feature" "+f"{{.*}} "-target-feature" "-d"
 // CC1-FPU32: "-target-abi" "lp64f"

-// CC1-FPU0-NOT: "-target-feature"
-// CC1-FPU0: "-target-feature" "+64bit" "-target-feature" "-f" "-target-feature" "-d"
-// CC1-FPU0-NOT: "-target-feature"
+// CC1-FPU0: "-target-feature" "-f"{{.*}} "-target-feature" "-d"
 // CC1-FPU0: "-target-abi" "lp64s"

-// IR-FPU64: attributes #[[#]] ={{.*}}"target-features"="+64bit,+d,+f"
-// IR-FPU32: attributes #[[#]] ={{.*}}"target-features"="+64bit,+f,-d"
-// IR-FPU0: attributes #[[#]] ={{.*}}"target-features"="+64bit,-d,-f"
+// IR-FPU64: attributes #[[#]] ={{.*}}"target-features"="{{(.*,)?}}+d,{{(.*,)?}}+f{{(,.*)?}}"
+// IR-FPU32: attributes #[[#]] ={{.*}}"target-features"="{{(.*,)?}}+f,{{(.*,)?}}-d{{(,.*)?}}"
+// IR-FPU0: attributes #[[#]] ={{.*}}"target-features"="{{(.*,)?}}-d,{{(.*,)?}}-f{{(,.*)?}}"

 int foo(void) {
   return 3;


@@ -8,12 +8,10 @@
 // WARN: warning: argument unused during compilation: '-mfpu=0'
 // WARN: warning: argument unused during compilation: '-mabi=lp64s'

-// CC1-NOT: "-target-feature"
-// CC1: "-target-feature" "+64bit" "-target-feature" "+f" "-target-feature" "-d"
-// CC1-NOT: "-target-feature"
+// CC1: "-target-feature" "+f"{{.*}} "-target-feature" "-d"
 // CC1: "-target-abi" "lp64f"

-// IR: attributes #[[#]] ={{.*}}"target-features"="+64bit,+f,-d"
+// IR: attributes #[[#]] ={{.*}}"target-features"="{{(.*,)?}}+f,{{(.*,)?}}-d"

 int foo(void) {
   return 3;


@@ -8,12 +8,10 @@
 // WARN: warning: argument unused during compilation: '-mfpu=64'
 // WARN: warning: argument unused during compilation: '-mabi=lp64d'

-// CC1-NOT: "-target-feature"
-// CC1: "-target-feature" "+64bit" "-target-feature" "-f" "-target-feature" "-d"
-// CC1-NOT: "-target-feature"
+// CC1: "-target-feature" "-f"{{.*}} "-target-feature" "-d"
 // CC1: "-target-abi" "lp64s"

-// IR: attributes #[[#]] ={{.*}}"target-features"="+64bit,-d,-f"
+// IR: attributes #[[#]] ={{.*}}"target-features"="{{(.*,)?}}-d,{{(.*,)?}}-f{{(,.*)?}}"

 int foo(void) {
   return 3;


@@ -0,0 +1,61 @@
/// Test -m[no-]unaligned-access and -m[no-]strict-align options.
// RUN: %clang --target=loongarch64 -munaligned-access -fsyntax-only %s -### 2>&1 | \
// RUN: FileCheck %s --check-prefix=CC1-UNALIGNED
// RUN: %clang --target=loongarch64 -mno-unaligned-access -fsyntax-only %s -### 2>&1 | \
// RUN: FileCheck %s --check-prefix=CC1-NO-UNALIGNED
// RUN: %clang --target=loongarch64 -mstrict-align -fsyntax-only %s -### 2>&1 | \
// RUN: FileCheck %s --check-prefix=CC1-NO-UNALIGNED
// RUN: %clang --target=loongarch64 -mno-strict-align -fsyntax-only %s -### 2>&1 | \
// RUN: FileCheck %s --check-prefix=CC1-UNALIGNED
// RUN: %clang --target=loongarch64 -munaligned-access -mno-unaligned-access -fsyntax-only %s -### 2>&1 | \
// RUN: FileCheck %s --check-prefix=CC1-NO-UNALIGNED
// RUN: %clang --target=loongarch64 -mno-unaligned-access -munaligned-access -fsyntax-only %s -### 2>&1 | \
// RUN: FileCheck %s --check-prefix=CC1-UNALIGNED
// RUN: %clang --target=loongarch64 -mstrict-align -mno-strict-align -fsyntax-only %s -### 2>&1 | \
// RUN: FileCheck %s --check-prefix=CC1-UNALIGNED
// RUN: %clang --target=loongarch64 -mno-strict-align -mstrict-align -fsyntax-only %s -### 2>&1 | \
// RUN: FileCheck %s --check-prefix=CC1-NO-UNALIGNED
// RUN: %clang --target=loongarch64 -munaligned-access -mstrict-align -fsyntax-only %s -### 2>&1 | \
// RUN: FileCheck %s --check-prefix=CC1-NO-UNALIGNED
// RUN: %clang --target=loongarch64 -mstrict-align -munaligned-access -fsyntax-only %s -### 2>&1 | \
// RUN: FileCheck %s --check-prefix=CC1-UNALIGNED
// RUN: %clang --target=loongarch64 -mno-unaligned-access -mno-strict-align -fsyntax-only %s -### 2>&1 | \
// RUN: FileCheck %s --check-prefix=CC1-UNALIGNED
// RUN: %clang --target=loongarch64 -mno-strict-align -mno-unaligned-access -fsyntax-only %s -### 2>&1 | \
// RUN: FileCheck %s --check-prefix=CC1-NO-UNALIGNED
// RUN: %clang --target=loongarch64 -munaligned-access -S -emit-llvm %s -o - | \
// RUN: FileCheck %s --check-prefix=IR-UNALIGNED
// RUN: %clang --target=loongarch64 -mno-unaligned-access -S -emit-llvm %s -o - | \
// RUN: FileCheck %s --check-prefix=IR-NO-UNALIGNED
// RUN: %clang --target=loongarch64 -mstrict-align -S -emit-llvm %s -o - | \
// RUN: FileCheck %s --check-prefix=IR-NO-UNALIGNED
// RUN: %clang --target=loongarch64 -mno-strict-align -S -emit-llvm %s -o - | \
// RUN: FileCheck %s --check-prefix=IR-UNALIGNED
// RUN: %clang --target=loongarch64 -munaligned-access -mno-unaligned-access -S -emit-llvm %s -o - | \
// RUN: FileCheck %s --check-prefix=IR-NO-UNALIGNED
// RUN: %clang --target=loongarch64 -mno-unaligned-access -munaligned-access -S -emit-llvm %s -o - | \
// RUN: FileCheck %s --check-prefix=IR-UNALIGNED
// RUN: %clang --target=loongarch64 -mstrict-align -mno-strict-align -S -emit-llvm %s -o - | \
// RUN: FileCheck %s --check-prefix=IR-UNALIGNED
// RUN: %clang --target=loongarch64 -mno-strict-align -mstrict-align -S -emit-llvm %s -o - | \
// RUN: FileCheck %s --check-prefix=IR-NO-UNALIGNED
// RUN: %clang --target=loongarch64 -munaligned-access -mstrict-align -S -emit-llvm %s -o - | \
// RUN: FileCheck %s --check-prefix=IR-NO-UNALIGNED
// RUN: %clang --target=loongarch64 -mstrict-align -munaligned-access -S -emit-llvm %s -o - | \
// RUN: FileCheck %s --check-prefix=IR-UNALIGNED
// RUN: %clang --target=loongarch64 -mno-unaligned-access -mno-strict-align -S -emit-llvm %s -o - | \
// RUN: FileCheck %s --check-prefix=IR-UNALIGNED
// RUN: %clang --target=loongarch64 -mno-strict-align -mno-unaligned-access -S -emit-llvm %s -o - | \
// RUN: FileCheck %s --check-prefix=IR-NO-UNALIGNED
// CC1-UNALIGNED: "-target-feature" "+ual"
// CC1-NO-UNALIGNED: "-target-feature" "-ual"
// IR-UNALIGNED: attributes #[[#]] ={{.*}}"target-features"="{{(.*,)?}}+ual{{(,.*)?}}"
// IR-NO-UNALIGNED: attributes #[[#]] ={{.*}}"target-features"="{{(.*,)?}}-ual{{(,.*)?}}"
int foo(void) {
return 3;
}


@@ -0,0 +1,8 @@
/// Check -m[no-]unaligned-access and -m[no-]strict-align are warned unused on a target that does not support them.
// RUN: %clang --target=x86_64 -munaligned-access -fsyntax-only %s -### 2>&1 | FileCheck %s -DOPTION=unaligned-access
// RUN: %clang --target=x86_64 -mno-unaligned-access -fsyntax-only %s -### 2>&1 | FileCheck %s -DOPTION=no-unaligned-access
// RUN: %clang --target=x86_64 -mstrict-align -fsyntax-only %s -### 2>&1 | FileCheck %s -DOPTION=strict-align
// RUN: %clang --target=x86_64 -mno-strict-align -fsyntax-only %s -### 2>&1 | FileCheck %s -DOPTION=no-strict-align
// CHECK: clang: warning: argument unused during compilation: '-m[[OPTION]]' [-Wunused-command-line-argument]


@@ -11,6 +11,7 @@ LOONGARCH_FEATURE("+lsx", FK_LSX)
 LOONGARCH_FEATURE("+lasx", FK_LASX)
 LOONGARCH_FEATURE("+lbt", FK_LBT)
 LOONGARCH_FEATURE("+lvz", FK_LVZ)
+LOONGARCH_FEATURE("+ual", FK_UAL)

 #undef LOONGARCH_FEATURE

@@ -19,7 +20,7 @@ LOONGARCH_FEATURE("+lvz", FK_LVZ)
 #endif

 LOONGARCH_ARCH("invalid", AK_INVALID, FK_INVALID)
-LOONGARCH_ARCH("loongarch64", AK_LOONGARCH64, FK_64BIT | FK_FP32 | FK_FP64)
-LOONGARCH_ARCH("la464", AK_LA464, FK_64BIT | FK_FP32 | FK_FP64 | FK_LSX | FK_LASX)
+LOONGARCH_ARCH("loongarch64", AK_LOONGARCH64, FK_64BIT | FK_FP32 | FK_FP64 | FK_UAL)
+LOONGARCH_ARCH("la464", AK_LA464, FK_64BIT | FK_FP32 | FK_FP64 | FK_LSX | FK_LASX | FK_UAL)

 #undef LOONGARCH_ARCH


@@ -46,6 +46,9 @@ enum FeatureKind : uint32_t {

   // Loongson Virtualization Extension is available.
   FK_LVZ = 1 << 7,
+
+  // Allow memory accesses to be unaligned.
+  FK_UAL = 1 << 8,
 };

 struct FeatureInfo {


@@ -115,6 +115,11 @@ def HasLaLocalWithAbs
       AssemblerPredicate<(all_of LaLocalWithAbs),
                          "Expand la.local as la.abs">;

+// Unaligned memory access
+def FeatureUAL
+    : SubtargetFeature<"ual", "HasUAL", "true",
+                       "Allow memory accesses to be unaligned">;
+
 //===----------------------------------------------------------------------===//
 // Registers, instruction descriptions ...
 //===----------------------------------------------------------------------===//
@@ -128,13 +133,14 @@ include "LoongArchInstrInfo.td"
 //===----------------------------------------------------------------------===//

 def : ProcessorModel<"generic-la32", NoSchedModel, [Feature32Bit]>;
-def : ProcessorModel<"generic-la64", NoSchedModel, [Feature64Bit]>;
+def : ProcessorModel<"generic-la64", NoSchedModel, [Feature64Bit, FeatureUAL]>;

 // Support generic for compatibility with other targets. The triple will be used
 // to change to the appropriate la32/la64 version.
 def : ProcessorModel<"generic", NoSchedModel, []>;

 def : ProcessorModel<"la464", NoSchedModel, [Feature64Bit,
+                                             FeatureUAL,
                                              FeatureExtLASX,
                                              FeatureExtLVZ,
                                              FeatureExtLBT]>;


@@ -1785,6 +1785,18 @@ MachineBasicBlock *LoongArchTargetLowering::EmitInstrWithCustomInserter(
   }
 }

+bool LoongArchTargetLowering::allowsMisalignedMemoryAccesses(
+    EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
+    unsigned *Fast) const {
+  if (!Subtarget.hasUAL())
+    return false;
+
+  // TODO: set reasonable speed number.
+  if (Fast)
+    *Fast = 1;
+  return true;
+}
+
 const char *LoongArchTargetLowering::getTargetNodeName(unsigned Opcode) const {
   switch ((LoongArchISD::NodeType)Opcode) {
   case LoongArchISD::FIRST_NUMBER:


@@ -191,6 +191,11 @@ public:

   bool convertSelectOfConstantsToMath(EVT VT) const override { return true; }

+  bool allowsMisalignedMemoryAccesses(
+      EVT VT, unsigned AddrSpace = 0, Align Alignment = Align(1),
+      MachineMemOperand::Flags Flags = MachineMemOperand::MONone,
+      unsigned *Fast = nullptr) const override;
+
 private:
   /// Target-specific function used to lower LoongArch calling conventions.
   typedef bool LoongArchCCAssignFn(const DataLayout &DL, LoongArchABI::ABI ABI,


@@ -42,6 +42,7 @@ class LoongArchSubtarget : public LoongArchGenSubtargetInfo {
   bool HasLaGlobalWithPcrel = false;
   bool HasLaGlobalWithAbs = false;
   bool HasLaLocalWithAbs = false;
+  bool HasUAL = false;
   unsigned GRLen = 32;
   MVT GRLenVT = MVT::i32;
   LoongArchABI::ABI TargetABI = LoongArchABI::ABI_Unknown;
@@ -91,6 +92,7 @@ public:
   bool hasLaGlobalWithPcrel() const { return HasLaGlobalWithPcrel; }
   bool hasLaGlobalWithAbs() const { return HasLaGlobalWithAbs; }
   bool hasLaLocalWithAbs() const { return HasLaLocalWithAbs; }
+  bool hasUAL() const { return HasUAL; }
   MVT getGRLenVT() const { return GRLenVT; }
   unsigned getGRLen() const { return GRLen; }
   LoongArchABI::ABI getTargetABI() const { return TargetABI; }


@@ -360,17 +360,13 @@ define void @callee_large_struct_ret(ptr noalias sret(%struct.large) %agg.result
 ; CHECK-LABEL: callee_large_struct_ret:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    ori $a1, $zero, 4
-; CHECK-NEXT:    st.w $a1, $a0, 24
+; CHECK-NEXT:    st.d $a1, $a0, 24
 ; CHECK-NEXT:    ori $a1, $zero, 3
-; CHECK-NEXT:    st.w $a1, $a0, 16
+; CHECK-NEXT:    st.d $a1, $a0, 16
 ; CHECK-NEXT:    ori $a1, $zero, 2
-; CHECK-NEXT:    st.w $a1, $a0, 8
-; CHECK-NEXT:    st.w $zero, $a0, 28
-; CHECK-NEXT:    st.w $zero, $a0, 20
-; CHECK-NEXT:    st.w $zero, $a0, 12
-; CHECK-NEXT:    st.w $zero, $a0, 4
+; CHECK-NEXT:    st.d $a1, $a0, 8
 ; CHECK-NEXT:    ori $a1, $zero, 1
-; CHECK-NEXT:    st.w $a1, $a0, 0
+; CHECK-NEXT:    st.d $a1, $a0, 0
 ; CHECK-NEXT:    ret
   %a = getelementptr inbounds %struct.large, ptr %agg.result, i64 0, i32 0
   store i64 1, ptr %a, align 4


@@ -315,10 +315,7 @@ define double @double_fadd_acquire(ptr %p) nounwind {
 ; LA64F-NEXT:    st.d $s2, $sp, 24 # 8-byte Folded Spill
 ; LA64F-NEXT:    st.d $s3, $sp, 16 # 8-byte Folded Spill
 ; LA64F-NEXT:    move $fp, $a0
-; LA64F-NEXT:    ld.wu $a0, $a0, 0
-; LA64F-NEXT:    ld.wu $a1, $fp, 4
-; LA64F-NEXT:    slli.d $a1, $a1, 32
-; LA64F-NEXT:    or $a0, $a1, $a0
+; LA64F-NEXT:    ld.d $a0, $a0, 0
 ; LA64F-NEXT:    ori $s0, $zero, 8
 ; LA64F-NEXT:    addi.d $s1, $sp, 8
 ; LA64F-NEXT:    addi.d $s2, $sp, 0
@@ -360,11 +357,7 @@ define double @double_fadd_acquire(ptr %p) nounwind {
 ; LA64D-NEXT:    st.d $s3, $sp, 32 # 8-byte Folded Spill
 ; LA64D-NEXT:    fst.d $fs0, $sp, 24 # 8-byte Folded Spill
 ; LA64D-NEXT:    move $fp, $a0
-; LA64D-NEXT:    ld.wu $a0, $a0, 0
-; LA64D-NEXT:    ld.wu $a1, $fp, 4
-; LA64D-NEXT:    slli.d $a1, $a1, 32
-; LA64D-NEXT:    or $a0, $a1, $a0
-; LA64D-NEXT:    movgr2fr.d $fa0, $a0
+; LA64D-NEXT:    fld.d $fa0, $a0, 0
 ; LA64D-NEXT:    addi.d $a0, $zero, 1
 ; LA64D-NEXT:    movgr2fr.d $fs0, $a0
 ; LA64D-NEXT:    ori $s0, $zero, 8
@@ -411,10 +404,7 @@ define double @double_fsub_acquire(ptr %p) nounwind {
 ; LA64F-NEXT:    st.d $s2, $sp, 24 # 8-byte Folded Spill
 ; LA64F-NEXT:    st.d $s3, $sp, 16 # 8-byte Folded Spill
 ; LA64F-NEXT:    move $fp, $a0
-; LA64F-NEXT:    ld.wu $a0, $a0, 0
-; LA64F-NEXT:    ld.wu $a1, $fp, 4
-; LA64F-NEXT:    slli.d $a1, $a1, 32
-; LA64F-NEXT:    or $a0, $a1, $a0
+; LA64F-NEXT:    ld.d $a0, $a0, 0
 ; LA64F-NEXT:    ori $s0, $zero, 8
 ; LA64F-NEXT:    addi.d $s1, $sp, 8
 ; LA64F-NEXT:    addi.d $s2, $sp, 0
@@ -456,11 +446,7 @@ define double @double_fsub_acquire(ptr %p) nounwind {
 ; LA64D-NEXT:    st.d $s3, $sp, 32 # 8-byte Folded Spill
 ; LA64D-NEXT:    fst.d $fs0, $sp, 24 # 8-byte Folded Spill
 ; LA64D-NEXT:    move $fp, $a0
-; LA64D-NEXT:    ld.wu $a0, $a0, 0
-; LA64D-NEXT:    ld.wu $a1, $fp, 4
-; LA64D-NEXT:    slli.d $a1, $a1, 32
-; LA64D-NEXT:    or $a0, $a1, $a0
-; LA64D-NEXT:    movgr2fr.d $fa0, $a0
+; LA64D-NEXT:    fld.d $fa0, $a0, 0
 ; LA64D-NEXT:    pcalau12i $a0, %pc_hi20(.LCPI5_0)
 ; LA64D-NEXT:    addi.d $a0, $a0, %pc_lo12(.LCPI5_0)
 ; LA64D-NEXT:    fld.d $fs0, $a0, 0
@@ -507,10 +493,7 @@ define double @double_fmin_acquire(ptr %p) nounwind {
 ; LA64F-NEXT:    st.d $s2, $sp, 24 # 8-byte Folded Spill
 ; LA64F-NEXT:    st.d $s3, $sp, 16 # 8-byte Folded Spill
 ; LA64F-NEXT:    move $fp, $a0
-; LA64F-NEXT:    ld.wu $a0, $a0, 0
-; LA64F-NEXT:    ld.wu $a1, $fp, 4
-; LA64F-NEXT:    slli.d $a1, $a1, 32
-; LA64F-NEXT:    or $a0, $a1, $a0
+; LA64F-NEXT:    ld.d $a0, $a0, 0
 ; LA64F-NEXT:    ori $s0, $zero, 8
 ; LA64F-NEXT:    addi.d $s1, $sp, 8
 ; LA64F-NEXT:    addi.d $s2, $sp, 0
@@ -552,11 +535,7 @@ define double @double_fmin_acquire(ptr %p) nounwind {
 ; LA64D-NEXT:    st.d $s3, $sp, 32 # 8-byte Folded Spill
 ; LA64D-NEXT:    fst.d $fs0, $sp, 24 # 8-byte Folded Spill
 ; LA64D-NEXT:    move $fp, $a0
-; LA64D-NEXT:    ld.wu $a0, $a0, 0
-; LA64D-NEXT:    ld.wu $a1, $fp, 4
-; LA64D-NEXT:    slli.d $a1, $a1, 32
-; LA64D-NEXT:    or $a0, $a1, $a0
-; LA64D-NEXT:    movgr2fr.d $fa0, $a0
+; LA64D-NEXT:    fld.d $fa0, $a0, 0
 ; LA64D-NEXT:    addi.d $a0, $zero, 1
 ; LA64D-NEXT:    movgr2fr.d $fs0, $a0
 ; LA64D-NEXT:    ori $s0, $zero, 8
@@ -604,10 +583,7 @@ define double @double_fmax_acquire(ptr %p) nounwind {
 ; LA64F-NEXT:    st.d $s2, $sp, 24 # 8-byte Folded Spill
 ; LA64F-NEXT:    st.d $s3, $sp, 16 # 8-byte Folded Spill
 ; LA64F-NEXT:    move $fp, $a0
-; LA64F-NEXT:    ld.wu $a0, $a0, 0
-; LA64F-NEXT:    ld.wu $a1, $fp, 4
-; LA64F-NEXT:    slli.d $a1, $a1, 32
-; LA64F-NEXT:    or $a0, $a1, $a0
+; LA64F-NEXT:    ld.d $a0, $a0, 0
 ; LA64F-NEXT:    ori $s0, $zero, 8
 ; LA64F-NEXT:    addi.d $s1, $sp, 8
 ; LA64F-NEXT:    addi.d $s2, $sp, 0
@@ -649,11 +625,7 @@ define double @double_fmax_acquire(ptr %p) nounwind {
 ; LA64D-NEXT:    st.d $s3, $sp, 32 # 8-byte Folded Spill
 ; LA64D-NEXT:    fst.d $fs0, $sp, 24 # 8-byte Folded Spill
 ; LA64D-NEXT:    move $fp, $a0
-; LA64D-NEXT:    ld.wu $a0, $a0, 0
-; LA64D-NEXT:    ld.wu $a1, $fp, 4
-; LA64D-NEXT:    slli.d $a1, $a1, 32
-; LA64D-NEXT:    or $a0, $a1, $a0
-; LA64D-NEXT:    movgr2fr.d $fa0, $a0
+; LA64D-NEXT:    fld.d $fa0, $a0, 0
 ; LA64D-NEXT:    addi.d $a0, $zero, 1
 ; LA64D-NEXT:    movgr2fr.d $fs0, $a0
 ; LA64D-NEXT:    ori $s0, $zero, 8


@@ -13,6 +13,7 @@ entry:
 }

 ;; Perform tail call optimization for external symbol.
+;; Bytes copied should be large enough, otherwise the memcpy call would be optimized to multiple ld/st insns.
 @dest = global [2 x i8] zeroinitializer
 declare void @llvm.memcpy.p0i8.p0i8.i32(ptr, ptr, i32, i1)
 define void @caller_extern(ptr %src) optsize {
@@ -21,10 +22,10 @@ define void @caller_extern(ptr %src) optsize {
 ; CHECK-NEXT:    move $a1, $a0
 ; CHECK-NEXT:    pcalau12i $a0, %got_pc_hi20(dest)
 ; CHECK-NEXT:    ld.d $a0, $a0, %got_pc_lo12(dest)
-; CHECK-NEXT:    ori $a2, $zero, 7
+; CHECK-NEXT:    ori $a2, $zero, 33
 ; CHECK-NEXT:    b %plt(memcpy)
 entry:
-  tail call void @llvm.memcpy.p0i8.p0i8.i32(ptr getelementptr inbounds ([2 x i8], ptr @dest, i32 0, i32 0), ptr %src, i32 7, i1 false)
+  tail call void @llvm.memcpy.p0i8.p0i8.i32(ptr getelementptr inbounds ([2 x i8], ptr @dest, i32 0, i32 0), ptr %src, i32 33, i1 false)
   ret void
 }


@@ -0,0 +1,72 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
;; Test the ual feature which is similar to AArch64/arm64-strict-align.ll.
; RUN: llc --mtriple=loongarch32 < %s | FileCheck %s --check-prefix=LA32-ALIGNED
; RUN: llc --mtriple=loongarch32 --mattr=+ual < %s | FileCheck %s --check-prefix=LA32-UNALIGNED
; RUN: llc --mtriple=loongarch32 --mattr=-ual < %s | FileCheck %s --check-prefix=LA32-ALIGNED
; RUN: llc --mtriple=loongarch64 < %s | FileCheck %s --check-prefix=LA64-UNALIGNED
; RUN: llc --mtriple=loongarch64 --mattr=+ual < %s | FileCheck %s --check-prefix=LA64-UNALIGNED
; RUN: llc --mtriple=loongarch64 --mattr=-ual < %s | FileCheck %s --check-prefix=LA64-ALIGNED
define i32 @f0(ptr %p) nounwind {
; LA32-ALIGNED-LABEL: f0:
; LA32-ALIGNED: # %bb.0:
; LA32-ALIGNED-NEXT: ld.hu $a1, $a0, 0
; LA32-ALIGNED-NEXT: ld.hu $a0, $a0, 2
; LA32-ALIGNED-NEXT: slli.w $a0, $a0, 16
; LA32-ALIGNED-NEXT: or $a0, $a0, $a1
; LA32-ALIGNED-NEXT: ret
;
; LA32-UNALIGNED-LABEL: f0:
; LA32-UNALIGNED: # %bb.0:
; LA32-UNALIGNED-NEXT: ld.w $a0, $a0, 0
; LA32-UNALIGNED-NEXT: ret
;
; LA64-UNALIGNED-LABEL: f0:
; LA64-UNALIGNED: # %bb.0:
; LA64-UNALIGNED-NEXT: ld.w $a0, $a0, 0
; LA64-UNALIGNED-NEXT: ret
;
; LA64-ALIGNED-LABEL: f0:
; LA64-ALIGNED: # %bb.0:
; LA64-ALIGNED-NEXT: ld.hu $a1, $a0, 0
; LA64-ALIGNED-NEXT: ld.h $a0, $a0, 2
; LA64-ALIGNED-NEXT: slli.d $a0, $a0, 16
; LA64-ALIGNED-NEXT: or $a0, $a0, $a1
; LA64-ALIGNED-NEXT: ret
%tmp = load i32, ptr %p, align 2
ret i32 %tmp
}
define i64 @f1(ptr %p) nounwind {
; LA32-ALIGNED-LABEL: f1:
; LA32-ALIGNED: # %bb.0:
; LA32-ALIGNED-NEXT: ld.w $a2, $a0, 0
; LA32-ALIGNED-NEXT: ld.w $a1, $a0, 4
; LA32-ALIGNED-NEXT: move $a0, $a2
; LA32-ALIGNED-NEXT: ret
;
; LA32-UNALIGNED-LABEL: f1:
; LA32-UNALIGNED: # %bb.0:
; LA32-UNALIGNED-NEXT: ld.w $a2, $a0, 0
; LA32-UNALIGNED-NEXT: ld.w $a1, $a0, 4
; LA32-UNALIGNED-NEXT: move $a0, $a2
; LA32-UNALIGNED-NEXT: ret
;
; LA64-UNALIGNED-LABEL: f1:
; LA64-UNALIGNED: # %bb.0:
; LA64-UNALIGNED-NEXT: ld.d $a0, $a0, 0
; LA64-UNALIGNED-NEXT: ret
;
; LA64-ALIGNED-LABEL: f1:
; LA64-ALIGNED: # %bb.0:
; LA64-ALIGNED-NEXT: ld.wu $a1, $a0, 0
; LA64-ALIGNED-NEXT: ld.wu $a0, $a0, 4
; LA64-ALIGNED-NEXT: slli.d $a0, $a0, 32
; LA64-ALIGNED-NEXT: or $a0, $a0, $a1
; LA64-ALIGNED-NEXT: ret
%tmp = load i64, ptr %p, align 4
ret i64 %tmp
}


@@ -0,0 +1,97 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
;; Test how memcpy is optimized when ual is turned off which is similar to AArch64/arm64-misaligned-memcpy-inline.ll.
; RUN: llc --mtriple=loongarch32 --mattr=-ual < %s | FileCheck %s --check-prefix=LA32
; RUN: llc --mtriple=loongarch64 --mattr=-ual < %s | FileCheck %s --check-prefix=LA64
;; Small (16 bytes here) unaligned memcpy() should be a function call if
;; ual is turned off.
define void @t0(ptr %out, ptr %in) {
; LA32-LABEL: t0:
; LA32: # %bb.0: # %entry
; LA32-NEXT: addi.w $sp, $sp, -16
; LA32-NEXT: .cfi_def_cfa_offset 16
; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
; LA32-NEXT: .cfi_offset 1, -4
; LA32-NEXT: ori $a2, $zero, 16
; LA32-NEXT: bl %plt(memcpy)
; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
; LA32-NEXT: addi.w $sp, $sp, 16
; LA32-NEXT: ret
;
; LA64-LABEL: t0:
; LA64: # %bb.0: # %entry
; LA64-NEXT: addi.d $sp, $sp, -16
; LA64-NEXT: .cfi_def_cfa_offset 16
; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill
; LA64-NEXT: .cfi_offset 1, -8
; LA64-NEXT: ori $a2, $zero, 16
; LA64-NEXT: bl %plt(memcpy)
; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload
; LA64-NEXT: addi.d $sp, $sp, 16
; LA64-NEXT: ret
entry:
call void @llvm.memcpy.p0.p0.i64(ptr %out, ptr %in, i64 16, i1 false)
ret void
}
;; Small (16 bytes here) aligned memcpy() should be inlined even if
;; ual is turned off.
define void @t1(ptr align 8 %out, ptr align 8 %in) {
; LA32-LABEL: t1:
; LA32: # %bb.0: # %entry
; LA32-NEXT: ld.w $a2, $a1, 12
; LA32-NEXT: st.w $a2, $a0, 12
; LA32-NEXT: ld.w $a2, $a1, 8
; LA32-NEXT: st.w $a2, $a0, 8
; LA32-NEXT: ld.w $a2, $a1, 4
; LA32-NEXT: st.w $a2, $a0, 4
; LA32-NEXT: ld.w $a1, $a1, 0
; LA32-NEXT: st.w $a1, $a0, 0
; LA32-NEXT: ret
;
; LA64-LABEL: t1:
; LA64: # %bb.0: # %entry
; LA64-NEXT: ld.d $a2, $a1, 8
; LA64-NEXT: st.d $a2, $a0, 8
; LA64-NEXT: ld.d $a1, $a1, 0
; LA64-NEXT: st.d $a1, $a0, 0
; LA64-NEXT: ret
entry:
call void @llvm.memcpy.p0.p0.i64(ptr align 8 %out, ptr align 8 %in, i64 16, i1 false)
ret void
}
;; Tiny (4 bytes here) unaligned memcpy() should be inlined with byte sized
;; loads and stores if ual is turned off.
define void @t2(ptr %out, ptr %in) {
; LA32-LABEL: t2:
; LA32: # %bb.0: # %entry
; LA32-NEXT: ld.b $a2, $a1, 3
; LA32-NEXT: st.b $a2, $a0, 3
; LA32-NEXT: ld.b $a2, $a1, 2
; LA32-NEXT: st.b $a2, $a0, 2
; LA32-NEXT: ld.b $a2, $a1, 1
; LA32-NEXT: st.b $a2, $a0, 1
; LA32-NEXT: ld.b $a1, $a1, 0
; LA32-NEXT: st.b $a1, $a0, 0
; LA32-NEXT: ret
;
; LA64-LABEL: t2:
; LA64: # %bb.0: # %entry
; LA64-NEXT: ld.b $a2, $a1, 3
; LA64-NEXT: st.b $a2, $a0, 3
; LA64-NEXT: ld.b $a2, $a1, 2
; LA64-NEXT: st.b $a2, $a0, 2
; LA64-NEXT: ld.b $a2, $a1, 1
; LA64-NEXT: st.b $a2, $a0, 1
; LA64-NEXT: ld.b $a1, $a1, 0
; LA64-NEXT: st.b $a1, $a0, 0
; LA64-NEXT: ret
entry:
call void @llvm.memcpy.p0.p0.i64(ptr %out, ptr %in, i64 4, i1 false)
ret void
}
declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture readonly, i64, i1)