Bug 1746631 - Implement integer gemm intrinsic functions. r=rhunt

- Implements 7 intrinsic functions
 - These intrinsics are only enabled for x86/x86-64 platform and for
   privileged extensions
 - These intrinsics should never be accessible to web-pages
   -- Added corresponding mochitest

Differential Revision: https://phabricator.services.mozilla.com/D136430
This commit is contained in:
Abhishek Aggarwal 2022-02-04 14:28:29 +00:00
parent 1342f2782a
commit dfde362b9e
10 changed files with 1000 additions and 8 deletions

View File

@ -0,0 +1,3 @@
[DEFAULT]
[test_unavailable_for_webpage.html]

View File

@ -0,0 +1,29 @@
<!DOCTYPE HTML>
<html>
<!--
https://bugzilla.mozilla.org/show_bug.cgi?id=1746631
Verifies that the privileged mozIntGemm extension is NOT exposed on the
WebAssembly namespace for ordinary web content.
-->
<head>
<meta charset="utf-8">
<title>Test for Mozilla integer gemm (1746631) -- Mozilla integer gemm shouldn't be available for web pages</title>
<script src="/tests/SimpleTest/SimpleTest.js"></script>
<link rel="stylesheet" type="text/css" href="/tests/SimpleTest/test.css"/>
</head>
<body>
<a target="_blank" href="https://bugzilla.mozilla.org/show_bug.cgi?id=1746631">Feature Test 1746631</a>
<p id="display"></p>
<div id="content" style="display: none">
</div>
<pre id="test">
<script type="text/javascript">
SimpleTest.waitForExplicitFinish();
// mozIntGemm is restricted to privileged extensions; a content page must
// never see it as a property of the WebAssembly namespace object.
const gemm = "mozIntGemm";
is(gemm in WebAssembly, false, `"WebAssembly.${gemm}" shouldn't be defined for web pages`);
SimpleTest.finish();
</script>
</pre>
</body>
</html>

View File

@ -161,6 +161,7 @@ MOCHITEST_MANIFESTS += [
"mochitest/gamepad/mochitest.ini",
"mochitest/general/mochitest.ini",
"mochitest/geolocation/mochitest.ini",
"mochitest/integer-gemm/mochitest.ini",
"mochitest/keyhandling/mochitest.ini",
"mochitest/localstorage/mochitest.ini",
"mochitest/orientation/mochitest.ini",

View File

@ -903,12 +903,10 @@ option(
)
@depends("--enable-wasm-moz-intgemm")
def wasm_moz_intgemm(value):
if not value:
return
return True
@depends("--enable-wasm-moz-intgemm", target)
def wasm_moz_intgemm(value, target):
if value and target.cpu in ("x86", "x86_64"):
return True
set_config("ENABLE_WASM_MOZ_INTGEMM", wasm_moz_intgemm)

View File

@ -0,0 +1,405 @@
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
* vim: set ts=8 sts=2 et sw=2 tw=80:
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at https://mozilla.org/MPL/2.0/.
*/
#include "intgemm/IntegerGemmIntrinsic.h"
#include "mozilla/CheckedInt.h"
#include <intgemm.h>
#include <utility>
#include "js/HeapAPI.h"
#include "vm/JSContext.h"
#include "wasm/WasmInstance.h"
#include "wasm/WasmLog.h"
#include "vm/ArrayBufferObject-inl.h"
static constexpr uint32_t ARRAY_ALIGNMENT = 64;
static constexpr uint32_t ROWS_A_MULTIPLIER = 1;
static constexpr uint32_t COLUMNS_A_MULTIPLIER = 64;
static constexpr uint32_t ROWS_B_MULTIPLIER = COLUMNS_A_MULTIPLIER;
static constexpr uint32_t COLUMNS_B_MULTIPLIER = 8;
static constexpr uint32_t SELECTED_COLUMNS_B_MULTIPLIER = 8;
// Report a wasm error (e.g. out-of-bounds or unreachable) on the given
// context. File-local helper; `static` gives it internal linkage so it
// cannot collide with symbols in other translation units.
static void ReportGemmError(JSContext* cx, const unsigned errorNumber) {
  JS_ReportErrorNumberASCII(cx, js::GetErrorMessage, nullptr, errorNumber);
}
// Return the byte length of the wasm memory that `memBase` points into.
// `memBase` must be the data pointer of a js::WasmArrayRawBuffer.
// File-local helper; `static` for internal linkage.
static size_t GetWasmRawBufferLength(const uint8_t* memBase) {
  const js::WasmArrayRawBuffer* rawBuf =
      js::WasmArrayRawBuffer::fromDataPtr(memBase);
  return rawBuf->byteLength();
}
// Check that a matrix dimension is a positive integral multiple of
// `sizeMultiplier`. Logs a diagnostic and returns false on failure.
// File-local helper; `static` for internal linkage.
static bool CheckMatrixDimension(JSContext* cx, uint32_t size,
                                 uint32_t sizeMultiplier) {
  // A valid size is a positive integral multiple of the multiplier.
  if ((size == 0) || (size % sizeMultiplier != 0)) {
    js::wasm::Log(
        cx, "Invalid dimension value:%" PRIu32 " (should be a multiple of %u)",
        size, sizeMultiplier);
    return false;
  }
  return true;
}
// Check that the span [input, input + inputSize) lies inside the wasm memory
// of size `wasmBufferSize`. `input` is an offset into wasm memory and
// `inputSize` the span length. Logs and returns false on failure.
// File-local helper; `static` for internal linkage.
//
// NOTE(review): the `>=` comparison also rejects a span that ends exactly at
// the end of wasm memory (input + inputSize == wasmBufferSize). Confirm
// whether that extra conservatism is intentional.
static bool CheckMatrixBound(JSContext* cx, uint32_t input, uint64_t inputSize,
                             size_t wasmBufferSize) {
  mozilla::CheckedUint64 inputUpperLimit(inputSize);
  inputUpperLimit += input;
  // Bound check fails if the size overflows or it spans outside wasm memory.
  if (!inputUpperLimit.isValid() ||
      (inputUpperLimit.value() >= (uint64_t)wasmBufferSize)) {
    js::wasm::Log(cx, "Memory out of wasm bounds for matrix:%" PRIu32, input);
    return false;
  }
  return true;
}
// Check that `input` (an offset into wasm memory) is ARRAY_ALIGNMENT-aligned
// and that [input, input + inputSize) lies inside the wasm memory. Logs and
// returns false on failure. File-local helper; `static` for internal linkage.
static bool CheckMatrixBoundAndAlignment(JSContext* cx, uint32_t input,
                                         uint64_t inputSize,
                                         size_t wasmBufferSize) {
  // Alignment check: It is sufficient to check alignment for the offset rather
  // than for the actual pointer within wasm memory (as long as following
  // assert is satisfied: wasm memory bases are at least page-aligned).
  static_assert(js::gc::PageSize >= ARRAY_ALIGNMENT,
                "PageSize should be bigger than Alignment");
  if (input % ARRAY_ALIGNMENT != 0) {
    js::wasm::Log(
        cx, "Unaligned access for matrix:%" PRIu32 " (should be %u aligned)",
        input, ARRAY_ALIGNMENT);
    return false;
  }

  // Check Bound
  return CheckMatrixBound(cx, input, inputSize, wasmBufferSize);
}
/* Implementation of the `int8_prepare_b` intrinsic.
 *
 * Validates matrix dimensions and wasm-memory offsets, then calls the
 * third-party intgemm library to quantize and reorder Input matrix B into
 * the CPU-dependent prepared-B format.
 *
 * Returns 0 on success; returns -1 with a pending exception on failure
 * (per FailureMode::FailOnNegI32).
 *
 * Note: `zeroPoint` is accepted for API symmetry but is not used below.
 */
int32_t js::intgemm::IntrI8PrepareB(wasm::Instance* instance,
                                    uint32_t inputMatrixB, float scale,
                                    float zeroPoint, uint32_t rowsB,
                                    uint32_t colsB, uint32_t outputMatrixB,
                                    uint8_t* memBase) {
  MOZ_ASSERT(wasm::SASigIntrI8PrepareB.failureMode ==
             wasm::FailureMode::FailOnNegI32);
  JSContext* cx = instance->tlsData()->cx;

  // Size checks for matrices
  if (!CheckMatrixDimension(cx, rowsB, ROWS_B_MULTIPLIER) ||
      !CheckMatrixDimension(cx, colsB, COLUMNS_B_MULTIPLIER)) {
    wasm::Log(cx, "%s: rowsB:%" PRIu32 " colsB:%" PRIu32, __FUNCTION__, rowsB,
              colsB);
    ReportGemmError(cx, JSMSG_WASM_UNREACHABLE);
    return -1;
  }

  // Memory bound and alignment checks for matrices.
  // NOTE(review): sizeB is an element count; the input matrix holds float
  // elements (4 bytes each) while the output holds int8 — confirm whether
  // the input bound check should instead be in bytes (4 * sizeB).
  uint64_t sizeB = (uint64_t)rowsB * (uint64_t)colsB;
  size_t wasmBufferSize = GetWasmRawBufferLength(memBase);
  if (!CheckMatrixBoundAndAlignment(cx, inputMatrixB, sizeB, wasmBufferSize) ||
      !CheckMatrixBoundAndAlignment(cx, outputMatrixB, sizeB, wasmBufferSize)) {
    wasm::Log(cx,
              "%s: inputB:%x rowsB:%" PRIu32 " colsB:%" PRIu32
              " outputB:%x sizeB:%" PRIu64 " wasmBufferSize:%zu",
              __FUNCTION__, inputMatrixB, rowsB, colsB, outputMatrixB, sizeB,
              wasmBufferSize);
    ReportGemmError(cx, JSMSG_WASM_OUT_OF_BOUNDS);
    return -1;
  }

  // Actual call to the 3rd party library (intgemm) for PrepareB
  uint8_t* inputMatrixBPtr = &memBase[inputMatrixB];
  uint8_t* outputMatrixBPtr = &memBase[outputMatrixB];
  ::intgemm::Int8::PrepareB((const float*)inputMatrixBPtr,
                            (int8_t*)outputMatrixBPtr,
                            (float)scale,  // Quant Mult
                            rowsB, colsB);
  return 0;
}
/* Implementation of the `int8_prepare_b_from_transposed` intrinsic.
 *
 * Validates matrix dimensions and wasm-memory offsets, then calls the
 * intgemm library to quantize and reorder the transposed Input matrix B
 * into the CPU-dependent prepared-B format.
 *
 * Returns 0 on success; returns -1 with a pending exception on failure
 * (per FailureMode::FailOnNegI32).
 *
 * Note: `zeroPoint` is accepted for API symmetry but is not used below.
 */
int32_t js::intgemm::IntrI8PrepareBFromTransposed(
    wasm::Instance* instance, uint32_t inputMatrixBTransposed, float scale,
    float zeroPoint, uint32_t rowsB, uint32_t colsB, uint32_t outputMatrixB,
    uint8_t* memBase) {
  MOZ_ASSERT(wasm::SASigIntrI8PrepareBFromTransposed.failureMode ==
             wasm::FailureMode::FailOnNegI32);
  JSContext* cx = instance->tlsData()->cx;

  // Size checks for matrices
  if (!CheckMatrixDimension(cx, rowsB, ROWS_B_MULTIPLIER) ||
      !CheckMatrixDimension(cx, colsB, COLUMNS_B_MULTIPLIER)) {
    wasm::Log(cx, "%s: rowsB:%" PRIu32 " colsB:%" PRIu32, __FUNCTION__, rowsB,
              colsB);
    ReportGemmError(cx, JSMSG_WASM_UNREACHABLE);
    return -1;
  }

  // Memory bound checks for all matrices.
  // NOTE(review): sizeB is an element count; input elements are 4-byte
  // floats — confirm whether the input bound check should be in bytes.
  uint64_t sizeB = (uint64_t)rowsB * (uint64_t)colsB;
  size_t wasmBufferSize = GetWasmRawBufferLength(memBase);
  if (!CheckMatrixBoundAndAlignment(cx, inputMatrixBTransposed, sizeB,
                                    wasmBufferSize) ||
      !CheckMatrixBoundAndAlignment(cx, outputMatrixB, sizeB, wasmBufferSize)) {
    wasm::Log(cx,
              "%s: inputBT:%x rowsB:%" PRIu32 " colsB:%" PRIu32
              " outputB:%x sizeB:%" PRIu64 " wasmBufferSize:%zu",
              __FUNCTION__, inputMatrixBTransposed, rowsB, colsB, outputMatrixB,
              sizeB, wasmBufferSize);
    ReportGemmError(cx, JSMSG_WASM_OUT_OF_BOUNDS);
    return -1;
  }

  // Actual call to the 3rd party library (intgemm) for PrepareBTransposed
  uint8_t* inputMatrixBTransposedPtr = &memBase[inputMatrixBTransposed];
  uint8_t* outputMatrixBPtr = &memBase[outputMatrixB];
  ::intgemm::Int8::PrepareBTransposed((const float*)inputMatrixBTransposedPtr,
                                      (int8_t*)outputMatrixBPtr,
                                      (float)scale,  // Quant Mult
                                      rowsB, colsB);
  return 0;
}
/* Implementation of the `int8_prepare_b_from_quantized_transposed` intrinsic.
 *
 * Validates matrix dimensions and wasm-memory offsets, then calls the
 * intgemm library to reorder an already-quantized, transposed Input matrix B
 * into the CPU-dependent prepared-B format.
 *
 * Returns 0 on success; returns -1 with a pending exception on failure
 * (per FailureMode::FailOnNegI32).
 */
int32_t js::intgemm::IntrI8PrepareBFromQuantizedTransposed(
    wasm::Instance* instance, uint32_t inputMatrixBQuantizedTransposed,
    uint32_t rowsB, uint32_t colsB, uint32_t outputMatrixB, uint8_t* memBase) {
  MOZ_ASSERT(wasm::SASigIntrI8PrepareBFromQuantizedTransposed.failureMode ==
             wasm::FailureMode::FailOnNegI32);
  JSContext* cx = instance->tlsData()->cx;

  // Size checks for matrices
  if (!CheckMatrixDimension(cx, rowsB, ROWS_B_MULTIPLIER) ||
      !CheckMatrixDimension(cx, colsB, COLUMNS_B_MULTIPLIER)) {
    wasm::Log(cx, "%s: rowsB:%" PRIu32 " colsB:%" PRIu32, __FUNCTION__, rowsB,
              colsB);
    ReportGemmError(cx, JSMSG_WASM_UNREACHABLE);
    return -1;
  }

  // Memory bound checks for all matrices
  uint64_t sizeB = (uint64_t)rowsB * (uint64_t)colsB;
  size_t wasmBufferSize = GetWasmRawBufferLength(memBase);
  if (!CheckMatrixBoundAndAlignment(cx, inputMatrixBQuantizedTransposed, sizeB,
                                    wasmBufferSize) ||
      !CheckMatrixBoundAndAlignment(cx, outputMatrixB, sizeB, wasmBufferSize)) {
    // Fixed log label: the value logged here is sizeB, not sizeA (the
    // previous "sizeA" label was a copy-paste from IntrI8PrepareA).
    wasm::Log(cx,
              "%s: inputBQT:%x rowsB:%" PRIu32 " colsB:%" PRIu32
              " outputB:%x sizeB:%" PRIu64 " wasmBufferSize:%zu",
              __FUNCTION__, inputMatrixBQuantizedTransposed, rowsB, colsB,
              outputMatrixB, sizeB, wasmBufferSize);
    ReportGemmError(cx, JSMSG_WASM_OUT_OF_BOUNDS);
    return -1;
  }

  // Actual call to the 3rd party library (intgemm)
  uint8_t* inputMatrixBQuantizedTransposedPtr =
      &memBase[inputMatrixBQuantizedTransposed];
  uint8_t* outputMatrixBPtr = &memBase[outputMatrixB];
  ::intgemm::Int8::PrepareBQuantizedTransposed(
      (const int8_t*)inputMatrixBQuantizedTransposedPtr,
      (int8_t*)outputMatrixBPtr, rowsB, colsB);
  return 0;
}
/* Implementation of the `int8_prepare_a` intrinsic.
 *
 * Validates matrix dimensions and wasm-memory offsets, then calls the
 * intgemm library to quantize Input matrix A into the (possibly
 * architecture-dependent) prepared-A format. Uses Int8Shift, i.e. the
 * shifted/unsigned-A variant of the intgemm API.
 *
 * Returns 0 on success; returns -1 with a pending exception on failure
 * (per FailureMode::FailOnNegI32).
 *
 * Note: `zeroPoint` is accepted for API symmetry but is not used below.
 */
int32_t js::intgemm::IntrI8PrepareA(wasm::Instance* instance,
                                    uint32_t inputMatrixA, float scale,
                                    float zeroPoint, uint32_t rowsA,
                                    uint32_t colsA, uint32_t outputMatrixA,
                                    uint8_t* memBase) {
  MOZ_ASSERT(wasm::SASigIntrI8PrepareA.failureMode ==
             wasm::FailureMode::FailOnNegI32);
  JSContext* cx = instance->tlsData()->cx;

  // Size checks for matrices
  if (!CheckMatrixDimension(cx, rowsA, ROWS_A_MULTIPLIER) ||
      !CheckMatrixDimension(cx, colsA, COLUMNS_A_MULTIPLIER)) {
    wasm::Log(cx, "%s: rowsA:%" PRIu32 " colsA:%" PRIu32, __FUNCTION__, rowsA,
              colsA);
    ReportGemmError(cx, JSMSG_WASM_UNREACHABLE);
    return -1;
  }

  // Memory bound checks for all matrices.
  // NOTE(review): sizeA is an element count; input elements are 4-byte
  // floats — confirm whether the input bound check should be in bytes.
  uint64_t sizeA = (uint64_t)rowsA * (uint64_t)colsA;
  size_t wasmBufferSize = GetWasmRawBufferLength(memBase);
  if (!CheckMatrixBoundAndAlignment(cx, inputMatrixA, sizeA, wasmBufferSize) ||
      !CheckMatrixBoundAndAlignment(cx, outputMatrixA, sizeA, wasmBufferSize)) {
    wasm::Log(cx,
              "%s: inputA:%x rowsA:%" PRIu32 " colsA:%" PRIu32
              " outputA:%x sizeA:%" PRIu64 " wasmBufferSize:%zu",
              __FUNCTION__, inputMatrixA, rowsA, colsA, outputMatrixA, sizeA,
              wasmBufferSize);
    ReportGemmError(cx, JSMSG_WASM_OUT_OF_BOUNDS);
    return -1;
  }

  // Actual call to the 3rd party library (intgemm)
  uint8_t* inputMatrixAPtr = &memBase[inputMatrixA];
  uint8_t* outputMatrixAPtr = &memBase[outputMatrixA];
  ::intgemm::Int8Shift::PrepareA((const float*)inputMatrixAPtr,
                                 (int8_t*)outputMatrixAPtr, scale, rowsA,
                                 colsA);
  return 0;
}
/* Implementation of the `int8_prepare_bias` intrinsic.
 *
 * Validates dimensions and wasm-memory offsets, then computes the prepared
 * bias from prepared B and the input bias. The unquantization factor
 * compensates for the +127 shift applied to prepared A by the Int8Shift
 * variant (see the intgemm library's shifted-multiply scheme).
 *
 * Returns 0 on success; returns -1 with a pending exception on failure
 * (per FailureMode::FailOnNegI32).
 *
 * Note: `zeroPointA`/`zeroPointB` are accepted for API symmetry but are not
 * used below.
 */
int32_t js::intgemm::IntrI8PrepareBias(
    wasm::Instance* instance, uint32_t inputMatrixBPrepared, float scaleA,
    float zeroPointA, float scaleB, float zeroPointB, uint32_t rowsB,
    uint32_t colsB, uint32_t inputBias, uint32_t output, uint8_t* memBase) {
  MOZ_ASSERT(wasm::SASigIntrI8PrepareBias.failureMode ==
             wasm::FailureMode::FailOnNegI32);
  JSContext* cx = instance->tlsData()->cx;

  // Size checks for matrices
  if (!CheckMatrixDimension(cx, rowsB, ROWS_B_MULTIPLIER) ||
      !CheckMatrixDimension(cx, colsB, COLUMNS_B_MULTIPLIER)) {
    wasm::Log(cx, "%s: rowsB:%" PRIu32 " colsB:%" PRIu32, __FUNCTION__, rowsB,
              colsB);
    ReportGemmError(cx, JSMSG_WASM_UNREACHABLE);
    return -1;
  }

  // Memory bound checks for all matrices. The bias and output arrays each
  // hold `colsB` elements; only prepared B requires 64-byte alignment.
  uint64_t sizeB = (uint64_t)rowsB * (uint64_t)colsB;
  uint64_t sizeBias = colsB;
  size_t wasmBufferSize = GetWasmRawBufferLength(memBase);
  if (!CheckMatrixBoundAndAlignment(cx, inputMatrixBPrepared, sizeB,
                                    wasmBufferSize) ||
      !CheckMatrixBound(cx, inputBias, sizeBias, wasmBufferSize) ||
      !CheckMatrixBound(cx, output, sizeBias, wasmBufferSize)) {
    wasm::Log(cx,
              "%s: preparedB:%x rowsB:%" PRIu32 " colsB:%" PRIu32
              " inputBias:%x outputBias:%x sizeB:%" PRIu64
              " wasmBufferSize:%zu",
              __FUNCTION__, inputMatrixBPrepared, rowsB, colsB, inputBias,
              output, sizeB, wasmBufferSize);
    ReportGemmError(cx, JSMSG_WASM_OUT_OF_BOUNDS);
    return -1;
  }

  // Actual call to the 3rd party library (intgemm).
  // unquantFactor = -(127/scaleA * 127/scaleB) / 127: undoes both
  // quantization scales and the a+127 shift in one multiply-add callback.
  uint8_t* inputMatrixBPreparedPtr = &memBase[inputMatrixBPrepared];
  uint8_t* inputBiasPtr = &memBase[inputBias];
  uint8_t* outputPtr = &memBase[output];
  float unquantFactor =
      (-1) * ((127.0f / scaleA) * (127.0f / scaleB)) / (127.0f);
  ::intgemm::Int8Shift::PrepareBias(
      (const int8_t*)inputMatrixBPreparedPtr, rowsB, colsB,
      ::intgemm::callbacks::UnquantizeAndAddBiasAndWrite(
          unquantFactor, (const float*)inputBiasPtr, (float*)outputPtr));
  return 0;
}
/* Implementation of the `int8_multiply_and_add_bias` intrinsic.
 *
 * Computes output = preparedA * preparedB + preparedBias, writing float
 * results into wasm memory. All three inputs must have been produced by the
 * corresponding int8_prepare_* intrinsics.
 *
 * Returns 0 on success; returns -1 with a pending exception on failure
 * (per FailureMode::FailOnNegI32).
 *
 * Note: `zeroPointA`/`zeroPointB` are accepted for API symmetry but are not
 * used below; `unquantMultiplier` is folded into the unquantization factor.
 */
int32_t js::intgemm::IntrI8MultiplyAndAddBias(
    wasm::Instance* instance, uint32_t inputMatrixAPrepared, float scaleA,
    float zeroPointA, uint32_t inputMatrixBPrepared, float scaleB,
    float zeroPointB, uint32_t inputBiasPrepared, float unquantMultiplier,
    uint32_t rowsA, uint32_t width, uint32_t colsB, uint32_t output,
    uint8_t* memBase) {
  MOZ_ASSERT(wasm::SASigIntrI8MultiplyAndAddBias.failureMode ==
             wasm::FailureMode::FailOnNegI32);
  JSContext* cx = instance->tlsData()->cx;

  // Size checks for matrices. `width` is the shared inner dimension
  // (colsA == rowsB) and must be a multiple of 64.
  if (!CheckMatrixDimension(cx, rowsA, ROWS_A_MULTIPLIER) ||
      !CheckMatrixDimension(cx, width, COLUMNS_A_MULTIPLIER) ||
      !CheckMatrixDimension(cx, colsB, COLUMNS_B_MULTIPLIER)) {
    wasm::Log(cx, "%s: rowsA:%" PRIu32 " width:%" PRIu32 " colsB:%" PRIu32,
              __FUNCTION__, rowsA, width, colsB);
    ReportGemmError(cx, JSMSG_WASM_UNREACHABLE);
    return -1;
  }

  // Memory bound checks for all matrices. Prepared A/B need 64-byte
  // alignment; the bias and output arrays only need to be in bounds.
  // NOTE(review): sizeBias/sizeOutput are element counts of 4-byte floats —
  // confirm whether these bound checks should be in bytes.
  uint64_t sizeA = (uint64_t)rowsA * (uint64_t)width;
  uint64_t sizeB = (uint64_t)width * (uint64_t)colsB;
  uint64_t sizeBias = (uint64_t)colsB;
  uint64_t sizeOutput = (uint64_t)rowsA * (uint64_t)colsB;
  size_t wasmBufferSize = GetWasmRawBufferLength(memBase);
  if (!CheckMatrixBoundAndAlignment(cx, inputMatrixAPrepared, sizeA,
                                    wasmBufferSize) ||
      !CheckMatrixBoundAndAlignment(cx, inputMatrixBPrepared, sizeB,
                                    wasmBufferSize) ||
      !CheckMatrixBound(cx, inputBiasPrepared, sizeBias, wasmBufferSize) ||
      !CheckMatrixBound(cx, output, sizeOutput, wasmBufferSize)) {
    wasm::Log(cx,
              "%s: preparedA:%x preparedB:%x preparedBias:%x rowsA:%" PRIu32
              " width:%" PRIu32 " colsB:%" PRIu32
              " output:%x sizeA:%" PRIu64 " sizeB:%" PRIu64
              " sizeBias:%" PRIu64 " sizeOutput:%" PRIu64,
              __FUNCTION__, inputMatrixAPrepared, inputMatrixBPrepared,
              inputBiasPrepared, rowsA, width, colsB, output, sizeA, sizeB,
              sizeBias, sizeOutput);
    ReportGemmError(cx, JSMSG_WASM_OUT_OF_BOUNDS);
    return -1;
  }

  // Actual call to the 3rd party library (intgemm).
  // unquantFactor undoes both quantization scales, scaled by the caller's
  // multiplier; the callback unquantizes, adds bias, and writes the result.
  uint8_t* inputMatrixAPreparedPtr = &memBase[inputMatrixAPrepared];
  uint8_t* inputMatrixBPreparedPtr = &memBase[inputMatrixBPrepared];
  uint8_t* inputBiasPreparedPtr = &memBase[inputBiasPrepared];
  uint8_t* outputPtr = &memBase[output];
  float unquantFactor = unquantMultiplier / (scaleA * scaleB);
  ::intgemm::Int8Shift::Multiply(
      (const int8_t*)inputMatrixAPreparedPtr,
      (const int8_t*)inputMatrixBPreparedPtr, rowsA, width, colsB,
      ::intgemm::callbacks::UnquantizeAndAddBiasAndWrite(
          unquantFactor, (const float*)inputBiasPreparedPtr,
          (float*)outputPtr));
  return 0;
}
/* Implementation of the `int8_select_columns_of_b` intrinsic.
 *
 * Copies the columns of prepared B named by `colIndexList` (an array of
 * `sizeColIndexList` uint32 indices in wasm memory) into `output`.
 *
 * Returns 0 on success; returns -1 with a pending exception on failure
 * (per FailureMode::FailOnNegI32).
 */
int32_t js::intgemm::IntrI8SelectColumnsOfB(wasm::Instance* instance,
                                            uint32_t inputMatrixBPrepared,
                                            uint32_t rowsB, uint32_t colsB,
                                            uint32_t colIndexList,
                                            uint32_t sizeColIndexList,
                                            uint32_t output,
                                            uint8_t* memBase) {
  MOZ_ASSERT(wasm::SASigIntrI8SelectColumnsOfB.failureMode ==
             wasm::FailureMode::FailOnNegI32);
  JSContext* cx = instance->tlsData()->cx;

  // Size checks for matrices
  if (!CheckMatrixDimension(cx, rowsB, ROWS_B_MULTIPLIER) ||
      !CheckMatrixDimension(cx, colsB, COLUMNS_B_MULTIPLIER) ||
      !CheckMatrixDimension(cx, sizeColIndexList,
                            SELECTED_COLUMNS_B_MULTIPLIER)) {
    wasm::Log(cx,
              "%s: rowsB:%" PRIu32 " colsB:%" PRIu32
              " sizeColIndexList:%" PRIu32,
              __FUNCTION__, rowsB, colsB, sizeColIndexList);
    ReportGemmError(cx, JSMSG_WASM_UNREACHABLE);
    return -1;
  }

  // Memory bound checks for all matrices.
  // NOTE(review): the colIndexList bound check uses the element count of a
  // uint32 array (4 bytes per element) — confirm whether it should be in
  // bytes. Individual index values are not range-checked here; presumably
  // the intgemm library requires 0 <= index < colsB — verify.
  uint64_t sizeB = (uint64_t)rowsB * (uint64_t)colsB;
  uint64_t sizeOutput = (uint64_t)rowsB * (uint64_t)sizeColIndexList;
  size_t wasmBufferSize = GetWasmRawBufferLength(memBase);
  if (!CheckMatrixBoundAndAlignment(cx, inputMatrixBPrepared, sizeB,
                                    wasmBufferSize) ||
      !CheckMatrixBound(cx, colIndexList, sizeColIndexList, wasmBufferSize) ||
      !CheckMatrixBound(cx, output, sizeOutput, wasmBufferSize)) {
    wasm::Log(cx,
              "%s: preparedB:%x rowsB:%" PRIu32 " colsB:%" PRIu32
              " colList:%x sizeColList:%" PRIu32 " output:%x sizeB:%" PRIu64
              " sizeOutput:%" PRIu64,
              __FUNCTION__, inputMatrixBPrepared, rowsB, colsB, colIndexList,
              sizeColIndexList, output, sizeB, sizeOutput);
    ReportGemmError(cx, JSMSG_WASM_OUT_OF_BOUNDS);
    return -1;
  }

  // Actual call to the 3rd party library (intgemm); the index list is passed
  // as a [begin, end) iterator pair.
  uint8_t* inputMatrixBPreparedPtr = &memBase[inputMatrixBPrepared];
  uint8_t* colIndexListPtr = &memBase[colIndexList];
  uint8_t* outputPtr = &memBase[output];
  ::intgemm::Int8::SelectColumnsB(
      (const int8_t*)inputMatrixBPreparedPtr, (int8_t*)outputPtr, rowsB,
      (const uint32_t*)colIndexListPtr,
      (const uint32_t*)colIndexListPtr + sizeColIndexList);
  return 0;
}

View File

@ -0,0 +1,358 @@
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
* vim: set ts=8 sts=2 et sw=2 tw=80:
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at https://mozilla.org/MPL/2.0/.
*/
#ifndef intgemm_IntegerGemmIntrinsic_h
#define intgemm_IntegerGemmIntrinsic_h
#include <stdint.h>
namespace js {
namespace wasm {
class Instance;
}
namespace intgemm {
/* Interface for integer matrix multiplication followed by addition of bias.
*
* C = A * B + Bias
*
* Input matrix A:
* - A 2-D matrix that typically represents activations as floating point
* values
* - no. of rows should be a positive integer
 *    - no. of columns should be a positive integral multiple of 64
* - is represented as array (contiguous memory locations) in row-major format
*
* Input matrix B:
* - A 2-D matrix that typically represents fixed model parameters as
* floating point values
* - no. of rows should be:
* -- equal to no. of columns of Input matrix A
 *       -- a positive integral multiple of 64
 *    - no. of columns should be a positive integral multiple of 8
* - is represented as array (contiguous memory locations) in row-major format
*
* Please note that it is also possible to pass Input matrix B in 2 more forms:
* - One that is already a quantized and transposed version of Input matrix B
* - Other that is already a transposed version of Input matrix B
*
* Input Bias:
* - is an array (contiguous memory locations) that represents bias
* - size of the array should be equal to the no. of columns of Input matrix B
*
* Output matrix C:
* - is a 2-D matrix that represents the result (= A * B + Bias)
* - no. of rows = no. of rows of Input matrix A
* - no. of columns = no. of columns of Input matrix B (in
* untransposed form)
* - is represented as array (contiguous memory locations) in row-major format
*
* Please note that most of the functions in this interface might have
* architecture specific implementations.
*
* Conventions followed for the interface:
* - Unless explicitly mentioned, Input matrix B refers to an unquantized
* (i.e. float values) and non-transposed version
* - no. of rows of Input matrix A = `rowsA`
* - no. of columns of Input matrix A (`colsA`) = no. of rows of Input matrix B
* (`rowsB`) = `width`
* - no. of columns of Input matrix B = `colsB`
*/
/* Prepare B for the Matrix Multiply function from Input matrix B.
*
* Quantization is performed on the input.
* The final prepared B is in CPU-dependent format and can be used as an input
* to matrix multiply function (`int8_multiply_and_add_bias`).
*
* Please note that this interface might have architecture specific
* implementation.
*
* @param[in] inputMatrixB An array representing the Input matrix B in
* row-major format.
* Size of the array = `rowsB` * `colsB`.
* Shape of the matrix: (`rowsB`, `colsB`)
* @param[in] scale The scaling factor (for quantization)
* @param[in] zeroPoint The zero point (for quantization)
* @param[in] rowsB No. of rows of Input matrix B. It should be
* a positive integer and a multiple of 64.
* @param[in] colsB No. of columns of Input matrix B. It should
* be a positive integer and a multiple of 8.
* @param[out] outputMatrixB An array representing the prepared B matrix.
* Size of the array = `rowsB` * `colsB`.
*
* This function implements the intrinsic:
* int8_prepare_b(inputMatrixB: i32, scale: f32, zeroPoint: f32, rowsB: i32,
* colsB: i32, outputMatrixB: i32) which implements the function:
* int8_prepare_b(const float* inputMatrixB, float scale, float zeroPoint,
* uint32_t rowsB, uint32_t colsB, int8_t* outputMatrixB)
*/
int32_t IntrI8PrepareB(wasm::Instance* instance, uint32_t inputMatrixB,
float scale, float zeroPoint, uint32_t rowsB,
uint32_t colsB, uint32_t outputMatrixB,
uint8_t* memBase);
/* Prepare B for the Matrix Multiply function from transposed version of Input
* matrix B.
*
* Quantization is performed on floating values of input.
* The final prepared B is in CPU-dependent format and can be used as an input
* to matrix multiply function (`int8_multiply_and_add_bias`).
*
* Please note that this interface might have architecture specific
* implementation.
*
* @param[in] inputMatrixBTransposed An array representing transposed version
* of Input matrix B.
* It is in column-major format.
* Size of the array = `rowsB` * `colsB`.
* Shape of the matrix: (`colsB`, `rowsB`)
* @param[in] scale The scaling factor (for quantization)
* @param[in] zeroPoint The zero point (for quantization)
* @param[in] rowsB No. of rows of Input matrix B. It should
* be a positive integer and a multiple of
* 64.
* @param[in] colsB No. of columns of Input matrix B. It
* should be a positive integer and a
* multiple of 8.
* @param[out] outputMatrixB An array representing the prepared B
* matrix. Size of array = `rowsB`*`colsB`
*
* This function implements the intrinsic:
* int8_prepare_b_from_transposed(inputMatrixBTransposed: i32, scale: f32,
* zeroPoint: f32, rowsB: i32, colsB: i32, outputMatrixB: i32) which implements
* the function: int8_prepare_b_from_transposed(const float*
* inputMatrixBTransposed, float scale, float zeroPoint, uint32_t rowsB,
* uint32_t colsB, int8_t* outputMatrixB)
*/
int32_t IntrI8PrepareBFromTransposed(wasm::Instance* instance,
uint32_t inputMatrixBTransposed,
float scale, float zeroPoint,
uint32_t rowsB, uint32_t colsB,
uint32_t outputMatrixB, uint8_t* memBase);
/* Prepare B for the Matrix Multiply function from a quantized and transposed
* version of Input matrix B which is also in a CPU-independent format.
*
* The final prepared B is in CPU-dependent format and can be used as an input
* to matrix multiply function (`int8_multiply_and_add_bias`).
*
* This function is useful while using the quantized models that are stored in a
* CPU-independent format on the disk.
*
* @param[in] inputMatrixBQuantizedTransposed An array representing the
* quantized and transposed
* version of Input matrix B.
* It is in column-major format.
* Size of array =
* `rowsB`*`colsB`
* Shape of the matrix:
* (`colsB`,`rowsB`)
* @param[in] rowsB No. of rows of Input matrix B.
* Should be a positive integer
* and a multiple of 64.
* @param[in] colsB No. of columns of Input matrix
* B. Should be a positive
* integer and a multiple of 8
* @param[out] outputMatrixB An array representing the
* prepared B matrix.
* Size: `rowsB` * `colsB`.
*
* This function implements the intrinsic:
* int8_prepare_b_from_quantized_transposed(inputMatrixBQuantizedTransposed:
* i32, rowsB: i32, colsB: i32, outputMatrixB: i32) which implements the
* function: int8_prepare_b_from_quantized_transposed(const int8_t*
* inputMatrixBQuantizedTransposed, uint32_t rowsB, uint32_t colsB, int8_t*
* outputMatrixB)
*/
int32_t IntrI8PrepareBFromQuantizedTransposed(
wasm::Instance* instance, uint32_t inputMatrixBQuantizedTransposed,
uint32_t rowsB, uint32_t colsB, uint32_t outputMatrixB, uint8_t* memBase);
/* Prepare A for the Matrix Multiply function from Input matrix A.
*
* It performs quantization on floating values of input.
* The final prepared A might be architecture dependent. e.g. On some
* architectures like x86, it might be unsigned (achieved by adding 127 to
* quantized values) while on others like Arm, it might be signed. The final
* prepared A can be used as an input to matrix multiply function
* (`int8_multiply_and_add_bias`).
*
* Please note that this interface might have architecture specific
* implementation.
*
* @param[in] inputMatrixA An array representing the Input matrix A in
* row-major format.
* Size of the array = `rowsA` * `colsA`.
* Shape of the matrix: (`rowsA`, `colsA`)
* @param[in] scale The scaling factor (for quantization)
* @param[in] zeroPoint The zero point (for quantization)
* @param[in] rowsA No. of rows of Input matrix A. It should be a
* positive integer.
* @param[in] colsA No. of columns of Input matrix A. It should be a
* positive integer and a multiple of 64.
* @param[out] outputMatrixA An array representing the prepared A matrix.
* Size of the array = `rowsA` * `colsA`.
*
* This function implements the intrinsic:
* int8_prepare_a(inputMatrixA: i32, scale: f32, zeroPoint: f32, rowsA: i32,
* colsA: i32, outputMatrixA: i32) which implements the function:
* int8_prepare_a(const float* inputMatrixA, float scale, float zeroPoint,
* uint32_t rowsA, uint32_t colsA, int8_t* outputMatrixA)
*/
int32_t IntrI8PrepareA(wasm::Instance* instance, uint32_t inputMatrixA,
float scale, float zeroPoint, uint32_t rowsA,
uint32_t colsA, uint32_t outputMatrixA,
uint8_t* memBase);
/* Prepares bias for the Matrix Multiply function.
*
* It uses the prepared B (which must be obtained by using any of the
* int8_prepare_b* functions) and a bias input to prepare the final bias.
*
* The final bias can be used as an input to matrix multiply function
* (`int8_multiply_and_add_bias`).
*
* @param[in] inputMatrixBPrepared An array representing the prepared B
* matrix. Size of array = `rowsB`*`colsB`.
* @param[in] scaleA The scaling factor (for quantization) of A
* @param[in] zeroPointA The zero point (for quantization) of A
* @param[in] scaleB The scaling factor (for quantization) of B
* @param[in] zeroPointB The zero point (for quantization) of B
* @param[in] rowsB No. of rows of Input matrix B (unquantized
* & non-transposed). It should be a positive
* integer and a multiple of 64.
* @param[in] colsB No. of columns of Input matrix B
* (unquantized & non-transposed). It should
* be a positive integer and a multiple of 8.
* @param[in] inputBias An array representing the input bias. Size
* of array = `colsB`
* @param[out] output An array representing the final prepared
* bias. Size of the array = `colsB`
*
* This function implements the intrinsic:
* int8_prepare_bias(inputMatrixBPrepared: i32, scaleA: f32, zeroPointA: f32,
* scaleB: f32, zeroPointB: f32, rowsB: i32, colsB: i32, inputBias: i32, output:
* i32) which implements the function: int8_prepare_bias(const int8_t*
* inputMatrixBPrepared, float scaleA, float zeroPointA, float scaleB, float
* zeroPointB, uint32_t rowsB, uint32_t colsB, const float* inputBias, float*
* output)
*/
int32_t IntrI8PrepareBias(wasm::Instance* instance,
uint32_t inputMatrixBPrepared, float scaleA,
float zeroPointA, float scaleB, float zeroPointB,
uint32_t rowsB, uint32_t colsB, uint32_t inputBias,
uint32_t output, uint8_t* memBase);
/* Perform multiplication of 2 matrices followed by adding a bias.
*
* i.e Output = inputMatrixAPrepared * inputMatrixBPrepared + inputBiasPrepared
*
* The inputs inputMatrixAPrepared, inputMatrixBPrepared and inputBiasPrepared
* of this function must be obtained by using `int8_prepare_A`, one of the
* `int8_prepare_b*` and `int8_prepare_bias` functions respectively.
*
* Please note that this interface might have architecture specific
* implementation.
*
* @param[in] inputMatrixAPrepared An array representing the prepared A
* matrix. This must be obtained by using
* `int8_prepare_A` function. Size of the
* array = `rowsA` * `width`.
* @param[in] scaleA The scaling factor (quantization) of A
* @param[in] zeroPointA The zero point (for quantization) of A
* @param[in] inputMatrixBPrepared An array representing the prepared B
* matrix. This must be obtained by using
* one of `int8_prepare_b*` functions.
* Size of the array = `width` * `colsB`.
* @param[in] scaleB The scaling factor (quantization) of B
* @param[in] zeroPointB The zero point (for quantization) of B
* @param[in] inputBiasPrepared An array representing the prepared bias.
* This must be obtained by using
* `int8_prepare_bias` function.
* Size of the array = `colsB`
* @param[in] unquantMultiplier A value that will be multiplied to the
* final unquantization factor that is
* prepared from `scaleA` and `scaleB`.
* @param[in] rowsA No. of rows of Input matrix A. It should
* be a positive integer.
 * @param[in]   width                  No. of columns of Input matrix A (same
 *                                     as no. of rows of Input matrix B). It
 *                                     should be a positive integer and a
 *                                     multiple of 64.
* @param[in] colsB No. of columns of Input matrix B. Should
* be a multiple of 8.
* @param[out] output An array representing the result matrix
* in row-major format.
* Size of the array = `rowsA` * `colsB`.
*
* This function implements the intrinsic:
* int8_multiply_and_add_bias(inputMatrixAPrepared: i32, scaleA: f32,
* zeroPointA: f32, inputMatrixBPrepared: i32, scaleB: f32, zeroPointB: f32,
* inputBiasPrepared: i32, unquantMultiplier: f32,
* rowsA: i32, width: i32, colsB: i32, output: i32)
* which implements the function:
* int8_multiply_and_add_bias(const int8_t* inputMatrixAPrepared, float
* scaleA, float zeroPointA, const int8_t* inputMatrixBPrepared, float scaleB,
* float zeroPointB, const float* inputBiasPrepared, float unquantMultiplier,
* uint32_t rowsA, uint32_t width, uint32_t colsB, float*
* output)
*/
int32_t IntrI8MultiplyAndAddBias(wasm::Instance* instance,
uint32_t inputMatrixAPrepared, float scaleA,
float zeroPointA,
uint32_t inputMatrixBPrepared, float scaleB,
float zeroPointB, uint32_t inputBiasPrepared,
float unquantMultiplier, uint32_t rowsA,
uint32_t width, uint32_t colsB,
uint32_t output, uint8_t* memBase);
/* Select a subset of columns of prepared B.
*
* Indices of the columns to be selected are specified by an array.
*
* @param[in] inputMatrixBPrepared An array representing the prepared B
* matrix. This must be obtained by using
* one of the `int8_prepare_b*` functions.
* Size of the array = `rowsB` * `colsB`.
* @param[in] rowsB No. of rows of Input matrix B. It should
* be a positive integer and a multiple
* of 64.
* @param[in] colsB No. of columns of Input matrix B. It
* should be a positive integer and a
* multiple of 8.
* @param[in] colIndexList An array of column indices to be selected
* from prepared B. All indices of the array
* should be valid
* i.e. 0 <= colIndexList[N] < colsB
* where N = 0, 1 ....(`sizeColIndexList`-1)
* @param[in] sizeColIndexList Size of the `colIndexList` array. It
* should be a positive integer and a
* multiple of 8.
* @param[out] output An array representing the selected columns
* of prepared B.
* Size = `rowsB` * `sizeColIndexList`.
*
* This function implements the intrinsic:
* int8_select_columns_of_b(inputMatrixBPrepared: i32, rowsB: i32, colsB: i32,
* colIndexList: i32, sizeColIndexList: i32, output: i32) which implements the
* function: int8_select_columns_of_b(const int8_t* inputMatrixBPrepared,
* uint32_t rowsB, uint32_t colsB, const uint32_t* colIndexList, const uint32_t
* sizeColIndexList, int8_t* output)
*/
int32_t IntrI8SelectColumnsOfB(wasm::Instance* instance,
uint32_t inputMatrixBPrepared, uint32_t rowsB,
uint32_t colsB, uint32_t colIndexList,
uint32_t sizeColIndexList, uint32_t output,
uint8_t* memBase);
} // namespace intgemm
} // namespace js
#endif // intgemm_IntegerGemmIntrinsic_h

View File

@ -4,17 +4,23 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
# Shared SpiderMonkey build configuration and compiler flags.
include("../js-config.mozbuild")
include("../js-cxxflags.mozbuild")

# This directory is linked into the JS engine library.
FINAL_LIBRARY = "js"

with Files("*"):
    BUG_COMPONENT = ("Core", "JavaScript: WebAssembly")

LOCAL_INCLUDES += [
    # "!" prefix is objdir-relative in moz.build: picks up generated headers.
    "!..",
    # js/src itself.
    "..",
    # Headers of the vendored third-party intgemm library.
    "/third_party/intgemm/intgemm",
]

SOURCES += [
    # Third-party integer matrix multiply implementation.
    "/third_party/intgemm/intgemm/intgemm.cc",
    # Glue between the wasm intrinsics and the intgemm library.
    "IntegerGemmIntrinsic.cpp",
]
GeneratedFile(

View File

@ -19,6 +19,7 @@
#ifndef wasm_builtins_h
#define wasm_builtins_h
#include "intgemm/IntegerGemmIntrinsic.h"
#include "jit/IonTypes.h"
#include "wasm/WasmIntrinsicGenerated.h"

View File

@ -15,3 +15,187 @@
- I32
- I32
- I32
#if defined(ENABLE_WASM_MOZ_INTGEMM)
# Intrinsics for integer matrix multiplication followed by addition of bias.
# Please refer to @TOPSRCDIR/js/src/intgemm/IntegerGemmIntrinsic.h for more details on these intrinsics.
# Prepare B for the Matrix Multiply intrinsic from Input matrix B.
#
# Quantization is performed on the input.
# The final prepared B is in CPU-dependent format and can be used as an input to matrix multiply
# intrinsic (`int8_multiply_and_add_bias`).
#
# C++ signature:
#   int8_prepare_b(const float* inputMatrixB, float scale, float zeroPoint, uint32_t rowsB, uint32_t colsB, int8_t* outputMatrixB)
# Wasm signature:
#   int8_prepare_b(inputMatrixB: i32, scale: f32, zeroPoint: f32, rowsB: i32, colsB: i32, outputMatrixB: i32)
- op: I8PrepareB
  symbolic_address:
    name: IntrI8PrepareB
    type: Args_Int32_GeneralInt32Float32Float32Int32Int32Int32General
  entry: intgemm::IntrI8PrepareB
  export: int8_prepare_b
  params:
    - I32  # inputMatrixB
    - F32  # scale
    - F32  # zeroPoint
    - I32  # rowsB
    - I32  # colsB
    - I32  # outputMatrixB
# Prepare B for the Matrix Multiply intrinsic from transposed version of Input matrix B.
#
# Quantization is performed on floating values of input.
# The final prepared B is in CPU-dependent format and can be used as an input to matrix multiply
# intrinsic (`int8_multiply_and_add_bias`).
#
# C++ signature:
#   int8_prepare_b_from_transposed(const float* inputMatrixBTransposed, float scale, float zeroPoint, uint32_t rowsB, uint32_t colsB, int8_t* outputMatrixB)
# Wasm signature:
#   int8_prepare_b_from_transposed(inputMatrixBTransposed: i32, scale: f32, zeroPoint: f32, rowsB: i32, colsB: i32, outputMatrixB: i32)
- op: I8PrepareBFromTransposed
  symbolic_address:
    name: IntrI8PrepareBFromTransposed
    type: Args_Int32_GeneralInt32Float32Float32Int32Int32Int32General
  entry: intgemm::IntrI8PrepareBFromTransposed
  export: int8_prepare_b_from_transposed
  params:
    - I32  # inputMatrixBTransposed
    - F32  # scale
    - F32  # zeroPoint
    - I32  # rowsB
    - I32  # colsB
    - I32  # outputMatrixB
# Prepare B for the Matrix Multiply intrinsic from a quantized and transposed version of Input
# matrix B which is also in a CPU-independent format.
#
# The final prepared B is in CPU-dependent format and can be used as an input to matrix multiply
# intrinsic (`int8_multiply_and_add_bias`).
#
# C++ signature:
#   int8_prepare_b_from_quantized_transposed(const int8_t* inputMatrixBQuantizedTransposed, uint32_t rowsB, uint32_t colsB, int8_t* outputMatrixB)
# Wasm signature:
#   int8_prepare_b_from_quantized_transposed(inputMatrixBQuantizedTransposed: i32, rowsB: i32, colsB: i32, outputMatrixB: i32)
- op: I8PrepareBFromQuantizedTransposed
  symbolic_address:
    name: IntrI8PrepareBFromQuantizedTransposed
    type: Args_Int32_GeneralInt32Int32Int32Int32General
  entry: intgemm::IntrI8PrepareBFromQuantizedTransposed
  export: int8_prepare_b_from_quantized_transposed
  params:
    - I32  # inputMatrixBQuantizedTransposed
    - I32  # rowsB
    - I32  # colsB
    - I32  # outputMatrixB
# Prepare A for the Matrix Multiply intrinsic from Input matrix A.
#
# It performs quantization on floating values of input.
# The final prepared A might be architecture dependent. e.g. On some architectures like x86, it
# might be unsigned (achieved by adding 127 to quantized values) while on others like Arm, it might
# be signed.
# The final prepared A can be used as an input to matrix multiply intrinsic
# (`int8_multiply_and_add_bias`).
#
# C++ signature:
#   int8_prepare_a(const float* inputMatrixA, float scale, float zeroPoint, uint32_t rowsA, uint32_t colsA, int8_t* outputMatrixA)
# Wasm signature:
#   int8_prepare_a(inputMatrixA: i32, scale: f32, zeroPoint: f32, rowsA: i32, colsA: i32, outputMatrixA: i32)
- op: I8PrepareA
  symbolic_address:
    name: IntrI8PrepareA
    type: Args_Int32_GeneralInt32Float32Float32Int32Int32Int32General
  entry: intgemm::IntrI8PrepareA
  export: int8_prepare_a
  params:
    - I32  # inputMatrixA
    - F32  # scale
    - F32  # zeroPoint
    - I32  # rowsA
    - I32  # colsA
    - I32  # outputMatrixA
# Prepares bias for the Matrix Multiply intrinsic.
#
# It uses the prepared B (which must be obtained by using any of the `int8_prepare_b*` intrinsics) and
# a bias input to prepare the final bias.
#
# The final bias can be used as an input to matrix multiply intrinsic (`int8_multiply_and_add_bias`).
#
# C++ signature:
#   int8_prepare_bias(const int8_t* inputMatrixBPrepared, float scaleA, float zeroPointA, float scaleB, float zeroPointB, uint32_t rowsB, uint32_t colsB, const float* inputBias, float* output)
# Wasm signature:
#   int8_prepare_bias(inputMatrixBPrepared: i32, scaleA: f32, zeroPointA: f32, scaleB: f32, zeroPointB: f32, rowsB: i32, colsB: i32, inputBias: i32, output: i32)
- op: I8PrepareBias
  symbolic_address:
    name: IntrI8PrepareBias
    type: Args_Int32_GeneralInt32Float32Float32Float32Float32Int32Int32Int32Int32General
  entry: intgemm::IntrI8PrepareBias
  export: int8_prepare_bias
  params:
    - I32  # inputMatrixBPrepared
    - F32  # scaleA
    - F32  # zeroPointA
    - F32  # scaleB
    - F32  # zeroPointB
    - I32  # rowsB
    - I32  # colsB
    - I32  # inputBias
    - I32  # output
# Perform multiplication of 2 matrices followed by adding a bias.
#
# i.e Output = inputMatrixAPrepared * inputMatrixBPrepared + inputBiasPrepared
#
# The inputs of this intrinsic must be obtained by using `int8_prepare_a`,
# one of the `int8_prepare_b*` and `int8_prepare_bias` intrinsics respectively.
#
# C++ signature:
#   int8_multiply_and_add_bias(const int8_t* inputMatrixAPrepared, float scaleA, float zeroPointA,
#                              const int8_t* inputMatrixBPrepared, float scaleB, float zeroPointB,
#                              const float* inputBiasPrepared, float unquantMultiplier,
#                              uint32_t rowsA, uint32_t width, uint32_t colsB, float* output)
# Wasm signature:
#   int8_multiply_and_add_bias(inputMatrixAPrepared: i32, scaleA: f32, zeroPointA: f32,
#                              inputMatrixBPrepared: i32, scaleB: f32, zeroPointB: f32,
#                              inputBiasPrepared: i32, unquantMultiplier: f32,
#                              rowsA: i32, width: i32, colsB: i32, output: i32)
- op: I8MultiplyAndAddBias
  symbolic_address:
    name: IntrI8MultiplyAndAddBias
    type: Args_Int32_GeneralInt32Float32Float32Int32Float32Float32Int32Float32Int32Int32Int32Int32General
  entry: intgemm::IntrI8MultiplyAndAddBias
  export: int8_multiply_and_add_bias
  params:
    - I32  # inputMatrixAPrepared
    - F32  # scaleA
    - F32  # zeroPointA
    - I32  # inputMatrixBPrepared
    - F32  # scaleB
    - F32  # zeroPointB
    - I32  # inputBiasPrepared
    - F32  # unquantMultiplier
    - I32  # rowsA
    - I32  # width
    - I32  # colsB
    - I32  # output
# Select a subset of columns of prepared B.
#
# Indices of the columns to be selected are specified by an array.
#
# C++ signature:
#   int8_select_columns_of_b(const int8_t* inputMatrixBPrepared, uint32_t rowsB, uint32_t colsB, const uint32_t* colIndexList, const uint32_t sizeColIndexList, int8_t* output)
# Wasm signature:
#   int8_select_columns_of_b(inputMatrixBPrepared: i32, rowsB: i32, colsB: i32, colIndexList: i32, sizeColIndexList: i32, output: i32)
- op: I8SelectColumnsOfB
  symbolic_address:
    name: IntrI8SelectColumnsOfB
    type: Args_Int32_GeneralInt32Int32Int32Int32Int32Int32General
  entry: intgemm::IntrI8SelectColumnsOfB
  export: int8_select_columns_of_b
  params:
    - I32  # inputMatrixBPrepared
    - I32  # rowsB
    - I32  # colsB
    - I32  # colIndexList
    - I32  # sizeColIndexList
    - I32  # output
#endif // ENABLE_WASM_MOZ_INTGEMM

View File

@ -5321,8 +5321,15 @@ static bool WebAssembly_mozIntGemm(JSContext* cx, unsigned argc, Value* vp) {
CallArgs args = CallArgsFromVp(argc, vp);
RootedWasmModuleObject module(cx);
if (!wasm::CompileIntrinsicModule(cx, mozilla::Span<IntrinsicOp>(),
Shareable::True, &module)) {
wasm::IntrinsicOp ops[] = {
wasm::IntrinsicOp::I8PrepareB,
wasm::IntrinsicOp::I8PrepareBFromTransposed,
wasm::IntrinsicOp::I8PrepareBFromQuantizedTransposed,
wasm::IntrinsicOp::I8PrepareA,
wasm::IntrinsicOp::I8PrepareBias,
wasm::IntrinsicOp::I8MultiplyAndAddBias,
wasm::IntrinsicOp::I8SelectColumnsOfB};
if (!wasm::CompileIntrinsicModule(cx, ops, Shareable::False, &module)) {
ReportOutOfMemory(cx);
return false;
}