Bug 926838 - [Part 1] Add new files, and update license file. r=gerv,ehsan

2025-02-08 16:03:21 +00:00 · 2013-11-13 11:07:24 +08:00 · 2013-11-13 11:07:24 +08:00 · 6c2a2d1f4e
commit 6c2a2d1f4e
parent 98d8b74fc0
60 changed files with 21176 additions and 0 deletions
--- a/media/openmax_dl/LICENSE
+++ b/media/openmax_dl/LICENSE
@ -0,0 +1,39 @@
+Use of this source code is governed by a BSD-style license that can be
+found in the LICENSE file in the root of the source tree. All
+contributing project authors may be found in the AUTHORS file in the
+root of the source tree.
+
+The files were originally licensed by ARM Limited.
+
+The following files:
+
+    * dl/api/omxtypes.h
+    * dl/sp/api/omxSP.h
+
+are licensed by Khronos:
+
+Copyright (c) 2005-2008,2015 The Khronos Group Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and/or associated documentation files (the
+"Materials"), to deal in the Materials without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Materials, and to
+permit persons to whom the Materials are furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be included
+in all copies or substantial portions of the Materials.
+
+MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
+KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
+SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
+   https://www.khronos.org/registry/
+
+THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
--- a/media/openmax_dl/OWNERS
+++ b/media/openmax_dl/OWNERS
@ -0,0 +1,3 @@
+ajm@google.com
+kma@google.com
+rtoy@google.com
--- a/media/openmax_dl/README.chromium
+++ b/media/openmax_dl/README.chromium
@ -0,0 +1,19 @@
+Name: OpenMAX DL
+Short Name: OpenMax DL
+URL: https://silver.arm.com/download/Software/Graphics/OX000-BU-00010-r1p0-00bet0/OX000-BU-00010-r1p0-00bet0.tgz
+Version: 1.0.2
+License: BSD
+License File: LICENSE
+Security Critical: yes
+
+Description:
+Implementation of OpenMAX DL spec from ARM.  This is used to support
+WebAudio for Chromium on Android.
+
+Local Modifications:
+Only the FFT routines from the OpenMAX DL package are included.  The
+code was modified to work with gcc and a new implementation for a
+floating-point FFT was added.
+
+The original ARM license is unclear, but Google has obtained
+permission to relicense this code under a BSD license.
--- a/media/openmax_dl/dl/api/armCOMM_s.h
+++ b/media/openmax_dl/dl/api/armCOMM_s.h
@ -0,0 +1,409 @@
+@// -*- Mode: asm; -*-
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This file was originally licensed as follows. It has been
+@//  relicensed with permission from the copyright holders.
+@//
+	
+@// 
+@// File Name:  armCOMM_s.h
+@// OpenMAX DL: v1.0.2
+@// Last Modified Revision:   13871
+@// Last Modified Date:       Fri, 09 May 2008
+@// 
+@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@// 
+@// 
+@//
+@// ARM optimized OpenMAX common header file
+@//
+
+	@ Assembly-time state shared by the macros in this file.
+	@ These are assembler symbols (not run-time data); the macros below
+	@ update them with .set while a function is being assembled.
+	.set	_SBytes, 0	@ Number of scratch bytes on stack
+	.set	_Workspace, 0	@ Stack offset of scratch workspace
+
+	@ 0 means "no registers saved" for both lists.
+	.set	_RRegList, 0	@ R saved register list (last register number)
+	.set	_DRegList, 0	@ D saved register list (last register number)
+
+        @// Work out a list of R saved registers, and how much stack space is needed.
+	@// gas doesn't support setting a variable to a string, so we set _RRegList to 
+	@// the register number.
+	.macro	_M_GETRREGLIST	rreg
+	@ Translate the "highest R register to save" argument into a number
+	@ stored in _RRegList:
+	@   ""          -> 0  (nothing saved)
+	@   "lr", "r4"  -> 4  (r4 and lr)
+	@   "r5", "r6"  -> 6
+	@   "r7", "r8"  -> 8
+	@   "r9", "r10" -> 10
+	@   "r11","r12" -> 12
+	@ Any other value produces a warning and leaves _RRegList unchanged.
+	.ifeqs "\rreg", ""
+	@ Nothing needs to be saved.  Reset explicitly so that a value left
+	@ behind by an earlier function in the same file is not reused.
+	.set	_RRegList, 0
+	.exitm
+	.endif
+	@ If rreg is lr or r4, save lr and r4
+	.ifeqs "\rreg", "lr"
+	.set	_RRegList, 4
+	.exitm
+	.endif
+
+	.ifeqs "\rreg", "r4"
+	.set	_RRegList, 4
+	.exitm
+	.endif
+
+	@ If rreg = r5 or r6, save up to register r6
+	.ifeqs "\rreg", "r5"
+	.set	_RRegList, 6
+	.exitm
+	.endif
+	.ifeqs "\rreg", "r6"
+	.set	_RRegList, 6
+	.exitm
+	.endif
+
+	@ If rreg = r7 or r8, save up to register r8
+	.ifeqs "\rreg", "r7"
+	.set	_RRegList, 8
+	.exitm
+	.endif
+	.ifeqs "\rreg", "r8"
+	.set	_RRegList, 8
+	.exitm
+	.endif
+
+	@ If rreg = r9 or r10, save up to register r10
+	.ifeqs "\rreg", "r9"
+	.set	_RRegList, 10
+	.exitm
+	.endif
+	.ifeqs "\rreg", "r10"
+	.set	_RRegList, 10
+	.exitm
+	.endif
+
+	@ If rreg = r11 or r12, save up to register r12
+	.ifeqs "\rreg", "r11"
+	.set	_RRegList, 12
+	.exitm
+	.endif
+	.ifeqs "\rreg", "r12"
+	.set	_RRegList, 12
+	.exitm
+	.endif
+
+	.warning "Unrecognized saved r register limit: \rreg"
+	.endm
+
+	@ Work out list of D saved registers, like for R registers.
+	.macro	_M_GETDREGLIST dreg
+	@ Translate the "highest D register to save" argument into a number
+	@ stored in _DRegList: "" -> 0 (nothing saved), "dN" -> N for
+	@ N = 8..15.  Any other value produces a warning and leaves
+	@ _DRegList unchanged.
+	.ifeqs "\dreg", ""
+	.set	_DRegList, 0
+	.exitm
+	.endif
+
+	.ifeqs "\dreg", "d8"
+	.set	_DRegList, 8
+	.exitm
+	.endif
+
+	.ifeqs "\dreg", "d9"
+	.set	_DRegList, 9
+	.exitm
+	.endif
+
+	.ifeqs "\dreg", "d10"
+	.set	_DRegList, 10
+	.exitm
+	.endif
+
+	.ifeqs "\dreg", "d11"
+	.set	_DRegList, 11
+	.exitm
+	.endif
+
+	.ifeqs "\dreg", "d12"
+	.set	_DRegList, 12
+	.exitm
+	.endif
+
+	.ifeqs "\dreg", "d13"
+	.set	_DRegList, 13
+	.exitm
+	.endif
+
+	.ifeqs "\dreg", "d14"
+	.set	_DRegList, 14
+	.exitm
+	.endif
+
+	.ifeqs "\dreg", "d15"
+	.set	_DRegList, 15
+	.exitm
+	.endif
+
+	@ Report the offending argument of *this* macro (\dreg).
+	.warning "Unrecognized saved d register limit: \dreg"
+	.endm
+
+@//////////////////////////////////////////////////////////
+@// Function header and footer macros
+@//////////////////////////////////////////////////////////      
+	
+        @ Function Header Macro    
+        @ Generates the function prologue
+        @ Note that functions should all be "stack-moves-once"
+        @ The FNSTART and FNEND macros should be the only places
+        @ where the stack moves.
+        @    
+        @  name  = function name
+        @  rreg  = ""   don't stack any registers
+        @          "lr" stack registers "r4,lr" (same as "r4")
+        @          "rN" stack registers "r4-rN,lr"; an odd N (r5..r11)
+        @               is rounded up to the next even register
+        @  dreg  = ""   don't stack any D registers
+        @          "dN" stack registers "d8-dN"
+        @
+        @ Note: the ARM Architecture Procedure Call Standard (AAPCS)
+        @ states that r4-r11, sp, d8-d15 must be preserved by
+        @ a compliant function.
+	.macro	M_START name, rreg, dreg
+	@ Emit the prologue of function \name: declare the symbol, push the
+	@ requested R and D registers, then reserve the scratch area that
+	@ has been accumulated in _SBytes.
+	.set	_Workspace, 0
+
+	@ Export \name and place it in its own .text.\name section.
+	.global	\name
+	.func	\name
+	.section	.text.\name,"ax",%progbits
+	.align	2
+\name :		
+.fnstart
+	@ Push the R registers selected by \rreg.
+	_M_GETRREGLIST	\rreg
+	_M_PUSH_RREG
+
+	@ Push the D registers selected by \dreg.
+	_M_GETDREGLIST	\dreg
+	_M_PUSH_DREG
+
+	@ Round the scratch size up to a multiple of 8, then claim it.
+	.set	_SBytes, (_SBytes + 7) & ~7
+	.if _SBytes
+		sub	sp, sp, #_SBytes
+	.endif
+	.endm
+
+        @ Function Footer Macro        
+        @ Generates the function epilogue
+	.macro M_END
+	@ Emit the epilogue matching M_START: release the scratch area,
+	@ restore the saved registers and return, then close the function.
+
+	@ Restore the stack pointer to its original value on function entry
+	.if _SBytes != 0
+		add	sp, sp, #_SBytes
+	.endif
+	@ Restore any saved R or D registers.
+	_M_RET
+	.fnend	
+	.endfunc
+	@ Reset the global stack tracking variables back to their
+	@ initial values so that nothing leaks into the next function
+	@ assembled from the same file.
+	.set _SBytes, 0
+	.set _RRegList, 0
+	.set _DRegList, 0
+	.endm
+
+	@// Based on the value of _DRegList, push the specified set of registers 
+	@// to the stack.  Is there a better way?
+	.macro _M_PUSH_DREG
+	@ Emit the vpush matching _DRegList (d8 up to d<_DRegList>).
+	@ Any value outside 8..15 emits nothing.
+	.if _DRegList == 8
+		vpush	{d8}
+	.elseif _DRegList == 9
+		vpush	{d8-d9}
+	.elseif _DRegList == 10
+		vpush	{d8-d10}
+	.elseif _DRegList == 11
+		vpush	{d8-d11}
+	.elseif _DRegList == 12
+		vpush	{d8-d12}
+	.elseif _DRegList == 13
+		vpush	{d8-d13}
+	.elseif _DRegList == 14
+		vpush	{d8-d14}
+	.elseif _DRegList == 15
+		vpush	{d8-d15}
+	.endif
+	.endm
+
+	@// Based on the value of _RRegList, push the specified set of registers 
+	@// to the stack.  Is there a better way?
+	.macro _M_PUSH_RREG
+	@ Emit the stmfd matching _RRegList (r4 up to r<_RRegList>, plus lr).
+	@ Any value other than 4, 6, 8, 10 or 12 emits nothing.
+	.if _RRegList == 4
+		stmfd	sp!, {r4, lr}
+	.elseif _RRegList == 6
+		stmfd	sp!, {r4-r6, lr}
+	.elseif _RRegList == 8
+		stmfd	sp!, {r4-r8, lr}
+	.elseif _RRegList == 10
+		stmfd	sp!, {r4-r10, lr}
+	.elseif _RRegList == 12
+		stmfd	sp!, {r4-r12, lr}
+	.endif
+	.endm
+
+	@// The opposite of _M_PUSH_DREG
+	.macro  _M_POP_DREG cc
+	@ Emit the vpop matching _DRegList (d8 up to d<_DRegList>).
+	@ Any value outside 8..15 emits nothing.
+	@
+	@  cc = optional condition code appended to the mnemonic.  _M_RET
+	@       forwards its own cc here, so the D-register restore must
+	@       carry the same condition as the return instruction emitted
+	@       by _M_POP_RREG.  When cc is omitted the plain vpop is used.
+	.if _DRegList == 8
+		vpop\cc	{d8}
+	.elseif _DRegList == 9
+		vpop\cc	{d8-d9}
+	.elseif _DRegList == 10
+		vpop\cc	{d8-d10}
+	.elseif _DRegList == 11
+		vpop\cc	{d8-d11}
+	.elseif _DRegList == 12
+		vpop\cc	{d8-d12}
+	.elseif _DRegList == 13
+		vpop\cc	{d8-d13}
+	.elseif _DRegList == 14
+		vpop\cc	{d8-d14}
+	.elseif _DRegList == 15
+		vpop\cc	{d8-d15}
+	.endif
+	.endm
+
+	@// The opposite of _M_PUSH_RREG
+	.macro _M_POP_RREG cc
+	@ Emit the return instruction matching _RRegList.
+	@  - 0: nothing was pushed, so return with bx lr.
+	@  - 4/6/8/10/12: reload r4..r<_RRegList> and load the saved lr
+	@    slot straight into pc.
+	@  cc = optional condition code inserted into the mnemonic.
+	.if _RRegList == 0
+		bx\cc lr
+	.elseif _RRegList == 4
+		ldm\cc\()fd	sp!, {r4, pc}
+	.elseif _RRegList == 6
+		ldm\cc\()fd	sp!, {r4-r6, pc}
+	.elseif _RRegList == 8
+		ldm\cc\()fd	sp!, {r4-r8, pc}
+	.elseif _RRegList == 10
+		ldm\cc\()fd	sp!, {r4-r10, pc}
+	.elseif _RRegList == 12
+		ldm\cc\()fd	sp!, {r4-r12, pc}
+	.endif
+	.endm
+	
+        @ Produce function return instructions
+	.macro	_M_RET cc
+	@ Restore in the reverse order of the pushes done by M_START:
+	@ D registers first, then the R registers.  _M_POP_RREG also
+	@ performs the actual return (load into pc, or bx lr).
+	@  cc = optional condition code, forwarded to both helpers.
+	_M_POP_DREG \cc
+	_M_POP_RREG \cc
+	.endm	
+	
+        @// Allocate 4-byte aligned area of name
+        @// |name| and size |size| bytes.
+	.macro	M_ALLOC4 name, size
+	@ Reserve \size bytes of scratch space at a 4-byte aligned offset.
+	@ The offset is recorded in the symbol <name>_F, which M_LDR/M_STR
+	@ use (via _M_DATA) to address the slot relative to sp.
+	.set	_SBytes, (_SBytes + 3) & ~3
+	.set	\name\()_F, _SBytes
+	.set	_SBytes, _SBytes + \size
+	.endm
+
+        @ Load word from stack
+	.macro M_LDR r, a0, a1, a2, a3
+	@ Forward to _M_DATA with instruction "ldr" and alignment 4.
+	@  r  = destination register
+	@  a0 = name of a stack slot (a symbol <a0>_F must exist, as
+	@       defined by M_ALLOC4)
+	@  a1..a3 are passed through unchanged; see _M_DATA for their use.
+	_M_DATA "ldr", 4, \r, \a0, \a1, \a2, \a3
+	.endm
+
+        @ Store word to stack
+	.macro M_STR r, a0, a1, a2, a3
+	@ Forward to _M_DATA with instruction "str" and alignment 4.
+	@  r  = source register
+	@  a0 = name of a stack slot (a symbol <a0>_F must exist, as
+	@       defined by M_ALLOC4)
+	@  a1..a3 are passed through unchanged; see _M_DATA for their use.
+	_M_DATA "str", 4, \r, \a0, \a1, \a2, \a3
+	.endm
+
+        @ Macro to perform a data access operation
+        @ Such as LDR or STR
+        @ The addressing mode is modified such that
+        @ 1. If no address is given then the name is taken
+        @    as a stack offset
+        @ 2. If the addressing mode is not available for the
+        @    state being assembled for (eg Thumb) then a suitable
+        @    addressing mode is substituted.
+        @
+        @ On Entry:
+        @ $i = Instruction to perform (eg "LDRB")
+        @ $a = Required byte alignment
+        @ $r = Register(s) to transfer (eg "r1")
+        @ $a0,$a1,$a2. Addressing mode and condition. One of:
+        @     label {,cc}
+        @     [base]                    {,,,cc}
+        @     [base, offset]{!}         {,,cc}
+        @     [base, offset, shift]{!}  {,cc}
+        @     [base], offset            {,,cc}
+        @     [base], offset, shift     {,cc}
+	@
+	@ WARNING: Most of the above are not supported, except the first case.
+	.macro _M_DATA i, a, r, a0, a1, a2, a3
+	@ Only the "name as stack offset" form is implemented here:
+	@  i  = mnemonic to emit
+	@  r  = register operand
+	@  a0 = slot name; its offset is taken from the symbol <a0>_F
+	@  a1 = text appended directly to the mnemonic (the condition code
+	@       in the "label {,cc}" form described above)
+	@  a, a2 and a3 are accepted but not used by this implementation.
+	@ _Offset is (re)defined on every expansion as a side effect.
+	.set	_Offset, _Workspace + \a0\()_F
+	\i\a1	\r, [sp, #_Offset]	
+	.endm
--- a/media/openmax_dl/dl/api/armOMX.h
+++ b/media/openmax_dl/dl/api/armOMX.h
@ -0,0 +1,289 @@
+/*
+ *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ *
+ *  This file was originally licensed as follows. It has been
+ *  relicensed with permission from the copyright holders.
+ */
+
+/* 
+ * 
+ * File Name:  armOMX_ReleaseVersion.h
+ * OpenMAX DL: v1.0.2
+ * Last Modified Revision:   15322
+ * Last Modified Date:       Wed, 15 Oct 2008
+ * 
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ * 
+ * 
+ *
+ * This file allows a version of the OMX DL libraries to be built where some or
+ * all of the function names can be given a user specified suffix. 
+ *
+ * You might want to use it where:
+ *
+ * - you want to rename a function "out of the way" so that you could replace
+ *   a function with a different version (the original version would still be
+ *   in the library just with a different name - so you could debug the new
+ *   version by comparing it to the output of the old)
+ *
+ * - you want to rename all the functions to versions with a suffix so that 
+ *   you can include two versions of the library and choose between functions
+ *   at runtime.
+ *
+ *     e.g. omxIPBM_Copy_U8_C1R could be renamed omxIPBM_Copy_U8_C1R_CortexA8
+ * 
+ */
+
+  
+#ifndef _armOMX_H_
+#define _armOMX_H_
+
+#define ARMOMX_ENABLE_RENAMING 0
+#if ARMOMX_ENABLE_RENAMING
+
+/* We need to define these two macros in order to expand and concatenate the names */
+#define OMXCAT2BAR(A, B) omx ## A ## B
+#define OMXCATBAR(A, B) OMXCAT2BAR(A, B)
+
+/* Define the suffix to add to all functions - the default is no suffix */
+#define BARE_SUFFIX 
+
+
+
+/* Define what happens to the bare suffix-less functions, down to the sub-domain accuracy */
+#define OMXACAAC_SUFFIX    BARE_SUFFIX   
+#define OMXACMP3_SUFFIX    BARE_SUFFIX
+#define OMXICJP_SUFFIX     BARE_SUFFIX
+#define OMXIPBM_SUFFIX     BARE_SUFFIX
+#define OMXIPCS_SUFFIX     BARE_SUFFIX
+#define OMXIPPP_SUFFIX     BARE_SUFFIX
+#define OMXSP_SUFFIX       BARE_SUFFIX
+#define OMXVCCOMM_SUFFIX   BARE_SUFFIX
+#define OMXVCM4P10_SUFFIX  BARE_SUFFIX
+#define OMXVCM4P2_SUFFIX   BARE_SUFFIX
+
+
+
+
+/* Define what each bare, un-suffixed OpenMAX API function name is to be renamed to */
+#define omxACAAC_DecodeChanPairElt                        OMXCATBAR(ACAAC_DecodeChanPairElt, OMXACAAC_SUFFIX)
+#define omxACAAC_DecodeDatStrElt                          OMXCATBAR(ACAAC_DecodeDatStrElt, OMXACAAC_SUFFIX)
+#define omxACAAC_DecodeFillElt                            OMXCATBAR(ACAAC_DecodeFillElt, OMXACAAC_SUFFIX)
+#define omxACAAC_DecodeIsStereo_S32                       OMXCATBAR(ACAAC_DecodeIsStereo_S32, OMXACAAC_SUFFIX)
+#define omxACAAC_DecodeMsPNS_S32_I                        OMXCATBAR(ACAAC_DecodeMsPNS_S32_I, OMXACAAC_SUFFIX)
+#define omxACAAC_DecodeMsStereo_S32_I                     OMXCATBAR(ACAAC_DecodeMsStereo_S32_I, OMXACAAC_SUFFIX)
+#define omxACAAC_DecodePrgCfgElt                          OMXCATBAR(ACAAC_DecodePrgCfgElt, OMXACAAC_SUFFIX)
+#define omxACAAC_DecodeTNS_S32_I                          OMXCATBAR(ACAAC_DecodeTNS_S32_I, OMXACAAC_SUFFIX)
+#define omxACAAC_DeinterleaveSpectrum_S32                 OMXCATBAR(ACAAC_DeinterleaveSpectrum_S32, OMXACAAC_SUFFIX)
+#define omxACAAC_EncodeTNS_S32_I                          OMXCATBAR(ACAAC_EncodeTNS_S32_I, OMXACAAC_SUFFIX)
+#define omxACAAC_LongTermPredict_S32                      OMXCATBAR(ACAAC_LongTermPredict_S32, OMXACAAC_SUFFIX)
+#define omxACAAC_LongTermReconstruct_S32_I                OMXCATBAR(ACAAC_LongTermReconstruct_S32_I, OMXACAAC_SUFFIX)
+#define omxACAAC_MDCTFwd_S32                              OMXCATBAR(ACAAC_MDCTFwd_S32, OMXACAAC_SUFFIX)
+#define omxACAAC_MDCTInv_S32_S16                          OMXCATBAR(ACAAC_MDCTInv_S32_S16, OMXACAAC_SUFFIX)
+#define omxACAAC_NoiselessDecode                          OMXCATBAR(ACAAC_NoiselessDecode, OMXACAAC_SUFFIX)
+#define omxACAAC_QuantInv_S32_I                           OMXCATBAR(ACAAC_QuantInv_S32_I, OMXACAAC_SUFFIX)
+#define omxACAAC_UnpackADIFHeader                         OMXCATBAR(ACAAC_UnpackADIFHeader, OMXACAAC_SUFFIX)
+#define omxACAAC_UnpackADTSFrameHeader                    OMXCATBAR(ACAAC_UnpackADTSFrameHeader, OMXACAAC_SUFFIX)
+
+
+#define omxACMP3_HuffmanDecode_S32                        OMXCATBAR(ACMP3_HuffmanDecode_S32, OMXACMP3_SUFFIX)
+#define omxACMP3_HuffmanDecodeSfb_S32                     OMXCATBAR(ACMP3_HuffmanDecodeSfb_S32, OMXACMP3_SUFFIX)
+#define omxACMP3_HuffmanDecodeSfbMbp_S32                  OMXCATBAR(ACMP3_HuffmanDecodeSfbMbp_S32, OMXACMP3_SUFFIX)
+#define omxACMP3_MDCTInv_S32                              OMXCATBAR(ACMP3_MDCTInv_S32, OMXACMP3_SUFFIX)
+#define omxACMP3_ReQuantize_S32_I                         OMXCATBAR(ACMP3_ReQuantize_S32_I, OMXACMP3_SUFFIX)
+#define omxACMP3_ReQuantizeSfb_S32_I                      OMXCATBAR(ACMP3_ReQuantizeSfb_S32_I, OMXACMP3_SUFFIX)
+#define omxACMP3_SynthPQMF_S32_S16                        OMXCATBAR(ACMP3_SynthPQMF_S32_S16, OMXACMP3_SUFFIX)
+#define omxACMP3_UnpackFrameHeader                        OMXCATBAR(ACMP3_UnpackFrameHeader, OMXACMP3_SUFFIX)
+#define omxACMP3_UnpackScaleFactors_S8                    OMXCATBAR(ACMP3_UnpackScaleFactors_S8, OMXACMP3_SUFFIX)
+#define omxACMP3_UnpackSideInfo                           OMXCATBAR(ACMP3_UnpackSideInfo, OMXACMP3_SUFFIX)
+
+#define omxICJP_CopyExpand_U8_C3                          OMXCATBAR(ICJP_CopyExpand_U8_C3, OMXICJP_SUFFIX)
+#define omxICJP_DCTFwd_S16                                OMXCATBAR(ICJP_DCTFwd_S16, OMXICJP_SUFFIX)
+#define omxICJP_DCTFwd_S16_I                              OMXCATBAR(ICJP_DCTFwd_S16_I, OMXICJP_SUFFIX)
+#define omxICJP_DCTInv_S16                                OMXCATBAR(ICJP_DCTInv_S16, OMXICJP_SUFFIX)
+#define omxICJP_DCTInv_S16_I                              OMXCATBAR(ICJP_DCTInv_S16_I, OMXICJP_SUFFIX)
+#define omxICJP_DCTQuantFwd_Multiple_S16                  OMXCATBAR(ICJP_DCTQuantFwd_Multiple_S16, OMXICJP_SUFFIX)
+#define omxICJP_DCTQuantFwd_S16                           OMXCATBAR(ICJP_DCTQuantFwd_S16, OMXICJP_SUFFIX)
+#define omxICJP_DCTQuantFwd_S16_I                         OMXCATBAR(ICJP_DCTQuantFwd_S16_I, OMXICJP_SUFFIX)
+#define omxICJP_DCTQuantFwdTableInit                      OMXCATBAR(ICJP_DCTQuantFwdTableInit, OMXICJP_SUFFIX)
+#define omxICJP_DCTQuantInv_Multiple_S16                  OMXCATBAR(ICJP_DCTQuantInv_Multiple_S16, OMXICJP_SUFFIX)
+#define omxICJP_DCTQuantInv_S16                           OMXCATBAR(ICJP_DCTQuantInv_S16, OMXICJP_SUFFIX)
+#define omxICJP_DCTQuantInv_S16_I                         OMXCATBAR(ICJP_DCTQuantInv_S16_I, OMXICJP_SUFFIX)
+#define omxICJP_DCTQuantInvTableInit                      OMXCATBAR(ICJP_DCTQuantInvTableInit, OMXICJP_SUFFIX)
+#define omxICJP_DecodeHuffman8x8_Direct_S16_C1            OMXCATBAR(ICJP_DecodeHuffman8x8_Direct_S16_C1, OMXICJP_SUFFIX)
+#define omxICJP_DecodeHuffmanSpecGetBufSize_U8            OMXCATBAR(ICJP_DecodeHuffmanSpecGetBufSize_U8, OMXICJP_SUFFIX)
+#define omxICJP_DecodeHuffmanSpecInit_U8                  OMXCATBAR(ICJP_DecodeHuffmanSpecInit_U8, OMXICJP_SUFFIX)
+#define omxICJP_EncodeHuffman8x8_Direct_S16_U1_C1         OMXCATBAR(ICJP_EncodeHuffman8x8_Direct_S16_U1_C1, OMXICJP_SUFFIX)
+#define omxICJP_EncodeHuffmanSpecGetBufSize_U8            OMXCATBAR(ICJP_EncodeHuffmanSpecGetBufSize_U8, OMXICJP_SUFFIX)
+#define omxICJP_EncodeHuffmanSpecInit_U8                  OMXCATBAR(ICJP_EncodeHuffmanSpecInit_U8, OMXICJP_SUFFIX)
+
+#define omxIPBM_AddC_U8_C1R_Sfs                           OMXCATBAR(IPBM_AddC_U8_C1R_Sfs, OMXIPBM_SUFFIX)
+#define omxIPBM_Copy_U8_C1R                               OMXCATBAR(IPBM_Copy_U8_C1R, OMXIPBM_SUFFIX)
+#define omxIPBM_Copy_U8_C3R                               OMXCATBAR(IPBM_Copy_U8_C3R, OMXIPBM_SUFFIX)
+#define omxIPBM_Mirror_U8_C1R                             OMXCATBAR(IPBM_Mirror_U8_C1R, OMXIPBM_SUFFIX)
+#define omxIPBM_MulC_U8_C1R_Sfs                           OMXCATBAR(IPBM_MulC_U8_C1R_Sfs, OMXIPBM_SUFFIX)
+
+#define omxIPCS_ColorTwistQ14_U8_C3R                      OMXCATBAR(IPCS_ColorTwistQ14_U8_C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_BGR565ToYCbCr420LS_MCU_U16_S16_C3P3R      OMXCATBAR(IPCS_BGR565ToYCbCr420LS_MCU_U16_S16_C3P3R, OMXIPCS_SUFFIX)
+#define omxIPCS_BGR565ToYCbCr422LS_MCU_U16_S16_C3P3R      OMXCATBAR(IPCS_BGR565ToYCbCr422LS_MCU_U16_S16_C3P3R, OMXIPCS_SUFFIX)
+#define omxIPCS_BGR565ToYCbCr444LS_MCU_U16_S16_C3P3R      OMXCATBAR(IPCS_BGR565ToYCbCr444LS_MCU_U16_S16_C3P3R, OMXIPCS_SUFFIX)
+#define omxIPCS_BGR888ToYCbCr420LS_MCU_U8_S16_C3P3R       OMXCATBAR(IPCS_BGR888ToYCbCr420LS_MCU_U8_S16_C3P3R, OMXIPCS_SUFFIX)
+#define omxIPCS_BGR888ToYCbCr422LS_MCU_U8_S16_C3P3R       OMXCATBAR(IPCS_BGR888ToYCbCr422LS_MCU_U8_S16_C3P3R, OMXIPCS_SUFFIX)
+#define omxIPCS_BGR888ToYCbCr444LS_MCU_U8_S16_C3P3R       OMXCATBAR(IPCS_BGR888ToYCbCr444LS_MCU_U8_S16_C3P3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr420RszCscRotBGR_U8_P3C3R             OMXCATBAR(IPCS_YCbCr420RszCscRotBGR_U8_P3C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr420RszRot_U8_P3R                     OMXCATBAR(IPCS_YCbCr420RszRot_U8_P3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr420ToBGR565_U8_U16_P3C3R             OMXCATBAR(IPCS_YCbCr420ToBGR565_U8_U16_P3C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr420ToBGR565LS_MCU_S16_U16_P3C3R      OMXCATBAR(IPCS_YCbCr420ToBGR565LS_MCU_S16_U16_P3C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr420ToBGR888LS_MCU_S16_U8_P3C3R       OMXCATBAR(IPCS_YCbCr420ToBGR888LS_MCU_S16_U8_P3C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr422RszCscRotBGR_U8_P3C3R             OMXCATBAR(IPCS_YCbCr422RszCscRotBGR_U8_P3C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_CbYCrY422RszCscRotBGR_U8_U16_C2R          OMXCATBAR(IPCS_CbYCrY422RszCscRotBGR_U8_U16_C2R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr422RszRot_U8_P3R                     OMXCATBAR(IPCS_YCbCr422RszRot_U8_P3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbYCr422ToBGR565_U8_U16_C2C3R            OMXCATBAR(IPCS_YCbYCr422ToBGR565_U8_U16_C2C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr422ToBGR565LS_MCU_S16_U16_P3C3R      OMXCATBAR(IPCS_YCbCr422ToBGR565LS_MCU_S16_U16_P3C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbYCr422ToBGR888_U8_C2C3R                OMXCATBAR(IPCS_YCbYCr422ToBGR888_U8_C2C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr422ToBGR888LS_MCU_S16_U8_P3C3R       OMXCATBAR(IPCS_YCbCr422ToBGR888LS_MCU_S16_U8_P3C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr422ToBGR888LS_MCU_S16_U8_P3C3R       OMXCATBAR(IPCS_YCbCr422ToBGR888LS_MCU_S16_U8_P3C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_CbYCrY422ToYCbCr420Rotate_U8_C2P3R        OMXCATBAR(IPCS_CbYCrY422ToYCbCr420Rotate_U8_C2P3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr422ToYCbCr420Rotate_U8_P3R           OMXCATBAR(IPCS_YCbCr422ToYCbCr420Rotate_U8_P3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr444ToBGR565_U8_U16_C3R               OMXCATBAR(IPCS_YCbCr444ToBGR565_U8_U16_C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr444ToBGR565_U8_U16_P3C3R             OMXCATBAR(IPCS_YCbCr444ToBGR565_U8_U16_P3C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr444ToBGR565LS_MCU_S16_U16_P3C3R      OMXCATBAR(IPCS_YCbCr444ToBGR565LS_MCU_S16_U16_P3C3R, OMXIPCS_SUFFIX)
+#define omxIPCS_YCbCr444ToBGR888_U8_C3R                   OMXCATBAR(IPCS_YCbCr444ToBGR888_U8_C3R, OMXIPCS_SUFFIX)
+
+#define omxIPPP_Deblock_HorEdge_U8_I                      OMXCATBAR(IPPP_Deblock_HorEdge_U8_I, OMXIPPP_SUFFIX)
+#define omxIPPP_Deblock_VerEdge_U8_I                      OMXCATBAR(IPPP_Deblock_VerEdge_U8_I, OMXIPPP_SUFFIX)
+#define omxIPPP_FilterFIR_U8_C1R                          OMXCATBAR(IPPP_FilterFIR_U8_C1R, OMXIPPP_SUFFIX)
+#define omxIPPP_FilterMedian_U8_C1R                       OMXCATBAR(IPPP_FilterMedian_U8_C1R, OMXIPPP_SUFFIX)
+#define omxIPPP_GetCentralMoment_S64                      OMXCATBAR(IPPP_GetCentralMoment_S64, OMXIPPP_SUFFIX)
+#define omxIPPP_GetSpatialMoment_S64                      OMXCATBAR(IPPP_GetSpatialMoment_S64, OMXIPPP_SUFFIX)
+#define omxIPPP_MomentGetStateSize                        OMXCATBAR(IPPP_MomentGetStateSize, OMXIPPP_SUFFIX)
+#define omxIPPP_MomentInit                                OMXCATBAR(IPPP_MomentInit, OMXIPPP_SUFFIX)
+#define omxIPPP_Moments_U8_C1R                            OMXCATBAR(IPPP_Moments_U8_C1R, OMXIPPP_SUFFIX)
+#define omxIPPP_Moments_U8_C3R                            OMXCATBAR(IPPP_Moments_U8_C3R, OMXIPPP_SUFFIX)
+
+#define omxSP_BlockExp_S16                                OMXCATBAR(SP_BlockExp_S16, OMXSP_SUFFIX)
+#define omxSP_BlockExp_S32                                OMXCATBAR(SP_BlockExp_S32, OMXSP_SUFFIX)
+#define omxSP_Copy_S16                                    OMXCATBAR(SP_Copy_S16, OMXSP_SUFFIX)
+#define omxSP_DotProd_S16                                 OMXCATBAR(SP_DotProd_S16, OMXSP_SUFFIX)
+#define omxSP_DotProd_S16_Sfs                             OMXCATBAR(SP_DotProd_S16_Sfs, OMXSP_SUFFIX)
+#define omxSP_FFTFwd_CToC_SC16_Sfs                        OMXCATBAR(SP_FFTFwd_CToC_SC16_Sfs, OMXSP_SUFFIX)
+#define omxSP_FFTFwd_CToC_SC32_Sfs                        OMXCATBAR(SP_FFTFwd_CToC_SC32_Sfs, OMXSP_SUFFIX)
+#define omxSP_FFTFwd_RToCCS_S16S32_Sfs                    OMXCATBAR(SP_FFTFwd_RToCCS_S16S32_Sfs, OMXSP_SUFFIX)
+#define omxSP_FFTFwd_RToCCS_S32_Sfs                       OMXCATBAR(SP_FFTFwd_RToCCS_S32_Sfs, OMXSP_SUFFIX)
+#define omxSP_FFTGetBufSize_C_SC16                        OMXCATBAR(SP_FFTGetBufSize_C_SC16, OMXSP_SUFFIX)
+#define omxSP_FFTGetBufSize_C_SC32                        OMXCATBAR(SP_FFTGetBufSize_C_SC32, OMXSP_SUFFIX)
+#define omxSP_FFTGetBufSize_R_S16S32                      OMXCATBAR(SP_FFTGetBufSize_R_S16S32, OMXSP_SUFFIX)
+#define omxSP_FFTGetBufSize_R_S32                         OMXCATBAR(SP_FFTGetBufSize_R_S32, OMXSP_SUFFIX)
+#define omxSP_FFTInit_C_SC16                              OMXCATBAR(SP_FFTInit_C_SC16, OMXSP_SUFFIX)
+#define omxSP_FFTInit_C_SC32                              OMXCATBAR(SP_FFTInit_C_SC32, OMXSP_SUFFIX)
+#define omxSP_FFTInit_R_S16S32                            OMXCATBAR(SP_FFTInit_R_S16S32, OMXSP_SUFFIX)
+#define omxSP_FFTInit_R_S32                               OMXCATBAR(SP_FFTInit_R_S32, OMXSP_SUFFIX)
+#define omxSP_FFTInv_CCSToR_S32_Sfs                       OMXCATBAR(SP_FFTInv_CCSToR_S32_Sfs, OMXSP_SUFFIX)
+#define omxSP_FFTInv_CCSToR_S32S16_Sfs                    OMXCATBAR(SP_FFTInv_CCSToR_S32S16_Sfs, OMXSP_SUFFIX)
+#define omxSP_FFTInv_CToC_SC16_Sfs                        OMXCATBAR(SP_FFTInv_CToC_SC16_Sfs, OMXSP_SUFFIX)
+#define omxSP_FFTInv_CToC_SC32_Sfs                        OMXCATBAR(SP_FFTInv_CToC_SC32_Sfs, OMXSP_SUFFIX)
+#define omxSP_FilterMedian_S32                            OMXCATBAR(SP_FilterMedian_S32, OMXSP_SUFFIX)
+#define omxSP_FilterMedian_S32_I                          OMXCATBAR(SP_FilterMedian_S32_I, OMXSP_SUFFIX)
+#define omxSP_FIR_Direct_S16                              OMXCATBAR(SP_FIR_Direct_S16, OMXSP_SUFFIX)
+#define omxSP_FIR_Direct_S16_I                            OMXCATBAR(SP_FIR_Direct_S16_I, OMXSP_SUFFIX)
+#define omxSP_FIR_Direct_S16_ISfs                         OMXCATBAR(SP_FIR_Direct_S16_ISfs, OMXSP_SUFFIX)
+#define omxSP_FIR_Direct_S16_Sfs                          OMXCATBAR(SP_FIR_Direct_S16_Sfs, OMXSP_SUFFIX)
+#define omxSP_FIROne_Direct_S16                           OMXCATBAR(SP_FIROne_Direct_S16, OMXSP_SUFFIX)
+#define omxSP_FIROne_Direct_S16_I                         OMXCATBAR(SP_FIROne_Direct_S16_I, OMXSP_SUFFIX)
+#define omxSP_FIROne_Direct_S16_ISfs                      OMXCATBAR(SP_FIROne_Direct_S16_ISfs, OMXSP_SUFFIX)
+#define omxSP_FIROne_Direct_S16_Sfs                       OMXCATBAR(SP_FIROne_Direct_S16_Sfs, OMXSP_SUFFIX)
+#define omxSP_IIR_BiQuadDirect_S16                        OMXCATBAR(SP_IIR_BiQuadDirect_S16, OMXSP_SUFFIX)
+#define omxSP_IIR_BiQuadDirect_S16_I                      OMXCATBAR(SP_IIR_BiQuadDirect_S16_I, OMXSP_SUFFIX)
+#define omxSP_IIR_Direct_S16                              OMXCATBAR(SP_IIR_Direct_S16, OMXSP_SUFFIX)
+#define omxSP_IIR_Direct_S16_I                            OMXCATBAR(SP_IIR_Direct_S16_I, OMXSP_SUFFIX)
+#define omxSP_IIROne_BiQuadDirect_S16                     OMXCATBAR(SP_IIROne_BiQuadDirect_S16, OMXSP_SUFFIX)
+#define omxSP_IIROne_BiQuadDirect_S16_I                   OMXCATBAR(SP_IIROne_BiQuadDirect_S16_I, OMXSP_SUFFIX)
+#define omxSP_IIROne_Direct_S16                           OMXCATBAR(SP_IIROne_Direct_S16, OMXSP_SUFFIX)
+#define omxSP_IIROne_Direct_S16_I                         OMXCATBAR(SP_IIROne_Direct_S16_I, OMXSP_SUFFIX)
+
+#define omxVCCOMM_Average_16x                             OMXCATBAR(VCCOMM_Average_16x, OMXVCCOMM_SUFFIX)
+#define omxVCCOMM_Average_8x                              OMXCATBAR(VCCOMM_Average_8x, OMXVCCOMM_SUFFIX)
+#define omxVCCOMM_ComputeTextureErrorBlock                OMXCATBAR(VCCOMM_ComputeTextureErrorBlock, OMXVCCOMM_SUFFIX)
+#define omxVCCOMM_ComputeTextureErrorBlock_SAD            OMXCATBAR(VCCOMM_ComputeTextureErrorBlock_SAD, OMXVCCOMM_SUFFIX)
+#define omxVCCOMM_Copy16x16                               OMXCATBAR(VCCOMM_Copy16x16, OMXVCCOMM_SUFFIX)
+#define omxVCCOMM_Copy8x8                                 OMXCATBAR(VCCOMM_Copy8x8, OMXVCCOMM_SUFFIX)
+#define omxVCCOMM_ExpandFrame_I                           OMXCATBAR(VCCOMM_ExpandFrame_I, OMXVCCOMM_SUFFIX)
+#define omxVCCOMM_LimitMVToRect                           OMXCATBAR(VCCOMM_LimitMVToRect, OMXVCCOMM_SUFFIX)
+#define omxVCCOMM_SAD_16x                                 OMXCATBAR(VCCOMM_SAD_16x, OMXVCCOMM_SUFFIX)
+#define omxVCCOMM_SAD_8x                                  OMXCATBAR(VCCOMM_SAD_8x, OMXVCCOMM_SUFFIX)
+
+#define omxVCM4P10_Average_4x                             OMXCATBAR(VCM4P10_Average_4x, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_BlockMatch_Half                        OMXCATBAR(VCM4P10_BlockMatch_Half, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_BlockMatch_Integer                     OMXCATBAR(VCM4P10_BlockMatch_Integer, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_BlockMatch_Quarter                     OMXCATBAR(VCM4P10_BlockMatch_Quarter, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_DeblockChroma_I                        OMXCATBAR(VCM4P10_DeblockChroma_I, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_DeblockLuma_I                          OMXCATBAR(VCM4P10_DeblockLuma_I, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_DecodeChromaDcCoeffsToPairCAVLC        OMXCATBAR(VCM4P10_DecodeChromaDcCoeffsToPairCAVLC, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_DecodeCoeffsToPairCAVLC                OMXCATBAR(VCM4P10_DecodeCoeffsToPairCAVLC, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_DequantTransformResidualFromPairAndAdd OMXCATBAR(VCM4P10_DequantTransformResidualFromPairAndAdd, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_FilterDeblockingChroma_HorEdge_I       OMXCATBAR(VCM4P10_FilterDeblockingChroma_HorEdge_I, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_FilterDeblockingChroma_VerEdge_I       OMXCATBAR(VCM4P10_FilterDeblockingChroma_VerEdge_I, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_FilterDeblockingLuma_HorEdge_I         OMXCATBAR(VCM4P10_FilterDeblockingLuma_HorEdge_I, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_FilterDeblockingLuma_VerEdge_I         OMXCATBAR(VCM4P10_FilterDeblockingLuma_VerEdge_I, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_GetVLCInfo                             OMXCATBAR(VCM4P10_GetVLCInfo, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_InterpolateChroma                      OMXCATBAR(VCM4P10_InterpolateChroma, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_InterpolateHalfHor_Luma                OMXCATBAR(VCM4P10_InterpolateHalfHor_Luma, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_InterpolateHalfVer_Luma                OMXCATBAR(VCM4P10_InterpolateHalfVer_Luma, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_InterpolateLuma                        OMXCATBAR(VCM4P10_InterpolateLuma, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_InvTransformDequant_ChromaDC           OMXCATBAR(VCM4P10_InvTransformDequant_ChromaDC, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_InvTransformDequant_LumaDC             OMXCATBAR(VCM4P10_InvTransformDequant_LumaDC, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_InvTransformResidualAndAdd             OMXCATBAR(VCM4P10_InvTransformResidualAndAdd, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_MEGetBufSize                           OMXCATBAR(VCM4P10_MEGetBufSize, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_MEInit                                 OMXCATBAR(VCM4P10_MEInit, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_MotionEstimationMB                     OMXCATBAR(VCM4P10_MotionEstimationMB, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_PredictIntra_16x16                     OMXCATBAR(VCM4P10_PredictIntra_16x16, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_PredictIntra_4x4                       OMXCATBAR(VCM4P10_PredictIntra_4x4, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_PredictIntraChroma_8x8                  OMXCATBAR(VCM4P10_PredictIntraChroma_8x8, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_SAD_4x                                 OMXCATBAR(VCM4P10_SAD_4x, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_SADQuar_16x                            OMXCATBAR(VCM4P10_SADQuar_16x, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_SADQuar_4x                             OMXCATBAR(VCM4P10_SADQuar_4x, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_SADQuar_8x                             OMXCATBAR(VCM4P10_SADQuar_8x, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_SATD_4x4                               OMXCATBAR(VCM4P10_SATD_4x4, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_SubAndTransformQDQResidual             OMXCATBAR(VCM4P10_SubAndTransformQDQResidual, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_TransformDequantChromaDCFromPair       OMXCATBAR(VCM4P10_TransformDequantChromaDCFromPair, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_TransformDequantLumaDCFromPair         OMXCATBAR(VCM4P10_TransformDequantLumaDCFromPair, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_TransformQuant_ChromaDC                OMXCATBAR(VCM4P10_TransformQuant_ChromaDC, OMXVCM4P10_SUFFIX)
+#define omxVCM4P10_TransformQuant_LumaDC                  OMXCATBAR(VCM4P10_TransformQuant_LumaDC, OMXVCM4P10_SUFFIX)
+
+#define omxVCM4P2_BlockMatch_Half_16x16                   OMXCATBAR(VCM4P2_BlockMatch_Half_16x16, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_BlockMatch_Half_8x8                     OMXCATBAR(VCM4P2_BlockMatch_Half_8x8, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_BlockMatch_Integer_16x16                OMXCATBAR(VCM4P2_BlockMatch_Integer_16x16, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_BlockMatch_Integer_8x8                  OMXCATBAR(VCM4P2_BlockMatch_Integer_8x8, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_DCT8x8blk                               OMXCATBAR(VCM4P2_DCT8x8blk, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_DecodeBlockCoef_Inter                   OMXCATBAR(VCM4P2_DecodeBlockCoef_Inter, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_DecodeBlockCoef_Intra                   OMXCATBAR(VCM4P2_DecodeBlockCoef_Intra, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_DecodePadMV_PVOP                        OMXCATBAR(VCM4P2_DecodePadMV_PVOP, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_DecodeVLCZigzag_Inter                   OMXCATBAR(VCM4P2_DecodeVLCZigzag_Inter, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_DecodeVLCZigzag_IntraACVLC              OMXCATBAR(VCM4P2_DecodeVLCZigzag_IntraACVLC, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_DecodeVLCZigzag_IntraDCVLC              OMXCATBAR(VCM4P2_DecodeVLCZigzag_IntraDCVLC, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_EncodeMV                                OMXCATBAR(VCM4P2_EncodeMV, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_EncodeVLCZigzag_Inter                   OMXCATBAR(VCM4P2_EncodeVLCZigzag_Inter, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_EncodeVLCZigzag_IntraACVLC              OMXCATBAR(VCM4P2_EncodeVLCZigzag_IntraACVLC, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_EncodeVLCZigzag_IntraDCVLC              OMXCATBAR(VCM4P2_EncodeVLCZigzag_IntraDCVLC, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_FindMVpred                              OMXCATBAR(VCM4P2_FindMVpred, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_IDCT8x8blk                              OMXCATBAR(VCM4P2_IDCT8x8blk, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_MCReconBlock                            OMXCATBAR(VCM4P2_MCReconBlock, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_MEGetBufSize                            OMXCATBAR(VCM4P2_MEGetBufSize, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_MEInit                                  OMXCATBAR(VCM4P2_MEInit, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_MotionEstimationMB                      OMXCATBAR(VCM4P2_MotionEstimationMB, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_PredictReconCoefIntra                   OMXCATBAR(VCM4P2_PredictReconCoefIntra, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_QuantInter_I                            OMXCATBAR(VCM4P2_QuantInter_I, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_QuantIntra_I                            OMXCATBAR(VCM4P2_QuantIntra_I, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_QuantInvInter_I                         OMXCATBAR(VCM4P2_QuantInvInter_I, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_QuantInvIntra_I                         OMXCATBAR(VCM4P2_QuantInvIntra_I, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_TransRecBlockCoef_inter                 OMXCATBAR(VCM4P2_TransRecBlockCoef_inter, OMXVCM4P2_SUFFIX)
+#define omxVCM4P2_TransRecBlockCoef_intra                 OMXCATBAR(VCM4P2_TransRecBlockCoef_intra, OMXVCM4P2_SUFFIX)
+
+#endif /* endif ARMOMX_ENABLE_RENAMING */
+#endif /* _armOMX_h_ */
--- a/media/openmax_dl/dl/api/omxtypes.h
+++ b/media/openmax_dl/dl/api/omxtypes.h
@ -0,0 +1,254 @@
+/**
+ * File: omxtypes.h
+ * Brief: Defines basic Data types used in OpenMAX v1.0.2 header files.
+ *
+ * Copyright (c) 2005-2008,2015 The Khronos Group Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and/or associated documentation files (the
+ * "Materials"), to deal in the Materials without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Materials, and to
+ * permit persons to whom the Materials are furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Materials.
+ *
+ * MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS
+ * KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS
+ * SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT
+ *    https://www.khronos.org/registry/
+ *
+ * THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
+ *
+ */
+  
+#ifndef _OMXTYPES_H_
+#define _OMXTYPES_H_
+
+#include <limits.h> 
+
+#define OMX_IN
+#define OMX_OUT
+#define OMX_INOUT
+
+
+/*
+ * Status/error codes used as function return values.
+ * Every enumerator has an explicit value, so the order of the entries below
+ * does not influence the numeric codes. dl/api/omxtypes_s.h repeats the same
+ * values as .equ constants for assembly sources; change both files together.
+ */
+typedef enum {
+    
+    /* Mandatory return codes - use cases are explicitly described for each function */
+    OMX_Sts_NoErr                    =  0,    /* No error, the function completed successfully */
+    OMX_Sts_Err                      = -2,    /* Unknown/unspecified error */    
+    OMX_Sts_InvalidBitstreamValErr   = -182,  /* Invalid value detected during bitstream processing */    
+    OMX_Sts_MemAllocErr              = -9,    /* Not enough memory allocated for the operation */
+    OMX_StsACAAC_GainCtrErr    	     = -159,  /* AAC: Unsupported gain control data detected */
+    OMX_StsACAAC_PrgNumErr           = -167,  /* AAC: Invalid number of elements for one program   */
+    OMX_StsACAAC_CoefValErr          = -163,  /* AAC: Invalid quantized coefficient value          */     
+    OMX_StsACAAC_MaxSfbErr           = -162,  /* AAC: Invalid maxSfb value in relation to numSwb */    
+	OMX_StsACAAC_PlsDataErr		     = -160,  /* AAC: pulse escape sequence data error */
+
+    /* Optional return codes - use cases are explicitly described for each function*/
+    OMX_Sts_BadArgErr                = -5,    /* Bad Arguments */
+
+    OMX_StsACAAC_TnsNumFiltErr       = -157,  /* AAC: Invalid number of TNS filters  */
+    OMX_StsACAAC_TnsLenErr           = -156,  /* AAC: Invalid TNS region length  */   
+    OMX_StsACAAC_TnsOrderErr         = -155,  /* AAC: Invalid order of TNS filter  */                  
+    OMX_StsACAAC_TnsCoefResErr       = -154,  /* AAC: Invalid bit-resolution for TNS filter coefficients  */
+    OMX_StsACAAC_TnsCoefErr          = -153,  /* AAC: Invalid TNS filter coefficients  */                  
+    OMX_StsACAAC_TnsDirectErr        = -152,  /* AAC: Invalid TNS filter direction  */  
+
+    OMX_StsICJP_JPEGMarkerErr        = -183,  /* JPEG marker encountered within an entropy-coded block; */
+                                              /* Huffman decoding operation terminated early.           */
+    OMX_StsICJP_JPEGMarker           = -181,  /* JPEG marker encountered; Huffman decoding */
+                                              /* operation terminated early.                         */
+    OMX_StsIPPP_ContextMatchErr      = -17,   /* Context parameter doesn't match to the operation */
+
+    OMX_StsSP_EvenMedianMaskSizeErr  = -180,  /* Even size of the Median Filter mask was replaced by the odd one */
+
+    /* Not a real status: an INT_MAX-valued entry so the enum must be able to represent the full int range. */
+    OMX_Sts_MaximumEnumeration       = INT_MAX  /*Placeholder, forces enum of size OMX_INT*/
+    
+ } OMXResult;          /** Return value or error value returned from a function. Identical to OMX_INT */
+
+ 
+/*
+ * Fixed-width integer typedefs for 8 and 16 bits.
+ * Each type is chosen at preprocessing time by comparing a <limits.h> maximum
+ * with the value the OMX type must be able to hold; when neither candidate
+ * matches, compilation stops with #error instead of picking a wrong width.
+ */
+/* OMX_U8 */
+#if UCHAR_MAX == 0xff
+typedef unsigned char OMX_U8;
+/* NOTE(review): ISO C requires USHRT_MAX >= 65535, so this fallback looks unreachable on a conforming compiler. */
+#elif USHRT_MAX == 0xff 
+typedef unsigned short int OMX_U8; 
+#else
+#error OMX_U8 undefined
+#endif 
+
+ 
+/* OMX_S8 */
+#if SCHAR_MAX == 0x7f 
+typedef signed char OMX_S8;
+/* NOTE(review): ISO C requires SHRT_MAX >= 32767, so this fallback looks unreachable as well. */
+#elif SHRT_MAX == 0x7f 
+typedef signed short int OMX_S8; 
+#else
+#error OMX_S8 undefined
+#endif
+ 
+ 
+/* OMX_U16: unsigned short is tried first, then unsigned int (for targets with a 16-bit int) */
+#if USHRT_MAX == 0xffff
+typedef unsigned short int OMX_U16;
+#elif UINT_MAX == 0xffff
+typedef unsigned int OMX_U16; 
+#else
+#error OMX_U16 undefined
+#endif
+
+
+/* OMX_S16: short is tried first, then int (for targets with a 16-bit int) */
+#if SHRT_MAX == 0x7fff 
+typedef signed short int OMX_S16;
+#elif INT_MAX == 0x7fff 
+typedef signed int OMX_S16; 
+#else
+#error OMX_S16 undefined
+#endif
+
+
+/* OMX_U32: unsigned type whose maximum is exactly 0xffffffff; unsigned int is tried first */
+#if UINT_MAX == 0xffffffff
+typedef unsigned int OMX_U32;
+#elif ULONG_MAX == 0xffffffff  /* fallback type is unsigned long, so test the unsigned limit (LONG_MAX never equals 0xffffffff for a 32-bit long) */
+typedef unsigned long int OMX_U32; 
+#else
+#error OMX_U32 undefined
+#endif
+
+
+/* OMX_S32: signed type whose maximum is 0x7fffffff; int is tried first, then long; #error if neither fits */
+#if INT_MAX == 0x7fffffff
+typedef signed int OMX_S32;
+#elif LONG_MAX == 0x7fffffff
+typedef long signed int OMX_S32; 
+#else
+#error OMX_S32 undefined
+#endif
+
+
+/* OMX_U64 & OMX_S64. MIN_S64 is spelled -MAX-1 so the constant is signed and negative (0x8000000000000000LL does not fit long long and would be unsigned); unsigned limits carry unsigned suffixes. */
+#if defined( _WIN32 ) || defined ( _WIN64 )
+    typedef __int64 OMX_S64; /** Signed 64-bit integer */
+    typedef unsigned __int64 OMX_U64; /** Unsigned 64-bit integer */
+    #define OMX_MIN_S64			(-0x7FFFFFFFFFFFFFFFi64 - 1)
+    #define OMX_MIN_U64			(0x0000000000000000ui64)
+    #define OMX_MAX_S64			(0x7FFFFFFFFFFFFFFFi64)
+    #define OMX_MAX_U64			(0xFFFFFFFFFFFFFFFFui64)
+#else
+    typedef long long OMX_S64; /** Signed 64-bit integer */
+    typedef unsigned long long OMX_U64; /** Unsigned 64-bit integer */
+    #define OMX_MIN_S64			(-0x7FFFFFFFFFFFFFFFLL - 1)
+    #define OMX_MIN_U64			(0x0000000000000000ULL)
+    #define OMX_MAX_S64			(0x7FFFFFFFFFFFFFFFLL)
+    #define OMX_MAX_U64			(0xFFFFFFFFFFFFFFFFULL)
+#endif
+
+
+/*
+ * Signed integer complex types, one per component width.
+ * In every struct the real part is declared before the imaginary part, so an
+ * array of these values is laid out as Re, Im, Re, Im, ...
+ */
+/* OMX_SC8 */
+typedef struct
+{
+  OMX_S8 Re; /** Real part */
+  OMX_S8 Im; /** Imaginary part */	
+	
+} OMX_SC8; /** Signed 8-bit complex number */
+
+
+/* OMX_SC16 */
+typedef struct
+{
+  OMX_S16 Re; /** Real part */
+  OMX_S16 Im; /** Imaginary part */	
+	
+} OMX_SC16; /** Signed 16-bit complex number */
+
+
+/* OMX_SC32 */
+typedef struct
+{
+  OMX_S32 Re; /** Real part */
+  OMX_S32 Im; /** Imaginary part */	
+	
+} OMX_SC32; /** Signed 32-bit complex number */
+
+
+/* OMX_SC64 */
+typedef struct
+{
+  OMX_S64 Re; /** Real part */
+  OMX_S64 Im; /** Imaginary part */	
+	
+} OMX_SC64; /** Signed 64-bit complex number */
+
+
+/* OMX_F32 */
+/* NOTE(review): unlike the integer types above, nothing here checks the width or format of float/double; IEEE 754 is assumed, not verified. */
+typedef float OMX_F32; /** Single precision floating point,IEEE 754 */
+
+
+/* OMX_F64 */
+typedef double OMX_F64; /** Double precision floating point,IEEE 754 */
+
+
+/* OMX_INT: plain int, so its width follows the compiler rather than a fixed bit count */
+typedef int OMX_INT; /** signed integer corresponding to machine word length, has maximum signed value INT_MAX*/
+
+
+/*
+ * Range limits for the 8/16/32-bit OMX integer types.
+ * OMX_MIN_S32 is written as (-2147483647-1) because the literal 2147483648
+ * does not fit a 32-bit int, so negating it would not yield an int constant.
+ * dl/api/omxtypes_s.h defines the same limits for assembly sources.
+ */
+#define OMX_MIN_S8  	   	(-128)
+#define OMX_MIN_U8  		0
+#define OMX_MIN_S16		 	(-32768)
+#define OMX_MIN_U16			0
+#define OMX_MIN_S32			(-2147483647-1)
+#define OMX_MIN_U32			0
+
+#define OMX_MAX_S8			(127)
+#define OMX_MAX_U8			(255)
+#define OMX_MAX_S16			(32767)
+#define OMX_MAX_U16			(0xFFFF)
+#define OMX_MAX_S32			(2147483647)
+#define OMX_MAX_U32			(0xFFFFFFFF)
+
+/* Alias for void, following the OMX naming scheme */
+typedef void OMXVoid;
+
+/*
+ * Fallback NULL for translation units that have not yet included a standard
+ * header defining it. C++ has no implicit conversion from void* to other
+ * pointer types, so ((void*)0) cannot initialise e.g. an int* there; plain 0
+ * is used for C++ and the original definition is kept for C.
+ */
+#ifndef NULL
+#ifdef __cplusplus
+#define NULL 0
+#else
+#define NULL ((void*)0)
+#endif
+#endif
+
+/** Defines the geometric position and size of a rectangle, 
+  * where x,y defines the coordinates of the top left corner
+  * of the rectangle, with dimensions width in the x-direction 
+  * and height in the y-direction. All four fields are OMX_INT. */
+typedef struct {
+	OMX_INT x;      /** x-coordinate of top left corner of rectangle */
+	OMX_INT y;      /** y-coordinate of top left corner of rectangle */
+	OMX_INT width;  /** Width in the x-direction. */
+	OMX_INT height; /** Height in the y-direction. */
+}OMXRect;
+
+
+/** Defines the geometric position of a point. */
+typedef struct 
+{
+ OMX_INT x; /** x-coordinate */
+ OMX_INT y;	/** y-coordinate */
+	
+} OMXPoint;
+
+
+/** Defines the dimensions of a rectangle, or region of interest in an image.
+  * Carries only the extent; the position is not stored in this type. */
+typedef struct 
+{
+ OMX_INT width;  /** Width of the rectangle, in the x-direction */
+ OMX_INT height; /** Height of the rectangle, in the y-direction */
+	
+} OMXSize;
+
+#endif /* _OMXTYPES_H_ */
--- a/media/openmax_dl/dl/api/omxtypes_s.h
+++ b/media/openmax_dl/dl/api/omxtypes_s.h
@ -0,0 +1,76 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This file was originally licensed as follows. It has been
+@//  relicensed with permission from the copyright holders.
+@//
+
+@// 
+@// File Name:  omxtypes_s.h
+@// OpenMAX DL: v1.0.2
+@// Last Modified Revision:   9622
+@// Last Modified Date:       Wed, 06 Feb 2008
+@// 
+@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@// 
+@//
+
+@// Assembler counterparts of constants from dl/api/omxtypes.h.
+@// The status codes below carry the same numeric values as the OMXResult
+@// enum in that header; change both files together.
+@//
+@// Mandatory return codes - use cases are explicitly described for each function 
	.equ	OMX_Sts_NoErr, 0    @// No error, the function completed successfully 
	.equ	OMX_Sts_Err, -2    @// Unknown/unspecified error     
	.equ	OMX_Sts_InvalidBitstreamValErr, -182  @// Invalid value detected during bitstream processing     
	.equ	OMX_Sts_MemAllocErr, -9    @// Not enough memory allocated for the operation 
	.equ	OMX_StsACAAC_GainCtrErr, -159  @// AAC: Unsupported gain control data detected 
	.equ	OMX_StsACAAC_PrgNumErr, -167  @// AAC: Invalid number of elements for one program   
	.equ	OMX_StsACAAC_CoefValErr, -163  @// AAC: Invalid quantized coefficient value               
	.equ	OMX_StsACAAC_MaxSfbErr, -162  @// AAC: Invalid maxSfb value in relation to numSwb     
	.equ	OMX_StsACAAC_PlsDataErr, -160  @// AAC: pulse escape sequence data error 
+
+@// Optional return codes - use cases are explicitly described for each function
	.equ	OMX_Sts_BadArgErr, -5    @// Bad Arguments 
+
	.equ	OMX_StsACAAC_TnsNumFiltErr, -157  @// AAC: Invalid number of TNS filters  
	.equ	OMX_StsACAAC_TnsLenErr, -156  @// AAC: Invalid TNS region length     
	.equ	OMX_StsACAAC_TnsOrderErr, -155  @// AAC: Invalid order of TNS filter                    
	.equ	OMX_StsACAAC_TnsCoefResErr, -154  @// AAC: Invalid bit-resolution for TNS filter coefficients  
	.equ	OMX_StsACAAC_TnsCoefErr, -153  @// AAC: Invalid TNS filter coefficients                    
	.equ	OMX_StsACAAC_TnsDirectErr, -152  @// AAC: Invalid TNS filter direction    
	.equ	OMX_StsICJP_JPEGMarkerErr, -183  @// JPEG marker encountered within an entropy-coded block; 
+                                            @// Huffman decoding operation terminated early.           
	.equ	OMX_StsICJP_JPEGMarker, -181  @// JPEG marker encountered; Huffman decoding 
+                                            @// operation terminated early.                         
	.equ	OMX_StsIPPP_ContextMatchErr, -17   @// Context parameter doesn't match to the operation 
+
	.equ	OMX_StsSP_EvenMedianMaskSizeErr, -180  @// Even size of the Median Filter mask was replaced by the odd one 
+
+@// Same value as INT_MAX for a 32-bit int, matching the enum placeholder
	.equ	OMX_Sts_MaximumEnumeration, 0x7FFFFFFF
+
+
+
+@// Range limits of the 8/16/32-bit OMX integer types (same values as the C macros)
	.equ	OMX_MIN_S8, (-128)
	.equ	OMX_MIN_U8, 0
	.equ	OMX_MIN_S16, (-32768)
	.equ	OMX_MIN_U16, 0
+
+
	.equ	OMX_MIN_S32, (-2147483647-1)
	.equ	OMX_MIN_U32, 0
+
	.equ	OMX_MAX_S8, (127)
	.equ	OMX_MAX_U8, (255)
	.equ	OMX_MAX_S16, (32767)
	.equ	OMX_MAX_U16, (0xFFFF)
	.equ	OMX_MAX_S32, (2147483647)
	.equ	OMX_MAX_U32, (0xFFFFFFFF)
+
+@// NOTE(review): the OMX_VC_* flags have no counterpart in dl/api/omxtypes.h;
+@// presumably they mirror a video-codec header - confirm the values there.
	.equ	OMX_VC_UPPER, 0x1                 @// Used by the PredictIntra functions   
	.equ	OMX_VC_LEFT, 0x2                 @// Used by the PredictIntra functions 
	.equ	OMX_VC_UPPER_RIGHT, 0x40          @// Used by the PredictIntra functions   
+
	.equ	NULL, 0
--- a/media/openmax_dl/dl/sp/api/armSP.h
+++ b/media/openmax_dl/dl/sp/api/armSP.h
@ -0,0 +1,92 @@
+/*
+ *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ *
+ *  This file was originally licensed as follows. It has been
+ *  relicensed with permission from the copyright holders.
+ */
+
+/**
+ * 
+ * File Name:  armSP.h
+ * OpenMAX DL: v1.0.2
+ * Last Modified Revision:   7014
+ * Last Modified Date:       Wed, 01 Aug 2007
+ * 
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ * 
+ * 
+ *   
+ * File: armSP.h
+ * Brief: Declares APIs and basic data types used across the OpenMAX Signal Processing domain
+ *
+ */
+#ifndef _armSP_H_
+#define _armSP_H_
+
+#include "dl/api/omxtypes.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** FFT Specific declarations */
+/* Twiddle tables are defined in another translation unit. Only the S32
+ * table has its length (1026 entries) fixed by this declaration. */
+extern  OMX_S32 armSP_FFT_S32TwiddleTable[1026];
+extern OMX_F32 armSP_FFT_F32TwiddleTable[];
+
+/*
+ * FFT specification structures. All five variants share the field order
+ * N, pBitRev, pTwiddle, pBuf and differ only in the element types of the
+ * twiddle table and the work buffer.
+ *
+ * Assembly code reads these fields through hard-coded byte offsets
+ * (N = 0, pBitRev = 4, pTwiddle = 8, pBuf = 12; see
+ * armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe_s.S). Those offsets match
+ * this layout only with 4-byte pointers, so do not reorder or resize the
+ * fields without updating the assembly.
+ */
+typedef struct  ARMsFFTSpec_SC32_Tag 
+{
+    OMX_U32     N;
+    OMX_U16     *pBitRev;    
+    OMX_SC32    *pTwiddle;
+    OMX_SC32    *pBuf;
+}ARMsFFTSpec_SC32;
+
+
+typedef struct  ARMsFFTSpec_SC16_Tag 
+{
+    OMX_U32     N;
+    OMX_U16     *pBitRev;    
+    OMX_SC16    *pTwiddle;
+    OMX_SC16    *pBuf;
+}ARMsFFTSpec_SC16;
+
+/* Real-input variant: complex twiddles, but the work buffer holds plain OMX_S32 values */
+typedef struct  ARMsFFTSpec_R_SC32_Tag 
+{
+    OMX_U32     N;
+    OMX_U16     *pBitRev;    
+    OMX_SC32    *pTwiddle;
+    OMX_S32     *pBuf;
+}ARMsFFTSpec_R_SC32;
+
+/* NOTE(review): OMX_FC32 is not declared by dl/api/omxtypes.h as included
+ * above; confirm that the header defining it is always included first. */
+typedef struct ARMsFFTSpec_R_FC32_Tag
+{
+    OMX_U32 N;
+    OMX_U16* pBitRev;
+    OMX_FC32* pTwiddle;
+    OMX_F32* pBuf;
+} ARMsFFTSpec_R_FC32;
+
+typedef struct ARMsFFTSpec_FC32_Tag
+{
+    OMX_U32 N;
+    OMX_U16* pBitRev;
+    OMX_FC32* pTwiddle;
+    OMX_FC32* pBuf;
+} ARMsFFTSpec_FC32;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
+/*End of File*/
+
+
+
--- a/media/openmax_dl/dl/sp/api/omxSP.h
+++ b/media/openmax_dl/dl/sp/api/omxSP.h
--- a/media/openmax_dl/dl/sp/src/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe_s.S
+++ b/media/openmax_dl/dl/sp/src/armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe_s.S
@ -0,0 +1,294 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This is a modification of
+@//  armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.s to support float
+@//  instead of SC32.
+@//
+
+@//
+@// Description:
+@// Compute the "preTwiddleRadix2" stage prior to the call to the complexFFT
+@// It does a Z(k) = Feven(k) + jW^(-k) FOdd(k); k=0,1,2,...N/2-1 computation
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+
+
+@// Import symbols required from other files
+@// (For example tables)
+
+
+@// Set debugging level
+@//DEBUG_ON    SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+
+
+
+      @// Guarding implementation by the processor name
+
+
+
+@// Register aliases. These are C preprocessor macros, so each name below is
+@// replaced textually everywhere in the rest of this file. Several names map
+@// to the same physical register; FFTSTAGE references only a subset of them
+@// (pSrc, pFFTSpec, pTwiddle, pOut, N, size, step, step1, twStep,
+@// pTwiddleTmp, argTwiddle1, pOut1 and the NEON names it uses).
+@//
+@//Input Registers
+
+#define pSrc            r0
+#define pDst            r1
+#define pFFTSpec        r2
+#define scale           r3
+
+
+@// Output registers
+#define result          r0
+
+@//Local Scratch Registers
+
+@// r4 is shared by argScale, tmpOrder, pTwiddle and x0r
+#define argTwiddle      r1
+#define argDst          r2
+#define argScale        r4
+#define tmpOrder        r4
+#define pTwiddle        r4
+#define pOut            r5
+#define subFFTSize      r7
+#define subFFTNum       r6
+#define N               r6
+#define order           r14
+#define diff            r9
+@// Total num of radix stages required to complete the FFT
+#define count           r8
+#define x0r             r4
+#define x0i             r5
+#define diffMinusOne    r2
+#define round           r3
+
+@// pOut1 is r2, the same register as pFFTSpec
+#define pOut1           r2
+#define size            r7
+#define step            r8
+#define step1           r9
+#define twStep          r10
+#define pTwiddleTmp     r11
+#define argTwiddle1     r12
+#define zero            r14
+
+@// Neon registers
+@// Each alias carries the .F32 type specifier, so the instructions that use
+@// them need no data-type suffix on the mnemonic. D0-D7 are named twice
+@// (X/Y and r/i/W views of the same registers).
+
+#define dX0     D0.F32
+#define dShift  D1.F32
+#define dX1     D1.F32
+#define dY0     D2.F32
+#define dY1     D3.F32
+#define dX0r    D0.F32
+#define dX0i    D1.F32
+#define dX1r    D2.F32
+#define dX1i    D3.F32
+#define dW0r    D4.F32
+#define dW0i    D5.F32
+#define dW1r    D6.F32
+#define dW1i    D7.F32
+#define dT0     D8.F32
+#define dT1     D9.F32
+#define dT2     D10.F32
+#define dT3     D11.F32
+@// NOTE(review): the qT* names map to D registers and are not referenced by
+@// FFTSTAGE in this file; presumably left over from the S32 original.
+#define qT0     D12.F32
+#define qT1     D14.F32
+#define qT2     D16.F32
+#define qT3     D18.F32
+#define dY0r    D4.F32
+#define dY0i    D5.F32
+#define dY1r    D6.F32
+#define dY1i    D7.F32
+
+#define dY2     D4.F32
+#define dY3     D5.F32
+#define dW0     D6.F32
+#define dW1     D7.F32
+#define dW0Tmp  D10.F32
+#define dW1Neg  D11.F32
+
+@// Holds the constant 0.5 loaded at the start of FFTSTAGE
+#define half    D13.F32
+
+@ Structure offsets for the FFTSpec
+@ These byte offsets follow the field order N, pBitRev, pTwiddle, pBuf used
+@ by the ARMsFFTSpec_* structs in dl/sp/api/armSP.h and assume 4-byte
+@ pointers. FFTSTAGE loads N, pTwiddle and pBuf; pBitRev is not read here.
+        .set    ARMsFFTSpec_N, 0
+        .set    ARMsFFTSpec_pBitRev, 4
+        .set    ARMsFFTSpec_pTwiddle, 8
+        .set    ARMsFFTSpec_pBuf, 12
+
+        @// FFTSTAGE scaled, inverse, name
+        @//
+        @// Expands to the pre-twiddle pass: Z(0) is handled first, then a loop
+        @// that produces four outputs per iteration, then the last element.
+        @// The scaled and inverse arguments are not referenced in the body;
+        @// name is only appended to the local labels.
+        @//
+        @// Inputs:   r0 (pSrc)     input data pointer
+        @//           r2 (pFFTSpec) spec; N, pTwiddle and pBuf are read from it
+        @// Output:   written through pOut1, which starts 4*N bytes (N/2 * 8)
+        @//           past the pBuf pointer. r1 (pDst) and r3 (scale) are not
+        @//           referenced.
+        @// Modifies: r0, r2, r4-r12, D0-D11, D13 and the condition flags.
+        @//           r2 stops being pFFTSpec once pOut1 is computed.
+        .MACRO FFTSTAGE scaled, inverse, name
+
+        @// Read the size from structure and take log
+        LDR     N, [pFFTSpec, #ARMsFFTSpec_N]
+
+        @// Read other structure parameters
+        LDR     pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
+        LDR     pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
+
+        @// Both lanes of half become 0.5; it supplies the 1/2 factor below
+        VMOV    half, 0.5
+
+
+        MOV     size,N,ASR #1                 @// preserve the contents of N
+        MOV     step,N,LSL #2                 @// step = N/2 * 8 bytes
+
+
+        @// Z(k) = 1/2 {[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]}
+        @// Note: W^(k) is stored as negated value and also need to
+        @// conjugate the values from the table
+
+        @// Z(0) : no need of twiddle multiply
+        @// Z(0) = 1/2 { [F(0) +  F'(N/2)] +j [F(0) -  F'(N/2)] }
+
+        VLD1    dX0,[pSrc],step
+        @// pOut1 shares r2 with pFFTSpec; all spec fields were loaded above
+        ADD     pOut1,pOut,step               @// pOut1 = pOut+ N/2*8 bytes
+
+        VLD1    dX1,[pSrc]!
+        @// twStep = 3N/8 * 8 bytes pointing to W^1
+        SUB     twStep,step,size,LSL #1
+
+        MOV     step1,size,LSL #2             @// step1 = N/4 * 8 = N/2*4 bytes
+        SUB     step1,step1,#8                @// (N/4-1)*8 bytes
+
+        VADD    dY0,dX0,dX1                   @// [b+d | a+c]
+        VSUB    dY1,dX0,dX1                   @// [b-d | a-c]
+        VMUL    dY0, dY0, half[0]
+        VMUL    dY1, dY1, half[0]
+
+        @// dY0= [a-c | a+c] ;dY1= [b-d | b+d]
+        VZIP    dY0,dY1
+
+        VSUB   dX0,dY0,dY1
+        @// size = N/2 - 2; the flags set here drive the BLT/BEQ below, and no
+        @// instruction in between writes the flags
+        SUBS   size,size,#2
+        VADD   dX1,dY0,dY1
+
+        SUB     pSrc,pSrc,step
+
+        @// Z(0) is stored as lane 0 of the difference, then lane 1 of the sum
+        VST1    dX0[0],[pOut1]!
+        ADD     pTwiddleTmp,pTwiddle,#8       @// W^2
+        VST1    dX1[1],[pOut1]!
+        ADD     argTwiddle1,pTwiddle,twStep   @// W^1
+
+
+        @// N/2 < 2: nothing more to do; N/2 == 2: only the last element is left
+        BLT     decrementScale\name
+        BEQ     lastElement\name
+
+
+        @// Z(k) = 1/2[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]
+        @// Note: W^k is stored as negative values in the table and also
+        @// need to conjugate the values from the table.
+        @//
+        @// Process 4 elements at a time. E.g: Z(1),Z(2) and Z(N/2-2),Z(N/2-1)
+        @// since both of them require F(1),F(2) and F(N/2-2),F(N/2-1)
+
+
+        @// NOTE(review): step is shortened so that the register post-indexed
+        @// loads/stores below appear to hop from the low-index pair to the
+        @// mirrored high-index pair - confirm against the data layout.
+        SUB     step,step,#24
+evenOddButterflyLoop\name :
+
+
+        VLD1    dW0r,[argTwiddle1],step1
+        VLD1    dW1r,[argTwiddle1]!
+
+        VLD2    {dX0r,dX0i},[pSrc],step
+        SUB     argTwiddle1,argTwiddle1,step1
+        VLD2    {dX1r,dX1i},[pSrc]!
+
+        SUB     step1,step1,#8                @// (N/4-2)*8 bytes
+        VLD1    dW0i,[pTwiddleTmp],step1
+        VLD1    dW1i,[pTwiddleTmp]!
+        SUB     pSrc,pSrc,step
+
+        SUB     pTwiddleTmp,pTwiddleTmp,step1
+        VREV64  dX1r,dX1r
+        VREV64  dX1i,dX1i
+        @// Loop counter update; the BGT at the bottom consumes these flags
+        SUBS    size,size,#4
+
+
+        VSUB    dT2,dX0r,dX1r                 @// a-c
+        VADD    dT3,dX0i,dX1i                 @// b+d
+        VADD    dT0,dX0r,dX1r                 @// a+c
+        VSUB    dT1,dX0i,dX1i                 @// b-d
+        SUB     step1,step1,#8
+
+        VMUL    dT2, dT2, half[0]
+        VMUL    dT3, dT3, half[0]
+
+        VMUL    dT0, dT0, half[0]
+        VMUL    dT1, dT1, half[0]
+
+        VZIP    dW1r,dW1i
+        VZIP    dW0r,dW0i
+
+
+        VMUL   dX1r,dW1r,dT2
+        VMUL   dX1i,dW1r,dT3
+        VMUL   dX0r,dW0r,dT2
+        VMUL   dX0i,dW0r,dT3
+
+        VMLS   dX1r,dW1i,dT3
+        VMLA   dX1i,dW1i,dT2
+
+        VMLA   dX0r,dW0i,dT3
+        VMLS   dX0i,dW0i,dT2
+
+
+        @// dY1r/dY1i alias D6/D7 (dW1r/dW1i) and dY0r/dY0i alias D4/D5
+        @// (dW0r/dW0i); every twiddle product above is done before these writes
+        VADD    dY1r,dT0,dX1i                 @// F(N/2 -1)
+        VSUB    dY1i,dX1r,dT1
+
+        VREV64  dY1r,dY1r
+        VREV64  dY1i,dY1i
+
+
+        VADD    dY0r,dT0,dX0i                 @// F(1)
+        VSUB    dY0i,dT1,dX0r
+
+
+        VST2    {dY0r,dY0i},[pOut1],step
+        VST2    {dY1r,dY1i},[pOut1]!
+        SUB     pOut1,pOut1,step
+        SUB     step,step,#32                 @// (N/2-4)*8 bytes
+
+
+        BGT     evenOddButterflyLoop\name
+
+
+        @// set both the ptrs to the last element
+        SUB     pSrc,pSrc,#8
+        SUB     pOut1,pOut1,#8
+
+        @// Last element can be expanded as follows
+        @// 1/2[Z(k) + Z'(k)] - j w^-k [Z(k) - Z'(k)] (since W^k is stored as
+        @// -ve)
+        @// 1/2[(a+jb) + (a-jb)] - j w^-k [(a+jb) - (a-jb)]
+        @// 1/2[2a+j0] - j (c-jd) [0+j2b]
+        @// (a+bc, -bd)
+        @// Since (c,d) = (0,1) for the last element, result is just (a,-b)
+
+lastElement\name :
+        VLD1    dX0r,[pSrc]
+
+        @// Store lane 0 unchanged, then lane 1 after negating the register
+        VST1    dX0r[0],[pOut1]!
+        VNEG    dX0r,dX0r
+        VST1    dX0r[1],[pOut1]
+
+
+
+@// Exit label; no instructions follow it inside the macro
+decrementScale\name :
+
+        .endm
+
+        @// Entry point: the whole body is one expansion of FFTSTAGE with the
+        @// label suffix Inv. M_START and M_END come from dl/api/armCOMM_s.h.
+        @// NOTE(review): the r4 argument presumably limits which registers the
+        @// prologue saves, while FFTSTAGE modifies r4-r12, D8-D11 and D13 -
+        @// confirm that callers of this _unsafe routine preserve what they need.
+        M_START armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe,r4
+
+            FFTSTAGE "FALSE","TRUE",Inv
+        M_END
+
+        .end
--- a/media/openmax_dl/dl/sp/src/armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.S
+++ b/media/openmax_dl/dl/sp/src/armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.S
@ -0,0 +1,321 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This file was originally licensed as follows. It has been
+@//  relicensed with permission from the copyright holders.
+@//
+
+@// 
+@// File Name:  armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe_s.s
+@// OpenMAX DL: v1.0.2
+@// Last Modified Revision:   7485
+@// Last Modified Date:       Fri, 21 Sep 2007
+@// 
+@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@// 
+@// 
+@//
+@// Description:
+@// Compute the "preTwiddleRadix2" stage prior to the call to the complexFFT  
+@// It does a Z(k) = Feven(k) + jW^(-k) FOdd(k); k=0,1,2,...N/2-1 computation
+@// It implements both "scaled"(by 1/2) and "unscaled" versions of the above formula
+@// 
+
+        
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+        
+        
+@// Import symbols required from other files
+@// (For example tables)
+        
+                
+@// Set debugging level        
+@//DEBUG_ON    SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+    
+    
+    
+      @// Guarding implementation by the processor name
+    
+    
+    
+@//Input Registers
+
+#define pSrc            r0
+#define pDst            r1
+#define pFFTSpec        r2
+#define scale           r3
+
+
+@// Output registers
+#define result          r0
+
+@//Local Scratch Registers
+@// Many names below alias one core register (r2, r4, r6, ...); aliases cannot be live together.
+#define argTwiddle      r1
+#define argDst          r2
+#define argScale        r4
+#define tmpOrder        r4
+#define pTwiddle        r4
+#define pOut            r5
+#define subFFTSize      r7     
+#define subFFTNum       r6
+#define N               r6
+#define order           r14
+#define diff            r9
+#define count           r8                   @// Total num of radix stages required to complete the FFT
+#define x0r             r4    
+#define x0i             r5
+#define diffMinusOne    r2
+#define round           r3
+
+#define pOut1           r2
+#define size            r7
+#define step            r8            
+#define step1           r9
+#define twStep          r10
+#define pTwiddleTmp     r11
+#define argTwiddle1     r12
+#define zero            r14
+
+@// Neon registers
+@// Names overlap here too (dX1 and dShift are both D1; dY0r..dY1i reuse D4..D7 of dW0r..dW1i).
+#define dX0     D0.S32
+#define dShift  D1.S32
+#define dX1     D1.S32
+#define dY0     D2.S32
+#define dY1     D3.S32
+#define dX0r    D0.S32            
+#define dX0i    D1.S32
+#define dX1r    D2.S32
+#define dX1i    D3.S32
+#define dW0r    D4.S32
+#define dW0i    D5.S32
+#define dW1r    D6.S32
+#define dW1i    D7.S32
+#define dT0     D8.S32
+#define dT1     D9.S32
+#define dT2     D10.S32
+#define dT3     D11.S32
+#define qT0     Q6.S64
+#define qT1     Q7.S64
+#define qT2     Q8.S64
+#define qT3     Q9.S64
+#define dY0r    D4.S32
+#define dY0i    D5.S32
+#define dY1r    D6.S32
+#define dY1i    D7.S32
+
+#define dY2     D4.S32
+#define dY3     D5.S32
+#define dW0     D6.S32
+#define dW1     D7.S32
+#define dW0Tmp  D10.S32
+#define dW1Neg  D11.S32
+
+
+@ Structure offsets for the FFTSpec             
+        .set    ARMsFFTSpec_N, 0
+        .set    ARMsFFTSpec_pBitRev, 4
+        .set    ARMsFFTSpec_pTwiddle, 8
+        .set    ARMsFFTSpec_pBuf, 12
+
+
+        .MACRO FFTSTAGE scaled, inverse, name
+        @// Pre-twiddle pass for the S32 CCS-to-real inverse FFT (see the file description).
+        @// Read the FFT size N from the spec structure (no log is computed here)
+        LDR     N, [pFFTSpec, #ARMsFFTSpec_N]
+        
+        @// Read other structure parameters
+        LDR     pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
+        LDR     pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
+        @// scaled = "TRUE" picks the halving (VH*) forms and decrements scale at the end.
+        @// inverse is accepted but never tested in this macro.
+        @// name is appended to every label so the macro can be expanded more than once.
+        MOV     size,N,ASR #1                    @// preserve the contents of N
+        MOV     step,N,LSL #2                    @// step = N/2 * 8 bytes
+        
+        
+        @// Z(k) = 1/2 {[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]}
+        @// Note: W^(k) is stored as negated value and also need to conjugate the values from the table
+        
+        @// Z(0) : no need of twiddle multiply
+        @// Z(0) = 1/2 { [F(0) +  F'(N/2)] +j [F(0) -  F'(N/2)] }
+        
+        VLD1    dX0,[pSrc],step
+        ADD     pOut1,pOut,step                  @// pOut1 = pOut+ N/2*8 bytes 
+        @// pOut1 and pFFTSpec are both r2: the spec pointer is overwritten by the ADD above.
+        VLD1    dX1,[pSrc]!
+        SUB     twStep,step,size,LSL #1          @// twStep = 3N/8 * 8 bytes pointing to W^1
+        
+        MOV     step1,size,LSL #2                @// step1 = N/4 * 8 = N/2*4 bytes
+        SUB     step1,step1,#8                   @// (N/4-1)*8 bytes
+        
+        VHADD    dY0,dX0,dX1                     @// [b+d | a+c]
+        VHSUB    dY1,dX0,dX1                     @// [b-d | a-c] 
+        VZIP    dY0,dY1                          @// dY0= [a-c | a+c] ;dY1= [b-d | b+d] 
+        
+        .ifeqs  "\scaled", "TRUE"
+            VHSUB   dX0,dY0,dY1
+            SUBS    size,size,#2
+            VHADD   dX1,dY0,dY1
+        .else
+            VSUB   dX0,dY0,dY1
+            SUBS    size,size,#2
+            VADD   dX1,dY0,dY1
+        .endif
+                    
+        SUB     pSrc,pSrc,step
+        
+        VST1    dX0[0],[pOut1]!
+        ADD     pTwiddleTmp,pTwiddle,#8                @// W^2
+        VST1    dX1[1],[pOut1]!
+        ADD     argTwiddle1,pTwiddle,twStep            @// W^1 
+        @// Flags are still those of SUBS size,size,#2 above (nothing since then sets them):
+        @// negative -> skip to the end, zero -> only the last element remains.
+        BLT     decrementScale\name
+        BEQ     lastElement\name
+        
+                        
+        @// Z(k) = 1/2[F(k) +  F'(N/2-k)] +j*W^(-k) [F(k) -  F'(N/2-k)]
+        @// Note: W^k is stored as negative values in the table and also need to conjugate the values from the table
+        @// Process 4 elements at a time. E.g: Z(1),Z(2) and Z(N/2-2),Z(N/2-1) since both of them
+        @// require F(1),F(2) and F(N/2-2),F(N/2-1)
+        
+        
+        SUB     step,step,#24
+evenOddButterflyLoop\name :     
+        
+        
+        VLD1    dW0r,[argTwiddle1],step1
+        VLD1    dW1r,[argTwiddle1]!
+        
+        VLD2    {dX0r,dX0i},[pSrc],step
+        SUB     argTwiddle1,argTwiddle1,step1
+        VLD2    {dX1r,dX1i},[pSrc]!
+        
+        SUB     step1,step1,#8                          @// (N/4-2)*8 bytes
+        VLD1    dW0i,[pTwiddleTmp],step1
+        VLD1    dW1i,[pTwiddleTmp]!
+        SUB     pSrc,pSrc,step
+        
+        SUB     pTwiddleTmp,pTwiddleTmp,step1
+        VREV64  dX1r,dX1r
+        VREV64  dX1i,dX1i
+        SUBS    size,size,#4
+        
+                        
+        VHSUB    dT2,dX0r,dX1r                            @// a-c
+        VHADD    dT3,dX0i,dX1i                            @// b+d
+        SUB     step1,step1,#8
+        VHADD    dT0,dX0r,dX1r                           @// a+c
+        VHSUB    dT1,dX0i,dX1i                            @// b-d
+        
+        VZIP    dW1r,dW1i
+        VZIP    dW0r,dW0i
+        
+                                
+        VMULL   qT0,dW1r,dT2
+        VMLSL   qT0,dW1i,dT3
+        VMULL   qT1,dW1r,dT3
+        VMLAL   qT1,dW1i,dT2
+                    
+        VMULL   qT2,dW0r,dT2
+        VMLAL   qT2,dW0i,dT3
+        VMULL   qT3,dW0r,dT3
+        VMLSL   qT3,dW0i,dT2
+        @// Narrow the 64-bit products back to 32 bits with a rounding right shift by 31
+        @// (presumably the twiddles are Q31 - confirm against the twiddle table).
+        VRSHRN  dX1r,qT0,#31
+        VRSHRN  dX1i,qT1,#31
+        
+        .ifeqs  "\scaled", "TRUE"
+            VHADD    dY1r,dT0,dX1i                           @// F(N/2 -1)
+            VHSUB    dY1i,dX1r,dT1
+        .else
+            VADD    dY1r,dT0,dX1i                           @// F(N/2 -1)
+            VSUB    dY1i,dX1r,dT1
+
+        .endif
+        
+        
+        VREV64  dY1r,dY1r
+        VREV64  dY1i,dY1i
+        
+                            
+        VRSHRN  dX0r,qT2,#31
+        VRSHRN  dX0i,qT3,#31
+        
+        .ifeqs  "\scaled", "TRUE"
+            VHADD    dY0r,dT0,dX0i                           @// F(1)
+            VHSUB    dY0i,dT1,dX0r
+        .else
+            VADD    dY0r,dT0,dX0i                           @// F(1)
+            VSUB    dY0i,dT1,dX0r
+        .endif
+        
+        
+        VST2    {dY0r,dY0i},[pOut1],step
+        VST2    {dY1r,dY1i},[pOut1]!
+        SUB     pOut1,pOut1,step
+        SUB     step,step,#32                            @// (N/2-4)*8 bytes
+        @// Loop while the SUBS size,size,#4 issued earlier in the body left size > 0.
+        
+        BGT     evenOddButterflyLoop\name
+        
+        
+        SUB     pSrc,pSrc,#8                @// set both the ptrs to the last element
+        SUB     pOut1,pOut1,#8
+        
+        @// Last element can be expanded as follows
+        @// 1/2[Z(k) + Z'(k)] - j w^-k [Z(k) - Z'(k)] (since W^k is stored as -ve)
+        @// 1/2[(a+jb) + (a-jb)] - j w^-k [(a+jb) - (a-jb)]
+        @// 1/2[2a+j0] - j (c-jd) [0+j2b]
+        @// (a+bc, -bd)
+        @// Since (c,d) = (0,1) for the last element, result is just (a,-b)
+        
+lastElement\name :      
+        VLD1    dX0r,[pSrc]
+        
+        .ifeqs  "\scaled", "TRUE"
+            VSHR    dX0r,dX0r,#1
+        .endif
+        
+        VST1    dX0r[0],[pOut1]!
+        VNEG    dX0r,dX0r
+        VST1    dX0r[1],[pOut1]
+        
+        
+
+decrementScale\name :          
+        
+        .ifeqs  "\scaled", "TRUE"
+            SUB scale,scale,#1
+        .endif
+        
+        .endm
+        
+        M_START armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe,r4
+        @// Unscaled variant: scaled is FALSE, so scale is left untouched; labels get the Inv suffix.
+            FFTSTAGE "FALSE","TRUE",Inv
+        M_END
+        
+        M_START armSP_FFTInv_CCSToR_S32_Sfs_preTwiddleRadix2_unsafe,r4
+        @// Scaled variant: halving adds/subs are used and scale is decremented; labels get InvSfs.
+            FFTSTAGE "TRUE","TRUE",InvSfs
+        M_END
+
+        
+        .end
--- a/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix2_fs_unsafe_s.S
+++ b/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix2_fs_unsafe_s.S
@ -0,0 +1,134 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This is a modification of armSP_FFT_CToC_SC32_Radix2_fs_unsafe_s.S
+@//  to support float instead of SC32.
+@//
+
+@//
+@// Description:
+@// Compute the first stage of a Radix 2 DIT in-order out-of-place FFT
+@// stage for a N point complex signal.
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+
+
+@// Import symbols required from other files
+@// (For example tables)
+
+
+
+
+@// Set debugging level
+@//DEBUG_ON    SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+
+
+
+@// Guarding implementation by the processor name
+
+
+@//Input Registers
+
+#define pSrc            r0
+#define pDst            r2
+#define pTwiddle        r1
+#define pPingPongBuf    r5
+#define subFFTNum       r6
+#define subFFTSize      r7
+
+
+@//Output Registers
+
+
+@//Local Scratch Registers
+@// Paired names share a register: pointStep/outPointStep (r3), grpSize/setCount (r4), step/dstStep (r8).
+#define pointStep       r3
+#define outPointStep    r3
+#define grpSize         r4
+#define setCount        r4
+#define step            r8
+#define dstStep         r8
+
+@// Neon Registers
+
+#define dX0     D0.F32
+#define dX1     D1.F32
+#define dY0     D2.F32
+#define dY1     D3.F32
+
+
+        .MACRO FFTSTAGE scaled, inverse, name
+        @// First radix-2 stage: one add/sub butterfly per loop pass, no twiddle multiply.
+        @// Define stack arguments
+        @// Neither scaled nor inverse is tested here; name only makes the loop label unique.
+
+        @// Update subFFTNum (r6) and subFFTSize (r7) for the next stage
+
+
+        MOV        subFFTSize,#2
+        LSR        grpSize,subFFTNum,#1
+        MOV        subFFTNum,grpSize
+
+
+        @// pT0+1 increments pT0 by 8 bytes
+        @// pointStep = grpSize * 8 bytes (distance between the two butterfly inputs)
+        @// Note: outPointStep = pointStep for firststage
+        @// Note: setCount shares r4 with grpSize, so the loop counter starts at grpSize
+
+        MOV        pointStep,grpSize,LSL #3
+        RSB        step,pointStep,#8
+
+
+        @// Loop on the sets for grp zero
+
+grpZeroSetLoop\name :
+
+        VLD1    dX0,[pSrc],pointStep
+        VLD1    dX1,[pSrc],step                   @// step = -pointStep + 8
+        SUBS    setCount,setCount,#1
+
+        VADD    dY0,dX0,dX1
+        VSUB    dY1,dX0,dX1
+
+        VST1    dY0,[pDst],outPointStep
+        @// dstStep =  step = -pointStep + 8
+        VST1    dY1,[pDst],dstStep
+
+        BGT     grpZeroSetLoop\name
+
+
+        @// reset pSrc to pDst for the next stage
+        SUB     pSrc,pDst,pointStep                     @// pSrc = pDst - pointStep (pDst advanced 8 bytes per pass)
+        MOV     pDst,pPingPongBuf
+
+        .endm
+
+
+        @// Forward entry point; this FFTSTAGE ignores scaled/inverse, fwd only suffixes the loop label.
+        M_START armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe,r4
+        FFTSTAGE "FALSE","FALSE",fwd
+        M_END
+
+
+        @// Inverse entry point; same instructions as the forward one, only the label suffix (inv) differs.
+        M_START armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace_unsafe,r4
+        FFTSTAGE "FALSE","TRUE",inv
+        M_END
+
+	.end
--- a/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix2_ls_unsafe_s.S
+++ b/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix2_ls_unsafe_s.S
@ -0,0 +1,153 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This is a modification of armSP_FFT_CToC_SC32_Radix2_ls_unsafe_s.S
+@//  to support float instead of SC32.
+@//
+
+@//
+@// Description:
+@// Compute the last stage of a Radix 2 DIT in-order out-of-place FFT
+@// stage for a N point complex signal.
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+
+
+@// Import symbols required from other files
+@// (For example tables)
+
+
+
+
+@// Set debugging level
+@//DEBUG_ON    SETL {TRUE}
+
+
+@// Guarding implementation by the processor name
+
+
+@//Input Registers
+
+#define pSrc            r0
+#define pDst            r2
+#define pTwiddle        r1
+#define subFFTNum       r6
+#define subFFTSize      r7
+
+
+@//Output Registers
+
+
+@//Local Scratch Registers
+
+@// grpCount and pTmp share r4.
+#define outPointStep    r3
+#define grpCount        r4
+#define dstStep         r5
+#define pTmp            r4
+
+@// Neon Registers
+@// qT0/qT1 are D registers (d10, d12) despite the q prefix.
+#define dWr     d0.f32
+#define dWi     d1.f32
+#define dXr0    d2.f32
+#define dXi0    d3.f32
+#define dXr1    d4.f32
+#define dXi1    d5.f32
+#define dYr0    d6.f32
+#define dYi0    d7.f32
+#define dYr1    d8.f32
+#define dYi1    d9.f32
+#define qT0     d10.f32
+#define qT1     d12.f32
+
+        .MACRO FFTSTAGE scaled, inverse, name
+        @// Last radix-2 stage: each pass loads two twiddles and four points (two butterflies).
+        @// scaled is never tested; inverse = TRUE swaps the VMLA/VMLS pairing in the twiddle multiply.
+        MOV     outPointStep,subFFTSize,LSL #3
+        @// Update grpCount and grpSize rightaway
+
+        MOV     subFFTNum,#1                            @//after the last stage
+        LSL     grpCount,subFFTSize,#1
+
+        @// update subFFTSize for the next stage
+        MOV     subFFTSize,grpCount
+
+        RSB      dstStep,outPointStep,#16
+
+
+        @// Loop on 2 grps at a time for the last stage
+
+radix2lsGrpLoop\name :
+        @ dWr = [pTwiddle[0].Re, pTwiddle[1].Re]
+        @ dWi = [pTwiddle[0].Im, pTwiddle[1].Im]
+        VLD2    {dWr,dWi},[pTwiddle :64]!
+
+        @ dXr0 = [pSrc[0].Re, pSrc[2].Re]
+        @ dXi0 = [pSrc[0].Im, pSrc[2].Im]
+        @ dXr1 = [pSrc[1].Re, pSrc[3].Re]
+        @ dXi1 = [pSrc[1].Im, pSrc[3].Im]
+        VLD4    {dXr0,dXi0,dXr1,dXi1},[pSrc :128]!
+        SUBS    grpCount,grpCount,#4                   @// grpCount is multiplied by 2
+
+        .ifeqs  "\inverse", "TRUE"
+            VMUL   qT0,dWr,dXr1
+            VMLA   qT0,dWi,dXi1                       @// real part
+            VMUL   qT1,dWr,dXi1
+            VMLS   qT1,dWi,dXr1                       @// imag part
+
+        .else
+
+            VMUL   qT0,dWr,dXr1
+            VMLS   qT0,dWi,dXi1                       @// real part
+            VMUL   qT1,dWr,dXi1
+            VMLA   qT1,dWi,dXr1                       @// imag part
+
+        .endif
+
+        VSUB    dYr0,dXr0,qT0
+        VSUB    dYi0,dXi0,qT1
+        VADD    dYr1,dXr0,qT0
+        VADD    dYi1,dXi0,qT1
+
+        VST2    {dYr0,dYi0},[pDst],outPointStep
+        VST2    {dYr1,dYi1},[pDst],dstStep                  @// dstStep =  step = -outPointStep + 16
+        @// BGT tests the flags left by SUBS grpCount,grpCount,#4 above; nothing in between sets them.
+        BGT     radix2lsGrpLoop\name
+
+
+        @// Reset and Swap pSrc and pDst for the next stage
+        MOV     pTmp,pDst
+        SUB     pDst,pSrc,outPointStep,LSL #1       @// pDst -= 4*size; pSrc -= 8*size bytes
+        SUB     pSrc,pTmp,outPointStep
+
+        @// Reset pTwiddle for the next stage
+        SUB     pTwiddle,pTwiddle,outPointStep      @// pTwiddle -= 4*size bytes
+
+        .endm
+
+
+        @// Forward entry point. NOTE(review): only this M_START passes a third (empty) argument - confirm intent.
+        M_START armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace_unsafe,r4,""
+        FFTSTAGE "FALSE","FALSE",fwd
+        M_END
+
+
+        @// Inverse entry point: inverse is TRUE, which selects the VMLA-real / VMLS-imag twiddle multiply.
+        M_START armSP_FFTInv_CToC_FC32_Radix2_ls_OutOfPlace_unsafe,r4
+        FFTSTAGE "FALSE","TRUE",inv
+        M_END
+
+	.end
--- a/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix2_unsafe_s.S
+++ b/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix2_unsafe_s.S
@ -0,0 +1,191 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This is a modification of armSP_FFT_CToC_SC32_Radix2_unsafe_s.s
+@//  to support float instead of SC32.
+@//
+
+@// Description:
+@// Compute a Radix 2 DIT in-order out-of-place FFT stage for an N point
+@// complex signal.  This handles the general stage, not the first or last
+@// stage.
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+
+
+@// Import symbols required from other files
+@// (For example tables)
+
+
+
+@// Set debugging level
+@//DEBUG_ON    SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+
+
+
+
+@// Guarding implementation by the processor name
+
+
+@//Input Registers
+
+#define pSrc            r0
+#define pDst            r2
+#define pTwiddle        r1
+#define subFFTNum       r6
+#define subFFTSize      r7
+
+
+@//Output Registers
+
+
+@//Local Scratch Registers
+@// pTable and pTmp share r9.
+#define outPointStep    r3
+#define pointStep       r4
+#define grpCount        r5
+#define setCount        r8
+@//const           RN  9
+#define step            r10
+#define dstStep         r11
+#define pTable          r9
+#define pTmp            r9
+
+@// Neon Registers
+@// qT0/qT1 are D registers (D10, D11) despite the q prefix.
+#define dW      D0.F32
+#define dX0     D2.F32
+#define dX1     D3.F32
+#define dX2     D4.F32
+#define dX3     D5.F32
+#define dY0     D6.F32
+#define dY1     D7.F32
+#define dY2     D8.F32
+#define dY3     D9.F32
+#define qT0     D10.F32
+#define qT1     D11.F32
+
+
+        .MACRO FFTSTAGE scaled, inverse, name
+        @// General (middle) radix-2 stage: outer loop over groups, inner loop over sets.
+        @// Define stack arguments
+        @// scaled is never tested; inverse = TRUE swaps the VMLA/VMLS pairing in the twiddle multiply.
+
+        @// Update grpCount and grpSize rightaway inorder to reuse pGrpCount
+        @// and pGrpSize regs
+
+        LSR     subFFTNum,subFFTNum,#1                      @//grpSize
+        LSL     grpCount,subFFTSize,#1
+
+
+        @// pT0+1 increments pT0 by 8 bytes
+        @// pT0+pointStep = increment of 8*pointStep bytes = 4*grpSize bytes
+        MOV     pointStep,subFFTNum,LSL #2
+
+        @// update subFFTSize for the next stage
+        MOV     subFFTSize,grpCount
+
+        @// pOut0+1 increments pOut0 by 8 bytes
+        @// pOut0+outPointStep == increment of 8*outPointStep bytes =
+        @//    4*size bytes
+        SMULBB  outPointStep,grpCount,pointStep
+        LSL     pointStep,pointStep,#1
+        @// NOTE(review): SMULBB multiplies only the signed low halfwords of its operands,
+        @// so the product above assumes grpCount and pointStep each fit in 16 bits.
+        RSB      step,pointStep,#16
+        RSB      dstStep,outPointStep,#16
+
+        @// Loop on the groups
+
+radix2GrpLoop\name :
+        MOV      setCount,pointStep,LSR #3
+        VLD1     dW,[pTwiddle],pointStep                @//[wi | wr]
+
+
+        @// Loop on the sets
+
+
+radix2SetLoop\name :
+        @// Each pass consumes two sets (setCount drops by 2); pSrc nets +16 bytes per pass.
+
+        @// point0: dX0-real part dX1-img part
+        VLD2    {dX0,dX1},[pSrc],pointStep
+        @// point1: dX2-real part dX3-img part
+        VLD2    {dX2,dX3},[pSrc],step
+
+        SUBS    setCount,setCount,#2
+
+        .ifeqs  "\inverse", "TRUE"
+            VMUL   qT0,dX2,dW[0]
+            VMLA   qT0,dX3,dW[1]                       @// real part
+            VMUL   qT1,dX3,dW[0]
+            VMLS   qT1,dX2,dW[1]                       @// imag part
+
+        .else
+
+            VMUL   qT0,dX2,dW[0]
+            VMLS   qT0,dX3,dW[1]                       @// real part
+            VMUL   qT1,dX3,dW[0]
+            VMLA   qT1,dX2,dW[1]                       @// imag part
+
+        .endif
+
+        VSUB    dY0,dX0,qT0
+        VSUB    dY1,dX1,qT1
+        VADD    dY2,dX0,qT0
+        VADD    dY3,dX1,qT1
+
+        VST2    {dY0,dY1},[pDst],outPointStep
+        @// dstStep = -outPointStep + 16
+        VST2    {dY2,dY3},[pDst],dstStep
+
+        BGT     radix2SetLoop\name
+        @// Skip the second half of this group (already consumed as point1) before the next group.
+        SUBS    grpCount,grpCount,#2
+        ADD     pSrc,pSrc,pointStep
+        BGT     radix2GrpLoop\name
+
+
+        @// Reset and Swap pSrc and pDst for the next stage
+        MOV     pTmp,pDst
+        @// pDst -= 4*size; pSrc -= 8*size bytes
+        SUB     pDst,pSrc,outPointStep,LSL #1
+        SUB     pSrc,pTmp,outPointStep
+
+        @// Reset pTwiddle for the next stage
+        @// pTwiddle -= 4*size bytes
+        SUB     pTwiddle,pTwiddle,outPointStep
+
+
+        .endm
+
+
+        @// Forward entry point: inverse is FALSE, which selects the VMLS-real / VMLA-imag twiddle multiply.
+        M_START armSP_FFTFwd_CToC_FC32_Radix2_OutOfPlace_unsafe,r4
+        FFTSTAGE "FALSE","FALSE",FWD
+        M_END
+
+
+        @// Inverse entry point: inverse is TRUE, which selects the VMLA-real / VMLS-imag twiddle multiply.
+        M_START armSP_FFTInv_CToC_FC32_Radix2_OutOfPlace_unsafe,r4
+        FFTSTAGE "FALSE","TRUE",INV
+        M_END
+
+
+        .end
--- a/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix4_fs_unsafe_s.S
+++ b/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix4_fs_unsafe_s.S
@ -0,0 +1,251 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//
+@//  This is a modification of armSP_FFT_CToC_SC32_Radix4_fs_unsafe_s.s
+@//  to support float instead of SC32.
+@//
+
+@//
+@// Description:
+@// Compute a first stage Radix 4 FFT stage for a N point complex signal
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+
+@// Import symbols required from other files
+@// (For example tables)
+
+
+
+
+@// Set debugging level
+@//DEBUG_ON    SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+
+
+
+@// Guarding implementation by the processor name
+
+
+@//Input Registers
+
+#define pSrc            r0
+#define pDst            r2
+#define pTwiddle        r1
+#define pPingPongBuf    r5
+#define subFFTNum       r6
+#define subFFTSize      r7
+
+
+@//Output Registers
+
+
+@//Local Scratch Registers
+@// pointStep and outPointStep also share one register (r4).
+#define grpSize         r3
+@// Reuse grpSize as setCount
+#define setCount        r3
+#define pointStep       r4
+#define outPointStep    r4
+#define setStep         r8
+#define step1           r9
+#define step3           r10
+
+@// Neon Registers
+@// qXn, qYn, qZn are the Q-register views of the dXrn:dXin, dYrn:dYin, dZrn:dZin pairs.
+#define dXr0    D0.F32
+#define dXi0    D1.F32
+#define dXr1    D2.F32
+#define dXi1    D3.F32
+#define dXr2    D4.F32
+#define dXi2    D5.F32
+#define dXr3    D6.F32
+#define dXi3    D7.F32
+#define dYr0    D8.F32
+#define dYi0    D9.F32
+#define dYr1    D10.F32
+#define dYi1    D11.F32
+#define dYr2    D12.F32
+#define dYi2    D13.F32
+#define dYr3    D14.F32
+#define dYi3    D15.F32
+#define qX0     Q0.F32
+#define qX1     Q1.F32
+#define qX2     Q2.F32
+#define qX3     Q3.F32
+#define qY0     Q4.F32
+#define qY1     Q5.F32
+#define qY2     Q6.F32
+#define qY3     Q7.F32
+#define dZr0    D16.F32
+#define dZi0    D17.F32
+#define dZr1    D18.F32
+#define dZi1    D19.F32
+#define dZr2    D20.F32
+#define dZi2    D21.F32
+#define dZr3    D22.F32
+#define dZi3    D23.F32
+#define qZ0     Q8.F32
+#define qZ1     Q9.F32
+#define qZ2     Q10.F32
+#define qZ3     Q11.F32
+
+
+        .MACRO FFTSTAGE scaled, inverse, name
+        @// First radix-4 stage: two sets per pass; loads for the next pass are interleaved with this pass.
+        @// Define stack arguments
+        @// scaled is never tested; inverse swaps which +/-j combination is stored 2nd and 4th.
+        @// pT0+1 increments pT0 by 8 bytes
+        @// pointStep = subFFTNum * 2 bytes = grpSize * 8 bytes (grpSize = subFFTNum/4)
+        @// Note: outPointStep = pointStep for firststage
+
+        MOV     pointStep,subFFTNum,LSL #1
+
+
+        @// Update pSubFFTSize and pSubFFTNum regs
+        VLD2    {dXr0,dXi0},[pSrc :128],pointStep          @//  data[0]
+        @// subFFTSize is 4 once this first radix-4 stage is done
+        MOV     subFFTSize,#4
+
+        @// Note: setCount = subFFTNum/4 (reuse the grpSize reg for setCount)
+        LSR     grpSize,subFFTNum,#2
+        VLD2    {dXr1,dXi1},[pSrc :128],pointStep          @//  data[1]
+        MOV     subFFTNum,grpSize
+
+
+        @// Calculate the step of input data for the next set
+        @//MOV     setStep,pointStep,LSL #1
+        MOV     setStep,grpSize,LSL #4
+        VLD2    {dXr2,dXi2},[pSrc :128],pointStep          @//  data[2]
+        @// setStep = 3*pointStep
+        ADD     setStep,setStep,pointStep
+        @// setStep = - 3*pointStep+16
+        RSB     setStep,setStep,#16
+
+        @//  data[3] & update pSrc for the next set
+        VLD2    {dXr3,dXi3},[pSrc :128],setStep
+        @// step1 = 2*pointStep
+        MOV     step1,pointStep,LSL #1
+
+        VADD    qY0,qX0,qX2
+
+        @// step3 = -pointStep
+        RSB     step3,pointStep,#0
+
+        @// grp = 0 a special case since all the twiddle factors are 1
+        @// Loop on the sets : 2 sets at a time
+
+radix4fsGrpZeroSetLoop\name :
+
+
+
+        @// Decrement setcount
+        SUBS    setCount,setCount,#2
+        @// These flags feed both the BEQ skip below and the BGT at the bottom of the loop.
+
+        @// finish first stage of 4 point FFT
+
+
+        VSUB    qY2,qX0,qX2
+
+        VLD2    {dXr0,dXi0},[pSrc :128],step1          @//  data[0]
+        VADD    qY1,qX1,qX3
+        VLD2    {dXr2,dXi2},[pSrc :128],step3          @//  data[2]
+        VSUB    qY3,qX1,qX3
+
+
+        @// finish second stage of 4 point FFT
+
+        .ifeqs "\inverse", "TRUE"
+
+            VLD2    {dXr1,dXi1},[pSrc :128],step1          @//  data[1]
+            VADD    qZ0,qY0,qY1
+
+            @//  data[3] & update pSrc for the next set, but not if it's the
+            @//  last iteration so that we don't read past the end of the 
+            @//  input array.
+            BEQ     radix4SkipLastUpdateInv\name
+            VLD2    {dXr3,dXi3},[pSrc :128],setStep
+radix4SkipLastUpdateInv\name:
+            VSUB    dZr3,dYr2,dYi3
+
+            VST2    {dZr0,dZi0},[pDst :128],outPointStep
+            VADD    dZi3,dYi2,dYr3
+
+            VSUB    qZ1,qY0,qY1
+            VST2    {dZr3,dZi3},[pDst :128],outPointStep
+
+            VADD    dZr2,dYr2,dYi3
+            VST2    {dZr1,dZi1},[pDst :128],outPointStep
+            VSUB    dZi2,dYi2,dYr3
+
+            VADD    qY0,qX0,qX2                     @// u0 for next iteration
+            VST2    {dZr2,dZi2},[pDst :128],setStep
+
+
+        .else
+
+            VLD2    {dXr1,dXi1},[pSrc :128],step1          @//  data[1]
+            VADD    qZ0,qY0,qY1
+
+            @//  data[3] & update pSrc for the next set, but not if it's the
+            @//  last iteration so that we don't read past the end of the 
+            @//  input array.
+            BEQ     radix4SkipLastUpdateFwd\name
+            VLD2    {dXr3,dXi3},[pSrc :128],setStep
+radix4SkipLastUpdateFwd\name:
+            VADD    dZr2,dYr2,dYi3
+
+            VST2    {dZr0,dZi0},[pDst :128],outPointStep
+            VSUB    dZi2,dYi2,dYr3
+
+            VSUB    qZ1,qY0,qY1
+            VST2    {dZr2,dZi2},[pDst :128],outPointStep
+
+            VSUB    dZr3,dYr2,dYi3
+            VST2    {dZr1,dZi1},[pDst :128],outPointStep
+            VADD    dZi3,dYi2,dYr3
+
+            VADD    qY0,qX0,qX2                     @// u0 for next iteration
+            VST2    {dZr3,dZi3},[pDst :128],setStep
+
+        .endif
+
+        BGT     radix4fsGrpZeroSetLoop\name
+
+        @// reset pSrc to pDst for the next stage
+        SUB     pSrc,pDst,pointStep                     @// pSrc = pDst - pointStep (pDst advanced 16 bytes per pass)
+        MOV     pDst,pPingPongBuf
+
+
+        .endm
+
+
+        @// Forward entry point: inverse is FALSE, so the .else branch of the macro body is assembled.
+        M_START armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe,r4
+        FFTSTAGE "FALSE","FALSE",fwd
+        M_END
+
+
+        @// Inverse entry point: inverse is TRUE, so the first (.ifeqs) branch of the macro body is assembled.
+        M_START armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace_unsafe,r4
+        FFTSTAGE "FALSE","TRUE",inv
+        M_END
+
+
+        .end
--- a/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix4_ls_unsafe_s.S
+++ b/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix4_ls_unsafe_s.S
@ -0,0 +1,339 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This is a modification of armSP_FFT_CToC_SC32_Radix4_ls_unsafe_s.s
+@//  to support float instead of SC32.
+@//
+
+@//
+@// Description:
+@// Compute a Radix 4 FFT stage for a N point complex signal
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+
+@// Import symbols required from other files
+@// (For example tables)
+
+
+
+
+@// Set debugging level
+@//DEBUG_ON    SETL {TRUE}
+
+
+@// Guarding implementation by the processor name
+
+
+@// Import symbols required from other files
+@// (For example tables)
+    @//IMPORT  armAAC_constTable
+
+@//Input Registers
+@// Core registers the FFTSTAGE macro below reads before writing them.
+
+#define pSrc            r0
+#define pDst            r2
+#define pTwiddle        r1
+#define subFFTNum       r6
+#define subFFTSize      r7
+
+
+
+@//Output Registers
+@// None are declared separately: the macro below rewrites pSrc, pDst,
+@// pTwiddle, subFFTNum and subFFTSize in place before it ends.
+
+
+@//Local Scratch Registers
+@// pTmp shares r4 with grpCount.  The macro only writes pTmp after the group
+@// loop has finished counting grpCount down.
+
+#define outPointStep    r3
+#define grpCount        r4
+#define dstStep         r5
+#define grpTwStep       r8
+#define stepTwiddle     r9
+#define twStep          r10
+#define pTmp            r4
+#define step16          r11
+#define step24          r12
+
+
+@// Neon Registers
+@// D0-D7 carry two sets of names.  The VLD4 loads and the VUZP shuffles use
+@// the dButterfly* names; the twiddle multiplies that follow read the same
+@// registers through the dXr*/dXi* names.
+
+#define dButterfly1Real02       D0.F32
+#define dButterfly1Imag02       D1.F32
+#define dButterfly1Real13       D2.F32
+#define dButterfly1Imag13       D3.F32
+#define dButterfly2Real02       D4.F32
+#define dButterfly2Imag02       D5.F32
+#define dButterfly2Real13       D6.F32
+#define dButterfly2Imag13       D7.F32
+#define dXr0                    D0.F32
+#define dXi0                    D1.F32
+#define dXr1                    D2.F32
+#define dXi1                    D3.F32
+#define dXr2                    D4.F32
+#define dXi2                    D5.F32
+#define dXr3                    D6.F32
+#define dXi3                    D7.F32
+
+@// Results of the first butterfly stage (D16-D23).
+#define dYr0                    D16.F32
+#define dYi0                    D17.F32
+#define dYr1                    D18.F32
+#define dYi1                    D19.F32
+#define dYr2                    D20.F32
+#define dYi2                    D21.F32
+#define dYr3                    D22.F32
+#define dYi3                    D23.F32
+
+@// Twiddle factors, real and imaginary parts in separate D registers.
+#define dW1r                    D8.F32
+#define dW1i                    D9.F32
+#define dW2r                    D10.F32
+#define dW2i                    D11.F32
+#define dW3r                    D12.F32
+#define dW3i                    D13.F32
+@// NOTE(review): qT0-qT5 are spelled like Q-register aliases but expand to
+@// D registers, and nothing in this file references them.
+#define qT0                     d14.f32
+#define qT1                     d16.F32
+#define qT2                     d18.F32
+#define qT3                     d20.f32
+#define qT4                     d22.f32
+#define qT5                     d24.f32
+
+@// Twiddled inputs and final outputs.  dZr0/dZi0 live in D14/D15, the other
+@// three pairs in D26-D31.
+#define dZr0                    D14.F32
+#define dZi0                    D15.F32
+#define dZr1                    D26.F32
+#define dZi1                    D27.F32
+#define dZr2                    D28.F32
+#define dZi2                    D29.F32
+#define dZr3                    D30.F32
+#define dZi3                    D31.F32
+
+@// Quad views.  Qn overlays D(2n) and D(2n+1), so qX0 is {dXr0,dXi0}, each
+@// qYk is {dYrk,dYik} and each qZk is {dZrk,dZik}.
+#define qX0                     Q0.F32
+#define qY0                     Q8.F32
+#define qY1                     Q9.F32
+#define qY2                     Q10.F32
+#define qY3                     Q11.F32
+#define qZ0                     Q7.F32
+#define qZ1                     Q13.F32
+#define qZ2                     Q14.F32
+#define qZ3                     Q15.F32
+
+
+
+        @// FFTSTAGE scaled, inverse, name
+        @//   Body shared by the forward and the inverse radix-4 ls routines that
+        @//   are instantiated at the bottom of this file.
+        @//   scaled  : accepted but never referenced in this body.
+        @//   inverse : when it compares equal to TRUE the twiddle multiplies use
+        @//             VMLA for the real part and VMLS for the imaginary part and
+        @//             the inverse output arrangement is assembled; otherwise
+        @//             the two signs and the output arrangement are swapped.
+        @//   name    : pasted onto every local label so that the two expansions
+        @//             do not define the same label twice.
+        @//   Register effects visible in this body: subFFTNum is set to 1,
+        @//   subFFTSize is multiplied by 4, and pSrc, pDst and pTwiddle are
+        @//   rewritten at the end for the code that follows the expansion.
+        .MACRO FFTSTAGE scaled, inverse , name
+
+        @// Define stack arguments
+
+
+        @// pOut0+1 increments pOut0 by 8 bytes
+        @// pOut0+outPointStep == increment of 8*outPointStep bytes
+        MOV     outPointStep,subFFTSize,LSL #3
+
+        @// Update grpCount and grpSize rightaway
+
+        VLD2    {dW1r,dW1i},[pTwiddle :128]             @// [wi|wr]
+        MOV     step16,#16
+        LSL     grpCount,subFFTSize,#2
+
+        VLD1    dW2r,[pTwiddle :64]                     @// [wi|wr]
+        MOV     subFFTNum,#1                            @//after the last stage
+
+        VLD1    dW3r,[pTwiddle :64],step16              @// [wi|wr]
+        MOV     stepTwiddle,#0
+
+        VLD1    dW2i,[pTwiddle :64]!                    @// [wi|wr]
+        SUB     grpTwStep,stepTwiddle,#8                @// grpTwStep = -8 to start with
+
+        @// update subFFTSize for the next stage
+        MOV     subFFTSize,grpCount
+        VLD1    dW3i,[pTwiddle :64],grpTwStep           @// [wi|wr]
+        MOV     dstStep,outPointStep,LSL #1
+
+        @// AC.r AC.i BD.r BD.i
+        VLD4     {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc :256]!
+        ADD     dstStep,dstStep,outPointStep            @// dstStep = 3*outPointStep
+        RSB     dstStep,dstStep,#16                     @// dstStep = - 3*outPointStep+16
+        MOV     step24,#24
+
+        @// AC.r AC.i BD.r BD.i
+        VLD4     {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc :256]!
+
+
+        @// Process two groups at a time
+
+radix4lsGrpLoop\name :
+
+        VZIP    dW2r,dW2i
+        ADD     stepTwiddle,stepTwiddle,#16
+        VZIP    dW3r,dW3i
+        ADD     grpTwStep,stepTwiddle,#4
+        VUZP     dButterfly1Real13, dButterfly2Real13   @// B.r D.r
+        SUB     twStep,stepTwiddle,#16                  @// -16+stepTwiddle
+        VUZP     dButterfly1Imag13, dButterfly2Imag13   @// B.i D.i
+        MOV     grpTwStep,grpTwStep,LSL #1
+        VUZP     dButterfly1Real02, dButterfly2Real02   @// A.r C.r
+        RSB     grpTwStep,grpTwStep,#0                  @// -8-2*stepTwiddle
+
+
+        VUZP     dButterfly1Imag02, dButterfly2Imag02   @// A.i C.i
+
+
+        @// grpCount is multiplied by 4
+        @// The flags written by this SUBS stay live for the rest of the
+        @// iteration: the addeq/beq pair ahead of the next loads and the BGT
+        @// that closes the loop both test them, and no instruction in
+        @// between writes the flags.
+        SUBS    grpCount,grpCount,#8
+
+        .ifeqs  "\inverse", "TRUE"
+            VMUL   dZr1,dW1r,dXr1
+            VMLA   dZr1,dW1i,dXi1                       @// real part
+            VMUL   dZi1,dW1r,dXi1
+            VMLS   dZi1,dW1i,dXr1                       @// imag part
+
+        .else
+
+            VMUL   dZr1,dW1r,dXr1
+            VMLS   dZr1,dW1i,dXi1                       @// real part
+            VMUL   dZi1,dW1r,dXi1
+            VMLA   dZi1,dW1i,dXr1                       @// imag part
+
+        .endif
+
+        VLD2    {dW1r,dW1i},[pTwiddle :128],stepTwiddle      @// [wi|wr]
+
+        .ifeqs  "\inverse", "TRUE"
+            VMUL   dZr2,dW2r,dXr2
+            VMLA   dZr2,dW2i,dXi2                       @// real part
+            VMUL   dZi2,dW2r,dXi2
+            VLD1   dW2r,[pTwiddle :64],step16           @// [wi|wr]
+            VMLS   dZi2,dW2i,dXr2                       @// imag part
+
+        .else
+
+            VMUL   dZr2,dW2r,dXr2
+            VMLS   dZr2,dW2i,dXi2                       @// real part
+            VMUL   dZi2,dW2r,dXi2
+            VLD1    dW2r,[pTwiddle :64],step16          @// [wi|wr]
+            VMLA   dZi2,dW2i,dXr2                       @// imag part
+
+        .endif
+
+
+        VLD1    dW2i,[pTwiddle :64],twStep              @// [wi|wr]
+
+        @// move qX0 so as to load for the next iteration
+        VMOV     qZ0,qX0
+
+        .ifeqs  "\inverse", "TRUE"
+            VMUL   dZr3,dW3r,dXr3
+            VMLA   dZr3,dW3i,dXi3                       @// real part
+            VMUL   dZi3,dW3r,dXi3
+            VLD1    dW3r,[pTwiddle :64],step24
+            VMLS   dZi3,dW3i,dXr3                       @// imag part
+
+        .else
+
+            VMUL   dZr3,dW3r,dXr3
+            VMLS   dZr3,dW3i,dXi3                       @// real part
+            VMUL   dZi3,dW3r,dXi3
+            VLD1    dW3r,[pTwiddle :64],step24
+            VMLA   dZi3,dW3i,dXr3                       @// imag part
+
+        .endif
+
+        VLD1    dW3i,[pTwiddle :64],grpTwStep           @// [wi|wr]
+
+        @// Don't do the load on the last iteration so we don't read past the end
+        @// of pSrc.
+        @// The addeq still advances pSrc by the 64 bytes that the two skipped
+        @// VLD4 loads (32 bytes each, with writeback) would have consumed, so
+        @// the fix-up after the loop is the same whichever path ran last.
+        addeq   pSrc, pSrc, #64
+        beq     radix4lsSkipRead\name
+        @// AC.r AC.i BD.r BD.i
+        VLD4     {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc :256]!
+
+        @// AC.r AC.i BD.r BD.i
+        VLD4     {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc :256]!
+radix4lsSkipRead\name:
+
+        @// finish first stage of 4 point FFT
+
+        VADD    qY0,qZ0,qZ2
+        VSUB    qY2,qZ0,qZ2
+        VADD    qY1,qZ1,qZ3
+        VSUB    qY3,qZ1,qZ3
+
+
+        @// finish second stage of 4 point FFT
+
+        .ifeqs  "\inverse", "TRUE"
+
+            VSUB    qZ0,qY2,qY1
+
+            VADD    dZr3,dYr0,dYi3
+            VST2    {dZr0,dZi0},[pDst :128],outPointStep
+            VSUB    dZi3,dYi0,dYr3
+
+            VADD    qZ2,qY2,qY1
+            VST2    {dZr3,dZi3},[pDst :128],outPointStep
+
+            VSUB    dZr1,dYr0,dYi3
+            VST2    {dZr2,dZi2},[pDst :128],outPointStep
+            VADD    dZi1,dYi0,dYr3
+
+            @// dstStep = -3*outPointStep + 16: steps back over the three
+            @// outPointStep advances above and on to the next 16-byte slot
+            VST2    {dZr1,dZi1},[pDst :128],dstStep
+
+
+        .else
+
+            VSUB    qZ0,qY2,qY1
+
+            VSUB    dZr1,dYr0,dYi3
+            VST2    {dZr0,dZi0},[pDst :128],outPointStep
+            VADD    dZi1,dYi0,dYr3
+
+            VADD    qZ2,qY2,qY1
+            VST2    {dZr1,dZi1},[pDst :128],outPointStep
+
+            VADD    dZr3,dYr0,dYi3
+            VST2    {dZr2,dZi2},[pDst :128],outPointStep
+            VSUB    dZi3,dYi0,dYr3
+
+            @// dstStep = -3*outPointStep + 16: steps back over the three
+            @// outPointStep advances above and on to the next 16-byte slot
+            VST2    {dZr3,dZi3},[pDst :128],dstStep
+
+
+        .endif
+
+        BGT     radix4lsGrpLoop\name
+
+
+        @// Reset and Swap pSrc and pDst for the next stage
+        MOV     pTmp,pDst
+        @// Extra increment done in final iteration of the loop
+        SUB     pSrc,pSrc,#64
+        @// pDst -= 4*size; pSrc -= 8*size bytes
+        SUB     pDst,pSrc,outPointStep,LSL #2
+        SUB     pSrc,pTmp,outPointStep
+        SUB     pTwiddle,pTwiddle,subFFTSize,LSL #1
+        @// Extra increment done in final iteration of the loop
+        SUB     pTwiddle,pTwiddle,#16
+
+        .endm
+
+
+        @// Forward routine: FFTSTAGE with scaled = FALSE, inverse = FALSE and
+        @// fwd as the local-label suffix.
+        @// NOTE(review): M_START/M_END are not defined in this file (presumably
+        @// armCOMM_s.h); r4 looks like the register-save argument - confirm.
+        M_START armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace_unsafe,r4
+        FFTSTAGE "FALSE","FALSE",fwd
+        M_END
+
+
+        @// Inverse routine: FFTSTAGE with scaled = FALSE, inverse = TRUE and
+        @// inv as the local-label suffix.
+        @// NOTE(review): M_START/M_END are not defined in this file (presumably
+        @// armCOMM_s.h); r4 looks like the register-save argument - confirm.
+        M_START armSP_FFTInv_CToC_FC32_Radix4_ls_OutOfPlace_unsafe,r4
+        FFTSTAGE "FALSE","TRUE",inv
+        M_END
+
+
+        .end
--- a/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix4_unsafe_s.S
+++ b/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix4_unsafe_s.S
@ -0,0 +1,331 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//
+@//  This is a modification of armSP_FFT_CToC_SC32_Radix4_unsafe_s.s
+@//  to support float instead of SC32.
+@//
+
+@//
+@// Description:
+@// Compute a Radix 4 FFT stage for a N point complex signal
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+
+
+@// Import symbols required from other files
+@// (For example tables)
+
+
+
+
+@// Set debugging level
+@//DEBUG_ON    SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+
+
+
+
+@// Guarding implementation by the processor name
+
+
+@// Import symbols required from other files
+@// (For example tables)
+
+
+@//Input Registers
+@// Core registers the FFTSTAGE macro below reads before writing them.
+
+#define pSrc            r0
+#define pDst            r2
+#define pTwiddle        r1
+#define subFFTNum       r6
+#define subFFTSize      r7
+
+
+
+@//Output Registers
+@// None are declared separately: the macro below updates pSrc, pDst,
+@// pTwiddle, subFFTNum and subFFTSize in place.
+
+
+@//Local Scratch Registers
+@// t1 shares r3 with grpCount; the macro only writes t1 after the group loop
+@// has finished counting grpCount down.
+@// NOTE(review): setCount is r14, the link register, so the return address
+@// has to be saved before the macro body runs - presumably by M_START;
+@// confirm in armCOMM_s.h.
+
+#define grpCount        r3
+#define pointStep       r4
+#define outPointStep    r5
+#define stepTwiddle     r12
+#define setCount        r14
+#define srcStep         r8
+#define setStep         r9
+#define dstStep         r10
+#define twStep          r11
+#define t1              r3
+
+@// Neon Registers
+
+@// One twiddle factor per D register.  The macro multiplies by lane [0] and
+@// accumulates with lane [1]; the load comments there give the layout as
+@// [wi | wr].
+#define dW1     D0.F32
+#define dW2     D1.F32
+#define dW3     D2.F32
+
+@// Inputs (D4-D11) and first-stage butterfly results (D12-D19).
+#define dXr0    D4.F32
+#define dXi0    D5.F32
+#define dXr1    D6.F32
+#define dXi1    D7.F32
+#define dXr2    D8.F32
+#define dXi2    D9.F32
+#define dXr3    D10.F32
+#define dXi3    D11.F32
+#define dYr0    D12.F32
+#define dYi0    D13.F32
+#define dYr1    D14.F32
+#define dYi1    D15.F32
+#define dYr2    D16.F32
+#define dYi2    D17.F32
+#define dYr3    D18.F32
+#define dYi3    D19.F32
+@// NOTE(review): qT0-qT3 are spelled like Q-register aliases but expand to
+@// D registers, and nothing in this file references them.
+#define qT0     d16.f32
+#define qT1     d18.f32
+#define qT2     d12.f32
+#define qT3     d14.f32
+@// Twiddled inputs and final outputs (D20-D27).
+#define dZr0    D20.F32
+#define dZi0    D21.F32
+#define dZr1    D22.F32
+#define dZi1    D23.F32
+#define dZr2    D24.F32
+#define dZi2    D25.F32
+#define dZr3    D26.F32
+#define dZi3    D27.F32
+
+@// Quad views.  Qn overlays D(2n) and D(2n+1), so qX0 is {dXr0,dXi0}, each
+@// qYk is {dYrk,dYik} and each qZk is {dZrk,dZik}.
+#define qY0     Q6.F32
+#define qY1     Q7.F32
+#define qY2     Q8.F32
+#define qY3     Q9.F32
+#define qX0     Q2.F32
+#define qZ0     Q10.F32
+#define qZ1     Q11.F32
+#define qZ2     Q12.F32
+#define qZ3     Q13.F32
+
+        @// FFTSTAGE scaled, inverse, name
+        @//   Body shared by the forward and the inverse radix-4 routines that
+        @//   are instantiated at the bottom of this file.  An outer loop walks
+        @//   the groups (one set of three twiddles each) and an inner loop
+        @//   walks the sets inside a group.
+        @//   scaled  : accepted but never referenced in this body.
+        @//   inverse : when it compares equal to TRUE the twiddle multiplies use
+        @//             VMLA for the real part and VMLS for the imaginary part and
+        @//             the inverse output arrangement is assembled; otherwise
+        @//             the two signs and the output arrangement are swapped.
+        @//   name    : pasted onto every local label so that the two expansions
+        @//             do not define the same label twice.
+        @//   Register effects visible in this body: subFFTNum is divided by 4,
+        @//   subFFTSize is multiplied by 4, and pSrc and pDst are rewritten at
+        @//   the end for the code that follows the expansion.
+        .MACRO FFTSTAGE scaled, inverse , name
+
+        @// Define stack arguments
+
+
+        @// Update grpCount and grpSize rightaway inorder to reuse
+        @// pGrpCount and pGrpSize regs
+
+        LSL     grpCount,subFFTSize,#2
+        LSR     subFFTNum,subFFTNum,#2
+        MOV     subFFTSize,grpCount
+
+        VLD1     dW1,[pTwiddle]                    @//[wi | wr]
+        @// pT0+1 increments pT0 by 8 bytes
+        @// pT0+pointStep = increment of 8*pointStep bytes = 2*grpSize bytes
+        MOV     pointStep,subFFTNum,LSL #1
+
+
+        @// pOut0+1 increments pOut0 by 8 bytes
+        @// pOut0+outPointStep == increment of 8*outPointStep bytes
+        @//   = 2*size bytes
+
+        MOV     stepTwiddle,#0
+        VLD1     dW2,[pTwiddle]                    @//[wi | wr]
+        @// SMULBB multiplies only the bottom 16 bits of each operand, as
+        @// signed halfwords.
+        @// NOTE(review): assumes grpCount and pointStep both fit in 16 bits
+        @// here - confirm against the largest supported FFT size.
+        SMULBB  outPointStep,grpCount,pointStep
+        LSL     pointStep,pointStep,#2             @// 2*grpSize
+
+        VLD1     dW3,[pTwiddle]                    @//[wi | wr]
+        MOV     srcStep,pointStep,LSL #1           @// srcStep = 2*pointStep
+        ADD     setStep,srcStep,pointStep          @// setStep = 3*pointStep
+
+        RSB     setStep,setStep,#0                 @// setStep = - 3*pointStep
+        SUB     srcStep,srcStep,#16                @// srcStep = 2*pointStep-16
+
+        MOV     dstStep,outPointStep,LSL #1
+        ADD     dstStep,dstStep,outPointStep       @// dstStep = 3*outPointStep
+        @// dstStep = - 3*outPointStep+16
+        RSB     dstStep,dstStep,#16
+
+
+
+radix4GrpLoop\name :
+
+        VLD2    {dXr0,dXi0},[pSrc],pointStep       @//  data[0]
+        ADD      stepTwiddle,stepTwiddle,pointStep
+        VLD2    {dXr1,dXi1},[pSrc],pointStep       @//  data[1]
+        @// set pTwiddle to the first point
+        ADD      pTwiddle,pTwiddle,stepTwiddle
+        VLD2    {dXr2,dXi2},[pSrc],pointStep       @//  data[2]
+        MOV      twStep,stepTwiddle,LSL #2
+
+        @//  data[3] & update pSrc for the next set
+        VLD2    {dXr3,dXi3},[pSrc],setStep
+        SUB      twStep,stepTwiddle,twStep         @// twStep = -3*stepTwiddle
+
+        MOV      setCount,pointStep,LSR #3
+        @// set pSrc to data[0] of the next set
+        ADD     pSrc,pSrc,#16
+        @// increment to data[1] of the next set
+        ADD     pSrc,pSrc,pointStep
+
+
+        @// Loop on the sets
+
+radix4SetLoop\name :
+
+
+
+        .ifeqs  "\inverse", "TRUE"
+            VMUL   dZr1,dXr1,dW1[0]
+            VMUL   dZi1,dXi1,dW1[0]
+            VMUL   dZr2,dXr2,dW2[0]
+            VMUL   dZi2,dXi2,dW2[0]
+            VMUL   dZr3,dXr3,dW3[0]
+            VMUL   dZi3,dXi3,dW3[0]
+
+            VMLA   dZr1,dXi1,dW1[1]                @// real part
+            VMLS   dZi1,dXr1,dW1[1]                @// imag part
+
+            @//  data[1] for next iteration
+            VLD2    {dXr1,dXi1},[pSrc],pointStep
+
+            VMLA   dZr2,dXi2,dW2[1]                @// real part
+            VMLS   dZi2,dXr2,dW2[1]                @// imag part
+
+            @//  data[2] for next iteration
+            VLD2    {dXr2,dXi2},[pSrc],pointStep
+
+            VMLA   dZr3,dXi3,dW3[1]                @// real part
+            VMLS   dZi3,dXr3,dW3[1]                @// imag part
+        .else
+            VMUL   dZr1,dXr1,dW1[0]
+            VMUL   dZi1,dXi1,dW1[0]
+            VMUL   dZr2,dXr2,dW2[0]
+            VMUL   dZi2,dXi2,dW2[0]
+            VMUL   dZr3,dXr3,dW3[0]
+            VMUL   dZi3,dXi3,dW3[0]
+
+            VMLS   dZr1,dXi1,dW1[1]                @// real part
+            VMLA   dZi1,dXr1,dW1[1]                @// imag part
+
+            @//  data[1] for next iteration
+            VLD2    {dXr1,dXi1},[pSrc],pointStep
+
+            VMLS   dZr2,dXi2,dW2[1]                @// real part
+            VMLA   dZi2,dXr2,dW2[1]                @// imag part
+
+            @//  data[2] for next iteration
+            VLD2    {dXr2,dXi2},[pSrc],pointStep
+
+            VMLS   dZr3,dXi3,dW3[1]                @// real part
+            VMLA   dZi3,dXr3,dW3[1]                @// imag part
+        .endif
+
+        @//  data[3] & update pSrc to data[0]
+        @// But don't read on the very last iteration because that reads past
+        @// the end of pSrc. The last iteration is grpCount = 4, setCount = 2.
+        cmp     grpCount, #4
+        cmpeq   setCount, #2                      @// Test setCount if grpCount = 4
+        @// These are executed only if both grpCount = 4 and setCount = 2.
+        @// The addeq applies the same setStep adjustment that the skipped
+        @// VLD2 would have written back to pSrc.
+        addeq   pSrc, pSrc, setStep
+        beq     radix4SkipRead\name
+        VLD2    {dXr3,dXi3},[pSrc],setStep
+radix4SkipRead\name:
+        @// The flags written by this SUBS are the ones the BGT at the bottom
+        @// of the set loop tests; nothing in between writes the flags.
+        SUBS    setCount,setCount,#2
+
+        @// finish first stage of 4 point FFT
+        VADD    qY0,qX0,qZ2
+        VSUB    qY2,qX0,qZ2
+
+        @//  data[0] for next iteration
+        VLD2    {dXr0,dXi0},[pSrc :128]!
+        VADD    qY1,qZ1,qZ3
+        VSUB    qY3,qZ1,qZ3
+
+        @// finish second stage of 4 point FFT
+
+        VSUB    qZ0,qY2,qY1
+
+
+        .ifeqs  "\inverse", "TRUE"
+
+            VADD    dZr3,dYr0,dYi3
+            VST2    {dZr0,dZi0},[pDst :128],outPointStep
+            VSUB    dZi3,dYi0,dYr3
+
+            VADD    qZ2,qY2,qY1
+            VST2    {dZr3,dZi3},[pDst :128],outPointStep
+
+            VSUB    dZr1,dYr0,dYi3
+            VST2    {dZr2,dZi2},[pDst :128],outPointStep
+            VADD    dZi1,dYi0,dYr3
+
+            @// dstStep = -3*outPointStep + 16
+            VST2    {dZr1,dZi1},[pDst :128],dstStep
+
+
+        .else
+
+            VSUB    dZr1,dYr0,dYi3
+            VST2    {dZr0,dZi0},[pDst :128],outPointStep
+            VADD    dZi1,dYi0,dYr3
+
+            VADD    qZ2,qY2,qY1
+            VST2    {dZr1,dZi1},[pDst :128],outPointStep
+
+            VADD    dZr3,dYr0,dYi3
+            VST2    {dZr2,dZi2},[pDst :128],outPointStep
+            VSUB    dZi3,dYi0,dYr3
+
+            @// dstStep = -3*outPointStep + 16
+            VST2    {dZr3,dZi3},[pDst :128],dstStep
+
+
+        .endif
+
+        @// increment to data[1] of the next set
+        ADD     pSrc,pSrc,pointStep
+        BGT     radix4SetLoop\name
+
+
+        VLD1     dW1,[pTwiddle :64],stepTwiddle    @//[wi | wr]
+        @// subtract 4 since grpCount multiplied by 4
+        SUBS    grpCount,grpCount,#4
+        VLD1     dW2,[pTwiddle :64],stepTwiddle    @//[wi | wr]
+        @// increment pSrc for the next grp
+        ADD     pSrc,pSrc,srcStep
+        VLD1     dW3,[pTwiddle :64],twStep         @//[wi | wr]
+        BGT     radix4GrpLoop\name
+
+
+        @// Reset and Swap pSrc and pDst for the next stage
+        MOV     t1,pDst
+        @// pDst -= 2*size; pSrc -= 8*size bytes
+        SUB     pDst,pSrc,outPointStep,LSL #2
+        SUB     pSrc,t1,outPointStep
+
+
+        .endm
+
+
+        @// Forward routine: FFTSTAGE with scaled = FALSE, inverse = FALSE and
+        @// FWD as the local-label suffix.
+        @// NOTE(review): M_START/M_END are not defined in this file (presumably
+        @// armCOMM_s.h); r4 looks like the register-save argument - confirm.
+        M_START armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe,r4
+            FFTSTAGE "FALSE","FALSE",FWD
+        M_END
+
+
+        @// Inverse routine: FFTSTAGE with scaled = FALSE, inverse = TRUE and
+        @// INV as the local-label suffix.
+        @// NOTE(review): M_START/M_END are not defined in this file (presumably
+        @// armCOMM_s.h); r4 looks like the register-save argument - confirm.
+        M_START armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace_unsafe,r4
+            FFTSTAGE "FALSE","TRUE",INV
+        M_END
+
+
+        .end
--- a/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix8_fs_unsafe_s.S
+++ b/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_FC32_Radix8_fs_unsafe_s.S
@ -0,0 +1,426 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This is a modification of armSP_FFT_CToC_SC32_Radix8_fs_unsafe_s.s
+@//  to support float instead of SC32.
+@//
+
+@//
+@// Description:
+@// Compute a first stage Radix 8 FFT stage for a N point complex signal
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+
+@// Import symbols required from other files
+@// (For example tables)
+
+
+@// Set debugging level
+@//DEBUG_ON    SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+
+
+
+
+@// Guarding implementation by the processor name
+
+@//Input Registers
+@// pTwiddle is named here but the FFTSTAGE macro in this file never
+@// references it.
+
+#define pSrc            r0
+#define pDst            r2
+#define pTwiddle        r1
+#define subFFTNum       r6
+#define subFFTSize      r7
+@// dest buffer for the next stage (not pSrc for first stage)
+#define pPingPongBuf    r5
+
+
+@//Output Registers
+@// None are declared separately: the macro below updates pSrc, pDst,
+@// subFFTNum and subFFTSize in place.
+
+
+@//Local Scratch Registers
+@// Two pairs of names share a register: grpSize/setCount (r3) and
+@// pointStep/outPointStep (r4).  The macro code only uses the pointStep
+@// spelling of r4.
+
+#define grpSize         r3
+@// Reuse grpSize as setCount
+#define setCount        r3
+#define pointStep       r4
+#define outPointStep    r4
+#define setStep         r8
+#define step1           r9
+#define step2           r10
+#define t0              r11
+
+
+@// Neon Registers
+
+@// Eight complex input vectors.  qXn overlays {dXrn,dXin}.
+#define dXr0    D0.F32
+#define dXi0    D1.F32
+#define dXr1    D2.F32
+#define dXi1    D3.F32
+#define dXr2    D4.F32
+#define dXi2    D5.F32
+#define dXr3    D6.F32
+#define dXi3    D7.F32
+#define dXr4    D8.F32
+#define dXi4    D9.F32
+#define dXr5    D10.F32
+#define dXi5    D11.F32
+#define dXr6    D12.F32
+#define dXi6    D13.F32
+#define dXr7    D14.F32
+#define dXi7    D15.F32
+#define qX0     Q0.F32
+#define qX1     Q1.F32
+#define qX2     Q2.F32
+#define qX3     Q3.F32
+#define qX4     Q4.F32
+#define qX5     Q5.F32
+#define qX6     Q6.F32
+#define qX7     Q7.F32
+
+@// First-stage results: even indices in D16-D23, odd indices in D24-D31.
+#define dUr0    D16.F32
+#define dUi0    D17.F32
+#define dUr2    D18.F32
+#define dUi2    D19.F32
+#define dUr4    D20.F32
+#define dUi4    D21.F32
+#define dUr6    D22.F32
+#define dUi6    D23.F32
+#define dUr1    D24.F32
+#define dUi1    D25.F32
+#define dUr3    D26.F32
+#define dUi3    D27.F32
+#define dUr5    D28.F32
+#define dUi5    D29.F32
+@// NOTE(review): the remark that used to sit here said "reuse dXr7 and
+@// dXi7", but dUr7/dUi7 are D30/D31 while dXr7/dXi7 are D14/D15.  The
+@// aliases that do share D14/D15 with dXr7/dXi7 are dT0/dT1 further down.
+#define dUr7    D30.F32
+#define dUi7    D31.F32
+#define qU0     Q8.F32
+#define qU1     Q12.F32
+#define qU2     Q9.F32
+#define qU3     Q13.F32
+#define qU4     Q10.F32
+#define qU5     Q14.F32
+#define qU6     Q11.F32
+#define qU7     Q15.F32
+
+
+@// Second-stage results use the opposite halves from U: even indices in
+@// D24-D31, odd indices in D16-D23.
+#define dVr0    D24.F32
+#define dVi0    D25.F32
+#define dVr2    D26.F32
+#define dVi2    D27.F32
+#define dVr4    D28.F32
+#define dVi4    D29.F32
+#define dVr6    D30.F32
+#define dVi6    D31.F32
+#define dVr1    D16.F32
+#define dVi1    D17.F32
+#define dVr3    D18.F32
+#define dVi3    D19.F32
+#define dVr5    D20.F32
+#define dVi5    D21.F32
+#define dVr7    D22.F32
+#define dVi7    D23.F32
+#define qV0     Q12.F32
+#define qV1     Q8.F32
+#define qV2     Q13.F32
+#define qV3     Q9.F32
+#define qV4     Q14.F32
+#define qV5     Q10.F32
+#define qV6     Q15.F32
+#define qV7     Q11.F32
+
+@// Final results use the same register layout as U.
+#define dYr0    D16.F32
+#define dYi0    D17.F32
+#define dYr2    D18.F32
+#define dYi2    D19.F32
+#define dYr4    D20.F32
+#define dYi4    D21.F32
+#define dYr6    D22.F32
+#define dYi6    D23.F32
+#define dYr1    D24.F32
+#define dYi1    D25.F32
+#define dYr3    D26.F32
+#define dYi3    D27.F32
+#define dYr5    D28.F32
+#define dYi5    D29.F32
+#define dYr7    D30.F32
+#define dYi7    D31.F32
+#define qY0     Q8.F32
+#define qY1     Q12.F32
+#define qY2     Q9.F32
+#define qY3     Q13.F32
+#define qY4     Q10.F32
+#define qY5     Q14.F32
+#define qY6     Q11.F32
+#define qY7     Q15.F32
+
+@// Temporaries.  They share D14/D15 with dXr7/dXi7, so they are only valid
+@// between the last read of qX7 and the next load of data[7].
+#define dT0     D14.F32
+#define dT1     D15.F32
+
+@// Define constants
+        @ sqrt(1/2)
+        @ A single-precision word emitted at this point of the file; the macro
+        @ below fetches its address with LDR t0,=ONEBYSQRT2 and loads the value
+        @ into lane 0 of dT0.
+ONEBYSQRT2:     .float  0.7071067811865476e0
+
+
+        @// FFTSTAGE scaled, inverse, name
+        @//   Body shared by the forward and the inverse radix-8 first-stage
+        @//   routines that are instantiated at the bottom of this file.  The
+        @//   twiddle table is not read; the only constant used is ONEBYSQRT2.
+        @//   scaled  : accepted but never referenced in this body.
+        @//   inverse : when it compares equal to TRUE the inverse arrangement
+        @//             of adds, subtracts and stores is assembled; otherwise
+        @//             the forward one.
+        @//   name    : pasted onto every local label so that the two expansions
+        @//             do not define the same label twice.
+        @//   Register effects visible in this body: subFFTSize is set to 8,
+        @//   subFFTNum is divided by 8, pSrc is derived from the final pDst and
+        @//   pDst is loaded from pPingPongBuf.
+        .MACRO FFTSTAGE scaled, inverse, name
+
+        @// Define stack arguments
+
+        @// Update pSubFFTSize and pSubFFTNum regs
+        @// subFFTSize = 1 for the first stage
+        MOV     subFFTSize,#8
+        @// t0 = address of the sqrt(1/2) constant
+        LDR     t0,=ONEBYSQRT2
+
+        @// Note: setCount = subFFTNum/8 (reuse the grpSize reg for setCount)
+        LSR     grpSize,subFFTNum,#3
+        MOV     subFFTNum,grpSize
+
+
+        @// pT0+1 increments pT0 by 8 bytes
+        @// pT0+pointStep = increment of 8*pointStep bytes = grpSize bytes
+        @// Note: outPointStep = pointStep for firststage
+
+        MOV     pointStep,grpSize,LSL #3
+
+
+        @// Calculate the step of input data for the next set
+        @//MOV     step1,pointStep,LSL #1             @// step1 = 2*pointStep
+        VLD2    {dXr0,dXi0},[pSrc :128],pointStep     @//  data[0]
+        MOV     step1,grpSize,LSL #4
+
+        MOV     step2,pointStep,LSL #3
+        VLD2    {dXr1,dXi1},[pSrc :128],pointStep     @//  data[1]
+        SUB     step2,step2,pointStep                 @// step2 = 7*pointStep
+        @// setStep = - 7*pointStep+16
+        RSB     setStep,step2,#16
+
+        VLD2    {dXr2,dXi2},[pSrc :128],pointStep     @//  data[2]
+        VLD2    {dXr3,dXi3},[pSrc :128],pointStep     @//  data[3]
+        VLD2    {dXr4,dXi4},[pSrc :128],pointStep     @//  data[4]
+        VLD2    {dXr5,dXi5},[pSrc :128],pointStep     @//  data[5]
+        VLD2    {dXr6,dXi6},[pSrc :128],pointStep     @//  data[6]
+        @//  data[7] & update pSrc for the next set
+        @//  setStep = -7*pointStep + 16
+        VLD2    {dXr7,dXi7},[pSrc :128],setStep
+        @// grp = 0 a special case since all the twiddle factors are 1
+        @// Loop on the sets
+
+radix8fsGrpZeroSetLoop\name :
+
+        @// Decrement setcount
+        @// The flags written here are tested by the BEQ that guards the
+        @// data[7] load and by the BGT that closes the loop; no instruction
+        @// in between writes the flags.
+        SUBS    setCount,setCount,#2
+
+
+        @// finish first stage of 8 point FFT
+
+        VADD    qU0,qX0,qX4
+        VADD    qU2,qX1,qX5
+        VADD    qU4,qX2,qX6
+        VADD    qU6,qX3,qX7
+
+        @// finish second stage of 8 point FFT
+
+        VADD    qV0,qU0,qU4
+        VSUB    qV2,qU0,qU4
+        VADD    qV4,qU2,qU6
+        VSUB    qV6,qU2,qU6
+
+        @// finish third stage of 8 point FFT
+
+        VADD    qY0,qV0,qV4
+        VSUB    qY4,qV0,qV4
+        VST2    {dYr0,dYi0},[pDst :128],step1         @// store y0
+
+        .ifeqs  "\inverse", "TRUE"
+
+            VSUB    dYr2,dVr2,dVi6
+            VADD    dYi2,dVi2,dVr6
+
+            VADD    dYr6,dVr2,dVi6
+            VST2    {dYr2,dYi2},[pDst :128],step1     @// store y2
+            VSUB    dYi6,dVi2,dVr6
+
+            VSUB    qU1,qX0,qX4
+            VST2    {dYr4,dYi4},[pDst :128],step1     @// store y4
+
+            VSUB    qU3,qX1,qX5
+            VSUB    qU5,qX2,qX6
+            VST2    {dYr6,dYi6},[pDst :128],step1     @// store y6
+
+        .ELSE
+
+            @// In this branch the store comments name the output slot being
+            @// written; the register written to slot y2 carries the dY6
+            @// alias and the one written to slot y6 carries the dY2 alias.
+            VADD    dYr6,dVr2,dVi6
+            VSUB    dYi6,dVi2,dVr6
+
+            VSUB    dYr2,dVr2,dVi6
+            VST2    {dYr6,dYi6},[pDst :128],step1     @// store y2
+            VADD    dYi2,dVi2,dVr6
+
+
+            VSUB    qU1,qX0,qX4
+            VST2    {dYr4,dYi4},[pDst :128],step1     @// store y4
+            VSUB    qU3,qX1,qX5
+            VSUB    qU5,qX2,qX6
+            VST2    {dYr2,dYi2},[pDst :128],step1     @// store y6
+
+
+        .ENDIF
+
+        @// finish first stage of 8 point FFT
+
+        VSUB    qU7,qX3,qX7
+        @// Load sqrt(1/2) into lane 0 of dT0.  This is repeated on every
+        @// iteration because dT0 is the same register as dXr7, which the
+        @// data[7] load overwrites; qX7 was read for the last time on the
+        @// line above.
+        VLD1    dT0[0], [t0]
+
+        @// finish second stage of 8 point FFT
+
+        VSUB    dVr1,dUr1,dUi5
+        @//  data[0] for next iteration
+        VLD2    {dXr0,dXi0},[pSrc :128],pointStep
+        VADD    dVi1,dUi1,dUr5
+        VADD    dVr3,dUr1,dUi5
+        VLD2    {dXr1,dXi1},[pSrc :128],pointStep     @//  data[1]
+        VSUB    dVi3,dUi1,dUr5
+
+        VSUB    dVr5,dUr3,dUi7
+        VLD2    {dXr2,dXi2},[pSrc :128],pointStep     @//  data[2]
+        VADD    dVi5,dUi3,dUr7
+        VADD    dVr7,dUr3,dUi7
+        VLD2    {dXr3,dXi3},[pSrc :128],pointStep     @//  data[3]
+        VSUB    dVi7,dUi3,dUr7
+
+        @// finish third stage of 8 point FFT
+
+        .ifeqs  "\inverse", "TRUE"
+
+            @// calculate a*v5
+            VMUL    dT1,dVr5,dT0[0]                   @// dT1 = Vr5 * sqrt(1/2)
+
+            VLD2    {dXr4,dXi4},[pSrc :128],pointStep @//  data[4]
+            VMUL    dVi5,dVi5,dT0[0]
+
+            VLD2    {dXr5,dXi5},[pSrc :128],pointStep @//  data[5]
+            VSUB    dVr5,dT1,dVi5                     @// a * V5
+            VADD    dVi5,dT1,dVi5
+
+            VLD2    {dXr6,dXi6},[pSrc :128],pointStep @//  data[6]
+
+            @// calculate  b*v7
+            VMUL    dT1,dVr7,dT0[0]
+            VMUL    dVi7,dVi7,dT0[0]
+
+            VADD    qY1,qV1,qV5
+            VSUB    qY5,qV1,qV5
+
+
+            VADD    dVr7,dT1,dVi7                     @// b * V7
+            VSUB    dVi7,dVi7,dT1
+            SUB     pDst, pDst, step2                 @// set pDst to y1
+
+            @// On the last iteration, this will read past the end of pSrc,
+            @// so skip this read.
+            BEQ     radix8SkipLastUpdateInv\name
+            VLD2    {dXr7,dXi7},[pSrc :128],setStep   @//  data[7]
+radix8SkipLastUpdateInv\name:
+
+            VSUB    dYr3,dVr3,dVr7
+            VSUB    dYi3,dVi3,dVi7
+            VST2    {dYr1,dYi1},[pDst :128],step1     @// store y1
+            VADD    dYr7,dVr3,dVr7
+            VADD    dYi7,dVi3,dVi7
+
+
+            VST2    {dYr3,dYi3},[pDst :128],step1     @// store y3
+            VST2    {dYr5,dYi5},[pDst :128],step1     @// store y5
+            VST2    {dYr7,dYi7},[pDst :128]           @// store y7
+            @// step past the 16 bytes just stored (the forward branch gets
+            @// the same advance from writeback on its last store)
+            ADD pDst, pDst, #16
+
+        .ELSE
+
+            @// calculate  b*v7
+            VMUL    dT1,dVr7,dT0[0]
+            VLD2    {dXr4,dXi4},[pSrc :128],pointStep @//  data[4]
+            VMUL    dVi7,dVi7,dT0[0]
+
+            VLD2    {dXr5,dXi5},[pSrc :128],pointStep @//  data[5]
+            VADD    dVr7,dT1,dVi7                     @// b * V7
+            VSUB    dVi7,dVi7,dT1
+
+            VLD2    {dXr6,dXi6},[pSrc :128],pointStep @//  data[6]
+
+            @// calculate a*v5
+            VMUL    dT1,dVr5,dT0[0]                   @// dT1 = Vr5 * sqrt(1/2)
+            VMUL    dVi5,dVi5,dT0[0]
+
+            VADD    dYr7,dVr3,dVr7
+            VADD    dYi7,dVi3,dVi7
+            SUB     pDst, pDst, step2                 @// set pDst to y1
+
+            VSUB    dVr5,dT1,dVi5                     @// a * V5
+            VADD    dVi5,dT1,dVi5
+
+            @// On the last iteration, this will read past the end of pSrc,
+            @// so skip this read.
+            BEQ     radix8SkipLastUpdateFwd\name
+            VLD2    {dXr7,dXi7},[pSrc :128],setStep   @//  data[7]
+radix8SkipLastUpdateFwd\name:
+
+            VSUB    qY5,qV1,qV5
+
+            VSUB    dYr3,dVr3,dVr7
+            VST2    {dYr7,dYi7},[pDst :128],step1     @// store y1
+            VSUB    dYi3,dVi3,dVi7
+            VADD    qY1,qV1,qV5
+
+
+            VST2    {dYr5,dYi5},[pDst :128],step1     @// store y3
+            VST2    {dYr3,dYi3},[pDst :128],step1     @// store y5
+            VST2    {dYr1,dYi1},[pDst :128]!          @// store y7
+
+        .ENDIF
+
+
+        @// update pDst for the next set
+        SUB     pDst, pDst, step2
+        BGT     radix8fsGrpZeroSetLoop\name
+
+
+        @// reset pSrc to pDst for the next stage
+        SUB     pSrc,pDst,pointStep                   @// pSrc = pDst - pointStep
+        MOV     pDst,pPingPongBuf
+
+
+
+        .endm
+
+
+        @// Allocate stack memory required by the function
+
+
+        @// Forward routine: FFTSTAGE with scaled = FALSE, inverse = FALSE and
+        @// FWD as the local-label suffix.
+        @// NOTE(review): M_START/M_END are not defined in this file (presumably
+        @// armCOMM_s.h); r4 looks like the register-save argument - confirm.
+        M_START armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe,r4
+            FFTSTAGE "FALSE","FALSE",FWD
+        M_END
+
+
+        @// Inverse routine: FFTSTAGE with scaled = FALSE, inverse = TRUE and
+        @// INV as the local-label suffix.
+        @// NOTE(review): M_START/M_END are not defined in this file (presumably
+        @// armCOMM_s.h); r4 looks like the register-save argument - confirm.
+        M_START armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace_unsafe,r4
+            FFTSTAGE "FALSE","TRUE",INV
+        M_END
+
+
+
+        .end
--- a/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_fs_unsafe_s.S
+++ b/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_fs_unsafe_s.S
@ -0,0 +1,170 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This file was originally licensed as follows. It has been
+@//  relicensed with permission from the copyright holders.
+
+@//
+@//
+@// File Name:  armSP_FFT_CToC_SC16_Radix2_fs_unsafe_s.s
+@// OpenMAX DL: v1.0.2
+@// Last Modified Revision:   6693
+@// Last Modified Date:       Tue, 10 Jul 2007
+@//
+@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@//
+@//
+@//
+@// Description:
+@// Compute a Radix 2 FFT stage for a N point complex signal
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+
+
+@// Import symbols required from other files
+@// (For example tables)
+
+
+
+
+@// Set debugging level
+@//DEBUG_ON    SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+
+
+
+@// Guarding implementation by the processor name
+
+
+@//Input Registers
+
+#define pSrc                            r0
+#define pDst                            r2
+#define pTwiddle                        r1
+#define pPingPongBuf                    r5
+#define subFFTNum                       r6
+#define subFFTSize                      r7
+
+
+@//Output Registers
+
+
+@//Local Scratch Registers
+
+#define pointStep                       r3
+#define outPointStep                    r3
+#define grpSize                         r4
+#define setCount                        r4
+#define step                            r8
+#define dstStep                         r8
+
+@// Neon Registers
+
+#define dX0                             D0.S16
+#define dX1                             D1.S16
+#define dY0                             D2.S16
+#define dY1                             D3.S16
+#define dX0S32                          D0.S32
+#define dX1S32                          D1.S32
+#define dY0S32                          D2.S32
+#define dY1S32                          D3.S32
+
+
+        @// FFTSTAGE scaled, inverse, name
+        @//   First-stage radix-2 butterfly over 16-bit complex elements
+        @//   (one element = one 32-bit lane: 16-bit real + 16-bit imaginary).
+        @//     scaled  - "TRUE" selects the halving VHADD/VHSUB butterflies,
+        @//               anything else the plain VADD/VSUB ones.
+        @//     inverse - not referenced in this macro body, so forward and
+        @//               inverse expansions are identical.
+        @//     name    - suffix appended to the loop label so that every
+        @//               expansion gets a unique label.
+        @//   In : pSrc, pDst, pPingPongBuf, subFFTNum.
+        @//   Out: subFFTSize = 2, subFFTNum halved, pSrc = pDst - pointStep
+        @//        (start of the data just written), pDst = pPingPongBuf.
+        @//   Scratch: r3 (pointStep/outPointStep), r4 (grpSize/setCount),
+        @//            r8 (step/dstStep), D0-D3, flags.
+        .MACRO FFTSTAGE scaled, inverse, name
+
+        @// Define stack arguments
+
+
+        @// update subFFTSize and subFFTNum into RN6 and RN7 for the next stage
+
+
+        MOV        subFFTSize,#2
+        LSR        grpSize,subFFTNum,#1
+        MOV        subFFTNum,grpSize
+
+
+        @// One complex element is 4 bytes here (a single 32-bit lane is
+        @// loaded/stored per access below).
+        @// pointStep = grpSize elements * 4 bytes: byte distance between the
+        @// two inputs of one butterfly.
+        @// Note: outPointStep = pointStep for firststage (both are r3)
+        @// Note: setCount shares r4 with grpSize, so the loop below runs
+        @//       grpSize times, one butterfly per pass.
+
+        MOV        pointStep,grpSize,LSL #2
+        RSB        step,pointStep,#4
+
+
+        @// Loop on the sets for grp zero: 1 set at a time
+
+grpZeroSetLoop\name:
+
+        @// Load the two butterfly inputs into lane 0 of D0 and D1; the second
+        @// post-increment leaves pSrc one element (4 bytes) past the first load.
+        VLD1    {dX0S32[0]},[pSrc],pointStep
+        VLD1    {dX1S32[0]},[pSrc],step                   @// step = -pointStep + 4
+        SUBS    setCount,setCount,#1              @// decrement the loop counter
+
+        .ifeqs "\scaled", "TRUE"
+
+            @// Halving add/sub on the 16-bit lanes: (x0 +/- x1) >> 1
+            VHADD    dY0,dX0,dX1
+            VHSUB    dY1,dX0,dX1
+
+        .ELSE
+
+            VADD    dY0,dX0,dX1
+            VSUB    dY1,dX0,dX1
+
+
+        .ENDIF
+
+        @// Store sum and difference outPointStep bytes apart; net pDst
+        @// advance per pass is 4 bytes.
+        VST1    {dY0S32[0]},[pDst],outPointStep
+        VST1    {dY1S32[0]},[pDst],dstStep                  @// dstStep =  step = -pointStep + 4
+
+        @// Flags are still those of the SUBS above (the NEON ops in between
+        @// do not write the condition flags).
+        BGT     grpZeroSetLoop\name
+
+
+        @// reset pSrc to pDst for the next stage
+        @// pDst advanced 4 bytes per pass for grpSize passes = pointStep bytes,
+        @// so pDst - pointStep is the start of this stage's output.
+        SUB     pSrc,pDst,pointStep                     @// pSrc = pDst - 4*grpSize bytes
+        MOV     pDst,pPingPongBuf
+
+        .endm
+
+
+
+        @// Entry points: each function body is a single FFTSTAGE expansion.
+        @// Arguments are (scaled, inverse, label suffix). The suffix must be
+        @// unique per expansion because it is pasted into the loop label.
+        @// NOTE(review): M_START/M_END come from outside this file
+        @// (presumably dl/api/armCOMM_s.h) - confirm what ",r4" preserves,
+        @// since the macro body also writes r3 and r8.
+
+        @// Forward, unscaled
+        M_START armSP_FFTFwd_CToC_SC16_Radix2_fs_OutOfPlace_unsafe,r4
+        FFTSTAGE "FALSE","FALSE",FWD
+        M_END
+
+
+
+        @// Inverse, unscaled (same code as forward: the macro ignores \inverse)
+        M_START armSP_FFTInv_CToC_SC16_Radix2_fs_OutOfPlace_unsafe,r4
+        FFTSTAGE "FALSE","TRUE",INV
+        M_END
+
+
+
+        @// Forward, scaled (halving butterflies)
+        M_START armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe,r4
+        FFTSTAGE "TRUE","FALSE",FWDSFS
+        M_END
+
+
+
+        @// Inverse, scaled
+        M_START armSP_FFTInv_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe,r4
+        FFTSTAGE "TRUE","TRUE",INVSFS
+        M_END
+
+
+
+
+    .END
--- a/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_ls_unsafe_s.S
+++ b/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_ls_unsafe_s.S
@ -0,0 +1,210 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This file was originally licensed as follows. It has been
+@//  relicensed with permission from the copyright holders.
+
+@//
+@//
+@// File Name:  armSP_FFT_CToC_SC16_Radix2_ls_unsafe_s.s
+@// OpenMAX DL: v1.0.2
+@// Last Modified Revision:   6741
+@// Last Modified Date:       Wed, 18 Jul 2007
+@//
+@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@//
+@//
+@//
+@// Description:
+@// Compute a Radix 2 FFT stage for a N point complex signal
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+
+
+@// Import symbols required from other files
+@// (For example tables)
+
+
+
+
+@// Set debugging level
+@//DEBUG_ON    SETL {TRUE}
+
+
+@// Guarding implementation by the processor name
+
+
+
+
+
+
+
+@// Guarding implementation by the processor name
+
+
+@//Input Registers
+
+#define pSrc                            r0
+#define pDst                            r2
+#define pTwiddle                        r1
+#define subFFTNum                       r6
+#define subFFTSize                      r7
+
+
+@//Output Registers
+
+
+@//Local Scratch Registers
+
+
+#define outPointStep                    r3
+#define grpCount                        r4
+#define dstStep                         r5
+#define pTmp                            r4
+#define step                            r8
+
+@// Neon Registers
+
+#define dWr                             D0.S16
+#define dWi                             D1.S16
+#define dXr0                            D2.S16
+#define dXi0                            D3.S16
+#define dXr1                            D4.S16
+#define dXi1                            D5.S16
+#define dYr0                            D6.S16
+#define dYi0                            D7.S16
+#define dYr1                            D8.S16
+#define dYi1                            D9.S16
+#define qT0                             Q5.S32
+#define qT1                             Q6.S32
+
+
+        @// FFTSTAGE scaled, inverse, name
+        @//   Last-stage radix-2 butterfly over 16-bit complex elements; two
+        @//   groups (one butterfly each) are processed per loop pass, in lanes
+        @//   0 and 1 of the working D registers.
+        @//     scaled  - "TRUE" selects halving VHSUB/VHADD butterflies.
+        @//     inverse - "TRUE" multiplies by the conjugate of the twiddle.
+        @//     name    - suffix appended to the loop label.
+        @//   In : pSrc, pDst, pTwiddle, subFFTSize.
+        @//   Out: subFFTNum = 1, subFFTSize doubled, pSrc/pDst rewound and
+        @//        swapped, pTwiddle rewound.
+        @//   Scratch: r3, r4 (grpCount, later pTmp), r5, r8, D0-D13, flags.
+        .MACRO FFTSTAGE scaled, inverse, name
+
+
+        @// outPointStep = subFFTSize elements * 4 bytes
+        MOV     outPointStep,subFFTSize,LSL #2
+        @// Update grpCount and grpSize rightaway
+
+        MOV     subFFTNum,#1                            @//after the last stage
+        LSL     grpCount,subFFTSize,#1
+
+        @// update subFFTSize for the next stage
+        MOV     subFFTSize,grpCount
+
+        SUB      step,outPointStep,#4                   @// step = -4+outPointStep
+        RSB      dstStep,step,#0                        @// dstStep = -4-outPointStep+8 = -step
+        @//RSB      dstStep,outPointStep,#16
+
+
+        @// Loop on 2 grps at a time for the last stage
+
+grpLoop\name:
+        @// One (wr, wi) pair per group, de-interleaved into lane n of dWr/dWi
+        VLD2    {dWr[0],dWi[0]},[pTwiddle]!             @// grp 0
+        VLD2    {dWr[1],dWi[1]},[pTwiddle]!             @// grp 1
+
+        @//VLD2    {dWr,dWi},[pTwiddle],#16
+
+        @// Two consecutive complex points per group: (xr0, xi0, xr1, xi1)
+        VLD4    {dXr0[0],dXi0[0],dXr1[0],dXi1[0]},[pSrc]!   @// grp 0
+        VLD4    {dXr0[1],dXi0[1],dXr1[1],dXi1[1]},[pSrc]!   @// grp 1
+
+
+        @//VLD4    {dXr0,dXi0,dXr1,dXi1},[pSrc],#32
+        SUBS    grpCount,grpCount,#4                   @// grpCount is multiplied by 2
+
+        .ifeqs  "\inverse", "TRUE"
+            @// x1 * conj(w): re = xr*wr + xi*wi, im = xi*wr - xr*wi
+            VMULL   qT0,dXr1,dWr
+            VMLAL   qT0,dXi1,dWi                       @// real part
+            VMULL   qT1,dXi1,dWr
+            VMLSL   qT1,dXr1,dWi                       @// imag part
+
+        .ELSE
+            @// x1 * w: re = xr*wr - xi*wi, im = xi*wr + xr*wi
+            VMULL   qT0,dXr1,dWr
+            VMLSL   qT0,dXi1,dWi                       @// real part
+            VMULL   qT1,dXi1,dWr
+            VMLAL   qT1,dXr1,dWi                       @// imag part
+
+        .ENDIF
+
+        @// Round and narrow the 32-bit products back to 16 bits (>> 15)
+        VRSHRN  dXr1,qT0,#15
+        VRSHRN  dXi1,qT1,#15
+
+
+        .ifeqs "\scaled", "TRUE"
+
+            @// y0 = (x0 - x1*w) >> 1, y1 = (x0 + x1*w) >> 1
+            VHSUB    dYr0,dXr0,dXr1
+            VHSUB    dYi0,dXi0,dXi1
+            VHADD    dYr1,dXr0,dXr1
+            VHADD    dYi1,dXi0,dXi1
+
+        .ELSE
+
+            @// y0 = x0 - x1*w, y1 = x0 + x1*w
+            VSUB    dYr0,dXr0,dXr1
+            VSUB    dYi0,dXi0,dXi1
+            VADD    dYr1,dXr0,dXr1
+            VADD    dYi1,dXi0,dXi1
+
+
+        .ENDIF
+
+        @// y0 of both groups goes to pDst[0..1]; y1 of both groups goes
+        @// outPointStep bytes further on. Net pDst advance per pass: 8 bytes.
+        VST2    {dYr0[0],dYi0[0]},[pDst]!
+        VST2    {dYr0[1],dYi0[1]},[pDst],step               @// step = -4+outPointStep
+
+        VST2    {dYr1[0],dYi1[0]},[pDst]!
+        VST2    {dYr1[1],dYi1[1]},[pDst],dstStep            @// dstStep = -4-outPointStep+8 = -step
+
+        @//VST2    {dYr0,dYi0},[pDst],outPointStep
+        @//VST2    {dYr1,dYi1},[pDst],dstStep                  @// dstStep =  step = -outPointStep + 16
+
+        BGT     grpLoop\name
+
+
+        @// Reset and Swap pSrc and pDst for the next stage
+        @// pTmp reuses r4; grpCount is dead once the loop has exited.
+        MOV     pTmp,pDst
+        SUB     pDst,pSrc,outPointStep,LSL #1       @// pDst -= 2*size; pSrc -= 4*size bytes
+        SUB     pSrc,pTmp,outPointStep
+
+        @// Reset pTwiddle for the next stage
+        SUB     pTwiddle,pTwiddle,outPointStep      @// pTwiddle -= 2*size bytes
+
+        .endm
+
+
+
+        @// Entry points: each function body is a single FFTSTAGE expansion.
+        @// Arguments are (scaled, inverse, label suffix); the suffix keeps
+        @// grpLoop unique in every expansion.
+        @// NOTE(review): M_START/M_END are defined outside this file
+        @// (presumably dl/api/armCOMM_s.h) - confirm the meaning of ",r4".
+
+        @// Forward, unscaled
+        M_START armSP_FFTFwd_CToC_SC16_Radix2_ls_OutOfPlace_unsafe,r4
+        FFTSTAGE "FALSE","FALSE",FWD
+        M_END
+
+
+
+        @// Inverse, unscaled (conjugate twiddle multiply)
+        M_START armSP_FFTInv_CToC_SC16_Radix2_ls_OutOfPlace_unsafe,r4
+        FFTSTAGE "FALSE","TRUE",INV
+        M_END
+
+
+
+        @// Forward, scaled (halving butterflies)
+        M_START armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe,r4
+        FFTSTAGE "TRUE","FALSE",FWDSFS
+        M_END
+
+
+
+        @// Inverse, scaled
+        M_START armSP_FFTInv_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe,r4
+        FFTSTAGE "TRUE","TRUE",INVSFS
+        M_END
+
+
+
+
+    .END
--- a/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_ps_unsafe_s.S
+++ b/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_ps_unsafe_s.S
@ -0,0 +1,216 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This file was originally licensed as follows. It has been
+@//  relicensed with permission from the copyright holders.
+
+@//
+@//
+@// File Name:  armSP_FFT_CToC_SC16_Radix2_ps_unsafe_s.s
+@// OpenMAX DL: v1.0.2
+@// Last Modified Revision:   6740
+@// Last Modified Date:       Wed, 18 Jul 2007
+@//
+@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@//
+@//
+@//
+@// Description:
+@// Compute a Radix 2 FFT stage for a N point complex signal
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+
+
+@// Import symbols required from other files
+@// (For example tables)
+
+
+
+
+@// Set debugging level
+@//DEBUG_ON    SETL {TRUE}
+
+
+
+
+@// Guarding implementation by the processor name
+
+
+@//Input Registers
+
+#define pSrc                            r0
+#define pDst                            r2
+#define pTwiddle                        r1
+#define subFFTNum                       r6
+#define subFFTSize                      r7
+
+
+@//Output Registers
+
+
+@//Local Scratch Registers
+
+#define outPointStep                    r3
+#define grpCount                        r4
+#define dstStep                         r5
+#define twStep                          r8
+#define pTmp                            r4
+
+@// Neon Registers
+
+#define dW1S32                          D0.S32
+#define dW2S32                          D1.S32
+#define dW1                             D0.S16
+#define dW2                             D1.S16
+
+#define dX0                             D2.S16
+#define dX1                             D3.S16
+#define dX2                             D4.S16
+#define dX3                             D5.S16
+#define dY0                             D6.S16
+#define dY1                             D7.S16
+#define dY2                             D8.S16
+#define dY3                             D9.S16
+#define qT0                             Q5.S32
+#define qT1                             Q6.S32
+
+
+        @// FFTSTAGE scaled, inverse, name
+        @//   Radix-2 butterfly stage over 16-bit complex elements that handles
+        @//   two groups of two sets per loop pass, with no inner set loop.
+        @//     scaled  - "TRUE" selects halving VHSUB/VHADD butterflies.
+        @//     inverse - "TRUE" multiplies by the conjugate of the twiddle.
+        @//     name    - suffix appended to the loop label.
+        @//   In : pSrc, pDst, pTwiddle, subFFTNum, subFFTSize.
+        @//   Out: subFFTSize doubled, subFFTNum halved, pSrc/pDst rewound and
+        @//        swapped, pTwiddle rewound.
+        @//   Scratch: r3, r4 (grpCount, later pTmp), r5, r8, D0-D13, flags.
+        @//   NOTE(review): the pointer rewinds at the end only balance when
+        @//   each group holds exactly two sets - confirm the caller guarantees it.
+        .MACRO FFTSTAGE scaled, inverse, name
+
+        @// Define stack arguments
+
+
+        @// Update grpCount and grpSize rightaway inorder to reuse pGrpCount and pGrpSize regs
+
+
+        LSL     grpCount,subFFTSize,#1
+
+
+        @// update subFFTSize for the next stage
+        MOV     subFFTSize,grpCount
+
+        @// outPointStep = grpCount * subFFTNum (pre-halving) bytes. SMULBB
+        @// multiplies only the signed low 16 bits of each operand.
+        SMULBB  outPointStep,grpCount,subFFTNum
+        MOV     twStep,subFFTNum,LSL #1
+        LSR     subFFTNum,subFFTNum,#1                      @//grpSize
+
+
+        RSB      dstStep,outPointStep,#8
+
+
+        @// Note: the input point step is 8 bytes here, so plain post-increment
+        @// writeback ([pSrc]!) is used and no step register is needed.
+        @// Loop on the groups: 2 groups at a time
+
+grpLoop\name:
+
+        @// Load one 32-bit (wr, wi) pair per group and replicate it to both
+        @// 32-bit lanes of the D register.
+        VLD1     dW1S32[],[pTwiddle],twStep                @//[wi | wr]
+        VLD1     dW2S32[],[pTwiddle],twStep
+
+        @// Process the sets for each grp:  2 sets at a time (no set looping required)
+
+        VLD1    dX0,[pSrc]!            @// point0: of set0,set1 of grp0
+        VLD1    dX1,[pSrc]!            @// point1: of set0,set1 of grp0
+        VLD1    dX2,[pSrc]!            @// point0: of set0,set1 of grp1
+        VLD1    dX3,[pSrc]!            @// point1: of set0,set1 of grp1
+
+        SUBS    grpCount,grpCount,#4              @// decrement the loop counter
+        @// De-interleave: dW1 = real parts, dW2 = imaginary parts of both
+        @// twiddles; dX1 = real parts, dX3 = imaginary parts of the four
+        @// point1 values (grp0 in the low half, grp1 in the high half).
+        VUZP    dW1,dW2
+        VUZP    dX1,dX3
+
+        .ifeqs  "\inverse", "TRUE"
+            @// x * conj(w): re = xr*wr + xi*wi, im = xi*wr - xr*wi
+            VMULL   qT0,dX1,dW1
+            VMLAL   qT0,dX3,dW2                       @// real part
+            VMULL   qT1,dX3,dW1
+            VMLSL   qT1,dX1,dW2                       @// imag part
+
+        .ELSE
+            @// x * w: re = xr*wr - xi*wi, im = xi*wr + xr*wi
+            VMULL   qT0,dX1,dW1
+            VMLSL   qT0,dX3,dW2                       @// real part
+            VMULL   qT1,dX3,dW1
+            VMLAL   qT1,dX1,dW2                       @// imag part
+
+        .ENDIF
+
+        @// Round and narrow the 32-bit products back to 16 bits (>> 15)
+        VRSHRN  dX1,qT0,#15
+        VRSHRN  dX3,qT1,#15
+
+        @// Re-interleave: dX1 = grp0 products, dX3 = grp1 products (re, im pairs)
+        VZIP    dX1,dX3
+
+
+        .ifeqs "\scaled", "TRUE"
+
+            VHSUB    dY0,dX0,dX1
+            VHADD    dY1,dX0,dX1
+            VHSUB    dY2,dX2,dX3
+            VHADD    dY3,dX2,dX3
+
+        .ELSE
+
+            VSUB    dY0,dX0,dX1
+            VADD    dY1,dX0,dX1
+            VSUB    dY2,dX2,dX3
+            VADD    dY3,dX2,dX3
+
+
+
+        .ENDIF
+
+        @// Each difference/sum pair is stored outPointStep bytes apart; net
+        @// pDst advance per pass is 16 bytes.
+        VST1    dY0,[pDst],outPointStep             @// point0: of set0,set1 of grp0
+        VST1    dY1,[pDst],dstStep                  @// dstStep = -outPointStep + 8
+        VST1    dY2,[pDst],outPointStep             @// point0: of set0,set1 of grp1
+        VST1    dY3,[pDst],dstStep                  @// point1: of set0,set1 of grp1
+
+
+        BGT     grpLoop\name
+
+
+        @// Reset and Swap pSrc and pDst for the next stage
+        @// pTmp reuses r4; grpCount is dead once the loop has exited.
+        MOV     pTmp,pDst
+        SUB     pDst,pSrc,outPointStep,LSL #1       @// pDst -= 2*size; pSrc -= 4*size bytes
+        SUB     pSrc,pTmp,outPointStep
+
+        @// Reset pTwiddle for the next stage
+        SUB     pTwiddle,pTwiddle,outPointStep      @// pTwiddle -= 2*size bytes
+
+        .endm
+
+
+
+        @// Entry points: each function body is a single FFTSTAGE expansion.
+        @// Arguments are (scaled, inverse, label suffix); the suffix keeps
+        @// grpLoop unique in every expansion.
+        @// NOTE(review): M_START/M_END are defined outside this file
+        @// (presumably dl/api/armCOMM_s.h) - confirm the meaning of ",r4".
+
+        @// Forward, unscaled
+        M_START armSP_FFTFwd_CToC_SC16_Radix2_ps_OutOfPlace_unsafe,r4
+        FFTSTAGE "FALSE","FALSE",FWD
+        M_END
+
+
+
+        @// Inverse, unscaled (conjugate twiddle multiply)
+        M_START armSP_FFTInv_CToC_SC16_Radix2_ps_OutOfPlace_unsafe,r4
+        FFTSTAGE "FALSE","TRUE",INV
+        M_END
+
+
+
+        @// Forward, scaled (halving butterflies)
+        M_START armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe,r4
+        FFTSTAGE "TRUE","FALSE",FWDSFS
+        M_END
+
+
+
+        @// Inverse, scaled
+        M_START armSP_FFTInv_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe,r4
+        FFTSTAGE "TRUE","TRUE",INVSFS
+        M_END
+
+
+
+    .END
--- a/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_unsafe_s.S
+++ b/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix2_unsafe_s.S
@ -0,0 +1,219 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This file was originally licensed as follows. It has been
+@//  relicensed with permission from the copyright holders.
+
+@//
+@//
+@// File Name:  armSP_FFT_CToC_SC16_Radix2_unsafe_s.s
+@// OpenMAX DL: v1.0.2
+@// Last Modified Revision:   5892
+@// Last Modified Date:       Thu, 07 Jun 2007
+@//
+@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@//
+@//
+@//
+@// Description:
+@// Compute a Radix 2 FFT stage for a N point complex signal
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+
+
+@// Import symbols required from other files
+@// (For example tables)
+
+
+
+@// Set debugging level
+@//DEBUG_ON    SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+
+
+
+
+    @// Guarding implementation by the processor name
+
+
+@//Input Registers
+
+#define pSrc                            r0
+#define pDst                            r2
+#define pTwiddle                        r1
+#define subFFTNum                       r6
+#define subFFTSize                      r7
+
+
+@//Output Registers
+
+
+@//Local Scratch Registers
+
+#define outPointStep                    r3
+#define pointStep                       r4
+#define grpCount                        r5
+#define setCount                        r8
+#define step                            r10
+#define dstStep                         r11
+#define pTmp                            r9
+
+@// Neon Registers
+
+#define dW                              D0.S16
+#define dX0                             D2.S16
+#define dX1                             D3.S16
+#define dX2                             D4.S16
+#define dX3                             D5.S16
+#define dY0                             D6.S16
+#define dY1                             D7.S16
+#define dY2                             D8.S16
+#define dY3                             D9.S16
+#define qT0                             Q3.S32
+#define qT1                             Q4.S32
+
+
+
+        @// FFTSTAGE scaled, inverse, name
+        @//   General (middle-stage) radix-2 butterfly over 16-bit complex
+        @//   elements: outer loop over groups (one twiddle each), inner loop
+        @//   over the sets of a group, four sets per pass.
+        @//     scaled  - "TRUE" selects halving VHSUB/VHADD butterflies.
+        @//     inverse - "TRUE" multiplies by the conjugate of the twiddle.
+        @//     name    - suffix appended to both loop labels.
+        @//   In : pSrc, pDst, pTwiddle, subFFTNum, subFFTSize.
+        @//   Out: subFFTNum halved, subFFTSize doubled, pSrc/pDst rewound and
+        @//        swapped, pTwiddle rewound.
+        @//   Scratch: r3, r4, r5, r8, r9, r10, r11, D0, D2-D9, flags.
+        @//   Register aliasing: qT0 (Q3) overlays dY0/dY1 (D6/D7) and qT1 (Q4)
+        @//   overlays dY2/dY3 (D8/D9). This works because the products are
+        @//   narrowed into dX2/dX3 before any dY register is written.
+        .MACRO FFTSTAGE scaled, inverse, name
+
+        @// Define stack arguments
+
+
+        @// Update grpCount and grpSize rightaway inorder to reuse pGrpCount and pGrpSize regs
+
+        LSR     subFFTNum,subFFTNum,#1                      @//grpSize
+        LSL     grpCount,subFFTSize,#1
+
+
+        @// pointStep is first set to 2*grpSize for the multiply below, then
+        @// doubled to 4*grpSize = byte distance between the two butterfly inputs.
+        MOV     pointStep,subFFTNum,LSL #1
+
+        @// update subFFTSize for the next stage
+        MOV     subFFTSize,grpCount
+
+        @// outPointStep = grpCount * 2*grpSize bytes. SMULBB multiplies only
+        @// the signed low 16 bits of each operand.
+        SMULBB  outPointStep,grpCount,pointStep
+        LSL     pointStep,pointStep,#1
+
+
+        RSB      step,pointStep,#16
+        RSB      dstStep,outPointStep,#16
+
+        @// Loop on the groups
+
+grpLoop\name:
+
+        @// Loads 8 bytes; only lanes [0] (wr) and [1] (wi) are used below.
+        VLD1     dW,[pTwiddle],pointStep                @//[wi | wr]
+        MOV      setCount,pointStep,LSR #2              @// setCount = grpSize
+
+
+        @// Loop on the sets: 4 at a time
+
+
+setLoop\name:
+
+
+        VLD2    {dX0,dX1},[pSrc],pointStep            @// point0: dX0-real part dX1-img part
+        VLD2    {dX2,dX3},[pSrc],step                 @// point1: dX2-real part dX3-img part
+
+        SUBS    setCount,setCount,#4
+
+        .ifeqs  "\inverse", "TRUE"
+            @// x1 * conj(w): re = xr*wr + xi*wi, im = xi*wr - xr*wi
+            VMULL   qT0,dX2,dW[0]
+            VMLAL   qT0,dX3,dW[1]                       @// real part
+            VMULL   qT1,dX3,dW[0]
+            VMLSL   qT1,dX2,dW[1]                       @// imag part
+
+        .ELSE
+
+            @// x1 * w: re = xr*wr - xi*wi, im = xi*wr + xr*wi
+            VMULL   qT0,dX2,dW[0]
+            VMLSL   qT0,dX3,dW[1]                       @// real part
+            VMULL   qT1,dX3,dW[0]
+            VMLAL   qT1,dX2,dW[1]                       @// imag part
+
+        .ENDIF
+
+        @// Round and narrow the 32-bit products back to 16 bits (>> 15).
+        @// After this qT0/qT1 are dead and their registers are reused as dY0..dY3.
+        VRSHRN  dX2,qT0,#15
+        VRSHRN  dX3,qT1,#15
+
+        .ifeqs "\scaled", "TRUE"
+            VHSUB    dY0,dX0,dX2
+            VHSUB    dY1,dX1,dX3
+            VHADD    dY2,dX0,dX2
+            VHADD    dY3,dX1,dX3
+
+        .ELSE
+            VSUB    dY0,dX0,dX2
+            VSUB    dY1,dX1,dX3
+            VADD    dY2,dX0,dX2
+            VADD    dY3,dX1,dX3
+
+        .ENDIF
+
+        VST2    {dY0,dY1},[pDst],outPointStep
+        VST2    {dY2,dY3},[pDst],dstStep              @// dstStep = -outPointStep + 16
+
+        BGT     setLoop\name
+
+        SUBS    grpCount,grpCount,#2
+        ADD     pSrc,pSrc,pointStep                   @// skip the point1 half already consumed
+        BGT     grpLoop\name
+
+
+        @// Reset and Swap pSrc and pDst for the next stage
+        MOV     pTmp,pDst
+        SUB     pDst,pSrc,outPointStep,LSL #1       @// pDst -= 2*size; pSrc -= 4*size bytes
+        SUB     pSrc,pTmp,outPointStep
+
+        @// Reset pTwiddle for the next stage
+        SUB     pTwiddle,pTwiddle,outPointStep      @// pTwiddle -= 2*size bytes
+
+
+        .endm
+
+
+
+        @// Entry points: each function body is a single FFTSTAGE expansion.
+        @// Arguments are (scaled, inverse, label suffix); the suffix keeps
+        @// grpLoop/setLoop unique in every expansion.
+        @// NOTE(review): M_START/M_END are defined outside this file
+        @// (presumably dl/api/armCOMM_s.h) - confirm the meaning of ",r4",
+        @// since the macro body also writes r5 and r8-r11.
+
+        @// Forward, unscaled
+        M_START armSP_FFTFwd_CToC_SC16_Radix2_OutOfPlace_unsafe,r4
+        FFTSTAGE "FALSE","FALSE",FWD
+        M_END
+
+
+
+        @// Inverse, unscaled (conjugate twiddle multiply)
+        M_START armSP_FFTInv_CToC_SC16_Radix2_OutOfPlace_unsafe,r4
+        FFTSTAGE "FALSE","TRUE",INV
+        M_END
+
+
+
+        @// Forward, scaled (halving butterflies)
+        M_START armSP_FFTFwd_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe,r4
+        FFTSTAGE "TRUE","FALSE",FWDSFS
+        M_END
+
+
+
+        @// Inverse, scaled
+        M_START armSP_FFTInv_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe,r4
+        FFTSTAGE "TRUE","TRUE",INVSFS
+        M_END
+
+
+
+
+
+    .END
--- a/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_fs_unsafe_s.S
+++ b/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_fs_unsafe_s.S
@ -0,0 +1,314 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This file was originally licensed as follows. It has been
+@//  relicensed with permission from the copyright holders.
+
+@//
+@//
+@// File Name:  armSP_FFT_CToC_SC16_Radix4_fs_unsafe_s.s
+@// OpenMAX DL: v1.0.2
+@// Last Modified Revision:   7761
+@// Last Modified Date:       Wed, 26 Sep 2007
+@//
+@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@//
+@//
+@//
+@// Description:
+@// Compute a first stage Radix 4 FFT stage for a N point complex signal
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+
+@// Import symbols required from other files
+@// (For example tables)
+
+
+
+
+@// Set debugging level
+@//DEBUG_ON    SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+
+
+
+@// Guarding implementation by the processor name
+
+
+@//Input Registers
+
+#define pSrc                            r0
+#define pDst                            r2
+#define pTwiddle                        r1
+#define pPingPongBuf                    r5
+#define subFFTNum                       r6
+#define subFFTSize                      r7
+
+
+@//Output Registers
+
+
+@//Local Scratch Registers
+
+#define grpSize                         r3
+@// Reuse grpSize as setCount
+#define setCount                        r3
+#define pointStep                       r4
+#define outPointStep                    r4
+#define setStep                         r8
+#define step1                           r9
+#define step3                           r10
+
+@// Neon Registers
+
+#define dXr0                            D0.S16
+#define dXi0                            D1.S16
+#define dXr1                            D2.S16
+#define dXi1                            D3.S16
+#define dXr2                            D4.S16
+#define dXi2                            D5.S16
+#define dXr3                            D6.S16
+#define dXi3                            D7.S16
+#define dYr0                            D8.S16
+#define dYi0                            D9.S16
+#define dYr1                            D10.S16
+#define dYi1                            D11.S16
+#define dYr2                            D12.S16
+#define dYi2                            D13.S16
+#define dYr3                            D14.S16
+#define dYi3                            D15.S16
+#define dZr0                            D16.S16
+#define dZi0                            D17.S16
+#define dZr1                            D18.S16
+#define dZi1                            D19.S16
+#define dZr2                            D20.S16
+#define dZi2                            D21.S16
+#define dZr3                            D22.S16
+#define dZi3                            D23.S16
+#define qY0                             Q4.S16
+#define qY2                             Q6.S16
+#define qX0                             Q0.S16
+#define qX2                             Q2.S16
+
+#define qY1                             Q5.S16
+#define qY3                             Q7.S16
+#define qX1                             Q1.S16
+#define qX3                             Q3.S16
+#define qZ0                             Q8.S16
+#define qZ1                             Q9.S16
+
+
+        @// FFTSTAGE scaled, inverse, name
+        @//   First-stage radix-4 butterfly over 16-bit complex elements,
+        @//   four sets per loop pass. The loop is software pipelined: each
+        @//   pass finishes the butterflies for the data already in qX0..qX3
+        @//   while loading the inputs of the next pass.
+        @//     scaled  - "TRUE" selects halving VHADD/VHSUB arithmetic.
+        @//     inverse - "TRUE" swaps the +j/-j rotation of outputs 1 and 3.
+        @//     name    - suffix appended to the labels so every expansion
+        @//               gets unique labels.
+        @//   In : pSrc, pDst (both 16-byte aligned, see the :128 hints),
+        @//        pPingPongBuf, subFFTNum.
+        @//   Out: subFFTSize = 4, subFFTNum divided by 4, pSrc = start of the
+        @//        data just written, pDst = pPingPongBuf.
+        @//   Scratch: r3, r4, r8, r9, r10, D0-D23, flags.
+        .MACRO FFTSTAGE scaled, inverse, name
+
+        @// Define stack arguments
+
+        @// pointStep = subFFTNum bytes = (subFFTNum/4 elements) * 4 bytes,
+        @// i.e. the byte distance between the four inputs of one butterfly.
+        MOV     pointStep,subFFTNum
+        @// Update pSubFFTSize and pSubFFTNum regs
+
+
+        VLD2    {dXr0,dXi0},[pSrc :128],pointStep          @//  data[0]
+        @// Note: setCount = subFFTNum/4 (reuse the grpSize reg for setCount)
+        LSR     grpSize,subFFTNum,#2
+        MOV     subFFTNum,grpSize
+
+
+        @// Note: outPointStep = pointStep for firststage (both are r4)
+        VLD2    {dXr1,dXi1},[pSrc :128],pointStep          @//  data[1]
+
+
+        @// Calculate the step of input data for the next set
+        @//MOV     setStep,pointStep,LSL #1
+        MOV     setStep,grpSize,LSL #3                @// setStep = 2*pointStep
+        VLD2    {dXr2,dXi2},[pSrc :128],pointStep          @//  data[2]
+        MOV     step1,setStep                         @// step1 = 2*pointStep
+        ADD     setStep,setStep,pointStep             @// setStep = 3*pointStep
+        RSB     setStep,setStep,#16                   @// setStep = - 3*pointStep+16
+
+
+        VLD2    {dXr3,dXi3},[pSrc :128],setStep            @//  data[3]
+        MOV     subFFTSize,#4                         @// subFFTSize = 4 after the first radix-4 stage
+
+
+        .ifeqs  "\scaled", "TRUE"
+            VHADD    qY0,qX0,qX2             @// u0
+        .ELSE
+            VADD   qY0,qX0,qX2               @// u0
+        .ENDIF
+        RSB     step3,pointStep,#0                    @// step3 = -pointStep
+
+        @// grp = 0 a special case since all the twiddle factors are 1
+        @// Loop on the sets: 4 sets at a time
+
+grpZeroSetLoop\name:
+
+
+        .ifeqs "\scaled", "TRUE"
+
+            @// finish first stage of 4 point FFT
+
+            VHSUB    qY2,qX0,qX2             @// u1
+            SUBS    setCount,setCount,#4                    @// decrement the set loop counter
+
+            VLD2    {dXr0,dXi0},[pSrc :128],step1          @//  data[0]
+            VHADD    qY1,qX1,qX3             @// u2
+            VLD2    {dXr2,dXi2},[pSrc :128],step3
+            VHSUB    qY3,qX1,qX3             @// u3
+
+
+
+            @// finish second stage of 4 point FFT
+
+            VLD2    {dXr1,dXi1},[pSrc :128],step1          @//  data[1]
+            VHADD    qZ0,qY0,qY1             @// y0
+
+            @// On the last pass (setCount <= 0, flags still from the SUBS
+            @// above) this prefetch would start at pSrc_base + 4*pointStep,
+            @// i.e. past the end of the input buffer, so skip it. Neither
+            @// qX3 nor pSrc is used again once the loop exits.
+            BLE     radix4fsSkipLastLoad\name
+            VLD2    {dXr3,dXi3},[pSrc :128],setStep
+radix4fsSkipLastLoad\name:
+
+
+            .ifeqs  "\inverse", "TRUE"
+
+                VHSUB    dZr3,dYr2,dYi3                  @// y3
+                VHADD    dZi3,dYi2,dYr3
+                VST2    {dZr0,dZi0},[pDst :128],outPointStep
+
+                VHSUB    qZ1,qY0,qY1                     @// y2
+                VST2    {dZr3,dZi3},[pDst :128],outPointStep
+
+                VHADD    dZr2,dYr2,dYi3                  @// y1
+                VST2    {dZr1,dZi1},[pDst :128],outPointStep
+                VHSUB    dZi2,dYi2,dYr3
+
+                VHADD    qY0,qX0,qX2                     @// u0 (next loop)
+                VST2    {dZr2,dZi2},[pDst :128],setStep
+
+
+            .ELSE
+
+                VHADD    dZr2,dYr2,dYi3                  @// y1
+                VHSUB    dZi2,dYi2,dYr3
+
+                VST2    {dZr0,dZi0},[pDst :128],outPointStep
+                VHSUB    qZ1,qY0,qY1                     @// y2
+
+                VST2    {dZr2,dZi2},[pDst :128],outPointStep
+                VHSUB    dZr3,dYr2,dYi3                  @// y3
+                VHADD    dZi3,dYi2,dYr3
+                VST2    {dZr1,dZi1},[pDst :128],outPointStep
+                VHADD    qY0,qX0,qX2                     @// u0 (next loop)
+                VST2    {dZr3,dZi3},[pDst :128],setStep
+
+            .ENDIF
+
+
+        .ELSE
+
+            @// finish first stage of 4 point FFT
+
+            VSUB    qY2,qX0,qX2             @// u1
+            SUBS    setCount,setCount,#4                    @// decrement the set loop counter
+
+            VLD2    {dXr0,dXi0},[pSrc :128],step1          @//  data[0]
+            VADD    qY1,qX1,qX3             @// u2
+            VLD2    {dXr2,dXi2},[pSrc :128],step3
+            VSUB    qY3,qX1,qX3             @// u3
+
+
+
+            @// finish second stage of 4 point FFT
+
+            VLD2    {dXr1,dXi1},[pSrc :128],step1          @//  data[1]
+            VADD    qZ0,qY0,qY1             @// y0
+
+            @// On the last pass (setCount <= 0, flags still from the SUBS
+            @// above) this prefetch would start at pSrc_base + 4*pointStep,
+            @// i.e. past the end of the input buffer, so skip it. Neither
+            @// qX3 nor pSrc is used again once the loop exits.
+            BLE     radix4fsSkipLastLoad\name
+            VLD2    {dXr3,dXi3},[pSrc :128],setStep
+radix4fsSkipLastLoad\name:
+
+
+            .ifeqs  "\inverse", "TRUE"
+
+                VSUB    dZr3,dYr2,dYi3                  @// y3
+                VADD    dZi3,dYi2,dYr3
+                VST2    {dZr0,dZi0},[pDst :128],outPointStep
+
+                VSUB    qZ1,qY0,qY1                     @// y2
+                VST2    {dZr3,dZi3},[pDst :128],outPointStep
+
+                VADD    dZr2,dYr2,dYi3                  @// y1
+                VST2    {dZr1,dZi1},[pDst :128],outPointStep
+                VSUB    dZi2,dYi2,dYr3
+
+                VADD    qY0,qX0,qX2                     @// u0 (next loop)
+                VST2    {dZr2,dZi2},[pDst :128],setStep
+
+
+            .ELSE
+
+                VADD    dZr2,dYr2,dYi3                  @// y1
+                VSUB    dZi2,dYi2,dYr3
+
+                VST2    {dZr0,dZi0},[pDst :128],outPointStep
+                VSUB    qZ1,qY0,qY1                     @// y2
+
+                VST2    {dZr2,dZi2},[pDst :128],outPointStep
+                VSUB    dZr3,dYr2,dYi3                  @// y3
+                VADD    dZi3,dYi2,dYr3
+                VST2    {dZr1,dZi1},[pDst :128],outPointStep
+                VADD    qY0,qX0,qX2                     @// u0 (next loop)
+                VST2    {dZr3,dZi3},[pDst :128],setStep
+
+            .ENDIF
+
+
+        .ENDIF
+
+        @// Flags are still those of the SUBS at the top of the pass: none of
+        @// the NEON instructions or the BLE in between writes the condition flags.
+        BGT     grpZeroSetLoop\name
+
+
+        @// reset pSrc to pDst for the next stage
+        @// pDst advanced a net 16 bytes per pass, pointStep bytes in total,
+        @// so pDst - pointStep is the start of this stage's output.
+        SUB     pSrc,pDst,pointStep                     @// pSrc = pDst - pointStep
+        MOV     pDst,pPingPongBuf
+
+
+        .endm
+
+
+
+        @// Entry points: each function body is a single FFTSTAGE expansion.
+        @// Arguments are (scaled, inverse, label suffix); the suffix keeps
+        @// grpZeroSetLoop unique in every expansion.
+        @// NOTE(review): M_START/M_END are defined outside this file
+        @// (presumably dl/api/armCOMM_s.h) - confirm the meaning of ",r4",
+        @// since the macro body also writes r3, r8-r10 and D8-D15.
+
+        @// Forward, unscaled
+        M_START armSP_FFTFwd_CToC_SC16_Radix4_fs_OutOfPlace_unsafe,r4
+        FFTSTAGE "FALSE","FALSE",FWD
+        M_END
+
+
+
+        @// Inverse, unscaled
+        M_START armSP_FFTInv_CToC_SC16_Radix4_fs_OutOfPlace_unsafe,r4
+        FFTSTAGE "FALSE","TRUE",INV
+        M_END
+
+
+        @// Forward, scaled (halving arithmetic)
+        M_START armSP_FFTFwd_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe,r4
+        FFTSTAGE "TRUE","FALSE",FWDSFS
+        M_END
+
+
+        @// Inverse, scaled
+        M_START armSP_FFTInv_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe,r4
+        FFTSTAGE "TRUE","TRUE",INVSFS
+        M_END
+
+
+
+
+
+    .END
--- a/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_ls_unsafe_s.S
+++ b/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_ls_unsafe_s.S
@ -0,0 +1,410 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This file was originally licensed as follows. It has been
+@//  relicensed with permission from the copyright holders.
+
+@//
+@//
+@// File Name:  armSP_FFT_CToC_SC16_Radix4_ls_unsafe_s.s
+@// OpenMAX DL: v1.0.2
+@// Last Modified Revision:   7765
+@// Last Modified Date:       Thu, 27 Sep 2007
+@//
+@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@//
+@//
+@//
+@// Description:
+@// Compute a Radix 4 FFT stage for a N point complex signal
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+
+
+@// Import symbols required from other files
+@// (For example tables)
+
+
+
+
+@// Set debugging level
+@//DEBUG_ON    SETL {TRUE}
+
+
+@// Guarding implementation by the processor name
+
+
+
+
+
+
+@// Guarding implementation by the processor name
+
+
+@// Import symbols required from other files
+@// (For example tables)
+    @//IMPORT  armAAC_constTable
+
+@//Input Registers
+@// pSrc, pDst, pTwiddle and subFFTSize are read by FFTSTAGE below;
+@// subFFTNum is only written there (set to 1).
+
+#define pSrc                            r0
+#define pDst                            r2
+#define pTwiddle                        r1
+#define subFFTNum                       r6
+#define subFFTSize                      r7
+
+
+
+@//Output Registers
+@// None are named separately: FFTSTAGE updates pSrc, pDst, subFFTNum and
+@// subFFTSize in place.
+
+
+@//Local Scratch Registers
+@// pTmp shares r4 with grpCount; it is only used once the group loop is done.
+
+#define outPointStep                    r3
+#define grpCount                        r4
+#define dstStep                         r5
+#define pw1                             r8
+#define pw2                             r9
+#define pw3                             r10
+#define pTmp                            r4
+
+
+@// Neon Registers
+
+@// D0-D7 carry two sets of names. dButterfly* describes the contents right
+@// after the VLD4 loads (points 0/2 and 1/3 of two groups per register);
+@// dX* describes the same registers after the VUZP de-interleave.
+#define dButterfly1Real02               D0.S16
+#define dButterfly1Imag02               D1.S16
+#define dButterfly1Real13               D2.S16
+#define dButterfly1Imag13               D3.S16
+#define dButterfly2Real02               D4.S16
+#define dButterfly2Imag02               D5.S16
+#define dButterfly2Real13               D6.S16
+#define dButterfly2Imag13               D7.S16
+#define dXr0                            D0.S16
+#define dXi0                            D1.S16
+#define dXr1                            D2.S16
+#define dXi1                            D3.S16
+#define dXr2                            D4.S16
+#define dXi2                            D5.S16
+#define dXr3                            D6.S16
+#define dXi3                            D7.S16
+
+@// 32-bit lane views of the twiddle registers D8-D13. FFTSTAGE only uses
+@// the W3 pair, as destinations of its VLD3 loads.
+#define dW1rS32                         D8.S32
+#define dW1iS32                         D9.S32
+#define dW2rS32                         D10.S32
+#define dW2iS32                         D11.S32
+#define dW3rS32                         D12.S32
+#define dW3iS32                         D13.S32
+
+@// 16-bit lane views of the same twiddle registers, used by the multiplies.
+#define dW1r                            D8.S16
+#define dW1i                            D9.S16
+#define dW2r                            D10.S16
+#define dW2i                            D11.S16
+#define dW3r                            D12.S16
+#define dW3i                            D13.S16
+
+@// Dump registers for the unwanted lanes of the VLD4/VLD3 twiddle loads.
+@// dTmp0/dTmp1 overlay dW3r/dW3i (D12/D13); dTmp2S32/dTmp3S32 overlay
+@// dYr3/dYi3 (D14/D15).
+#define dTmp0                           D12.S16
+#define dTmp1                           D13.S16
+#define dTmp1S32                        D13.S32
+#define dTmp2S32                        D14.S32
+#define dTmp3S32                        D15.S32
+
+@// Results of the first butterfly level. The d and q names agree:
+@// qY0 = dYr0:dYi0 (Q9), qY1 = Q8, qY2 = Q10, qY3 = Q7.
+#define dYr0                            D18.S16
+#define dYi0                            D19.S16
+#define dYr1                            D16.S16
+#define dYi1                            D17.S16
+#define dYr2                            D20.S16
+#define dYi2                            D21.S16
+#define dYr3                            D14.S16
+#define dYi3                            D15.S16
+#define qY0                             Q9.S16
+#define qY1                             Q8.S16
+#define qY2                             Q10.S16
+#define qY3                             Q7.S16
+
+@// Quad views of the input points: qXn = dXrn:dXin.
+#define qX0                             Q0.S16
+#define qX1                             Q1.S16
+#define qX2                             Q2.S16
+#define qX3                             Q3.S16
+
+@// 32-bit accumulators for the twiddle products. They overlay the qY
+@// registers (Q7-Q10), so each product is narrowed into a dZ register
+@// before the Y values are formed.
+#define qT0                             Q9.S32
+#define qT1                             Q10.S32
+#define qT2                             Q7.S32
+#define qT3                             Q8.S32
+
+@// dZ1-dZ3 first hold the narrowed twiddle products, then all dZ registers
+@// hold the butterfly outputs that are stored to pDst.
+#define dZr0                            D22.S16
+#define dZi0                            D23.S16
+#define dZr1                            D24.S16
+#define dZi1                            D25.S16
+#define dZr2                            D26.S16
+#define dZi2                            D27.S16
+#define dZr3                            D28.S16
+#define dZi3                            D29.S16
+
+@// Quad views of the outputs: qZn = dZrn:dZin.
+#define qZ0                             Q11.S16
+#define qZ1                             Q12.S16
+#define qZ2                             Q13.S16
+#define qZ3                             Q14.S16
+
+
+        .MACRO FFTSTAGE scaled, inverse , name
+
+        @// Last-stage radix-4 pass over 16-bit complex data, four groups
+        @// (16 points) per loop iteration.
+        @//   scaled  : "TRUE" uses halving VHADD/VHSUB in both butterfly levels,
+        @//             so every output is divided by 4
+        @//   inverse : "TRUE" multiplies by the conjugated twiddles and stores
+        @//             outputs 1 and 3 in swapped positions
+        @//   name    : suffix appended to grpLoop so each expansion gets its
+        @//             own label
+        @// Reads  pSrc, pDst, pTwiddle, subFFTSize.
+        @// Writes subFFTNum (=1), subFFTSize (x4), pSrc and pDst (swapped),
+        @//        r3-r5, r8-r10, D0-D29 and the flags.
+        @// NOTE(review): D8-D15 are callee-saved under the AAPCS and are not
+        @// preserved here - presumably the caller saves them; confirm.
+        @// NOTE(review): output 0 is computed as X0 - Z1 - Z2 - Z3, which
+        @// suggests the twiddle table stores negated factors - confirm against
+        @// the table generator.
+        @// No stack arguments are used by this stage
+
+        @// Preload w^2 for groups 0-3. VLD4 de-interleaves four halfwords at a
+        @// time, so dW2r/dW2i receive every second table entry and the entries
+        @// in between are dumped into dTmp0/dTmp1.
+        MOV     pw2,pTwiddle
+        VLD4 {dW2r,dW2i,dTmp0,dTmp1},[pw2 :256]!
+
+        MOV     pw3,pTwiddle
+        MOV     pw1,pTwiddle
+        @// outPointStep is a byte stride: subFFTSize complex points of
+        @// 4 bytes (16-bit re + 16-bit im) each
+        MOV     outPointStep,subFFTSize,LSL #2
+
+        @// Preload w^3, first half: with 32-bit lanes each lane is one complex
+        @// twiddle, so VLD3 keeps every third entry in dW3rS32.
+        VLD3 {dW3rS32,dTmp1S32,dTmp2S32},[pw3 :64]!
+        MOV     subFFTNum,#1                            @//after the last stage
+        LSL     grpCount,subFFTSize,#2
+
+
+        @// Preload w^3, second half (next two entries at stride 3)
+        VLD3 {dW3iS32,dTmp2S32,dTmp3S32},[pw3 :64]!
+
+        @// update subFFTSize for the next stage
+        MOV     subFFTSize,grpCount
+        MOV     dstStep,outPointStep,LSL #1
+
+        @// Preload w^1 for groups 0-3 (four consecutive table entries)
+        VLD2 {dW1r,dW1i}, [pw1 :128]!
+
+
+        ADD     dstStep,dstStep,outPointStep                @// dstStep = 3*outPointStep
+        RSB     dstStep,dstStep,#16                         @// dstStep = - 3*outPointStep+16
+
+        @// Preload the 16 input points of the first four groups (2 x 32 bytes)
+        VLD4     {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
+        VLD4     {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
+
+        @// Process 4 groups at a time
+
+grpLoop\name:
+
+
+        @// Rearrange the third twiddle: split the re/im halfwords of the four
+        @// preloaded entries into dW3r and dW3i
+        VUZP    dW3r,dW3i
+        @// The flags set here are consumed by the BGT at the bottom of the loop;
+        @// nothing in between changes them.
+        SUBS    grpCount,grpCount,#16                    @// grpCount is multiplied by 4
+
+
+        @// De-interleave so that each dX register holds one point of all
+        @// four groups
+        VUZP     dButterfly1Real13, dButterfly2Real13        @// B.r D.r
+        VUZP     dButterfly1Imag13, dButterfly2Imag13        @// B.i D.i
+        VUZP     dButterfly1Real02, dButterfly2Real02        @// A.r C.r
+        VUZP     dButterfly1Imag02, dButterfly2Imag02        @// A.i C.i
+
+
+        @// qT0:qT1 = X1 * W1 (inverse: X1 * conj(W1)), 32-bit products
+        .ifeqs  "\inverse", "TRUE"
+            VMULL   qT0,dXr1,dW1r
+            VMLAL   qT0,dXi1,dW1i                       @// real part
+            VMULL   qT1,dXi1,dW1r
+            VMLSL   qT1,dXr1,dW1i                       @// imag part
+
+        .ELSE
+            VMULL   qT0,dXr1,dW1r
+            VMLSL   qT0,dXi1,dW1i                       @// real part
+            VMULL   qT1,dXi1,dW1r
+            VMLAL   qT1,dXr1,dW1i                       @// imag part
+
+        .ENDIF
+
+        @// Load the first twiddle for 4 groups : w^1
+        @// w^1 twiddle (i+0,i+1,i+2,i+3)       for group 0,1,2,3
+        @// (this is the preload for the next iteration)
+
+        VLD2 {dW1r,dW1i}, [pw1 :128]!
+
+        @// qT2:qT3 = X2 * W2 (inverse: X2 * conj(W2))
+        .ifeqs  "\inverse", "TRUE"
+            VMULL   qT2,dXr2,dW2r
+            VMLAL   qT2,dXi2,dW2i                       @// real part
+            VMULL   qT3,dXi2,dW2r
+            VMLSL   qT3,dXr2,dW2i                       @// imag part
+
+        .ELSE
+            VMULL   qT2,dXr2,dW2r
+            VMLSL   qT2,dXi2,dW2i                       @// real part
+            VMULL   qT3,dXi2,dW2r
+            VMLAL   qT3,dXr2,dW2i                       @// imag part
+
+        .ENDIF
+
+        @// Z1 = round(X1*W1 >> 15); this frees qT0/qT1 for the third product
+        VRSHRN  dZr1,qT0,#15
+        VRSHRN  dZi1,qT1,#15
+
+
+
+        @// qT0:qT1 = X3 * W3 (inverse: X3 * conj(W3))
+        .ifeqs  "\inverse", "TRUE"
+            VMULL   qT0,dXr3,dW3r
+            VMLAL   qT0,dXi3,dW3i                       @// real part
+            VMULL   qT1,dXi3,dW3r
+            VMLSL   qT1,dXr3,dW3i                       @// imag part
+
+        .ELSE
+            VMULL   qT0,dXr3,dW3r
+            VMLSL   qT0,dXi3,dW3i                       @// real part
+            VMULL   qT1,dXi3,dW3r
+            VMLAL   qT1,dXr3,dW3i                       @// imag part
+
+        .ENDIF
+
+        @// Load the second twiddle for 4 groups : w^2
+        @// w^2 twiddle (2i+0,2i+2,2i+4,2i+6)   for group 0,1,2,3
+        @// This overwrites dW3r/dW3i through dTmp0/dTmp1, so it has to stay
+        @// after the W3 multiplies above.
+        VLD4 {dW2r,dW2i,dTmp0,dTmp1},[pw2 :256]!
+
+
+        @// Z2 = round(X2*W2 >> 15); must precede the VLD3 below, which
+        @// overwrites half of qT2 through dTmp2S32
+        VRSHRN  dZr2,qT2,#15
+        VRSHRN  dZi2,qT3,#15
+
+        @// Load the third twiddle for 4 groups : w^3
+        @// w^3 twiddle (3i+0,3i+3,3i+6,3i+9)   for group 0,1,2,3
+
+        VLD3 {dW3rS32,dTmp1S32,dTmp2S32},[pw3 :64]!
+
+        @// Z3 = round(X3*W3 >> 15)
+        VRSHRN  dZr3,qT0,#15
+        VRSHRN  dZi3,qT1,#15
+
+        VLD3 {dW3iS32,dTmp2S32,dTmp3S32},[pw3 :64]!
+
+        @// NOTE(review): the data and twiddle preloads run on the final
+        @// iteration too, reading past the last group (pSrc by 64 bytes) -
+        @// confirm that the buffers tolerate this.
+        .ifeqs "\scaled", "TRUE"
+
+            @// finish first stage of 4 point FFT
+            @// Y0 = (X0+Z2)/2, Y2 = (X0-Z2)/2, Y1 = (Z1+Z3)/2, Y3 = (Z1-Z3)/2
+
+            VHADD    qY0,qX0,qZ2
+            VHSUB    qY2,qX0,qZ2
+            VHADD    qY1,qZ1,qZ3
+            VLD4    {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
+
+            VHSUB    qY3,qZ1,qZ3
+
+            @// finish second stage of 4 point FFT
+            @// Z0 = (Y2-Y1)/2 and Z2 = (Y2+Y1)/2; Z1 and Z3 combine Y0 with
+            @// +/- j*Y3 below
+
+            VHSUB    qZ0,qY2,qY1
+            VHADD    qZ2,qY2,qY1
+            VLD4     {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
+
+
+            .ifeqs "\inverse", "TRUE"
+
+                @// Store order for the inverse transform: Z0, Z3, Z2, Z1
+                VHADD    dZr3,dYr0,dYi3                          @// y3 = u0-ju3
+                VST2    {dZr0,dZi0},[pDst :128],outPointStep
+                VHSUB    dZi3,dYi0,dYr3
+
+                VHSUB    dZr1,dYr0,dYi3                          @// y1 = u0+ju3
+                VHADD    dZi1,dYi0,dYr3
+                VST2    {dZr3,dZi3},[pDst :128],outPointStep
+                VST2    {dZr2,dZi2},[pDst :128],outPointStep
+                VST2    {dZr1,dZi1},[pDst :128],dstStep              @// dstStep = -3*outPointStep + 16
+
+            .ELSE
+
+                @// Store order for the forward transform: Z0, Z1, Z2, Z3
+                VHSUB    dZr1,dYr0,dYi3                          @// y1 = u0+ju3
+                VHADD    dZi1,dYi0,dYr3
+
+                VHADD    dZr3,dYr0,dYi3                          @// y3 = u0-ju3
+                VST2    {dZr0,dZi0},[pDst :128],outPointStep
+                VHSUB    dZi3,dYi0,dYr3
+                VST2    {dZr1,dZi1},[pDst :128],outPointStep
+                VST2    {dZr2,dZi2},[pDst :128],outPointStep
+                VST2    {dZr3,dZi3},[pDst :128],dstStep              @// dstStep = -3*outPointStep + 16
+
+            .ENDIF
+
+        .ELSE
+
+            @// finish first stage of 4 point FFT
+            @// Y0 = X0+Z2, Y2 = X0-Z2, Y1 = Z1+Z3, Y3 = Z1-Z3 (no halving)
+
+            VADD    qY0,qX0,qZ2
+            VSUB    qY2,qX0,qZ2
+            VADD    qY1,qZ1,qZ3
+            VLD4    {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
+
+            VSUB    qY3,qZ1,qZ3
+
+            @// finish second stage of 4 point FFT
+            @// Z0 = Y2-Y1 and Z2 = Y2+Y1; Z1 and Z3 combine Y0 with +/- j*Y3
+
+            VSUB    qZ0,qY2,qY1
+            VADD    qZ2,qY2,qY1
+            VLD4     {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
+
+
+            .ifeqs "\inverse", "TRUE"
+
+                @// Store order for the inverse transform: Z0, Z3, Z2, Z1
+                VADD    dZr3,dYr0,dYi3                          @// y3 = u0-ju3
+                VST2    {dZr0,dZi0},[pDst :128],outPointStep
+                VSUB    dZi3,dYi0,dYr3
+
+                VSUB    dZr1,dYr0,dYi3                          @// y1 = u0+ju3
+                VADD    dZi1,dYi0,dYr3
+                VST2    {dZr3,dZi3},[pDst :128],outPointStep
+                VST2    {dZr2,dZi2},[pDst :128],outPointStep
+                VST2    {dZr1,dZi1},[pDst :128],dstStep              @// dstStep = -3*outPointStep + 16
+
+            .ELSE
+
+                @// Store order for the forward transform: Z0, Z1, Z2, Z3
+                VSUB    dZr1,dYr0,dYi3                          @// y1 = u0+ju3
+                VADD    dZi1,dYi0,dYr3
+
+                VADD    dZr3,dYr0,dYi3                          @// y3 = u0-ju3
+                VST2    {dZr0,dZi0},[pDst :128],outPointStep
+                VSUB    dZi3,dYi0,dYr3
+                VST2    {dZr1,dZi1},[pDst :128],outPointStep
+                VST2    {dZr2,dZi2},[pDst :128],outPointStep
+                VST2    {dZr3,dZi3},[pDst :128],dstStep              @// dstStep = -3*outPointStep + 16
+
+            .ENDIF
+
+
+
+
+        .ENDIF
+
+        @// Loop while grpCount is still positive (flags from the SUBS at the
+        @// top of the loop)
+        BGT     grpLoop\name
+
+
+        @// Reset and Swap pSrc and pDst for the next stage
+        @// pTmp reuses r4, which is free now that the loop counter is done
+        MOV     pTmp,pDst
+        SUB     pSrc,pSrc,#64                       @// Extra increment currently done in the loop
+        SUB     pDst,pSrc,outPointStep,LSL #2       @// pDst -= size; pSrc -= 4*size bytes
+        SUB     pSrc,pTmp,outPointStep
+
+        .endm
+
+
+        @// Forward last stage without scaling: FFTSTAGE is expanded with
+        @// scaled="FALSE", inverse="FALSE" and FWD as the loop-label suffix.
+        @// M_START/M_END come from armCOMM_s.h; the r4 argument presumably
+        @// selects the core registers to preserve - confirm there.
+        M_START armSP_FFTFwd_CToC_SC16_Radix4_ls_OutOfPlace_unsafe,r4
+        FFTSTAGE "FALSE","FALSE",FWD
+        M_END
+
+
+        @// Inverse last stage without scaling: FFTSTAGE is expanded with
+        @// scaled="FALSE", inverse="TRUE" and INV as the loop-label suffix.
+        @// M_START/M_END come from armCOMM_s.h; the r4 argument presumably
+        @// selects the core registers to preserve - confirm there.
+        M_START armSP_FFTInv_CToC_SC16_Radix4_ls_OutOfPlace_unsafe,r4
+        FFTSTAGE "FALSE","TRUE",INV
+        M_END
+
+
+        @// Forward last stage with scaling (halving adds, outputs divided by 4):
+        @// FFTSTAGE is expanded with scaled="TRUE", inverse="FALSE" and FWDSFS
+        @// as the loop-label suffix.
+        @// M_START/M_END come from armCOMM_s.h; the r4 argument presumably
+        @// selects the core registers to preserve - confirm there.
+        M_START armSP_FFTFwd_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe,r4
+        FFTSTAGE "TRUE","FALSE",FWDSFS
+        M_END
+
+
+        @// Inverse last stage with scaling (halving adds, outputs divided by 4):
+        @// FFTSTAGE is expanded with scaled="TRUE", inverse="TRUE" and INVSFS
+        @// as the loop-label suffix.
+        @// M_START/M_END come from armCOMM_s.h; the r4 argument presumably
+        @// selects the core registers to preserve - confirm there.
+        M_START armSP_FFTInv_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe,r4
+        FFTSTAGE "TRUE","TRUE",INVSFS
+        M_END
+
+
+
+
+
+
+    .END
--- a/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_unsafe_s.S
+++ b/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix4_unsafe_s.S
@ -0,0 +1,400 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This file was originally licensed as follows. It has been
+@//  relicensed with permission from the copyright holders.
+
+@//
+@//
+@// File Name:  armSP_FFT_CToC_SC16_Radix4_unsafe_s.s
+@// OpenMAX DL: v1.0.2
+@// Last Modified Revision:   7761
+@// Last Modified Date:       Wed, 26 Sep 2007
+@//
+@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@//
+@//
+@//
+@// Description:
+@// Compute a Radix 4 FFT stage for a N point complex signal
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+
+
+
+@// Import symbols required from other files
+@// (For example tables)
+
+
+
+
+@// Set debugging level
+@//DEBUG_ON    SETL {TRUE}
+
+
+@// Guarding implementation by the processor name
+
+
+
+    @// Guarding implementation by the processor name
+
+
+@// Import symbols required from other files
+@// (For example tables)
+
+
+@//Input Registers
+@// All five are read by FFTSTAGE below; subFFTNum and subFFTSize are also
+@// updated there for the next stage.
+
+#define pSrc                            r0
+#define pDst                            r2
+#define pTwiddle                        r1
+#define subFFTNum                       r6
+#define subFFTSize                      r7
+
+
+
+@//Output Registers
+@// None are named separately: FFTSTAGE updates pSrc, pDst, subFFTNum and
+@// subFFTSize in place.
+
+
+@//Local Scratch Registers
+@// t1 shares r3 with grpCount; it is only used after the group loop is done.
+@// setCount lives in r14 (lr), so the return address has to be saved by the
+@// surrounding prologue - presumably M_START does this; confirm.
+
+#define grpCount                        r3
+#define pointStep                       r4
+#define outPointStep                    r5
+#define stepTwiddle                     r12
+#define setCount                        r14
+#define srcStep                         r8
+#define setStep                         r9
+#define dstStep                         r10
+#define twStep                          r11
+#define t1                              r3
+
+@// Neon Registers
+
+@// One twiddle per register; FFTSTAGE uses lane [0] as the real part and
+@// lane [1] as the imaginary part.
+#define dW1                             D0.S16
+#define dW2                             D1.S16
+#define dW3                             D2.S16
+
+@// Input points data[0..3] of four sets, de-interleaved into re and im
+#define dXr0                            D4.S16
+#define dXi0                            D5.S16
+#define dXr1                            D6.S16
+#define dXi1                            D7.S16
+#define dXr2                            D8.S16
+#define dXi2                            D9.S16
+#define dXr3                            D10.S16
+#define dXi3                            D11.S16
+@// Results of the first butterfly level (qYn = dYrn:dYin)
+#define dYr0                            D12.S16
+#define dYi0                            D13.S16
+#define dYr1                            D14.S16
+#define dYi1                            D15.S16
+#define dYr2                            D16.S16
+#define dYi2                            D17.S16
+#define dYr3                            D18.S16
+#define dYi3                            D19.S16
+@// 32-bit accumulators for the twiddle products. They overlay qY0-qY3
+@// (Q6-Q9), so each product is narrowed into a dZ register before the
+@// Y values are formed.
+#define qT0                             Q8.S32
+#define qT1                             Q9.S32
+#define qT2                             Q6.S32
+#define qT3                             Q7.S32
+
+@// dZ1-dZ3 first hold the narrowed twiddle products, then all dZ registers
+@// hold the butterfly outputs that are stored to pDst (qZn = dZrn:dZin).
+#define dZr0                            D20.S16
+#define dZi0                            D21.S16
+#define dZr1                            D22.S16
+#define dZi1                            D23.S16
+#define dZr2                            D24.S16
+#define dZi2                            D25.S16
+#define dZr3                            D26.S16
+#define dZi3                            D27.S16
+#define qY0                             Q6.S16
+#define qY1                             Q7.S16
+#define qY2                             Q8.S16
+#define qY3                             Q9.S16
+#define qX0                             Q2.S16
+#define qZ0                             Q10.S16
+#define qZ1                             Q11.S16
+#define qZ2                             Q12.S16
+#define qZ3                             Q13.S16
+
+
+        .MACRO FFTSTAGE scaled, inverse , name
+
+        @// General (middle) radix-4 stage over 16-bit complex data. The outer
+        @// loop walks the groups, one twiddle triple per group; the inner loop
+        @// walks the sets inside a group, four sets per iteration.
+        @//   scaled  : "TRUE" uses halving VHADD/VHSUB in both butterfly levels,
+        @//             so every output is divided by 4
+        @//   inverse : "TRUE" multiplies by the conjugated twiddles and swaps
+        @//             which of outputs 2 and 3 is stored second and fourth
+        @//   name    : suffix appended to grpLoop/setLoop so each expansion
+        @//             gets its own labels
+        @// Reads  pSrc, pDst, pTwiddle, subFFTNum, subFFTSize.
+        @// Writes subFFTNum (/4), subFFTSize (x4), pSrc and pDst (swapped),
+        @//        r3-r5, r8-r12, r14, D0-D2, D4-D27 and the flags.
+        @// pTwiddle is moved inside the loop but ends where it started.
+        @// NOTE(review): D8-D15 are callee-saved under the AAPCS and are not
+        @// preserved here - presumably the caller saves them; confirm.
+        @// NOTE(review): output 0 is computed as X0 - Z1 - Z2 - Z3, which
+        @// suggests the twiddle table stores negated factors - confirm against
+        @// the table generator.
+        @// No stack arguments are used by this stage
+
+
+        @// Update grpCount and subFFTSize right away so their registers can be reused
+
+        LSL     grpCount,subFFTSize,#2
+        LSR     subFFTNum,subFFTNum,#2
+        MOV     subFFTSize,grpCount
+
+
+        @// outPointStep = grpCount * subFFTNum; SMULBB multiplies only the
+        @// low 16 bits of each operand
+
+        MOV     stepTwiddle,#0
+        SMULBB  outPointStep,grpCount,subFFTNum
+
+        @// pointStep is the byte distance between data[k] and data[k+1] of
+        @// one butterfly: subFFTNum complex points of 4 bytes each
+
+        LSL     pointStep,subFFTNum,#2                      @// pointStep = 4*subFFTNum bytes
+
+        @// The first group uses table entry 0 for all three twiddles
+        VLD1     dW1,[pTwiddle :64]                             @//[wi | wr]
+        MOV     srcStep,pointStep,LSL #1                    @// srcStep = 2*pointStep
+        VLD1     dW2,[pTwiddle :64]                             @//[wi | wr]
+        ADD     setStep,srcStep,pointStep                   @// setStep = 3*pointStep
+        SUB     srcStep,srcStep,#16                         @// srcStep = 2*pointStep-16
+        VLD1     dW3,[pTwiddle :64]
+        @//RSB     setStep,setStep,#16                      @// setStep = - 3*pointStep+16
+        RSB     setStep,setStep,#0                          @// setStep = - 3*pointStep
+
+        MOV     dstStep,outPointStep,LSL #1
+        ADD     dstStep,dstStep,outPointStep                @// dstStep = 3*outPointStep
+        RSB     dstStep,dstStep,#16                         @// dstStep = - 3*outPointStep+16
+
+
+
+grpLoop\name:
+
+        @// Preload data[0..3] of the first four sets of this group, and set up
+        @// the twiddle pointer and steps for the reload at the end of the group
+        VLD2    {dXr0,dXi0},[pSrc :128],pointStep          @//  data[0]
+        ADD      stepTwiddle,stepTwiddle,pointStep
+        VLD2    {dXr1,dXi1},[pSrc :128],pointStep          @//  data[1]
+        ADD      pTwiddle,pTwiddle,stepTwiddle               @// set pTwiddle to the first point
+        VLD2    {dXr2,dXi2},[pSrc :128],pointStep          @//  data[2]
+        MOV      twStep,stepTwiddle,LSL #2
+        VLD2    {dXr3,dXi3},[pSrc :128],setStep            @//  data[3] & reset pSrc
+
+        SUB      twStep,stepTwiddle,twStep                   @// twStep = -3*stepTwiddle
+
+
+        MOV      setCount,pointStep,LSR #2
+        ADD     pSrc,pSrc,#16                         @// set pSrc to data[0] of the next set
+        ADD     pSrc,pSrc,pointStep                   @// increment to data[1] of the next set
+
+        @// Loop on the sets : 4 at a time
+
+setLoop\name:
+
+        @// The flags set here are consumed by BGT setLoop below; nothing in
+        @// between changes them.
+        SUBS    setCount,setCount,#4                    @// decrement the loop counter
+
+        @// qT0:qT1 = X1 * W1 (inverse: X1 * conj(W1)), 32-bit products
+        .ifeqs  "\inverse", "TRUE"
+            VMULL   qT0,dXr1,dW1[0]
+            VMLAL   qT0,dXi1,dW1[1]                       @// real part
+            VMULL   qT1,dXi1,dW1[0]
+            VMLSL   qT1,dXr1,dW1[1]                       @// imag part
+
+        .ELSE
+            VMULL   qT0,dXr1,dW1[0]
+            VMLSL   qT0,dXi1,dW1[1]                       @// real part
+            VMULL   qT1,dXi1,dW1[0]
+            VMLAL   qT1,dXr1,dW1[1]                       @// imag part
+
+        .ENDIF
+
+        @// X1 has been consumed: preload data[1] of the next four sets
+        VLD2    {dXr1,dXi1},[pSrc :128],pointStep          @//  data[1]
+
+        @// qT2:qT3 = X2 * W2 (inverse: X2 * conj(W2))
+        .ifeqs  "\inverse", "TRUE"
+            VMULL   qT2,dXr2,dW2[0]
+            VMLAL   qT2,dXi2,dW2[1]                       @// real part
+            VMULL   qT3,dXi2,dW2[0]
+            VMLSL   qT3,dXr2,dW2[1]                       @// imag part
+
+        .ELSE
+            VMULL   qT2,dXr2,dW2[0]
+            VMLSL   qT2,dXi2,dW2[1]                       @// real part
+            VMULL   qT3,dXi2,dW2[0]
+            VMLAL   qT3,dXr2,dW2[1]                       @// imag part
+
+        .ENDIF
+
+        @// Z1 = round(X1*W1 >> 15); this frees qT0/qT1 for the third product
+        VRSHRN  dZr1,qT0,#15
+        VRSHRN  dZi1,qT1,#15
+
+
+        VLD2    {dXr2,dXi2},[pSrc :128],pointStep          @//  data[2]
+
+        @// qT0:qT1 = X3 * W3 (inverse: X3 * conj(W3))
+        .ifeqs  "\inverse", "TRUE"
+            VMULL   qT0,dXr3,dW3[0]
+            VMLAL   qT0,dXi3,dW3[1]                       @// real part
+            VMULL   qT1,dXi3,dW3[0]
+            VMLSL   qT1,dXr3,dW3[1]                       @// imag part
+
+        .ELSE
+            VMULL   qT0,dXr3,dW3[0]
+            VMLSL   qT0,dXi3,dW3[1]                       @// real part
+            VMULL   qT1,dXi3,dW3[0]
+            VMLAL   qT1,dXr3,dW3[1]                       @// imag part
+
+        .ENDIF
+
+        @// Z2 = round(X2*W2 >> 15)
+        VRSHRN  dZr2,qT2,#15
+        VRSHRN  dZi2,qT3,#15
+
+
+        @// Z3 = round(X3*W3 >> 15)
+        VRSHRN  dZr3,qT0,#15
+        VRSHRN  dZi3,qT1,#15
+        VLD2    {dXr3,dXi3},[pSrc :128],setStep            @//  data[3] & update pSrc for the next set
+
+
+        .ifeqs "\scaled", "TRUE"
+
+            @// finish first stage of 4 point FFT
+            @// Y0 = (X0+Z2)/2, Y2 = (X0-Z2)/2, Y1 = (Z1+Z3)/2, Y3 = (Z1-Z3)/2
+            VHADD    qY0,qX0,qZ2
+            VHSUB    qY2,qX0,qZ2
+
+            VLD2    {dXr0,dXi0},[pSrc :128]!          @//  data[0]
+            VHADD    qY1,qZ1,qZ3
+            VHSUB    qY3,qZ1,qZ3
+
+
+            @// finish second stage of 4 point FFT
+            @// Z0 = (Y2-Y1)/2 and Z1 = (Y2+Y1)/2; Z2 and Z3 combine Y0 with
+            @// +/- j*Y3
+
+            .ifeqs  "\inverse", "TRUE"
+
+                @// Store order for the inverse transform: Z0, Z2, Z1, Z3
+                VHSUB    qZ0,qY2,qY1
+
+                VHADD    dZr2,dYr0,dYi3
+                VST2    {dZr0,dZi0},[pDst :128],outPointStep
+                VHSUB    dZi2,dYi0,dYr3
+
+                VHADD    qZ1,qY2,qY1
+                VST2    {dZr2,dZi2},[pDst :128],outPointStep
+
+                VHSUB    dZr3,dYr0,dYi3
+                VST2    {dZr1,dZi1},[pDst :128],outPointStep
+                VHADD    dZi3,dYi0,dYr3
+                VST2    {dZr3,dZi3},[pDst :128],dstStep
+
+
+            .ELSE
+
+                @// Store order for the forward transform: Z0, Z3, Z1, Z2
+                VHSUB    qZ0,qY2,qY1
+
+                VHSUB    dZr3,dYr0,dYi3
+                VST2    {dZr0,dZi0},[pDst :128],outPointStep
+                VHADD    dZi3,dYi0,dYr3
+
+                VHADD    qZ1,qY2,qY1
+                VST2    {dZr3,dZi3},[pDst :128],outPointStep
+
+                VHADD    dZr2,dYr0,dYi3
+                VHSUB    dZi2,dYi0,dYr3
+                VST2    {dZr1,dZi1},[pDst :128],outPointStep
+                VST2    {dZr2,dZi2},[pDst :128],dstStep
+
+
+            .ENDIF
+
+
+        .ELSE
+
+            @// finish first stage of 4 point FFT
+            @// Y0 = X0+Z2, Y2 = X0-Z2, Y1 = Z1+Z3, Y3 = Z1-Z3 (no halving)
+            VADD    qY0,qX0,qZ2
+            VSUB    qY2,qX0,qZ2
+
+            @// NOTE(review): unlike the scaled path, this load carries no
+            @// :128 alignment hint - confirm whether that is intentional.
+            VLD2    {dXr0,dXi0},[pSrc]!          @//  data[0]
+            VADD    qY1,qZ1,qZ3
+            VSUB    qY3,qZ1,qZ3
+
+
+            @// finish second stage of 4 point FFT
+            @// Z0 = Y2-Y1 and Z1 = Y2+Y1; Z2 and Z3 combine Y0 with +/- j*Y3
+
+
+            .ifeqs  "\inverse", "TRUE"
+
+                @// Store order for the inverse transform: Z0, Z2, Z1, Z3
+                VSUB    qZ0,qY2,qY1
+
+                VADD    dZr2,dYr0,dYi3
+                VST2    {dZr0,dZi0},[pDst :128],outPointStep
+                VSUB    dZi2,dYi0,dYr3
+
+                VADD    qZ1,qY2,qY1
+                VST2    {dZr2,dZi2},[pDst :128],outPointStep
+
+                VSUB    dZr3,dYr0,dYi3
+                VST2    {dZr1,dZi1},[pDst :128],outPointStep
+                VADD    dZi3,dYi0,dYr3
+                VST2    {dZr3,dZi3},[pDst :128],dstStep
+
+
+            .ELSE
+
+                @// Store order for the forward transform: Z0, Z3, Z1, Z2
+                VSUB    qZ0,qY2,qY1
+
+                VSUB    dZr3,dYr0,dYi3
+                VST2    {dZr0,dZi0},[pDst :128],outPointStep
+                VADD    dZi3,dYi0,dYr3
+
+                VADD    qZ1,qY2,qY1
+                VST2    {dZr3,dZi3},[pDst :128],outPointStep
+
+                VADD    dZr2,dYr0,dYi3
+                VSUB    dZi2,dYi0,dYr3
+                VST2    {dZr1,dZi1},[pDst :128],outPointStep
+                VST2    {dZr2,dZi2},[pDst :128],dstStep
+
+
+            .ENDIF
+
+
+
+        .ENDIF
+
+        @// ADD without the S suffix leaves the flags from SUBS setCount intact
+        ADD     pSrc,pSrc,pointStep                         @// increment to data[1] of the next set
+        BGT     setLoop\name
+
+        @// Load the twiddles of the next group at pTwiddle, +stepTwiddle and
+        @// +2*stepTwiddle; the final twStep post-increment moves pTwiddle back
+        @// by stepTwiddle, undoing the ADD at the top of grpLoop. The flags
+        @// from SUBS grpCount survive until the BGT below.
+        VLD1     dW1,[pTwiddle :64],stepTwiddle                 @//[wi | wr]
+        SUBS    grpCount,grpCount,#4                        @// subtract 4 since grpCount multiplied by 4
+        VLD1     dW2,[pTwiddle :64],stepTwiddle                 @//[wi | wr]
+        ADD     pSrc,pSrc,srcStep                           @// increment pSrc for the next grp
+        VLD1     dW3,[pTwiddle :64],twStep                      @//[wi | wr]
+
+
+
+        BGT     grpLoop\name
+
+
+        @// Reset and Swap pSrc and pDst for the next stage
+        @// t1 reuses r3, which is free now that grpCount has run out
+        MOV     t1,pDst
+        SUB     pDst,pSrc,outPointStep,LSL #2           @// pDst -= size; pSrc -= 4*size bytes
+        SUB     pSrc,t1,outPointStep
+
+
+        .endm
+
+
+        @// Forward stage without scaling: FFTSTAGE is expanded with
+        @// scaled="FALSE", inverse="FALSE" and FWD as the loop-label suffix.
+        @// M_START/M_END come from armCOMM_s.h; the r4 argument presumably
+        @// selects the core registers to preserve - confirm there.
+        M_START armSP_FFTFwd_CToC_SC16_Radix4_OutOfPlace_unsafe,r4
+            FFTSTAGE "FALSE","FALSE",FWD
+        M_END
+
+
+        @// Inverse stage without scaling: FFTSTAGE is expanded with
+        @// scaled="FALSE", inverse="TRUE" and INV as the loop-label suffix.
+        @// M_START/M_END come from armCOMM_s.h; the r4 argument presumably
+        @// selects the core registers to preserve - confirm there.
+        M_START armSP_FFTInv_CToC_SC16_Radix4_OutOfPlace_unsafe,r4
+            FFTSTAGE "FALSE","TRUE",INV
+        M_END
+
+
+        @// Forward stage with scaling (halving adds, outputs divided by 4):
+        @// FFTSTAGE is expanded with scaled="TRUE", inverse="FALSE" and FWDSFS
+        @// as the loop-label suffix.
+        @// M_START/M_END come from armCOMM_s.h; the r4 argument presumably
+        @// selects the core registers to preserve - confirm there.
+        M_START armSP_FFTFwd_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe,r4
+            FFTSTAGE "TRUE","FALSE",FWDSFS
+        M_END
+
+
+        @// Inverse stage with scaling (halving adds, outputs divided by 4):
+        @// FFTSTAGE is expanded with scaled="TRUE", inverse="TRUE" and INVSFS
+        @// as the loop-label suffix.
+        @// M_START/M_END come from armCOMM_s.h; the r4 argument presumably
+        @// selects the core registers to preserve - confirm there.
+        M_START armSP_FFTInv_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe,r4
+            FFTSTAGE "TRUE","TRUE",INVSFS
+        M_END
+
+
+
+
+
+    .END
--- a/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix8_fs_unsafe_s.S
+++ b/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC16_Radix8_fs_unsafe_s.S
@ -0,0 +1,619 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This file was originally licensed as follows. It has been
+@//  relicensed with permission from the copyright holders.
+
+@//
+@//
+@// File Name:  armSP_FFT_CToC_SC16_Radix8_fs_unsafe_s.s
+@// OpenMAX DL: v1.0.2
+@// Last Modified Revision:   7766
+@// Last Modified Date:       Thu, 27 Sep 2007
+@//
+@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@//
+@//
+@//
+@// Description:
+@// Compute a first stage Radix 8 FFT stage for a N point complex signal
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+
+
+@// Import symbols required from other files
+@// (For example tables)
+
+
+@// Set debugging level
+@//DEBUG_ON    SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+
+
+
+
+@// Guarding implementation by the processor name
+
+
+@//Input Registers
+
+#define pSrc                            r0
+#define pDst                            r2
+#define pTwiddle                        r1
+#define subFFTNum                       r6
+#define subFFTSize                      r7
+@// dest buffer for the next stage (not pSrc for first stage)
+#define pPingPongBuf                    r5
+
+
+@//Output Registers
+
+
+@//Local Scratch Registers
+
+#define grpSize                         r3
+@// Reuse grpSize as setCount
+#define setCount                        r3
+#define pointStep                       r4
+#define outPointStep                    r4
+#define setStep                         r8
+#define step1                           r9
+#define step2                           r10
+#define t0                              r11
+
+
+@// Neon Registers
+
+#define dXr0                            D14.S16
+#define dXi0                            D15.S16
+#define dXr1                            D2.S16
+#define dXi1                            D3.S16
+#define dXr2                            D4.S16
+#define dXi2                            D5.S16
+#define dXr3                            D6.S16
+#define dXi3                            D7.S16
+#define dXr4                            D8.S16
+#define dXi4                            D9.S16
+#define dXr5                            D10.S16
+#define dXi5                            D11.S16
+#define dXr6                            D12.S16
+#define dXi6                            D13.S16
+#define dXr7                            D0.S16
+#define dXi7                            D1.S16
+#define qX0                             Q7.S16
+#define qX1                             Q1.S16
+#define qX2                             Q2.S16
+#define qX3                             Q3.S16
+#define qX4                             Q4.S16
+#define qX5                             Q5.S16
+#define qX6                             Q6.S16
+#define qX7                             Q0.S16
+
+#define dUr0                            D16.S16
+#define dUi0                            D17.S16
+#define dUr2                            D18.S16
+#define dUi2                            D19.S16
+#define dUr4                            D20.S16
+#define dUi4                            D21.S16
+#define dUr6                            D22.S16
+#define dUi6                            D23.S16
+#define dUr1                            D24.S16
+#define dUi1                            D25.S16
+#define dUr3                            D26.S16
+#define dUi3                            D27.S16
+#define dUr5                            D28.S16
+#define dUi5                            D29.S16
+@// dUr7/dUi7 use D30/D31 (shared with dVr6/dVi6 and dYr7/dYi7), not dXr7/dXi7 (D0/D1)
+#define dUr7                            D30.S16
+#define dUi7                            D31.S16
+#define qU0                             Q8.S16
+#define qU1                             Q12.S16
+#define qU2                             Q9.S16
+#define qU3                             Q13.S16
+#define qU4                             Q10.S16
+#define qU5                             Q14.S16
+#define qU6                             Q11.S16
+#define qU7                             Q15.S16
+
+
+
+#define dVr0                            D24.S16
+#define dVi0                            D25.S16
+#define dVr2                            D26.S16
+#define dVi2                            D27.S16
+#define dVr4                            D28.S16
+#define dVi4                            D29.S16
+#define dVr6                            D30.S16
+#define dVi6                            D31.S16
+#define dVr1                            D16.S16
+#define dVi1                            D17.S16
+#define dVr3                            D18.S16
+#define dVi3                            D19.S16
+#define dVr5                            D20.S16
+#define dVi5                            D21.S16
+@// dVr7 shares D22 with dUr6 and dYr6
+#define dVr7                            D22.S16
+@// dVi7 shares D23 with dUi6 and dYi6
+#define dVi7                            D23.S16
+#define qV0                             Q12.S16
+#define qV1                             Q8.S16
+#define qV2                             Q13.S16
+#define qV3                             Q9.S16
+#define qV4                             Q14.S16
+#define qV5                             Q10.S16
+#define qV6                             Q15.S16
+#define qV7                             Q11.S16
+
+
+
+#define dYr0                            D16.S16
+#define dYi0                            D17.S16
+#define dYr2                            D18.S16
+#define dYi2                            D19.S16
+#define dYr4                            D20.S16
+#define dYi4                            D21.S16
+#define dYr6                            D22.S16
+#define dYi6                            D23.S16
+#define dYr1                            D24.S16
+#define dYi1                            D25.S16
+#define dYr3                            D26.S16
+#define dYi3                            D27.S16
+#define dYr5                            D28.S16
+#define dYi5                            D29.S16
+@// dYr7/dYi7 share D30/D31 with dUr7/dUi7 and dVr6/dVi6 (dYr4/dYi4 are D20/D21)
+#define dYr7                            D30.S16
+#define dYi7                            D31.S16
+#define qY0                             Q8.S16
+#define qY1                             Q12.S16
+#define qY2                             Q9.S16
+#define qY3                             Q13.S16
+#define qY4                             Q10.S16
+#define qY5                             Q14.S16
+#define qY6                             Q11.S16
+#define qY7                             Q15.S16
+
+
+#define dT0                             D0.S16
+#define dT1                             D1.S16
+
+
+@// Define constants
+        .set   ONEBYSQRT2, 0x00005A82        @// Q15 format
+
+
+        .MACRO FFTSTAGE scaled, inverse , name
+        @// First-stage radix-8 butterfly pass (all twiddles are 1 in this stage).
+        @//   scaled  - "TRUE" selects the halving VHADD/VHSUB forms instead of
+        @//             VADD/VSUB for the butterflies
+        @//   inverse - "TRUE" selects the inverse-transform sign pattern
+        @//   name    - suffix appended to the loop label to keep it unique
+        @// Reads : pSrc, pDst, pPingPongBuf, subFFTNum (pTwiddle is not referenced)
+        @// Writes: subFFTSize = 8, subFFTNum = subFFTNum/8, pSrc = start of the
+        @//         data just written, pDst = pPingPongBuf; also uses r3, r4,
+        @//         r8-r11 and the NEON registers named by the defines above.
+        @// Each VLD2/VST2 moves two D registers (4 complex S16 samples), so one
+        @// loop iteration processes 4 sets.
+
+        @// Define stack arguments
+
+        @// Update pSubFFTSize and pSubFFTNum regs
+        MOV     subFFTSize,#8                               @// subFFTSize = 8 once this first stage is done
+        LDR     t0,=ONEBYSQRT2                              @// t0=(1/sqrt(2)) as Q15 format
+
+        @// Note: setCount = subFFTNum/8 (reuse the grpSize reg for setCount)
+        LSR     grpSize,subFFTNum,#3
+        MOV     subFFTNum,grpSize
+
+
+        @// pT0+1 increments pT0 by 4 bytes
+        @// pointStep is a byte offset: 4*grpSize, with grpSize = subFFTNum/8 as computed above
+        @// Note: outPointStep = pointStep for firststage
+
+        MOV     pointStep,grpSize,LSL #2
+
+
+        @// Calculate the step of input data for the next set
+        @//MOV     step1,pointStep,LSL #1                      @// step1 = 2*pointStep
+        VLD2    {dXr0,dXi0},[pSrc :128],pointStep          @//  data[0]
+        MOV     step1,grpSize,LSL #3                        @// step1 = 8*grpSize = 2*pointStep
+
+        MOV     step2,pointStep,LSL #3                      @// 8*pointStep, reduced to 7*pointStep below
+        VLD2    {dXr1,dXi1},[pSrc :128],pointStep          @//  data[1]
+        SUB     step2,step2,pointStep                          @// step2 = 7*pointStep
+        RSB     setStep,step2,#16                              @// setStep = - 7*pointStep+16
+
+
+
+        VLD2    {dXr2,dXi2},[pSrc :128],pointStep          @//  data[2]
+        VLD2    {dXr3,dXi3},[pSrc :128],pointStep          @//  data[3]
+        VLD2    {dXr4,dXi4},[pSrc :128],pointStep          @//  data[4]
+        VLD2    {dXr5,dXi5},[pSrc :128],pointStep          @//  data[5]
+        VLD2    {dXr6,dXi6},[pSrc :128],pointStep          @//  data[6]
+        VLD2    {dXr7,dXi7},[pSrc :128],setStep            @//  data[7] & update pSrc for the next set
+                                                      @//  setStep = -7*pointStep + 16
+        @// grp = 0 a special case since all the twiddle factors are 1
+        @// Loop on the sets : 4 sets at a time
+
+grpZeroSetLoop\name:
+
+        @// Decrement setcount
+        @// The flags set here are consumed by the BGT at the bottom of the loop;
+        @// nothing in between is a flag-setting instruction.
+        SUBS    setCount,setCount,#4                    @// decrement the set loop counter
+
+
+        .ifeqs "\scaled", "TRUE"
+            @// finish first stage of 8 point FFT
+
+            VHADD    qU0,qX0,qX4
+            VHADD    qU2,qX1,qX5
+            VHADD    qU4,qX2,qX6
+            VHADD    qU6,qX3,qX7
+
+            @// finish second stage of 8 point FFT
+
+            VHADD    qV0,qU0,qU4
+            VHSUB    qV2,qU0,qU4
+            VHADD    qV4,qU2,qU6
+            VHSUB    qV6,qU2,qU6
+
+            @// finish third stage of 8 point FFT
+
+            VHADD    qY0,qV0,qV4
+            VHSUB    qY4,qV0,qV4
+            VST2    {dYr0,dYi0},[pDst :128],step1                    @// store y0
+
+            .ifeqs  "\inverse", "TRUE"
+
+                VHSUB    dYr2,dVr2,dVi6
+                VHADD    dYi2,dVi2,dVr6
+
+                VHADD    dYr6,dVr2,dVi6
+                VST2    {dYr2,dYi2},[pDst :128],step1                    @// store y2
+                VHSUB    dYi6,dVi2,dVr6
+
+                VHSUB    qU1,qX0,qX4
+                VST2    {dYr4,dYi4},[pDst :128],step1                    @// store y4
+
+                VHSUB    qU3,qX1,qX5
+                VHSUB    qU5,qX2,qX6
+                VST2    {dYr6,dYi6},[pDst :128],step1                    @// store y6
+
+            .ELSE
+                @// Forward case: the same expressions land in the mirrored
+                @// registers (Y6 holds y2, Y2 holds y6), so the stores below
+                @// still go out in y2, y4, y6 order.
+
+                VHADD    dYr6,dVr2,dVi6
+                VHSUB    dYi6,dVi2,dVr6
+
+                VHSUB    dYr2,dVr2,dVi6
+                VST2    {dYr6,dYi6},[pDst :128],step1                    @// store y2
+                VHADD    dYi2,dVi2,dVr6
+
+
+                VHSUB    qU1,qX0,qX4
+                VST2    {dYr4,dYi4},[pDst :128],step1                    @// store y4
+                VHSUB    qU3,qX1,qX5
+                VHSUB    qU5,qX2,qX6
+                VST2    {dYr2,dYi2},[pDst :128],step1                    @// store y6
+
+
+            .ENDIF
+
+            @// finish first stage of 8 point FFT
+
+            VHSUB    qU7,qX3,qX7
+            VMOV    dT0[0],t0                                   @// dT0 is D0 (= dXr7): written only after the last read of qX7
+
+            @// finish second stage of 8 point FFT
+
+            VHSUB    dVr1,dUr1,dUi5
+            VLD2    {dXr0,dXi0},[pSrc :128],pointStep          @//  data[0] for next iteration
+            VHADD    dVi1,dUi1,dUr5
+            VHADD    dVr3,dUr1,dUi5
+            VLD2    {dXr1,dXi1},[pSrc :128],pointStep          @//  data[1]
+            VHSUB    dVi3,dUi1,dUr5
+
+            VHSUB    dVr5,dUr3,dUi7
+            VLD2    {dXr2,dXi2},[pSrc :128],pointStep          @//  data[2]
+            VHADD    dVi5,dUi3,dUr7
+            VHADD    dVr7,dUr3,dUi7
+            VLD2    {dXr3,dXi3},[pSrc :128],pointStep          @//  data[3]
+            VHSUB    dVi7,dUi3,dUr7
+
+            @// finish third stage of 8 point FFT
+
+            .ifeqs  "\inverse", "TRUE"
+
+                @// calculate a*v5
+                VQRDMULH    dT1,dVr5,dT0[0]                         @// dT1 is D1 (= dXi7), scratch until data[7] is reloaded
+                VLD2    {dXr4,dXi4},[pSrc :128],pointStep          @//  data[4]
+                VQRDMULH    dVi5,dVi5,dT0[0]
+
+                VLD2    {dXr5,dXi5},[pSrc :128],pointStep          @//  data[5]
+                VSUB    dVr5,dT1,dVi5                               @// a * V5
+                VADD    dVi5,dT1,dVi5
+
+                VLD2    {dXr6,dXi6},[pSrc :128],pointStep          @//  data[6]
+
+                @// calculate  b*v7
+                VQRDMULH    dT1,dVr7,dT0[0]
+                VQRDMULH    dVi7,dVi7,dT0[0]
+
+                VHADD    qY1,qV1,qV5
+                VHSUB    qY5,qV1,qV5
+
+
+                VADD    dVr7,dT1,dVi7                               @// b * V7
+                VSUB    dVi7,dVi7,dT1
+                SUB     pDst, pDst, step2                           @// set pDst to y1
+
+                @// dT0/dT1 are dead from here on, so D0/D1 can take data[7]
+                VLD2    {dXr7,dXi7},[pSrc :128],setStep            @//  data[7]
+
+
+                VHSUB    dYr3,dVr3,dVr7
+                VHSUB    dYi3,dVi3,dVi7
+                VST2    {dYr1,dYi1},[pDst :128],step1                    @// store y1
+                VHADD    dYr7,dVr3,dVr7
+                VHADD    dYi7,dVi3,dVi7
+
+
+                VST2    {dYr3,dYi3},[pDst :128],step1                    @// store y3
+                VST2    {dYr5,dYi5},[pDst :128],step1                    @// store y5
+                @// The disabled variant spells the 16-byte post-increment as an
+                @// immediate; the active one uses the writeback form instead.
+#if 0
+                VST2    {dYr7,dYi7},[pDst :128],#16                      @// store y7
+#else
+                VST2    {dYr7,dYi7},[pDst :128]!                      @// store y7
+#endif
+            .ELSE
+
+                @// calculate  b*v7
+                VQRDMULH    dT1,dVr7,dT0[0]
+                VLD2    {dXr4,dXi4},[pSrc :128],pointStep          @//  data[4]
+                VQRDMULH    dVi7,dVi7,dT0[0]
+
+                VLD2    {dXr5,dXi5},[pSrc :128],pointStep          @//  data[5]
+                VADD    dVr7,dT1,dVi7                               @// b * V7
+                VSUB    dVi7,dVi7,dT1
+
+                VLD2    {dXr6,dXi6},[pSrc :128],pointStep          @//  data[6]
+
+                @// calculate a*v5
+                VQRDMULH    dT1,dVr5,dT0[0]                         @// dT1 is D1 (= dXi7), scratch until data[7] is reloaded
+                VQRDMULH    dVi5,dVi5,dT0[0]
+
+                VHADD    dYr7,dVr3,dVr7
+                VHADD    dYi7,dVi3,dVi7
+                SUB     pDst, pDst, step2                           @// set pDst to y1
+
+                VSUB    dVr5,dT1,dVi5                               @// a * V5
+                VADD    dVi5,dT1,dVi5
+                VLD2    {dXr7,dXi7},[pSrc :128],setStep            @//  data[7]
+
+                VHSUB    qY5,qV1,qV5
+
+                VHSUB    dYr3,dVr3,dVr7
+                VST2    {dYr7,dYi7},[pDst :128],step1                    @// store y1
+                VHSUB    dYi3,dVi3,dVi7
+                VHADD    qY1,qV1,qV5
+
+
+                VST2    {dYr5,dYi5},[pDst :128],step1                    @// store y3
+                VST2    {dYr3,dYi3},[pDst :128],step1                    @// store y5
+#if 0
+                VST2    {dYr1,dYi1},[pDst :128],#16                      @// store y7
+#else
+                VST2    {dYr1,dYi1},[pDst :128]!                      @// store y7
+#endif
+
+            .ENDIF
+
+
+
+        .ELSE
+            @// Unscaled variant: same schedule as above with VADD/VSUB in place
+            @// of VHADD/VHSUB.
+            @// finish first stage of 8 point FFT
+
+            VADD    qU0,qX0,qX4
+            VADD    qU2,qX1,qX5
+            VADD    qU4,qX2,qX6
+            VADD    qU6,qX3,qX7
+
+            @// finish second stage of 8 point FFT
+
+            VADD    qV0,qU0,qU4
+            VSUB    qV2,qU0,qU4
+            VADD    qV4,qU2,qU6
+            VSUB    qV6,qU2,qU6
+
+            @// finish third stage of 8 point FFT
+
+            VADD    qY0,qV0,qV4
+            VSUB    qY4,qV0,qV4
+            VST2    {dYr0,dYi0},[pDst :128],step1                    @// store y0
+
+            .ifeqs  "\inverse", "TRUE"
+
+                VSUB    dYr2,dVr2,dVi6
+                VADD    dYi2,dVi2,dVr6
+
+                VADD    dYr6,dVr2,dVi6
+                VST2    {dYr2,dYi2},[pDst :128],step1                    @// store y2
+                VSUB    dYi6,dVi2,dVr6
+
+                VSUB    qU1,qX0,qX4
+                VST2    {dYr4,dYi4},[pDst :128],step1                    @// store y4
+
+                VSUB    qU3,qX1,qX5
+                VSUB    qU5,qX2,qX6
+                VST2    {dYr6,dYi6},[pDst :128],step1                    @// store y6
+
+            .ELSE
+
+                VADD    dYr6,dVr2,dVi6
+                VSUB    dYi6,dVi2,dVr6
+
+                VSUB    dYr2,dVr2,dVi6
+                VST2    {dYr6,dYi6},[pDst :128],step1                    @// store y2
+                VADD    dYi2,dVi2,dVr6
+
+
+                VSUB    qU1,qX0,qX4
+                VST2    {dYr4,dYi4},[pDst :128],step1                    @// store y4
+                VSUB    qU3,qX1,qX5
+                VSUB    qU5,qX2,qX6
+                VST2    {dYr2,dYi2},[pDst :128],step1                    @// store y6
+
+
+            .ENDIF
+
+            @// finish first stage of 8 point FFT
+
+            VSUB    qU7,qX3,qX7
+            VMOV    dT0[0],t0                                   @// dT0 is D0 (= dXr7): written only after the last read of qX7
+
+            @// finish second stage of 8 point FFT
+
+            VSUB    dVr1,dUr1,dUi5
+            VLD2    {dXr0,dXi0},[pSrc :128],pointStep          @//  data[0] for next iteration
+            VADD    dVi1,dUi1,dUr5
+            VADD    dVr3,dUr1,dUi5
+            VLD2    {dXr1,dXi1},[pSrc :128],pointStep          @//  data[1]
+            VSUB    dVi3,dUi1,dUr5
+
+            VSUB    dVr5,dUr3,dUi7
+            VLD2    {dXr2,dXi2},[pSrc :128],pointStep          @//  data[2]
+            VADD    dVi5,dUi3,dUr7
+            VADD    dVr7,dUr3,dUi7
+            VLD2    {dXr3,dXi3},[pSrc :128],pointStep          @//  data[3]
+            VSUB    dVi7,dUi3,dUr7
+
+            @// finish third stage of 8 point FFT
+
+            .ifeqs  "\inverse", "TRUE"
+
+                @// calculate a*v5
+                VQRDMULH    dT1,dVr5,dT0[0]                         @// dT1 is D1 (= dXi7), scratch until data[7] is reloaded
+                VLD2    {dXr4,dXi4},[pSrc :128],pointStep          @//  data[4]
+                VQRDMULH    dVi5,dVi5,dT0[0]
+
+                VLD2    {dXr5,dXi5},[pSrc :128],pointStep          @//  data[5]
+                VSUB    dVr5,dT1,dVi5                               @// a * V5
+                VADD    dVi5,dT1,dVi5
+
+                VLD2    {dXr6,dXi6},[pSrc :128],pointStep          @//  data[6]
+
+                @// calculate  b*v7
+                VQRDMULH    dT1,dVr7,dT0[0]
+                VQRDMULH    dVi7,dVi7,dT0[0]
+
+                VADD    qY1,qV1,qV5
+                VSUB    qY5,qV1,qV5
+
+
+                VADD    dVr7,dT1,dVi7                               @// b * V7
+                VSUB    dVi7,dVi7,dT1
+                SUB     pDst, pDst, step2                           @// set pDst to y1
+
+                VLD2    {dXr7,dXi7},[pSrc :128],setStep            @//  data[7]
+
+
+                VSUB    dYr3,dVr3,dVr7
+                VSUB    dYi3,dVi3,dVi7
+                VST2    {dYr1,dYi1},[pDst :128],step1                    @// store y1
+                VADD    dYr7,dVr3,dVr7
+                VADD    dYi7,dVi3,dVi7
+
+
+                VST2    {dYr3,dYi3},[pDst :128],step1                    @// store y3
+                VST2    {dYr5,dYi5},[pDst :128],step1                    @// store y5
+#if 0
+                VST2    {dYr7,dYi7},[pDst :128],#16                      @// store y7
+#else
+                VST2    {dYr7,dYi7},[pDst :128]!                      @// store y7
+#endif
+            .ELSE
+
+                @// calculate  b*v7
+                VQRDMULH    dT1,dVr7,dT0[0]
+                VLD2    {dXr4,dXi4},[pSrc :128],pointStep          @//  data[4]
+                VQRDMULH    dVi7,dVi7,dT0[0]
+
+                VLD2    {dXr5,dXi5},[pSrc :128],pointStep          @//  data[5]
+                VADD    dVr7,dT1,dVi7                               @// b * V7
+                VSUB    dVi7,dVi7,dT1
+
+                VLD2    {dXr6,dXi6},[pSrc :128],pointStep          @//  data[6]
+
+                @// calculate a*v5
+                VQRDMULH    dT1,dVr5,dT0[0]                         @// dT1 is D1 (= dXi7), scratch until data[7] is reloaded
+                VQRDMULH    dVi5,dVi5,dT0[0]
+
+                VADD    dYr7,dVr3,dVr7
+                VADD    dYi7,dVi3,dVi7
+                SUB     pDst, pDst, step2                           @// set pDst to y1
+
+                VSUB    dVr5,dT1,dVi5                               @// a * V5
+                VADD    dVi5,dT1,dVi5
+                VLD2    {dXr7,dXi7},[pSrc :128],setStep            @//  data[7]
+
+                VSUB    qY5,qV1,qV5
+
+                VSUB    dYr3,dVr3,dVr7
+                VST2    {dYr7,dYi7},[pDst :128],step1                    @// store y1
+                VSUB    dYi3,dVi3,dVi7
+                VADD    qY1,qV1,qV5
+
+
+                VST2    {dYr5,dYi5},[pDst :128],step1                    @// store y3
+                VST2    {dYr3,dYi3},[pDst :128],step1                    @// store y5
+#if 0
+                VST2    {dYr1,dYi1},[pDst :128],#16                      @// store y7
+#else
+                VST2    {dYr1,dYi1},[pDst :128]!                      @// store y7
+#endif
+
+            .ENDIF
+
+
+        .ENDIF
+
+        @// After y7 pDst sits at base + 7*pointStep + 16, so subtracting step2
+        @// leaves it 16 bytes past where this iteration started.
+        SUB     pDst, pDst, step2                               @// update pDst for the next set
+        BGT     grpZeroSetLoop\name
+
+
+        @// reset pSrc to pDst for the next stage
+        @// pDst gained 16 bytes per iteration; with setCount a multiple of 4 that
+        @// totals pointStep bytes, so the subtraction yields the start of the output.
+        SUB     pSrc,pDst,pointStep                             @// pSrc = pDst - pointStep (start of this stage output)
+        MOV     pDst,pPingPongBuf
+
+
+
+        .endm
+
+
+        @// Allocate stack memory required by the function
+
+
+        @// Entry points: each expands FFTSTAGE (scaled, inverse, name) between
+        @// M_START and M_END with the flag pair matching its Fwd/Inv and Sfs name.
+        @// NOTE(review): M_START/M_END come from dl/api/armCOMM_s.h, which is not
+        @// visible here - confirm what the r4 argument requests. The macro body
+        @// also writes r5-r11 and D8-D15 without saving them in this file.
+
+        @// Forward, unscaled
+        M_START armSP_FFTFwd_CToC_SC16_Radix8_fs_OutOfPlace_unsafe,r4
+            FFTSTAGE "FALSE","FALSE",FWD
+        M_END
+
+
+        @// Inverse, unscaled
+        M_START armSP_FFTInv_CToC_SC16_Radix8_fs_OutOfPlace_unsafe,r4
+            FFTSTAGE "FALSE","TRUE",INV
+        M_END
+
+
+        @// Forward, scaled (halving butterflies)
+        M_START armSP_FFTFwd_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe,r4
+            FFTSTAGE "TRUE","FALSE",FWDSFS
+        M_END
+
+
+        @// Inverse, scaled (halving butterflies)
+        M_START armSP_FFTInv_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe,r4
+            FFTSTAGE "TRUE","TRUE",INVSFS
+        M_END
+
+
+
+
+
+    .END
--- a/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC32_Radix2_fs_unsafe_s.S
+++ b/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC32_Radix2_fs_unsafe_s.S
@ -0,0 +1,163 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This file was originally licensed as follows. It has been
+@//  relicensed with permission from the copyright holders.
+
+@//
+@// 
+@// File Name:  armSP_FFT_CToC_SC32_Radix2_fs_unsafe_s.s
+@// OpenMAX DL: v1.0.2
+@// Last Modified Revision:   5995
+@// Last Modified Date:       Fri, 08 Jun 2007
+@// 
+@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@// 
+@// 
+@//
+@// Description:
+@// Compute the first stage of a Radix 2 DIT in-order out-of-place FFT 
+@// stage for a N point complex signal.
+@// 
+
+        
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+        
+        
+@// Import symbols required from other files
+@// (For example tables)
+    
+        
+        
+        
+@// Set debugging level        
+@//DEBUG_ON    SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+    
+    
+            
+@// Guarding implementation by the processor name
+    
+    
+@//Input Registers
+
+#define pSrc		r0
+#define pDst		r2
+#define pTwiddle	r1
+#define pPingPongBuf	r5
+#define subFFTNum	r6
+#define subFFTSize	r7
+
+
+@//Output Registers
+
+
+@//Local Scratch Registers
+
+#define pointStep	r3
+#define outPointStep	r3
+#define grpSize		r4
+#define setCount	r4
+#define step		r8
+#define dstStep		r8
+
+@// Neon Registers
+
+#define dX0	D0.S32
+#define dX1	D1.S32
+#define dY0	D2.S32
+#define dY1	D3.S32
+
+
+        .MACRO FFTSTAGE scaled, inverse, name
+        @// First-stage radix-2 butterfly pass: Y0 = X0 + X1, Y1 = X0 - X1.
+        @//   scaled  - "TRUE" selects the halving VHADD/VHSUB forms
+        @//   inverse - accepted for a uniform signature; not referenced in this body
+        @//   name    - suffix appended to the loop label to keep it unique
+        @// Reads : pSrc, pDst, pPingPongBuf, subFFTNum
+        @// Writes: subFFTSize = 2, subFFTNum = subFFTNum/2, pSrc = start of the
+        @//         data just written, pDst = pPingPongBuf; also uses r3, r4, r8
+        @//         and D0-D3.
+        
+        @// Define stack arguments
+        
+        
+        @// update subFFTSize and subFFTNum into RN6 and RN7 for the next stage
+        
+        
+        MOV        subFFTSize,#2
+        LSR        grpSize,subFFTNum,#1  
+        MOV        subFFTNum,grpSize 
+        
+        
+        @// pT0+1 increments pT0 by 8 bytes
+        @// pointStep is a byte offset: 8*grpSize, with grpSize = subFFTNum/2 as computed above
+        @// Note: outPointStep = pointStep for firststage
+        @// Note: setCount shares r4 with grpSize and is decremented by 1 per iteration
+        
+        MOV        pointStep,grpSize,LSL #3
+        RSB        step,pointStep,#8 
+        
+        
+        @// Loop on the sets for grp zero
+
+grpZeroSetLoop\name :	
+        
+        @// Each VLD1/VST1 moves one D register (two S32 lanes, 8 bytes).
+        VLD1    dX0,[pSrc],pointStep
+        VLD1    dX1,[pSrc],step                   @// step = -pointStep + 8
+        SUBS    setCount,setCount,#1              @// decrement the loop counter (flags used by BGT below)
+        
+        .ifeqs "\scaled", "TRUE"
+        
+            VHADD    dY0,dX0,dX1
+            VHSUB    dY1,dX0,dX1
+        
+        .ELSE
+        
+            VADD    dY0,dX0,dX1
+            VSUB    dY1,dX0,dX1
+        
+         
+        .ENDIF
+        
+        VST1    dY0,[pDst],outPointStep
+        VST1    dY1,[pDst],dstStep                  @// dstStep =  step = -pointStep + 8
+               
+        BGT     grpZeroSetLoop\name
+        
+        
+        @// reset pSrc to pDst for the next stage
+        @// pDst gained 8 bytes per iteration, pointStep bytes in total, so the
+        @// subtraction yields the start of the data just written.
+        SUB     pSrc,pDst,pointStep                     @// pSrc = pDst - pointStep
+        MOV     pDst,pPingPongBuf
+                
+        .endm
+        
+        
+                
+        @// Entry points: each expands FFTSTAGE (scaled, inverse, name) between
+        @// M_START and M_END. The macro body ignores the inverse flag, so the
+        @// Fwd and Inv variants of the same scaling expand to the same instructions.
+        @// NOTE(review): M_START/M_END come from dl/api/armCOMM_s.h, which is not
+        @// visible here - confirm what the r4 argument requests.
+
+        @// Forward, unscaled
+        M_START armSP_FFTFwd_CToC_SC32_Radix2_fs_OutOfPlace_unsafe,r4
+        FFTSTAGE "FALSE","FALSE",fwd
+        M_END
+
+        
+        
+        @// Inverse, unscaled
+        M_START armSP_FFTInv_CToC_SC32_Radix2_fs_OutOfPlace_unsafe,r4
+        FFTSTAGE "FALSE","TRUE",inv
+        M_END
+ 
+        
+        
+        @// Forward, scaled (halving butterflies)
+        M_START armSP_FFTFwd_CToC_SC32_Sfs_Radix2_fs_OutOfPlace_unsafe,r4
+        FFTSTAGE "TRUE","FALSE",fwdsfs
+        M_END
+
+        
+        
+        @// Inverse, scaled (halving butterflies)
+        M_START armSP_FFTInv_CToC_SC32_Sfs_Radix2_fs_OutOfPlace_unsafe,r4
+        FFTSTAGE "TRUE","TRUE",invsfs
+        M_END
+
+	.end
--- a/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC32_Radix2_ls_unsafe_s.S
+++ b/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC32_Radix2_ls_unsafe_s.S
@ -0,0 +1,184 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This file was originally licensed as follows. It has been
+@//  relicensed with permission from the copyright holders.
+@//
+
+@// 
+@// File Name:  armSP_FFT_CToC_SC32_Radix2_ls_unsafe_s.s
+@// OpenMAX DL: v1.0.2
+@// Last Modified Revision:   7493
+@// Last Modified Date:       Mon, 24 Sep 2007
+@// 
+@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@// 
+@// 
+@//
+@// Description:
+@// Compute the last stage of a Radix 2 DIT in-order out-of-place FFT
+@// stage for a N point complex signal.
+@// 
+
+
+        
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+        
+        
+@// Import symbols required from other files
+@// (For example tables)
+    
+        
+        
+        
+@// Set debugging level        
+@//DEBUG_ON    SETL {TRUE}
+
+            
+@// Guarding implementation by the processor name
+    
+    
+@//Input Registers
+
+#define pSrc		r0
+#define pDst		r2
+#define pTwiddle	r1
+#define subFFTNum	r6
+#define subFFTSize	r7
+
+
+@//Output Registers
+
+
+@//Local Scratch Registers
+
+
+#define outPointStep	r3
+#define grpCount	r4
+#define dstStep		r5
+#define pTmp		r4
+
+@// Neon Registers
+
+#define dWr	D0.S32
+#define dWi	d1.s32
+#define dXr0	d2.s32
+#define dXi0	d3.s32
+#define dXr1	d4.s32
+#define dXi1	d5.s32
+#define dYr0	d6.s32
+#define dYi0	d7.s32
+#define dYr1	d8.s32
+#define dYi1	d9.s32
+#define qT0	q5.s64
+#define qT1	q6.s64
+	
+        .macro FFTSTAGE scaled, inverse, name
+        @// Last-stage radix-2 butterfly pass, two groups per loop iteration.
+        @//   scaled  - "TRUE" selects the halving VHSUB/VHADD forms
+        @//   inverse - "TRUE" multiplies X1 by the conjugate of the twiddle
+        @//   name    - suffix appended to the loop label to keep it unique
+        @// Reads : pSrc, pDst, pTwiddle, subFFTSize
+        @// Writes: subFFTNum = 1, subFFTSize doubled, pSrc/pDst swapped back to
+        @//         the buffer starts, pTwiddle rewound; also uses r3, r4, r5 and
+        @//         D0-D9, Q5, Q6.
+        
+        
+        MOV     outPointStep,subFFTSize,LSL #3
+        @// Update grpCount and grpSize rightaway 
+        
+        MOV     subFFTNum,#1                            @//after the last stage
+        LSL     grpCount,subFFTSize,#1
+        
+        @// update subFFTSize for the next stage
+        MOV     subFFTSize,grpCount
+                               
+        RSB      dstStep,outPointStep,#16                @// dstStep = 16 - outPointStep
+        
+        
+        @// Loop on 2 grps at a time for the last stage
+
+grpLoop\name :	
+        @// VLD2 de-interleaves 16 bytes into two twiddles (dWr = reals, dWi = imags);
+        @// VLD4 de-interleaves 32 bytes so lane k of Xr0/Xi0/Xr1/Xi1 belongs to group k.
+        VLD2    {dWr,dWi},[pTwiddle :64]!
+        
+        VLD4    {dXr0,dXi0,dXr1,dXi1},[pSrc :128]!
+        SUBS    grpCount,grpCount,#4                   @// grpCount is multiplied by 2 
+        
+        .ifeqs  "\inverse", "TRUE"
+            @// conj(W) * X1, accumulated in 64 bits
+            VMULL   qT0,dWr,dXr1
+            VMLAL   qT0,dWi,dXi1                       @// real part
+            VMULL   qT1,dWr,dXi1
+            VMLSL   qT1,dWi,dXr1                       @// imag part
+                
+        .else
+        
+            @// W * X1, accumulated in 64 bits
+            VMULL   qT0,dWr,dXr1
+            VMLSL   qT0,dWi,dXi1                       @// real part
+            VMULL   qT1,dWr,dXi1
+            VMLAL   qT1,dWi,dXr1                       @// imag part
+                    
+        .endif
+        
+        @// Round and narrow the 64-bit products back to 32 bits (shift right by 31)
+        VRSHRN  dXr1,qT0,#31
+        VRSHRN  dXi1,qT1,#31
+        
+                
+        .ifeqs "\scaled", "TRUE"
+        
+            VHSUB    dYr0,dXr0,dXr1
+            VHSUB    dYi0,dXi0,dXi1
+            VHADD    dYr1,dXr0,dXr1
+            VHADD    dYi1,dXi0,dXi1
+            
+        .else
+        
+            VSUB    dYr0,dXr0,dXr1
+            VSUB    dYi0,dXi0,dXi1
+            VADD    dYr1,dXr0,dXr1
+            VADD    dYi1,dXi0,dXi1
+            
+         
+        .endif
+        
+        @// Y0 (difference) goes to pDst, Y1 (sum) to pDst + outPointStep;
+        @// the two post-increments net out to +16 bytes per iteration.
+        VST2    {dYr0,dYi0},[pDst],outPointStep
+        VST2    {dYr1,dYi1},[pDst],dstStep                  @// dstStep =  step = -outPointStep + 16
+               
+        bgt     grpLoop\name
+        
+        
+        @// Reset and Swap pSrc and pDst for the next stage     
+        @// Per iteration pSrc gained 32 bytes and pDst/pTwiddle 16 bytes; the
+        @// subtractions below undo those totals (2*outPointStep and outPointStep
+        @// when the initial grpCount is a multiple of 4).
+        @// pTmp shares r4 with grpCount, which is no longer needed here.
+        MOV     pTmp,pDst
+        SUB     pDst,pSrc,outPointStep,LSL #1       @// pDst = pSrc - 2*outPointStep
+        SUB     pSrc,pTmp,outPointStep              @// pSrc = old pDst - outPointStep
+        
+        @// Reset pTwiddle for the next stage
+        SUB     pTwiddle,pTwiddle,outPointStep      @// pTwiddle -= outPointStep bytes
+                
+        .endm
+        
+        
+                
+        @// Entry points: each expands FFTSTAGE (scaled, inverse, name) between
+        @// M_START and M_END with the flag pair matching its Fwd/Inv and Sfs name.
+        @// NOTE(review): M_START/M_END come from dl/api/armCOMM_s.h, which is not
+        @// visible here - confirm what the r4 argument requests. The macro body
+        @// also writes r5, D8, D9, Q5 and Q6 without saving them in this file.
+
+        @// Forward, unscaled
+        @// NOTE(review): only this invocation passes a third, empty-string
+        @// argument - presumably equivalent to omitting it; confirm against M_START.
+        M_START armSP_FFTFwd_CToC_SC32_Radix2_ls_OutOfPlace_unsafe,r4,""
+        FFTSTAGE "FALSE","FALSE",fwd
+        M_END
+
+        
+        
+        @// Inverse, unscaled
+        M_START armSP_FFTInv_CToC_SC32_Radix2_ls_OutOfPlace_unsafe,r4
+        FFTSTAGE "FALSE","TRUE",inv
+        M_END
+ 
+        
+        
+        @// Forward, scaled (halving butterflies)
+        M_START armSP_FFTFwd_CToC_SC32_Sfs_Radix2_ls_OutOfPlace_unsafe,r4
+        FFTSTAGE "TRUE","FALSE",fwdsfs
+        M_END
+
+        
+        
+        @// Inverse, scaled (halving butterflies)
+        M_START armSP_FFTInv_CToC_SC32_Sfs_Radix2_ls_OutOfPlace_unsafe,r4
+        FFTSTAGE "TRUE","TRUE",invsfs
+        M_END
+	
+	.end
--- a/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC32_Radix2_unsafe_s.S
+++ b/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC32_Radix2_unsafe_s.S
@ -0,0 +1,216 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This file was originally licensed as follows. It has been
+@//  relicensed with permission from the copyright holders.
+@//
+
+@//
+@// File Name:  armSP_FFT_CToC_SC32_Radix2_unsafe_s.s
+@// OpenMAX DL: v1.0.2
+@// Last Modified Revision:   5638
+@// Last Modified Date:       Wed, 06 Jun 2007
+@// 
+@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@// 
+@// 
+@//
+@// Description:
+@// Compute a Radix 2 DIT in-order out-of-place FFT stage for an N point complex signal.
+@// This handles the general stage, not the first or last stage.
+@// 
+
+        
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+        
+        
+@// Import symbols required from other files
+@// (For example tables)
+
+           
+        
+@// Set debugging level        
+@//DEBUG_ON    SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+    
+    
+    
+    
+@// Guarding implementation by the processor name
+    
+    
+@//Input Registers
+
+#define pSrc		r0
+#define pDst		r2
+#define pTwiddle	r1
+#define subFFTNum	r6
+#define subFFTSize	r7
+@// The five names above are read by the FFTSTAGE macro in this file before it writes them.
+
+@//Output Registers
+
+
+@//Local Scratch Registers
+
+#define outPointStep	r3
+#define pointStep	r4
+#define grpCount	r5
+#define setCount	r8
+@//const           RN  9
+#define step		r10
+#define dstStep		r11
+@// pTable and pTmp both name r9; the FFTSTAGE macro in this file only references pTmp.
+#define pTable		r9
+#define pTmp		r9    
+
+@// Neon Registers
+@// qT0 (Q3) overlays D6/D7 = dY0/dY1 and qT1 (Q4) overlays D8/D9 = dY2/dY3, so the 64-bit
+@// products held in qT0/qT1 must be narrowed before the butterfly outputs dY0..dY3 are written.
+
+#define dW	D0.S32
+#define dX0	D2.S32
+#define dX1	D3.S32
+#define dX2	D4.S32
+#define dX3	D5.S32
+#define dY0	D6.S32
+#define dY1	D7.S32
+#define dY2	D8.S32
+#define dY3	D9.S32
+#define qT0	Q3.S64
+#define qT1	Q4.S64
+    
+    
+        .MACRO FFTSTAGE scaled, inverse, name
+        
+        @// One general (neither first nor last) radix-2 stage; expanded once per entry point.
+        @//   scaled  : "TRUE" builds the butterfly with halving VHADD/VHSUB, anything else with VADD/VSUB
+        @//   inverse : "TRUE" swaps the signs of the dW[1] products (point1 times the conjugate twiddle,
+        @//             taking dW[0]=wr and dW[1]=wi as the load comment below indicates)
+        @//   name    : suffix that keeps the grpLoop/setLoop labels unique in each expansion
+        @// Reads pSrc, pDst, pTwiddle, subFFTNum, subFFTSize. On exit subFFTNum is halved, subFFTSize is
+        @// doubled, and pSrc/pDst/pTwiddle are adjusted for the next stage (see the end of the macro).
+        
+        @// Define stack arguments
+        
+        
+        @// Update grpCount and grpSize right away in order to reuse pGrpCount and pGrpSize regs
+        
+        LSR     subFFTNum,subFFTNum,#1                      @//grpSize
+        LSL     grpCount,subFFTSize,#1
+        
+        
+        @// pT0+1 increments pT0 by 8 bytes
+        @// pT0+pointStep = increment of 8*pointStep bytes = 4*grpSize bytes
+        MOV     pointStep,subFFTNum,LSL #2
+        
+        @// update subFFTSize for the next stage
+        MOV     subFFTSize,grpCount
+        
+        @// pOut0+1 increments pOut0 by 8 bytes
+        @// pOut0+outPointStep == increment of 8*outPointStep bytes = 4*size bytes
+        @// NOTE(review): SMULBB multiplies only the low 16 bits of each operand (signed), so
+        @// grpCount and pointStep must both fit in 16 bits here - confirm against the maximum FFT order.
+        SMULBB  outPointStep,grpCount,pointStep  
+        LSL     pointStep,pointStep,#1    
+                               
+        
+        RSB      step,pointStep,#16                     @// step    = 16 - pointStep
+        RSB      dstStep,outPointStep,#16               @// dstStep = 16 - outPointStep
+        
+        @// Loop on the groups
+
+grpLoop\name :	        
+        MOV      setCount,pointStep,LSR #3              @// setCount = pointStep/8
+        VLD1     dW,[pTwiddle],pointStep                @//[wi | wr]
+        
+        
+        @// Loop on the sets
+        
+        
+setLoop\name :	       
+        
+        
+        VLD2    {dX0,dX1},[pSrc],pointStep            @// point0: dX0-real part dX1-img part
+        VLD2    {dX2,dX3},[pSrc],step                 @// point1: dX2-real part dX3-img part
+        
+        @// Two sets are handled per pass. The flags set here are consumed by BGT setLoop below;
+        @// only NEON instructions sit in between, and they leave the condition flags untouched.
+        SUBS    setCount,setCount,#2               
+        
+        .ifeqs  "\inverse", "TRUE"
+            VMULL   qT0,dX2,dW[0]
+            VMLAL   qT0,dX3,dW[1]                       @// real part
+            VMULL   qT1,dX3,dW[0]
+            VMLSL   qT1,dX2,dW[1]                       @// imag part
+                
+        .else
+        
+            VMULL   qT0,dX2,dW[0]
+            VMLSL   qT0,dX3,dW[1]                       @// real part
+            VMULL   qT1,dX3,dW[0]
+            VMLAL   qT1,dX2,dW[1]                       @// imag part
+                    
+        .endif
+        
+        @// Round and narrow the 64-bit products by 31 bits back to 32-bit lanes.
+        VRSHRN  dX2,qT0,#31
+        VRSHRN  dX3,qT1,#31
+        
+        .ifeqs "\scaled", "TRUE"
+            VHSUB    dY0,dX0,dX2
+            VHSUB    dY1,dX1,dX3
+            VHADD    dY2,dX0,dX2
+            VHADD    dY3,dX1,dX3
+                
+        .else
+            VSUB    dY0,dX0,dX2
+            VSUB    dY1,dX1,dX3
+            VADD    dY2,dX0,dX2
+            VADD    dY3,dX1,dX3
+        
+        .endif
+        
+        VST2    {dY0,dY1},[pDst],outPointStep
+        VST2    {dY2,dY3},[pDst],dstStep              @// dstStep = -outPointStep + 16
+        
+        BGT     setLoop\name
+        
+        @// ADD (without the S suffix) does not disturb the flags that BGT grpLoop tests.
+        SUBS    grpCount,grpCount,#2               
+        ADD     pSrc,pSrc,pointStep
+        BGT     grpLoop\name    
+        
+        
+        @// Reset and Swap pSrc and pDst for the next stage     
+        MOV     pTmp,pDst
+        SUB     pDst,pSrc,outPointStep,LSL #1       @// new pDst = old pSrc - 2*outPointStep
+        SUB     pSrc,pTmp,outPointStep              @// new pSrc = old pDst - outPointStep
+        
+        @// Reset pTwiddle for the next stage
+        SUB     pTwiddle,pTwiddle,outPointStep      @// pTwiddle -= 4*size bytes
+        
+        
+        .endm
+        
+        
+        
+        @// Forward, unscaled general radix-2 stage: FFTSTAGE with scaled="FALSE", inverse="FALSE", label suffix FWD.
+        M_START armSP_FFTFwd_CToC_SC32_Radix2_OutOfPlace_unsafe,r4
+        FFTSTAGE "FALSE","FALSE",FWD
+        M_END
+
+        
+        
+        @// Inverse, unscaled general radix-2 stage: FFTSTAGE with scaled="FALSE", inverse="TRUE", label suffix INV.
+        M_START armSP_FFTInv_CToC_SC32_Radix2_OutOfPlace_unsafe,r4
+        FFTSTAGE "FALSE","TRUE",INV
+        M_END
+ 
+        
+        
+        @// Forward, scaled general radix-2 stage: FFTSTAGE with scaled="TRUE" (halving butterfly), inverse="FALSE", label suffix FWDSFS.
+        M_START armSP_FFTFwd_CToC_SC32_Sfs_Radix2_OutOfPlace_unsafe,r4
+        FFTSTAGE "TRUE","FALSE",FWDSFS
+        M_END
+
+        
+        
+        @// Inverse, scaled general radix-2 stage: FFTSTAGE with scaled="TRUE" (halving butterfly), inverse="TRUE", label suffix INVSFS.
+        M_START armSP_FFTInv_CToC_SC32_Sfs_Radix2_OutOfPlace_unsafe,r4
+        FFTSTAGE "TRUE","TRUE",INVSFS
+        M_END
+
+	.end
--- a/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC32_Radix4_fs_unsafe_s.S
+++ b/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC32_Radix4_fs_unsafe_s.S
@ -0,0 +1,320 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This file was originally licensed as follows. It has been
+@//  relicensed with permission from the copyright holders.
+@//
+
+@// 
+@// File Name:  armSP_FFT_CToC_SC32_Radix4_fs_unsafe_s.s
+@// OpenMAX DL: v1.0.2
+@// Last Modified Revision:   7767
+@// Last Modified Date:       Thu, 27 Sep 2007
+@// 
+@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@// 
+@// 
+@//
+@// Description:
+@// Compute the first stage of a Radix 4 FFT for an N point complex signal
+@// 
+
+
+        
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+        
+@// Import symbols required from other files
+@// (For example tables)
+    
+        
+        
+        
+@// Set debugging level        
+@//DEBUG_ON    SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+    
+    
+    
+@// Guarding implementation by the processor name
+    
+    
+@//Input Registers
+
+#define pSrc		r0
+#define pDst		r2
+#define pTwiddle	r1
+#define pPingPongBuf	r5
+#define subFFTNum	r6
+#define subFFTSize	r7
+@// pTwiddle is defined for symmetry with the other stage files; the FFTSTAGE macro in this file never references it.
+
+@//Output Registers
+
+
+@//Local Scratch Registers
+
+#define grpSize		r3
+@// Reuse grpSize as setCount
+#define setCount	r3
+@// pointStep and outPointStep share r4: the first stage uses the same stride for input and output.
+#define pointStep	r4
+#define outPointStep	r4
+#define setStep		r8
+#define step1		r9
+#define step3		r10
+
+@// Neon Registers
+@// Each qXn/qYn/qZn below is the Q-register view of the matching real:imag D pair
+@// (for example qX0 = Q0 = D0:D1 = dXr0:dXi0), so one Q-wide operation covers both halves.
+
+#define dXr0	D0.S32
+#define dXi0	D1.S32
+#define dXr1	D2.S32
+#define dXi1	D3.S32
+#define dXr2	D4.S32
+#define dXi2	D5.S32
+#define dXr3	D6.S32
+#define dXi3	D7.S32
+#define dYr0	D8.S32
+#define dYi0	D9.S32
+#define dYr1	D10.S32
+#define dYi1	D11.S32
+#define dYr2	D12.S32
+#define dYi2	D13.S32
+#define dYr3	D14.S32
+#define dYi3	D15.S32
+#define qX0	Q0.S32
+#define qX1	Q1.S32
+#define qX2	Q2.S32
+#define qX3	Q3.S32
+#define qY0	Q4.S32
+#define qY1	Q5.S32
+#define qY2	Q6.S32
+#define qY3	Q7.S32
+#define dZr0	D16.S32
+#define dZi0	D17.S32
+#define dZr1	D18.S32
+#define dZi1	D19.S32
+#define dZr2	D20.S32
+#define dZi2	D21.S32
+#define dZr3	D22.S32
+#define dZi3	D23.S32
+#define qZ0	Q8.S32
+#define qZ1	Q9.S32
+#define qZ2	Q10.S32
+#define qZ3	Q11.S32
+    
+        .MACRO FFTSTAGE scaled, inverse, name
+        
+        @// First radix-4 stage (no twiddle multiplies); expanded once per entry point.
+        @//   scaled  : "TRUE" uses halving VHADD/VHSUB throughout, anything else VADD/VSUB
+        @//   inverse : "TRUE" selects the inverse add/subtract pattern and store order for outputs 1 and 3
+        @//   name    : suffix that keeps the grpZeroSetLoop label unique in each expansion
+        @// Reads pSrc, pDst, subFFTNum, pPingPongBuf. On exit subFFTSize = 4, subFFTNum is divided by 4,
+        @// pSrc points at the data just written and pDst = pPingPongBuf.
+        
+        @// Define stack arguments
+        
+        @// pT0+1 increments pT0 by 8 bytes
+        @// pT0+pointStep = increment of 8*pointStep bytes = 2*grpSize bytes
+        @// Note: outPointStep = pointStep for firststage
+        
+        MOV     pointStep,subFFTNum,LSL #1                  @// pointStep = 2*subFFTNum bytes (subFFTNum as passed in)
+        
+        
+        @// Update pSubFFTSize and pSubFFTNum regs
+        VLD2    {dXr0,dXi0},[pSrc :128],pointStep          @//  data[0]
+        MOV     subFFTSize,#4                                 @// subFFTSize = 4 after the first radix-4 stage
+        
+        @// Note: setCount = subFFTNum/4 (reuse the grpSize reg for setCount)
+        LSR     grpSize,subFFTNum,#2
+        VLD2    {dXr1,dXi1},[pSrc :128],pointStep          @//  data[1]  
+        MOV     subFFTNum,grpSize
+        
+                                       
+        @// Calculate the step of input data for the next set
+        @//MOV     setStep,pointStep,LSL #1
+        MOV     setStep,grpSize,LSL #4                      @// 16*grpSize = 2*pointStep
+        VLD2    {dXr2,dXi2},[pSrc :128],pointStep          @//  data[2]
+        ADD     setStep,setStep,pointStep                   @// setStep = 3*pointStep
+        RSB     setStep,setStep,#16                         @// setStep = - 3*pointStep+16
+        
+        VLD2    {dXr3,dXi3},[pSrc :128],setStep            @//  data[3] & update pSrc for the next set
+        MOV     step1,pointStep,LSL #1                      @// step1 = 2*pointStep
+        
+        @// u0 for the first loop pass; later passes recompute it at the bottom of the loop.
+        .ifeqs "\scaled", "TRUE"
+            VHADD    qY0,qX0,qX2
+        .else
+            VADD    qY0,qX0,qX2
+        .endif
+            
+        RSB     step3,pointStep,#0                          @// step3 = -pointStep                          
+        
+        @// grp = 0 a special case since all the twiddle factors are 1
+        @// Loop on the sets : 2 sets at a time
+        @// The loop is software pipelined: each pass loads data[0..3] for the next pair of sets
+        @// (in the order 0, 2, 1, 3 using step1/step3/step1/setStep) while finishing the current pair.
+        @// NOTE(review): the loads are issued before the loop condition is tested, so the final pass
+        @// reads one pair of sets beyond the last one stored - confirm the source buffer tolerates this.
+
+grpZeroSetLoop\name :	
+        
+        
+        
+        @// Decrement setcount
+        SUBS    setCount,setCount,#2                    @// decrement the set loop counter           
+        
+        .ifeqs "\scaled", "TRUE" 
+        
+            @// finish first stage of 4 point FFT 
+                        
+            VHSUB    qY2,qX0,qX2
+            
+            VLD2    {dXr0,dXi0},[pSrc :128],step1          @//  data[0]
+            VHADD    qY1,qX1,qX3
+            VLD2    {dXr2,dXi2},[pSrc :128],step3          @//  data[2]
+            VHSUB    qY3,qX1,qX3
+            
+                       
+            @// finish second stage of 4 point FFT 
+                                                
+            .ifeqs "\inverse", "TRUE"
+                   
+                VLD2    {dXr1,dXi1},[pSrc :128],step1          @//  data[1]
+                VHADD    qZ0,qY0,qY1
+            
+                VLD2    {dXr3,dXi3},[pSrc :128],setStep            @//  data[3] & update pSrc for the next set    
+                VHSUB    dZr3,dYr2,dYi3
+                
+                VST2    {dZr0,dZi0},[pDst :128],outPointStep
+                VHADD    dZi3,dYi2,dYr3
+                
+                VHSUB    qZ1,qY0,qY1
+                VST2    {dZr3,dZi3},[pDst :128],outPointStep
+                
+                VHADD    dZr2,dYr2,dYi3
+                VST2    {dZr1,dZi1},[pDst :128],outPointStep
+                VHSUB    dZi2,dYi2,dYr3
+                
+                VHADD    qY0,qX0,qX2                     @// u0 for next iteration
+                VST2    {dZr2,dZi2},[pDst :128],setStep
+                
+                
+            .else
+                
+                VLD2    {dXr1,dXi1},[pSrc :128],step1          @//  data[1]
+                VHADD    qZ0,qY0,qY1
+            
+                VLD2    {dXr3,dXi3},[pSrc :128],setStep            @//  data[3] & update pSrc for the next set
+                VHADD    dZr2,dYr2,dYi3
+            
+                VST2    {dZr0,dZi0},[pDst :128],outPointStep
+                VHSUB    dZi2,dYi2,dYr3
+            
+                VHSUB    qZ1,qY0,qY1
+                VST2    {dZr2,dZi2},[pDst :128],outPointStep
+            
+                VHSUB    dZr3,dYr2,dYi3
+                VST2    {dZr1,dZi1},[pDst :128],outPointStep
+                VHADD    dZi3,dYi2,dYr3
+            
+                VHADD    qY0,qX0,qX2                     @// u0 for next iteration
+                VST2    {dZr3,dZi3},[pDst :128],setStep
+            
+            .endif
+            
+        
+        
+        .else
+        
+            @// finish first stage of 4 point FFT 
+            
+            
+            VSUB    qY2,qX0,qX2
+            
+            VLD2    {dXr0,dXi0},[pSrc :128],step1          @//  data[0]
+            VADD    qY1,qX1,qX3
+            VLD2    {dXr2,dXi2},[pSrc :128],step3          @//  data[2]
+            VSUB    qY3,qX1,qX3
+            
+                       
+            @// finish second stage of 4 point FFT 
+                                                
+            .ifeqs "\inverse", "TRUE" 
+                   
+                VLD2    {dXr1,dXi1},[pSrc :128],step1          @//  data[1]
+                VADD    qZ0,qY0,qY1
+            
+                VLD2    {dXr3,dXi3},[pSrc :128],setStep            @//  data[3] & update pSrc for the next set    
+                VSUB    dZr3,dYr2,dYi3
+                
+                VST2    {dZr0,dZi0},[pDst :128],outPointStep
+                VADD    dZi3,dYi2,dYr3
+                
+                VSUB    qZ1,qY0,qY1
+                VST2    {dZr3,dZi3},[pDst :128],outPointStep
+                
+                VADD    dZr2,dYr2,dYi3
+                VST2    {dZr1,dZi1},[pDst :128],outPointStep
+                VSUB    dZi2,dYi2,dYr3
+                
+                VADD    qY0,qX0,qX2                     @// u0 for next iteration
+                VST2    {dZr2,dZi2},[pDst :128],setStep
+                
+                
+            .else
+                
+                VLD2    {dXr1,dXi1},[pSrc :128],step1          @//  data[1]
+                VADD    qZ0,qY0,qY1
+            
+                VLD2    {dXr3,dXi3},[pSrc :128],setStep            @//  data[3] & update pSrc for the next set
+                VADD    dZr2,dYr2,dYi3
+            
+                VST2    {dZr0,dZi0},[pDst :128],outPointStep
+                VSUB    dZi2,dYi2,dYr3
+            
+                VSUB    qZ1,qY0,qY1
+                VST2    {dZr2,dZi2},[pDst :128],outPointStep
+            
+                VSUB    dZr3,dYr2,dYi3
+                VST2    {dZr1,dZi1},[pDst :128],outPointStep
+                VADD    dZi3,dYi2,dYr3
+            
+                VADD    qY0,qX0,qX2                     @// u0 for next iteration
+                VST2    {dZr3,dZi3},[pDst :128],setStep
+            
+            .endif
+            
+        .endif
+        
+        BGT     grpZeroSetLoop\name
+        
+        @// reset pSrc to pDst for the next stage
+        SUB     pSrc,pDst,pointStep                     @// pSrc = pDst - pointStep (this stage output feeds the next stage)
+        MOV     pDst,pPingPongBuf
+        
+        
+        .endm
+
+                
+        
+        @// Forward, unscaled first radix-4 stage: FFTSTAGE with scaled="FALSE", inverse="FALSE", label suffix fwd.
+        M_START armSP_FFTFwd_CToC_SC32_Radix4_fs_OutOfPlace_unsafe,r4
+        FFTSTAGE "FALSE","FALSE",fwd
+        M_END
+
+        
+        
+        @// Inverse, unscaled first radix-4 stage: FFTSTAGE with scaled="FALSE", inverse="TRUE", label suffix inv.
+        M_START armSP_FFTInv_CToC_SC32_Radix4_fs_OutOfPlace_unsafe,r4
+        FFTSTAGE "FALSE","TRUE",inv
+        M_END
+ 
+                
+        @// Forward, scaled first radix-4 stage: FFTSTAGE with scaled="TRUE" (halving adds/subtracts), inverse="FALSE", label suffix fwdsfs.
+        M_START armSP_FFTFwd_CToC_SC32_Sfs_Radix4_fs_OutOfPlace_unsafe,r4
+        FFTSTAGE "TRUE","FALSE",fwdsfs
+        M_END
+
+                
+        @// Inverse, scaled first radix-4 stage: FFTSTAGE with scaled="TRUE" (halving adds/subtracts), inverse="TRUE", label suffix invsfs.
+        M_START armSP_FFTInv_CToC_SC32_Sfs_Radix4_fs_OutOfPlace_unsafe,r4
+        FFTSTAGE "TRUE","TRUE",invsfs
+        M_END
+    
+	.end
--- a/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC32_Radix4_ls_unsafe_s.S
+++ b/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC32_Radix4_ls_unsafe_s.S
@ -0,0 +1,404 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This file was originally licensed as follows. It has been
+@//  relicensed with permission from the copyright holders.
+@//
+
+@// 
+@// File Name:  armSP_FFT_CToC_SC32_Radix4_ls_unsafe_s.s
+@// OpenMAX DL: v1.0.2
+@// Last Modified Revision:   7767
+@// Last Modified Date:       Thu, 27 Sep 2007
+@// 
+@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@// 
+@// 
+@//
+@// Description:
+@// Compute the last stage of a Radix 4 FFT for an N point complex signal
+@// 
+
+        
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+        
+@// Import symbols required from other files
+@// (For example tables)
+    
+        
+        
+        
+@// Set debugging level        
+@//DEBUG_ON    SETL {TRUE}
+
+    
+@// Guarding implementation by the processor name
+    
+    
+@// Import symbols required from other files
+@// (For example tables)
+    @//IMPORT  armAAC_constTable    
+    
+@//Input Registers
+
+#define pSrc		r0
+#define pDst		r2
+#define pTwiddle	r1
+#define subFFTNum	r6
+#define subFFTSize	r7
+
+
+
+@//Output Registers
+
+
+@//Local Scratch Registers
+@// grpCount and pTmp both name r4; the FFTSTAGE macro in this file uses pTmp only after its loop has finished with grpCount.
+
+#define outPointStep	r3
+#define grpCount	r4
+#define dstStep		r5
+#define grpTwStep	r8
+#define stepTwiddle	r9
+#define twStep		r10
+#define pTmp		r4
+#define step16		r11
+#define step24		r12
+
+
+@// Neon Registers
+@// dButterfly*/dX* are two names for the same D0..D7 registers: the dButterfly names are used for
+@// the VLD4/VUZP of the raw input, the dX names for the same registers in the multiplies.
+
+#define dButterfly1Real02	D0.S32
+#define dButterfly1Imag02	D1.S32
+#define dButterfly1Real13	D2.S32
+#define dButterfly1Imag13	D3.S32
+#define dButterfly2Real02	D4.S32
+#define dButterfly2Imag02	D5.S32
+#define dButterfly2Real13	D6.S32
+#define dButterfly2Imag13	D7.S32
+#define dXr0			D0.S32
+#define dXi0			D1.S32
+#define dXr1			D2.S32
+#define dXi1			D3.S32
+#define dXr2			D4.S32
+#define dXi2			D5.S32
+#define dXr3			D6.S32
+#define dXi3			D7.S32
+
+#define dYr0			D16.S32
+#define dYi0			D17.S32
+#define dYr1			D18.S32
+#define dYi1			D19.S32
+#define dYr2			D20.S32
+#define dYi2			D21.S32
+#define dYr3			D22.S32
+#define dYi3			D23.S32
+
+#define dW1r			D8.S32
+#define dW1i			D9.S32
+#define dW2r			D10.S32
+#define dW2i			D11.S32
+#define dW3r			D12.S32
+#define dW3i			D13.S32
+@// The 64-bit product temporaries overlay other names: qT0 = Q7 = qZ0 (dZr0:dZi0), and
+@// qT1..qT4 = Q8..Q11 = qY0..qY3 (dYr0..dYi3). Only qT5 (Q12) has no other alias in this file.
+#define qT0			Q7.S64
+#define qT1			Q8.S64
+#define qT2			Q9.S64
+#define qT3			Q10.S64
+#define qT4			Q11.S64
+#define qT5			Q12.S64
+
+#define dZr0			D14.S32
+#define dZi0			D15.S32
+#define dZr1			D26.S32
+#define dZi1			D27.S32
+#define dZr2			D28.S32
+#define dZi2			D29.S32
+#define dZr3			D30.S32
+#define dZi3			D31.S32
+
+#define qX0			Q0.S32
+#define qY0			Q8.S32
+#define qY1			Q9.S32   
+#define qY2			Q10.S32
+#define qY3			Q11.S32
+#define qZ0			Q7.S32
+#define qZ1			Q13.S32   
+#define qZ2			Q14.S32
+#define qZ3			Q15.S32
+
+
+        
+        .MACRO FFTSTAGE scaled, inverse , name
+        
+        @// Last radix-4 stage; expanded once per entry point.
+        @//   scaled  : "TRUE" uses halving VHADD/VHSUB butterflies, anything else VADD/VSUB
+        @//   inverse : "TRUE" swaps the VMLAL/VMLSL roles in the twiddle multiplies and selects the
+        @//             inverse add/subtract pattern and store order for outputs 1 and 3
+        @//   name    : suffix that keeps the grpLoop label unique in each expansion
+        @// The loop is software pipelined: source data and twiddles for the next pair of groups are
+        @// loaded while the current pair is computed, hence the pointer fix-ups after the loop.
+        
+        @// Define stack arguments
+        
+        
+        @// pOut0+1 increments pOut0 by 8 bytes
+        @// pOut0+outPointStep == increment of 8*outPointStep bytes 
+        MOV     outPointStep,subFFTSize,LSL #3
+        
+        @// Update grpCount and grpSize rightaway 
+        
+        VLD2    {dW1r,dW1i},[pTwiddle :128]                          @// [wi|wr]
+        MOV     step16,#16
+        LSL     grpCount,subFFTSize,#2
+        
+        VLD1    dW2r,[pTwiddle :64]                             @// [wi|wr]
+        MOV     subFFTNum,#1                            @//after the last stage
+        
+        VLD1    dW3r,[pTwiddle :64],step16                     @// [wi|wr]
+        MOV     stepTwiddle,#0
+        
+        VLD1    dW2i,[pTwiddle :64]!                            @// [wi|wr]
+        SUB     grpTwStep,stepTwiddle,#8                    @// grpTwStep = -8 to start with       
+        
+        @// update subFFTSize for the next stage
+        MOV     subFFTSize,grpCount
+        VLD1    dW3i,[pTwiddle :64],grpTwStep                           @// [wi|wr]
+        MOV     dstStep,outPointStep,LSL #1
+        
+        VLD4     {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
+        ADD     dstStep,dstStep,outPointStep                @// dstStep = 3*outPointStep
+        RSB     dstStep,dstStep,#16                         @// dstStep = - 3*outPointStep+16
+        MOV     step24,#24 
+
+        VLD4     {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
+        
+
+        @// Process two groups at a time
+        
+grpLoop\name :	
+        
+        VZIP    dW2r,dW2i
+        ADD     stepTwiddle,stepTwiddle,#16                 @// increment for the next iteration
+        VZIP    dW3r,dW3i
+        ADD     grpTwStep,stepTwiddle,#4
+        VUZP     dButterfly1Real13, dButterfly2Real13        @// B.r D.r
+        SUB     twStep,stepTwiddle,#16                      @// -16+stepTwiddle
+        VUZP     dButterfly1Imag13, dButterfly2Imag13        @// B.i D.i
+        MOV     grpTwStep,grpTwStep,LSL #1
+        VUZP     dButterfly1Real02, dButterfly2Real02        @// A.r C.r
+        RSB     grpTwStep,grpTwStep,#0                      @// -8-2*stepTwiddle
+        
+        
+        VUZP     dButterfly1Imag02, dButterfly2Imag02        @// A.i C.i
+        
+        @// The flags set here are tested by BGT grpLoop at the bottom; everything in between is NEON only.
+        
+        SUBS    grpCount,grpCount,#8                    @// grpCount is multiplied by 4
+                
+        .ifeqs  "\inverse", "TRUE"
+            VMULL   qT0,dW1r,dXr1
+            VMLAL   qT0,dW1i,dXi1                       @// real part
+            VMULL   qT1,dW1r,dXi1
+            VMLSL   qT1,dW1i,dXr1                       @// imag part
+                
+        .else
+        
+            VMULL   qT0,dW1r,dXr1
+            VMLSL   qT0,dW1i,dXi1                       @// real part
+            VMULL   qT1,dW1r,dXi1
+            VMLAL   qT1,dW1i,dXr1                       @// imag part
+                    
+        .endif
+        
+        VLD2    {dW1r,dW1i},[pTwiddle :128],stepTwiddle      @// [wi|wr]
+        
+        .ifeqs  "\inverse", "TRUE"
+            VMULL   qT2,dW2r,dXr2
+            VMLAL   qT2,dW2i,dXi2                       @// real part
+            VMULL   qT3,dW2r,dXi2
+            VLD1    dW2r,[pTwiddle :64],step16                  @// [wi|wr]
+            VMLSL   qT3,dW2i,dXr2                       @// imag part
+                
+        .else
+        
+            VMULL   qT2,dW2r,dXr2
+            VMLSL   qT2,dW2i,dXi2                       @// real part
+            VMULL   qT3,dW2r,dXi2
+            VLD1    dW2r,[pTwiddle :64],step16                  @// [wi|wr]
+            VMLAL   qT3,dW2i,dXr2                       @// imag part
+                    
+        .endif
+        
+        @// Round and narrow the 64-bit products by 31 bits into the dZ registers.
+        VRSHRN  dZr1,qT0,#31
+        VLD1    dW2i,[pTwiddle :64],twStep                  @// [wi|wr] 
+        VRSHRN  dZi1,qT1,#31
+        
+        VMOV     qZ0,qX0                                @// move qX0 so as to load for the next iteration
+        VLD4     {dButterfly1Real02,dButterfly1Imag02,dButterfly1Real13,dButterfly1Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
+        
+                
+        .ifeqs  "\inverse", "TRUE"
+            VMULL   qT4,dW3r,dXr3
+            VMLAL   qT4,dW3i,dXi3                       @// real part
+            VMULL   qT5,dW3r,dXi3
+            VLD1    dW3r,[pTwiddle :64],step24
+            VMLSL   qT5,dW3i,dXr3                       @// imag part
+                
+        .else
+        
+            VMULL   qT4,dW3r,dXr3
+            VMLSL   qT4,dW3i,dXi3                       @// real part
+            VMULL   qT5,dW3r,dXi3
+            VLD1    dW3r,[pTwiddle :64],step24
+            VMLAL   qT5,dW3i,dXr3                       @// imag part
+                    
+        .endif
+        
+        VRSHRN  dZr2,qT2,#31
+        VLD1    dW3i,[pTwiddle :64],grpTwStep                           @// [wi|wr]
+        VRSHRN  dZi2,qT3,#31
+        
+        VRSHRN  dZr3,qT4,#31
+        VRSHRN  dZi3,qT5,#31
+        VLD4     {dButterfly2Real02,dButterfly2Imag02,dButterfly2Real13,dButterfly2Imag13},[pSrc :256]! @// AC.r AC.i BD.r BD.i
+        
+                
+        .ifeqs "\scaled", "TRUE"
+        
+            @// finish first stage of 4 point FFT 
+            
+            VHADD    qY0,qZ0,qZ2
+            VHSUB    qY2,qZ0,qZ2
+            VHADD    qY1,qZ1,qZ3
+            VHSUB    qY3,qZ1,qZ3
+            
+                        
+            @// finish second stage of 4 point FFT 
+            
+            .ifeqs  "\inverse", "TRUE"
+
+                VHSUB    qZ0,qY2,qY1
+            
+                VHADD    dZr3,dYr0,dYi3
+                VST2    {dZr0,dZi0},[pDst :128],outPointStep
+                VHSUB    dZi3,dYi0,dYr3
+                                
+                VHADD    qZ2,qY2,qY1
+                VST2    {dZr3,dZi3},[pDst :128],outPointStep
+            
+                VHSUB    dZr1,dYr0,dYi3
+                VST2    {dZr2,dZi2},[pDst :128],outPointStep
+                VHADD    dZi1,dYi0,dYr3
+                
+                VST2    {dZr1,dZi1},[pDst :128],dstStep              @// dstStep = -3*outPointStep + 16
+            
+                                
+            .else
+                
+                VHSUB    qZ0,qY2,qY1
+            
+                VHSUB    dZr1,dYr0,dYi3
+                VST2    {dZr0,dZi0},[pDst :128],outPointStep
+                VHADD    dZi1,dYi0,dYr3
+            
+                VHADD    qZ2,qY2,qY1
+                VST2    {dZr1,dZi1},[pDst :128],outPointStep
+            
+                VHADD    dZr3,dYr0,dYi3
+                VST2    {dZr2,dZi2},[pDst :128],outPointStep
+                VHSUB    dZi3,dYi0,dYr3
+                
+                VST2    {dZr3,dZi3},[pDst :128],dstStep              @// dstStep = -3*outPointStep + 16
+
+            
+            .endif
+            
+        
+        
+        .else
+        
+            @// finish first stage of 4 point FFT 
+            
+            VADD    qY0,qZ0,qZ2
+            VSUB    qY2,qZ0,qZ2
+            VADD    qY1,qZ1,qZ3
+            VSUB    qY3,qZ1,qZ3
+            
+                        
+            @// finish second stage of 4 point FFT 
+            
+            .ifeqs  "\inverse", "TRUE"
+
+                VSUB    qZ0,qY2,qY1
+            
+                VADD    dZr3,dYr0,dYi3
+                VST2    {dZr0,dZi0},[pDst :128],outPointStep
+                VSUB    dZi3,dYi0,dYr3
+                                
+                VADD    qZ2,qY2,qY1
+                VST2    {dZr3,dZi3},[pDst :128],outPointStep
+            
+                VSUB    dZr1,dYr0,dYi3
+                VST2    {dZr2,dZi2},[pDst :128],outPointStep
+                VADD    dZi1,dYi0,dYr3
+                
+                VST2    {dZr1,dZi1},[pDst :128],dstStep              @// dstStep = -3*outPointStep + 16
+            
+                                
+            .else
+                
+                VSUB    qZ0,qY2,qY1
+            
+                VSUB    dZr1,dYr0,dYi3
+                VST2    {dZr0,dZi0},[pDst :128],outPointStep
+                VADD    dZi1,dYi0,dYr3
+            
+                VADD    qZ2,qY2,qY1
+                VST2    {dZr1,dZi1},[pDst :128],outPointStep
+            
+                VADD    dZr3,dYr0,dYi3
+                VST2    {dZr2,dZi2},[pDst :128],outPointStep
+                VSUB    dZi3,dYi0,dYr3
+                
+                VST2    {dZr3,dZi3},[pDst :128],dstStep              @// dstStep = -3*outPointStep + 16
+
+            
+            .endif
+            
+        .endif
+        
+        BGT     grpLoop\name
+           
+                
+        @// Reset and Swap pSrc and pDst for the next stage     
+        MOV     pTmp,pDst
+        SUB     pSrc,pSrc,#64                       @// Extra increment done in final iteration of the loop
+        SUB     pDst,pSrc,outPointStep,LSL #2       @// new pDst = corrected pSrc - 4*outPointStep
+        SUB     pSrc,pTmp,outPointStep              @// new pSrc = old pDst - outPointStep
+        SUB     pTwiddle,pTwiddle,subFFTSize,LSL #1 @// pTwiddle -= 2*subFFTSize (subFFTSize already updated above)
+        SUB     pTwiddle,pTwiddle,#16               @// Extra increment done in final iteration of the loop
+        
+        .endm
+        
+        
+        @// Forward, unscaled last radix-4 stage: FFTSTAGE with scaled="FALSE", inverse="FALSE", label suffix fwd.
+        M_START armSP_FFTFwd_CToC_SC32_Radix4_ls_OutOfPlace_unsafe,r4
+        FFTSTAGE "FALSE","FALSE",fwd
+        M_END
+
+        
+        @// Inverse, unscaled last radix-4 stage: FFTSTAGE with scaled="FALSE", inverse="TRUE", label suffix inv.
+        M_START armSP_FFTInv_CToC_SC32_Radix4_ls_OutOfPlace_unsafe,r4
+        FFTSTAGE "FALSE","TRUE",inv
+        M_END
+ 
+        
+        @// Forward, scaled last radix-4 stage: FFTSTAGE with scaled="TRUE" (halving butterflies), inverse="FALSE", label suffix fwdsfs.
+        M_START armSP_FFTFwd_CToC_SC32_Sfs_Radix4_ls_OutOfPlace_unsafe,r4
+        FFTSTAGE "TRUE","FALSE",fwdsfs
+        M_END
+
+        
+        @// Inverse, scaled last radix-4 stage: FFTSTAGE with scaled="TRUE" (halving butterflies), inverse="TRUE", label suffix invsfs.
+        M_START armSP_FFTInv_CToC_SC32_Sfs_Radix4_ls_OutOfPlace_unsafe,r4
+        FFTSTAGE "TRUE","TRUE",invsfs
+        M_END
+
+        
+	.end
--- a/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC32_Radix4_unsafe_s.S
+++ b/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC32_Radix4_unsafe_s.S
@ -0,0 +1,395 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This file was originally licensed as follows. It has been
+@//  relicensed with permission from the copyright holders.
+@//
+
+@// 
+@// File Name:  armSP_FFT_CToC_SC32_Radix4_unsafe_s.s
+@// OpenMAX DL: v1.0.2
+@// Last Modified Revision:   7767
+@// Last Modified Date:       Thu, 27 Sep 2007
+@// 
+@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@// 
+@// 
+@//
+@// Description:
+@// Compute a Radix 4 FFT stage for a N point complex signal
+@// 
+
+
+
+        
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+        
+        
+@// Import symbols required from other files
+@// (For example tables)
+    
+        
+        
+        
+@// Set debugging level        
+@//DEBUG_ON    SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+    
+    
+    
+    
+@// Guarding implementation by the processor name
+    
+    
+@// Import symbols required from other files
+@// (For example tables)
+    
+    
+@//Input Registers
+@// pSrc, pDst and pTwiddle are used as addresses by the VLD/VST instructions of FFTSTAGE; subFFTNum and subFFTSize are rewritten by it
+#define pSrc		r0
+#define pDst		r2
+#define pTwiddle	r1
+#define subFFTNum	r6
+#define subFFTSize	r7
+
+
+
+@//Output Registers
+
+
+@//Local Scratch Registers (t1 shares r3 with grpCount; t1 is only used after the group loop has finished)
+
+#define grpCount	r3
+#define pointStep	r4
+#define outPointStep	r5
+#define stepTwiddle	r12
+#define setCount	r14
+#define srcStep		r8
+#define setStep		r9
+#define dstStep		r10
+#define twStep		r11
+#define t1		r3
+
+@// Neon Registers (each Qn overlays the pair D2n, D2n+1; dW1..dW3 hold the twiddles as [wi | wr])
+
+#define dW1	D0.S32
+#define dW2	D1.S32
+#define dW3	D2.S32   
+@// dX*: input points; dY*: first butterfly results; qT0..qT3 (64-bit products) overlay qY2, qY3, qY0, qY1
+#define dXr0	D4.S32
+#define dXi0	D5.S32
+#define dXr1	D6.S32
+#define dXi1	D7.S32
+#define dXr2	D8.S32
+#define dXi2	D9.S32
+#define dXr3	D10.S32
+#define dXi3	D11.S32
+#define dYr0	D12.S32
+#define dYi0	D13.S32
+#define dYr1	D14.S32
+#define dYi1	D15.S32
+#define dYr2	D16.S32
+#define dYi2	D17.S32
+#define dYr3	D18.S32
+#define dYi3	D19.S32
+#define qT0	Q8.S64   
+#define qT1	Q9.S64
+#define qT2	Q6.S64
+#define qT3	Q7.S64
+@// dZ*: twiddled inputs (index 1..3) and later the stage outputs that are stored through pDst
+#define dZr0	D20.S32
+#define dZi0	D21.S32
+#define dZr1	D22.S32
+#define dZi1	D23.S32
+#define dZr2	D24.S32
+#define dZi2	D25.S32
+#define dZr3	D26.S32
+#define dZi3	D27.S32
+@// Quad views of the D pairs above: qX0 = {dXr0,dXi0}, qYn = {dYrn,dYin}, qZn = {dZrn,dZin}
+#define qY0	Q6.S32
+#define qY1	Q7.S32
+#define qY2	Q8.S32
+#define qY3	Q9.S32   
+#define qX0	Q2.S32
+#define qZ0	Q10.S32
+#define qZ1	Q11.S32
+#define qZ2	Q12.S32
+#define qZ3	Q13.S32
+
+        
+        .MACRO FFTSTAGE scaled, inverse , name
+        @// Emits one radix-4 FFT stage. scaled=TRUE uses halving adds (VHADD/VHSUB); inverse=TRUE uses the conjugate twiddle multiply and the mirrored output butterfly
+        @// Define stack arguments
+        
+        
+        @// Update grpCount and grpSize right away in order to reuse pGrpCount and pGrpSize regs
+        
+        LSL     grpCount,subFFTSize,#2                      @// grpCount = 4*subFFTSize
+        LSR     subFFTNum,subFFTNum,#2  
+        MOV     subFFTSize,grpCount
+        
+        VLD1     dW1,[pTwiddle]                             @//[wi | wr]
+        @// pT0+1 increments pT0 by 8 bytes
+        @// pT0+pointStep = increment of 8*pointStep bytes = 2*grpSize bytes
+        MOV     pointStep,subFFTNum,LSL #1
+        
+        
+        @// pOut0+1 increments pOut0 by 8 bytes
+        @// pOut0+outPointStep == increment of 8*outPointStep bytes = 2*size bytes
+        
+        MOV     stepTwiddle,#0
+        VLD1     dW2,[pTwiddle]                             @//[wi | wr]
+        SMULBB  outPointStep,grpCount,pointStep             @// signed 16x16 multiply of the low halfwords
+        LSL     pointStep,pointStep,#2                      @// 2*grpSize    
+        
+        VLD1     dW3,[pTwiddle]                             @//[wi | wr]; all three initial twiddles come from the same table entry
+        MOV     srcStep,pointStep,LSL #1                    @// srcStep = 2*pointStep
+        ADD     setStep,srcStep,pointStep                   @// setStep = 3*pointStep
+        @//RSB     setStep,setStep,#16                         @// setStep = - 3*pointStep+16
+        RSB     setStep,setStep,#0                         @// setStep = - 3*pointStep
+        SUB     srcStep,srcStep,#16                         @// srcStep = 2*pointStep-16
+        
+        MOV     dstStep,outPointStep,LSL #1
+        ADD     dstStep,dstStep,outPointStep                @// dstStep = 3*outPointStep
+        RSB     dstStep,dstStep,#16                          @// dstStep = - 3*outPointStep+16
+        
+        @// Outer loop: one pass per twiddle group; grpCount counts down by 4 per pass
+        
+grpLoop\name :	
+        
+        VLD2    {dXr0,dXi0},[pSrc],pointStep                @//  data[0]
+        ADD      stepTwiddle,stepTwiddle,pointStep
+        VLD2    {dXr1,dXi1},[pSrc],pointStep                @//  data[1]
+        ADD      pTwiddle,pTwiddle,stepTwiddle              @// set pTwiddle to the first point
+        VLD2    {dXr2,dXi2},[pSrc],pointStep                @//  data[2]
+        MOV      twStep,stepTwiddle,LSL #2
+        
+        VLD2    {dXr3,dXi3},[pSrc],setStep                  @//  data[3] & update pSrc for the next set
+        SUB      twStep,stepTwiddle,twStep                  @// twStep = -3*stepTwiddle
+        
+        MOV      setCount,pointStep,LSR #3
+        ADD     pSrc,pSrc,#16                         @// set pSrc to data[0] of the next set
+        ADD     pSrc,pSrc,pointStep                   @// increment to data[1] of the next set
+       
+        
+        @// Loop on the sets
+
+setLoop\name :	
+        
+        
+        
+        SUBS    setCount,setCount,#2                    @// two complex points per pass; these flags are consumed by the BGT at the end of the set loop
+        
+        .ifeqs  "\inverse", "TRUE"
+            VMULL   qT0,dXr1,dW1[0]
+            VMLAL   qT0,dXi1,dW1[1]                       @// real part
+            VMULL   qT1,dXi1,dW1[0]
+            VMLSL   qT1,dXr1,dW1[1]                       @// imag part
+            
+        .else
+            VMULL   qT0,dXr1,dW1[0]
+            VMLSL   qT0,dXi1,dW1[1]                       @// real part
+            VMULL   qT1,dXi1,dW1[0]
+            VMLAL   qT1,dXr1,dW1[1]                       @// imag part
+        
+        .endif
+        
+        VLD2    {dXr1,dXi1},[pSrc],pointStep              @//  data[1] for next iteration
+        
+        .ifeqs  "\inverse", "TRUE"
+            VMULL   qT2,dXr2,dW2[0]
+            VMLAL   qT2,dXi2,dW2[1]                       @// real part
+            VMULL   qT3,dXi2,dW2[0]
+            VMLSL   qT3,dXr2,dW2[1]                       @// imag part
+            
+        .else
+            VMULL   qT2,dXr2,dW2[0]
+            VMLSL   qT2,dXi2,dW2[1]                       @// real part
+            VMULL   qT3,dXi2,dW2[0]
+            VMLAL   qT3,dXr2,dW2[1]                       @// imag part
+        
+        .endif
+        
+        VRSHRN  dZr1,qT0,#31
+        VRSHRN  dZi1,qT1,#31
+        VLD2    {dXr2,dXi2},[pSrc],pointStep              @//  data[2] for next iteration
+        
+        
+        .ifeqs  "\inverse", "TRUE"
+            VMULL   qT0,dXr3,dW3[0]
+            VMLAL   qT0,dXi3,dW3[1]                       @// real part
+            VMULL   qT1,dXi3,dW3[0]
+            VMLSL   qT1,dXr3,dW3[1]                       @// imag part
+            
+        .else
+            VMULL   qT0,dXr3,dW3[0]
+            VMLSL   qT0,dXi3,dW3[1]                       @// real part
+            VMULL   qT1,dXi3,dW3[0]
+            VMLAL   qT1,dXr3,dW3[1]                       @// imag part
+        
+        .endif
+        
+        VRSHRN  dZr2,qT2,#31
+        VRSHRN  dZi2,qT3,#31
+        
+        
+        VRSHRN  dZr3,qT0,#31
+        VRSHRN  dZi3,qT1,#31
+        VLD2    {dXr3,dXi3},[pSrc],setStep            @//  data[3] & update pSrc to data[0]
+        
+        .ifeqs "\scaled", "TRUE"
+        
+            @// finish first stage of 4 point FFT 
+            VHADD    qY0,qX0,qZ2
+            VHSUB    qY2,qX0,qZ2
+                        
+            VLD2    {dXr0,dXi0},[pSrc]!          @//  data[0] for next iteration; NOTE(review): no :128 hint here, unlike the unscaled path - confirm intent
+            VHADD    qY1,qZ1,qZ3
+            VHSUB    qY3,qZ1,qZ3
+            
+            @// finish second stage of 4 point FFT 
+            
+            VHSUB    qZ0,qY2,qY1
+            
+            
+            .ifeqs  "\inverse", "TRUE"
+                
+                VHADD    dZr3,dYr0,dYi3
+                VST2    {dZr0,dZi0},[pDst :128],outPointStep
+                VHSUB    dZi3,dYi0,dYr3
+                
+                VHADD    qZ2,qY2,qY1
+                VST2    {dZr3,dZi3},[pDst :128],outPointStep
+            
+                VHSUB    dZr1,dYr0,dYi3
+                VST2    {dZr2,dZi2},[pDst :128],outPointStep
+                VHADD    dZi1,dYi0,dYr3
+            
+                VST2    {dZr1,dZi1},[pDst :128],dstStep
+                
+                
+            .else
+                
+                VHSUB    dZr1,dYr0,dYi3
+                VST2    {dZr0,dZi0},[pDst :128],outPointStep
+                VHADD    dZi1,dYi0,dYr3
+            
+                VHADD    qZ2,qY2,qY1
+                VST2    {dZr1,dZi1},[pDst :128],outPointStep
+            
+                VHADD    dZr3,dYr0,dYi3
+                VST2    {dZr2,dZi2},[pDst :128],outPointStep
+                VHSUB    dZi3,dYi0,dYr3
+            
+                VST2    {dZr3,dZi3},[pDst :128],dstStep
+
+            
+            .endif
+        
+        
+        .else
+        
+            @// finish first stage of 4 point FFT 
+            VADD    qY0,qX0,qZ2
+            VSUB    qY2,qX0,qZ2
+                        
+            VLD2    {dXr0,dXi0},[pSrc :128]!          @//  data[0] for next iteration
+            VADD    qY1,qZ1,qZ3
+            VSUB    qY3,qZ1,qZ3
+            
+            @// finish second stage of 4 point FFT 
+            
+            VSUB    qZ0,qY2,qY1
+            
+            
+            .ifeqs  "\inverse", "TRUE"
+                
+                VADD    dZr3,dYr0,dYi3
+                VST2    {dZr0,dZi0},[pDst :128],outPointStep
+                VSUB    dZi3,dYi0,dYr3
+                
+                VADD    qZ2,qY2,qY1
+                VST2    {dZr3,dZi3},[pDst :128],outPointStep
+            
+                VSUB    dZr1,dYr0,dYi3
+                VST2    {dZr2,dZi2},[pDst :128],outPointStep
+                VADD    dZi1,dYi0,dYr3
+            
+                VST2    {dZr1,dZi1},[pDst :128],dstStep
+                
+                
+            .else
+                
+                VSUB    dZr1,dYr0,dYi3
+                VST2    {dZr0,dZi0},[pDst :128],outPointStep
+                VADD    dZi1,dYi0,dYr3
+            
+                VADD    qZ2,qY2,qY1
+                VST2    {dZr1,dZi1},[pDst :128],outPointStep
+            
+                VADD    dZr3,dYr0,dYi3
+                VST2    {dZr2,dZi2},[pDst :128],outPointStep
+                VSUB    dZi3,dYi0,dYr3
+            
+                VST2    {dZr3,dZi3},[pDst :128],dstStep
+
+            
+            .endif
+            
+        .endif
+        
+        ADD     pSrc,pSrc,pointStep                         @// increment to data[1] of the next set; plain ADD keeps the SUBS flags for the BGT below
+        BGT     setLoop\name
+        
+        @// Twiddles for the next group are read at pTwiddle, +stepTwiddle and +2*stepTwiddle; the last load post-indexes by twStep (-3*stepTwiddle)
+        VLD1     dW1,[pTwiddle :64],stepTwiddle                  @//[wi | wr]
+        SUBS    grpCount,grpCount,#4                    @// subtract 4 since grpCount multiplied by 4               
+        VLD1     dW2,[pTwiddle :64],stepTwiddle                  @//[wi | wr]
+        ADD     pSrc,pSrc,srcStep                       @// increment pSrc for the next grp
+        VLD1     dW3,[pTwiddle :64],twStep                       @//[wi | wr]
+        BGT     grpLoop\name
+
+                
+        @// Reset and Swap pSrc and pDst for the next stage
+        MOV     t1,pDst
+        SUB     pDst,pSrc,outPointStep,LSL #2                  @// pDst = pSrc - 4*outPointStep (this stage's source buffer becomes the next destination)
+        SUB     pSrc,t1,outPointStep    
+        
+        
+        .endm
+        
+        
+        M_START armSP_FFTFwd_CToC_SC32_Radix4_OutOfPlace_unsafe,r4
+            FFTSTAGE "FALSE","FALSE",FWD    @// scaled=FALSE, inverse=FALSE; FWD is the suffix of the loop labels
+        M_END
+
+        
+        M_START armSP_FFTInv_CToC_SC32_Radix4_OutOfPlace_unsafe,r4
+            FFTSTAGE "FALSE","TRUE",INV    @// scaled=FALSE, inverse=TRUE; INV is the suffix of the loop labels
+        M_END
+ 
+        
+        M_START armSP_FFTFwd_CToC_SC32_Sfs_Radix4_OutOfPlace_unsafe,r4
+            FFTSTAGE "TRUE","FALSE",FWDSFS    @// scaled=TRUE (halving butterflies), inverse=FALSE; FWDSFS is the suffix of the loop labels
+        M_END
+
+        
+        M_START armSP_FFTInv_CToC_SC32_Sfs_Radix4_OutOfPlace_unsafe,r4
+            FFTSTAGE "TRUE","TRUE",INVSFS    @// scaled=TRUE (halving butterflies), inverse=TRUE; INVSFS is the suffix of the loop labels
+        M_END
+
+        
+	.end
--- a/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC32_Radix8_fs_unsafe_s.S
+++ b/media/openmax_dl/dl/sp/src/armSP_FFT_CToC_SC32_Radix8_fs_unsafe_s.S
@ -0,0 +1,595 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This file was originally licensed as follows. It has been
+@//  relicensed with permission from the copyright holders.
+@//
+
+@// 
+@// File Name:  armSP_FFT_CToC_SC32_Radix8_fs_unsafe_s.s
+@// OpenMAX DL: v1.0.2
+@// Last Modified Revision:   7770
+@// Last Modified Date:       Thu, 27 Sep 2007
+@// 
+@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@// 
+@// 
+@//
+@// Description:
+@// Compute a first stage Radix 8 FFT stage for a N point complex signal
+@// 
+
+
+
+        
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+        
+@// Import symbols required from other files
+@// (For example tables)
+    
+        
+@// Set debugging level        
+@//DEBUG_ON    SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+    
+    
+    
+    
+@// Guarding implementation by the processor name
+    
+@//Input Registers
+
+#define pSrc		r0
+#define pDst		r2
+#define pTwiddle	r1
+#define subFFTNum	r6
+#define subFFTSize	r7
+@// dest buffer for the next stage (not pSrc for first stage) 	
+#define pPingPongBuf	r5
+
+
+@//Output Registers
+
+
+@//Local Scratch Registers
+@// pointStep and outPointStep share r4 (equal in the first stage); t0 carries the ONEBYSQRT2 constant
+#define grpSize		r3
+@// Reuse grpSize as setCount	
+#define setCount	r3
+#define pointStep	r4
+#define outPointStep	r4
+#define setStep		r8
+#define step1		r9
+#define step2		r10
+#define t0		r11
+  
+
+@// Neon Registers (each Qn overlays the pair D2n, D2n+1; dX* and qX* hold the eight input points)
+
+#define dXr0	D0.S32
+#define dXi0	D1.S32
+#define dXr1	D2.S32
+#define dXi1	D3.S32
+#define dXr2	D4.S32
+#define dXi2	D5.S32
+#define dXr3	D6.S32
+#define dXi3	D7.S32
+#define dXr4	D8.S32
+#define dXi4	D9.S32
+#define dXr5	D10.S32
+#define dXi5	D11.S32
+#define dXr6	D12.S32
+#define dXi6	D13.S32
+#define dXr7	D14.S32
+#define dXi7	D15.S32
+#define qX0	Q0.S32
+#define qX1	Q1.S32
+#define qX2	Q2.S32
+#define qX3	Q3.S32   
+#define qX4	Q4.S32
+#define qX5	Q5.S32
+#define qX6	Q6.S32
+#define qX7	Q7.S32
+@// dU*/qU*: first butterfly stage results (even indices in D16-D23, odd indices in D24-D31)
+#define dUr0	D16.S32
+#define dUi0	D17.S32
+#define dUr2	D18.S32
+#define dUi2	D19.S32
+#define dUr4	D20.S32
+#define dUi4	D21.S32
+#define dUr6	D22.S32
+#define dUi6	D23.S32
+#define dUr1	D24.S32
+#define dUi1	D25.S32
+#define dUr3	D26.S32
+#define dUi3	D27.S32
+#define dUr5	D28.S32
+#define dUi5	D29.S32
+@// NOTE(review): this comment used to read: reuse dXr7 and dXi7. dUr7/dUi7 map to D30/D31; it is dT0/dT1 below that share D14/D15 with dXr7/dXi7
+#define dUr7	D30.S32
+#define dUi7	D31.S32
+#define qU0	Q8.S32
+#define qU1	Q12.S32
+#define qU2	Q9.S32
+#define qU3	Q13.S32   
+#define qU4	Q10.S32
+#define qU5	Q14.S32
+#define qU6	Q11.S32
+#define qU7	Q15.S32
+@// dV*/qV*: second butterfly stage results (even indices in D24-D31, odd indices in D16-D23, i.e. swapped relative to dU*)
+
+
+#define dVr0	D24.S32
+#define dVi0	D25.S32
+#define dVr2	D26.S32
+#define dVi2	D27.S32
+#define dVr4	D28.S32
+#define dVi4	D29.S32
+#define dVr6	D30.S32
+#define dVi6	D31.S32
+#define dVr1	D16.S32
+#define dVi1	D17.S32
+#define dVr3	D18.S32
+#define dVi3	D19.S32
+#define dVr5	D20.S32
+#define dVi5	D21.S32
+#define dVr7	D22.S32              
+#define dVi7	D23.S32              
+#define qV0	Q12.S32
+#define qV1	Q8.S32
+#define qV2	Q13.S32
+#define qV3	Q9.S32   
+#define qV4	Q14.S32
+#define qV5	Q10.S32
+#define qV6	Q15.S32
+#define qV7	Q11.S32
+@// dY*/qY*: third stage results that are stored through pDst (same register layout as dU*)
+
+
+#define dYr0	D16.S32
+#define dYi0	D17.S32
+#define dYr2	D18.S32
+#define dYi2	D19.S32
+#define dYr4	D20.S32
+#define dYi4	D21.S32
+#define dYr6	D22.S32
+#define dYi6	D23.S32
+#define dYr1	D24.S32
+#define dYi1	D25.S32
+#define dYr3	D26.S32
+#define dYi3	D27.S32
+#define dYr5	D28.S32
+#define dYi5	D29.S32
+#define dYr7	D30.S32                 
+#define dYi7	D31.S32
+#define qY0	Q8.S32
+#define qY1	Q12.S32
+#define qY2	Q9.S32
+#define qY3	Q13.S32   
+#define qY4	Q10.S32
+#define qY5	Q14.S32
+#define qY6	Q11.S32
+#define qY7	Q15.S32
+
+@// dT0 lane 0 receives ONEBYSQRT2 from t0; dT1 is scratch for the scaled products. Both share D14/D15 with dXr7/dXi7
+#define dT0	D14.S32             
+#define dT1	D15.S32
+
+@// Define constants
+	.set ONEBYSQRT2, 0x5A82799A        @// 1/sqrt(2) in Q31 format
+    
+
+        .MACRO FFTSTAGE scaled, inverse, name
+        @// Emits the first FFT stage as radix-8 butterflies; pTwiddle is not read, only the ONEBYSQRT2 constant is needed. scaled=TRUE uses halving adds, inverse=TRUE selects the inverse butterfly wiring
+        @// Define stack arguments
+        
+        @// Update pSubFFTSize and pSubFFTNum regs
+        MOV     subFFTSize,#8                               @// subFFTSize = 8 once this first radix-8 stage is done
+        LDR     t0,=ONEBYSQRT2                              @// t0=(1/sqrt(2)) as Q31 value 
+        
+        @// Note: setCount = subFFTNum/8 (reuse the grpSize reg for setCount)
+        LSR     grpSize,subFFTNum,#3  
+        MOV     subFFTNum,grpSize
+        
+                
+        @// pT0+1 increments pT0 by 8 bytes
+        @// pT0+pointStep = increment of 8*pointStep bytes = grpSize bytes
+        @// Note: outPointStep = pointStep for firststage
+        
+        MOV     pointStep,grpSize,LSL #3
+        
+                                       
+        @// Calculate the step of input data for the next set
+        @//MOV     step1,pointStep,LSL #1                      @// step1 = 2*pointStep
+        VLD2    {dXr0,dXi0},[pSrc :128],pointStep          @//  data[0]
+        MOV     step1,grpSize,LSL #4
+        
+        MOV     step2,pointStep,LSL #3
+        VLD2    {dXr1,dXi1},[pSrc :128],pointStep          @//  data[1]
+        SUB     step2,step2,pointStep                          @// step2 = 7*pointStep
+        RSB     setStep,step2,#16                              @// setStep = - 7*pointStep+16
+        
+        VLD2    {dXr2,dXi2},[pSrc :128],pointStep          @//  data[2]
+        VLD2    {dXr3,dXi3},[pSrc :128],pointStep          @//  data[3] 
+        VLD2    {dXr4,dXi4},[pSrc :128],pointStep          @//  data[4]
+        VLD2    {dXr5,dXi5},[pSrc :128],pointStep          @//  data[5]
+        VLD2    {dXr6,dXi6},[pSrc :128],pointStep          @//  data[6]
+        VLD2    {dXr7,dXi7},[pSrc :128],setStep            @//  data[7] & update pSrc for the next set
+                                                      @//  setStep = -7*pointStep + 16  
+        @// grp = 0 a special case since all the twiddle factors are 1
+        @// Loop on the sets
+
+grpZeroSetLoop\name :	
+                                                      
+        @// Decrement setcount
+        SUBS    setCount,setCount,#2                    @// decrement the set loop counter           
+                                                                         
+        
+        .ifeqs	"\scaled", "TRUE"
+            @// finish first stage of 8 point FFT 
+            
+            VHADD    qU0,qX0,qX4
+            VHADD    qU2,qX1,qX5
+            VHADD    qU4,qX2,qX6
+            VHADD    qU6,qX3,qX7
+            
+            @// finish second stage of 8 point FFT 
+            
+            VHADD    qV0,qU0,qU4
+            VHSUB    qV2,qU0,qU4
+            VHADD    qV4,qU2,qU6
+            VHSUB    qV6,qU2,qU6
+            
+            @// finish third stage of 8 point FFT 
+            
+            VHADD    qY0,qV0,qV4
+            VHSUB    qY4,qV0,qV4
+            VST2    {dYr0,dYi0},[pDst :128],step1                    @// store y0
+            
+            .ifeqs	"\inverse", "TRUE"
+                
+                VHSUB    dYr2,dVr2,dVi6
+                VHADD    dYi2,dVi2,dVr6
+                
+                VHADD    dYr6,dVr2,dVi6
+                VST2    {dYr2,dYi2},[pDst :128],step1                    @// store y2
+                VHSUB    dYi6,dVi2,dVr6
+            
+                VHSUB    qU1,qX0,qX4                    
+                VST2    {dYr4,dYi4},[pDst :128],step1                    @// store y4
+            
+                VHSUB    qU3,qX1,qX5
+                VHSUB    qU5,qX2,qX6
+                VST2    {dYr6,dYi6},[pDst :128],step1                    @// store y6
+            
+            .ELSE
+            @// Forward case: the same sums serve the mirrored outputs, so y2 is stored from the dY*6 registers and y6 from dY*2
+                VHADD    dYr6,dVr2,dVi6
+                VHSUB    dYi6,dVi2,dVr6
+                
+                VHSUB    dYr2,dVr2,dVi6
+                VST2    {dYr6,dYi6},[pDst :128],step1                    @// store y2
+                VHADD    dYi2,dVi2,dVr6
+                
+                                
+                VHSUB    qU1,qX0,qX4
+                VST2    {dYr4,dYi4},[pDst :128],step1                    @// store y4
+                VHSUB    qU3,qX1,qX5
+                VHSUB    qU5,qX2,qX6
+                VST2    {dYr2,dYi2},[pDst :128],step1                    @// store y6
+
+            
+            .ENDIF
+            
+            @// finish first stage of 8 point FFT 
+            
+            VHSUB    qU7,qX3,qX7
+            VMOV    dT0[0],t0                                   @// set on every pass: D14 is also dXr7 and is overwritten by the data[7] load
+            
+            @// finish second stage of 8 point FFT 
+            
+            VHSUB    dVr1,dUr1,dUi5
+            VLD2    {dXr0,dXi0},[pSrc :128],pointStep          @//  data[0] for next iteration
+            VHADD    dVi1,dUi1,dUr5
+            VHADD    dVr3,dUr1,dUi5
+            VLD2    {dXr1,dXi1},[pSrc :128],pointStep          @//  data[1]
+            VHSUB    dVi3,dUi1,dUr5
+                        
+            VHSUB    dVr5,dUr3,dUi7
+            VLD2    {dXr2,dXi2},[pSrc :128],pointStep          @//  data[2]
+            VHADD    dVi5,dUi3,dUr7
+            VHADD    dVr7,dUr3,dUi7
+            VLD2    {dXr3,dXi3},[pSrc :128],pointStep          @//  data[3]
+            VHSUB    dVi7,dUi3,dUr7
+            
+            @// finish third stage of 8 point FFT 
+            
+            .ifeqs	"\inverse", "TRUE"
+            
+                @// calculate a*v5 
+                VQRDMULH    dT1,dVr5,dT0[0]                         @// dT1 is D15 (shared with dXi7)
+                VLD2    {dXr4,dXi4},[pSrc :128],pointStep          @//  data[4]
+                VQRDMULH    dVi5,dVi5,dT0[0]
+                            
+                VLD2    {dXr5,dXi5},[pSrc :128],pointStep          @//  data[5]
+                VSUB    dVr5,dT1,dVi5                               @// a * V5
+                VADD    dVi5,dT1,dVi5
+                
+                VLD2    {dXr6,dXi6},[pSrc :128],pointStep          @//  data[6]
+                
+                @// calculate  b*v7
+                VQRDMULH    dT1,dVr7,dT0[0]
+                VQRDMULH    dVi7,dVi7,dT0[0]
+                
+                VHADD    qY1,qV1,qV5
+                VHSUB    qY5,qV1,qV5
+                
+                            
+                VADD    dVr7,dT1,dVi7                               @// b * V7
+                VSUB    dVi7,dVi7,dT1
+                SUB     pDst, pDst, step2                           @// set pDst to y1
+                
+                VLD2    {dXr7,dXi7},[pSrc :128],setStep            @//  data[7]; loaded only after the last use of dT0/dT1 (same D14/D15)
+                
+                
+                VHSUB    dYr3,dVr3,dVr7
+                VHSUB    dYi3,dVi3,dVi7
+                VST2    {dYr1,dYi1},[pDst :128],step1                    @// store y1
+                VHADD    dYr7,dVr3,dVr7
+                VHADD    dYi7,dVi3,dVi7
+
+                
+                VST2    {dYr3,dYi3},[pDst :128],step1                    @// store y3
+                VST2    {dYr5,dYi5},[pDst :128],step1                    @// store y5
+                VST2    {dYr7,dYi7},[pDst :128]!                      @// store y7
+
+            .ELSE
+            @// Forward case: odd outputs are stored from the mirrored registers (y1 from dY*7, y3 from dY*5, y5 from dY*3, y7 from dY*1)
+                @// calculate  b*v7
+                VQRDMULH    dT1,dVr7,dT0[0]
+                VLD2    {dXr4,dXi4},[pSrc :128],pointStep          @//  data[4]
+                VQRDMULH    dVi7,dVi7,dT0[0]
+                
+                VLD2    {dXr5,dXi5},[pSrc :128],pointStep          @//  data[5]
+                VADD    dVr7,dT1,dVi7                               @// b * V7
+                VSUB    dVi7,dVi7,dT1
+                
+                VLD2    {dXr6,dXi6},[pSrc :128],pointStep          @//  data[6]
+                
+                @// calculate a*v5 
+                VQRDMULH    dT1,dVr5,dT0[0]                         @// dT1 is D15 (shared with dXi7)
+                VQRDMULH    dVi5,dVi5,dT0[0]
+
+                VHADD    dYr7,dVr3,dVr7
+                VHADD    dYi7,dVi3,dVi7
+                SUB     pDst, pDst, step2                           @// set pDst to y1
+            
+                VSUB    dVr5,dT1,dVi5                               @// a * V5
+                VADD    dVi5,dT1,dVi5
+                VLD2    {dXr7,dXi7},[pSrc :128],setStep            @//  data[7]; loaded only after the last use of dT0/dT1 (same D14/D15)
+                
+                VHSUB    qY5,qV1,qV5
+                
+                VHSUB    dYr3,dVr3,dVr7
+                VST2    {dYr7,dYi7},[pDst :128],step1                    @// store y1
+                VHSUB    dYi3,dVi3,dVi7
+                VHADD    qY1,qV1,qV5
+                
+                
+                VST2    {dYr5,dYi5},[pDst :128],step1                    @// store y3
+                VST2    {dYr3,dYi3},[pDst :128],step1                    @// store y5
+                VST2    {dYr1,dYi1},[pDst :128]!                      @// store y7
+            
+            .ENDIF
+            
+            
+           
+        .ELSE
+            @// finish first stage of 8 point FFT 
+            
+            VADD    qU0,qX0,qX4
+            VADD    qU2,qX1,qX5
+            VADD    qU4,qX2,qX6
+            VADD    qU6,qX3,qX7
+            
+            @// finish second stage of 8 point FFT 
+            
+            VADD    qV0,qU0,qU4
+            VSUB    qV2,qU0,qU4
+            VADD    qV4,qU2,qU6
+            VSUB    qV6,qU2,qU6
+            
+            @// finish third stage of 8 point FFT 
+            
+            VADD    qY0,qV0,qV4
+            VSUB    qY4,qV0,qV4
+            VST2    {dYr0,dYi0},[pDst :128],step1                    @// store y0
+            
+            .ifeqs	"\inverse", "TRUE"
+                
+                VSUB    dYr2,dVr2,dVi6
+                VADD    dYi2,dVi2,dVr6
+                
+                VADD    dYr6,dVr2,dVi6
+                VST2    {dYr2,dYi2},[pDst :128],step1                    @// store y2
+                VSUB    dYi6,dVi2,dVr6
+            
+                VSUB    qU1,qX0,qX4                    
+                VST2    {dYr4,dYi4},[pDst :128],step1                    @// store y4
+            
+                VSUB    qU3,qX1,qX5
+                VSUB    qU5,qX2,qX6
+                VST2    {dYr6,dYi6},[pDst :128],step1                    @// store y6
+            
+            .ELSE
+            @// Forward case: the same sums serve the mirrored outputs, so y2 is stored from the dY*6 registers and y6 from dY*2
+                VADD    dYr6,dVr2,dVi6
+                VSUB    dYi6,dVi2,dVr6
+                
+                VSUB    dYr2,dVr2,dVi6
+                VST2    {dYr6,dYi6},[pDst :128],step1                    @// store y2
+                VADD    dYi2,dVi2,dVr6
+                
+                                
+                VSUB    qU1,qX0,qX4
+                VST2    {dYr4,dYi4},[pDst :128],step1                    @// store y4
+                VSUB    qU3,qX1,qX5
+                VSUB    qU5,qX2,qX6
+                VST2    {dYr2,dYi2},[pDst :128],step1                    @// store y6
+
+            
+            .ENDIF
+            
+            @// finish first stage of 8 point FFT 
+            
+            VSUB    qU7,qX3,qX7
+            VMOV    dT0[0],t0                                   @// set on every pass: D14 is also dXr7 and is overwritten by the data[7] load
+            
+            @// finish second stage of 8 point FFT 
+            
+            VSUB    dVr1,dUr1,dUi5
+            VLD2    {dXr0,dXi0},[pSrc :128],pointStep          @//  data[0] for next iteration
+            VADD    dVi1,dUi1,dUr5
+            VADD    dVr3,dUr1,dUi5
+            VLD2    {dXr1,dXi1},[pSrc :128],pointStep          @//  data[1]
+            VSUB    dVi3,dUi1,dUr5
+                        
+            VSUB    dVr5,dUr3,dUi7
+            VLD2    {dXr2,dXi2},[pSrc :128],pointStep          @//  data[2]
+            VADD    dVi5,dUi3,dUr7
+            VADD    dVr7,dUr3,dUi7
+            VLD2    {dXr3,dXi3},[pSrc :128],pointStep          @//  data[3]
+            VSUB    dVi7,dUi3,dUr7
+            
+            @// finish third stage of 8 point FFT 
+            
+            .ifeqs	"\inverse", "TRUE"
+            
+                @// calculate a*v5 
+                VQRDMULH    dT1,dVr5,dT0[0]                         @// dT1 is D15 (shared with dXi7)
+                VLD2    {dXr4,dXi4},[pSrc :128],pointStep          @//  data[4]
+                VQRDMULH    dVi5,dVi5,dT0[0]
+                            
+                VLD2    {dXr5,dXi5},[pSrc :128],pointStep          @//  data[5]
+                VSUB    dVr5,dT1,dVi5                               @// a * V5
+                VADD    dVi5,dT1,dVi5
+                
+                VLD2    {dXr6,dXi6},[pSrc :128],pointStep          @//  data[6]
+                
+                @// calculate  b*v7
+                VQRDMULH    dT1,dVr7,dT0[0]
+                VQRDMULH    dVi7,dVi7,dT0[0]
+                
+                VADD    qY1,qV1,qV5
+                VSUB    qY5,qV1,qV5
+                
+                            
+                VADD    dVr7,dT1,dVi7                               @// b * V7
+                VSUB    dVi7,dVi7,dT1
+                SUB     pDst, pDst, step2                           @// set pDst to y1
+                
+                VLD2    {dXr7,dXi7},[pSrc :128],setStep            @//  data[7]; loaded only after the last use of dT0/dT1 (same D14/D15)
+                
+                
+                VSUB    dYr3,dVr3,dVr7
+                VSUB    dYi3,dVi3,dVi7
+                VST2    {dYr1,dYi1},[pDst :128],step1                    @// store y1
+                VADD    dYr7,dVr3,dVr7
+                VADD    dYi7,dVi3,dVi7
+
+                
+                VST2    {dYr3,dYi3},[pDst :128],step1                    @// store y3
+                VST2    {dYr5,dYi5},[pDst :128],step1                    @// store y5
+                VST2    {dYr7,dYi7},[pDst :128]!                      @// store y7
+
+            .ELSE
+            @// Forward case: odd outputs are stored from the mirrored registers (y1 from dY*7, y3 from dY*5, y5 from dY*3, y7 from dY*1)
+                @// calculate  b*v7
+                VQRDMULH    dT1,dVr7,dT0[0]
+                VLD2    {dXr4,dXi4},[pSrc :128],pointStep          @//  data[4]
+                VQRDMULH    dVi7,dVi7,dT0[0]
+                
+                VLD2    {dXr5,dXi5},[pSrc :128],pointStep          @//  data[5]
+                VADD    dVr7,dT1,dVi7                               @// b * V7
+                VSUB    dVi7,dVi7,dT1
+                
+                VLD2    {dXr6,dXi6},[pSrc :128],pointStep          @//  data[6]
+                
+                @// calculate a*v5 
+                VQRDMULH    dT1,dVr5,dT0[0]                         @// dT1 is D15 (shared with dXi7)
+                VQRDMULH    dVi5,dVi5,dT0[0]
+
+                VADD    dYr7,dVr3,dVr7
+                VADD    dYi7,dVi3,dVi7
+                SUB     pDst, pDst, step2                           @// set pDst to y1
+            
+                VSUB    dVr5,dT1,dVi5                               @// a * V5
+                VADD    dVi5,dT1,dVi5
+                VLD2    {dXr7,dXi7},[pSrc :128],setStep            @//  data[7]; loaded only after the last use of dT0/dT1 (same D14/D15)
+                
+                VSUB    qY5,qV1,qV5
+                
+                VSUB    dYr3,dVr3,dVr7
+                VST2    {dYr7,dYi7},[pDst :128],step1                    @// store y1
+                VSUB    dYi3,dVi3,dVi7
+                VADD    qY1,qV1,qV5
+                
+                
+                VST2    {dYr5,dYi5},[pDst :128],step1                    @// store y3
+                VST2    {dYr3,dYi3},[pDst :128],step1                    @// store y5
+                VST2    {dYr1,dYi1},[pDst :128]!                      @// store y7
+            
+            .ENDIF
+            
+            
+        .ENDIF
+        
+        SUB     pDst, pDst, step2                               @// update pDst for the next set; plain SUB keeps the setCount SUBS flags for the BGT below
+        BGT     grpZeroSetLoop\name
+        
+        
+        @// reset pSrc to pDst for the next stage
+        SUB     pSrc,pDst,pointStep                             @// pSrc = pDst - pointStep (pDst advanced 16 bytes per pass of the set loop)
+        MOV     pDst,pPingPongBuf 
+        
+        
+        
+        .endm
+        
+
+        @// Allocate stack memory required by the function
+        
+        
+        M_START armSP_FFTFwd_CToC_SC32_Radix8_fs_OutOfPlace_unsafe,r4
+            FFTSTAGE "FALSE","FALSE",FWD    @// scaled=FALSE, inverse=FALSE; FWD is the suffix of the loop label
+        M_END
+
+        
+        M_START armSP_FFTInv_CToC_SC32_Radix8_fs_OutOfPlace_unsafe,r4
+            FFTSTAGE "FALSE","TRUE",INV    @// scaled=FALSE, inverse=TRUE; INV is the suffix of the loop label
+        M_END
+ 
+        
+        M_START armSP_FFTFwd_CToC_SC32_Sfs_Radix8_fs_OutOfPlace_unsafe,r4
+            FFTSTAGE "TRUE","FALSE",FWDSFS    @// scaled=TRUE (halving butterflies), inverse=FALSE; FWDSFS is the suffix of the loop label
+        M_END
+
+        
+        M_START armSP_FFTInv_CToC_SC32_Sfs_Radix8_fs_OutOfPlace_unsafe,r4
+            FFTSTAGE "TRUE","TRUE",INVSFS    @// scaled=TRUE (halving butterflies), inverse=TRUE; INVSFS is the suffix of the loop label
+        M_END
+
+    
+	.end
--- a/media/openmax_dl/dl/sp/src/armSP_FFT_F32TwiddleTable.c
+++ b/media/openmax_dl/dl/sp/src/armSP_FFT_F32TwiddleTable.c
--- a/media/openmax_dl/dl/sp/src/armSP_FFT_S32TwiddleTable.c
+++ b/media/openmax_dl/dl/sp/src/armSP_FFT_S32TwiddleTable.c
@ -0,0 +1,556 @@
+/*
+ *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ *
+ *  This file was originally licensed as follows. It has been
+ *  relicensed with permission from the copyright holders.
+ */
+
+/**
+ * 
+ * File Name:  armSP_FFT_S32TwiddleTable.c
+ * OpenMAX DL: v1.0.2
+ * Last Modified Revision:   6781
+ * Last Modified Date:       Wed, 25 Jul 2007
+ * 
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ * 
+ * 
+ *
+ * Description:
+ * Twiddle table for Forward FFT in Q31 format.
+ * It contains complex pairs [-cos (W * i), -sin (W * i)] where W = -2*PI/N
+ * and 0 <= i <= N/8.  N is the max size of the FFT. Here N = 2^12, so the
+ * table holds 2 * (N/8 + 1) = 1026 entries. Values for N/8 < i < N are
+ * generated in the FFTInit function using the symmetries of cos and sine.
+ * 
+ * NOTE: The values are stored negated. This is to represent '1' which cannot be otherwise 
+ * represented as Q31 in 32 bits. 
+**/
+
+#include "dl/api/omxtypes.h"
+
+
+const OMX_S32 armSP_FFT_S32TwiddleTable[1026] ={
+
+0x80000000,		0x0,
+0x800009df,		0x3243f5,
+0x8000277a,		0x6487e3,
+0x800058d4,		0x96cbc1,
+0x80009dea,		0xc90f88,
+0x8000f6bd,		0xfb5330,
+0x8001634e,		0x12d96b1,
+0x8001e39b,		0x15fda03,
+0x800277a6,		0x1921d20,
+0x80031f6d,		0x1c45ffe,
+0x8003daf1,		0x1f6a297,
+0x8004aa32,		0x228e4e2,
+0x80058d2f,		0x25b26d7,
+0x800683e8,		0x28d6870,
+0x80078e5e,		0x2bfa9a4,
+0x8008ac90,		0x2f1ea6c,
+0x8009de7e,		0x3242abf,
+0x800b2427,		0x3566a96,
+0x800c7d8c,		0x388a9ea,
+0x800deaad,		0x3bae8b2,
+0x800f6b88,		0x3ed26e6,
+0x8011001f,		0x41f6480,
+0x8012a86f,		0x451a177,
+0x8014647b,		0x483ddc3,
+0x80163440,		0x4b6195d,
+0x801817bf,		0x4e8543e,
+0x801a0ef8,		0x51a8e5c,
+0x801c19ea,		0x54cc7b1,
+0x801e3895,		0x57f0035,
+0x80206af8,		0x5b137df,
+0x8022b114,		0x5e36ea9,
+0x80250ae7,		0x615a48b,
+0x80277872,		0x647d97c,
+0x8029f9b4,		0x67a0d76,
+0x802c8ead,		0x6ac406f,
+0x802f375d,		0x6de7262,
+0x8031f3c2,		0x710a345,
+0x8034c3dd,		0x742d311,
+0x8037a7ac,		0x77501be,
+0x803a9f31,		0x7a72f45,
+0x803daa6a,		0x7d95b9e,
+0x8040c956,		0x80b86c2,
+0x8043fbf6,		0x83db0a7,
+0x80474248,		0x86fd947,
+0x804a9c4d,		0x8a2009a,
+0x804e0a04,		0x8d42699,
+0x80518b6b,		0x9064b3a,
+0x80552084,		0x9386e78,
+0x8058c94c,		0x96a9049,
+0x805c85c4,		0x99cb0a7,
+0x806055eb,		0x9cecf89,
+0x806439c0,		0xa00ece8,
+0x80683143,		0xa3308bd,
+0x806c3c74,		0xa6522fe,
+0x80705b50,		0xa973ba5,
+0x80748dd9,		0xac952aa,
+0x8078d40d,		0xafb6805,
+0x807d2dec,		0xb2d7baf,
+0x80819b74,		0xb5f8d9f,
+0x80861ca6,		0xb919dcf,
+0x808ab180,		0xbc3ac35,
+0x808f5a02,		0xbf5b8cb,
+0x8094162c,		0xc27c389,
+0x8098e5fb,		0xc59cc68,
+0x809dc971,		0xc8bd35e,
+0x80a2c08b,		0xcbdd865,
+0x80a7cb49,		0xcefdb76,
+0x80ace9ab,		0xd21dc87,
+0x80b21baf,		0xd53db92,
+0x80b76156,		0xd85d88f,
+0x80bcba9d,		0xdb7d376,
+0x80c22784,		0xde9cc40,
+0x80c7a80a,		0xe1bc2e4,
+0x80cd3c2f,		0xe4db75b,
+0x80d2e3f2,		0xe7fa99e,
+0x80d89f51,		0xeb199a4,
+0x80de6e4c,		0xee38766,
+0x80e450e2,		0xf1572dc,
+0x80ea4712,		0xf475bff,
+0x80f050db,		0xf7942c7,
+0x80f66e3c,		0xfab272b,
+0x80fc9f35,		0xfdd0926,
+0x8102e3c4,		0x100ee8ad,
+0x81093be8,		0x1040c5bb,
+0x810fa7a0,		0x1072a048,
+0x811626ec,		0x10a4784b,
+0x811cb9ca,		0x10d64dbd,
+0x8123603a,		0x11082096,
+0x812a1a3a,		0x1139f0cf,
+0x8130e7c9,		0x116bbe60,
+0x8137c8e6,		0x119d8941,
+0x813ebd90,		0x11cf516a,
+0x8145c5c7,		0x120116d5,
+0x814ce188,		0x1232d979,
+0x815410d4,		0x1264994e,
+0x815b53a8,		0x1296564d,
+0x8162aa04,		0x12c8106f,
+0x816a13e6,		0x12f9c7aa,
+0x8171914e,		0x132b7bf9,
+0x8179223a,		0x135d2d53,
+0x8180c6a9,		0x138edbb1,
+0x81887e9a,		0x13c0870a,
+0x81904a0c,		0x13f22f58,
+0x819828fd,		0x1423d492,
+0x81a01b6d,		0x145576b1,
+0x81a82159,		0x148715ae,
+0x81b03ac2,		0x14b8b17f,
+0x81b867a5,		0x14ea4a1f,
+0x81c0a801,		0x151bdf86,
+0x81c8fbd6,		0x154d71aa,
+0x81d16321,		0x157f0086,
+0x81d9dde1,		0x15b08c12,
+0x81e26c16,		0x15e21445,
+0x81eb0dbe,		0x16139918,
+0x81f3c2d7,		0x16451a83,
+0x81fc8b60,		0x1676987f,
+0x82056758,		0x16a81305,
+0x820e56be,		0x16d98a0c,
+0x82175990,		0x170afd8d,
+0x82206fcc,		0x173c6d80,
+0x82299971,		0x176dd9de,
+0x8232d67f,		0x179f429f,
+0x823c26f3,		0x17d0a7bc,
+0x82458acc,		0x1802092c,
+0x824f0208,		0x183366e9,
+0x82588ca7,		0x1864c0ea,
+0x82622aa6,		0x18961728,
+0x826bdc04,		0x18c7699b,
+0x8275a0c0,		0x18f8b83c,
+0x827f78d8,		0x192a0304,
+0x8289644b,		0x195b49ea,
+0x82936317,		0x198c8ce7,
+0x829d753a,		0x19bdcbf3,
+0x82a79ab3,		0x19ef0707,
+0x82b1d381,		0x1a203e1b,
+0x82bc1fa2,		0x1a517128,
+0x82c67f14,		0x1a82a026,
+0x82d0f1d5,		0x1ab3cb0d,
+0x82db77e5,		0x1ae4f1d6,
+0x82e61141,		0x1b161479,
+0x82f0bde8,		0x1b4732ef,
+0x82fb7dd8,		0x1b784d30,
+0x83065110,		0x1ba96335,
+0x8311378d,		0x1bda74f6,
+0x831c314e,		0x1c0b826a,
+0x83273e52,		0x1c3c8b8c,
+0x83325e97,		0x1c6d9053,
+0x833d921b,		0x1c9e90b8,
+0x8348d8dc,		0x1ccf8cb3,
+0x835432d8,		0x1d00843d,
+0x835fa00f,		0x1d31774d,
+0x836b207d,		0x1d6265dd,
+0x8376b422,		0x1d934fe5,
+0x83825afb,		0x1dc4355e,
+0x838e1507,		0x1df5163f,
+0x8399e244,		0x1e25f282,
+0x83a5c2b0,		0x1e56ca1e,
+0x83b1b649,		0x1e879d0d,
+0x83bdbd0e,		0x1eb86b46,
+0x83c9d6fc,		0x1ee934c3,
+0x83d60412,		0x1f19f97b,
+0x83e2444d,		0x1f4ab968,
+0x83ee97ad,		0x1f7b7481,
+0x83fafe2e,		0x1fac2abf,
+0x840777d0,		0x1fdcdc1b,
+0x84140490,		0x200d888d,
+0x8420a46c,		0x203e300d,
+0x842d5762,		0x206ed295,
+0x843a1d70,		0x209f701c,
+0x8446f695,		0x20d0089c,
+0x8453e2cf,		0x21009c0c,
+0x8460e21a,		0x21312a65,
+0x846df477,		0x2161b3a0,
+0x847b19e1,		0x219237b5,
+0x84885258,		0x21c2b69c,
+0x84959dd9,		0x21f3304f,
+0x84a2fc62,		0x2223a4c5,
+0x84b06df2,		0x225413f8,
+0x84bdf286,		0x22847de0,
+0x84cb8a1b,		0x22b4e274,
+0x84d934b1,		0x22e541af,
+0x84e6f244,		0x23159b88,
+0x84f4c2d4,		0x2345eff8,
+0x8502a65c,		0x23763ef7,
+0x85109cdd,		0x23a6887f,
+0x851ea652,		0x23d6cc87,
+0x852cc2bb,		0x24070b08,
+0x853af214,		0x243743fa,
+0x8549345c,		0x24677758,
+0x85578991,		0x2497a517,
+0x8565f1b0,		0x24c7cd33,
+0x85746cb8,		0x24f7efa2,
+0x8582faa5,		0x25280c5e,
+0x85919b76,		0x2558235f,
+0x85a04f28,		0x2588349d,
+0x85af15b9,		0x25b84012,
+0x85bdef28,		0x25e845b6,
+0x85ccdb70,		0x26184581,
+0x85dbda91,		0x26483f6c,
+0x85eaec88,		0x26783370,
+0x85fa1153,		0x26a82186,
+0x860948ef,		0x26d809a5,
+0x86189359,		0x2707ebc7,
+0x8627f091,		0x2737c7e3,
+0x86376092,		0x27679df4,
+0x8646e35c,		0x27976df1,
+0x865678eb,		0x27c737d3,
+0x8666213c,		0x27f6fb92,
+0x8675dc4f,		0x2826b928,
+0x8685aa20,		0x2856708d,
+0x86958aac,		0x288621b9,
+0x86a57df2,		0x28b5cca5,
+0x86b583ee,		0x28e5714b,
+0x86c59c9f,		0x29150fa1,
+0x86d5c802,		0x2944a7a2,
+0x86e60614,		0x29743946,
+0x86f656d3,		0x29a3c485,
+0x8706ba3d,		0x29d34958,
+0x8717304e,		0x2a02c7b8,
+0x8727b905,		0x2a323f9e,
+0x8738545e,		0x2a61b101,
+0x87490258,		0x2a911bdc,
+0x8759c2ef,		0x2ac08026,
+0x876a9621,		0x2aefddd8,
+0x877b7bec,		0x2b1f34eb,
+0x878c744d,		0x2b4e8558,
+0x879d7f41,		0x2b7dcf17,
+0x87ae9cc5,		0x2bad1221,
+0x87bfccd7,		0x2bdc4e6f,
+0x87d10f75,		0x2c0b83fa,
+0x87e2649b,		0x2c3ab2b9,
+0x87f3cc48,		0x2c69daa6,
+0x88054677,		0x2c98fbba,
+0x8816d327,		0x2cc815ee,
+0x88287256,		0x2cf72939,
+0x883a23ff,		0x2d263596,
+0x884be821,		0x2d553afc,
+0x885dbeb8,		0x2d843964,
+0x886fa7c2,		0x2db330c7,
+0x8881a33d,		0x2de2211e,
+0x8893b125,		0x2e110a62,
+0x88a5d177,		0x2e3fec8b,
+0x88b80432,		0x2e6ec792,
+0x88ca4951,		0x2e9d9b70,
+0x88dca0d3,		0x2ecc681e,
+0x88ef0ab4,		0x2efb2d95,
+0x890186f2,		0x2f29ebcc,
+0x89141589,		0x2f58a2be,
+0x8926b677,		0x2f875262,
+0x893969b9,		0x2fb5fab2,
+0x894c2f4c,		0x2fe49ba7,
+0x895f072e,		0x30133539,
+0x8971f15a,		0x3041c761,
+0x8984edcf,		0x30705217,
+0x8997fc8a,		0x309ed556,
+0x89ab1d87,		0x30cd5115,
+0x89be50c3,		0x30fbc54d,
+0x89d1963c,		0x312a31f8,
+0x89e4edef,		0x3158970e,
+0x89f857d8,		0x3186f487,
+0x8a0bd3f5,		0x31b54a5e,
+0x8a1f6243,		0x31e39889,
+0x8a3302be,		0x3211df04,
+0x8a46b564,		0x32401dc6,
+0x8a5a7a31,		0x326e54c7,
+0x8a6e5123,		0x329c8402,
+0x8a823a36,		0x32caab6f,
+0x8a963567,		0x32f8cb07,
+0x8aaa42b4,		0x3326e2c3,
+0x8abe6219,		0x3354f29b,
+0x8ad29394,		0x3382fa88,
+0x8ae6d720,		0x33b0fa84,
+0x8afb2cbb,		0x33def287,
+0x8b0f9462,		0x340ce28b,
+0x8b240e11,		0x343aca87,
+0x8b3899c6,		0x3468aa76,
+0x8b4d377c,		0x34968250,
+0x8b61e733,		0x34c4520d,
+0x8b76a8e4,		0x34f219a8,
+0x8b8b7c8f,		0x351fd918,
+0x8ba0622f,		0x354d9057,
+0x8bb559c1,		0x357b3f5d,
+0x8bca6343,		0x35a8e625,
+0x8bdf7eb0,		0x35d684a6,
+0x8bf4ac05,		0x36041ad9,
+0x8c09eb40,		0x3631a8b8,
+0x8c1f3c5d,		0x365f2e3b,
+0x8c349f58,		0x368cab5c,
+0x8c4a142f,		0x36ba2014,
+0x8c5f9ade,		0x36e78c5b,
+0x8c753362,		0x3714f02a,
+0x8c8addb7,		0x37424b7b,
+0x8ca099da,		0x376f9e46,
+0x8cb667c8,		0x379ce885,
+0x8ccc477d,		0x37ca2a30,
+0x8ce238f6,		0x37f76341,
+0x8cf83c30,		0x382493b0,
+0x8d0e5127,		0x3851bb77,
+0x8d2477d8,		0x387eda8e,
+0x8d3ab03f,		0x38abf0ef,
+0x8d50fa59,		0x38d8fe93,
+0x8d675623,		0x39060373,
+0x8d7dc399,		0x3932ff87,
+0x8d9442b8,		0x395ff2c9,
+0x8daad37b,		0x398cdd32,
+0x8dc175e0,		0x39b9bebc,
+0x8dd829e4,		0x39e6975e,
+0x8deeef82,		0x3a136712,
+0x8e05c6b7,		0x3a402dd2,
+0x8e1caf80,		0x3a6ceb96,
+0x8e33a9da,		0x3a99a057,
+0x8e4ab5bf,		0x3ac64c0f,
+0x8e61d32e,		0x3af2eeb7,
+0x8e790222,		0x3b1f8848,
+0x8e904298,		0x3b4c18ba,
+0x8ea7948c,		0x3b78a007,
+0x8ebef7fb,		0x3ba51e29,
+0x8ed66ce1,		0x3bd19318,
+0x8eedf33b,		0x3bfdfecd,
+0x8f058b04,		0x3c2a6142,
+0x8f1d343a,		0x3c56ba70,
+0x8f34eed8,		0x3c830a50,
+0x8f4cbadb,		0x3caf50da,
+0x8f649840,		0x3cdb8e09,
+0x8f7c8701,		0x3d07c1d6,
+0x8f94871d,		0x3d33ec39,
+0x8fac988f,		0x3d600d2c,
+0x8fc4bb53,		0x3d8c24a8,
+0x8fdcef66,		0x3db832a6,
+0x8ff534c4,		0x3de4371f,
+0x900d8b69,		0x3e10320d,
+0x9025f352,		0x3e3c2369,
+0x903e6c7b,		0x3e680b2c,
+0x9056f6df,		0x3e93e950,
+0x906f927c,		0x3ebfbdcd,
+0x90883f4d,		0x3eeb889c,
+0x90a0fd4e,		0x3f1749b8,
+0x90b9cc7d,		0x3f430119,
+0x90d2acd4,		0x3f6eaeb8,
+0x90eb9e50,		0x3f9a5290,
+0x9104a0ee,		0x3fc5ec98,
+0x911db4a9,		0x3ff17cca,
+0x9136d97d,		0x401d0321,
+0x91500f67,		0x40487f94,
+0x91695663,		0x4073f21d,
+0x9182ae6d,		0x409f5ab6,
+0x919c1781,		0x40cab958,
+0x91b5919a,		0x40f60dfb,
+0x91cf1cb6,		0x4121589b,
+0x91e8b8d0,		0x414c992f,
+0x920265e4,		0x4177cfb1,
+0x921c23ef,		0x41a2fc1a,
+0x9235f2ec,		0x41ce1e65,
+0x924fd2d7,		0x41f93689,
+0x9269c3ac,		0x42244481,
+0x9283c568,		0x424f4845,
+0x929dd806,		0x427a41d0,
+0x92b7fb82,		0x42a5311b,
+0x92d22fd9,		0x42d0161e,
+0x92ec7505,		0x42faf0d4,
+0x9306cb04,		0x4325c135,
+0x932131d1,		0x4350873c,
+0x933ba968,		0x437b42e1,
+0x935631c5,		0x43a5f41e,
+0x9370cae4,		0x43d09aed,
+0x938b74c1,		0x43fb3746,
+0x93a62f57,		0x4425c923,
+0x93c0faa3,		0x4450507e,
+0x93dbd6a0,		0x447acd50,
+0x93f6c34a,		0x44a53f93,
+0x9411c09e,		0x44cfa740,
+0x942cce96,		0x44fa0450,
+0x9447ed2f,		0x452456bd,
+0x94631c65,		0x454e9e80,
+0x947e5c33,		0x4578db93,
+0x9499ac95,		0x45a30df0,
+0x94b50d87,		0x45cd358f,
+0x94d07f05,		0x45f7526b,
+0x94ec010b,		0x4621647d,
+0x95079394,		0x464b6bbe,
+0x9523369c,		0x46756828,
+0x953eea1e,		0x469f59b4,
+0x955aae17,		0x46c9405c,
+0x95768283,		0x46f31c1a,
+0x9592675c,		0x471cece7,
+0x95ae5c9f,		0x4746b2bc,
+0x95ca6247,		0x47706d93,
+0x95e67850,		0x479a1d67,
+0x96029eb6,		0x47c3c22f,
+0x961ed574,		0x47ed5be6,
+0x963b1c86,		0x4816ea86,
+0x965773e7,		0x48406e08,
+0x9673db94,		0x4869e665,
+0x96905388,		0x48935397,
+0x96acdbbe,		0x48bcb599,
+0x96c97432,		0x48e60c62,
+0x96e61ce0,		0x490f57ee,
+0x9702d5c3,		0x49389836,
+0x971f9ed7,		0x4961cd33,
+0x973c7817,		0x498af6df,
+0x9759617f,		0x49b41533,
+0x97765b0a,		0x49dd282a,
+0x979364b5,		0x4a062fbd,
+0x97b07e7a,		0x4a2f2be6,
+0x97cda855,		0x4a581c9e,
+0x97eae242,		0x4a8101de,
+0x98082c3b,		0x4aa9dba2,
+0x9825863d,		0x4ad2a9e2,
+0x9842f043,		0x4afb6c98,
+0x98606a49,		0x4b2423be,
+0x987df449,		0x4b4ccf4d,
+0x989b8e40,		0x4b756f40,
+0x98b93828,		0x4b9e0390,
+0x98d6f1fe,		0x4bc68c36,
+0x98f4bbbc,		0x4bef092d,
+0x9912955f,		0x4c177a6e,
+0x99307ee0,		0x4c3fdff4,
+0x994e783d,		0x4c6839b7,
+0x996c816f,		0x4c9087b1,
+0x998a9a74,		0x4cb8c9dd,
+0x99a8c345,		0x4ce10034,
+0x99c6fbde,		0x4d092ab0,
+0x99e5443b,		0x4d31494b,
+0x9a039c57,		0x4d595bfe,
+0x9a22042d,		0x4d8162c4,
+0x9a407bb9,		0x4da95d96,
+0x9a5f02f5,		0x4dd14c6e,
+0x9a7d99de,		0x4df92f46,
+0x9a9c406e,		0x4e210617,
+0x9abaf6a1,		0x4e48d0dd,
+0x9ad9bc71,		0x4e708f8f,
+0x9af891db,		0x4e984229,
+0x9b1776da,		0x4ebfe8a5,
+0x9b366b68,		0x4ee782fb,
+0x9b556f81,		0x4f0f1126,
+0x9b748320,		0x4f369320,
+0x9b93a641,		0x4f5e08e3,
+0x9bb2d8de,		0x4f857269,
+0x9bd21af3,		0x4faccfab,
+0x9bf16c7a,		0x4fd420a4,
+0x9c10cd70,		0x4ffb654d,
+0x9c303dcf,		0x50229da1,
+0x9c4fbd93,		0x5049c999,
+0x9c6f4cb6,		0x5070e92f,
+0x9c8eeb34,		0x5097fc5e,
+0x9cae9907,		0x50bf031f,
+0x9cce562c,		0x50e5fd6d,
+0x9cee229c,		0x510ceb40,
+0x9d0dfe54,		0x5133cc94,
+0x9d2de94d,		0x515aa162,
+0x9d4de385,		0x518169a5,
+0x9d6decf4,		0x51a82555,
+0x9d8e0597,		0x51ced46e,
+0x9dae2d68,		0x51f576ea,
+0x9dce6463,		0x521c0cc2,
+0x9deeaa82,		0x524295f0,
+0x9e0effc1,		0x5269126e,
+0x9e2f641b,		0x528f8238,
+0x9e4fd78a,		0x52b5e546,
+0x9e705a09,		0x52dc3b92,
+0x9e90eb94,		0x53028518,
+0x9eb18c26,		0x5328c1d0,
+0x9ed23bb9,		0x534ef1b5,
+0x9ef2fa49,		0x537514c2,
+0x9f13c7d0,		0x539b2af0,
+0x9f34a449,		0x53c13439,
+0x9f558fb0,		0x53e73097,
+0x9f7689ff,		0x540d2005,
+0x9f979331,		0x5433027d,
+0x9fb8ab41,		0x5458d7f9,
+0x9fd9d22a,		0x547ea073,
+0x9ffb07e7,		0x54a45be6,
+0xa01c4c73,		0x54ca0a4b,
+0xa03d9fc8,		0x54efab9c,
+0xa05f01e1,		0x55153fd4,
+0xa08072ba,		0x553ac6ee,
+0xa0a1f24d,		0x556040e2,
+0xa0c38095,		0x5585adad,
+0xa0e51d8c,		0x55ab0d46,
+0xa106c92f,		0x55d05faa,
+0xa1288376,		0x55f5a4d2,
+0xa14a4c5e,		0x561adcb9,
+0xa16c23e1,		0x56400758,
+0xa18e09fa,		0x566524aa,
+0xa1affea3,		0x568a34a9,
+0xa1d201d7,		0x56af3750,
+0xa1f41392,		0x56d42c99,
+0xa21633cd,		0x56f9147e,
+0xa2386284,		0x571deefa,
+0xa25a9fb1,		0x5742bc06,
+0xa27ceb4f,		0x57677b9d,
+0xa29f4559,		0x578c2dba,
+0xa2c1adc9,		0x57b0d256,
+0xa2e4249b,		0x57d5696d,
+0xa306a9c8,		0x57f9f2f8,
+0xa3293d4b,		0x581e6ef1,
+0xa34bdf20,		0x5842dd54,
+0xa36e8f41,		0x58673e1b,
+0xa3914da8,		0x588b9140,
+0xa3b41a50,		0x58afd6bd,
+0xa3d6f534,		0x58d40e8c,
+0xa3f9de4e,		0x58f838a9,
+0xa41cd599,		0x591c550e,
+0xa43fdb10,		0x594063b5,
+0xa462eeac,		0x59646498,
+0xa486106a,		0x598857b2,
+0xa4a94043,		0x59ac3cfd,
+0xa4cc7e32,		0x59d01475,
+0xa4efca31,		0x59f3de12,
+0xa513243b,		0x5a1799d1,
+0xa5368c4b,		0x5a3b47ab,
+0xa55a025b,		0x5a5ee79a,
+0xa57d8666,		0x5a82799a
+};
+
+/*End of File*/
--- a/media/openmax_dl/dl/sp/src/omxSP_FFTFwd_CToC_FC32_Sfs_s.S
+++ b/media/openmax_dl/dl/sp/src/omxSP_FFTFwd_CToC_FC32_Sfs_s.S
@ -0,0 +1,192 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This is a modification of omxSP_FFTFwd_CToC_SC32_Sfs_s.s
+@//  to support float instead of SC32.
+@//
+
+@//
+@// Description:
+@// Compute a forward FFT for a complex signal
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+
+@// Import symbols required from other files
+@// (For example tables)
+
+        .extern  armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe
+        .extern  armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe
+        .extern  armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe
+        .extern  armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe
+        .extern  armSP_FFTFwd_CToC_FC32_Radix2_OutOfPlace_unsafe
+
+@// Set debugging level
+@//DEBUG_ON    SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+
+
+
+    @// Guarding implementation by the processor name
+
+@// Import the last-stage routines defined in other files.
+@// Radix4_ls is called by the order > 3 path below, so it is declared with the others.
+        .extern  armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace_unsafe
+        .extern  armSP_FFTFwd_CToC_FC32_Sfs_Radix2_ls_OutOfPlace_unsafe
+        .extern  armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace_unsafe
+
+@// Input registers
+@// These aliases are C preprocessor macros; several of them share one core register.
+#define pSrc            r0
+#define pDst            r1
+#define pFFTSpec        r2
+
+@// result shares r0 with pSrc.
+@// Output registers
+#define result          r0
+
+@// Local scratch registers
+@// r1 = pDst/argTwiddle, r2 = pFFTSpec/argDst/diffMinusOne, r4 = argScale/tmpOrder/pTwiddle/x0r.
+#define argTwiddle      r1
+#define argDst          r2
+#define argScale        r4
+#define tmpOrder        r4
+#define pTwiddle        r4
+#define pOut            r5
+#define subFFTSize      r7
+#define subFFTNum       r6
+#define N               r6
+#define order           r14
+#define diff            r9
+@// Total number of radix stages required to complete the FFT
+#define count           r8
+#define x0r             r4
+#define x0i             r5
+#define diffMinusOne    r2
+
+@// NEON registers
+@// dX0 is D0 typed as 32-bit float lanes.
+#define dX0     D0.F32
+
+
+    @// No stack local is declared for this function (no M_ALLOC precedes M_START).
+    @// omxSP_FFTFwd_CToC_FC32_Sfs: r0 = pSrc, r1 = pDst, r2 = pFFTSpec; r0 = OMX_Sts_NoErr on exit.
+    @// Write function header
+        M_START     omxSP_FFTFwd_CToC_FC32_Sfs,r11,d15
+
+@ Structure offsets for the FFTSpec
+        .set    ARMsFFTSpec_N, 0
+        .set    ARMsFFTSpec_pBitRev, 4
+        .set    ARMsFFTSpec_pTwiddle, 8
+        .set    ARMsFFTSpec_pBuf, 12
+
+        @// No stack arguments are read; everything comes from r0-r2 and the FFTSpec structure
+
+        @// Read the size from structure and take log
+        LDR     N, [pFFTSpec, #ARMsFFTSpec_N]
+
+        @// Read other structure parameters
+        LDR     pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
+        LDR     pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
+
+        CLZ     order,N                             @// N = 2^order
+        RSB     order,order,#31                     @// order = 31 - CLZ(N)
+        MOV     subFFTSize,#1
+        @//MOV     subFFTNum,N
+        @// (not needed: subFFTNum and N are both r6, which already holds N)
+        CMP     order,#3
+        BGT     orderGreaterthan3                   @// order > 3
+
+        CMP     order,#1
+        BGE     orderGreaterthan0                   @// order > 0
+        VLD1    dX0,[pSrc]                          @// order = 0: copy one D register from pSrc to pDst
+        VST1    dX0,[pDst]
+        MOV     pSrc,pDst
+        BLT     FFTEnd                              @// LT still holds from CMP order,#1 (nothing in between sets flags)
+
+orderGreaterthan0:
+        @// set the buffers appropriately for various orders
+        CMP     order,#2
+        MOVNE   argDst,pDst                         @// order 1 or 3: first stage writes to pDst
+        MOVEQ   argDst,pOut                         @// order 2: first stage writes to the spec buffer
+        @// Pass the first stage destination in RN5 (r5, pOut)
+        MOVEQ   pOut,pDst
+        MOV     argTwiddle,pTwiddle                 @// r1 now holds the twiddle pointer, no longer pDst
+
+        CMP     order,#1
+        BGT     orderGreaterthan1
+        @// order = 1
+        BL    armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe
+        B       FFTEnd
+
+orderGreaterthan1:
+        CMP     order,#2
+        BGT     orderGreaterthan2
+        @// order = 2
+        BL    armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe
+        BL    armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace_unsafe
+        B       FFTEnd
+
+orderGreaterthan2:                                                                     @// order =3
+        BL    armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe
+        BL    armSP_FFTFwd_CToC_FC32_Radix2_OutOfPlace_unsafe
+        BL    armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace_unsafe
+        B       FFTEnd
+
+orderGreaterthan3:
+        @// Set input args to fft stages
+        TST     order, #2                           @// bit 1 of order selects the first-stage destination
+        MOVNE   argDst,pDst
+        MOVEQ   argDst,pOut
+        @// Pass the first stage destination in RN5 (r5, pOut)
+        MOVEQ   pOut,pDst
+        MOV     argTwiddle,pTwiddle                 @// r1 now holds the twiddle pointer, no longer pDst
+
+        @//check for even or odd order
+        @// NOTE: The following combination of BL's would work fine even though
+        @// the first BL would corrupt the flags. This is because the end of
+        @// the "grpZeroSetLoop" loop inside
+        @// armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe sets the Z flag
+        @// to EQ
+
+        TST     order,#0x00000001
+        BLEQ    armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe     @// even order: radix-4 first stage
+        BLNE    armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe     @// odd order: radix-8 first stage
+
+        CMP        subFFTNum,#4                     @// r6 is not written in this file after N is loaded; presumably updated by the stage routines - TODO confirm
+        BLT     FFTEnd
+
+        @// Loop invariant: flags come from CMP subFFTNum,#4 when the BEQ below is reached
+unscaledRadix4Loop:
+        BEQ        lastStageUnscaledRadix4
+         BL        armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe
+         CMP        subFFTNum,#4
+         B        unscaledRadix4Loop
+
+lastStageUnscaledRadix4:
+        BL      armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace_unsafe
+        B        FFTEnd
+
+FFTEnd:
+        @// Every path above ends here; no scaling is applied in this float version
+        @// Set return value
+        MOV     result, #OMX_Sts_NoErr
+
+        @// Write function tail
+        M_END
+
+        .end
--- a/media/openmax_dl/dl/sp/src/omxSP_FFTFwd_CToC_SC16_Sfs_s.S
+++ b/media/openmax_dl/dl/sp/src/omxSP_FFTFwd_CToC_SC16_Sfs_s.S
@ -0,0 +1,356 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This file was originally licensed as follows. It has been
+@//  relicensed with permission from the copyright holders.
+
+@//
+@//
+@// File Name:  omxSP_FFTFwd_CToC_SC16_Sfs_s.s
+@// OpenMAX DL: v1.0.2
+@// Last Modified Revision:   6729
+@// Last Modified Date:       Tue, 17 Jul 2007
+@//
+@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@//
+@//
+@//
+@// Description:
+@// Compute a forward FFT for a complex signal
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+
+
+@// Import symbols required from other files
+@// (For example tables)
+
+        .extern  armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe
+        .extern  armSP_FFTFwd_CToC_SC16_Radix2_fs_OutOfPlace_unsafe
+        .extern  armSP_FFTFwd_CToC_SC16_Radix4_fs_OutOfPlace_unsafe
+        .extern  armSP_FFTFwd_CToC_SC16_Radix4_ls_OutOfPlace_unsafe
+        .extern  armSP_FFTFwd_CToC_SC16_Radix8_fs_OutOfPlace_unsafe
+        .extern  armSP_FFTFwd_CToC_SC16_Radix4_OutOfPlace_unsafe
+        .extern  armSP_FFTFwd_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe
+        .extern  armSP_FFTFwd_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe
+        .extern  armSP_FFTFwd_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe
+        .extern  armSP_FFTFwd_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe
+        .extern  armSP_FFTFwd_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe
+        .extern  armSP_FFTFwd_CToC_SC16_Radix2_OutOfPlace_unsafe
+        .extern  armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe
+        .extern  armSP_FFTFwd_CToC_SC16_Radix2_ls_OutOfPlace_unsafe
+
+@// Set debugging level
+@//DEBUG_ON    SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+
+
+
+@// Guarding implementation by the processor name
+
+
+    .extern  armSP_FFTFwd_CToC_SC16_Radix2_ps_OutOfPlace_unsafe
+    .extern  armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe
+
+@// Input registers
+@// These aliases are C preprocessor macros; several of them share one core register.
+#define pSrc            r0
+#define pDst            r1
+#define pFFTSpec                r2
+#define scale           r3
+
+@// result shares r0 with pSrc; round shares r3 with scale.
+@// Output registers
+#define result          r0
+
+@// Local scratch registers (r4 = argScale/pTwiddle/tmpOrder/x0r, r6 = subFFTNum/N)
+#define argTwiddle              r1
+#define argDst          r2
+#define argScale                r4
+#define pTwiddle                r4
+#define tmpOrder                r4
+#define pOut            r5
+#define subFFTSize              r7
+#define subFFTNum               r6
+#define N               r6
+#define order           r14
+#define diff            r9
+@// Total number of radix stages required to complete the FFT
+#define count           r8
+#define x0r             r4
+#define x0i             r5
+#define diffMinusOne            r2
+#define round           r3
+
+@// NEON registers
+@// dX0 and dX0S32 are the same register D0 typed as S16 and S32 lanes.
+#define dX0     D0.S16
+#define dShift  D1.S16
+#define dX0S32  D0.S32
+
+
+
+    @// Declare the stack local diffOnStack; it carries diff to outScale and FFTEnd
+        M_ALLOC4        diffOnStack, 4
+    @// omxSP_FFTFwd_CToC_SC16_Sfs: r0 = pSrc, r1 = pDst, r2 = pFFTSpec, r3 = scale; r0 = OMX_Sts_NoErr on exit.
+    @// Write function header
+        M_START     omxSP_FFTFwd_CToC_SC16_Sfs,r11,d15
+
+@ Structure offsets for the FFTSpec
+        .set    ARMsFFTSpec_N, 0
+        .set    ARMsFFTSpec_pBitRev, 4
+        .set    ARMsFFTSpec_pTwiddle, 8
+        .set    ARMsFFTSpec_pBuf, 12
+
+        @// No stack arguments are read; everything comes from r0-r3 and the FFTSpec structure
+
+        @// Read the size from structure and take log
+        LDR     N, [pFFTSpec, #ARMsFFTSpec_N]
+
+        @// Read other structure parameters
+        LDR     pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
+        LDR     pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
+
+        CLZ     order,N                             @// N = 2^order
+        RSB     order,order,#31                     @// order = 31 - CLZ(N)
+        MOV     subFFTSize,#1
+        @//MOV     subFFTNum,N
+        @// (not needed: subFFTNum and N are both r6, which already holds N)
+        CMP     order,#3
+        BGT     orderGreaterthan3                   @// order > 3
+
+        CMP     order,#1
+        BGE     orderGreaterthan0                   @// order > 0
+        M_STR   scale, diffOnStack,LT               @// order = 0: the whole scale is left for FFTEnd
+        LDRLT   x0r,[pSrc]                          @// copy the single 32-bit sample to pDst
+        STRLT   x0r,[pDst]
+        MOVLT   pSrc,pDst
+        BLT     FFTEnd
+
+orderGreaterthan0:
+        @// set the buffers appropriately for various orders
+        CMP     order,#2
+        MOVNE   argDst,pDst
+        MOVEQ   argDst,pOut
+        MOVEQ   pOut,pDst                           @// Pass the first stage destination in RN5
+        MOV     argTwiddle,pTwiddle                 @// r1 now holds the twiddle pointer; r4 is free for argScale
+
+        SUBS     diff,scale,order                   @// diff = scale - order, stored for FFTEnd
+        M_STR   diff,diffOnStack
+        MOVGT   scale,order                         @// GT is still from the SUBS above
+        @// Now scale <= order
+
+        CMP     order,#1
+        BGT     orderGreaterthan1
+        SUBS    scale,scale,#1                      @// EQ: scale was 1 (scaled stage); LT: scale was 0 (unscaled stage)
+        BLEQ    armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe  @// order = 1
+        BLLT    armSP_FFTFwd_CToC_SC16_Radix2_fs_OutOfPlace_unsafe      @// order = 1
+        B       FFTEnd
+
+orderGreaterthan1:
+        CMP     order,#2
+        MOV     argScale,scale                      @// MOV leaves the flags from CMP order,#2 intact
+        BGT     orderGreaterthan2
+        SUBS    argScale,argScale,#1
+        BLGE    armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe      @// order =2
+        BLLT    armSP_FFTFwd_CToC_SC16_Radix2_fs_OutOfPlace_unsafe
+        SUBS    argScale,argScale,#1
+        BLEQ    armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe
+        BLLT    armSP_FFTFwd_CToC_SC16_Radix2_ls_OutOfPlace_unsafe
+        B       FFTEnd
+
+orderGreaterthan2:                                                                     @// order =3
+        SUBS    argScale,argScale,#1
+        BLGE    armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe
+        BLLT    armSP_FFTFwd_CToC_SC16_Radix2_fs_OutOfPlace_unsafe
+        SUBS    argScale,argScale,#1
+        BLGE    armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe
+        BLLT    armSP_FFTFwd_CToC_SC16_Radix2_ps_OutOfPlace_unsafe
+        SUBS    argScale,argScale,#1
+        BLEQ    armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe
+        BLLT    armSP_FFTFwd_CToC_SC16_Radix2_ls_OutOfPlace_unsafe
+        B       FFTEnd
+
+        @// NOTE(review): each conditional BL pair above assumes the first callee returns with flags that fail the second condition - confirm against the stage routines
+orderGreaterthan3:
+        @// check scale = 0 or scale = order
+        SUBS    diff, scale, order                 @// diff = scale - order; flags feed MOVGT and BGE below
+        MOVGT   scale,order                        @// clamp scale to order
+        BGE     specialScaleCase                   @// scale >= order, handled as scale = order
+        CMP     scale,#0
+        BEQ     specialScaleCase
+        B       generalScaleCase
+
+specialScaleCase:                                           @//  scale = 0 or scale = order  and order > 3
+
+        TST     order, #2                           @// Set input args to fft stages
+        MOVNE   argDst,pDst
+        MOVEQ   argDst,pOut
+        MOVEQ   pOut,pDst                           @// Pass the first stage destination in RN5
+        MOV     argTwiddle,pTwiddle
+
+        CMP      diff,#0
+        M_STR    diff, diffOnStack
+        BGE      scaleEqualsOrder                   @// diff >= 0: every stage is scaled; diff < 0 here means scale = 0
+
+        @//check for even or odd order
+        @// NOTE: The following combination of BL's would work fine even though the first
+        @// BL would corrupt the flags. This is because the end of the "grpZeroSetLoop" loop inside
+        @// armSP_FFTFwd_CToC_SC16_Radix4_fs_OutOfPlace_unsafe sets the Z flag to EQ
+
+        TST     order,#0x00000001
+        BLEQ    armSP_FFTFwd_CToC_SC16_Radix4_fs_OutOfPlace_unsafe
+        BLNE    armSP_FFTFwd_CToC_SC16_Radix8_fs_OutOfPlace_unsafe
+
+        CMP        subFFTNum,#4                     @// r6 is not written in this file after N is loaded; presumably updated by the stage routines - TODO confirm
+        BLT     FFTEnd
+
+unscaledRadix4Loop:
+        BEQ        lastStageUnscaledRadix4
+        BL        armSP_FFTFwd_CToC_SC16_Radix4_OutOfPlace_unsafe
+         CMP        subFFTNum,#4
+         B        unscaledRadix4Loop
+
+lastStageUnscaledRadix4:
+        BL      armSP_FFTFwd_CToC_SC16_Radix4_ls_OutOfPlace_unsafe
+        B        FFTEnd
+
+scaleEqualsOrder:
+        @//check for even or odd order
+        @// NOTE: The following combination of BL's would work fine even though the first
+        @// BL would corrupt the flags. This is because the end of the "grpZeroSetLoop" loop inside
+        @// armSP_FFTFwd_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe sets the Z flag to EQ
+
+        TST     order,#0x00000001
+        BLEQ    armSP_FFTFwd_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe
+        BLNE    armSP_FFTFwd_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe
+
+        CMP        subFFTNum,#4
+        BLT     FFTEnd
+
+scaledRadix4Loop:
+        BEQ        lastStageScaledRadix4
+        BL        armSP_FFTFwd_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe
+         CMP        subFFTNum,#4
+         B        scaledRadix4Loop
+
+lastStageScaledRadix4:
+        BL      armSP_FFTFwd_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe
+        B        FFTEnd
+
+
+        @// Mixed case: some scaled radix-2 stages first, then unscaled stages
+generalScaleCase:                                               @// 0 < scale < order and order > 3
+        @// Determine the correct destination buffer
+        SUB     diff,order,scale                @// from here on diff = order - scale (positive)
+        TST     diff,#0x01
+        ADDEQ   count,scale,diff,LSR #1         @// count = scale + (order - scale)/2
+        MOVNE   count,order
+        TST     count,#0x01                     @// Is count even or odd ?
+
+        MOVNE   argDst,pDst                     @// Set input args to fft stages
+        MOVEQ   argDst,pOut
+        MOVEQ   pOut,pDst                       @// Pass the first stage destination in RN5
+        MOV     argTwiddle,pTwiddle
+
+        CMP     diff,#1
+        M_STR   diff, diffOnStack
+        BEQ     scaleps                         @// scaling including a radix2_ps stage
+
+        MOV     argScale,scale                  @// Put scale in RN4 so as to save and restore
+        BL      armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe     @// scaled first stage
+        SUBS    argScale,argScale,#1
+
+scaledRadix2Loop:
+        BLGT    armSP_FFTFwd_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe
+        SUBS    argScale,argScale,#1            @// save and restore scale (RN4) in the scaled stages
+        BGT     scaledRadix2Loop
+        B       outScale
+
+scaleps:
+        SUB     argScale,scale,#1                   @// order>3 and diff=1 => scale >= 3
+        BL      armSP_FFTFwd_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe     @// scaled first stage
+        SUBS    argScale,argScale,#1
+
+scaledRadix2psLoop:
+        BEQ     scaledRadix2psStage
+        BLGT    armSP_FFTFwd_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe
+        SUBS    argScale,argScale,#1            @// save and restore scale (RN4) in the scaled stages
+        BGE     scaledRadix2psLoop
+
+scaledRadix2psStage:
+        BL      armSP_FFTFwd_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe
+        B       generalLastStageUnscaledRadix2
+
+
+outScale:
+        M_LDR   diff, diffOnStack               @// reload diff = order - scale stored in generalScaleCase
+        @//check for even or odd order
+        TST     diff,#0x00000001
+        BEQ     generalUnscaledRadix4Loop       @// even diff: continue with radix-4 stages
+        B       unscaledRadix2Loop              @// odd diff: continue with radix-2 stages
+
+generalUnscaledRadix4Loop:
+        CMP        subFFTNum,#4
+         BEQ        generalLastStageUnscaledRadix4
+         BL        armSP_FFTFwd_CToC_SC16_Radix4_OutOfPlace_unsafe
+         B        generalUnscaledRadix4Loop
+
+generalLastStageUnscaledRadix4:
+        BL      armSP_FFTFwd_CToC_SC16_Radix4_ls_OutOfPlace_unsafe
+        B        End                            @// skips the FFTEnd scaling loop
+
+unscaledRadix2Loop:
+        CMP        subFFTNum,#4
+         BEQ        generalLastTwoStagesUnscaledRadix2
+         BL        armSP_FFTFwd_CToC_SC16_Radix2_OutOfPlace_unsafe
+         B        unscaledRadix2Loop
+
+generalLastTwoStagesUnscaledRadix2:
+        BL      armSP_FFTFwd_CToC_SC16_Radix2_ps_OutOfPlace_unsafe
+generalLastStageUnscaledRadix2:
+        BL      armSP_FFTFwd_CToC_SC16_Radix2_ls_OutOfPlace_unsafe
+        B        End                            @// skips the FFTEnd scaling loop
+
+
+FFTEnd:                                               @// Does only the scaling
+        @// Paths arriving here stored diff = scale - order (or scale itself for order = 0)
+        M_LDR   diff, diffOnStack
+        CMP     diff,#0
+        BLE     End                                 @// nothing left to shift
+
+        RSB     diff,diff,#0                        @// to use VRSHL for right shift by a variable
+        VDUP    dShift,diff
+
+scaleFFTData:                                           @// N = subFFTSize  ; dataptr = pDst  ; scale = diff
+        VLD1    {dX0S32[0]},[pSrc]                        @// pSrc contains pDst pointer
+        SUBS    subFFTSize,subFFTSize,#1                  @// flags consumed by BGT below (the NEON ops in between do not set them)
+        VRSHL   dX0,dShift                                @// rounding shift of the S16 lanes by the negative count in dShift
+        VST1    {dX0S32[0]},[pSrc]!                       @// write the 32-bit lane back and advance pSrc
+
+        BGT     scaleFFTData
+
+
+
+End:
+        @// Set return value
+        MOV     result, #OMX_Sts_NoErr
+
+        @// Write function tail
+        M_END
+
+    .END
--- a/media/openmax_dl/dl/sp/src/omxSP_FFTFwd_CToC_SC32_Sfs_s.S
+++ b/media/openmax_dl/dl/sp/src/omxSP_FFTFwd_CToC_SC32_Sfs_s.S
@ -0,0 +1,335 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This file was originally licensed as follows. It has been
+@//  relicensed with permission from the copyright holders.
+@//
+@// 
+@// File Name:  omxSP_FFTFwd_CToC_SC32_Sfs_s.s
+@// OpenMAX DL: v1.0.2
+@// Last Modified Revision:   6684
+@// Last Modified Date:       Mon, 09 Jul 2007
+@// 
+@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@// 
+@// 
+@//
+@// Description:
+@// Compute a forward FFT for a complex signal
+@// 
+
+        
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+        
+@// Import symbols required from other files
+@// (For example tables)
+        
+        .extern  armSP_FFTFwd_CToC_SC32_Sfs_Radix2_fs_OutOfPlace_unsafe 
+        .extern  armSP_FFTFwd_CToC_SC32_Radix2_fs_OutOfPlace_unsafe
+        .extern  armSP_FFTFwd_CToC_SC32_Radix4_fs_OutOfPlace_unsafe 
+        .extern  armSP_FFTFwd_CToC_SC32_Radix8_fs_OutOfPlace_unsafe 
+        .extern  armSP_FFTFwd_CToC_SC32_Radix4_OutOfPlace_unsafe
+        .extern  armSP_FFTFwd_CToC_SC32_Sfs_Radix4_fs_OutOfPlace_unsafe 
+        .extern  armSP_FFTFwd_CToC_SC32_Sfs_Radix8_fs_OutOfPlace_unsafe 
+        .extern  armSP_FFTFwd_CToC_SC32_Sfs_Radix4_OutOfPlace_unsafe
+        .extern  armSP_FFTFwd_CToC_SC32_Sfs_Radix2_OutOfPlace_unsafe
+        .extern  armSP_FFTFwd_CToC_SC32_Radix2_OutOfPlace_unsafe   
+        
+@// Set debugging level        
+@//DEBUG_ON    SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+    
+    
+    
+    @// Guarding implementation by the processor name
+    
+@// Import symbols required from other files
+@// (For example tables)
+        .extern  armSP_FFTFwd_CToC_SC32_Radix4_ls_OutOfPlace_unsafe     
+        .extern  armSP_FFTFwd_CToC_SC32_Radix2_ls_OutOfPlace_unsafe
+        .extern  armSP_FFTFwd_CToC_SC32_Sfs_Radix4_ls_OutOfPlace_unsafe
+        .extern  armSP_FFTFwd_CToC_SC32_Sfs_Radix2_ls_OutOfPlace_unsafe      
+     
+    
+@//Input Registers
+@// All names in this section are preprocessor aliases for core registers, not
+@// separate storage. r0, r1, r2, r3, r4, r5 and r6 each carry more than one
+@// name, so a write through one alias destroys the value seen through the others.
+
+#define pSrc		r0
+#define pDst		r1
+#define pFFTSpec	r2
+#define scale		r3
+
+
+@// Output registers
+@// result reuses r0, the register that holds pSrc on entry.
+#define result		r0
+
+@//Local Scratch Registers
+@// argTwiddle reuses r1 (pDst) and argDst reuses r2 (pFFTSpec).
+@// r4 is shared by argScale, tmpOrder, pTwiddle and x0r; r5 by pOut and x0i;
+@// r6 by subFFTNum and N.
+@// order is r14, the link register, which every BL overwrites; the routine in
+@// this file only reads order before the first BL on each path.
+
+#define argTwiddle	r1
+#define argDst		r2
+#define argScale	r4
+#define tmpOrder	r4
+#define pTwiddle	r4
+#define pOut		r5
+#define subFFTSize	r7     
+#define subFFTNum	r6
+#define N		r6
+#define order		r14
+#define diff		r9
+@// Total num of radix stages required to complete the FFT	
+#define count		r8
+#define x0r		r4    
+#define x0i		r5
+#define diffMinusOne	r2
+#define round		r3
+
+@// Neon registers
+@// dX0 holds the element being rescaled and dShift the per-lane shift count in
+@// the FFTEnd loop of this file.
+
+#define dX0	D0.S32
+#define dShift	D1.S32
+
+
+
+    @// Allocate stack memory required by the function
+    @// diffOnStack carries diff across the stage calls: it is written with M_STR
+    @// before them and read back with M_LDR in generalScaleCase and FFTEnd.
+        M_ALLOC4        diffOnStack, 4
+
+    @// Write function header
+    @// Entry: r0 = pSrc, r1 = pDst, r2 = pFFTSpec, r3 = scale (see the register aliases of this file).
+    @// Exit:  r0 = OMX_Sts_NoErr; every path below finishes at the End label.
+        M_START     omxSP_FFTFwd_CToC_SC32_Sfs,r11,d15
+        
+@ Structure offsets for the FFTSpec		
+	.set	ARMsFFTSpec_N, 0
+	.set	ARMsFFTSpec_pBitRev, 4
+	.set	ARMsFFTSpec_pTwiddle, 8
+	.set	ARMsFFTSpec_pBuf, 12
+        
+        @// Define stack arguments
+        
+        @// Read the size from structure and take log
+        LDR     N, [pFFTSpec, #ARMsFFTSpec_N]
+        
+        @// Read other structure parameters
+        LDR     pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
+        LDR     pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
+                
+        CLZ     order,N                             @// N = 2^order 
+        RSB     order,order,#31     @// order = 31 - clz(N)
+        MOV     subFFTSize,#1
+        @//MOV     subFFTNum,N
+        
+        CMP     order,#3
+        BGT     orderGreaterthan3                   @// order > 3
+                
+        CMP     order,#1
+        BGE     orderGreaterthan0                   @// order > 0
+        M_STR   scale, diffOnStack,LT               @// order = 0
+        VLD1    dX0,[pSrc]                          @// order = 0: copy the one 64-bit element to pDst
+        VST1    dX0,[pDst]
+        MOV     pSrc,pDst                           @// FFTEnd rescales in place through pSrc
+        BLT     FFTEnd                              @// LT still comes from CMP order,#1 (assumes M_STR keeps the flags)
+        
+orderGreaterthan0:	
+        @// order is 1, 2 or 3 here: every stage is a radix-2 stage called directly.
+        @// set the buffers appropriately for various orders
+        CMP     order,#2
+        MOVNE   argDst,pDst        @// order = 1 or 3
+        MOVEQ   argDst,pOut                         @// order = 2
+        MOVEQ   pOut,pDst                           @// Pass the first stage destination in RN5
+        MOV     argTwiddle,pTwiddle                 @// r1 now holds the twiddle pointer, no longer pDst
+        
+        SUBS     diff,scale,order                   @// diff = scale - order
+        M_STR   diff,diffOnStack                    @// read back at FFTEnd
+        MOVGT   scale,order                         @// GT is from the SUBS above (assumes M_STR keeps the flags)
+        @// Now scale <= order
+        
+        CMP     order,#1
+        BGT     orderGreaterthan1
+        SUBS    scale,scale,#1                      @// EQ: scale was 1, LT: scale was below 1
+        @// NOTE(review): when the BLEQ is taken, the BLLT after it tests the flags the
+        @// callee returns with - confirm the callee cannot return with LT set.
+        BLEQ    armSP_FFTFwd_CToC_SC32_Sfs_Radix2_fs_OutOfPlace_unsafe  @// order = 1
+        BLLT    armSP_FFTFwd_CToC_SC32_Radix2_fs_OutOfPlace_unsafe      @// order = 1
+        B       FFTEnd
+
+orderGreaterthan1:	
+        CMP     order,#2
+        MOV     argScale,scale                      @// r4 stops being pTwiddle; the pointer is already in argTwiddle
+        BGT     orderGreaterthan2                   @// MOV leaves the flags of CMP order,#2 intact
+        @// Each SUBS below consumes one unit of scale; the scaled (Sfs) stage is called
+        @// while scale remains, the unscaled one once argScale has gone negative.
+        SUBS    argScale,argScale,#1
+        BLGE    armSP_FFTFwd_CToC_SC32_Sfs_Radix2_fs_OutOfPlace_unsafe      @// order =2          
+        BLLT    armSP_FFTFwd_CToC_SC32_Radix2_fs_OutOfPlace_unsafe  
+        SUBS    argScale,argScale,#1
+        BLEQ    armSP_FFTFwd_CToC_SC32_Sfs_Radix2_ls_OutOfPlace_unsafe  
+        BLLT    armSP_FFTFwd_CToC_SC32_Radix2_ls_OutOfPlace_unsafe  
+        B       FFTEnd
+        
+orderGreaterthan2:	                                                               @// order =3        
+        SUBS    argScale,argScale,#1
+        BLGE    armSP_FFTFwd_CToC_SC32_Sfs_Radix2_fs_OutOfPlace_unsafe // "fs" means first stage
+        BLLT    armSP_FFTFwd_CToC_SC32_Radix2_fs_OutOfPlace_unsafe  
+        SUBS    argScale,argScale,#1
+        BLGE    armSP_FFTFwd_CToC_SC32_Sfs_Radix2_OutOfPlace_unsafe  
+        BLLT    armSP_FFTFwd_CToC_SC32_Radix2_OutOfPlace_unsafe
+        SUBS    argScale,argScale,#1
+        BLEQ    armSP_FFTFwd_CToC_SC32_Sfs_Radix2_ls_OutOfPlace_unsafe // "ls" means last stage
+        BLLT    armSP_FFTFwd_CToC_SC32_Radix2_ls_OutOfPlace_unsafe    
+        B       FFTEnd
+
+        
+
+orderGreaterthan3:	       
+        @// check scale = 0 or scale = order
+        SUBS    diff, scale, order                 @// scale > order 
+        MOVGT   scale,order     
+        BGE     specialScaleCase                   @// scale = 0 or scale = order 
+        CMP     scale,#0
+        BEQ     specialScaleCase
+        B       generalScaleCase
+        
+specialScaleCase:	                                    @//  scale = 0 or scale = order  and order >= 2     
+        
+        @// NOTE(review): bit 1 of order selects the first destination buffer; this
+        @// presumably tracks the parity of the number of stage calls made below - confirm.
+        TST     order, #2                           @// Set input args to fft stages
+        MOVNE   argDst,pDst        
+        MOVEQ   argDst,pOut
+        MOVEQ   pOut,pDst                           @// Pass the first stage destination in RN5
+        MOV     argTwiddle,pTwiddle  
+
+        @// diff = scale - order from orderGreaterthan3. diff >= 0 takes the all-scaled
+        @// path; otherwise this label was reached with scale = 0 and no stage scales.
+        CMP      diff,#0
+        M_STR    diff, diffOnStack
+        BGE      scaleEqualsOrder  
+       
+        @//check for even or odd order
+        @// NOTE: The following combination of BL's would work fine eventhough the first
+        @// BL would corrupt the flags. This is because the end of the "grpZeroSetLoop" loop inside 
+        @// armSP_FFTFwd_CToC_SC32_Radix4_fs_OutOfPlace_unsafe sets the Z flag to EQ
+        
+        TST     order,#0x00000001
+        BLEQ    armSP_FFTFwd_CToC_SC32_Radix4_fs_OutOfPlace_unsafe 
+        BLNE    armSP_FFTFwd_CToC_SC32_Radix8_fs_OutOfPlace_unsafe
+        
+        CMP        subFFTNum,#4
+        BLT     FFTEnd
+         
+
+unscaledRadix4Loop:	
+        @// The BEQ tests the most recent CMP subFFTNum,#4 (the one above on entry, the
+        @// one after the BL on later iterations).
+        BEQ        lastStageUnscaledRadix4
+         BL        armSP_FFTFwd_CToC_SC32_Radix4_OutOfPlace_unsafe
+         CMP        subFFTNum,#4
+         B        unscaledRadix4Loop
+         
+lastStageUnscaledRadix4:	
+        BL      armSP_FFTFwd_CToC_SC32_Radix4_ls_OutOfPlace_unsafe 
+        B        FFTEnd        
+         
+
+scaleEqualsOrder:	         
+        @//check for even or odd order
+        @// NOTE: The following combination of BL's would work fine eventhough the first
+        @// BL would corrupt the flags. This is because the end of the "grpZeroSetLoop" loop inside 
+        @// armSP_FFTFwd_CToC_SC32_Radix4_fs_OutOfPlace_unsafe sets the Z flag to EQ
+                
+        TST     order,#0x00000001
+        BLEQ    armSP_FFTFwd_CToC_SC32_Sfs_Radix4_fs_OutOfPlace_unsafe 
+        BLNE    armSP_FFTFwd_CToC_SC32_Sfs_Radix8_fs_OutOfPlace_unsafe 
+        
+        CMP        subFFTNum,#4
+        BLT     FFTEnd
+        
+
+scaledRadix4Loop:	
+        @// Same loop shape as unscaledRadix4Loop, calling the scaled (Sfs) stages.
+        BEQ        lastStageScaledRadix4
+         BL        armSP_FFTFwd_CToC_SC32_Sfs_Radix4_OutOfPlace_unsafe
+         CMP        subFFTNum,#4
+         B        scaledRadix4Loop         
+         
+lastStageScaledRadix4:	
+        BL      armSP_FFTFwd_CToC_SC32_Sfs_Radix4_ls_OutOfPlace_unsafe 
+        B        FFTEnd        
+        
+generalScaleCase:	                                        @// 0 < scale < order and order >= 2
+        @// Here diff is redefined as order - scale. The code calls scale scaled radix-2
+        @// stages and then unscaled stages: radix-4 when diff is even, radix-2 when odd.
+        @// Determine the correct destination buffer
+        SUB     diff,order,scale
+        TST     diff,#0x01
+        ADDEQ   count,scale,diff,LSR #1         @// count = scale + (order - scale)/2
+        MOVNE   count,order                     @// odd diff: count = order
+        TST     count,#0x01                     @// Is count even or odd ?
+        
+        MOVNE   argDst,pDst                     @// Set input args to fft stages
+        MOVEQ   argDst,pOut
+        MOVEQ   pOut,pDst                       @// Pass the first stage destination in RN5
+        MOV     argTwiddle,pTwiddle  
+
+        M_STR   diff, diffOnStack    
+        
+        MOV     argScale,scale                  @// Put scale in RN4 so as to save and restore
+        BL      armSP_FFTFwd_CToC_SC32_Sfs_Radix2_fs_OutOfPlace_unsafe     @// scaled first stage
+        SUBS    argScale,argScale,#1
+        
+scaledRadix2Loop:	        
+        @// The BLGT is skipped when the first stage already used up the whole scale.
+        @// NOTE(review): after a taken BLGT the flags are not used again until the
+        @// SUBS below sets them, so the callee's flags do not matter here.
+        BLGT    armSP_FFTFwd_CToC_SC32_Sfs_Radix2_OutOfPlace_unsafe
+        SUBS    argScale,argScale,#1            @// save and restore scale (RN4) in the scaled stages
+        BGT     scaledRadix2Loop
+        
+        
+        M_LDR   diff, diffOnStack  
+        @//check for even or odd order
+        TST     diff,#0x00000001
+        BEQ     generalUnscaledRadix4Loop
+        B       unscaledRadix2Loop
+
+generalUnscaledRadix4Loop:	
+        CMP        subFFTNum,#4
+         BEQ        generalLastStageUnscaledRadix4
+         BL        armSP_FFTFwd_CToC_SC32_Radix4_OutOfPlace_unsafe
+         B        generalUnscaledRadix4Loop 
+         
+generalLastStageUnscaledRadix4:	
+        BL      armSP_FFTFwd_CToC_SC32_Radix4_ls_OutOfPlace_unsafe 
+        B        End             
+             
+
+unscaledRadix2Loop:	
+        CMP        subFFTNum,#2
+         BEQ        generalLastStageUnscaledRadix2
+         BL        armSP_FFTFwd_CToC_SC32_Radix2_OutOfPlace_unsafe
+         B        unscaledRadix2Loop        
+
+generalLastStageUnscaledRadix2:	
+        BL      armSP_FFTFwd_CToC_SC32_Radix2_ls_OutOfPlace_unsafe 
+        B        End             
+
+       
+FFTEnd:	                                              @// Does only the scaling
+        
+        M_LDR   diff, diffOnStack  
+        CMP     diff,#0
+        BLE     End                                 @// diff <= 0: nothing left to shift
+        
+        RSB     diff,diff,#0                        @// to use VRSHL for right shift by a variable
+        VDUP    dShift,diff     
+        
+scaleFFTData:	                                        @// N = subFFTSize  ; dataptr = pDst  ; scale = diff
+        VLD1    {dX0},[pSrc]            @// pSrc contains pDst pointer
+        SUBS    subFFTSize,subFFTSize,#1
+        VRSHL   dX0,dShift                          @// negative count: rounding shift right of both S32 lanes
+        VST1    {dX0},[pSrc]!                       @// write back and step to the next 8-byte element
+                
+        BGT     scaleFFTData                        @// flags from the SUBS above
+        
+                
+                       
+End:	                        
+        @// Set return value
+        MOV     result, #OMX_Sts_NoErr       
+
+        @// Write function tail
+        M_END
+        
+	.end
--- a/media/openmax_dl/dl/sp/src/omxSP_FFTFwd_RToCCS_F32_Sfs_s.S
+++ b/media/openmax_dl/dl/sp/src/omxSP_FFTFwd_RToCCS_F32_Sfs_s.S
@ -0,0 +1,406 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This is a modification of omxSP_FFTFwd_RToCCS_S32_Sfs_s.s
+@//  to support float instead of SC32.
+@//
+
+@//
+@// Description:
+@// Compute FFT for a real signal
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+
+
+@// Import symbols required from other files
+@// (For example tables)
+
+        .extern  armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe
+        .extern  armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe
+        .extern  armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe
+        .extern  armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe
+        .extern  armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe
+        .extern  armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe
+        .extern  armSP_FFTFwd_CToC_FC32_Radix2_OutOfPlace_unsafe
+
+@// Set debugging level
+@//DEBUG_ON    SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+
+
+
+    @// Guarding implementation by the processor name
+
+@// Import symbols required from other files
+@// (For example tables)
+        .extern  armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace_unsafe
+        .extern  armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace_unsafe
+
+
+@//Input Registers
+@// All names in this section are preprocessor aliases for registers, not
+@// separate storage; many of them share a register, so a write through one
+@// alias destroys the value seen through the others.
+
+#define pSrc            r0
+#define pDst            r1
+#define pFFTSpec        r2
+#define scale           r3
+
+
+@// Output registers
+@// result reuses r0, the register that holds pSrc on entry.
+#define result          r0
+
+@//Local Scratch Registers
+@// Shared core registers: r1 (pDst, argTwiddle), r2 (pFFTSpec, argDst,
+@// diffMinusOne), r3 (scale, step), r4 (argScale, tmpOrder, pTwiddle, x0r,
+@// step1), r5 (pOut, x0i, pTwiddleTmp), r6 (subFFTNum, N, subFFTSizeTmp),
+@// r8 (count, twStep), r9 (diff, zero).
+@// order is r14, the link register, which every BL overwrites.
+
+#define argTwiddle      r1
+#define argDst          r2
+#define argScale        r4
+#define tmpOrder        r4
+#define pTwiddle        r4
+#define pOut            r5
+#define subFFTSize      r7
+#define subFFTNum       r6
+#define N               r6
+#define order           r14
+#define diff            r9
+@// Total num of radix stages required to complete the FFT
+#define count           r8
+#define x0r             r4
+#define x0i             r5
+#define diffMinusOne    r2
+#define subFFTSizeTmp   r6
+#define step            r3
+#define step1           r4
+#define twStep          r8
+#define zero            r9
+#define pTwiddleTmp     r5
+#define t0              r10
+
+@// Neon registers
+@// Every alias below, including the q-prefixed qT0..qT3, names a 64-bit D
+@// register. Several D registers carry more than one name: d2 (dZero, dX0r),
+@// d3 (dShift, dX0i, dX1), d4 (dX1r, dW0), d5 (dX1i, dW1), d10 (qT0, dY0),
+@// d12 (qT1, dY2), d14..d17 (dW0r/dY0r, dW0i/dY0i, dW1r/dY1r, dW1i/dY1i).
+
+#define dX0       d0.f32
+#define dzero     d1.f32
+#define dZero     d2.f32
+#define dShift    d3.f32
+#define dX0r      d2.f32
+#define dX0i      d3.f32
+#define dX1r      d4.f32
+#define dX1i      d5.f32
+#define dT0       d6.f32
+#define dT1       d7.f32
+#define dT2       d8.f32
+#define dT3       d9.f32
+#define qT0       d10.f32
+#define qT1       d12.f32
+#define dW0r      d14.f32
+#define dW0i      d15.f32
+#define dW1r      d16.f32
+#define dW1i      d17.f32
+#define dY0r      d14.f32
+#define dY0i      d15.f32
+#define dY1r      d16.f32
+#define dY1i      d17.f32
+#define dY0rS64   d14.s64
+#define dY0iS64   d15.s64
+#define qT2       d18.f32
+#define qT3       d20.f32
+@// lastThreeelements
+#define dX1       d3.f32
+#define dW0       d4.f32
+#define dW1       d5.f32
+#define dY0       d10.f32
+#define dY1       d11.f32
+#define dY2       d12.f32
+#define dY3       d13.f32
+
+@// half shares d0 with dX0.
+#define half      d0.f32
+
+@// 0.5 as a single-precision constant; its address is loaded with LDR t0, =HALF
+@// and the value placed in half[0] just before evenOddButterflyLoop.
+HALF:   .float  0.5
+
+    @// Allocate stack memory required by the function
+
+    @// Write function header
+    @// Entry: r0 = pSrc, r1 = pDst, r2 = pFFTSpec (see the register aliases of this file).
+    @// The scale alias is never referenced in this routine; its register r3 is
+    @// reused as step in the final fix-up.
+    @// Exit: r0 = OMX_Sts_NoErr; every path below finishes at the End label.
+        M_START     omxSP_FFTFwd_RToCCS_F32_Sfs,r11,d15
+
+@ Structure offsets for the FFTSpec
+        .set    ARMsFFTSpec_N, 0
+        .set    ARMsFFTSpec_pBitRev, 4
+        .set    ARMsFFTSpec_pTwiddle, 8
+        .set    ARMsFFTSpec_pBuf, 12
+
+        @// Define stack arguments
+
+        @// Read the size from structure and take log
+        LDR     N, [pFFTSpec, #ARMsFFTSpec_N]
+
+        @// Read other structure parameters
+        LDR     pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
+        LDR     pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
+
+        @//  N=1 Treat separately
+        CMP     N,#1
+        BGT     sizeGreaterThanOne
+        VLD1    dX0[0],[pSrc]
+        MOV     zero,#0
+        VMOV    dzero[0],zero
+        VMOV    dZero[0],zero
+        @// Stores three 32-bit words to pDst: the input sample followed by two zeros.
+        VST3    {dX0[0],dzero[0],dZero[0]},[pDst]
+
+        B       End
+
+
+
+sizeGreaterThanOne:
+        @// Do a N/2 point complex FFT including the scaling
+
+        MOV     N,N,ASR #1                          @// N/2 point complex FFT
+
+        CLZ     order,N                             @// N = 2^order
+        RSB     order,order,#31                     @// order = 31 - clz(N/2)
+        MOV     subFFTSize,#1
+        @//MOV     subFFTNum,N
+
+        CMP     order,#3
+        BGT     orderGreaterthan3                   @// order > 3
+
+        CMP     order,#1
+        BGE     orderGreaterthan0                   @// order > 0
+        @// order = 0: copy the single 64-bit element into pOut and let the fix-up read
+        @// it from there and write to pDst.
+        VLD1    dX0,[pSrc]
+        VST1    dX0,[pOut]
+        MOV     pSrc,pOut
+        MOV     argDst,pDst
+        BLT     FFTEnd                              @// LT still comes from CMP order,#1; nothing since has set flags
+
+orderGreaterthan0:
+        @// set the buffers appropriately for various orders
+        CMP     order,#2
+        MOVEQ   argDst,pDst
+        MOVNE   argDst,pOut
+        @// Pass the first stage destination in RN5
+        MOVNE   pOut,pDst
+        MOV     argTwiddle,pTwiddle                 @// r1 now holds the twiddle pointer, no longer pDst
+
+        CMP     order,#1
+        BGT     orderGreaterthan1
+        @// order = 1
+        BL      armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe
+        B       FFTEnd
+
+orderGreaterthan1:
+        CMP     order,#2
+        BGT     orderGreaterthan2
+        @// order =2
+        BL      armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe
+        BL      armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace_unsafe
+        B       FFTEnd
+
+orderGreaterthan2:@// order =3
+        BL      armSP_FFTFwd_CToC_FC32_Radix2_fs_OutOfPlace_unsafe
+        BL      armSP_FFTFwd_CToC_FC32_Radix2_OutOfPlace_unsafe
+        BL      armSP_FFTFwd_CToC_FC32_Radix2_ls_OutOfPlace_unsafe
+
+        B       FFTEnd
+
+
+
+orderGreaterthan3:
+specialScaleCase:
+
+        @// Both labels above mark the same address; this routine has no scaled variant.
+        @// Set input args to fft stages
+        TST     order, #2
+        MOVEQ   argDst,pDst
+        MOVNE   argDst,pOut
+        @// Pass the first stage destination in RN5
+        MOVNE   pOut,pDst
+        MOV     argTwiddle,pTwiddle
+
+        @//check for even or odd order
+        @// NOTE: The following combination of BL's would work fine even though
+        @// the first BL would corrupt the flags. This is because the end of
+        @// the "grpZeroSetLoop" loop inside
+        @// armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe sets the Z flag
+        @// to EQ
+
+        TST     order,#0x00000001
+        BLEQ    armSP_FFTFwd_CToC_FC32_Radix4_fs_OutOfPlace_unsafe
+        BLNE    armSP_FFTFwd_CToC_FC32_Radix8_fs_OutOfPlace_unsafe
+
+        CMP        subFFTNum,#4
+        BLT     FFTEnd
+
+
+unscaledRadix4Loop:
+        @// The BEQ tests the most recent CMP subFFTNum,#4 (the one above on entry, the
+        @// one after the BL on later iterations).
+        BEQ        lastStageUnscaledRadix4
+         BL        armSP_FFTFwd_CToC_FC32_Radix4_OutOfPlace_unsafe
+         CMP        subFFTNum,#4
+         B        unscaledRadix4Loop
+
+lastStageUnscaledRadix4:
+        BL      armSP_FFTFwd_CToC_FC32_Radix4_ls_OutOfPlace_unsafe
+        B        FFTEnd
+
+
+FFTEnd:
+finalComplexToRealFixup:
+
+        @// From here on the code reads the complex data through pSrc and writes the
+        @// real-FFT output through argDst, using argTwiddle for the twiddle table.
+
+        @// F(0) = 1/2[Z(0) + Z'(0)] - j [Z(0) - Z'(0)]
+        @// 1/2[(a+jb) + (a-jb)] - j  [(a+jb) - (a-jb)]
+        @// 1/2[2a+j0] - j [0+j2b]
+        @// (a+b, 0)
+
+        @// F(N/2) = 1/2[Z(0) + Z'(0)] + j [Z(0) - Z'(0)]
+        @// 1/2[(a+jb) + (a-jb)] + j  [(a+jb) - (a-jb)]
+        @// 1/2[2a+j0] + j [0+j2b]
+        @// (a-b, 0)
+
+        @// F(0) and F(N/2)
+        VLD2    {dX0r[0],dX0i[0]},[pSrc]!
+        MOV     zero,#0
+        VMOV    dX0r[1],zero
+        MOV     step,subFFTSize,LSL #3            @// step = N/2 * 8 bytes
+        VMOV    dX0i[1],zero
+        @// twStep = 3N/8 * 8 bytes pointing to W^1
+        SUB     twStep,step,subFFTSize,LSL #1
+
+        VADD    dY0r,dX0r,dX0i                    @// F(0) = ((Z0.r+Z0.i) , 0)
+        MOV     step1,subFFTSize,LSL #2           @// step1 = N/2 * 4 bytes
+        VSUB    dY0i,dX0r,dX0i                    @// F(N/2) = ((Z0.r-Z0.i) , 0)
+        SUBS    subFFTSize,subFFTSize,#2
+
+        VST1    dY0r,[argDst],step
+        ADD     pTwiddleTmp,argTwiddle,#8         @// W^2
+        VST1    dY0i,[argDst]!
+        ADD     argTwiddle,argTwiddle,twStep      @// W^1
+
+        VDUP    dzero,zero
+        SUB     argDst,argDst,step                @// argDst = output start + 8, the slot after F(0)
+
+        @// Flags are still those of SUBS subFFTSize,subFFTSize,#2: LT when subFFTSize
+        @// was 1, EQ when it was 2.
+        BLT     End
+        BEQ     lastElement
+        SUB     step,step,#24
+        SUB     step1,step1,#8                    @// (N/4-1)*8 bytes
+
+        @// F(k) = 1/2[Z(k) +  Z'(N/2-k)] -j*W^(k) [Z(k) -  Z'(N/2-k)]
+        @// Note: W^k is stored as negative values in the table
+        @// Process 4 elements at a time. E.g: F(1),F(2) and F(N/2-2),F(N/2-1)
+        @// since both of them require Z(1),Z(2) and Z(N/2-2),Z(N/2-1)
+
+
+        LDR     t0, =HALF
+        VLD1    half[0], [t0]
+
+evenOddButterflyLoop:
+
+
+        VLD1    dW0r,[argTwiddle],step1
+        VLD1    dW1r,[argTwiddle]!
+
+        VLD2    {dX0r,dX0i},[pSrc],step
+        SUB     argTwiddle,argTwiddle,step1
+        VLD2    {dX1r,dX1i},[pSrc]!
+
+
+
+        SUB     step1,step1,#8                    @// (N/4-2)*8 bytes
+        VLD1    dW0i,[pTwiddleTmp],step1
+        VLD1    dW1i,[pTwiddleTmp]!
+        SUB     pSrc,pSrc,step
+
+        SUB     pTwiddleTmp,pTwiddleTmp,step1
+        VREV64  dX1r,dX1r
+        VREV64  dX1i,dX1i
+        SUBS    subFFTSize,subFFTSize,#4          @// loop counter; tested by the BGT at the bottom
+
+
+
+        VSUB    dT2,dX0r,dX1r                     @// a-c
+        SUB     step1,step1,#8
+        VADD    dT0,dX0r,dX1r                     @// a+c
+        VSUB    dT1,dX0i,dX1i                     @// b-d
+        VADD    dT3,dX0i,dX1i                     @// b+d
+        VMUL   dT0,dT0,half[0]
+        VMUL   dT1,dT1,half[0]
+        VZIP    dW1r,dW1i
+        VZIP    dW0r,dW0i
+
+
+        VMUL   qT0,dW1r,dT2
+        VMUL   qT1,dW1r,dT3
+        VMUL   qT2,dW0r,dT2
+        VMUL   qT3,dW0r,dT3
+
+        VMLA   qT0,dW1i,dT3
+        VMLS   qT1,dW1i,dT2
+
+        VMLS   qT2,dW0i,dT3
+        VMLA   qT3,dW0i,dT2
+
+
+        VMUL  dX1r,qT0,half[0]
+        VMUL  dX1i,qT1,half[0]
+
+        VSUB    dY1r,dT0,dX1i                     @// F(N/2 -1)
+        VADD    dY1i,dT1,dX1r
+        VNEG    dY1i,dY1i
+
+        VREV64  dY1r,dY1r
+        VREV64  dY1i,dY1i
+
+
+        VMUL  dX0r,qT2,half[0]
+        VMUL  dX0i,qT3,half[0]
+
+        VSUB    dY0r,dT0,dX0i                     @// F(1)
+        VADD    dY0i,dT1,dX0r
+
+
+        VST2    {dY0r,dY0i},[argDst],step
+        VST2    {dY1r,dY1i},[argDst]!
+        SUB     argDst,argDst,step
+        SUB     step,step,#32                     @// (N/2-4)*8 bytes
+
+
+        BGT     evenOddButterflyLoop              @// flags from SUBS subFFTSize,subFFTSize,#4 above
+
+        @// set both the ptrs to the last element
+        SUB     pSrc,pSrc,#8
+        SUB     argDst,argDst,#8
+
+
+
+        @// Last element can be expanded as follows
+        @// 1/2[Z(k) + Z'(k)] + j w^k [Z(k) - Z'(k)]
+        @// 1/2[(a+jb) + (a-jb)] + j w^k [(a+jb) - (a-jb)]
+        @// 1/2[2a+j0] + j (c+jd) [0+j2b]
+        @// (a-bc, -bd)
+        @// Since (c,d) = (0,1) for the last element, result is just (a,-b)
+
+lastElement:
+        VLD1    dX0r,[pSrc]
+
+        VST1    dX0r[0],[argDst]!                 @// store a unchanged
+        VNEG    dX0r,dX0r
+        VST1    dX0r[1],[argDst]!                 @// store -b
+
+End:
+        @// Set return value
+        MOV     result, #OMX_Sts_NoErr
+
+        @// Write function tail
+        M_END
+
+        .end
--- a/media/openmax_dl/dl/sp/src/omxSP_FFTFwd_RToCCS_S16S32_Sfs_s.S
+++ b/media/openmax_dl/dl/sp/src/omxSP_FFTFwd_RToCCS_S16S32_Sfs_s.S
@ -0,0 +1,158 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This file was originally licensed as follows. It has been
+@//  relicensed with permission from the copyright holders.
+@//
+
+@// 
+@// File Name:  omxSP_FFTFwd_RToCCS_S16S32_Sfs_s.s
+@// OpenMAX DL: v1.0.2
+@// Last Modified Revision:   7403
+@// Last Modified Date:       Mon, 17 Sep 2007
+@// 
+@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@// 
+@// 
+@//
+@// Description:
+@// Compute FFT for a real signal
+@// 
+
+        
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+        
+        
+@// Import symbols required from other files
+@// (For example tables)
+        .extern  omxSP_FFTFwd_RToCCS_S32_Sfs
+        
+        
+@// Set debugging level        
+@//DEBUG_ON    SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+    
+    
+    
+    @// Guarding implementation by the processor name
+    
+@// Import symbols required from other files
+@// (For example tables)
+             
+    
+@//Input Registers
+@// All names in this section are preprocessor aliases for registers, not
+@// separate storage.
+
+#define pSrc            r0
+#define pDst            r1
+#define pFFTSpec        r2
+#define scale           r3
+        
+@// Output registers
+@// result reuses r0, the register that holds pSrc on entry.
+#define result          r0
+        
+#define pTmpDst         r4
+#define pTmpSrc         r5
+#define N               r6
+#define order           r7
+#define pOut            r8
+
+@// Neon registers
+@// dX0 and dX0S32 are two typed views of D0. qY0 and qX0 both name Q1, and
+@// dY0S32 / dY1S32 (D2 / D3) are the low and high halves of that same Q1.
+
+#define dX0             D0.S16
+#define qY0             Q1.S32
+#define dY0S32          D2.S32
+#define qX0             Q1.S32
+#define dY1S32          D3.S32
+#define dX0S32          D0.S32
+
+
+
+
+    @// Allocate stack memory required by the function
+        
+    @// Write function header
+    @// Widens the S16 input to S32 and then calls omxSP_FFTFwd_RToCCS_S32_Sfs on the
+    @// widened copy. Entry: r0 = pSrc, r1 = pDst, r2 = pFFTSpec, r3 = scale.
+    @// r1, r2 and r3 are not written before the BL, so the callee receives them as
+    @// passed in; only r0 is redirected to the widened data.
+        M_START     omxSP_FFTFwd_RToCCS_S16S32_Sfs,r11,d15
+
+@ Structure offsets for the FFTSpec             
+        .set    ARMsFFTSpec_N, 0
+        .set    ARMsFFTSpec_pBitRev, 4
+        .set    ARMsFFTSpec_pTwiddle, 8
+        .set    ARMsFFTSpec_pBuf, 12
+        
+        @// Define stack arguments
+        
+        @// Read the size from structure and take log
+        LDR     N, [pFFTSpec, #ARMsFFTSpec_N]
+        
+        @// Read other structure parameters
+        @//LDR     pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
+        LDR     pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
+        
+        
+        @//  N=1 Treat separately  
+        CMP     N,#1
+        BGT     sizeGreaterThanOne
+        VLD1    dX0[0],[pSrc]                       @// load one 16-bit sample
+        VMOVL   qY0,dX0                             @// sign-extend the S16 lanes to S32
+        VST1    dY0S32[0],[pDst]                    @// store the one widened sample in pDst
+        
+        MOV     pSrc,pDst                           @// the callee is given pDst as both source and destination
+        B       realS32FFT
+        
+sizeGreaterThanOne:        
+        MOV     N,N,ASR #1                          @// N now counts pairs of input samples
+        
+        CLZ     order,N                             @// N = 2^order 
+        RSB     order,order,#31
+        
+        @// Pick the buffer that receives the widened copy: pOut for even order, pDst
+        @// for odd order. pTmpSrc remembers its start for the call below.
+        TST     order,#1
+        MOVEQ   pTmpDst,pOut
+        MOVNE   pTmpDst,pDst
+        MOV     pTmpSrc,pTmpDst
+        
+        CMP     N,#1
+        BGT     copyS16ToS32
+        @// Exactly two input samples: load them as one 32-bit lane, widen, and store
+        @// the two resulting S32 values.
+        VLD1    dX0S32[0],[pSrc]
+        VMOVL   qX0,dX0
+        VST1    dY0S32,[pTmpDst]
+        B       setpSrc
+
+        
+copyS16ToS32:               
+        
+        @// Each pass widens four S16 samples into four S32 values and lowers N by 2.
+        VLD1    dX0,[pSrc]!
+        SUBS    N,N,#2
+        VMOVL   qX0,dX0
+        VST1    {dY0S32,dY1S32},[pTmpDst]!
+        BGT     copyS16ToS32                        @// flags from the SUBS above
+ 
+setpSrc:                
+        MOV     pSrc,pTmpSrc                        @// r0 = start of the widened copy
+        
+        
+              
+realS32FFT:             
+        BL      omxSP_FFTFwd_RToCCS_S32_Sfs        
+        
+                
+                       
+End:                            
+        @// Set return value
+        @// NOTE(review): this overwrites whatever the callee left in r0, so a status
+        @// returned by omxSP_FFTFwd_RToCCS_S32_Sfs is not propagated - confirm intended.
+        MOV     result, #OMX_Sts_NoErr       
+
+        @// Write function tail
+        M_END
+        .end
+                
--- a/media/openmax_dl/dl/sp/src/omxSP_FFTFwd_RToCCS_S32_Sfs_s.S
+++ b/media/openmax_dl/dl/sp/src/omxSP_FFTFwd_RToCCS_S32_Sfs_s.S
@ -0,0 +1,549 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This file was originally licensed as follows. It has been
+@//  relicensed with permission from the copyright holders.
+@//
+
+@// 
+@// File Name:  omxSP_FFTFwd_RToCCS_S32_Sfs_s.s
+@// OpenMAX DL: v1.0.2
+@// Last Modified Revision:   7810
+@// Last Modified Date:       Thu, 04 Oct 2007
+@// 
+@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@// 
+@// 
+@//
+@// Description:
+@// Compute FFT for a real signal
+@// 
+
+
+        
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+        
+        
+@// Import symbols required from other files
+@// (For example tables)
+        
+        .extern  armSP_FFTFwd_CToC_SC32_Sfs_Radix2_fs_OutOfPlace_unsafe 
+        .extern  armSP_FFTFwd_CToC_SC32_Radix2_fs_OutOfPlace_unsafe
+        .extern  armSP_FFTFwd_CToC_SC32_Radix4_fs_OutOfPlace_unsafe 
+        .extern  armSP_FFTFwd_CToC_SC32_Radix8_fs_OutOfPlace_unsafe 
+        .extern  armSP_FFTFwd_CToC_SC32_Radix4_OutOfPlace_unsafe
+        .extern  armSP_FFTFwd_CToC_SC32_Sfs_Radix4_fs_OutOfPlace_unsafe 
+        .extern  armSP_FFTFwd_CToC_SC32_Sfs_Radix8_fs_OutOfPlace_unsafe 
+        .extern  armSP_FFTFwd_CToC_SC32_Sfs_Radix4_OutOfPlace_unsafe
+        .extern  armSP_FFTFwd_CToC_SC32_Sfs_Radix2_OutOfPlace_unsafe
+        .extern  armSP_FFTFwd_CToC_SC32_Radix2_OutOfPlace_unsafe   
+        
+@// Set debugging level        
+@//DEBUG_ON    SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+    
+    
+    
+    @// Guarding implementation by the processor name
+    
+@// Import symbols required from other files
+@// (For example tables)
+        .extern  armSP_FFTFwd_CToC_SC32_Radix4_ls_OutOfPlace_unsafe     
+        .extern  armSP_FFTFwd_CToC_SC32_Radix2_ls_OutOfPlace_unsafe
+        .extern  armSP_FFTFwd_CToC_SC32_Sfs_Radix4_ls_OutOfPlace_unsafe
+        .extern  armSP_FFTFwd_CToC_SC32_Sfs_Radix2_ls_OutOfPlace_unsafe      
+     
+    
+@// Input registers
+@// NOTE(review): r0-r3 are presumably the argument registers of the C prototype - confirm
+#define pSrc            r0
+#define pDst            r1
+#define pFFTSpec        r2
+#define scale           r3
+
+
+@// Output registers
+#define result          r0
+
+@// Local scratch registers
+@// Several names alias one core register; order is r14 (lr), which every BL overwrites
+#define argTwiddle      r1
+#define argDst          r2
+#define argScale        r4
+#define tmpOrder        r4
+#define pTwiddle        r4
+#define pOut            r5
+#define subFFTSize      r7     
+#define subFFTNum       r6
+#define N               r6
+#define order           r14
+#define diff            r9
+@// Total num of radix stages required to complete the FFT
+#define count           r8
+#define x0r             r4    
+#define x0i             r5
+#define diffMinusOne    r2
+#define subFFTSizeTmp   r6
+#define step            r3
+#define step1           r4
+#define twStep          r8
+#define zero            r9
+#define pTwiddleTmp     r5
+#define t0              r10
+
+@// Neon registers
+@// Aliases overlap here too: dZero/dX0r are d2, dShift/dX0i/dX1 are d3, dW0r..dW1i and dY0r..dY1i are d14..d17
+#define dX0       d0.s32
+#define dzero     d1.s32
+#define dZero     d2.s32
+#define dShift    d3.s32
+#define dX0r      d2.s32            
+#define dX0i      d3.s32
+#define dX1r      d4.s32
+#define dX1i      d5.s32
+#define dT0       d6.s32
+#define dT1       d7.s32
+#define dT2       d8.s32
+#define dT3       d9.s32
+#define qT0       q5.s64
+#define qT1       q6.s64
+#define dW0r      d14.s32
+#define dW0i      d15.s32
+#define dW1r      d16.s32
+#define dW1i      d17.s32
+#define dY0r      d14.s32
+#define dY0i      d15.s32
+#define dY1r      d16.s32
+#define dY1i      d17.s32
+#define dY0rS64   d14.s64
+#define dY0iS64   d15.s64
+#define qT2       q9.s64
+#define qT3       q10.s64
+@// Last three elements (original label; dX1, dW0, dW1 reuse d3, d4, d5)
+#define dX1       d3.s32
+#define dW0       d4.s32
+#define dW1       d5.s32
+#define dY0       d10.s32
+#define dY1       d11.s32
+#define dY2       d12.s32
+#define dY3       d13.s32
+
+    @// Entry point of omxSP_FFTFwd_RToCCS_S32_Sfs.
+    @// Reserve the diffOnStack slot; diff is stored there and reloaded after the stage calls.
+        M_ALLOC4        diffOnStack, 4
+        
+    @// Write function header (NOTE(review): r11,d15 presumably name the last registers to save - confirm in armCOMM_s.h)
+        M_START     omxSP_FFTFwd_RToCCS_S32_Sfs,r11,d15
+        
+@// Byte offsets of the FFTSpec structure fields
+        .set    ARMsFFTSpec_N, 0
+        .set    ARMsFFTSpec_pBitRev, 4
+        .set    ARMsFFTSpec_pTwiddle, 8
+        .set    ARMsFFTSpec_pBuf, 12
+
+        @// Define stack arguments
+        
+        @// Read the size from structure (its log is taken at sizeGreaterThanOne)
+        LDR     N, [pFFTSpec, #ARMsFFTSpec_N]
+        
+        @// Read other structure parameters
+        LDR     pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
+        LDR     pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
+        
+        @//  N=1: treat separately. Stores x0 rounding-shifted right by scale, then two zero words.
+        CMP     N,#1
+        BGT     sizeGreaterThanOne
+        VLD1    dX0[0],[pSrc]
+        RSB     scale,scale,#0                        @// to use VRSHL for right shift by a variable
+        MOV     zero,#0
+        VMOV    dShift[0],scale
+        VMOV    dzero[0],zero
+        VRSHL   dX0,dShift
+        VMOV    dZero[0],zero
+        VST3    {dX0[0],dzero[0],dZero[0]},[pDst]
+                
+        B       End
+        
+                
+        
+sizeGreaterThanOne:
+        @// Do a N/2 point complex FFT including the scaling
+              
+        MOV     N,N,ASR #1                          @// N/2 point complex FFT
+                               
+        CLZ     order,N                             @// N = 2^order 
+        RSB     order,order,#31     
+        MOV     subFFTSize,#1
+        @//MOV     subFFTNum,N
+        @// (subFFTNum and N are both r6, so the move above is not needed)
+        CMP     order,#3
+        BGT     orderGreaterthan3                   @// order > 3
+        @// order = 0: copy the single complex point to pOut and save all of scale for FFTEnd
+        CMP     order,#1
+        BGE     orderGreaterthan0                   @// order > 0
+        M_STR   scale, diffOnStack,LT               @// order = 0
+        VLD1    dX0,[pSrc]
+        VST1    dX0,[pOut]
+        MOV     pSrc,pOut
+        MOV     argDst,pDst
+        BLT     FFTEnd                              @// conditional branch (B + LT), not a call
+        
+orderGreaterthan0:
+        @// set the buffers appropriately for various orders
+        CMP     order,#2
+        MOVEQ   argDst,pDst        
+        MOVNE   argDst,pOut
+        MOVNE   pOut,pDst                           @// Pass the first stage destination in RN5
+        MOV     argTwiddle,pTwiddle
+        @// diff = scale - order is saved for FFTEnd; scale itself is clamped to order
+        SUBS     diff,scale,order
+        M_STR   diff,diffOnStack
+        MOVGT   scale,order
+        @// Now scale <= order
+        @// NOTE(review): in each conditional BL pair below the second BL sees the flags left by the first callee - confirm the callees return with Z set
+        CMP     order,#1
+        BGT     orderGreaterthan1
+        SUBS    scale,scale,#1
+        BLEQ    armSP_FFTFwd_CToC_SC32_Sfs_Radix2_fs_OutOfPlace_unsafe  @// order = 1
+        BLLT    armSP_FFTFwd_CToC_SC32_Radix2_fs_OutOfPlace_unsafe      @// order = 1
+        B       FFTEnd
+
+orderGreaterthan1:
+        CMP     order,#2
+        MOV     argScale,scale
+        BGT     orderGreaterthan2
+        SUBS    argScale,argScale,#1
+        BLGE    armSP_FFTFwd_CToC_SC32_Sfs_Radix2_fs_OutOfPlace_unsafe      @// order =2          
+        BLLT    armSP_FFTFwd_CToC_SC32_Radix2_fs_OutOfPlace_unsafe  
+        SUBS    argScale,argScale,#1
+        BLEQ    armSP_FFTFwd_CToC_SC32_Sfs_Radix2_ls_OutOfPlace_unsafe  
+        BLLT    armSP_FFTFwd_CToC_SC32_Radix2_ls_OutOfPlace_unsafe  
+        B       FFTEnd
+        
+orderGreaterthan2:@// order =3: first, middle and last radix-2 stage, each scaled while argScale lasts
+        SUBS    argScale,argScale,#1
+        BLGE    armSP_FFTFwd_CToC_SC32_Sfs_Radix2_fs_OutOfPlace_unsafe      
+        BLLT    armSP_FFTFwd_CToC_SC32_Radix2_fs_OutOfPlace_unsafe  
+        SUBS    argScale,argScale,#1
+        BLGE    armSP_FFTFwd_CToC_SC32_Sfs_Radix2_OutOfPlace_unsafe  
+        BLLT    armSP_FFTFwd_CToC_SC32_Radix2_OutOfPlace_unsafe
+        SUBS    argScale,argScale,#1
+        BLEQ    armSP_FFTFwd_CToC_SC32_Sfs_Radix2_ls_OutOfPlace_unsafe      
+        BLLT    armSP_FFTFwd_CToC_SC32_Radix2_ls_OutOfPlace_unsafe    
+        B       FFTEnd
+
+        
+
+orderGreaterthan3:
+        @// check scale = 0 or scale = order
+        SUBS    diff, scale, order                 @// diff = scale - order
+        MOVGT   scale,order     
+        BGE     specialScaleCase                   @// scale >= order: handled as scale = order
+        CMP     scale,#0
+        BEQ     specialScaleCase
+        B       generalScaleCase
+        
+specialScaleCase:@//  scale = 0 or scale = order; only reached with order > 3
+        
+        TST     order, #2                           @// Set input args to fft stages
+        MOVEQ   argDst,pDst        
+        MOVNE   argDst,pOut
+        MOVNE   pOut,pDst                           @// Pass the first stage destination in RN5
+        MOV     argTwiddle,pTwiddle  
+
+        CMP      diff,#0
+        M_STR    diff, diffOnStack
+        BGE      scaleEqualsOrder  
+       
+        @// Unscaled path (diff < 0, i.e. scale = 0): check for even or odd order
+        @// NOTE: The following combination of BLs would work fine even though the first
+        @// BL would corrupt the flags. This is because the end of the "grpZeroSetLoop" loop inside 
+        @// armSP_FFTFwd_CToC_SC32_Radix4_fs_OutOfPlace_unsafe sets the Z flag to EQ
+        
+        TST     order,#0x00000001
+        BLEQ    armSP_FFTFwd_CToC_SC32_Radix4_fs_OutOfPlace_unsafe 
+        BLNE    armSP_FFTFwd_CToC_SC32_Radix8_fs_OutOfPlace_unsafe
+        
+        CMP        subFFTNum,#4
+        BLT     FFTEnd                              @// conditional branch (B + LT), not a call
+         
+        @// Loop entry reuses the flags of the CMP above; EQ means one radix-4 stage is left
+unscaledRadix4Loop:
+        BEQ        lastStageUnscaledRadix4
+         BL        armSP_FFTFwd_CToC_SC32_Radix4_OutOfPlace_unsafe
+         CMP        subFFTNum,#4
+         B        unscaledRadix4Loop
+         
+lastStageUnscaledRadix4:
+        BL      armSP_FFTFwd_CToC_SC32_Radix4_ls_OutOfPlace_unsafe 
+        B        FFTEnd        
+         
+
+scaleEqualsOrder:
+        @// Scaled path (diff >= 0): check for even or odd order
+        @// NOTE: The following combination of BLs would work fine even though the first
+        @// BL would corrupt the flags. This is because the end of the "grpZeroSetLoop" loop inside 
+        @// armSP_FFTFwd_CToC_SC32_Radix4_fs_OutOfPlace_unsafe sets the Z flag to EQ
+                
+        TST     order,#0x00000001
+        BLEQ    armSP_FFTFwd_CToC_SC32_Sfs_Radix4_fs_OutOfPlace_unsafe 
+        BLNE    armSP_FFTFwd_CToC_SC32_Sfs_Radix8_fs_OutOfPlace_unsafe 
+        
+        CMP        subFFTNum,#4
+        BLT     FFTEnd                              @// conditional branch (B + LT), not a call
+        
+        @// Loop entry reuses the flags of the CMP above; EQ means one radix-4 stage is left
+scaledRadix4Loop:
+        BEQ        lastStageScaledRadix4
+         BL        armSP_FFTFwd_CToC_SC32_Sfs_Radix4_OutOfPlace_unsafe
+         CMP        subFFTNum,#4
+         B        scaledRadix4Loop         
+         
+lastStageScaledRadix4:
+        BL      armSP_FFTFwd_CToC_SC32_Sfs_Radix4_ls_OutOfPlace_unsafe 
+        B        FFTEnd        
+        
+generalScaleCase:@// 0 < scale < order; only reached with order > 3
+        @// Determine the correct destination buffer
+        SUB     diff,order,scale
+        TST     diff,#0x01
+        ADDEQ   count, scale,diff,lsr #1         @// count = scale + (order - scale)/2
+        MOVNE   count, order
+        TST     count, #0x01                     @// Is count even or odd ?
+        
+        MOVEQ   argDst,pDst                     @// Set input args to fft stages
+        MOVNE   argDst,pOut
+        MOVNE   pOut,pDst                       @// Pass the first stage destination in RN5
+        MOV     argTwiddle,pTwiddle  
+
+        M_STR   diff, diffOnStack    
+        @// Run scaled radix-2 stages while argScale lasts, then unscaled stages chosen by the parity of diff
+        MOV     argScale,scale                  @// Put scale in RN4 so as to save and restore
+        BL      armSP_FFTFwd_CToC_SC32_Sfs_Radix2_fs_OutOfPlace_unsafe     @// scaled first stage
+        SUBS    argScale,argScale,#1
+        
+scaledRadix2Loop:
+        BLGT    armSP_FFTFwd_CToC_SC32_Sfs_Radix2_OutOfPlace_unsafe
+        SUBS    argScale,argScale,#1            @// save and restore scale (RN4) in the scaled stages
+        BGT     scaledRadix2Loop
+        
+        
+        M_LDR   diff, diffOnStack  
+        @// check for even or odd diff (order - scale)
+        TST     diff,#0x00000001
+        BEQ     generalUnscaledRadix4Loop
+        B       unscaledRadix2Loop
+
+generalUnscaledRadix4Loop:
+        CMP        subFFTNum,#4
+         BEQ        generalLastStageUnscaledRadix4
+         BL        armSP_FFTFwd_CToC_SC32_Radix4_OutOfPlace_unsafe
+         B        generalUnscaledRadix4Loop 
+         
+generalLastStageUnscaledRadix4:
+        BL      armSP_FFTFwd_CToC_SC32_Radix4_ls_OutOfPlace_unsafe 
+        B        finalComplexToRealFixup             
+        @// (FFTEnd is bypassed: diffOnStack holds order - scale here, which FFTEnd would apply as a shift)
+
+unscaledRadix2Loop:
+        CMP        subFFTNum,#2
+         BEQ        generalLastStageUnscaledRadix2
+         BL        armSP_FFTFwd_CToC_SC32_Radix2_OutOfPlace_unsafe
+         B        unscaledRadix2Loop        
+
+generalLastStageUnscaledRadix2:
+        BL      armSP_FFTFwd_CToC_SC32_Radix2_ls_OutOfPlace_unsafe 
+        B        finalComplexToRealFixup             
+
+       
+FFTEnd:@// Does only the scaling
+        @// diff is the saved scale - order (or scale itself when order = 0); only a positive value is applied
+        M_LDR   diff, diffOnStack  
+        CMP     diff,#0
+        BLE     finalComplexToRealFixup
+        
+        RSB     diff,diff,#0                        @// to use VRSHL for right shift by a variable
+        VDUP    dShift,diff
+        
+        @// save subFFTSize and use subFFTSizeTmp in the following loop
+        MOV    subFFTSizeTmp,subFFTSize                 @// subFFTSizeTmp same reg as subFFTNum      
+        
+scaleFFTData:@// N = subFFTSize  ; dataptr = pDst  ; scale = diff
+        VLD1    {dX0},[pSrc]            @// pSrc contains pDst pointer
+        SUBS    subFFTSizeTmp,subFFTSizeTmp,#1
+        VRSHL   dX0,dShift
+        VST1    {dX0},[pSrc]!
+                
+        BGT     scaleFFTData                        @// one 8-byte complex point per pass, shifted in place
+        
+        SUB     pSrc,pSrc,subFFTSize,LSL #3             @// reset pSrc for final fixup
+        
+        @//  change the logic so that output after scaling is in pOut and not in pDst
+        @//  finally store from pOut to pDst
+        @//  change branch "End" to branch "finalComplexToRealFixup" in the above
+        @//  chk the code below for multiplication by j factor
+
+finalComplexToRealFixup:
+        @// Turn the complex FFT result Z read from pSrc into the output F written through argDst.
+        @// In the derivations below Z(0) = a + jb.
+        @// F(0) = 1/2[Z(0) + Z'(0)] - j [Z(0) - Z'(0)]
+        @// 1/2[(a+jb) + (a-jb)] - j  [(a+jb) - (a-jb)]
+        @// 1/2[2a+j0] - j [0+j2b]
+        @// (a+b, 0)
+        
+        @// F(N/2) = 1/2[Z(0) + Z'(0)] + j [Z(0) - Z'(0)]
+        @// 1/2[(a+jb) + (a-jb)] + j  [(a+jb) - (a-jb)]
+        @// 1/2[2a+j0] + j [0+j2b]
+        @// (a-b, 0)
+       
+        @// F(0) and F(N/2)
+        VLD2    {dX0r[0],dX0i[0]},[pSrc]!
+        MOV     zero,#0
+        VMOV    dX0r[1],zero
+        MOV     step,subFFTSize,LSL #3                  @// step = N/2 * 8 bytes
+        VMOV    dX0i[1],zero
+        SUB     twStep,step,subFFTSize,LSL #1           @// twStep = 3N/8 * 8 bytes pointing to W^1
+        
+        VADD    dY0r,dX0r,dX0i                          @// F(0) = ((Z0.r+Z0.i) , 0)
+        MOV     step1,subFFTSize,LSL #2                 @// step1 = N/2 * 4 bytes
+        VSUB    dY0i,dX0r,dX0i                            @// F(N/2) = ((Z0.r-Z0.i) , 0)
+        SUBS    subFFTSize,subFFTSize,#2
+        
+        VST1    dY0r,[argDst],step
+        ADD     pTwiddleTmp,argTwiddle,#8                @// W^2
+        VST1    dY0i,[argDst]!
+        ADD     argTwiddle,argTwiddle,twStep             @// W^1 
+        
+        VDUP    dzero,zero
+        SUB     argDst,argDst,step
+        @// Flags are still those of the SUBS above: LT if subFFTSize was 1, EQ if it was 2
+        BLT     End
+        BEQ     lastElement
+        SUB     step,step,#24
+        SUB     step1,step1,#8                         @// (N/4-1)*8 bytes
+        
+        @// F(k) = 1/2[Z(k) +  Z'(N/2-k)] -j*W^(k) [Z(k) -  Z'(N/2-k)]
+        @// Note: W^k is stored as negative values in the table
+        @// Process 4 elements at a time. E.g: F(1),F(2) and F(N/2-2),F(N/2-1) since both of them
+        @// require Z(1),Z(2) and Z(N/2-2),Z(N/2-1)
+        
+        
+evenOddButterflyLoop:
+        @// Each pass loads two points from the front of Z and two from the back (reached via step)
+        
+        VLD1    dW0r,[argTwiddle],step1
+        VLD1    dW1r,[argTwiddle]!
+        
+        VLD2    {dX0r,dX0i},[pSrc],step
+        SUB     argTwiddle,argTwiddle,step1
+        VLD2    {dX1r,dX1i},[pSrc]!
+        
+        
+        
+        SUB     step1,step1,#8                          @// (N/4-2)*8 bytes
+        VLD1    dW0i,[pTwiddleTmp],step1
+        VLD1    dW1i,[pTwiddleTmp]!
+        SUB     pSrc,pSrc,step
+        
+        SUB     pTwiddleTmp,pTwiddleTmp,step1
+        VREV64  dX1r,dX1r
+        VREV64  dX1i,dX1i
+        SUBS    subFFTSize,subFFTSize,#4                @// sets the flags tested by the BGT at the loop end
+        
+                
+        
+        VSUB    dT2,dX0r,dX1r                            @// a-c
+        SUB     step1,step1,#8
+        VADD    dT3,dX0i,dX1i                            @// b+d
+        VADD    dT0,dX0r,dX1r                           @// a+c
+        VSUB    dT1,dX0i,dX1i                            @// b-d
+        VHADD   dT0,dT0,dzero                           @// halving add with zero: (a+c)/2
+        VHADD   dT1,dT1,dzero                           @// halving add with zero: (b-d)/2
+        
+        VZIP    dW1r,dW1i
+        vzip    dW0r,dW0i
+        
+                                
+        VMULL   qT0,dW1r,dT2
+        VMLAL   qT0,dW1i,dT3
+        VMULL   qT1,dW1r,dT3
+        VMLSL   qT1,dW1i,dT2
+                    
+        VMULL   qT2,dW0r,dT2
+        VMLSL   qT2,dW0i,dT3
+        VMULL   qT3,dW0r,dT3
+        VMLAL   qT3,dW0i,dT2
+        
+        @// Narrow each 64-bit product sum to its rounded upper 32 bits
+        VRSHRN  dX1r,qT0,#32
+        VRSHRN  dX1i,qT1,#32
+        
+        VSUB    dY1r,dT0,dX1i                           @// F(N/2 -1)
+        VADD    dY1i,dT1,dX1r
+        VNEG    dY1i,dY1i
+        
+        VREV64  dY1r,dY1r
+        VREV64  dY1i,dY1i
+        
+                            
+        VRSHRN  dX0r,qT2,#32
+        VRSHRN  dX0i,qT3,#32
+        
+        
+        VSUB    dY0r,dT0,dX0i                           @// F(1)
+        VADD    dY0i,dT1,dX0r
+        
+        
+        VST2    {dY0r,dY0i},[argDst],step
+        VST2    {dY1r,dY1i},[argDst]!
+        SUB     argDst,argDst,step
+        SUB     step,step,#32                            @// (N/2-4)*8 bytes
+        
+        @// No instruction since the SUBS subFFTSize above writes the flags
+        BGT     evenOddButterflyLoop
+        
+        SUB     pSrc,pSrc,#8                @// set both the ptrs to the last element
+        SUB     argDst,argDst,#8
+        
+        
+                                       
+        @// Last element can be expanded as follows
+        @// 1/2[Z(k) + Z'(k)] + j w^k [Z(k) - Z'(k)]
+        @// 1/2[(a+jb) + (a-jb)] + j w^k [(a+jb) - (a-jb)]
+        @// 1/2[2a+j0] + j (c+jd) [0+j2b]
+        @// (a-bc, -bd)
+        @// Since (c,d) = (0,1) for the last element, result is just (a,-b)
+        
+lastElement:
+        VLD1    dX0r,[pSrc]
+        @// Store a, negate the pair, then store the negated second word (-b)
+        VST1    dX0r[0],[argDst]!
+        VNEG    dX0r,dX0r
+        VST1    dX0r[1],[argDst]!
+        
+        
+        
+        
+                
+        @// Every path of this routine ends here; no argument is validated before use
+End:
+        @// Set return value
+        MOV     result, #OMX_Sts_NoErr       
+
+        @// Write function tail
+        M_END
+
+        .end
+        
--- a/media/openmax_dl/dl/sp/src/omxSP_FFTGetBufSize_C_FC32.c
+++ b/media/openmax_dl/dl/sp/src/omxSP_FFTGetBufSize_C_FC32.c
@ -0,0 +1,52 @@
+/*
+ *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "dl/api/armOMX.h"
+#include "dl/api/omxtypes.h"
+#include "dl/sp/api/armSP.h"
+#include "dl/sp/api/omxSP.h"
+
+
+/**
+ * Function:  omxSP_FFTGetBufSize_C_FC32
+ *
+ * Description:
+ * Reports how many bytes the specification structure needs for the
+ * length 2^order complex FFT and IFFT, i.e. for use with the 32-bit
+ * functions <FFTFwd_CToC_FC32_Sfs> and <FFTInv_CToC_FC32_Sfs>.
+ *
+ * Input Arguments:
+ *
+ *   order - base-2 logarithm of the desired block length; valid in the range
+ *            [1,12] ([1,15] if BIG_FFT_TABLE is defined.)
+ *
+ * Output Arguments:
+ *
+ *   pSize - receives the number of bytes required for the specification
+ *            structure; must not be NULL
+ *
+ * Return Value:
+ *
+ *    OMX_Sts_BadArgErr - pSize is NULL, or order is outside
+ *                        [1, TWIDDLE_TABLE_ORDER]
+ *    otherwise         - the result of omxSP_FFTGetBufSize_C_SC32
+ *
+ */
+
+OMXResult omxSP_FFTGetBufSize_C_FC32(OMX_INT order, OMX_INT *pSize) {
+  /* Guard clauses: a missing output pointer or an out-of-range order. */
+  if (!pSize)
+    return OMX_Sts_BadArgErr;
+  if ((order < 1) || (order > TWIDDLE_TABLE_ORDER))
+    return OMX_Sts_BadArgErr;
+
+  /* Same element and spec-structure sizes as C_SC32, so reuse its answer. */
+  return omxSP_FFTGetBufSize_C_SC32(order, pSize);
+}
--- a/media/openmax_dl/dl/sp/src/omxSP_FFTGetBufSize_C_SC16.c
+++ b/media/openmax_dl/dl/sp/src/omxSP_FFTGetBufSize_C_SC16.c
@ -0,0 +1,96 @@
+/*
+ *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ *
+ *  This file was originally licensed as follows. It has been
+ *  relicensed with permission from the copyright holders.
+ */
+
+/**
+ * 
+ * File Name:  omxSP_FFTGetBufSize_C_SC16.c
+ * OpenMAX DL: v1.0.2
+ * Last Modified Revision:   9468
+ * Last Modified Date:       Thu, 03 Jan 2008
+ * 
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ * 
+ * 
+ * Description:
+ * Compute the size of the specification structure required
+ */
+
+#include "dl/api/armOMX.h"
+#include "dl/api/omxtypes.h"
+#include "dl/sp/api/armSP.h"
+#include "dl/sp/api/omxSP.h"
+
+
+/**
+ * Function:  omxSP_FFTGetBufSize_C_SC16   (2.2.4.1.6)
+ *
+ * Description:
+ * These functions compute the size of the specification structure
+ * required for the length 2^order complex FFT and IFFT functions. The function
+ * <FFTGetBufSize_C_SC16> is used in conjunction with the 16-bit functions
+ * <FFTFwd_CToC_SC16_Sfs> and <FFTInv_CToC_SC16_Sfs>.
+ *
+ * Input Arguments:
+ *
+ *   order - base-2 logarithm of the desired block length; valid in the range
+ *            [0,12]
+ *
+ * Output Arguments:
+ *
+ *   pSize - pointer to the number of bytes required for the specification
+ *            structure; must not be NULL
+ *
+ * Return Value:
+ *
+ *    OMX_Sts_NoErr - no error
+ *    OMX_Sts_BadArgErr - pSize is NULL, or order is negative or greater
+ *                        than TWIDDLE_TABLE_ORDER
+ */
+
+
+OMXResult omxSP_FFTGetBufSize_C_SC16(
+     OMX_INT order,
+     OMX_INT *pSize)
+{
+    OMX_INT     N,twiddleSize;
+
+    /* pSize is dereferenced and order is a shift count: validate both first */
+    if (!pSize || (order < 0) || (order > TWIDDLE_TABLE_ORDER))
+        return OMX_Sts_BadArgErr;
+
+    /* Order zero needs only the bare structure: no twiddles, no buffer */
+    if (order == 0)
+    {
+        *pSize = sizeof(ARMsFFTSpec_SC16);
+        return OMX_Sts_NoErr;
+    }
+
+    N = 1 << order;
+
+    /*The max size of the twiddle table needed is 3N/4 for a radix-4 stage*/
+    twiddleSize = 3*N/4;
+
+    *pSize = sizeof(ARMsFFTSpec_SC16)
+        /* Twiddle factors  */
+           + sizeof(OMX_SC16) * twiddleSize
+        /* Ping Pong buffer   */
+           + sizeof(OMX_SC16) * N
+           + 62 ;  /* Extra bytes to get 32 byte alignment of ptwiddle and pBuf */
+
+    return OMX_Sts_NoErr;
+}
+
+/*****************************************************************************
+ *                              END OF FILE
+ *****************************************************************************/
+
--- a/media/openmax_dl/dl/sp/src/omxSP_FFTGetBufSize_C_SC32.c
+++ b/media/openmax_dl/dl/sp/src/omxSP_FFTGetBufSize_C_SC32.c
@ -0,0 +1,94 @@
+/*
+ *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ *
+ *  This file was originally licensed as follows. It has been
+ *  relicensed with permission from the copyright holders.
+ */
+
+/**
+ * 
+ * File Name:  omxSP_FFTGetBufSize_C_SC32.c
+ * OpenMAX DL: v1.0.2
+ * Last Modified Revision:   9468
+ * Last Modified Date:       Thu, 03 Jan 2008
+ * 
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ * 
+ * 
+ * Description:
+ * Compute the size of the specification structure required
+ */
+
+#include "dl/api/armOMX.h"
+#include "dl/api/omxtypes.h"
+#include "dl/sp/api/armSP.h"
+#include "dl/sp/api/omxSP.h"
+
+
+/**
+ * Function:  omxSP_FFTGetBufSize_C_SC32   (2.2.4.1.6)
+ *
+ * Description:
+ * These functions compute the size of the specification structure
+ * required for the length 2^order complex FFT and IFFT functions. The function
+ * <FFTGetBufSize_C_SC32> is used in conjunction with the 32-bit functions
+ * <FFTFwd_CToC_SC32_Sfs> and <FFTInv_CToC_SC32_Sfs>.
+ *
+ * Input Arguments:
+ *
+ *   order - base-2 logarithm of the desired block length; valid in the range
+ *            [0,12]
+ *
+ * Output Arguments:
+ *
+ *   pSize - pointer to the number of bytes required for the specification
+ *            structure; must not be NULL
+ *
+ * Return Value:
+ *
+ *    OMX_Sts_NoErr - no error
+ *    OMX_Sts_BadArgErr - pSize is NULL, or order is negative or greater
+ *                        than TWIDDLE_TABLE_ORDER
+ */
+
+OMXResult omxSP_FFTGetBufSize_C_SC32(
+     OMX_INT order,
+     OMX_INT *pSize)
+{
+    OMX_INT     N,twiddleSize;
+
+    /* pSize is dereferenced and order is a shift count: validate both first */
+    if (!pSize || (order < 0) || (order > TWIDDLE_TABLE_ORDER))
+        return OMX_Sts_BadArgErr;
+
+    /* Order zero needs only the bare structure: no twiddles, no buffer */
+    if (order == 0)
+    {
+        *pSize = sizeof(ARMsFFTSpec_SC32);
+        return OMX_Sts_NoErr;
+    }
+
+    N = 1 << order;
+    /*The max size of the twiddle table needed is 3N/4 for a radix-4 stage*/
+    twiddleSize = 3*N/4;
+
+    *pSize = sizeof(ARMsFFTSpec_SC32)
+        /* Twiddle factors (3N/4 of them)  */
+           + sizeof(OMX_SC32) * twiddleSize
+        /* Ping Pong buffer   */
+           + sizeof(OMX_SC32) * N
+           + 62 ;  /* Extra bytes to get 32 byte alignment of ptwiddle and pBuf */
+
+    return OMX_Sts_NoErr;
+}
+
+/*****************************************************************************
+ *                              END OF FILE
+ *****************************************************************************/
+
--- a/media/openmax_dl/dl/sp/src/omxSP_FFTGetBufSize_R_F32.c
+++ b/media/openmax_dl/dl/sp/src/omxSP_FFTGetBufSize_R_F32.c
@ -0,0 +1,49 @@
+/*
+ *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#include "dl/api/armOMX.h"
+#include "dl/api/omxtypes.h"
+#include "dl/sp/api/armSP.h"
+#include "dl/sp/api/omxSP.h"
+
+/**
+ * Function: omxSP_FFTGetBufSize_R_F32
+ *
+ * Description:
+ * Reports how many bytes the specification structure needs for the length
+ * 2^order real FFT and IFFT functions.
+ *
+ * Remarks:
+ * This function is used in conjunction with the 32-bit functions
+ * <FFTFwd_RToCCS_F32_Sfs> and <FFTInv_CCSToR_F32_Sfs>.
+ *
+ * Parameters:
+ * [in]  order       base-2 logarithm of the length; valid in the range
+ *                   [1,12]. ([1,15] if BIG_FFT_TABLE is defined.)
+ * [out] pSize       receives the number of bytes required for the
+ *                   specification structure; must not be NULL.
+ *
+ * Return Value:
+ * OMX_Sts_BadArgErr if pSize is NULL or order is outside
+ * [1, TWIDDLE_TABLE_ORDER]; otherwise the result of omxSP_FFTGetBufSize_R_S32.
+ */
+
+OMXResult omxSP_FFTGetBufSize_R_F32(OMX_INT order, OMX_INT *pSize) {
+  /* Guard clauses: a missing output pointer or an out-of-range order. */
+  if (!pSize)
+    return OMX_Sts_BadArgErr;
+  if ((order < 1) || (order > TWIDDLE_TABLE_ORDER))
+    return OMX_Sts_BadArgErr;
+
+  /* Element sizes match R_S32 and ARMsFFTSpec_R_SC32 is the same size as
+   * ARMsFFTSpec_R_FC32, so the R_S32 answer is reused unchanged. */
+  return omxSP_FFTGetBufSize_R_S32(order, pSize);
+}
--- a/media/openmax_dl/dl/sp/src/omxSP_FFTGetBufSize_R_S16S32.c
+++ b/media/openmax_dl/dl/sp/src/omxSP_FFTGetBufSize_R_S16S32.c
@ -0,0 +1,91 @@
+/*
+ *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ *
+ *  This file was originally licensed as follows. It has been
+ *  relicensed with permission from the copyright holders.
+ */
+
+/**
+ * 
+ * File Name:  omxSP_FFTGetBufSize_R_S16S32.c
+ * OpenMAX DL: v1.0.2
+ * Last Modified Revision:   7777
+ * Last Modified Date:       Thu, 27 Sep 2007
+ * 
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ * 
+ * 
+ * Description:
+ * Computes the size of the specification structure required.
+ */
+
+#include "dl/api/armOMX.h"
+#include "dl/api/omxtypes.h"
+#include "dl/sp/api/armSP.h"
+#include "dl/sp/api/omxSP.h"
+
+/**
+ * Function: omxSP_FFTGetBufSize_R_S16S32
+ *
+ * Description:
+ * Computes the size of the specification structure required for the length
+ * 2^order real FFT and IFFT functions.
+ *
+ * Remarks:
+ * This function is used in conjunction with the 16-bit functions
+ * <FFTFwd_RToCCS_S16_S32_Sfs> and <FFTInv_CCSToR_S32_S16_Sfs>.
+ *
+ * Parameters:
+ * [in]  order       base-2 logarithm of the length; valid in the range
+ *                   [0,12].
+ * [out] pSize       pointer to the number of bytes required for the
+ *                   specification structure; must not be NULL.
+ *
+ * Return Value:
+ * OMX_Sts_NoErr on success; OMX_Sts_BadArgErr if pSize is NULL or order is
+ * negative or greater than TWIDDLE_TABLE_ORDER.
+ */
+
+OMXResult omxSP_FFTGetBufSize_R_S16S32(
+     OMX_INT order,
+     OMX_INT *pSize
+ )
+{
+    OMX_INT     NBy2,N,twiddleSize;
+
+    /* pSize is dereferenced and order is a shift count: validate both first */
+    if (!pSize || (order < 0) || (order > TWIDDLE_TABLE_ORDER))
+        return OMX_Sts_BadArgErr;
+
+    /* Order zero: the structure plus two OMX_S32 words, no twiddle table */
+    if (order == 0)
+    {
+        *pSize = sizeof(ARMsFFTSpec_R_SC32)
+                 + sizeof(OMX_S32) * (2); /* Extra size 'N' is used in FFTInv_CCSToR_S32S16_Sfs as a temporary buf */
+        return OMX_Sts_NoErr;
+    }
+
+    NBy2 = 1 << (order - 1);
+    N = NBy2<<1;
+    twiddleSize = 5*N/8;            /* 3/4(N/2) + N/4 */
+
+    *pSize = sizeof(ARMsFFTSpec_R_SC32)
+        /* Twiddle factors  */
+           + sizeof(OMX_SC32) * twiddleSize
+        /* Ping Pong buffer for doing the N/2 point complex FFT  */
+           + sizeof(OMX_S32) * (N<<1)  /* Extra size 'N' is used in FFTInv_CCSToR_S32S16_Sfs as a temporary buf */
+           + 62 ;  /* Extra bytes to get 32 byte alignment of ptwiddle and pBuf */
+
+    return OMX_Sts_NoErr;
+}
+
+/*****************************************************************************
+ *                              END OF FILE
+ *****************************************************************************/
+
--- a/media/openmax_dl/dl/sp/src/omxSP_FFTGetBufSize_R_S32.c
+++ b/media/openmax_dl/dl/sp/src/omxSP_FFTGetBufSize_R_S32.c
@ -0,0 +1,91 @@
+/*
+ *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ *
+ *  This file was originally licensed as follows. It has been
+ *  relicensed with permission from the copyright holders.
+ */
+
+/**
+ * 
+ * File Name:  omxSP_FFTGetBufSize_R_S32.c
+ * OpenMAX DL: v1.0.2
+ * Last Modified Revision:   7777
+ * Last Modified Date:       Thu, 27 Sep 2007
+ * 
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ * 
+ * 
+ * Description:
+ * Computes the size of the specification structure required.
+ */
+
+#include "dl/api/armOMX.h"
+#include "dl/api/omxtypes.h"
+#include "dl/sp/api/armSP.h"
+#include "dl/sp/api/omxSP.h"
+
+/**
+ * Function: omxSP_FFTGetBufSize_R_S32
+ *
+ * Description:
+ * Computes the size of the specification structure required for the length
+ * 2^order real FFT and IFFT functions.
+ *
+ * Remarks:
+ * This function is used in conjunction with the 32-bit functions
+ * <FFTFwd_RToCCS_S32_Sfs> and <FFTInv_CCSToR_S32_Sfs>.
+ *
+ * Parameters:
+ * [in]  order       base-2 logarithm of the length; valid in the range
+ *			   [0,12].
+ * [out] pSize	   pointer to the number of bytes required for the
+ *			   specification structure.
+ *
+ * Return Value:
+ * Standard omxError result. See enumeration for possible result codes.
+ *
+ */
+
+OMXResult omxSP_FFTGetBufSize_R_S32(
+     OMX_INT order,
+     OMX_INT *pSize
+ )
+{
+    OMX_INT     N,twiddleSize;
+
+    /*
+     * Reject arguments outside the documented contract before touching
+     * them: pSize is written unconditionally below, and the size formula
+     * is only meaningful for orders in [0,12].
+     */
+    if (!pSize || (order < 0) || (order > 12))
+        return OMX_Sts_BadArgErr;
+
+    /* Order zero: no twiddle factors, only the header and a small buffer */
+    if (order == 0)
+    {
+        *pSize = sizeof(ARMsFFTSpec_R_SC32)
+                + sizeof(OMX_S32) * (2); /* Extra size 'N' is used in FFTInv_CCSToR_S32_Sfs as a temporary buf */
+        return OMX_Sts_NoErr;
+    }
+
+    N = 1 << order;
+    twiddleSize = 5*N/8;            /* 3/4(N/2) + N/4 */
+
+    *pSize = sizeof(ARMsFFTSpec_R_SC32)
+           + sizeof(OMX_SC32) * twiddleSize  /* Twiddle factors */
+           + sizeof(OMX_S32) * (N<<1)  /* Ping Pong buffer for the N/2 point complex FFT; extra size 'N' is used in FFTInv_CCSToR_S32_Sfs as a temporary buf */
+           + 62 ;  /* Extra bytes to get 32 byte alignment of ptwiddle and pBuf */
+    return OMX_Sts_NoErr;
+}
+
+/*****************************************************************************
+ *                              END OF FILE
+ *****************************************************************************/
+
--- a/media/openmax_dl/dl/sp/src/omxSP_FFTInit_C_FC32.c
+++ b/media/openmax_dl/dl/sp/src/omxSP_FFTInit_C_FC32.c
@ -0,0 +1,162 @@
+/*
+ *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ *
+ *  This is a modification of omxSP_FFTInit_C_SC32.c to support
+ *  complex float instead of SC32.
+ */
+
+#include "dl/api/armOMX.h"
+#include "dl/api/omxtypes.h"
+#include "dl/sp/api/armSP.h"
+#include "dl/sp/api/omxSP.h"
+
+
+/**
+ * Function: omxSP_FFTInit_C_FC32
+ *
+ * Description:
+ * Initializes the specification structures required for the
+ * complex FFT and IFFT functions.
+ *
+ * Remarks:
+ * Desired block length is specified as an input. The function is used to
+ * initialize the specification structures for functions <FFTFwd_CToC_FC32_Sfs>
+ * and <FFTInv_CToC_FC32_Sfs>. Memory for the specification structure *pFFTSpec
+ * must be allocated prior to calling this function. The space required for
+ * *pFFTSpec, in bytes, can be determined using <FFTGetBufSize_C_FC32>.
+ *
+ * Parameters:
+ * [in]  order       base-2 logarithm of the desired block length;
+ *                     valid in the range [1,12]. ([1,15] if
+ *                     BIG_FFT_TABLE is defined.)
+ * [out] pFFTSpec    pointer to initialized specification structure.
+ *
+ * Return Value:
+ * Standard omxError result. See enumeration for possible result codes.
+ *
+ */
+
+OMXResult omxSP_FFTInit_C_FC32(OMXFFTSpec_C_FC32* pFFTSpec, OMX_INT order) {
+  ARMsFFTSpec_FC32* spec = (ARMsFFTSpec_FC32*) pFFTSpec;
+  OMX_FC32* tw;      /* twiddle factor table */
+  OMX_FC32* q1;      /* &tw[2 * eighth] */
+  OMX_FC32* q2;      /* &tw[4 * eighth] */
+  OMX_FC32* q3;      /* &tw[6 * eighth] */
+  OMX_FC32* work;    /* buffer that follows the twiddle factors */
+  OMX_U32 misalign;
+  OMX_INT len;
+  OMX_INT eighth;
+  OMX_INT stride;
+  OMX_INT k;
+  OMX_INT src;
+  OMX_F32 re;
+  OMX_F32 im;
+  OMX_F32 one = 1;
+
+  /* Validate args */
+  if (!pFFTSpec || (order < 1) || (order > TWIDDLE_TABLE_ORDER))
+    return OMX_Sts_BadArgErr;
+
+  /* Block length N = 2^order and the N/8 count that drives the fill loop */
+  len = (1 << (order - 1)) << 1;
+  eighth = len >> 3;
+
+  /* The twiddles start at the first 32 byte boundary after the header */
+  tw = (OMX_FC32*) ((OMX_S8*) pFFTSpec + sizeof(ARMsFFTSpec_FC32));
+  misalign = ((OMX_U32) tw) & 31;
+  if (misalign != 0)
+    tw = (OMX_FC32*) ((OMX_S8*) tw + (32 - misalign));
+
+  /* The buffer comes 3N/4 twiddle entries later, again 32 byte aligned */
+  work = (OMX_FC32*) ((OMX_S8*) tw + sizeof(OMX_FC32) * (3 * len / 4));
+  misalign = ((OMX_U32) work) & 31;
+  if (misalign != 0)
+    work = (OMX_FC32*) ((OMX_S8*) work + (32 - misalign));
+
+  /*
+   * Filling Twiddle factors :
+   *
+   * The twiddle table "armSP_FFT_F32TwiddleTable" is of size
+   * (MaxSize/8 + 1). Rest of the values, i.e. up to MaxSize, are
+   * calculated using the symmetries of sin and cos. The max size of
+   * the twiddle table needed is 3N/4 for a radix-4 stage.
+   *
+   * W = (-2 * PI) / N
+   * N = 1 << order
+   */
+  /* step into the twiddle table for the current order */
+  stride = 1 << (TWIDDLE_TABLE_ORDER - order);
+
+  re = armSP_FFT_F32TwiddleTable[0];
+  im = armSP_FFT_F32TwiddleTable[1];
+
+  switch (order) {
+    case 1:
+      tw[0].Re = re;
+      tw[0].Im = im;
+      break;
+    case 2:
+      tw[0].Re = re;
+      tw[0].Im = im;
+      tw[1].Re = -im;
+      tw[1].Im = one;
+      tw[2].Re = one;
+      tw[2].Im = im;
+      break;
+    default:  /* order >= 3, smaller orders were rejected above */
+      q1 = tw + 2 * eighth;
+      q2 = tw + 4 * eighth;
+      q3 = tw + 6 * eighth;
+
+      /* k = 0 case */
+      tw[0].Re = re;
+      tw[0].Im = im;
+      q1[0].Re = -im;
+      q1[0].Im = one;
+      q2[0].Re = one;
+      q2[0].Im = im;
+
+      /*
+       * Keep the store order: for k == eighth the slots tw[k] and
+       * q1[-k] (and likewise q1[k]/q2[-k], q2[k]/q3[-k]) coincide and
+       * the later store has to win.
+       */
+      for (k = 1; k <= eighth; k++) {
+        src = k * stride;
+        re = armSP_FFT_F32TwiddleTable[2 * src];
+        im = armSP_FFT_F32TwiddleTable[2 * src + 1];
+
+        tw[k].Re = re;
+        tw[k].Im = im;
+        q1[-k].Re = -im;
+        q1[-k].Im = -re;
+        q1[k].Re = im;
+        q1[k].Im = -re;
+        q2[-k].Re = -re;
+        q2[-k].Im = im;
+        q2[k].Re = -re;
+        q2[k].Im = -im;
+        q3[-k].Re = im;
+        q3[-k].Im = re;
+      }
+      break;
+  }
+
+  /* Update the structure; optimized implementations don't use bitreversal */
+  spec->N = len;
+  spec->pTwiddle = tw;
+  spec->pBitRev = NULL;
+  spec->pBuf = work;
+
+  return OMX_Sts_NoErr;
+}
+
+/*****************************************************************************
+ *                              END OF FILE
+ *****************************************************************************/
--- a/media/openmax_dl/dl/sp/src/omxSP_FFTInit_C_SC16.c
+++ b/media/openmax_dl/dl/sp/src/omxSP_FFTInit_C_SC16.c
@ -0,0 +1,201 @@
+/*
+ *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ *
+ *  This file was originally licensed as follows. It has been
+ *  relicensed with permission from the copyright holders.
+ */
+
+/**
+ *
+ * File Name:  omxSP_FFTInit_C_SC16.c
+ * OpenMAX DL: v1.0.2
+ * Last Modified Revision:   15322
+ * Last Modified Date:       Wed, 15 Oct 2008
+ *
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ *
+ *
+ * Description:
+ * Initializes the specification structures required
+ */
+
+#include "dl/api/armOMX.h"
+#include "dl/api/omxtypes.h"
+#include "dl/sp/api/armSP.h"
+#include "dl/sp/api/omxSP.h"
+
+
+/**
+ * Function: omxSP_FFTInit_C_SC16
+ *
+ * Description:
+ * These functions initialize the specification structures required for the
+ * complex FFT and IFFT functions.
+ *
+ * Remarks:
+ * Desired block length is specified as an input. The function is used to
+ * initialize the specification structures for functions <FFTFwd_CToC_SC16_Sfs>
+ * and <FFTInv_CToC_SC16_Sfs>. Memory for the specification structure *pFFTSpec
+ * must be allocated prior to calling this function. The space required for
+ * *pFFTSpec, in bytes, can be determined using <FFTGetBufSize_C_SC16>.
+ *
+ * Parameters:
+ * [in]  order          base-2 logarithm of the desired block length;
+ *                              valid in the range [0,12].
+ * [out] pFFTSpec               pointer to initialized specification structure.
+ *
+ * Return Value:
+ * Standard omxError result. See enumeration for possible result codes.
+ *
+ */
+
+OMXResult omxSP_FFTInit_C_SC16(
+     OMXFFTSpec_C_SC16* pFFTSpec,
+     OMX_INT order
+ )
+ {
+    OMX_INT     i,j;
+    OMX_SC16    *pTwiddle, *pBuf;
+    OMX_INT     N,M,step;
+    OMX_U32     pTmp;
+    ARMsFFTSpec_SC16 *pFFTStruct = 0;
+    OMX_S16     x,y,xNeg;
+    OMX_S16     negX;           /* -x saturated to the OMX_S16 range */
+    OMX_S32     xS32,yS32;
+
+    /*
+     * Validate args: the structure is written through pFFTSpec, and the
+     * step into the twiddle table is 1 << (12 - order), so only the
+     * documented orders [0,12] are accepted.
+     */
+    if (!pFFTSpec || (order < 0) || (order > 12))
+        return OMX_Sts_BadArgErr;
+
+    pFFTStruct = (ARMsFFTSpec_SC16 *) pFFTSpec;
+
+    /* if order zero no init is needed */
+    if (order == 0)
+    {
+        pFFTStruct->N = 1;
+        return OMX_Sts_NoErr;
+    }
+
+    /* Do the initializations */
+    N = 1 << order;
+    M = N>>3;
+
+    /* The twiddle factors are placed right after the header */
+    pTwiddle = (OMX_SC16 *)
+        (sizeof(ARMsFFTSpec_SC16) + (OMX_S8*) pFFTSpec);
+
+    /* Align to 32 byte boundary */
+    pTmp = ((OMX_U32)pTwiddle)&31;              /* (OMX_U32)pTwiddle % 32 */
+    if(pTmp != 0)
+        pTwiddle = (OMX_SC16*) ((OMX_S8*)pTwiddle + (32-pTmp));
+
+    /* The buffer is placed after 3N/4 twiddle factors */
+    pBuf = (OMX_SC16 *)
+        (sizeof(OMX_SC16) * (3*N/4) + (OMX_S8*) pTwiddle);
+
+    /* Align to 32 byte boundary */
+    pTmp = ((OMX_U32)pBuf)&31;                 /* (OMX_U32)pBuf % 32 */
+    if(pTmp != 0)
+        pBuf = (OMX_SC16*) ((OMX_S8*)pBuf + (32-pTmp));
+
+    /*
+     * Filling Twiddle factors :
+     * The values are read from the 32-bit table "armSP_FFT_S32TwiddleTable",
+     * which is of size (MaxSize/8 + 1), and rounded to 16 bits.
+     * Rest of the values i.e., upto MaxSize are calculated using the symmetries of sin and cos
+     * The max size of the twiddle table needed is 3N/4 for a radix-4 stage
+     *
+     * W = (-2 * PI) / N
+     * N = 1 << order
+     * W = -PI >> (order - 1)
+     */
+
+    step = 1<<(12 - order);     /* step into the twiddle table for the current order */
+
+    xS32 = armSP_FFT_S32TwiddleTable[0];
+    yS32 = armSP_FFT_S32TwiddleTable[1];
+    x = (xS32+0x8000)>>16;      /* round the 32-bit entry to 16 bits */
+    y = (yS32+0x8000)>>16;
+
+    xNeg = 0x7FFF;              /* stored where -x would go for the i = 0 entries */
+
+    if(order >=3)
+    {
+        /* i = 0 case */
+        pTwiddle[0].Re = x;
+        pTwiddle[0].Im = y;
+        pTwiddle[2*M].Re = -y;
+        pTwiddle[2*M].Im = xNeg;
+        pTwiddle[4*M].Re = xNeg;
+        pTwiddle[4*M].Im = y;
+
+        for (i=1; i<=M; i++)
+        {
+            j = i*step;
+
+            xS32 = armSP_FFT_S32TwiddleTable[2*j];
+            yS32 = armSP_FFT_S32TwiddleTable[2*j+1];
+            x = (xS32+0x8000)>>16;
+            y = (yS32+0x8000)>>16;
+
+            /*
+             * -x does not fit in OMX_S16 when x is -32768, so saturate
+             * to the same 0x7FFF the i = 0 case uses instead of letting
+             * the conversion wrap. NOTE(review): whether rounded table
+             * entries reach -32768 for i >= 1 depends on table contents
+             * that are not visible here.
+             */
+            negX = (x == -32768) ? xNeg : (OMX_S16)(-x);
+
+            pTwiddle[i].Re = x;
+            pTwiddle[i].Im = y;
+            pTwiddle[2*M-i].Re = -y;
+            pTwiddle[2*M-i].Im = negX;
+            pTwiddle[2*M+i].Re = y;
+            pTwiddle[2*M+i].Im = negX;
+            pTwiddle[4*M-i].Re = negX;
+            pTwiddle[4*M-i].Im = y;
+            pTwiddle[4*M+i].Re = negX;
+            pTwiddle[4*M+i].Im = -y;
+            pTwiddle[6*M-i].Re = y;
+            pTwiddle[6*M-i].Im = x;
+        }
+    }
+    else if (order == 2)
+    {
+        pTwiddle[0].Re = x;
+        pTwiddle[0].Im = y;
+        pTwiddle[1].Re = -y;
+        pTwiddle[1].Im = xNeg;
+        pTwiddle[2].Re = xNeg;
+        pTwiddle[2].Im = y;
+    }
+    else if (order == 1)
+    {
+        pTwiddle[0].Re = x;
+        pTwiddle[0].Im = y;
+    }
+
+    /* Update the structure (no bit-reversal table is set up here) */
+    pFFTStruct->N = N;
+    pFFTStruct->pTwiddle = pTwiddle;
+    pFFTStruct->pBitRev = NULL;
+    pFFTStruct->pBuf = pBuf;
+
+    return OMX_Sts_NoErr;
+}
+
+/*****************************************************************************
+ *                              END OF FILE
+ *****************************************************************************/
+
--- a/media/openmax_dl/dl/sp/src/omxSP_FFTInit_C_SC32.c
+++ b/media/openmax_dl/dl/sp/src/omxSP_FFTInit_C_SC32.c
@ -0,0 +1,196 @@
+/*
+ *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ *
+ *  This file was originally licensed as follows. It has been
+ *  relicensed with permission from the copyright holders.
+ */
+
+/**
+ * 
+ * File Name:  omxSP_FFTInit_C_SC32.c
+ * OpenMAX DL: v1.0.2
+ * Last Modified Revision:   7769
+ * Last Modified Date:       Thu, 27 Sep 2007
+ * 
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ * 
+ * 
+ * Description:
+ * Initializes the specification structures required
+ */
+
+#include "dl/api/armOMX.h"
+#include "dl/api/omxtypes.h"
+#include "dl/sp/api/armSP.h"
+#include "dl/sp/api/omxSP.h"
+
+
+/**
+ * Function: omxSP_FFTInit_C_SC32
+ *
+ * Description:
+ * Initializes the specification structures required for the
+ * complex FFT and IFFT functions.
+ *
+ * Remarks:
+ * Desired block length is specified as an input. The function is used to
+ * initialize the specification structures for functions <FFTFwd_CToC_SC32_Sfs>
+ * and <FFTInv_CToC_SC32_Sfs>. Memory for the specification structure *pFFTSpec
+ * must be allocated prior to calling this function. The space required for
+ * *pFFTSpec, in bytes, can be determined using <FFTGetBufSize_C_SC32>.
+ *
+ * Parameters:
+ * [in]  order       	base-2 logarithm of the desired block length;
+ *				valid in the range [0,12].
+ * [out] pFFTSpec		pointer to initialized specification structure.
+ *
+ * Return Value:
+ * Standard omxError result. See enumeration for possible result codes.
+ *
+ */
+
+OMXResult omxSP_FFTInit_C_SC32(
+     OMXFFTSpec_C_SC32* pFFTSpec,
+     OMX_INT order
+ )
+{
+    OMX_INT     i,j;
+    OMX_SC32    *pTwiddle, *pBuf;
+    OMX_U16     *pBitRev;
+    OMX_U32      pTmp;
+    OMX_INT     Nby2,N,M,diff, step;
+    ARMsFFTSpec_SC32 *pFFTStruct = 0;
+    OMX_S32     x,y,xNeg;
+
+    /*
+     * Validate args: the structure is written through pFFTSpec, and the
+     * step into the twiddle table is 1 << (12 - order), so only the
+     * documented orders [0,12] can be served. Anything else is reported
+     * as a bad argument instead of being dereferenced or shifted.
+     */
+    if (!pFFTSpec || (order < 0) || (order > 12))
+        return OMX_Sts_BadArgErr;
+
+    pFFTStruct = (ARMsFFTSpec_SC32 *) pFFTSpec;
+
+    /* if order zero no init is needed */
+    if (order == 0)
+    {
+        pFFTStruct->N = 1;
+        return OMX_Sts_NoErr;
+    }
+
+    /* Do the initializations */
+    Nby2 = 1 << (order - 1);
+    N = Nby2 << 1;
+    M = N>>3;
+
+    pBitRev = NULL ;                /* optimized implementations don't use bitreversal */
+
+    /* The twiddle factors are placed right after the header */
+    pTwiddle = (OMX_SC32 *)
+        (sizeof(ARMsFFTSpec_SC32) + (OMX_S8*) pFFTSpec);
+
+    /* Align to 32 byte boundary */
+    pTmp = ((OMX_U32)pTwiddle)&31;              /* (OMX_U32)pTwiddle % 32 */
+    if(pTmp != 0)
+        pTwiddle = (OMX_SC32*) ((OMX_S8*)pTwiddle + (32-pTmp));
+
+    /* The buffer is placed after 3N/4 twiddle factors */
+    pBuf = (OMX_SC32*)
+        (sizeof(OMX_SC32) * (3*N/4) + (OMX_S8*) pTwiddle);
+
+    /* Align to 32 byte boundary */
+    pTmp = ((OMX_U32)pBuf)&31;                 /* (OMX_U32)pBuf % 32 */
+    if(pTmp != 0)
+        pBuf = (OMX_SC32*) ((OMX_S8*)pBuf + (32-pTmp));
+
+    /*
+     * Filling Twiddle factors :
+     * The original twiddle table "armSP_FFT_S32TwiddleTable" is of size (MaxSize/8 + 1)
+     * Rest of the values i.e., upto MaxSize are calculated using the symmetries of sin and cos
+     * The max size of the twiddle table needed is 3N/4 for a radix-4 stage
+     *
+     * W = (-2 * PI) / N
+     * N = 1 << order
+     * W = -PI >> (order - 1)
+     */
+
+    diff = 12 - order;
+    step = 1<<diff;             /* step into the twiddle table for the current order */
+
+    x = armSP_FFT_S32TwiddleTable[0];
+    y = armSP_FFT_S32TwiddleTable[1];
+    /* stored where -x would go for the i = 0 entries */
+    xNeg = 0x7FFFFFFF;
+
+    if(order >=3)
+    {
+        /* i = 0 case */
+        pTwiddle[0].Re = x;
+        pTwiddle[0].Im = y;
+        pTwiddle[2*M].Re = -y;
+        pTwiddle[2*M].Im = xNeg;
+        pTwiddle[4*M].Re = xNeg;
+        pTwiddle[4*M].Im = y;
+
+        /*
+         * Store order matters: for i == M the slots i and 2*M-i
+         * (likewise 2*M+i/4*M-i and 4*M+i/6*M-i) coincide and the
+         * later store wins.
+         */
+        for (i=1; i<=M; i++)
+        {
+            j = i*step;
+
+            x = armSP_FFT_S32TwiddleTable[2*j];
+            y = armSP_FFT_S32TwiddleTable[2*j+1];
+
+            pTwiddle[i].Re = x;
+            pTwiddle[i].Im = y;
+            pTwiddle[2*M-i].Re = -y;
+            pTwiddle[2*M-i].Im = -x;
+            pTwiddle[2*M+i].Re = y;
+            pTwiddle[2*M+i].Im = -x;
+            pTwiddle[4*M-i].Re = -x;
+            pTwiddle[4*M-i].Im = y;
+            pTwiddle[4*M+i].Re = -x;
+            pTwiddle[4*M+i].Im = -y;
+            pTwiddle[6*M-i].Re = y;
+            pTwiddle[6*M-i].Im = x;
+        }
+    }
+    else if (order == 2)
+    {
+        pTwiddle[0].Re = x;
+        pTwiddle[0].Im = y;
+        pTwiddle[1].Re = -y;
+        pTwiddle[1].Im = xNeg;
+        pTwiddle[2].Re = xNeg;
+        pTwiddle[2].Im = y;
+    }
+    else if (order == 1)
+    {
+        pTwiddle[0].Re = x;
+        pTwiddle[0].Im = y;
+    }
+
+    /* Update the structure */
+    pFFTStruct->N = N;
+    pFFTStruct->pTwiddle = pTwiddle;
+    pFFTStruct->pBitRev = pBitRev;
+    pFFTStruct->pBuf = pBuf;
+
+    return OMX_Sts_NoErr;
+}
+
+/*****************************************************************************
+ *                              END OF FILE
+ *****************************************************************************/
+
--- a/media/openmax_dl/dl/sp/src/omxSP_FFTInit_R_F32.c
+++ b/media/openmax_dl/dl/sp/src/omxSP_FFTInit_R_F32.c
@ -0,0 +1,210 @@
+/*
+ *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ *
+ *  This is a modification of omxSP_FFTInit_R_S32.c to support float
+ *  instead of S32.
+ */
+
+#include "dl/api/armOMX.h"
+#include "dl/api/omxtypes.h"
+#include "dl/sp/api/armSP.h"
+#include "dl/sp/api/omxSP.h"
+
+/**
+ * Function: omxSP_FFTInit_R_F32
+ *
+ * Description:
+ * Initialize the real forward-FFT specification information struct.
+ *
+ * Remarks:
+ * This function is used to initialize the specification structures
+ * for functions <ippsFFTFwd_RToCCS_F32_Sfs> and
+ * <ippsFFTInv_CCSToR_F32_Sfs>. Memory for *pFFTSpec must be
+ * allocated prior to calling this function. The number of bytes
+ * required for *pFFTSpec can be determined using
+ * <FFTGetBufSize_R_F32>.
+ *
+ * Parameters:
+ * [in]  order       base-2 logarithm of the desired block length;
+ *                         valid in the range [1,12].  ([1,15] if
+ *                         BIG_FFT_TABLE is defined.)
+ * [out] pFFTSpec    pointer to the initialized specification structure.
+ *
+ * Return Value:
+ * Standard omxError result. See enumeration for possible result codes.
+ *
+ */
+OMXResult omxSP_FFTInit_R_F32(OMXFFTSpec_R_F32* pFFTSpec, OMX_INT order) {
+  OMX_INT i;
+  OMX_INT j;
+  OMX_FC32* pTwiddle;
+  OMX_FC32* pTwiddle1;
+  OMX_FC32* pTwiddle2;
+  OMX_FC32* pTwiddle3;
+  OMX_FC32* pTwiddle4;
+  OMX_F32* pBuf;
+  OMX_U16* pBitRev;
+  OMX_U32 pTmp;
+  OMX_INT Nby2;
+  OMX_INT N;
+  OMX_INT M;
+  OMX_INT diff;
+  OMX_INT step;
+  OMX_F32 x;
+  OMX_F32 y;
+  OMX_F32 xNeg;
+  ARMsFFTSpec_R_FC32* pFFTStruct = 0;
+
+  pFFTStruct = (ARMsFFTSpec_R_FC32 *) pFFTSpec;
+
+  /* Validate args */
+  if (!pFFTSpec || (order < 1) || (order > TWIDDLE_TABLE_ORDER))
+    return OMX_Sts_BadArgErr;
+
+  /* Do the initializations */
+  Nby2 = 1 << (order - 1);
+  N = Nby2 << 1;
+
+  pBitRev = NULL;  /* optimized implementations don't use bitreversal */
+
+  /* Skip the header with the struct type that is actually stored here */
+  pTwiddle = (OMX_FC32 *) (sizeof(ARMsFFTSpec_R_FC32) + (OMX_S8*) pFFTSpec);
+
+  /* Align to 32 byte boundary */
+  pTmp = ((OMX_U32)pTwiddle) & 31;
+  if (pTmp)
+    pTwiddle = (OMX_FC32*) ((OMX_S8*)pTwiddle + (32 - pTmp));
+
+  /* The buffer is placed after 5N/8 twiddle factors */
+  pBuf = (OMX_F32*) (sizeof(OMX_FC32)*(5*N/8) + (OMX_S8*) pTwiddle);
+
+  /* Align to 32 byte boundary */
+  pTmp = ((OMX_U32)pBuf)&31;                 /* (OMX_U32)pBuf % 32 */
+  if (pTmp)
+    pBuf = (OMX_F32*) ((OMX_S8*)pBuf + (32 - pTmp));
+
+  /*
+   * Filling Twiddle factors :
+   *
+   * exp^(-j*2*PI*k/ (N/2) ) ; k=0,1,2,...,3/4(N/2)
+   *
+   * N/2 point complex FFT is used to compute N point real FFT The
+   * original twiddle table "armSP_FFT_F32TwiddleTable" is of size
+   * (MaxSize/8 + 1) Rest of the values i.e., upto MaxSize are
+   * calculated using the symmetries of sin and cos The max size of
+   * the twiddle table needed is 3/4(N/2) for a radix-4 stage
+   *
+   * W = (-2 * PI) / N
+   * N = 1 << order
+   * W = -PI >> (order - 1)
+   */
+
+  M = Nby2 >> 3;
+  diff = TWIDDLE_TABLE_ORDER - (order - 1);
+  /* step into the twiddle table for the current order */
+  step = 1 << diff;
+
+  x = armSP_FFT_F32TwiddleTable[0];
+  y = armSP_FFT_F32TwiddleTable[1];
+  xNeg = 1;  /* stored where -x would go for the i = 0 entries */
+
+  if ((order - 1) >= 3) {
+    /* i = 0 case */
+    pTwiddle[0].Re = x;
+    pTwiddle[0].Im = y;
+    pTwiddle[2*M].Re = -y;
+    pTwiddle[2*M].Im = xNeg;
+    pTwiddle[4*M].Re = xNeg;
+    pTwiddle[4*M].Im = y;
+    /* store order matters: for i == M adjacent slots coincide, later store wins */
+    for (i = 1; i <= M; i++) {
+      j = i*step;
+
+      x = armSP_FFT_F32TwiddleTable[2*j];
+      y = armSP_FFT_F32TwiddleTable[2*j+1];
+
+      pTwiddle[i].Re = x;
+      pTwiddle[i].Im = y;
+      pTwiddle[2*M-i].Re = -y;
+      pTwiddle[2*M-i].Im = -x;
+      pTwiddle[2*M+i].Re = y;
+      pTwiddle[2*M+i].Im = -x;
+      pTwiddle[4*M-i].Re = -x;
+      pTwiddle[4*M-i].Im = y;
+      pTwiddle[4*M+i].Re = -x;
+      pTwiddle[4*M+i].Im = -y;
+      pTwiddle[6*M-i].Re = y;
+      pTwiddle[6*M-i].Im = x;
+    }
+  } else if ((order - 1) == 2) {
+    pTwiddle[0].Re = x;
+    pTwiddle[0].Im = y;
+    pTwiddle[1].Re = -y;
+    pTwiddle[1].Im = xNeg;
+    pTwiddle[2].Re = xNeg;
+    pTwiddle[2].Im = y;
+  } else if ((order-1) == 1) {
+    pTwiddle[0].Re = x;
+    pTwiddle[0].Im = y;
+  }
+
+  /*
+   * Now fill the last N/4 values : exp^(-j*2*PI*k/N) ;
+   * k=1,3,5,...,N/2-1 These are used for the final twiddle fix-up for
+   * converting complex to real FFT
+   */
+
+  M = N >> 3;
+  diff = TWIDDLE_TABLE_ORDER - order;
+  step = 1 << diff;
+
+  pTwiddle1 = pTwiddle + 3*N/8;          /* advances by one per iteration */
+  pTwiddle4 = pTwiddle1 + (N/4 - 1);     /* steps back by one per iteration */
+  pTwiddle3 = pTwiddle1 + N/8;           /* advances by one per iteration */
+  pTwiddle2 = pTwiddle1 + (N/8 - 1);     /* steps back by one per iteration */
+
+  x = armSP_FFT_F32TwiddleTable[0];
+  y = armSP_FFT_F32TwiddleTable[1];
+  xNeg = 1;
+
+  if (order >=3) {
+    for (i = 1; i <= M; i += 2) {
+      j = i*step;
+
+      x = armSP_FFT_F32TwiddleTable[2*j];
+      y = armSP_FFT_F32TwiddleTable[2*j+1];
+
+      pTwiddle1[0].Re = x;
+      pTwiddle1[0].Im = y;
+      pTwiddle1 += 1;
+      pTwiddle2[0].Re = -y;
+      pTwiddle2[0].Im = -x;
+      pTwiddle2 -= 1;
+      pTwiddle3[0].Re = y;
+      pTwiddle3[0].Im = -x;
+      pTwiddle3 += 1;
+      pTwiddle4[0].Re = -x;
+      pTwiddle4[0].Im = y;
+      pTwiddle4 -= 1;
+    }
+  } else {
+    if (order == 2) {
+      pTwiddle1[0].Re = -y;
+      pTwiddle1[0].Im = xNeg;
+    }
+  }
+
+  /* Update the structure */
+  pFFTStruct->N = N;
+  pFFTStruct->pTwiddle = pTwiddle;
+  pFFTStruct->pBitRev = pBitRev;
+  pFFTStruct->pBuf = pBuf;
+
+  return OMX_Sts_NoErr;
+}
--- a/media/openmax_dl/dl/sp/src/omxSP_FFTInit_R_S16S32.c
+++ b/media/openmax_dl/dl/sp/src/omxSP_FFTInit_R_S16S32.c
@ -0,0 +1,263 @@
+/*
+ *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ *
+ *  This file was originally licensed as follows. It has been
+ *  relicensed with permission from the copyright holders.
+ */
+
+/**
+ * 
+ * File Name:  omxSP_FFTInit_R_S16S32.c
+ * OpenMAX DL: v1.0.2
+ * Last Modified Revision:   7777
+ * Last Modified Date:       Thu, 27 Sep 2007
+ * 
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ * 
+ * 
+ * Description: 
+ * Initialize the real forward-FFT specification information struct.
+ */
+
+#include "dl/api/armOMX.h"
+#include "dl/api/omxtypes.h"
+#include "dl/sp/api/armSP.h"
+#include "dl/sp/api/omxSP.h"
+
+
+
+/**
+ * Function: omxSP_FFTInit_R_S16S32
+ *
+ * Description:
+ * Initialize the real forward-FFT specification information struct.
+ *
+ * Remarks:
+ * This function is used to initialize the specification structures
+ * for functions <ippsFFTFwd_RToCCS_S16_S32_Sfs> and
+ * <ippsFFTInv_CCSToR_S32_S16_Sfs>. Memory for *pFFTSpec must be
+ * allocated prior to calling this function. The number of bytes
+ * required for *pFFTSpec can be determined using
+ * <FFTGetBufSize_R_S16_S32>.
+ *
+ * Parameters:
+ * [in]  order       base-2 logarithm of the desired block length;
+ *			   valid in the range [0,12].
+ * [out] pFFTSpec    pointer to the initialized specification structure.
+ *
+ * Return Value:
+ * Standard omxError result. See enumeration for possible result codes.
+ *
+ */
+
+OMXResult omxSP_FFTInit_R_S16S32(
+     OMXFFTSpec_R_S16S32* pFFTSpec,
+     OMX_INT order
+)
+{
+    OMX_INT     i,j;
+    OMX_SC32    *pTwiddle,*pTwiddle1,*pTwiddle2,*pTwiddle3,*pTwiddle4;
+    OMX_S32     *pBuf;
+    OMX_U16     *pBitRev;
+    OMX_U32     pTmp;
+    OMX_INT     Nby2,N,M,diff, step;
+    OMX_S32     x,y,xNeg;
+    ARMsFFTSpec_R_SC32 *pFFTStruct = 0;
+
+    /*
+     * Validate args: the structure is written through pFFTSpec, and the
+     * steps into the twiddle table are 1 << (12 - (order-1)) and
+     * 1 << (12 - order), so only the documented orders [0,12] are
+     * accepted.
+     */
+    if (!pFFTSpec || (order < 0) || (order > 12))
+        return OMX_Sts_BadArgErr;
+
+    pFFTStruct = (ARMsFFTSpec_R_SC32 *) pFFTSpec;
+
+    /* if order zero no twiddle factors are needed; pBuf directly follows the header */
+    if (order == 0)
+    {
+        pFFTStruct->N = 1;
+        pFFTStruct->pTwiddle = NULL;
+        pFFTStruct->pBuf = (OMX_S32 *)
+               (sizeof(ARMsFFTSpec_R_SC32) + (OMX_S8*) pFFTSpec);
+
+        return OMX_Sts_NoErr;
+    }
+
+    /* Do the initializations */
+    Nby2 = 1 << (order - 1);
+    N = Nby2 << 1;
+
+    pBitRev = NULL ;                /* optimized implementations don't use bitreversal */
+
+    /* The twiddle factors are placed right after the header */
+    pTwiddle = (OMX_SC32 *)
+        (sizeof(ARMsFFTSpec_R_SC32) + (OMX_S8*) pFFTSpec);
+
+    /* Align to 32 byte boundary */
+    pTmp = ((OMX_U32)pTwiddle)&31;              /* (OMX_U32)pTwiddle % 32 */
+    if(pTmp != 0)
+        pTwiddle = (OMX_SC32*) ((OMX_S8*)pTwiddle + (32-pTmp));
+
+    /* The buffer is placed after 5N/8 twiddle factors */
+    pBuf = (OMX_S32*)
+        (sizeof(OMX_SC32) * (5*N/8) + (OMX_S8*) pTwiddle);
+
+    /* Align to 32 byte boundary */
+    pTmp = ((OMX_U32)pBuf)&31;                 /* (OMX_U32)pBuf % 32 */
+    if(pTmp != 0)
+        pBuf = (OMX_S32*) ((OMX_S8*)pBuf + (32-pTmp));
+
+    /*
+     * Filling Twiddle factors : exp^(-j*2*PI*k/ (N/2) ) ; k=0,1,2,...,3/4(N/2)
+     * N/2 point complex FFT is used to compute N point real FFT
+     * The original twiddle table "armSP_FFT_S32TwiddleTable" is of size (MaxSize/8 + 1)
+     * Rest of the values i.e., upto MaxSize are calculated using the symmetries of sin and cos
+     * The max size of the twiddle table needed is 3/4(N/2) for a radix-4 stage
+     *
+     * W = (-2 * PI) / N
+     * N = 1 << order
+     * W = -PI >> (order - 1)
+     */
+
+    M = Nby2>>3;
+    diff = 12 - (order-1);
+    step = 1<<diff;             /* step into the twiddle table for the current order */
+
+    x = armSP_FFT_S32TwiddleTable[0];
+    y = armSP_FFT_S32TwiddleTable[1];
+    /* stored where -x would go for the i = 0 entries */
+    xNeg = 0x7FFFFFFF;
+
+    if((order-1) >=3)
+    {
+        /* i = 0 case */
+        pTwiddle[0].Re = x;
+        pTwiddle[0].Im = y;
+        pTwiddle[2*M].Re = -y;
+        pTwiddle[2*M].Im = xNeg;
+        pTwiddle[4*M].Re = xNeg;
+        pTwiddle[4*M].Im = y;
+
+        /*
+         * Store order matters: for i == M the slots i and 2*M-i
+         * (likewise 2*M+i/4*M-i and 4*M+i/6*M-i) coincide and the
+         * later store wins.
+         */
+        for (i=1; i<=M; i++)
+        {
+            j = i*step;
+
+            x = armSP_FFT_S32TwiddleTable[2*j];
+            y = armSP_FFT_S32TwiddleTable[2*j+1];
+
+            pTwiddle[i].Re = x;
+            pTwiddle[i].Im = y;
+            pTwiddle[2*M-i].Re = -y;
+            pTwiddle[2*M-i].Im = -x;
+            pTwiddle[2*M+i].Re = y;
+            pTwiddle[2*M+i].Im = -x;
+            pTwiddle[4*M-i].Re = -x;
+            pTwiddle[4*M-i].Im = y;
+            pTwiddle[4*M+i].Re = -x;
+            pTwiddle[4*M+i].Im = -y;
+            pTwiddle[6*M-i].Re = y;
+            pTwiddle[6*M-i].Im = x;
+        }
+    }
+    else
+    {
+        /* N/2 is at most 4 here, so the few entries are written directly */
+        if ((order-1) == 2)
+        {
+            pTwiddle[0].Re = x;
+            pTwiddle[0].Im = y;
+            pTwiddle[1].Re = -y;
+            pTwiddle[1].Im = xNeg;
+            pTwiddle[2].Re = xNeg;
+            pTwiddle[2].Im = y;
+        }
+        if ((order-1) == 1)
+        {
+            pTwiddle[0].Re = x;
+            pTwiddle[0].Im = y;
+        }
+    }
+
+    /*
+     * Now fill the last N/4 values : exp^(-j*2*PI*k/N) ;  k=1,3,5,...,N/2-1
+     * These are used for the final twiddle fix-up for converting complex to real FFT
+     */
+
+    M = N>>3;
+    diff = 12 - order;
+    step = 1<<diff;
+
+    /*
+     * For N >= 8: pTwiddle1 and pTwiddle3 walk forward from offsets
+     * 3N/8 and N/2, pTwiddle2 and pTwiddle4 walk backward from offsets
+     * N/2 - 1 and 5N/8 - 1.
+     */
+    pTwiddle1 = pTwiddle + 3*N/8;
+    pTwiddle4 = pTwiddle1 + (N/4-1);
+    pTwiddle3 = pTwiddle1 + N/8;
+    pTwiddle2 = pTwiddle1 + (N/8-1);
+
+    x = armSP_FFT_S32TwiddleTable[0];
+    y = armSP_FFT_S32TwiddleTable[1];
+    xNeg = 0x7FFFFFFF;
+
+    if((order) >=3)
+    {
+        /* the step of 2 visits only odd i, matching k=1,3,5,... above */
+        for (i=1; i<=M; i+=2 )
+        {
+            j = i*step;
+
+            x = armSP_FFT_S32TwiddleTable[2*j];
+            y = armSP_FFT_S32TwiddleTable[2*j+1];
+
+            pTwiddle1[0].Re = x;
+            pTwiddle1[0].Im = y;
+            pTwiddle1 += 1;
+            pTwiddle2[0].Re = -y;
+            pTwiddle2[0].Im = -x;
+            pTwiddle2 -= 1;
+            pTwiddle3[0].Re = y;
+            pTwiddle3[0].Im = -x;
+            pTwiddle3 += 1;
+            pTwiddle4[0].Re = -x;
+            pTwiddle4[0].Im = y;
+            pTwiddle4 -= 1;
+        }
+    }
+    else
+    {
+        if (order == 2)
+        {
+            /* single fix-up entry; x and y still hold table entries 0 and 1 */
+            pTwiddle1[0].Re = -y;
+            pTwiddle1[0].Im = xNeg;
+        }
+    }
+
+    /* Update the structure */
+    pFFTStruct->N = N;
+    pFFTStruct->pTwiddle = pTwiddle;
+    pFFTStruct->pBitRev = pBitRev;
+    pFFTStruct->pBuf = pBuf;
+
+    return OMX_Sts_NoErr;
+}
+/*****************************************************************************
+ *                              END OF FILE
+ *****************************************************************************/
+
--- a/media/openmax_dl/dl/sp/src/omxSP_FFTInit_R_S32.c
+++ b/media/openmax_dl/dl/sp/src/omxSP_FFTInit_R_S32.c
@ -0,0 +1,261 @@
+/*
+ *  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ *
+ *  This file was originally licensed as follows. It has been
+ *  relicensed with permission from the copyright holders.
+ */
+
+/**
+ * 
+ * File Name:  omxSP_FFTInit_R_S32.c
+ * OpenMAX DL: v1.0.2
+ * Last Modified Revision:   7777
+ * Last Modified Date:       Thu, 27 Sep 2007
+ * 
+ * (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+ * 
+ * 
+ * Description: 
+ * Initialize the real forward-FFT specification information struct.
+ */
+
+#include "dl/api/armOMX.h"
+#include "dl/api/omxtypes.h"
+#include "dl/sp/api/armSP.h"
+#include "dl/sp/api/omxSP.h"
+
+
+
+/**
+ * Function: omxSP_FFTInit_R_S32
+ *
+ * Description:
+ * Initialize the real FFT specification structure: place the twiddle
+ * table and the scratch buffer behind the header and fill the twiddles.
+ *
+ * Remarks:
+ * Memory for *pFFTSpec must be allocated prior to calling this function
+ * (see <FFTGetBufSize_R_S32> for the required size). The structure is
+ * used by <ippsFFTFwd_RToCCS_S32_Sfs> and <ippsFFTInv_CCSToR_S32_Sfs>.
+ *
+ * Parameters:
+ * [out] pFFTSpec    pointer to the specification structure to initialize.
+ * [in]  order       base-2 logarithm of the desired block length;
+ *                   valid in the range [0,12].
+ *
+ * Return Value:
+ * OMX_Sts_NoErr on success.
+ * OMX_Sts_BadArgErr if pFFTSpec is NULL or order is outside [0,12]
+ * (such calls previously had undefined behaviour).
+ *
+ */
+OMXResult omxSP_FFTInit_R_S32(
+     OMXFFTSpec_R_S32* pFFTSpec,
+     OMX_INT order
+)
+{
+    OMX_INT     i,j;
+    OMX_SC32    *pTwiddle,*pTwiddle1,*pTwiddle2,*pTwiddle3,*pTwiddle4;
+    OMX_S32     *pBuf;
+    OMX_U16     *pBitRev;
+    OMX_U32     pTmp;
+    OMX_INT     Nby2,N,M,diff, step; 
+    OMX_S32     x,y,xNeg;
+    ARMsFFTSpec_R_SC32 *pFFTStruct = 0;
+
+    /* order > 12 makes the shift count (12 - order) below negative and
+     * order < 0 breaks 1 << (order - 1); both are undefined, so reject them */
+    if (pFFTSpec == NULL || order < 0 || order > 12)
+        return OMX_Sts_BadArgErr;
+
+    pFFTStruct = (ARMsFFTSpec_R_SC32 *) pFFTSpec;
+
+    /* if order zero no twiddles are needed; still set every field */
+    if (order == 0)
+    {
+        pFFTStruct->N = 1;
+        pFFTStruct->pTwiddle = NULL;
+        pFFTStruct->pBitRev = NULL;
+        pFFTStruct->pBuf = (OMX_S32 *)
+               (sizeof(ARMsFFTSpec_R_SC32) + (OMX_S8*) pFFTSpec);
+        return OMX_Sts_NoErr;
+    }
+
+    /* Do the initializations */
+    Nby2 = 1 << (order - 1);
+    N = Nby2 << 1;
+
+    pBitRev = NULL ;                /* optimized implementations don't use bitreversal */
+    
+    pTwiddle = (OMX_SC32 *) 
+        (sizeof(ARMsFFTSpec_R_SC32) + (OMX_S8*) pFFTSpec);
+        
+    /* Align to 32 byte boundary */
+    pTmp = ((OMX_U32)pTwiddle)&31;              /* (OMX_U32)pTwiddle % 32 */
+    if(pTmp != 0)
+        pTwiddle = (OMX_SC32*) ((OMX_S8*)pTwiddle + (32-pTmp));                
+        
+    /* The scratch buffer starts after the 5*N/8 twiddle entries */
+    pBuf = (OMX_S32*)        
+        (sizeof(OMX_SC32) * (5*N/8) + (OMX_S8*) pTwiddle);
+    
+    /* Align to 32 byte boundary */
+    pTmp = ((OMX_U32)pBuf)&31;                 /* (OMX_U32)pBuf % 32 */
+    if(pTmp != 0)
+        pBuf = (OMX_S32*) ((OMX_S8*)pBuf + (32-pTmp));                    
+
+    /* 
+     * Filling Twiddle factors : exp^(-j*2*PI*k/ (N/2) ) ; k=0,1,2,...,3/4(N/2)
+     * N/2 point complex FFT is used to compute N point real FFT
+     * The original twiddle table "armSP_FFT_S32TwiddleTable" is of size (MaxSize/8 + 1)
+     * Rest of the values i.e., upto MaxSize are calculated using the symmetries of sin and cos
+     * The max size of the twiddle table needed is 3/4(N/2) for a radix-4 stage
+     *
+     * W = (-2 * PI) / N 
+     * N = 1 << order
+     * W = -PI >> (order - 1)
+     */
+    
+    M = Nby2>>3;
+    diff = 12 - (order-1);
+    step = 1<<diff;             /* step into the twiddle table for the current order */
+    
+    x = armSP_FFT_S32TwiddleTable[0];
+    y = armSP_FFT_S32TwiddleTable[1];
+    xNeg = 0x7FFFFFFF;
+    
+    if((order-1) >=3)    
+    {
+            /* i = 0 case */
+            pTwiddle[0].Re = x;
+            pTwiddle[0].Im = y;
+            pTwiddle[2*M].Re = -y;
+            pTwiddle[2*M].Im = xNeg;
+            pTwiddle[4*M].Re = xNeg;
+            pTwiddle[4*M].Im = y;
+            
+    
+        /* each table entry yields six outputs through the sin/cos symmetries */
+        for (i=1; i<=M; i++)
+          {
+            j = i*step;
+            
+            x = armSP_FFT_S32TwiddleTable[2*j];
+            y = armSP_FFT_S32TwiddleTable[2*j+1];
+            
+            pTwiddle[i].Re = x;
+            pTwiddle[i].Im = y;
+            pTwiddle[2*M-i].Re = -y;
+            pTwiddle[2*M-i].Im = -x;
+            pTwiddle[2*M+i].Re = y;
+            pTwiddle[2*M+i].Im = -x;
+            pTwiddle[4*M-i].Re = -x;
+            pTwiddle[4*M-i].Im = y;
+            pTwiddle[4*M+i].Re = -x;
+            pTwiddle[4*M+i].Im = -y;
+            pTwiddle[6*M-i].Re = y;
+            pTwiddle[6*M-i].Im = x;
+        }
+        
+     
+    }
+    else
+    {
+        /* small sizes: write the few entries directly */
+        if ((order-1) == 2)
+        {
+            pTwiddle[0].Re = x;
+            pTwiddle[0].Im = y;
+            pTwiddle[1].Re = -y;
+            pTwiddle[1].Im = xNeg;
+            pTwiddle[2].Re = xNeg;
+            pTwiddle[2].Im = y;
+        
+        }
+        if ((order-1) == 1)
+        {
+            pTwiddle[0].Re = x;
+            pTwiddle[0].Im = y;
+        
+        }        
+        
+    
+    }
+    
+    
+    /*
+     * Now fill the last N/4 values : exp^(-j*2*PI*k/N) ;  k=1,3,5,...,N/2-1 
+     * These are used for the final twiddle fix-up for converting complex to real FFT
+     */
+     
+    M = N>>3;
+    diff = 12 - order;
+    step = 1<<diff;
+    
+    /* four cursors into the N/4 entries that start at pTwiddle + 3*N/8 */
+    pTwiddle1 = pTwiddle + 3*N/8;
+    pTwiddle4 = pTwiddle1 + (N/4-1);
+    pTwiddle3 = pTwiddle1 + N/8;
+    pTwiddle2 = pTwiddle1 + (N/8-1);
+    
+    x = armSP_FFT_S32TwiddleTable[0];
+    y = armSP_FFT_S32TwiddleTable[1];
+    xNeg = 0x7FFFFFFF;
+    
+    if((order) >=3)    
+    {
+                        
+        /* odd k only: pTwiddle1/3 walk upwards, pTwiddle2/4 walk downwards */
+        for (i=1; i<=M; i+=2 )
+          {
+            j = i*step;
+            
+            x = armSP_FFT_S32TwiddleTable[2*j];
+            y = armSP_FFT_S32TwiddleTable[2*j+1];
+            
+            pTwiddle1[0].Re = x;
+            pTwiddle1[0].Im = y;
+            pTwiddle1 += 1;
+            pTwiddle2[0].Re = -y;
+            pTwiddle2[0].Im = -x;
+            pTwiddle2 -= 1;
+            pTwiddle3[0].Re = y;
+            pTwiddle3[0].Im = -x;
+            pTwiddle3 += 1;
+            pTwiddle4[0].Re = -x;
+            pTwiddle4[0].Im = y;
+            pTwiddle4 -= 1;
+            
+        }
+        
+     
+    }
+    else
+    {
+        if (order == 2)
+        {
+            
+            pTwiddle1[0].Re = -y;
+            pTwiddle1[0].Im = xNeg;
+            
+        }
+                
+    
+    }
+     
+   
+    /* Update the structure */
+    pFFTStruct->N = N;
+    pFFTStruct->pTwiddle = pTwiddle;
+    pFFTStruct->pBitRev = pBitRev;
+    pFFTStruct->pBuf = pBuf;
+
+    return OMX_Sts_NoErr;
+}
+/*****************************************************************************
+ *                              END OF FILE
+ *****************************************************************************/
+
--- a/media/openmax_dl/dl/sp/src/omxSP_FFTInv_CCSToR_F32_Sfs_s.S
+++ b/media/openmax_dl/dl/sp/src/omxSP_FFTInv_CCSToR_F32_Sfs_s.S
@ -0,0 +1,283 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This is a modification of omxSP_FFTInv_CCSToR_S32_Sfs_s.s
+@//  to support float instead of SC32.
+@//
+
+@//
+@// Description:
+@// Compute an inverse FFT for a complex signal
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+
+
+@// Import symbols required from other files
+@// (For example tables)
+
+        .extern  armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace_unsafe
+        .extern  armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace_unsafe
+        .extern  armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace_unsafe
+        .extern  armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace_unsafe
+        .extern  armSP_FFTInv_CToC_FC32_Radix2_OutOfPlace_unsafe
+        .extern  armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe
+
+
+@// Set debugging level
+@//DEBUG_ON    SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+
+
+
+      @// Guarding implementation by the processor name
+
+@// Import symbols required from other files
+@// (For example tables)
+        .extern  armSP_FFTInv_CToC_FC32_Radix4_ls_OutOfPlace_unsafe
+        .extern  armSP_FFTInv_CToC_FC32_Radix2_ls_OutOfPlace_unsafe
+
+
+@// Input registers: the four arguments arrive in r0-r3.
+@// scale is given a name here but is never referenced by the code in this file.
+#define pSrc            r0
+#define pDst            r1
+#define pFFTSpec        r2
+#define scale           r3
+
+
+@// Output registers
+#define result          r0
+
+@// Local scratch registers
+@// Several names share one register (r4: argScale/tmpOrder/pTwiddle/x0r, r6: subFFTNum/N).
+#define argTwiddle      r1
+#define argDst          r2
+#define argScale        r4
+#define tmpOrder        r4
+#define pTwiddle        r4
+#define pOut            r5
+#define subFFTSize      r7
+#define subFFTNum       r6
+#define N               r6
+#define order           r14
+#define diff            r9
+@// Total number of radix stages required to complete the FFT
+#define count           r8
+#define x0r             r4
+#define x0i             r5
+#define diffMinusOne    r2
+#define round           r3
+
+#define pOut1           r2
+#define size            r7
+#define step            r8
+#define step1           r9
+#define twStep          r10
+#define pTwiddleTmp     r11
+#define argTwiddle1     r12
+#define zero            r14
+
+@// Neon registers
+
+#define dX0     D0.F32
+#define dShift  D1.F32
+#define dX1     D1.F32
+#define dY0     D2.F32
+#define dY1     D3.F32
+#define dX0r    D0.F32
+#define dX0i    D1.F32
+#define dX1r    D2.F32
+#define dX1i    D3.F32
+#define dW0r    D4.F32
+#define dW0i    D5.F32
+#define dW1r    D6.F32
+#define dW1i    D7.F32
+#define dT0     D8.F32
+#define dT1     D9.F32
+#define dT2     D10.F32
+#define dT3     D11.F32
+#define qT0     d12.F32
+#define qT1     d14.F32
+#define qT2     d16.F32
+#define qT3     d18.F32
+#define dY0r    D4.F32
+#define dY0i    D5.F32
+#define dY1r    D6.F32
+#define dY1i    D7.F32
+#define dzero   D20.F32
+
+#define dY2     D4.F32
+#define dY3     D5.F32
+#define dW0     D6.F32
+#define dW1     D7.F32
+#define dW0Tmp  D10.F32
+#define dW1Neg  D11.F32
+
+#define sN      S0.S32
+#define fN      S1.F32
+@// one must be the same as dScale[0]! (S4 is the low half of D2)
+#define dScale  D2.F32
+#define one     S4.F32
+
+
+    @// Allocate stack memory required by the function (one slot for the halved N)
+        M_ALLOC4        complexFFTSize, 4
+@// NOTE(review): M_ALLOC4/M_START/M_STR/M_LDR/M_END are macros not defined here; presumably from armCOMM_s.h
+    @// Write function header
+        M_START     omxSP_FFTInv_CCSToR_F32_Sfs,r11,d15
+
+@ Structure offsets for the FFTSpec (byte offsets, fields 4 bytes apart)
+        .set    ARMsFFTSpec_N, 0
+        .set    ARMsFFTSpec_pBitRev, 4
+        .set    ARMsFFTSpec_pTwiddle, 8
+        .set    ARMsFFTSpec_pBuf, 12
+
+        @// Define stack arguments
+
+        @// Read the FFT size N from the spec structure
+        LDR     N, [pFFTSpec, #ARMsFFTSpec_N]
+
+        @// Read other structure parameters
+        LDR     pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
+        LDR     pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
+
+        @//  N=1 is treated separately: copy one 32-bit element from pSrc to pDst
+        CMP     N,#1
+        BGT     sizeGreaterThanOne
+        VLD1    dX0[0],[pSrc]
+        VST1    dX0[0],[pDst]
+
+        B       End
+
+sizeGreaterThanOne:
+
+        @// Call the preTwiddle Radix2 stage before doing the complex IFFT
+        @// The code after the call still reads N, pOut, pTwiddle and pDst,
+        @// so the helper has to leave those registers intact.
+        BL    armSP_FFTInv_CCSToR_F32_preTwiddleRadix2_unsafe
+
+
+complexIFFT:
+
+        ASR     N,N,#1                             @// N/2 point complex IFFT
+        M_STR   N, complexFFTSize                  @ Save N for scaling later
+        ADD     pSrc,pOut,N,LSL #3                 @// set pSrc as pOut1 = pOut + 8*N bytes (N already halved)
+
+        CLZ     order,N                             @// N = 2^order
+        RSB     order,order,#31                     @// order = 31 - clz(N)
+        MOV     subFFTSize,#1
+        @//MOV     subFFTNum,N
+
+        CMP     order,#3
+        BGT     orderGreaterthan3                   @// order > 3
+
+        CMP     order,#1
+        BGE     orderGreaterthan0                   @// order > 0
+        @// order = 0: copy one D register from pSrc to pDst; flags from the CMP above are still LT
+        VLD1    dX0,[pSrc]
+        VST1    dX0,[pDst]
+        MOV     pSrc,pDst
+        BLT     FFTEnd
+
+orderGreaterthan0:
+        @// set the buffers appropriately for various orders
+        CMP     order,#2
+        MOVNE   argDst,pDst
+        MOVEQ   argDst,pOut
+        @// Pass the first stage destination in RN5
+        MOVEQ   pOut,pDst
+        MOV     argTwiddle,pTwiddle                 @// argTwiddle is r1, so pDst is overwritten here
+
+        BGE     orderGreaterthan1                   @// still the flags of CMP order,#2
+        BLLT    armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace_unsafe  @// order = 1
+        B       FFTEnd
+
+orderGreaterthan1:
+        MOV     tmpOrder,order                          @// tmpOrder = RN 4
+        BL      armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace_unsafe
+        CMP     tmpOrder,#2
+        BLGT    armSP_FFTInv_CToC_FC32_Radix2_OutOfPlace_unsafe     @// extra middle stage only for order = 3
+        BL      armSP_FFTInv_CToC_FC32_Radix2_ls_OutOfPlace_unsafe
+        B       FFTEnd
+
+
+orderGreaterthan3:
+specialScaleCase:
+
+        @// Set input args to fft stages
+        TST     order, #2
+        MOVNE   argDst,pDst
+        MOVEQ   argDst,pOut
+        @// Pass the first stage destination in RN5
+        MOVEQ   pOut,pDst
+        MOV     argTwiddle,pTwiddle
+
+        @//check for even or odd order
+        @// NOTE: The following combination of BLs works even though
+        @// the first BL would corrupt the flags. This is because the end of
+        @// the "grpZeroSetLoop" loop inside
+        @// armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace_unsafe sets the Z flag
+        @// to EQ
+
+        TST     order,#0x00000001
+        BLEQ    armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace_unsafe
+        BLNE    armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace_unsafe
+
+        CMP        subFFTNum,#4
+        BLT     FFTEnd
+
+@// Loop entered with the flags of CMP subFFTNum,#4: EQ selects the last stage
+unscaledRadix4Loop:
+        BEQ        lastStageUnscaledRadix4
+         BL        armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace_unsafe
+         CMP        subFFTNum,#4
+         B        unscaledRadix4Loop
+
+lastStageUnscaledRadix4:
+        BL      armSP_FFTInv_CToC_FC32_Radix4_ls_OutOfPlace_unsafe
+        B        FFTEnd
+
+FFTEnd:                                               @// Does only the scaling
+        @ Scale inverse FFT result by 1/N
+
+        M_LDR   N, complexFFTSize
+        VMOV    sN,N
+        VCVT    fN, sN                  @ fn = fftSize, as a float
+        VMOV    one, 1.0
+        VDIV    one, one, fN            @ one = dScale[0] = 1 / fftSize
+
+
+        @// N = subFFTSize  ; dataptr = pDst
+scaleFFTData:
+        VLD1    {dX0},[pSrc]            @// pSrc contains pDst pointer
+        SUBS    subFFTSize,subFFTSize,#1        @// flags consumed by the BGT below
+        VMUL    dX0, dX0, dScale[0]
+        VST1    {dX0},[pSrc]!           @// store in place and advance to the next D register
+
+        BGT     scaleFFTData
+
+End:
+        @// Set return value
+        MOV     result, #OMX_Sts_NoErr
+
+        @// Write function tail
+        M_END
+
+
+
+        .end
--- a/media/openmax_dl/dl/sp/src/omxSP_FFTInv_CCSToR_S32S16_Sfs_s.S
+++ b/media/openmax_dl/dl/sp/src/omxSP_FFTInv_CCSToR_S32S16_Sfs_s.S
@ -0,0 +1,146 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This file was originally licensed as follows. It has been
+@//  relicensed with permission from the copyright holders.
+@//
+
+@// 
+@// File Name:  omxSP_FFTInv_CCSToR_S32S16_Sfs_s.s
+@// OpenMAX DL: v1.0.2
+@// Last Modified Revision:   7098
+@// Last Modified Date:       Thu, 16 Aug 2007
+@// 
+@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@// 
+@// 
+@//
+@// Description:
+@// Compute an inverse FFT for a complex signal
+@// 
+
+
+        
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+        
+        
+@// Import symbols required from other files
+@// (For example tables)
+        
+        .extern  omxSP_FFTInv_CCSToR_S32_Sfs
+        
+        
+@// Set debugging level        
+@//DEBUG_ON    SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+    
+    
+    
+      @// Guarding implementation by the processor name
+    
+    
+@// Import symbols required from other files
+@// (For example tables)
+     
+    
+@// Input registers
+@// r0, r2 and r3 are not modified before the BL below, so they reach the S32 routine unchanged.
+#define pSrc            r0
+#define pDst            r1
+#define pFFTSpec        r2
+#define scale           r3
+
+
+@// Output registers
+#define result          r0
+
+
+#define N               r6
+#define pOut            r5
+#define pTmpDst         r4
+
+
+@// Neon registers
+@// dY0 and dY0S32 name the same register (D2) with different element types.
+#define dX0     D0.S32
+#define dX01    D1.S32  
+#define qX0     Q0.S32
+#define dY0     D2.S16
+#define dY0S32  D2.S32
+
+
+
+    @// Allocate stack memory required by the function
+        
+    @// Write function header
+        M_START     omxSP_FFTInv_CCSToR_S32S16_Sfs,r11,d15
+        
+        .set    ARMsFFTSpec_N, 0
+        .set    ARMsFFTSpec_pBitRev, 4
+        .set    ARMsFFTSpec_pTwiddle, 8
+        .set    ARMsFFTSpec_pBuf, 12
+
+        @// Define stack arguments
+        
+        @// Read the FFT size N from the spec structure
+        LDR     N, [pFFTSpec, #ARMsFFTSpec_N]
+        
+        @// Read other structure parameters
+        @//LDR     pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
+        LDR     pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
+        
+        @// Keep the caller destination in pTmpDst and point pDst at pBuf + 4*N bytes,
+        @// which becomes the destination argument of the 32-bit routine.
+        MOV     pTmpDst,pDst
+        ADD     pDst,pOut,N, LSL #2
+        
+        
+        BL      omxSP_FFTInv_CCSToR_S32_Sfs
+        @// N, pOut and pTmpDst (r6, r5, r4) are used as they were before the call
+        ADD     pDst,pOut,N, LSL #2
+        
+        CMP     N,#2
+        BGT     copyLoop
+        BEQ     copyS32ToS16
+        VLD1    dX0[0],[pDst]           @// N = 1 : a single S32 element
+        VQMOVN  dY0,qX0                 @// saturating narrow S32 -> S16
+        VST1    dY0[0],[pTmpDst]        @// store one S16 lane
+        
+        B       End
+        
+copyS32ToS16:   
+        @// N = 2 : load two S32 values, narrow, store one 32-bit lane (two S16 values)
+        VLD1    dX0,[pDst]
+        VQMOVN  dY0,qX0
+        VST1    dY0S32[0],[pTmpDst]
+        B       End
+
+copyLoop:               
+        @// N > 2 : four elements per iteration, loop while the remaining count is > 0
+        VLD1    {dX0,dX01},[pDst]!
+        SUBS    N,N,#4
+        VQMOVN  dY0,qX0
+        VST1    dY0,[pTmpDst]!
+        
+        BGT     copyLoop
+                
+                       
+End:                            
+        @// Set return value
+        MOV     result, #OMX_Sts_NoErr       
+
+        @// Write function tail
+        M_END
+        
+    .end
--- a/media/openmax_dl/dl/sp/src/omxSP_FFTInv_CCSToR_S32_Sfs_s.S
+++ b/media/openmax_dl/dl/sp/src/omxSP_FFTInv_CCSToR_S32_Sfs_s.S
@ -0,0 +1,390 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This file was originally licensed as follows. It has been
+@//  relicensed with permission from the copyright holders.
+@//
+
+@// 
+@// File Name:  omxSP_FFTInv_CCSToR_S32_Sfs_s.s
+@// OpenMAX DL: v1.0.2
+@// Last Modified Revision:   7469
+@// Last Modified Date:       Thu, 20 Sep 2007
+@// 
+@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@// 
+@// 
+@//
+@// Description:
+@// Compute an inverse FFT for a complex signal
+@// 
+
+
+        
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+        
+        
+@// Import symbols required from other files
+@// (For example tables)
+        
+        .extern  armSP_FFTInv_CToC_SC32_Sfs_Radix2_fs_OutOfPlace_unsafe 
+        .extern  armSP_FFTInv_CToC_SC32_Radix2_fs_OutOfPlace_unsafe
+        .extern  armSP_FFTInv_CToC_SC32_Radix4_fs_OutOfPlace_unsafe 
+        .extern  armSP_FFTInv_CToC_SC32_Radix8_fs_OutOfPlace_unsafe 
+        .extern  armSP_FFTInv_CToC_SC32_Radix4_OutOfPlace_unsafe
+        .extern  armSP_FFTInv_CToC_SC32_Sfs_Radix4_fs_OutOfPlace_unsafe 
+        .extern  armSP_FFTInv_CToC_SC32_Sfs_Radix8_fs_OutOfPlace_unsafe 
+        .extern  armSP_FFTInv_CToC_SC32_Sfs_Radix4_OutOfPlace_unsafe
+        .extern  armSP_FFTInv_CToC_SC32_Sfs_Radix2_OutOfPlace_unsafe
+        .extern  armSP_FFTInv_CToC_SC32_Radix2_OutOfPlace_unsafe
+        .extern  armSP_FFTInv_CCSToR_S32_Sfs_preTwiddleRadix2_unsafe        
+        .extern  armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe        
+        
+        
+@// Set debugging level        
+@//DEBUG_ON    SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+    
+    
+    
+      @// Guarding implementation by the processor name
+    
+@// Import symbols required from other files
+@// (For example tables)
+        .extern  armSP_FFTInv_CToC_SC32_Radix4_ls_OutOfPlace_unsafe     
+        .extern  armSP_FFTInv_CToC_SC32_Radix2_ls_OutOfPlace_unsafe
+        .extern  armSP_FFTInv_CToC_SC32_Sfs_Radix2_ls_OutOfPlace_unsafe
+        .extern  armSP_FFTInv_CToC_SC32_Sfs_Radix4_ls_OutOfPlace_unsafe
+        
+    
+@// Input registers
+@// The four arguments arrive in r0-r3; scale (r3) is the requested right-shift amount.
+#define pSrc            r0
+#define pDst            r1
+#define pFFTSpec        r2
+#define scale           r3
+
+
+@// Output registers
+#define result          r0
+
+@// Local scratch registers
+@// Several names share one register (r4: argScale/tmpOrder/pTwiddle/x0r, r6: subFFTNum/N).
+#define argTwiddle      r1
+#define argDst          r2
+#define argScale        r4
+#define tmpOrder        r4
+#define pTwiddle        r4
+#define pOut            r5
+#define subFFTSize      r7     
+#define subFFTNum       r6
+#define N               r6
+#define order           r14
+#define diff            r9
+@// Total number of radix stages required to complete the FFT
+#define count           r8
+#define x0r             r4    
+#define x0i             r5
+#define diffMinusOne    r2
+#define round           r3
+
+#define pOut1           r2
+#define size            r7
+#define step            r8            
+#define step1           r9
+#define twStep          r10
+#define pTwiddleTmp     r11
+#define argTwiddle1     r12
+#define zero            r14
+
+@// Neon registers
+
+#define dX0     D0.S32
+#define dShift  D1.S32
+#define dX1     D1.S32
+#define dY0     D2.S32
+#define dY1     D3.S32
+#define dX0r    D0.S32            
+#define dX0i    D1.S32
+#define dX1r    D2.S32
+#define dX1i    D3.S32
+#define dW0r    D4.S32
+#define dW0i    D5.S32
+#define dW1r    D6.S32
+#define dW1i    D7.S32
+#define dT0     D8.S32
+#define dT1     D9.S32
+#define dT2     D10.S32
+#define dT3     D11.S32
+#define qT0     Q6.S64
+#define qT1     Q7.S64
+#define qT2     Q8.S64
+#define qT3     Q9.S64
+#define dY0r    D4.S32
+#define dY0i    D5.S32
+#define dY1r    D6.S32
+#define dY1i    D7.S32
+#define dzero   D20.S32
+
+#define dY2     D4.S32
+#define dY3     D5.S32
+#define dW0     D6.S32
+#define dW1     D7.S32
+#define dW0Tmp  D10.S32
+#define dW1Neg  D11.S32
+
+
+
+    @// Allocate stack memory required by the function (one slot for the residual shift)
+        M_ALLOC4        diffOnStack, 4
+@// NOTE(review): M_ALLOC4/M_START/M_STR/M_LDR/M_END are macros not defined here; presumably from armCOMM_s.h
+    @// Write function header
+        M_START     omxSP_FFTInv_CCSToR_S32_Sfs,r11,d15
+        
+@ Structure offsets for the FFTSpec (byte offsets, fields 4 bytes apart)
+        .set    ARMsFFTSpec_N, 0
+        .set    ARMsFFTSpec_pBitRev, 4
+        .set    ARMsFFTSpec_pTwiddle, 8
+        .set    ARMsFFTSpec_pBuf, 12
+
+        @// Define stack arguments
+        
+        @// Read the FFT size N from the spec structure
+        LDR     N, [pFFTSpec, #ARMsFFTSpec_N]
+        
+        @// Read other structure parameters
+        LDR     pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
+        LDR     pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
+        
+        @//  N=1 is treated separately: shift the single element right by scale (with rounding)
+        CMP     N,#1
+        BGT     sizeGreaterThanOne
+        VLD1    dX0[0],[pSrc]
+        RSB     scale,scale,#0                        @// to use VRSHL for right shift by a variable
+        VMOV    dShift[0],scale
+        VRSHL   dX0,dShift
+        VST1    dX0[0],[pDst]
+                
+        B       End
+        
+sizeGreaterThanOne:     
+        
+        @// Call the preTwiddle Radix2 stage before doing the complex IFFT
+        
+        @// The following conditional BL combination would work since 
+        @// evenOddButterflyLoop in the first call would set Z flag to zero
+        @// scale = 0 selects the unscaled helper, scale > 0 the scaling (Sfs) helper
+        CMP     scale,#0
+        BLEQ    armSP_FFTInv_CCSToR_S32_preTwiddleRadix2_unsafe
+        BLGT    armSP_FFTInv_CCSToR_S32_Sfs_preTwiddleRadix2_unsafe
+                
+        
+        
+complexIFFT:    
+        
+        ASR     N,N,#1                             @// N/2 point complex IFFT 
+        ADD     pSrc,pOut,N,LSL #3                 @// set pSrc as pOut1 = pOut + 8*N bytes (N already halved)
+                        
+        CLZ     order,N                             @// N = 2^order 
+        RSB     order,order,#31                     @// order = 31 - clz(N)
+        MOV     subFFTSize,#1
+        @//MOV     subFFTNum,N
+        
+        ADD     scale,scale,order                   @// FFTInverse has a final scaling factor by N
+        
+        CMP     order,#3
+        BGT     orderGreaterthan3                   @// order > 3
+                
+        CMP     order,#1
+        BGE     orderGreaterthan0                   @// order > 0
+        M_STR   scale, diffOnStack,LT               @// order = 0
+        VLD1    dX0,[pSrc]
+        VST1    dX0,[pDst]
+        MOV     pSrc,pDst
+        BLT     FFTEnd                              @// flags still come from CMP order,#1
+        
+orderGreaterthan0:      
+        @// set the buffers appropriately for various orders
+        CMP     order,#2
+        MOVNE   argDst,pDst        
+        MOVEQ   argDst,pOut
+        MOVEQ   pOut,pDst                           @// Pass the first stage destination in RN5
+        MOV     argTwiddle,pTwiddle
+        @// Store the scale factor and scale at the end
+        SUB     diff,scale,order
+        M_STR   diff, diffOnStack
+        BGE     orderGreaterthan1                   @// still the flags of CMP order,#2
+        BLLT    armSP_FFTInv_CToC_SC32_Sfs_Radix2_fs_OutOfPlace_unsafe  @// order = 1
+        B       FFTEnd
+
+orderGreaterthan1:      
+        MOV     tmpOrder,order                          @// tmpOrder = RN 4
+        BL      armSP_FFTInv_CToC_SC32_Sfs_Radix2_fs_OutOfPlace_unsafe        
+        CMP     tmpOrder,#2
+        BLGT    armSP_FFTInv_CToC_SC32_Sfs_Radix2_OutOfPlace_unsafe     @// extra middle stage only for order = 3
+        BL      armSP_FFTInv_CToC_SC32_Sfs_Radix2_ls_OutOfPlace_unsafe
+        B       FFTEnd
+        
+
+orderGreaterthan3:             
+        @// check scale = 0 or scale = order. NOTE(review): scale already had order added above, so diff here equals the scale passed by the caller
+        SUBS    diff, scale, order                 @// scale > order 
+        MOVGT   scale,order     
+        BGE     specialScaleCase                   @// scale = 0 or scale = order 
+        CMP     scale,#0
+        BEQ     specialScaleCase
+        B       generalScaleCase
+        
+specialScaleCase:                                           @//  scale = 0 or scale = order  and order >= 2     
+        
+        TST     order, #2                           @// Set input args to fft stages
+        MOVNE   argDst,pDst        
+        MOVEQ   argDst,pOut
+        MOVEQ   pOut,pDst                           @// Pass the first stage destination in RN5
+        MOV     argTwiddle,pTwiddle  
+
+        CMP      diff,#0
+        M_STR    diff, diffOnStack
+        BGE      scaleEqualsOrder  
+       
+        @//check for even or odd order
+        @// NOTE: The following combination of BLs works even though the first
+        @// BL would corrupt the flags. This is because the end of the "grpZeroSetLoop" loop inside 
+        @// armSP_FFTInv_CToC_SC32_Radix4_fs_OutOfPlace_unsafe sets the Z flag to EQ
+        
+        TST     order,#0x00000001
+        BLEQ    armSP_FFTInv_CToC_SC32_Radix4_fs_OutOfPlace_unsafe 
+        BLNE    armSP_FFTInv_CToC_SC32_Radix8_fs_OutOfPlace_unsafe
+        
+        CMP        subFFTNum,#4
+        BLT     FFTEnd
+         
+@// Loop entered with the flags of CMP subFFTNum,#4: EQ selects the last stage
+unscaledRadix4Loop:     
+        BEQ        lastStageUnscaledRadix4
+         BL        armSP_FFTInv_CToC_SC32_Radix4_OutOfPlace_unsafe
+         CMP        subFFTNum,#4
+         B        unscaledRadix4Loop
+         
+lastStageUnscaledRadix4:        
+        BL      armSP_FFTInv_CToC_SC32_Radix4_ls_OutOfPlace_unsafe 
+        B        FFTEnd        
+         
+
+scaleEqualsOrder:                
+        @//check for even or odd order
+        @// NOTE: The following combination of BLs works even though the first
+        @// BL would corrupt the flags. This is because the end of the "grpZeroSetLoop" loop inside 
+        @// armSP_FFTInv_CToC_SC32_Radix4_fs_OutOfPlace_unsafe sets the Z flag to EQ
+                
+        TST     order,#0x00000001
+        BLEQ    armSP_FFTInv_CToC_SC32_Sfs_Radix4_fs_OutOfPlace_unsafe 
+        BLNE    armSP_FFTInv_CToC_SC32_Sfs_Radix8_fs_OutOfPlace_unsafe 
+        
+        CMP        subFFTNum,#4
+        BLT     FFTEnd
+        
+@// Same loop shape as unscaledRadix4Loop, using the scaling (Sfs) stages
+scaledRadix4Loop:       
+        BEQ        lastStageScaledRadix4
+         BL        armSP_FFTInv_CToC_SC32_Sfs_Radix4_OutOfPlace_unsafe
+         CMP        subFFTNum,#4
+         B        scaledRadix4Loop         
+         
+lastStageScaledRadix4:  
+        BL      armSP_FFTInv_CToC_SC32_Sfs_Radix4_ls_OutOfPlace_unsafe 
+        B        FFTEnd        
+        
+generalScaleCase:                                               @// 0 < scale < order and order >= 2
+        @// Determine the correct destination buffer
+        SUB     diff,order,scale
+        TST     diff,#0x01
+        ADDEQ   count,scale,diff,LSR #1         @// count = scale + (order - scale)/2
+        MOVNE   count,order
+        TST     count,#0x01                     @// Is count even or odd ?
+        
+        MOVNE   argDst,pDst                     @// Set input args to fft stages
+        MOVEQ   argDst,pOut
+        MOVEQ   pOut,pDst                       @// Pass the first stage destination in RN5
+        MOV     argTwiddle,pTwiddle  
+
+        M_STR   diff, diffOnStack    
+        
+        MOV     argScale,scale                  @// Put scale in RN4 so as to save and restore
+        BL      armSP_FFTInv_CToC_SC32_Sfs_Radix2_fs_OutOfPlace_unsafe     @// scaled first stage
+        SUBS    argScale,argScale,#1
+        
+scaledRadix2Loop:               
+        BLGT    armSP_FFTInv_CToC_SC32_Sfs_Radix2_OutOfPlace_unsafe
+        SUBS    argScale,argScale,#1            @// save and restore scale (RN4) in the scaled stages
+        BGT     scaledRadix2Loop
+        
+        
+        M_LDR   diff, diffOnStack  
+        @//check for even or odd order
+        TST     diff,#0x00000001
+        BEQ     generalUnscaledRadix4Loop
+        B       unscaledRadix2Loop
+
+generalUnscaledRadix4Loop:      
+        CMP        subFFTNum,#4
+         BEQ        generalLastStageUnscaledRadix4
+         BL        armSP_FFTInv_CToC_SC32_Radix4_OutOfPlace_unsafe
+         B        generalUnscaledRadix4Loop 
+         
+generalLastStageUnscaledRadix4: 
+        BL      armSP_FFTInv_CToC_SC32_Radix4_ls_OutOfPlace_unsafe 
+        B        End             @// goes to End, not FFTEnd: no final shift on this path
+             
+
+unscaledRadix2Loop:     
+        CMP        subFFTNum,#2
+         BEQ        generalLastStageUnscaledRadix2
+         BL        armSP_FFTInv_CToC_SC32_Radix2_OutOfPlace_unsafe
+         B        unscaledRadix2Loop        
+
+generalLastStageUnscaledRadix2: 
+        BL      armSP_FFTInv_CToC_SC32_Radix2_ls_OutOfPlace_unsafe 
+        B        End             @// goes to End, not FFTEnd: no final shift on this path
+
+       
+FFTEnd:                                               @// Does only the scaling
+        @// Apply the residual right shift saved in diffOnStack; nothing to do when it is <= 0
+        M_LDR   diff, diffOnStack  
+        CMP     diff,#0
+        BLE     End
+        
+        RSB     diff,diff,#0                        @// to use VRSHL for right shift by a variable
+        VDUP    dShift,diff     
+        
+scaleFFTData:                                           @// N = subFFTSize  ; dataptr = pDst  ; scale = diff
+        VLD1    {dX0},[pSrc]            @// pSrc contains pDst pointer
+        SUBS    subFFTSize,subFFTSize,#1        @// flags consumed by the BGT below
+        VRSHL   dX0,dShift
+        VST1    {dX0},[pSrc]!           @// store in place and advance to the next D register
+                
+        BGT     scaleFFTData
+        
+                       
+End:                            
+        @// Set return value
+        MOV     result, #OMX_Sts_NoErr       
+
+        @// Write function tail
+        M_END
+        
+    
+     
+        .end
--- a/media/openmax_dl/dl/sp/src/omxSP_FFTInv_CToC_FC32_Sfs_s.S
+++ b/media/openmax_dl/dl/sp/src/omxSP_FFTInv_CToC_FC32_Sfs_s.S
@ -0,0 +1,214 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This is a modification of armSP_FFT_CToC_SC32_Radix2_fs_unsafe_s.s
+@//  to support float instead of SC32.
+@//
+
+@//
+@// Description:
+@// Compute an inverse FFT for a complex signal
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+
+@// Import symbols required from other files
+@// (For example tables)
+
+        .extern  armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace_unsafe
+        .extern  armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace_unsafe
+        .extern  armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace_unsafe
+        .extern  armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace_unsafe
+        .extern  armSP_FFTInv_CToC_FC32_Radix2_OutOfPlace_unsafe
+
+@// Set debugging level
+@//DEBUG_ON    SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+
+
+
+      @// Guarding implementation by the processor name
+
+@// Import symbols required from other files
+@// (For example tables)
+        .extern  armSP_FFTInv_CToC_FC32_Radix4_ls_OutOfPlace_unsafe
+        .extern  armSP_FFTInv_CToC_FC32_Radix2_ls_OutOfPlace_unsafe
+
+
+@//Input Registers
+
+#define pSrc            r0
+#define pDst            r1
+#define pFFTSpec        r2
+
+
+@// Output registers
+#define result          r0
+
+@//Local Scratch Registers
+
+#define argTwiddle      r1
+#define argDst          r2
+#define argScale        r4
+#define tmpOrder        r4
+#define pTwiddle        r4
+#define pOut            r5
+#define subFFTSize      r7
+#define subFFTNum       r6
+#define N               r6
+#define order           r14
+#define diff            r9
+@// Total number of radix stages required to complete the FFT
+#define count           r8
+#define x0r             r4
+#define x0i             r5
+#define diffMinusOne    r2
+
+@// Neon registers
+
+#define dX0     D0.F32
+#define qX0     Q0.F32
+#define sN      S0.S32
+#define fN      S1.F32
+@// one must be the same as dScale[0]!
+#define dScale  D4.F32
+#define one     S8.F32
+
+
+
+    @//
+    @// omxSP_FFTInv_CToC_FC32_Sfs
+    @//
+    @// Inverse FFT of a complex single-precision signal. The transform is
+    @// built from the unscaled radix-2/4/8 OutOfPlace stage routines imported
+    @// above, and the result is then multiplied by 1/fftSize.
+    @//
+    @// In:   r0 = pSrc      source buffer
+    @//       r1 = pDst      destination buffer
+    @//       r2 = pFFTSpec  FFT spec (fields at the ARMsFFTSpec_* offsets)
+    @// Out:  r0 = OMX_Sts_NoErr
+    @//
+    @// The final scaling loop uses :128 aligned Q-register accesses through
+    @// pSrc, so the buffer holding the final result must be 16-byte aligned.
+    @// NOTE(review): the stage routines are not visible in this file. This
+    @// code assumes they leave pSrc pointing at the latest output and keep
+    @// subFFTNum and subFFTSize up to date - confirm against their sources.
+    @//
+    @// Allocate stack memory required by the function
+        M_ALLOC4        fftSize, 4
+
+    @// Write function header
+        M_START     omxSP_FFTInv_CToC_FC32_Sfs,r11,d15
+
+@ Structure offsets for the FFTSpec
+        .set    ARMsFFTSpec_N, 0
+        .set    ARMsFFTSpec_pBitRev, 4
+        .set    ARMsFFTSpec_pTwiddle, 8
+        .set    ARMsFFTSpec_pBuf, 12
+
+        @// Define stack arguments
+
+        @// Read the size from the structure and keep a copy on the stack;
+        @// it is reloaded in FFTEnd to build the 1/fftSize scale factor
+        LDR     N, [pFFTSpec, #ARMsFFTSpec_N]
+        M_STR   N, fftSize
+
+        @// Read other structure parameters
+        LDR     pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
+        LDR     pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
+
+        CLZ     order,N                             @// N = 2^order
+        RSB     order,order,#31                     @// order = 31 - clz(N)
+        MOV     subFFTSize,#1
+        @//MOV     subFFTNum,N
+
+        CMP     order,#3
+        BGT     orderGreaterthan3                   @// order > 3
+
+        CMP     order,#1
+        BGE     orderGreaterthan0                   @// order > 0
+
+        @// order = 0 (one point): the inverse FFT of a single value is the
+        @// value itself and the 1/fftSize factor is 1, so copy the one complex
+        @// value and return. FFTEnd must not be used here: its loop always
+        @// moves two complex values (16 bytes) per pass and would read and
+        @// write 8 bytes beyond the end of a one-element buffer.
+        VLD1    dX0,[pSrc]
+        VST1    dX0,[pDst]
+        B       End
+
+orderGreaterthan0:
+        @// set the buffers appropriately for various orders
+        @// (order 1 and 3 write the first stage to pDst, order 2 to pOut)
+        CMP     order,#2
+        MOVNE   argDst,pDst
+        MOVEQ   argDst,pOut
+        @// Pass the first stage destination in RN5
+        MOVEQ   pOut,pDst
+        MOV     argTwiddle,pTwiddle
+        @// Flags still come from CMP order,#2 (MOV does not change them)
+        BGE     orderGreaterthan1
+        @// order = 1
+        BLLT    armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace_unsafe
+        B       FFTEnd
+
+orderGreaterthan1:
+        @// order = 2: first stage + last stage
+        @// order = 3: first stage + one middle stage + last stage
+        MOV     tmpOrder,order                          @// tmpOrder = RN 4
+        BL      armSP_FFTInv_CToC_FC32_Radix2_fs_OutOfPlace_unsafe
+        CMP     tmpOrder,#2
+        BLGT    armSP_FFTInv_CToC_FC32_Radix2_OutOfPlace_unsafe
+        BL      armSP_FFTInv_CToC_FC32_Radix2_ls_OutOfPlace_unsafe
+        B       FFTEnd
+
+
+orderGreaterthan3:
+
+        @// Set input args to fft stages
+        TST     order, #2
+        MOVNE   argDst,pDst
+        MOVEQ   argDst,pOut
+        @// Pass the first stage destination in RN5
+        MOVEQ   pOut,pDst
+        MOV     argTwiddle,pTwiddle
+
+        @//check for even or odd order
+        @// NOTE: The BLEQ/BLNE pair below relies on the radix-4 first stage
+        @// returning with the Z flag set (EQ), so that the BLNE is skipped
+        @// after the BLEQ has been taken. The original note attributes this
+        @// to the end of the grpZeroSetLoop loop inside the radix-4 first
+        @// stage routine.
+
+        TST     order,#0x00000001
+        BLEQ    armSP_FFTInv_CToC_FC32_Radix4_fs_OutOfPlace_unsafe
+        BLNE    armSP_FFTInv_CToC_FC32_Radix8_fs_OutOfPlace_unsafe
+
+        CMP        subFFTNum,#4
+        BLT     FFTEnd
+
+
+unscaledRadix4Loop:
+        @// Flags are from the most recent CMP subFFTNum,#4
+        BEQ        lastStageUnscaledRadix4
+         BL        armSP_FFTInv_CToC_FC32_Radix4_OutOfPlace_unsafe
+         CMP        subFFTNum,#4
+         B        unscaledRadix4Loop
+
+lastStageUnscaledRadix4:
+        BL      armSP_FFTInv_CToC_FC32_Radix4_ls_OutOfPlace_unsafe
+        B        FFTEnd
+
+FFTEnd:                                               @// Does only the scaling
+
+        M_LDR   N, fftSize
+
+        VMOV    sN,N
+        VCVT    fN, sN                  @ fn = fftSize, as a float
+        VMOV    one, 1.0
+        VDIV    one, one, fN            @ one = dScale[0] = 1 / fftSize
+
+        @ Scale data, doing 2 complex values at a time. This is only reached
+        @ for order >= 1, so the number of values is always even.
+
+        @// subFFTSize counts the complex values left; pSrc walks the output
+scaleFFTData:
+        VLD1    {qX0},[pSrc :128]            @// pSrc contains pDst pointer
+        SUBS    subFFTSize,subFFTSize,#2
+        VMUL    qX0, qX0, dScale[0]
+        VST1    {qX0},[pSrc :128]!
+
+        BGT     scaleFFTData
+End:
+        @// Set return value
+        MOV     result, #OMX_Sts_NoErr
+
+        @// Write function tail
+        M_END
+
+        .end
--- a/media/openmax_dl/dl/sp/src/omxSP_FFTInv_CToC_SC16_Sfs_s.S
+++ b/media/openmax_dl/dl/sp/src/omxSP_FFTInv_CToC_SC16_Sfs_s.S
@ -0,0 +1,342 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This file was originally licensed as follows. It has been
+@//  relicensed with permission from the copyright holders.
+
+@//
+@//
+@// File Name:  omxSP_FFTInv_CToC_SC16_Sfs_s.s
+@// OpenMAX DL: v1.0.2
+@// Last Modified Revision:   6729
+@// Last Modified Date:       Tue, 17 Jul 2007
+@//
+@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@//
+@//
+@//
+@// Description:
+@// Compute an inverse FFT for a complex signal
+@//
+@//
+
+
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+
+@// Import symbols required from other files
+@// (For example tables)
+
+        .extern  armSP_FFTInv_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe
+        .extern  armSP_FFTInv_CToC_SC16_Radix2_fs_OutOfPlace_unsafe
+        .extern  armSP_FFTInv_CToC_SC16_Radix4_fs_OutOfPlace_unsafe
+        .extern  armSP_FFTInv_CToC_SC16_Radix4_ls_OutOfPlace_unsafe
+        .extern  armSP_FFTInv_CToC_SC16_Radix8_fs_OutOfPlace_unsafe
+        .extern  armSP_FFTInv_CToC_SC16_Radix4_OutOfPlace_unsafe
+        .extern  armSP_FFTInv_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe
+        .extern  armSP_FFTInv_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe
+        .extern  armSP_FFTInv_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe
+        .extern  armSP_FFTInv_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe
+        .extern  armSP_FFTInv_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe
+        .extern  armSP_FFTInv_CToC_SC16_Radix2_OutOfPlace_unsafe
+        .extern  armSP_FFTInv_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe
+        .extern  armSP_FFTInv_CToC_SC16_Radix2_ls_OutOfPlace_unsafe
+
+@// Set debugging level
+@//DEBUG_ON    SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+
+
+
+@// Guarding implementation by the processor name
+
+
+    .extern  armSP_FFTInv_CToC_SC16_Radix2_ps_OutOfPlace_unsafe
+    .extern  armSP_FFTInv_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe
+
+@//Input Registers
+
+#define pSrc    r0
+#define pDst    r1
+#define pFFTSpec        r2
+#define scale   r3
+
+
+@// Output registers
+#define result  r0
+
+@//Local Scratch Registers
+
+#define argTwiddle      r1
+#define argDst  r2
+#define argScale        r4
+#define pTwiddle        r4
+#define tmpOrder        r4
+#define pOut    r5
+#define subFFTSize      r7
+#define subFFTNum       r6
+#define N       r6
+#define order   r14
+#define diff    r9
+@// Total number of radix stages required to complete the FFT
+#define count   r8
+#define x0r     r4
+#define x0i     r5
+#define diffMinusOne    r2
+#define round   r3
+
+@// Neon registers
+
+#define dX0  D0.S16
+#define dShift  D1.S16
+#define dX0S32  D0.S32
+
+
+    @//
+    @// omxSP_FFTInv_CToC_SC16_Sfs
+    @//
+    @// Inverse FFT of a complex signal with 16-bit components, with scaling.
+    @//
+    @// In:   r0 = pSrc      source buffer
+    @//       r1 = pDst      destination buffer
+    @//       r2 = pFFTSpec  FFT spec (fields at the ARMsFFTSpec_* offsets)
+    @//       r3 = scale     scale factor
+    @// Out:  r0 = OMX_Sts_NoErr
+    @//
+    @// order = log2(N) is added to scale on entry. The code then picks a mix
+    @// of scaled (Sfs) and unscaled stage routines, and any scaling that is
+    @// still outstanding is applied in FFTEnd as a rounding right shift.
+    @// NOTE(review): the stage routines are not visible in this file. This
+    @// code assumes they leave pSrc pointing at the latest output and keep
+    @// subFFTNum and subFFTSize up to date - confirm against their sources.
+    @//
+    @// Allocate stack memory required by the function
+        M_ALLOC4        diffOnStack, 4
+
+    @// Write function header
+        M_START     omxSP_FFTInv_CToC_SC16_Sfs,r11,d15
+
+@ Structure offsets for the FFTSpec
+        .set    ARMsFFTSpec_N, 0
+        .set    ARMsFFTSpec_pBitRev, 4
+        .set    ARMsFFTSpec_pTwiddle, 8
+        .set    ARMsFFTSpec_pBuf, 12
+
+        @// Define stack arguments
+
+        @// Read the size from structure and take log
+        LDR     N, [pFFTSpec, #ARMsFFTSpec_N]
+
+        @// Read other structure parameters
+        LDR     pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
+        LDR     pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
+
+        CLZ     order,N                             @// N = 2^order
+        RSB     order,order,#31                     @// order = 31 - clz(N)
+        MOV     subFFTSize,#1
+        @//MOV     subFFTNum,N
+
+        ADD     scale,scale,order                   @// FFTInverse has a final scaling factor by N
+
+        CMP     order,#3
+        BGT     orderGreaterthan3                   @// order > 3
+
+        CMP     order,#1
+        BGE     orderGreaterthan0                   @// order > 0
+        @// order = 0: copy the single 32-bit complex value to pDst and let
+        @// FFTEnd apply the shift (scale is unchanged by the ADD above here)
+        M_STR   scale, diffOnStack,LT               @// order = 0
+        LDRLT   x0r,[pSrc]
+        STRLT   x0r,[pDst]
+        MOVLT   pSrc,pDst
+        BLT     FFTEnd
+
+orderGreaterthan0:
+        @// set the buffers appropriately for various orders
+        @// (order 1 and 3 write the first stage to pDst, order 2 to pOut)
+        CMP     order,#2
+        MOVNE   argDst,pDst
+        MOVEQ   argDst,pOut
+        MOVEQ   pOut,pDst                           @// Pass the first stage destination in RN5
+        MOV     argTwiddle,pTwiddle
+        @// Store the scale factor and scale at the end
+        @// (diff = scale - order, i.e. the scale value passed by the caller)
+        SUB     diff,scale,order
+        M_STR   diff, diffOnStack
+        @// Flags still come from CMP order,#2
+        BGE     orderGreaterthan1
+        BLLT    armSP_FFTInv_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe  @// order = 1
+        B       FFTEnd
+
+
+orderGreaterthan1:
+        @// order = 2: first stage + last stage
+        @// order = 3: first stage + one ps stage + last stage
+        MOV     tmpOrder,order                          @// tmpOrder = RN 4
+        BL      armSP_FFTInv_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe
+        CMP     tmpOrder,#2
+        BLGT    armSP_FFTInv_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe
+        BL      armSP_FFTInv_CToC_SC16_Sfs_Radix2_ls_OutOfPlace_unsafe
+        B       FFTEnd
+
+
+
+
+orderGreaterthan3:
+        @// check scale = 0 or scale = order
+        @// NOTE(review): order was already added to scale above, so diff here
+        @// equals the scale value passed by the caller. For a non-negative
+        @// incoming scale the BGE below is always taken and control always
+        @// continues at scaleEqualsOrder; the unscaled and general paths are
+        @// reached only for a negative incoming scale - confirm intent.
+        SUBS    diff, scale, order                 @// scale > order
+        MOVGT   scale,order
+        BGE     specialScaleCase                   @// scale = 0 or scale = order
+        CMP     scale,#0
+        BEQ     specialScaleCase
+        B       generalScaleCase
+
+specialScaleCase:                                           @//  scale = 0 or scale = order  and order > 3
+
+        TST     order, #2                           @// Set input args to fft stages
+        MOVNE   argDst,pDst
+        MOVEQ   argDst,pOut
+        MOVEQ   pOut,pDst                           @// Pass the first stage destination in RN5
+        MOV     argTwiddle,pTwiddle
+
+        @// diff >= 0: scaled stages, then FFTEnd shifts right by diff
+        @// diff <  0: unscaled stages, FFTEnd does nothing
+        CMP      diff,#0
+        M_STR    diff, diffOnStack
+        BGE      scaleEqualsOrder
+
+        @//check for even or odd order
+        @// NOTE: The BLEQ/BLNE pair below relies on the radix-4 first stage
+        @// returning with the Z flag set (EQ), so that the BLNE is skipped
+        @// after the BLEQ has been taken. The original note attributes this
+        @// to the end of the grpZeroSetLoop loop inside
+        @// armSP_FFTInv_CToC_SC16_Radix4_fs_OutOfPlace_unsafe.
+
+        TST     order,#0x00000001
+        BLEQ    armSP_FFTInv_CToC_SC16_Radix4_fs_OutOfPlace_unsafe
+        BLNE    armSP_FFTInv_CToC_SC16_Radix8_fs_OutOfPlace_unsafe
+
+        CMP        subFFTNum,#4
+        BLT     FFTEnd
+
+unscaledRadix4Loop:
+        @// Flags are from the most recent CMP subFFTNum,#4
+        BEQ        lastStageUnscaledRadix4
+        BL        armSP_FFTInv_CToC_SC16_Radix4_OutOfPlace_unsafe
+         CMP        subFFTNum,#4
+         B        unscaledRadix4Loop
+
+lastStageUnscaledRadix4:
+        BL      armSP_FFTInv_CToC_SC16_Radix4_ls_OutOfPlace_unsafe
+        B        FFTEnd
+
+scaleEqualsOrder:
+        @//check for even or odd order
+        @// NOTE: As above, the BLEQ/BLNE pair relies on the radix-4 first
+        @// stage (here the Sfs variant) returning with the Z flag set (EQ).
+        @// NOTE(review): the original note named the SC32 routine; the
+        @// routine actually called is the SC16 Sfs one - confirm it has the
+        @// same flag behaviour.
+
+        TST     order,#0x00000001
+        BLEQ    armSP_FFTInv_CToC_SC16_Sfs_Radix4_fs_OutOfPlace_unsafe
+        BLNE    armSP_FFTInv_CToC_SC16_Sfs_Radix8_fs_OutOfPlace_unsafe
+
+        CMP        subFFTNum,#4
+        BLT     FFTEnd
+
+scaledRadix4Loop:
+        @// Flags are from the most recent CMP subFFTNum,#4
+        BEQ        lastStageScaledRadix4
+        BL        armSP_FFTInv_CToC_SC16_Sfs_Radix4_OutOfPlace_unsafe
+         CMP        subFFTNum,#4
+         B        scaledRadix4Loop
+
+lastStageScaledRadix4:
+        BL      armSP_FFTInv_CToC_SC16_Sfs_Radix4_ls_OutOfPlace_unsafe
+        B        FFTEnd
+
+
+
+generalScaleCase:                                        @// 0 < scale < order and order > 3
+        @// Determine the correct destination buffer
+        @// diff is redefined here as the number of unscaled radix-2 stages
+        SUB     diff,order,scale
+        TST     diff,#0x01
+        ADDEQ   count,scale,diff,LSR #1         @// count = scale + (order - scale)/2
+        MOVNE   count,order
+        TST     count,#0x01                     @// Is count even or odd ?
+
+        MOVNE   argDst,pDst                     @// Set input args to fft stages
+        MOVEQ   argDst,pOut
+        MOVEQ   pOut,pDst                       @// Pass the first stage destination in RN5
+        MOV     argTwiddle,pTwiddle
+
+        CMP     diff,#1
+        M_STR   diff, diffOnStack
+        BEQ     scaleps                         @// scaling including a radix2_ps stage
+
+        MOV     argScale,scale                  @// Put scale in RN4 so as to save and restore
+        BL      armSP_FFTInv_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe     @// scaled first stage
+        SUBS    argScale,argScale,#1
+
+scaledRadix2Loop:
+        @// argScale counts the scaled radix-2 stages still to run
+        BLGT    armSP_FFTInv_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe
+        SUBS    argScale,argScale,#1            @// save and restore scale (RN4) in the scaled stages
+        BGT     scaledRadix2Loop
+        B       outScale
+
+scaleps:
+        SUB     argScale,scale,#1                   @// order>3 and diff=1 => scale >= 3
+        BL      armSP_FFTInv_CToC_SC16_Sfs_Radix2_fs_OutOfPlace_unsafe     @// scaled first stage
+        SUBS    argScale,argScale,#1
+
+scaledRadix2psLoop:
+        @// Run scaled radix-2 stages until argScale reaches 0, then the
+        @// scaled ps stage followed by the unscaled last stage
+        BEQ     scaledRadix2psStage
+        BLGT    armSP_FFTInv_CToC_SC16_Sfs_Radix2_OutOfPlace_unsafe
+        SUBS    argScale,argScale,#1            @// save and restore scale (RN4) in the scaled stages
+        BGE     scaledRadix2psLoop
+
+scaledRadix2psStage:
+        BL      armSP_FFTInv_CToC_SC16_Sfs_Radix2_ps_OutOfPlace_unsafe
+        B       generalLastStageUnscaledRadix2
+
+
+outScale:
+        M_LDR   diff, diffOnStack
+        @//check for even or odd number of remaining (unscaled) stages
+        TST     diff,#0x00000001
+        BEQ     generalUnscaledRadix4Loop
+        B       unscaledRadix2Loop
+
+generalUnscaledRadix4Loop:
+        CMP        subFFTNum,#4
+         BEQ        generalLastStageUnscaledRadix4
+         BL        armSP_FFTInv_CToC_SC16_Radix4_OutOfPlace_unsafe
+         B        generalUnscaledRadix4Loop
+
+generalLastStageUnscaledRadix4:
+        @// The general case has applied all its scaling in the stages, so it
+        @// exits through End and skips the shift in FFTEnd
+        BL      armSP_FFTInv_CToC_SC16_Radix4_ls_OutOfPlace_unsafe
+        B        End
+
+unscaledRadix2Loop:
+        CMP        subFFTNum,#4
+         BEQ        generalLastTwoStagesUnscaledRadix2
+         BL        armSP_FFTInv_CToC_SC16_Radix2_OutOfPlace_unsafe
+         B        unscaledRadix2Loop
+
+generalLastTwoStagesUnscaledRadix2:
+        BL      armSP_FFTInv_CToC_SC16_Radix2_ps_OutOfPlace_unsafe
+generalLastStageUnscaledRadix2:
+        BL      armSP_FFTInv_CToC_SC16_Radix2_ls_OutOfPlace_unsafe
+        B        End
+
+
+FFTEnd:                                              @// Does only the scaling
+
+        @// Nothing to do unless a positive shift is still outstanding
+        M_LDR   diff, diffOnStack
+        CMP     diff,#0
+        BLE     End
+
+        RSB     diff,diff,#0                        @// to use VRSHL for right shift by a variable
+        VDUP    dShift,diff
+
+scaleFFTData:                                        @// count = subFFTSize  ; dataptr = pSrc  ; shift = diff
+        @// One complex value (two 16-bit lanes, 32 bits) per pass
+        VLD1    {dX0S32[0]},[pSrc]                        @// pSrc contains pDst pointer
+        SUBS    subFFTSize,subFFTSize,#1
+        VRSHL   dX0,dShift
+        VST1    {dX0S32[0]},[pSrc]!
+
+        BGT     scaleFFTData
+
+
+End:
+        @// Set return value
+        MOV     result, #OMX_Sts_NoErr
+
+        @// Write function tail
+        M_END
+
+
+
+
+
+
+    .END
--- a/media/openmax_dl/dl/sp/src/omxSP_FFTInv_CToC_SC32_Sfs_s.S
+++ b/media/openmax_dl/dl/sp/src/omxSP_FFTInv_CToC_SC32_Sfs_s.S
@ -0,0 +1,314 @@
+@//
+@//  Copyright (c) 2013 The WebRTC project authors. All Rights Reserved.
+@//
+@//  Use of this source code is governed by a BSD-style license
+@//  that can be found in the LICENSE file in the root of the source
+@//  tree. An additional intellectual property rights grant can be found
+@//  in the file PATENTS.  All contributing project authors may
+@//  be found in the AUTHORS file in the root of the source tree.
+@//
+@//  This file was originally licensed as follows. It has been
+@//  relicensed with permission from the copyright holders.
+@//
+
+@// 
+@// File Name:  omxSP_FFTInv_CToC_SC32_Sfs_s.s
+@// OpenMAX DL: v1.0.2
+@// Last Modified Revision:   6675
+@// Last Modified Date:       Fri, 06 Jul 2007
+@// 
+@// (c) Copyright 2007-2008 ARM Limited. All Rights Reserved.
+@// 
+@// 
+@//
+@// Description:
+@// Compute an inverse FFT for a complex signal
+@// 
+
+        
+@// Include standard headers
+
+#include "dl/api/armCOMM_s.h"
+#include "dl/api/omxtypes_s.h"
+        
+@// Import symbols required from other files
+@// (For example tables)
+        
+        .extern  armSP_FFTInv_CToC_SC32_Sfs_Radix2_fs_OutOfPlace_unsafe 
+        .extern  armSP_FFTInv_CToC_SC32_Radix2_fs_OutOfPlace_unsafe
+        .extern  armSP_FFTInv_CToC_SC32_Radix4_fs_OutOfPlace_unsafe 
+        .extern  armSP_FFTInv_CToC_SC32_Radix8_fs_OutOfPlace_unsafe 
+        .extern  armSP_FFTInv_CToC_SC32_Radix4_OutOfPlace_unsafe
+        .extern  armSP_FFTInv_CToC_SC32_Sfs_Radix4_fs_OutOfPlace_unsafe 
+        .extern  armSP_FFTInv_CToC_SC32_Sfs_Radix8_fs_OutOfPlace_unsafe 
+        .extern  armSP_FFTInv_CToC_SC32_Sfs_Radix4_OutOfPlace_unsafe
+        .extern  armSP_FFTInv_CToC_SC32_Sfs_Radix2_OutOfPlace_unsafe
+        .extern  armSP_FFTInv_CToC_SC32_Radix2_OutOfPlace_unsafe   
+        
+@// Set debugging level        
+@//DEBUG_ON    SETL {TRUE}
+
+
+
+@// Guarding implementation by the processor name
+    
+    
+    
+      @// Guarding implementation by the processor name
+    
+@// Import symbols required from other files
+@// (For example tables)
+        .extern  armSP_FFTInv_CToC_SC32_Radix4_ls_OutOfPlace_unsafe     
+        .extern  armSP_FFTInv_CToC_SC32_Radix2_ls_OutOfPlace_unsafe
+        .extern  armSP_FFTInv_CToC_SC32_Sfs_Radix2_ls_OutOfPlace_unsafe
+        .extern  armSP_FFTInv_CToC_SC32_Sfs_Radix4_ls_OutOfPlace_unsafe        
+     
+    
+@//Input Registers
+
+#define pSrc	r0
+#define pDst	r1
+#define pFFTSpec	r2
+#define scale	r3
+
+
+@// Output registers
+#define result	r0
+
+@//Local Scratch Registers
+
+#define argTwiddle	r1
+#define argDst	r2
+#define argScale	r4
+#define tmpOrder	r4
+#define pTwiddle	r4
+#define pOut	r5
+#define subFFTSize	r7     
+#define subFFTNum	r6
+#define N	r6
+#define order	r14
+#define diff	r9
+@// Total number of radix stages required to complete the FFT
+#define count	r8
+#define x0r	r4    
+#define x0i	r5
+#define diffMinusOne	r2
+#define round	r3
+
+@// Neon registers
+
+#define dX0	D0.S32
+#define dShift	D1.S32
+
+
+
+    @//
+    @// omxSP_FFTInv_CToC_SC32_Sfs
+    @//
+    @// Inverse FFT of a complex signal with 32-bit components, with scaling.
+    @//
+    @// In:   r0 = pSrc      source buffer
+    @//       r1 = pDst      destination buffer
+    @//       r2 = pFFTSpec  FFT spec (fields at the ARMsFFTSpec_* offsets)
+    @//       r3 = scale     scale factor
+    @// Out:  r0 = OMX_Sts_NoErr
+    @//
+    @// order = log2(N) is added to scale on entry. The code then picks a mix
+    @// of scaled (Sfs) and unscaled stage routines, and any scaling that is
+    @// still outstanding is applied in FFTEnd as a rounding right shift.
+    @// NOTE(review): the stage routines are not visible in this file. This
+    @// code assumes they leave pSrc pointing at the latest output and keep
+    @// subFFTNum and subFFTSize up to date - confirm against their sources.
+    @//
+    @// Allocate stack memory required by the function
+        M_ALLOC4        diffOnStack, 4
+
+    @// Write function header
+        M_START     omxSP_FFTInv_CToC_SC32_Sfs,r11,d15
+        
+@ Structure offsets for the FFTSpec		
+	.set	ARMsFFTSpec_N, 0
+	.set	ARMsFFTSpec_pBitRev, 4
+	.set	ARMsFFTSpec_pTwiddle, 8
+	.set	ARMsFFTSpec_pBuf, 12
+	        
+        @// Define stack arguments
+        
+        @// Read the size from structure and take log
+        LDR     N, [pFFTSpec, #ARMsFFTSpec_N]
+        
+        @// Read other structure parameters
+        LDR     pTwiddle, [pFFTSpec, #ARMsFFTSpec_pTwiddle]
+        LDR     pOut, [pFFTSpec, #ARMsFFTSpec_pBuf]
+                
+        CLZ     order,N                             @// N = 2^order 
+        RSB     order,order,#31     
+        MOV     subFFTSize,#1
+        @//MOV     subFFTNum,N
+        
+        ADD     scale,scale,order                   @// FFTInverse has a final scaling factor by N
+        
+        CMP     order,#3
+        BGT     orderGreaterthan3                   @// order > 3
+                
+        CMP     order,#1
+        BGE     orderGreaterthan0                   @// order > 0
+        @// order = 0: copy the single 64-bit complex value to pDst and let
+        @// FFTEnd apply the shift (scale is unchanged by the ADD above here)
+        M_STR   scale, diffOnStack,LT               @// order = 0
+        VLD1    dX0,[pSrc]
+        VST1    dX0,[pDst]
+        MOV     pSrc,pDst
+        BLT     FFTEnd
+        
+orderGreaterthan0:	
+        @// set the buffers appropriately for various orders
+        @// (order 1 and 3 write the first stage to pDst, order 2 to pOut)
+        CMP     order,#2
+        MOVNE   argDst,pDst        
+        MOVEQ   argDst,pOut
+        MOVEQ   pOut,pDst                           @// Pass the first stage destination in RN5
+        MOV     argTwiddle,pTwiddle
+        @// Store the scale factor and scale at the end
+        @// (diff = scale - order, i.e. the scale value passed by the caller)
+        SUB     diff,scale,order
+        M_STR   diff, diffOnStack
+        @// Flags still come from CMP order,#2
+        BGE     orderGreaterthan1
+        BLLT    armSP_FFTInv_CToC_SC32_Sfs_Radix2_fs_OutOfPlace_unsafe  @// order = 1
+        B       FFTEnd
+
+orderGreaterthan1:	
+        @// order = 2: first stage + last stage
+        @// order = 3: first stage + one middle stage + last stage
+        MOV     tmpOrder,order                          @// tmpOrder = RN 4
+        BL      armSP_FFTInv_CToC_SC32_Sfs_Radix2_fs_OutOfPlace_unsafe        
+        CMP     tmpOrder,#2
+        BLGT    armSP_FFTInv_CToC_SC32_Sfs_Radix2_OutOfPlace_unsafe
+        BL      armSP_FFTInv_CToC_SC32_Sfs_Radix2_ls_OutOfPlace_unsafe
+        B       FFTEnd
+        
+
+orderGreaterthan3:	       
+        @// check scale = 0 or scale = order
+        @// NOTE(review): order was already added to scale above, so diff here
+        @// equals the scale value passed by the caller. For a non-negative
+        @// incoming scale the BGE below is always taken and control always
+        @// continues at scaleEqualsOrder; the unscaled and general paths are
+        @// reached only for a negative incoming scale - confirm intent.
+        SUBS    diff, scale, order                 @// scale > order 
+        MOVGT   scale,order     
+        BGE     specialScaleCase                   @// scale = 0 or scale = order 
+        CMP     scale,#0
+        BEQ     specialScaleCase
+        B       generalScaleCase
+        
+specialScaleCase:	                                    @//  scale = 0 or scale = order  and order > 3
+        
+        TST     order, #2                           @// Set input args to fft stages
+        MOVNE   argDst,pDst        
+        MOVEQ   argDst,pOut
+        MOVEQ   pOut,pDst                           @// Pass the first stage destination in RN5
+        MOV     argTwiddle,pTwiddle  
+
+        @// diff >= 0: scaled stages, then FFTEnd shifts right by diff
+        @// diff <  0: unscaled stages, FFTEnd does nothing
+        CMP      diff,#0
+        M_STR    diff, diffOnStack
+        BGE      scaleEqualsOrder  
+       
+        @//check for even or odd order
+        @// NOTE: The BLEQ/BLNE pair below relies on the radix-4 first stage
+        @// returning with the Z flag set (EQ), so that the BLNE is skipped
+        @// after the BLEQ has been taken. The original note attributes this
+        @// to the end of the grpZeroSetLoop loop inside
+        @// armSP_FFTInv_CToC_SC32_Radix4_fs_OutOfPlace_unsafe.
+        
+        TST     order,#0x00000001
+        BLEQ    armSP_FFTInv_CToC_SC32_Radix4_fs_OutOfPlace_unsafe 
+        BLNE    armSP_FFTInv_CToC_SC32_Radix8_fs_OutOfPlace_unsafe
+        
+        CMP        subFFTNum,#4
+        BLT     FFTEnd
+         
+
+unscaledRadix4Loop:	
+        @// Flags are from the most recent CMP subFFTNum,#4
+        BEQ        lastStageUnscaledRadix4
+         BL        armSP_FFTInv_CToC_SC32_Radix4_OutOfPlace_unsafe
+         CMP        subFFTNum,#4
+         B        unscaledRadix4Loop
+         
+lastStageUnscaledRadix4:	
+        BL      armSP_FFTInv_CToC_SC32_Radix4_ls_OutOfPlace_unsafe 
+        B        FFTEnd        
+         
+
+scaleEqualsOrder:	         
+        @//check for even or odd order
+        @// NOTE: As above, the BLEQ/BLNE pair relies on the radix-4 first
+        @// stage (here the Sfs variant) returning with the Z flag set (EQ).
+        @// NOTE(review): the original note named the non-Sfs routine; the
+        @// routine actually called is the Sfs one - confirm it has the same
+        @// flag behaviour.
+                
+        TST     order,#0x00000001
+        BLEQ    armSP_FFTInv_CToC_SC32_Sfs_Radix4_fs_OutOfPlace_unsafe 
+        BLNE    armSP_FFTInv_CToC_SC32_Sfs_Radix8_fs_OutOfPlace_unsafe 
+        
+        CMP        subFFTNum,#4
+        BLT     FFTEnd
+        
+
+scaledRadix4Loop:	
+        @// Flags are from the most recent CMP subFFTNum,#4
+        BEQ        lastStageScaledRadix4
+         BL        armSP_FFTInv_CToC_SC32_Sfs_Radix4_OutOfPlace_unsafe
+         CMP        subFFTNum,#4
+         B        scaledRadix4Loop         
+         
+lastStageScaledRadix4:	
+        BL      armSP_FFTInv_CToC_SC32_Sfs_Radix4_ls_OutOfPlace_unsafe 
+        B        FFTEnd        
+        
+generalScaleCase:	                                        @// 0 < scale < order and order > 3
+        @// Determine the correct destination buffer
+        @// diff is redefined here as the number of unscaled radix-2 stages
+        SUB     diff,order,scale
+        TST     diff,#0x01
+        ADDEQ   count,scale,diff,LSR #1         @// count = scale + (order - scale)/2
+        MOVNE   count,order
+        TST     count,#0x01                     @// Is count even or odd ?
+        
+        MOVNE   argDst,pDst                     @// Set input args to fft stages
+        MOVEQ   argDst,pOut
+        MOVEQ   pOut,pDst                       @// Pass the first stage destination in RN5
+        MOV     argTwiddle,pTwiddle  
+
+        M_STR   diff, diffOnStack    
+        
+        MOV     argScale,scale                  @// Put scale in RN4 so as to save and restore
+        BL      armSP_FFTInv_CToC_SC32_Sfs_Radix2_fs_OutOfPlace_unsafe     @// scaled first stage
+        SUBS    argScale,argScale,#1
+        
+scaledRadix2Loop:	        
+        @// argScale counts the scaled radix-2 stages still to run
+        BLGT    armSP_FFTInv_CToC_SC32_Sfs_Radix2_OutOfPlace_unsafe
+        SUBS    argScale,argScale,#1            @// save and restore scale (RN4) in the scaled stages
+        BGT     scaledRadix2Loop
+        
+        
+        M_LDR   diff, diffOnStack  
+        @//check for even or odd number of remaining (unscaled) stages
+        TST     diff,#0x00000001
+        BEQ     generalUnscaledRadix4Loop
+        B       unscaledRadix2Loop
+
+generalUnscaledRadix4Loop:	
+        CMP        subFFTNum,#4
+         BEQ        generalLastStageUnscaledRadix4
+         BL        armSP_FFTInv_CToC_SC32_Radix4_OutOfPlace_unsafe
+         B        generalUnscaledRadix4Loop 
+         
+generalLastStageUnscaledRadix4:	
+        @// The general case has applied all its scaling in the stages, so it
+        @// exits through End and skips the shift in FFTEnd
+        BL      armSP_FFTInv_CToC_SC32_Radix4_ls_OutOfPlace_unsafe 
+        B        End             
+             
+
+unscaledRadix2Loop:	
+        CMP        subFFTNum,#2
+         BEQ        generalLastStageUnscaledRadix2
+         BL        armSP_FFTInv_CToC_SC32_Radix2_OutOfPlace_unsafe
+         B        unscaledRadix2Loop        
+
+generalLastStageUnscaledRadix2:	
+        BL      armSP_FFTInv_CToC_SC32_Radix2_ls_OutOfPlace_unsafe 
+        B        End             
+
+       
+FFTEnd:	                                              @// Does only the scaling
+        
+        @// Nothing to do unless a positive shift is still outstanding
+        M_LDR   diff, diffOnStack  
+        CMP     diff,#0
+        BLE     End
+        
+        RSB     diff,diff,#0                        @// to use VRSHL for right shift by a variable
+        VDUP    dShift,diff     
+        
+scaleFFTData:	                                        @// count = subFFTSize  ; dataptr = pSrc  ; shift = diff
+        @// One complex value (two 32-bit lanes, 64 bits) per pass
+        VLD1    {dX0},[pSrc]            @// pSrc contains pDst pointer
+        SUBS    subFFTSize,subFFTSize,#1
+        VRSHL   dX0,dShift
+        VST1    {dX0},[pSrc]!
+                
+        BGT     scaleFFTData
+        
+                       
+End:	                        
+        @// Set return value
+        MOV     result, #OMX_Sts_NoErr       
+
+        @// Write function tail
+        M_END
+        
+	.end
--- a/toolkit/content/license.html
+++ b/toolkit/content/license.html
@ -98,6 +98,7 @@
      <li><a href="about:license#jpnic">Japan Network Information Center License</a></li>
      <li><a href="about:license#jemalloc">jemalloc License</a></li>
      <li><a href="about:license#jquery">jQuery License</a></li>
+      <li><a href="about:license#khronos">Khronos Group License</a></li>
      <li><a href="about:license#kiss_fft">Kiss FFT License</a></li>
      <li><a href="about:license#libcubeb">libcubeb License</a></li>
      <li><a href="about:license#libevent">libevent License</a></li>
@ -1950,6 +1951,7 @@ WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
      <span class="path">dom/plugins/</span>,
      <span class="path">tools/profiler/sps/</span>,
      <span class="path">gfx/ots/</span>,
+      <span class="path">media/openmax_dl/</span>,
      <span class="path">gfx/ycbcr</span> and
      <span class="path">dom/media/webspeech/recognition/</span>.
    </p>
@ -2882,6 +2884,43 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

    <hr>

+    <h1><a id="khronos"></a>Khronos Group License</h1>
+
+    <p>This license applies to the following files:</p>
+
+    <ul>
+      <li class="path">openmax_dl/dl/api/omxtypes.h</li>
+      <li class="path">openmax_dl/dl/sp/api/omxSP.h</li>
+    </ul>
+
+<pre>
+Copyright 2005-2008 The Khronos Group Inc. All Rights Reserved.
+
+These materials are protected by copyright laws and contain material 
+proprietary to the Khronos Group, Inc.  You may use these materials 
+for implementing Khronos specifications, without altering or removing 
+any trademark, copyright or other notice from the specification.
+
+Khronos Group makes no, and expressly disclaims any, representations 
+or warranties, express or implied, regarding these materials, including, 
+without limitation, any implied warranties of merchantability or fitness 
+for a particular purpose or non-infringement of any intellectual property. 
+Khronos Group makes no, and expressly disclaims any, warranties, express 
+or implied, regarding the correctness, accuracy, completeness, timeliness, 
+and reliability of these materials. 
+
+Under no circumstances will the Khronos Group, or any of its Promoters, 
+Contributors or Members or their respective partners, officers, directors, 
+employees, agents or representatives be liable for any damages, whether 
+direct, indirect, special or consequential damages for lost revenues, 
+lost profits, or otherwise, arising from or in connection with these 
+materials.
+
+Khronos and OpenMAX are trademarks of the Khronos Group Inc.
+</pre>
+
+    <hr>
+
    <h1><a id="kiss_fft"></a>Kiss FFT License</h1>

    <p>This license applies to files in the directory