DSP库移植FFT记录-JZTXT

前言

由于项目的需要，需要FFT的算法对采集的中频信号进行处理，但是由于这次项目的使用的单片机的空间十分小，使用的是兆易创新发布的GD32E232系列的芯片，由于之前的项目都是使用ST系列的单片机。ST的单片机本身带有自身配套的汇编库，可以高效的实现FFT的功能，但是经过测试，发现使用GD32使用ST32的DSP库会报错，而进行移植和修改，估计会花费大量的时间（由于我对ARM的汇编不是很懂）。于是在项目的时间评估下，提出了两种方案：

编写FFT的C算法，测试其性能是否满足。
利用ARM自带的DSP库，测试其性能是否满足。

使用C语言实现算法测试效果

Operation Patform：GD32E232 MCU

FFT Point：32bit

Time Consuming：about 50ms

代码如下：

主要文件fft.c,fft.h,main.c三个文件

//fft.c
#include "fft.h"
#define SWAP(a,b)  tempr=(a);(a)=(b);(b)=tempr

/*
	@brief：Computing complex conjugate
	@param[in]   int n					 The number of complex
	@param[in]   complex in[]            complex of exchange        
	@param[out]  complex out[]           complex of conjugate
*/
/*------------------------------------复数的共轭函数---------------------------------------------------*/
void conjugate_complex(int n, complex in[], complex out[])
{
	int i = 0;
	for (i = 0; i < n; i++)
	{
		out[i].imag = -in[i].imag;
		out[i].real = in[i].real;
	}
}
/*------------------------------------------------------------------------------------------------------*/


/*
	@brief：Calculate the absolute value of complex number
	@param[in]   complex1[]			  complex number of input
	@param[out]  float out[]		  Absolute value of complex number
	@param[out]  int n				  number
*/
/*------------------------------------计算复数的模-----------------------------------------------------*/
void c_abs(complex complex1[], double out[], int n)
{
	int i = 0;
	float Temp;
	for (i = 0; i < n; i++)
	{
		Temp = complex1[i].real * complex1[i].real + complex1[i].imag * complex1[i].imag;
		out[i] = (float)sqrt(Temp);
	}
}
/*------------------------------------------------------------------------------------------------------*/



/*
	@brief：The sum of complex
	@param[in]   complex a			  complex number of input
	@param[in]   complex b		      complex number of input
	@param[out]  complex *c		      result
*/
/*------------------------------------计算复数的和------------------------------------------------------*/
void c_plus(complex a, complex b, complex *c)
{
	c->real = a.real + b.real;
	c->imag = a.imag + b.imag;
}
/*------------------------------------------------------------------------------------------------------*/

   
/*------------------------------------计算复数的差------------------------------------------------------*/
void c_sub(complex a, complex b, complex *c)
{
	c->real = a.real - b.real;
	c->imag = a.imag - b.imag;
}
/*------------------------------------------------------------------------------------------------------*/


/*------------------------------------计算复数的积------------------------------------------------------*/
void c_mul(complex a, complex b, complex *c)
{
	c->real = a.real * b.real - a.imag * b.imag;
	c->imag = a.real * b.imag + a.imag * b.real;
}
/*------------------------------------------------------------------------------------------------------*/

/*------------------------------------计算复数的商------------------------------------------------------*/
void c_div(complex a, complex b, complex *c)
{
	c->real = (a.real * b.real + a.imag * b.imag) / (b.real * b.real + b.imag * b.imag);
	c->imag = (a.imag * b.real - a.real * b.imag) / (b.real * b.real + b.imag * b.imag);
}
/*------------------------------------------------------------------------------------------------------*/

/*------------------------------------------------------------------------------------------------------*/
void Wn_i(int n, int i, complex *Wn, char flag)
{
	Wn->real = cos(2 * PI*i / n);
	if (flag == 1)
		Wn->imag = -sin(2 * PI*i / n);
	else if (flag == 0)
		Wn->imag = -sin(2 * PI*i / n);
}
/*----------------------------------计算傅里叶变换--------------------------------------------------------*/

void fft(int N,complex f[])
{
  complex t,wn;//中间变量
  int i,j,k,m,n,l,r,M;
  int la,lb,lc;
  /*----计算分解的级数M=log2(N)----*/
  for(i=N,M=1;(i=i/2)!=1;M++); 
  /*----按照倒位序重新排列原信号----*/
  for(i=1,j=N/2;i<=N-2;i++)
  {
    if(i<j)
    {
     // t.real=f[j].real;
		t = f[j];
		f[j] = f[i];
		f[i] = t;
    }
    k=N/2;
    while(k<=j)
    {
      j=j-k;
      k=k/2;
    }
    j=j+k;
  }
 
  /*----FFT算法----*/
  for(m=1;m<=M;m++)
  {
    la=pow(2,m); //la=2^m代表第m级每个分组所含节点数		
    lb=la/2;    //lb代表第m级每个分组所含碟形单元数
                 //同时它也表示每个碟形单元上下节点之间的距离
    /*----碟形运算----*/
    for(l=1;l<=lb;l++)
    {
      r=(l-1)*pow(2,M-m);	
      for(n=l-1;n<N-1;n=n+la) //遍历每个分组，分组总数为N/la
      {
        lc=n+lb;  //n,lc分别代表一个碟形单元的上、下节点编号     
        Wn_i(N,r,&wn,1);//wn=Wnr
        c_mul(f[lc],wn,&t);//t = f[lc] * wn复数运算
        c_sub(f[n],t,&(f[lc]));//f[lc] = f[n] - f[lc] * Wnr
        c_plus(f[n],t,&(f[n]));//f[n] = f[n] + f[lc] * Wnr
      }
    }
  }
}
//傅里叶逆变换
void ifft(int N,complex f[])
{
  int i=0;
  conjugate_complex(N,f,f);
  fft(N,f);
  conjugate_complex(N,f,f);
  for(i=0;i<N;i++)
  {
    f[i].imag = (f[i].imag)/N;
    f[i].real = (f[i].real)/N;
  }
}
/*------------------------------------------------------------------------------------------------------*/

//fft.h
#ifndef __FFT_H__
#define __FFT_H__
#include "math.h"
#define PI 3.1415926535897932384626433832795028841971
/*定义一个复数的结构*/
typedef struct  
{
	float real;		//实部
	float imag;		//虚部
}complex;


/*------------------------------------函数列表-------------------------------------------------------*/
void conjugate_complex(int n, complex in[], complex out[]);
void c_abs(complex complex1[], double out[], int n);
void c_plus(complex a, complex b, complex *c);
void c_sub(complex a, complex b, complex *c);
void c_mul(complex a, complex b, complex *c);
void c_div(complex a, complex b, complex *c);
void Wn_i(int n, int i, complex *Wn, char flag);
void fft(int N, complex f[]);

#endif

//main.c
#include  "stdio.h"
#include "fft.h"
#include "math.h"

#define sampleFrequency		 32768    //采样频率
#define samplePoint			 64     //采样点数
#define resolution			(sampleFrequency/samplePoint)   //FFT计算点数
#define LPFPoint             25      //低通的截止点数 resolution*LPFPoint为截止频率
#define AmplitudeThreshold   2      //幅度阈值 实际中需要，过滤噪声,量化噪声

complex  BuffInArray[samplePoint] = { 0 };
double   BuffOutArray[samplePoint/2] = { 0 };
double   MagBuffOutArray[samplePoint / 2] = { 0 };
double   LPFMagBuffOutArray[LPFPoint] = { 0 };

/*----------------------------生成测试数据-----------------------------------*/
void InitBuffInArray() 
{
	unsigned int i;
	double test;
	for (long i = 0; i < samplePoint; i++)
	{
		/*产生测试波形*/
		test = 3 * sin(2 * PI * i * 700 / sampleFrequency) +
			   6 * sin(2* PI* i * 118 / sampleFrequency) +
			   4 * sin(2 * PI*i * 350 / sampleFrequency);
		BuffInArray[i].real = test;
		BuffInArray[i].imag = 0;
	}
}
void GetPowerMag() 
{
	long int i;
	double X, Y, P, Mag;
	c_abs(BuffInArray, BuffOutArray, samplePoint/2);
	for (i = 0; i < samplePoint/2; i++)
	{
		//X = BuffInArray[i].real / samplePoint;
		//Y = BuffInArray[i].imag / samplePoint;
		Mag = BuffOutArray[i] * 2 / samplePoint;
		MagBuffOutArray[i] = Mag;
		//P = atan2(Y, X) * 180 / PI;
		printf("%d          ",i);
		printf("%d          ", resolution*i);
		printf("%lf          ", Mag);
		//printf("%f         ", P);
		//printf("%f          ", X);
		//printf("%f          ", Y);
		printf("\r\n");
	}
}
/*----------------------------计算有效值-----------------------------------*/

void LowPassFilter()
{
	for (long i = 0; i < LPFPoint; i++)
	{  
		LPFMagBuffOutArray[i] = MagBuffOutArray[i];
		printf("%d          ", i);
		printf("%d          ", resolution*i);
		printf("%lf          ", MagBuffOutArray[i]);
		printf("\r\n");
	}
}

double GetEffectiveValue()
{
	double SumMag = 0;
	for (long i = 0; i < LPFPoint; i++) 
	{
		if (LPFMagBuffOutArray[i] - AmplitudeThreshold <= 0.0000001 )
		{
			LPFMagBuffOutArray[i] = 0;
		}
		SumMag += LPFMagBuffOutArray[i];
	}
	return (sqrt(2)/2)*SumMag;
}
int main() 
{   
	InitBuffInArray();
	fft(samplePoint, BuffInArray);
	printf("This  is  FFT Test \r\n");
	printf("点数       频率          幅值    \r\n");
	GetPowerMag();
	printf("Low pass filtering results \r\n");
	printf("点数       频率          幅值    \r\n");
	LowPassFilter();	//低通滤波取出118HZ和350HZ
	printf("Effective value   %lf\r\n", GetEffectiveValue());
	return 0;
}

测试后得到的数据是运行一次算法的时间是50ms，这显然太大，对于实时性要求比较高的场合，这个运行速度完全无法满足应用。在之前的测试中使用ST本身的汇编库的运算时间都是级别的。后期对代码进行了优化，但是优化测试的运行速度是30ms左右。

利用ARM自带的DSP库

在研究ARM自带的DSP库后，发现对于各种系列的内核都有支持，但是对于M23核没有支持，网上也没找到对于ARM算法库对于M23核的移植方法。

参考了【STM32H7的DSP教程】第6章 ARM DSP源码和库移植方法（MDK5的AC5和AC6）（https://blog.csdn.net/Simon223/article/details/105311500）这篇博文的方法，但是报出了几百个错误警告。

按照道理而言，M23核的指令集应该是兼容M0核指令集合，但是报错的进行调试，发现依然是汇编部分存在问题。~~我还是太菜~~。既然无法直接使用，于是我自己读懂代码后进行ARM库代码的移植。由于内存和空间的验证，加上之前FFT的测试经验。并且项目的精度要求考虑，使用了Q15型的定点FFT算法。移植代码如下。

代码由fft.h和fft.c两个文件主要组成。

Operation Patform：GD32E232 MCU

FFT Point：32bit

Time Consuming：about 390us

//fft.h
#ifndef __FFT_H__
#define __FFT_H__
#include "stdint.h"
/*--------------------------------------tyde definition---------------------------------------------------------*/

#define FFT32SamplePoint  	32
#define FFT64SamplePoint  	64
#define FFT128SamplePoint  128
#define FFT256SamplePoint  256

#define FFTPoint      FFT32SamplePoint     //set the sampling points of fft
#define ARMBITREVINDEXTABLE_FIXED_32_TABLE_LENGTH 		((uint16_t)24)
#define ARMBITREVINDEXTABLE_FIXED___32_TABLE_LENGTH 	((uint16_t)24)

#define ARMBITREVINDEXTABLE_FIXED_64_TABLE_LENGTH 		((uint16_t)56)
#define ARMBITREVINDEXTABLE_FIXED___64_TABLE_LENGTH 	((uint16_t)56)
/*-------------------------------------------------------------------------------------------------------------*/

/*--------------------------------------tyde definition---------------------------------------------------------*/
typedef int16_t q15_t;
typedef int32_t q31_t;

typedef struct
{
				uint16_t fftLen;                   /**< length of the FFT. */
	const q15_t *pTwiddle;             		   /**< points to the Twiddle factor table. */
	const uint16_t *pBitRevTable;            /**< points to the bit reversal table. */
				uint16_t bitRevLength;             /**< bit reversal table length. */
	
} arm_cfft_instance_q15; 

typedef struct
{
				uint16_t fftLen;                 /**< length of the FFT. */
				uint8_t ifftFlag;                /**< flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform. */
				uint8_t bitReverseFlag;          /**< flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output. */
	const q15_t *pTwiddle;                 /**< points to the twiddle factor table. */
	const uint16_t *pBitRevTable;          /**< points to the bit reversal table. */
				uint16_t twidCoefModifier;       /**< twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. */
				uint16_t bitRevFactor;           /**< bit reversal modifier that supports different size FFTs with the same bit reversal table. */
} arm_cfft_radix4_instance_q15;
/*-------------------------------------------------------------------------------------------------------------*/

/*-----------------------------------------constant table------------------------------------------------------*/
#if FFTPoint == FFT32SamplePoint
static const q15_t twiddleCoef_32_q15[48] = {
(q15_t)0x7FFF, (q15_t)0x0000,
(q15_t)0x7D8A, (q15_t)0x18F8,
(q15_t)0x7641, (q15_t)0x30FB,
(q15_t)0x6A6D, (q15_t)0x471C,
(q15_t)0x5A82, (q15_t)0x5A82,
(q15_t)0x471C, (q15_t)0x6A6D,
(q15_t)0x30FB, (q15_t)0x7641,
(q15_t)0x18F8, (q15_t)0x7D8A,
(q15_t)0x0000, (q15_t)0x7FFF,
(q15_t)0xE707, (q15_t)0x7D8A,
(q15_t)0xCF04, (q15_t)0x7641,
(q15_t)0xB8E3, (q15_t)0x6A6D,
(q15_t)0xA57D, (q15_t)0x5A82,
(q15_t)0x9592, (q15_t)0x471C,
(q15_t)0x89BE, (q15_t)0x30FB,
(q15_t)0x8275, (q15_t)0x18F8,
(q15_t)0x8000, (q15_t)0x0000,
(q15_t)0x8275, (q15_t)0xE707,
(q15_t)0x89BE, (q15_t)0xCF04,
(q15_t)0x9592, (q15_t)0xB8E3,
(q15_t)0xA57D, (q15_t)0xA57D,
(q15_t)0xB8E3, (q15_t)0x9592,
(q15_t)0xCF04, (q15_t)0x89BE,
(q15_t)0xE707, (q15_t)0x8275
};
#endif
#if FFTPoint == FFT64SamplePoint
static const q15_t twiddleCoef_64_q15[96] = {
	(q15_t)0x7FFF, (q15_t)0x0000, (q15_t)0x7F62, (q15_t)0x0C8B,
	(q15_t)0x7D8A, (q15_t)0x18F8, (q15_t)0x7A7D, (q15_t)0x2528,
	(q15_t)0x7641, (q15_t)0x30FB, (q15_t)0x70E2, (q15_t)0x3C56,
	(q15_t)0x6A6D, (q15_t)0x471C, (q15_t)0x62F2, (q15_t)0x5133,
	(q15_t)0x5A82, (q15_t)0x5A82, (q15_t)0x5133, (q15_t)0x62F2,
	(q15_t)0x471C, (q15_t)0x6A6D, (q15_t)0x3C56, (q15_t)0x70E2,
	(q15_t)0x30FB, (q15_t)0x7641, (q15_t)0x2528, (q15_t)0x7A7D,
	(q15_t)0x18F8, (q15_t)0x7D8A, (q15_t)0x0C8B, (q15_t)0x7F62,
	(q15_t)0x0000, (q15_t)0x7FFF, (q15_t)0xF374, (q15_t)0x7F62,
	(q15_t)0xE707, (q15_t)0x7D8A, (q15_t)0xDAD7, (q15_t)0x7A7D,
	(q15_t)0xCF04, (q15_t)0x7641, (q15_t)0xC3A9, (q15_t)0x70E2,
	(q15_t)0xB8E3, (q15_t)0x6A6D, (q15_t)0xAECC, (q15_t)0x62F2,
	(q15_t)0xA57D, (q15_t)0x5A82, (q15_t)0x9D0D, (q15_t)0x5133,
	(q15_t)0x9592, (q15_t)0x471C, (q15_t)0x8F1D, (q15_t)0x3C56,
	(q15_t)0x89BE, (q15_t)0x30FB, (q15_t)0x8582, (q15_t)0x2528,
	(q15_t)0x8275, (q15_t)0x18F8, (q15_t)0x809D, (q15_t)0x0C8B,
	(q15_t)0x8000, (q15_t)0x0000, (q15_t)0x809D, (q15_t)0xF374,
	(q15_t)0x8275, (q15_t)0xE707, (q15_t)0x8582, (q15_t)0xDAD7,
	(q15_t)0x89BE, (q15_t)0xCF04, (q15_t)0x8F1D, (q15_t)0xC3A9,
	(q15_t)0x9592, (q15_t)0xB8E3, (q15_t)0x9D0D, (q15_t)0xAECC,
	(q15_t)0xA57D, (q15_t)0xA57D, (q15_t)0xAECC, (q15_t)0x9D0D,
	(q15_t)0xB8E3, (q15_t)0x9592, (q15_t)0xC3A9, (q15_t)0x8F1D,
	(q15_t)0xCF04, (q15_t)0x89BE, (q15_t)0xDAD7, (q15_t)0x8582,
	(q15_t)0xE707, (q15_t)0x8275, (q15_t)0xF374, (q15_t)0x809D
};
#endif
#if FFTPoint == 32
static const uint16_t armBitRevIndexTable_fixed_32[ARMBITREVINDEXTABLE_FIXED_32_TABLE_LENGTH] =
{
   /* 4x2, size 24 */
   8,128, 16,64, 24,192, 40,160, 48,96, 56,224, 72,144,
   88,208, 104,176, 120,240, 152,200, 184,232
};
#endif
#if FFTPoint == FFT64SamplePoint
static const uint16_t armBitRevIndexTable_fixed_64[ARMBITREVINDEXTABLE_FIXED_64_TABLE_LENGTH] =
{
   /* radix 4, size 56 */
   8,256, 16,128, 24,384, 32,64, 40,320, 48,192, 56,448, 72,288, 80,160, 88,416, 104,352,
   112,224, 120,480, 136,272, 152,400, 168,336, 176,208, 184,464, 200,304, 216,432,
   232,368, 248,496, 280,392, 296,328, 312,456, 344,424, 376,488, 440,472
};
#endif
/*-----------------------------------------------------------------------------------------------------------------*/

/*-----------------------------------function list------------------------------------------------------------------*/


void arm_cfft_q15(
								const arm_cfft_instance_q15 * S,
											q15_t * p1,
											uint8_t ifftFlag,
											uint8_t bitReverseFlag);

void arm_radix4_butterfly_q15(
											q15_t * pSrc16,
											uint32_t fftLen,
									const q15_t * pCoef16,
											uint32_t twidCoefModifier);

void arm_radix4_butterfly_inverse_q15(
											q15_t * pSrc16,
											uint32_t fftLen,
								const q15_t * pCoef16,
											uint32_t twidCoefModifier);

void arm_bitreversal_q15(
											q15_t * pSrc,
											uint32_t fftLen,
											uint16_t bitRevFactor,
								const uint16_t * pBitRevTab);

#endif

//fft.c
#include "fft.h"

/* ----------------------------------------------------------------------
 * Project:      FFT
 * Title:        arm_cfft_radix4_q15.c
 * Description:  This file has function definition of Radix-4 FFT & IFFT function and
 *               In-place bit reversal using bit reversal table
 *
 * $Revision:    V1.6.0
 *
 * Target Processor: Cortex-M cores
 * -------------------------------------------------------------------- */


void arm_bitreversal_16(
        uint16_t *pSrc,
  const uint16_t bitRevLen,
  const uint16_t *pBitRevTab)
{
  uint16_t a, b, i, tmp;

  for (i = 0; i < bitRevLen; )
  {
     a = pBitRevTab[i    ] >> 2;
     b = pBitRevTab[i + 1] >> 2;

     //real
     tmp = pSrc[a];
     pSrc[a] = pSrc[b];
     pSrc[b] = tmp;

     //complex
     tmp = pSrc[a+1];
     pSrc[a+1] = pSrc[b+1];
     pSrc[b+1] = tmp;

    i += 2;
  }
}

void arm_bitreversal_q15(
        q15_t * pSrc16,
        uint32_t fftLen,
        uint16_t bitRevFactor,
  const uint16_t * pBitRevTab)
{
   q31_t *pSrc = (q31_t *) pSrc16;
   q31_t in;
   uint32_t fftLenBy2, fftLenBy2p1;
   uint32_t i, j;

   /*  Initializations */
   j = 0U;
   fftLenBy2 = fftLen / 2U;
   fftLenBy2p1 = (fftLen / 2U) + 1U;

   /* Bit Reversal Implementation */
   for (i = 0U; i <= (fftLenBy2 - 2U); i += 2U)
   {
      if (i < j)
      {
         /*  pSrc[i] <-> pSrc[j]; */
         /*  pSrc[i+1U] <-> pSrc[j+1U] */
         in = pSrc[i];
         pSrc[i] = pSrc[j];
         pSrc[j] = in;

         /*  pSrc[i + fftLenBy2p1] <-> pSrc[j + fftLenBy2p1];  */
         /*  pSrc[i + fftLenBy2p1+1U] <-> pSrc[j + fftLenBy2p1+1U] */
         in = pSrc[i + fftLenBy2p1];
         pSrc[i + fftLenBy2p1] = pSrc[j + fftLenBy2p1];
         pSrc[j + fftLenBy2p1] = in;
      }

      /*  pSrc[i+1U] <-> pSrc[j+fftLenBy2];         */
      /*  pSrc[i+2] <-> pSrc[j+fftLenBy2+1U]  */
      in = pSrc[i + 1U];
      pSrc[i + 1U] = pSrc[j + fftLenBy2];
      pSrc[j + fftLenBy2] = in;

      /*  Reading the index for the bit reversal */
      j = *pBitRevTab;

      /*  Updating the bit reversal index depending on the fft length  */
      pBitRevTab += bitRevFactor;
   }
}

int32_t __SSAT(int32_t val, uint32_t sat)
{
  if ((sat >= 1U) && (sat <= 32U))
  {
    const int32_t max = (int32_t)((1U << (sat - 1U)) - 1U);
    const int32_t min = -1 - max ;
    if (val > max)
    {
      return max;
    }
    else if (val < min)
    {
      return min;
    }
  }
  return val;
}

void arm_cfft_radix4_q15(
  const arm_cfft_radix4_instance_q15 * S,
        q15_t * pSrc)
{
  if (S->ifftFlag == 1U)
  {
    /*  Complex IFFT radix-4  */
    arm_radix4_butterfly_inverse_q15(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier);
  }
  else
  {
    /*  Complex FFT radix-4  */
    arm_radix4_butterfly_q15(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier);
  }

  if (S->bitReverseFlag == 1U)
  {
    /*  Bit Reversal */
    arm_bitreversal_q15(pSrc, S->fftLen, S->bitRevFactor, S->pBitRevTable);
  }

}

void arm_radix4_butterfly_q15(
        q15_t * pSrc16,
        uint32_t fftLen,
  const q15_t * pCoef16,
        uint32_t twidCoefModifier)
{

#if defined (ARM_MATH_DSP)

        q31_t R, S, T, U;
        q31_t C1, C2, C3, out1, out2;
        uint32_t n1, n2, ic, i0, j, k;

        q15_t *ptr1;
        q15_t *pSi0;
        q15_t *pSi1;
        q15_t *pSi2;
        q15_t *pSi3;

        q31_t xaya, xbyb, xcyc, xdyd;

  /* Total process is divided into three stages */

  /* process first stage, middle stages, & last stage */

  /*  Initializations for the first stage */
  n2 = fftLen;
  n1 = n2;

  /* n2 = fftLen/4 */
  n2 >>= 2U;

  /* Index for twiddle coefficient */
  ic = 0U;

  /* Index for input read and output write */
  j = n2;

  pSi0 = pSrc16;
  pSi1 = pSi0 + 2 * n2;
  pSi2 = pSi1 + 2 * n2;
  pSi3 = pSi2 + 2 * n2;

  /* Input is in 1.15(q15) format */

  /*  start of first stage process */
  do
  {
    /*  Butterfly implementation */

    /* Reading i0, i0+fftLen/2 inputs */
    /* Read ya (real), xa(imag) input */
    T = read_q15x2 (pSi0);
    T = __SHADD16(T, 0); /* this is just a SIMD arithmetic shift right by 1 */
    T = __SHADD16(T, 0); /* it turns out doing this twice is 2 cycles, the alternative takes 3 cycles */
/*
    in = ((int16_t) (T & 0xFFFF)) >> 2;       // alternative code that takes 3 cycles
     T = ((T >> 2) & 0xFFFF0000) | (in & 0xFFFF);
*/

    /* Read yc (real), xc(imag) input */
    S = read_q15x2 (pSi2);
    S = __SHADD16(S, 0);
    S = __SHADD16(S, 0);

    /* R = packed((ya + yc), (xa + xc) ) */
    R = __QADD16(T, S);

    /* S = packed((ya - yc), (xa - xc) ) */
    S = __QSUB16(T, S);

    /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
    /* Read yb (real), xb(imag) input */
    T = read_q15x2 (pSi1);
    T = __SHADD16(T, 0);
    T = __SHADD16(T, 0);

    /* Read yd (real), xd(imag) input */
    U = read_q15x2 (pSi3);
    U = __SHADD16(U, 0);
    U = __SHADD16(U, 0);

    /* T = packed((yb + yd), (xb + xd) ) */
    T = __QADD16(T, U);

    /*  writing the butterfly processed i0 sample */
    /* xa' = xa + xb + xc + xd */
    /* ya' = ya + yb + yc + yd */
    write_q15x2_ia (&pSi0, __SHADD16(R, T));

    /* R = packed((ya + yc) - (yb + yd), (xa + xc)- (xb + xd)) */
    R = __QSUB16(R, T);

    /* co2 & si2 are read from SIMD Coefficient pointer */
    C2 = read_q15x2 ((q15_t *) pCoef16 + (4U * ic));

#ifndef ARM_MATH_BIG_ENDIAN
    /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
    out1 = __SMUAD(C2, R) >> 16U;
    /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
    out2 = __SMUSDX(C2, R);
#else
    /* xc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
    out1 = __SMUSDX(R, C2) >> 16U;
    /* yc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
    out2 = __SMUAD(C2, R);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

    /*  Reading i0+fftLen/4 */
    /* T = packed(yb, xb) */
    T = read_q15x2 (pSi1);
    T = __SHADD16(T, 0);
    T = __SHADD16(T, 0);

    /* writing the butterfly processed i0 + fftLen/4 sample */
    /* writing output(xc', yc') in little endian format */
    write_q15x2_ia (&pSi1, (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));

    /*  Butterfly calculations */
    /* U = packed(yd, xd) */
    U = read_q15x2 (pSi3);
    U = __SHADD16(U, 0);
    U = __SHADD16(U, 0);

    /* T = packed(yb-yd, xb-xd) */
    T = __QSUB16(T, U);

#ifndef ARM_MATH_BIG_ENDIAN
    /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
    R = __QASX(S, T);
    /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
    S = __QSAX(S, T);
#else
    /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
    R = __QSAX(S, T);
    /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
    S = __QASX(S, T);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

    /* co1 & si1 are read from SIMD Coefficient pointer */
    C1 = read_q15x2 ((q15_t *) pCoef16 + (2U * ic));
    /*  Butterfly process for the i0+fftLen/2 sample */

#ifndef ARM_MATH_BIG_ENDIAN
    /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
    out1 = __SMUAD(C1, S) >> 16U;
    /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
    out2 = __SMUSDX(C1, S);
#else
    /* xb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
    out1 = __SMUSDX(S, C1) >> 16U;
    /* yb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
    out2 = __SMUAD(C1, S);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

    /* writing output(xb', yb') in little endian format */
    write_q15x2_ia (&pSi2, ((out2) & 0xFFFF0000) | ((out1) & 0x0000FFFF));

    /* co3 & si3 are read from SIMD Coefficient pointer */
    C3 = read_q15x2 ((q15_t *) pCoef16 + (6U * ic));
    /*  Butterfly process for the i0+3fftLen/4 sample */

#ifndef ARM_MATH_BIG_ENDIAN
    /* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
    out1 = __SMUAD(C3, R) >> 16U;
    /* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
    out2 = __SMUSDX(C3, R);
#else
    /* xd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
    out1 = __SMUSDX(R, C3) >> 16U;
    /* yd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
    out2 = __SMUAD(C3, R);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

    /* writing output(xd', yd') in little endian format */
    write_q15x2_ia (&pSi3, ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
    /*  Twiddle coefficients index modifier */
    ic = ic + twidCoefModifier;

  } while (--j);
  /* data is in 4.11(q11) format */

  /* end of first stage process */


  /* start of middle stage process */

  /*  Twiddle coefficients index modifier */
  twidCoefModifier <<= 2U;

  /*  Calculation of Middle stage */
  for (k = fftLen / 4U; k > 4U; k >>= 2U)
  {
    /*  Initializations for the middle stage */
    n1 = n2;
    n2 >>= 2U;
    ic = 0U;

    for (j = 0U; j <= (n2 - 1U); j++)
    {
      /*  index calculation for the coefficients */
      C1 = read_q15x2 ((q15_t *) pCoef16 + (2U * ic));
      C2 = read_q15x2 ((q15_t *) pCoef16 + (4U * ic));
      C3 = read_q15x2 ((q15_t *) pCoef16 + (6U * ic));

      /*  Twiddle coefficients index modifier */
      ic = ic + twidCoefModifier;

      pSi0 = pSrc16 + 2 * j;
      pSi1 = pSi0 + 2 * n2;
      pSi2 = pSi1 + 2 * n2;
      pSi3 = pSi2 + 2 * n2;

      /*  Butterfly implementation */
      for (i0 = j; i0 < fftLen; i0 += n1)
      {
        /*  Reading i0, i0+fftLen/2 inputs */
        /* Read ya (real), xa(imag) input */
        T = read_q15x2 (pSi0);

        /* Read yc (real), xc(imag) input */
        S = read_q15x2 (pSi2);

        /* R = packed( (ya + yc), (xa + xc)) */
        R = __QADD16(T, S);

        /* S = packed((ya - yc), (xa - xc)) */
        S = __QSUB16(T, S);

        /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
        /* Read yb (real), xb(imag) input */
        T = read_q15x2 (pSi1);

        /* Read yd (real), xd(imag) input */
        U = read_q15x2 (pSi3);

        /* T = packed( (yb + yd), (xb + xd)) */
        T = __QADD16(T, U);

        /*  writing the butterfly processed i0 sample */

        /* xa' = xa + xb + xc + xd */
        /* ya' = ya + yb + yc + yd */
        out1 = __SHADD16(R, T);
        out1 = __SHADD16(out1, 0);
        write_q15x2 (pSi0, out1);
        pSi0 += 2 * n1;

        /* R = packed( (ya + yc) - (yb + yd), (xa + xc) - (xb + xd)) */
        R = __SHSUB16(R, T);

#ifndef ARM_MATH_BIG_ENDIAN
        /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
        out1 = __SMUAD(C2, R) >> 16U;

        /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
        out2 = __SMUSDX(C2, R);
#else
        /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
        out1 = __SMUSDX(R, C2) >> 16U;

        /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
        out2 = __SMUAD(C2, R);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

        /*  Reading i0+3fftLen/4 */
        /* Read yb (real), xb(imag) input */
        T = read_q15x2 (pSi1);

        /*  writing the butterfly processed i0 + fftLen/4 sample */
        /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
        /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
        write_q15x2 (pSi1, ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
        pSi1 += 2 * n1;

        /*  Butterfly calculations */

        /* Read yd (real), xd(imag) input */
        U = read_q15x2 (pSi3);

        /* T = packed(yb-yd, xb-xd) */
        T = __QSUB16(T, U);

#ifndef ARM_MATH_BIG_ENDIAN
        /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
        R = __SHASX(S, T);

        /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
        S = __SHSAX(S, T);


        /*  Butterfly process for the i0+fftLen/2 sample */
        out1 = __SMUAD(C1, S) >> 16U;
        out2 = __SMUSDX(C1, S);
#else
        /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
        R = __SHSAX(S, T);

        /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
        S = __SHASX(S, T);


        /*  Butterfly process for the i0+fftLen/2 sample */
        out1 = __SMUSDX(S, C1) >> 16U;
        out2 = __SMUAD(C1, S);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

        /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
        /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
        write_q15x2 (pSi2, ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
        pSi2 += 2 * n1;

        /*  Butterfly process for the i0+3fftLen/4 sample */

#ifndef ARM_MATH_BIG_ENDIAN
        out1 = __SMUAD(C3, R) >> 16U;
        out2 = __SMUSDX(C3, R);
#else
        out1 = __SMUSDX(R, C3) >> 16U;
        out2 = __SMUAD(C3, R);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

        /* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
        /* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
        write_q15x2 (pSi3, ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
        pSi3 += 2 * n1;
      }
    }
    /*  Twiddle coefficients index modifier */
    twidCoefModifier <<= 2U;
  }
  /* end of middle stage process */


  /* data is in 10.6(q6) format for the 1024 point */
  /* data is in 8.8(q8) format for the 256 point */
  /* data is in 6.10(q10) format for the 64 point */
  /* data is in 4.12(q12) format for the 16 point */

  /*  Initializations for the last stage */
  j = fftLen >> 2;

  ptr1 = &pSrc16[0];

  /* start of last stage process */

  /*  Butterfly implementation */
  do
  {
    /* Read xa (real), ya(imag) input */
    xaya = read_q15x2_ia ((q15_t **) &ptr1);

    /* Read xb (real), yb(imag) input */
    xbyb = read_q15x2_ia ((q15_t **) &ptr1);

    /* Read xc (real), yc(imag) input */
    xcyc = read_q15x2_ia ((q15_t **) &ptr1);

    /* Read xd (real), yd(imag) input */
    xdyd = read_q15x2_ia ((q15_t **) &ptr1);

    /* R = packed((ya + yc), (xa + xc)) */
    R = __QADD16(xaya, xcyc);

    /* T = packed((yb + yd), (xb + xd)) */
    T = __QADD16(xbyb, xdyd);

    /* pointer updation for writing */
    ptr1 = ptr1 - 8U;


    /* xa' = xa + xb + xc + xd */
    /* ya' = ya + yb + yc + yd */
    write_q15x2_ia (&ptr1, __SHADD16(R, T));

    /* T = packed((yb + yd), (xb + xd)) */
    T = __QADD16(xbyb, xdyd);

    /* xc' = (xa-xb+xc-xd) */
    /* yc' = (ya-yb+yc-yd) */
    write_q15x2_ia (&ptr1, __SHSUB16(R, T));

    /* S = packed((ya - yc), (xa - xc)) */
    S = __QSUB16(xaya, xcyc);

    /* Read yd (real), xd(imag) input */
    /* T = packed( (yb - yd), (xb - xd))  */
    U = __QSUB16(xbyb, xdyd);

#ifndef ARM_MATH_BIG_ENDIAN
    /* xb' = (xa+yb-xc-yd) */
    /* yb' = (ya-xb-yc+xd) */
    write_q15x2_ia (&ptr1, __SHSAX(S, U));

    /* xd' = (xa-yb-xc+yd) */
    /* yd' = (ya+xb-yc-xd) */
    write_q15x2_ia (&ptr1, __SHASX(S, U));
#else
    /* xb' = (xa+yb-xc-yd) */
    /* yb' = (ya-xb-yc+xd) */
    write_q15x2_ia (&ptr1, __SHASX(S, U));

    /* xd' = (xa-yb-xc+yd) */
    /* yd' = (ya+xb-yc-xd) */
    write_q15x2_ia (&ptr1, __SHSAX(S, U));
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

  } while (--j);

  /* end of last stage process */

  /* output is in 11.5(q5) format for the 1024 point */
  /* output is in 9.7(q7) format for the 256 point   */
  /* output is in 7.9(q9) format for the 64 point  */
  /* output is in 5.11(q11) format for the 16 point  */


#else /* #if defined (ARM_MATH_DSP) */

        q15_t R0, R1, S0, S1, T0, T1, U0, U1;
        q15_t Co1, Si1, Co2, Si2, Co3, Si3, out1, out2;
        uint32_t n1, n2, ic, i0, i1, i2, i3, j, k;

  /* Total process is divided into three stages */

  /* process first stage, middle stages, & last stage */

  /*  Initializations for the first stage */
  n2 = fftLen;
  n1 = n2;

  /* n2 = fftLen/4 */
  n2 >>= 2U;

  /* Index for twiddle coefficient */
  ic = 0U;

  /* Index for input read and output write */
  i0 = 0U;
  j = n2;

  /* Input is in 1.15(q15) format */

  /*  start of first stage process */
  do
  {
    /*  Butterfly implementation */

    /*  index calculation for the input as, */
    /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
    i1 = i0 + n2;
    i2 = i1 + n2;
    i3 = i2 + n2;

    /*  Reading i0, i0+fftLen/2 inputs */

    /* input is down scale by 4 to avoid overflow */
    /* Read ya (real), xa(imag) input */
    T0 = pSrc16[i0 * 2U] >> 2U;
    T1 = pSrc16[(i0 * 2U) + 1U] >> 2U;

    /* input is down scale by 4 to avoid overflow */
    /* Read yc (real), xc(imag) input */
    S0 = pSrc16[i2 * 2U] >> 2U;
    S1 = pSrc16[(i2 * 2U) + 1U] >> 2U;

    /* R0 = (ya + yc) */
    R0 = __SSAT(T0 + S0, 16U);
    /* R1 = (xa + xc) */
    R1 = __SSAT(T1 + S1, 16U);

    /* S0 = (ya - yc) */
    S0 = __SSAT(T0 - S0, 16);
    /* S1 = (xa - xc) */
    S1 = __SSAT(T1 - S1, 16);

    /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
    /* input is down scale by 4 to avoid overflow */
    /* Read yb (real), xb(imag) input */
    T0 = pSrc16[i1 * 2U] >> 2U;
    T1 = pSrc16[(i1 * 2U) + 1U] >> 2U;

    /* input is down scale by 4 to avoid overflow */
    /* Read yd (real), xd(imag) input */
    U0 = pSrc16[i3 * 2U] >> 2U;
    U1 = pSrc16[(i3 * 2U) + 1] >> 2U;

    /* T0 = (yb + yd) */
    T0 = __SSAT(T0 + U0, 16U);
    /* T1 = (xb + xd) */
    T1 = __SSAT(T1 + U1, 16U);

    /*  writing the butterfly processed i0 sample */
    /* ya' = ya + yb + yc + yd */
    /* xa' = xa + xb + xc + xd */
    pSrc16[i0 * 2U] = (R0 >> 1U) + (T0 >> 1U);
    pSrc16[(i0 * 2U) + 1U] = (R1 >> 1U) + (T1 >> 1U);

    /* R0 = (ya + yc) - (yb + yd) */
    /* R1 = (xa + xc) - (xb + xd) */
    R0 = __SSAT(R0 - T0, 16U);
    R1 = __SSAT(R1 - T1, 16U);

    /* co2 & si2 are read from Coefficient pointer */
    Co2 = pCoef16[2U * ic * 2U];
    Si2 = pCoef16[(2U * ic * 2U) + 1];

    /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
    out1 = (q15_t) ((Co2 * R0 + Si2 * R1) >> 16U);
    /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
    out2 = (q15_t) ((-Si2 * R0 + Co2 * R1) >> 16U);

    /*  Reading i0+fftLen/4 */
    /* input is down scale by 4 to avoid overflow */
    /* T0 = yb, T1 =  xb */
    T0 = pSrc16[i1 * 2U] >> 2;
    T1 = pSrc16[(i1 * 2U) + 1] >> 2;

    /* writing the butterfly processed i0 + fftLen/4 sample */
    /* writing output(xc', yc') in little endian format */
    pSrc16[i1 * 2U] = out1;
    pSrc16[(i1 * 2U) + 1] = out2;

    /*  Butterfly calculations */
    /* input is down scale by 4 to avoid overflow */
    /* U0 = yd, U1 = xd */
    U0 = pSrc16[i3 * 2U] >> 2;
    U1 = pSrc16[(i3 * 2U) + 1] >> 2;
    /* T0 = yb-yd */
    T0 = __SSAT(T0 - U0, 16);
    /* T1 = xb-xd */
    T1 = __SSAT(T1 - U1, 16);

    /* R1 = (ya-yc) + (xb- xd),  R0 = (xa-xc) - (yb-yd)) */
    R0 = (q15_t) __SSAT((q31_t) (S0 - T1), 16);
    R1 = (q15_t) __SSAT((q31_t) (S1 + T0), 16);

    /* S1 = (ya-yc) - (xb- xd), S0 = (xa-xc) + (yb-yd)) */
    S0 = (q15_t) __SSAT(((q31_t) S0 + T1), 16U);
    S1 = (q15_t) __SSAT(((q31_t) S1 - T0), 16U);

    /* co1 & si1 are read from Coefficient pointer */
    Co1 = pCoef16[ic * 2U];
    Si1 = pCoef16[(ic * 2U) + 1];
    /*  Butterfly process for the i0+fftLen/2 sample */
    /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
    out1 = (q15_t) ((Si1 * S1 + Co1 * S0) >> 16);
    /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
    out2 = (q15_t) ((-Si1 * S0 + Co1 * S1) >> 16);

    /* writing output(xb', yb') in little endian format */
    pSrc16[i2 * 2U] = out1;
    pSrc16[(i2 * 2U) + 1] = out2;

    /* Co3 & si3 are read from Coefficient pointer */
    Co3 = pCoef16[3U * (ic * 2U)];
    Si3 = pCoef16[(3U * (ic * 2U)) + 1];
    /*  Butterfly process for the i0+3fftLen/4 sample */
    /* xd' = (xa-yb-xc+yd)* Co3 + (ya+xb-yc-xd)* (si3) */
    out1 = (q15_t) ((Si3 * R1 + Co3 * R0) >> 16U);
    /* yd' = (ya+xb-yc-xd)* Co3 - (xa-yb-xc+yd)* (si3) */
    out2 = (q15_t) ((-Si3 * R0 + Co3 * R1) >> 16U);
    /* writing output(xd', yd') in little endian format */
    pSrc16[i3 * 2U] = out1;
    pSrc16[(i3 * 2U) + 1] = out2;

    /*  Twiddle coefficients index modifier */
    ic = ic + twidCoefModifier;

    /*  Updating input index */
    i0 = i0 + 1U;

  } while (--j);
  /* data is in 4.11(q11) format */

  /* end of first stage process */


  /* start of middle stage process */

  /*  Twiddle coefficients index modifier */
  twidCoefModifier <<= 2U;

  /*  Calculation of Middle stage */
  for (k = fftLen / 4U; k > 4U; k >>= 2U)
  {
    /*  Initializations for the middle stage */
    n1 = n2;
    n2 >>= 2U;
    ic = 0U;

    for (j = 0U; j <= (n2 - 1U); j++)
    {
      /*  index calculation for the coefficients */
      Co1 = pCoef16[ic * 2U];
      Si1 = pCoef16[(ic * 2U) + 1U];
      Co2 = pCoef16[2U * (ic * 2U)];
      Si2 = pCoef16[(2U * (ic * 2U)) + 1U];
      Co3 = pCoef16[3U * (ic * 2U)];
      Si3 = pCoef16[(3U * (ic * 2U)) + 1U];

      /*  Twiddle coefficients index modifier */
      ic = ic + twidCoefModifier;

      /*  Butterfly implementation */
      for (i0 = j; i0 < fftLen; i0 += n1)
      {
        /*  index calculation for the input as, */
        /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
        i1 = i0 + n2;
        i2 = i1 + n2;
        i3 = i2 + n2;

        /*  Reading i0, i0+fftLen/2 inputs */
        /* Read ya (real), xa(imag) input */
        T0 = pSrc16[i0 * 2U];
        T1 = pSrc16[(i0 * 2U) + 1U];

        /* Read yc (real), xc(imag) input */
        S0 = pSrc16[i2 * 2U];
        S1 = pSrc16[(i2 * 2U) + 1U];

        /* R0 = (ya + yc), R1 = (xa + xc) */
        R0 = __SSAT(T0 + S0, 16);
        R1 = __SSAT(T1 + S1, 16);

        /* S0 = (ya - yc), S1 =(xa - xc) */
        S0 = __SSAT(T0 - S0, 16);
        S1 = __SSAT(T1 - S1, 16);

        /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
        /* Read yb (real), xb(imag) input */
        T0 = pSrc16[i1 * 2U];
        T1 = pSrc16[(i1 * 2U) + 1U];

        /* Read yd (real), xd(imag) input */
        U0 = pSrc16[i3 * 2U];
        U1 = pSrc16[(i3 * 2U) + 1U];


        /* T0 = (yb + yd), T1 = (xb + xd) */
        T0 = __SSAT(T0 + U0, 16);
        T1 = __SSAT(T1 + U1, 16);

        /*  writing the butterfly processed i0 sample */

        /* xa' = xa + xb + xc + xd */
        /* ya' = ya + yb + yc + yd */
        out1 = ((R0 >> 1U) + (T0 >> 1U)) >> 1U;
        out2 = ((R1 >> 1U) + (T1 >> 1U)) >> 1U;

        pSrc16[i0 * 2U] = out1;
        pSrc16[(2U * i0) + 1U] = out2;

        /* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */
        R0 = (R0 >> 1U) - (T0 >> 1U);
        R1 = (R1 >> 1U) - (T1 >> 1U);

        /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
        out1 = (q15_t) ((Co2 * R0 + Si2 * R1) >> 16U);

        /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
        out2 = (q15_t) ((-Si2 * R0 + Co2 * R1) >> 16U);

        /*  Reading i0+3fftLen/4 */
        /* Read yb (real), xb(imag) input */
        T0 = pSrc16[i1 * 2U];
        T1 = pSrc16[(i1 * 2U) + 1U];

        /*  writing the butterfly processed i0 + fftLen/4 sample */
        /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
        /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
        pSrc16[i1 * 2U] = out1;
        pSrc16[(i1 * 2U) + 1U] = out2;

        /*  Butterfly calculations */

        /* Read yd (real), xd(imag) input */
        U0 = pSrc16[i3 * 2U];
        U1 = pSrc16[(i3 * 2U) + 1U];

        /* T0 = yb-yd, T1 = xb-xd */
        T0 = __SSAT(T0 - U0, 16);
        T1 = __SSAT(T1 - U1, 16);

        /* R0 = (ya-yc) + (xb- xd), R1 = (xa-xc) - (yb-yd)) */
        R0 = (S0 >> 1U) - (T1 >> 1U);
        R1 = (S1 >> 1U) + (T0 >> 1U);

        /* S0 = (ya-yc) - (xb- xd), S1 = (xa-xc) + (yb-yd)) */
        S0 = (S0 >> 1U) + (T1 >> 1U);
        S1 = (S1 >> 1U) - (T0 >> 1U);

        /*  Butterfly process for the i0+fftLen/2 sample */
        out1 = (q15_t) ((Co1 * S0 + Si1 * S1) >> 16U);

        out2 = (q15_t) ((-Si1 * S0 + Co1 * S1) >> 16U);

        /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
        /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
        pSrc16[i2 * 2U] = out1;
        pSrc16[(i2 * 2U) + 1U] = out2;

        /*  Butterfly process for the i0+3fftLen/4 sample */
        out1 = (q15_t) ((Si3 * R1 + Co3 * R0) >> 16U);

        out2 = (q15_t) ((-Si3 * R0 + Co3 * R1) >> 16U);
        /* xd' = (xa-yb-xc+yd)* Co3 + (ya+xb-yc-xd)* (si3) */
        /* yd' = (ya+xb-yc-xd)* Co3 - (xa-yb-xc+yd)* (si3) */
        pSrc16[i3 * 2U] = out1;
        pSrc16[(i3 * 2U) + 1U] = out2;
      }
    }
    /*  Twiddle coefficients index modifier */
    twidCoefModifier <<= 2U;
  }
  /* end of middle stage process */


  /* data is in 10.6(q6) format for the 1024 point */
  /* data is in 8.8(q8) format for the 256 point */
  /* data is in 6.10(q10) format for the 64 point */
  /* data is in 4.12(q12) format for the 16 point */

  /*  Initializations for the last stage */
  n1 = n2;
  n2 >>= 2U;

  /* start of last stage process */

  /*  Butterfly implementation */
  for (i0 = 0U; i0 <= (fftLen - n1); i0 += n1)
  {
    /*  index calculation for the input as, */
    /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
    i1 = i0 + n2;
    i2 = i1 + n2;
    i3 = i2 + n2;

    /*  Reading i0, i0+fftLen/2 inputs */
    /* Read ya (real), xa(imag) input */
    T0 = pSrc16[i0 * 2U];
    T1 = pSrc16[(i0 * 2U) + 1U];

    /* Read yc (real), xc(imag) input */
    S0 = pSrc16[i2 * 2U];
    S1 = pSrc16[(i2 * 2U) + 1U];

    /* R0 = (ya + yc), R1 = (xa + xc) */
    R0 = __SSAT(T0 + S0, 16U);
    R1 = __SSAT(T1 + S1, 16U);

    /* S0 = (ya - yc), S1 = (xa - xc) */
    S0 = __SSAT(T0 - S0, 16U);
    S1 = __SSAT(T1 - S1, 16U);

    /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
    /* Read yb (real), xb(imag) input */
    T0 = pSrc16[i1 * 2U];
    T1 = pSrc16[(i1 * 2U) + 1U];
    /* Read yd (real), xd(imag) input */
    U0 = pSrc16[i3 * 2U];
    U1 = pSrc16[(i3 * 2U) + 1U];

    /* T0 = (yb + yd), T1 = (xb + xd)) */
    T0 = __SSAT(T0 + U0, 16U);
    T1 = __SSAT(T1 + U1, 16U);

    /*  writing the butterfly processed i0 sample */
    /* xa' = xa + xb + xc + xd */
    /* ya' = ya + yb + yc + yd */
    pSrc16[i0 * 2U] = (R0 >> 1U) + (T0 >> 1U);
    pSrc16[(i0 * 2U) + 1U] = (R1 >> 1U) + (T1 >> 1U);

    /* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */
    R0 = (R0 >> 1U) - (T0 >> 1U);
    R1 = (R1 >> 1U) - (T1 >> 1U);
    /* Read yb (real), xb(imag) input */
    T0 = pSrc16[i1 * 2U];
    T1 = pSrc16[(i1 * 2U) + 1U];

    /*  writing the butterfly processed i0 + fftLen/4 sample */
    /* xc' = (xa-xb+xc-xd) */
    /* yc' = (ya-yb+yc-yd) */
    pSrc16[i1 * 2U] = R0;
    pSrc16[(i1 * 2U) + 1U] = R1;

    /* Read yd (real), xd(imag) input */
    U0 = pSrc16[i3 * 2U];
    U1 = pSrc16[(i3 * 2U) + 1U];
    /* T0 = (yb - yd), T1 = (xb - xd)  */
    T0 = __SSAT(T0 - U0, 16U);
    T1 = __SSAT(T1 - U1, 16U);

    /*  writing the butterfly processed i0 + fftLen/2 sample */
    /* xb' = (xa+yb-xc-yd) */
    /* yb' = (ya-xb-yc+xd) */
    pSrc16[i2 * 2U] = (S0 >> 1U) + (T1 >> 1U);
    pSrc16[(i2 * 2U) + 1U] = (S1 >> 1U) - (T0 >> 1U);

    /*  writing the butterfly processed i0 + 3fftLen/4 sample */
    /* xd' = (xa-yb-xc+yd) */
    /* yd' = (ya+xb-yc-xd) */
    pSrc16[i3 * 2U] = (S0 >> 1U) - (T1 >> 1U);
    pSrc16[(i3 * 2U) + 1U] = (S1 >> 1U) + (T0 >> 1U);

  }

  /* end of last stage process */

  /* output is in 11.5(q5) format for the 1024 point */
  /* output is in 9.7(q7) format for the 256 point   */
  /* output is in 7.9(q9) format for the 64 point  */
  /* output is in 5.11(q11) format for the 16 point  */

#endif /* #if defined (ARM_MATH_DSP) */

}


/**
  @brief         Core function for the Q15 CIFFT butterfly process.
  @param[in,out] pSrc16           points to the in-place buffer of Q15 data type
  @param[in]     fftLen           length of the FFT
  @param[in]     pCoef16          points to twiddle coefficient buffer
  @param[in]     twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
  @return        none
 */

/*
 * Radix-4 IFFT algorithm used is :
 *
 * CIFFT uses same twiddle coefficients as CFFT function
 *  x[k] = x[n] + (j)k * x[n + fftLen/4] + (-1)k * x[n+fftLen/2] + (-j)k * x[n+3*fftLen/4]
 *
 *
 * IFFT is implemented with following changes in equations from FFT
 *
 * Input real and imaginary data:
 * x(n) = xa + j * ya
 * x(n+N/4 ) = xb + j * yb
 * x(n+N/2 ) = xc + j * yc
 * x(n+3N 4) = xd + j * yd
 *
 *
 * Output real and imaginary data:
 * x(4r) = xa'+ j * ya'
 * x(4r+1) = xb'+ j * yb'
 * x(4r+2) = xc'+ j * yc'
 * x(4r+3) = xd'+ j * yd'
 *
 *
 * Twiddle factors for radix-4 IFFT:
 * Wn = co1 + j * (si1)
 * W2n = co2 + j * (si2)
 * W3n = co3 + j * (si3)
 
 * The real and imaginary output values for the radix-4 butterfly are
 * xa' = xa + xb + xc + xd
 * ya' = ya + yb + yc + yd
 * xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1)
 * yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1)
 * xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2)
 * yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2)
 * xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3)
 * yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3)
 *
 */

void arm_radix4_butterfly_inverse_q15(
        q15_t * pSrc16,
        uint32_t fftLen,
  const q15_t * pCoef16,
        uint32_t twidCoefModifier)
{

#if defined (ARM_MATH_DSP)

        q31_t R, S, T, U;
        q31_t C1, C2, C3, out1, out2;
        uint32_t n1, n2, ic, i0, j, k;
        
        q15_t *ptr1;
        q15_t *pSi0;
        q15_t *pSi1;
        q15_t *pSi2;
        q15_t *pSi3;
        
        q31_t xaya, xbyb, xcyc, xdyd;

  /* Total process is divided into three stages */

  /* process first stage, middle stages, & last stage */

  /*  Initializations for the first stage */
  n2 = fftLen;
  n1 = n2;

  /* n2 = fftLen/4 */
  n2 >>= 2U;

  /* Index for twiddle coefficient */
  ic = 0U;

  /* Index for input read and output write */
  j = n2;

  pSi0 = pSrc16;
  pSi1 = pSi0 + 2 * n2;
  pSi2 = pSi1 + 2 * n2;
  pSi3 = pSi2 + 2 * n2;

  /* Input is in 1.15(q15) format */

  /*  start of first stage process */
  do
  {
    /*  Butterfly implementation */

    /*  Reading i0, i0+fftLen/2 inputs */
    /* Read ya (real), xa(imag) input */
    T = read_q15x2 (pSi0);
    T = __SHADD16(T, 0);
    T = __SHADD16(T, 0);

    /* Read yc (real), xc(imag) input */
    S = read_q15x2 (pSi2);
    S = __SHADD16(S, 0);
    S = __SHADD16(S, 0);

    /* R = packed((ya + yc), (xa + xc) ) */
    R = __QADD16(T, S);

    /* S = packed((ya - yc), (xa - xc) ) */
    S = __QSUB16(T, S);

    /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
    /* Read yb (real), xb(imag) input */
    T = read_q15x2 (pSi1);
    T = __SHADD16(T, 0);
    T = __SHADD16(T, 0);

    /* Read yd (real), xd(imag) input */
    U = read_q15x2 (pSi3);
    U = __SHADD16(U, 0);
    U = __SHADD16(U, 0);

    /* T = packed((yb + yd), (xb + xd) ) */
    T = __QADD16(T, U);

    /*  writing the butterfly processed i0 sample */
    /* xa' = xa + xb + xc + xd */
    /* ya' = ya + yb + yc + yd */
    write_q15x2_ia (&pSi0, __SHADD16(R, T));

    /* R = packed((ya + yc) - (yb + yd), (xa + xc)- (xb + xd)) */
    R = __QSUB16(R, T);

    /* co2 & si2 are read from SIMD Coefficient pointer */
    C2 = read_q15x2 ((q15_t *) pCoef16 + (4U * ic));

#ifndef ARM_MATH_BIG_ENDIAN
    /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
    out1 = __SMUSD(C2, R) >> 16U;
    /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
    out2 = __SMUADX(C2, R);
#else
    /* xc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
    out1 = __SMUADX(C2, R) >> 16U;
    /* yc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
    out2 = __SMUSD(__QSUB16(0, C2), R);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

    /*  Reading i0+fftLen/4 */
    /* T = packed(yb, xb) */
    T = read_q15x2 (pSi1);
    T = __SHADD16(T, 0);
    T = __SHADD16(T, 0);

    /* writing the butterfly processed i0 + fftLen/4 sample */
    /* writing output(xc', yc') in little endian format */
    write_q15x2_ia (&pSi1, (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));

    /*  Butterfly calculations */
    /* U = packed(yd, xd) */
    U = read_q15x2 (pSi3);
    U = __SHADD16(U, 0);
    U = __SHADD16(U, 0);

    /* T = packed(yb-yd, xb-xd) */
    T = __QSUB16(T, U);

#ifndef ARM_MATH_BIG_ENDIAN
    /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
    R = __QSAX(S, T);
    /* S = packed((ya-yc) + (xb- xd),  (xa-xc) - (yb-yd)) */
    S = __QASX(S, T);
#else
    /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
    R = __QASX(S, T);
    /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
    S = __QSAX(S, T);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

    /* co1 & si1 are read from SIMD Coefficient pointer */
    C1 = read_q15x2 ((q15_t *) pCoef16 + (2U * ic));
    /*  Butterfly process for the i0+fftLen/2 sample */

#ifndef ARM_MATH_BIG_ENDIAN
    /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
    out1 = __SMUSD(C1, S) >> 16U;
    /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
    out2 = __SMUADX(C1, S);
#else
    /* xb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
    out1 = __SMUADX(C1, S) >> 16U;
    /* yb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
    out2 = __SMUSD(__QSUB16(0, C1), S);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

    /* writing output(xb', yb') in little endian format */
    write_q15x2_ia (&pSi2, ((out2) & 0xFFFF0000) | ((out1) & 0x0000FFFF));

    /* co3 & si3 are read from SIMD Coefficient pointer */
    C3 = read_q15x2 ((q15_t *) pCoef16 + (6U * ic));
    /*  Butterfly process for the i0+3fftLen/4 sample */

#ifndef ARM_MATH_BIG_ENDIAN
    /* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
    out1 = __SMUSD(C3, R) >> 16U;
    /* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
    out2 = __SMUADX(C3, R);
#else
    /* xd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
    out1 = __SMUADX(C3, R) >> 16U;
    /* yd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
    out2 = __SMUSD(__QSUB16(0, C3), R);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

    /* writing output(xd', yd') in little endian format */
    write_q15x2_ia (&pSi3, ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));

    /*  Twiddle coefficients index modifier */
    ic = ic + twidCoefModifier;

  } while (--j);
  /* data is in 4.11(q11) format */

  /* end of first stage process */


  /* start of middle stage process */

  /*  Twiddle coefficients index modifier */
  twidCoefModifier <<= 2U;

  /*  Calculation of Middle stage */
  for (k = fftLen / 4U; k > 4U; k >>= 2U)
  {
    /*  Initializations for the middle stage */
    n1 = n2;
    n2 >>= 2U;
    ic = 0U;

    for (j = 0U; j <= (n2 - 1U); j++)
    {
      /*  index calculation for the coefficients */
      C1 = read_q15x2 ((q15_t *) pCoef16 + (2U * ic));
      C2 = read_q15x2 ((q15_t *) pCoef16 + (4U * ic));
      C3 = read_q15x2 ((q15_t *) pCoef16 + (6U * ic));

      /*  Twiddle coefficients index modifier */
      ic = ic + twidCoefModifier;

      pSi0 = pSrc16 + 2 * j;
      pSi1 = pSi0 + 2 * n2;
      pSi2 = pSi1 + 2 * n2;
      pSi3 = pSi2 + 2 * n2;

      /*  Butterfly implementation */
      for (i0 = j; i0 < fftLen; i0 += n1)
      {
        /*  Reading i0, i0+fftLen/2 inputs */
        /* Read ya (real), xa(imag) input */
        T = read_q15x2 (pSi0);

        /* Read yc (real), xc(imag) input */
        S = read_q15x2 (pSi2);

        /* R = packed( (ya + yc), (xa + xc)) */
        R = __QADD16(T, S);

        /* S = packed((ya - yc), (xa - xc)) */
        S = __QSUB16(T, S);

        /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
        /* Read yb (real), xb(imag) input */
        T = read_q15x2 (pSi1);

        /* Read yd (real), xd(imag) input */
        U = read_q15x2 (pSi3);

        /* T = packed( (yb + yd), (xb + xd)) */
        T = __QADD16(T, U);

        /*  writing the butterfly processed i0 sample */

        /* xa' = xa + xb + xc + xd */
        /* ya' = ya + yb + yc + yd */
        out1 = __SHADD16(R, T);
        out1 = __SHADD16(out1, 0);
        write_q15x2 (pSi0, out1);
        pSi0 += 2 * n1;

        /* R = packed( (ya + yc) - (yb + yd), (xa + xc) - (xb + xd)) */
        R = __SHSUB16(R, T);

#ifndef ARM_MATH_BIG_ENDIAN
        /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
        out1 = __SMUSD(C2, R) >> 16U;

        /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
        out2 = __SMUADX(C2, R);
#else
        /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
        out1 = __SMUADX(R, C2) >> 16U;

        /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
        out2 = __SMUSD(__QSUB16(0, C2), R);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

        /*  Reading i0+3fftLen/4 */
        /* Read yb (real), xb(imag) input */
        T = read_q15x2 (pSi1);

        /*  writing the butterfly processed i0 + fftLen/4 sample */
        /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
        /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
        write_q15x2 (pSi1, ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
        pSi1 += 2 * n1;

        /*  Butterfly calculations */

        /* Read yd (real), xd(imag) input */
        U = read_q15x2 (pSi3);

        /* T = packed(yb-yd, xb-xd) */
        T = __QSUB16(T, U);

#ifndef ARM_MATH_BIG_ENDIAN
        /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
        R = __SHSAX(S, T);
        /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
        S = __SHASX(S, T);
        /*  Butterfly process for the i0+fftLen/2 sample */
        out1 = __SMUSD(C1, S) >> 16U;
        out2 = __SMUADX(C1, S);
#else
        /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
        R = __SHASX(S, T);

        /* S = packed((ya-yc) - (xb- xd),  (xa-xc) + (yb-yd)) */
        S = __SHSAX(S, T);

        /*  Butterfly process for the i0+fftLen/2 sample */
        out1 = __SMUADX(S, C1) >> 16U;
        out2 = __SMUSD(__QSUB16(0, C1), S);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

        /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
        /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
        write_q15x2 (pSi2, ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
        pSi2 += 2 * n1;

        /*  Butterfly process for the i0+3fftLen/4 sample */

#ifndef ARM_MATH_BIG_ENDIAN
        out1 = __SMUSD(C3, R) >> 16U;
        out2 = __SMUADX(C3, R);
#else
        out1 = __SMUADX(C3, R) >> 16U;
        out2 = __SMUSD(__QSUB16(0, C3), R);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

        /* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
        /* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
        write_q15x2 (pSi3, ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
        pSi3 += 2 * n1;
      }
    }
    /*  Twiddle coefficients index modifier */
    twidCoefModifier <<= 2U;
  }
  /* end of middle stage process */

  /* data is in 10.6(q6) format for the 1024 point */
  /* data is in 8.8(q8) format for the 256 point */
  /* data is in 6.10(q10) format for the 64 point */
  /* data is in 4.12(q12) format for the 16 point */

  /*  Initializations for the last stage */
  j = fftLen >> 2;

  ptr1 = &pSrc16[0];

  /* start of last stage process */

  /*  Butterfly implementation */
  do
  {
    /* Read xa (real), ya(imag) input */
    xaya = read_q15x2_ia ((q15_t **) &ptr1);

    /* Read xb (real), yb(imag) input */
    xbyb = read_q15x2_ia ((q15_t **) &ptr1);

    /* Read xc (real), yc(imag) input */
    xcyc = read_q15x2_ia ((q15_t **) &ptr1);

    /* Read xd (real), yd(imag) input */
    xdyd = read_q15x2_ia ((q15_t **) &ptr1);

    /* R = packed((ya + yc), (xa + xc)) */
    R = __QADD16(xaya, xcyc);

    /* T = packed((yb + yd), (xb + xd)) */
    T = __QADD16(xbyb, xdyd);

    /* pointer updation for writing */
    ptr1 = ptr1 - 8U;


    /* xa' = xa + xb + xc + xd */
    /* ya' = ya + yb + yc + yd */
    write_q15x2_ia (&ptr1, __SHADD16(R, T));

    /* T = packed((yb + yd), (xb + xd)) */
    T = __QADD16(xbyb, xdyd);

    /* xc' = (xa-xb+xc-xd) */
    /* yc' = (ya-yb+yc-yd) */
    write_q15x2_ia (&ptr1, __SHSUB16(R, T));

    /* S = packed((ya - yc), (xa - xc)) */
    S = __QSUB16(xaya, xcyc);

    /* Read yd (real), xd(imag) input */
    /* T = packed( (yb - yd), (xb - xd))  */
    U = __QSUB16(xbyb, xdyd);

#ifndef ARM_MATH_BIG_ENDIAN
    /* xb' = (xa+yb-xc-yd) */
    /* yb' = (ya-xb-yc+xd) */
    write_q15x2_ia (&ptr1, __SHASX(S, U));

    /* xd' = (xa-yb-xc+yd) */
    /* yd' = (ya+xb-yc-xd) */
    write_q15x2_ia (&ptr1, __SHSAX(S, U));
#else
    /* xb' = (xa+yb-xc-yd) */
    /* yb' = (ya-xb-yc+xd) */
    write_q15x2_ia (&ptr1, __SHSAX(S, U));

    /* xd' = (xa-yb-xc+yd) */
    /* yd' = (ya+xb-yc-xd) */
    write_q15x2_ia (&ptr1, __SHASX(S, U));
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

  } while (--j);

  /* end of last stage  process */

  /* output is in 11.5(q5) format for the 1024 point */
  /* output is in 9.7(q7) format for the 256 point   */
  /* output is in 7.9(q9) format for the 64 point  */
  /* output is in 5.11(q11) format for the 16 point  */


#else /* arm_radix4_butterfly_inverse_q15 */

        q15_t R0, R1, S0, S1, T0, T1, U0, U1;
        q15_t Co1, Si1, Co2, Si2, Co3, Si3, out1, out2;
        uint32_t n1, n2, ic, i0, i1, i2, i3, j, k;

  /* Total process is divided into three stages */

  /* process first stage, middle stages, & last stage */

  /*  Initializations for the first stage */
  n2 = fftLen;
  n1 = n2;

  /* n2 = fftLen/4 */
  n2 >>= 2U;

  /* Index for twiddle coefficient */
  ic = 0U;

  /* Index for input read and output write */
  i0 = 0U;

  j = n2;

  /* Input is in 1.15(q15) format */

  /*  Start of first stage process */
  do
  {
    /*  Butterfly implementation */

    /*  index calculation for the input as, */
    /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
    i1 = i0 + n2;
    i2 = i1 + n2;
    i3 = i2 + n2;

    /*  Reading i0, i0+fftLen/2 inputs */
    /* input is down scale by 4 to avoid overflow */
    /* Read ya (real), xa(imag) input */
    T0 = pSrc16[i0 * 2U] >> 2U;
    T1 = pSrc16[(i0 * 2U) + 1U] >> 2U;
    /* input is down scale by 4 to avoid overflow */
    /* Read yc (real), xc(imag) input */
    S0 = pSrc16[i2 * 2U] >> 2U;
    S1 = pSrc16[(i2 * 2U) + 1U] >> 2U;

    /* R0 = (ya + yc), R1 = (xa + xc) */
    R0 = __SSAT(T0 + S0, 16U);
    R1 = __SSAT(T1 + S1, 16U);
    /* S0 = (ya - yc), S1 = (xa - xc) */
    S0 = __SSAT(T0 - S0, 16U);
    S1 = __SSAT(T1 - S1, 16U);

    /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
    /* input is down scale by 4 to avoid overflow */
    /* Read yb (real), xb(imag) input */
    T0 = pSrc16[i1 * 2U] >> 2U;
    T1 = pSrc16[(i1 * 2U) + 1U] >> 2U;
    /* Read yd (real), xd(imag) input */
    /* input is down scale by 4 to avoid overflow */
    U0 = pSrc16[i3 * 2U] >> 2U;
    U1 = pSrc16[(i3 * 2U) + 1U] >> 2U;

    /* T0 = (yb + yd), T1 = (xb + xd) */
    T0 = __SSAT(T0 + U0, 16U);
    T1 = __SSAT(T1 + U1, 16U);

    /*  writing the butterfly processed i0 sample */
    /* xa' = xa + xb + xc + xd */
    /* ya' = ya + yb + yc + yd */
    pSrc16[i0 * 2U] = (R0 >> 1U) + (T0 >> 1U);
    pSrc16[(i0 * 2U) + 1U] = (R1 >> 1U) + (T1 >> 1U);

    /* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc)- (xb + xd) */
    R0 = __SSAT(R0 - T0, 16U);
    R1 = __SSAT(R1 - T1, 16U);
    /* co2 & si2 are read from Coefficient pointer */
    Co2 = pCoef16[2U * ic * 2U];
    Si2 = pCoef16[(2U * ic * 2U) + 1U];
    /* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
    out1 = (q15_t) ((Co2 * R0 - Si2 * R1) >> 16U);
    /* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
    out2 = (q15_t) ((Si2 * R0 + Co2 * R1) >> 16U);

    /*  Reading i0+fftLen/4 */
    /* input is down scale by 4 to avoid overflow */
    /* T0 = yb, T1 = xb */
    T0 = pSrc16[i1 * 2U] >> 2U;
    T1 = pSrc16[(i1 * 2U) + 1U] >> 2U;

    /* writing the butterfly processed i0 + fftLen/4 sample */
    /* writing output(xc', yc') in little endian format */
    pSrc16[i1 * 2U] = out1;
    pSrc16[(i1 * 2U) + 1U] = out2;

    /*  Butterfly calculations */
    /* input is down scale by 4 to avoid overflow */
    /* U0 = yd, U1 = xd) */
    U0 = pSrc16[i3 * 2U] >> 2U;
    U1 = pSrc16[(i3 * 2U) + 1U] >> 2U;

    /* T0 = yb-yd, T1 = xb-xd) */
    T0 = __SSAT(T0 - U0, 16U);
    T1 = __SSAT(T1 - U1, 16U);
    /* R0 = (ya-yc) - (xb- xd) , R1 = (xa-xc) + (yb-yd) */
    R0 = (q15_t) __SSAT((q31_t) (S0 + T1), 16);
    R1 = (q15_t) __SSAT((q31_t) (S1 - T0), 16);
    /* S = (ya-yc) + (xb- xd), S1 = (xa-xc) - (yb-yd) */
    S0 = (q15_t) __SSAT((q31_t) (S0 - T1), 16);
    S1 = (q15_t) __SSAT((q31_t) (S1 + T0), 16);

    /* co1 & si1 are read from Coefficient pointer */
    Co1 = pCoef16[ic * 2U];
    Si1 = pCoef16[(ic * 2U) + 1U];
    /*  Butterfly process for the i0+fftLen/2 sample */
    /* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
    out1 = (q15_t) ((Co1 * S0 - Si1 * S1) >> 16U);
    /* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
    out2 = (q15_t) ((Si1 * S0 + Co1 * S1) >> 16U);
    /* writing output(xb', yb') in little endian format */
    pSrc16[i2 * 2U] = out1;
    pSrc16[(i2 * 2U) + 1U] = out2;

    /* Co3 & si3 are read from Coefficient pointer */
    Co3 = pCoef16[3U * ic * 2U];
    Si3 = pCoef16[(3U * ic * 2U) + 1U];
    /*  Butterfly process for the i0+3fftLen/4 sample */
    /* xd' = (xa+yb-xc-yd)* Co3 - (ya-xb-yc+xd)* (si3) */
    out1 = (q15_t) ((Co3 * R0 - Si3 * R1) >> 16U);
    /* yd' = (ya-xb-yc+xd)* Co3 + (xa+yb-xc-yd)* (si3) */
    out2 = (q15_t) ((Si3 * R0 + Co3 * R1) >> 16U);
    /* writing output(xd', yd') in little endian format */
    pSrc16[i3 * 2U] = out1;
    pSrc16[(i3 * 2U) + 1U] = out2;

    /*  Twiddle coefficients index modifier */
    ic = ic + twidCoefModifier;

    /*  Updating input index */
    i0 = i0 + 1U;

  } while (--j);

  /*  End of first stage process */

  /* data is in 4.11(q11) format */


  /*  Start of Middle stage process */

  /*  Twiddle coefficients index modifier */
  twidCoefModifier <<= 2U;

  /*  Calculation of Middle stage */
  for (k = fftLen / 4U; k > 4U; k >>= 2U)
  {
    /*  Initializations for the middle stage */
    n1 = n2;
    n2 >>= 2U;
    ic = 0U;

    for (j = 0U; j <= (n2 - 1U); j++)
    {
      /*  index calculation for the coefficients */
      Co1 = pCoef16[ic * 2U];
      Si1 = pCoef16[(ic * 2U) + 1U];
      Co2 = pCoef16[2U * ic * 2U];
      Si2 = pCoef16[2U * ic * 2U + 1U];
      Co3 = pCoef16[3U * ic * 2U];
      Si3 = pCoef16[(3U * ic * 2U) + 1U];

      /*  Twiddle coefficients index modifier */
      ic = ic + twidCoefModifier;

      /*  Butterfly implementation */
      for (i0 = j; i0 < fftLen; i0 += n1)
      {
        /*  index calculation for the input as, */
        /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
        i1 = i0 + n2;
        i2 = i1 + n2;
        i3 = i2 + n2;

        /*  Reading i0, i0+fftLen/2 inputs */
        /* Read ya (real), xa(imag) input */
        T0 = pSrc16[i0 * 2U];
        T1 = pSrc16[(i0 * 2U) + 1U];

        /* Read yc (real), xc(imag) input */
        S0 = pSrc16[i2 * 2U];
        S1 = pSrc16[(i2 * 2U) + 1U];


        /* R0 = (ya + yc), R1 = (xa + xc) */
        R0 = __SSAT(T0 + S0, 16U);
        R1 = __SSAT(T1 + S1, 16U);
        /* S0 = (ya - yc), S1 = (xa - xc) */
        S0 = __SSAT(T0 - S0, 16U);
        S1 = __SSAT(T1 - S1, 16U);

        /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
        /* Read yb (real), xb(imag) input */
        T0 = pSrc16[i1 * 2U];
        T1 = pSrc16[(i1 * 2U) + 1U];

        /* Read yd (real), xd(imag) input */
        U0 = pSrc16[i3 * 2U];
        U1 = pSrc16[(i3 * 2U) + 1U];

        /* T0 = (yb + yd), T1 = (xb + xd) */
        T0 = __SSAT(T0 + U0, 16U);
        T1 = __SSAT(T1 + U1, 16U);

        /*  writing the butterfly processed i0 sample */
        /* xa' = xa + xb + xc + xd */
        /* ya' = ya + yb + yc + yd */
        pSrc16[i0 * 2U] = ((R0 >> 1U) + (T0 >> 1U)) >> 1U;
        pSrc16[(i0 * 2U) + 1U] = ((R1 >> 1U) + (T1 >> 1U)) >> 1U;

        /* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */
        R0 = (R0 >> 1U) - (T0 >> 1U);
        R1 = (R1 >> 1U) - (T1 >> 1U);

        /* (ya-yb+yc-yd)* (si2) - (xa-xb+xc-xd)* co2 */
        out1 = (q15_t) ((Co2 * R0 - Si2 * R1) >> 16);
        /* (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
        out2 = (q15_t) ((Si2 * R0 + Co2 * R1) >> 16);

        /*  Reading i0+3fftLen/4 */
        /* Read yb (real), xb(imag) input */
        T0 = pSrc16[i1 * 2U];
        T1 = pSrc16[(i1 * 2U) + 1U];

        /*  writing the butterfly processed i0 + fftLen/4 sample */
        /* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
        /* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
        pSrc16[i1 * 2U] = out1;
        pSrc16[(i1 * 2U) + 1U] = out2;

        /*  Butterfly calculations */
        /* Read yd (real), xd(imag) input */
        U0 = pSrc16[i3 * 2U];
        U1 = pSrc16[(i3 * 2U) + 1U];

        /* T0 = yb-yd, T1 = xb-xd) */
        T0 = __SSAT(T0 - U0, 16U);
        T1 = __SSAT(T1 - U1, 16U);

        /* R0 = (ya-yc) - (xb- xd) , R1 = (xa-xc) + (yb-yd) */
        R0 = (S0 >> 1U) + (T1 >> 1U);
        R1 = (S1 >> 1U) - (T0 >> 1U);

        /* S1 = (ya-yc) + (xb- xd), S1 = (xa-xc) - (yb-yd) */
        S0 = (S0 >> 1U) - (T1 >> 1U);
        S1 = (S1 >> 1U) + (T0 >> 1U);

        /*  Butterfly process for the i0+fftLen/2 sample */
        out1 = (q15_t) ((Co1 * S0 - Si1 * S1) >> 16U);
        out2 = (q15_t) ((Si1 * S0 + Co1 * S1) >> 16U);
        /* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
        /* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
        pSrc16[i2 * 2U] = out1;
        pSrc16[(i2 * 2U) + 1U] = out2;

        /*  Butterfly process for the i0+3fftLen/4 sample */
        out1 = (q15_t) ((Co3 * R0 - Si3 * R1) >> 16U);

        out2 = (q15_t) ((Si3 * R0 + Co3 * R1) >> 16U);
        /* xd' = (xa+yb-xc-yd)* Co3 - (ya-xb-yc+xd)* (si3) */
        /* yd' = (ya-xb-yc+xd)* Co3 + (xa+yb-xc-yd)* (si3) */
        pSrc16[i3 * 2U] = out1;
        pSrc16[(i3 * 2U) + 1U] = out2;


      }
    }
    /*  Twiddle coefficients index modifier */
    twidCoefModifier <<= 2U;
  }
  /*  End of Middle stages process */


  /* data is in 10.6(q6) format for the 1024 point */
  /* data is in 8.8(q8) format for the 256 point   */
  /* data is in 6.10(q10) format for the 64 point  */
  /* data is in 4.12(q12) format for the 16 point  */

  /* start of last stage process */


  /*  Initializations for the last stage */
  n1 = n2;
  n2 >>= 2U;

  /*  Butterfly implementation */
  for (i0 = 0U; i0 <= (fftLen - n1); i0 += n1)
  {
    /*  index calculation for the input as, */
    /*  pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
    i1 = i0 + n2;
    i2 = i1 + n2;
    i3 = i2 + n2;

    /*  Reading i0, i0+fftLen/2 inputs */
    /* Read ya (real), xa(imag) input */
    T0 = pSrc16[i0 * 2U];
    T1 = pSrc16[(i0 * 2U) + 1U];
    /* Read yc (real), xc(imag) input */
    S0 = pSrc16[i2 * 2U];
    S1 = pSrc16[(i2 * 2U) + 1U];

    /* R0 = (ya + yc), R1 = (xa + xc) */
    R0 = __SSAT(T0 + S0, 16U);
    R1 = __SSAT(T1 + S1, 16U);
    /* S0 = (ya - yc), S1 = (xa - xc) */
    S0 = __SSAT(T0 - S0, 16U);
    S1 = __SSAT(T1 - S1, 16U);

    /*  Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
    /* Read yb (real), xb(imag) input */
    T0 = pSrc16[i1 * 2U];
    T1 = pSrc16[(i1 * 2U) + 1U];
    /* Read yd (real), xd(imag) input */
    U0 = pSrc16[i3 * 2U];
    U1 = pSrc16[(i3 * 2U) + 1U];

    /* T0 = (yb + yd), T1 = (xb + xd) */
    T0 = __SSAT(T0 + U0, 16U);
    T1 = __SSAT(T1 + U1, 16U);

    /*  writing the butterfly processed i0 sample */
    /* xa' = xa + xb + xc + xd */
    /* ya' = ya + yb + yc + yd */
    pSrc16[i0 * 2U] = (R0 >> 1U) + (T0 >> 1U);
    pSrc16[(i0 * 2U) + 1U] = (R1 >> 1U) + (T1 >> 1U);

    /* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */
    R0 = (R0 >> 1U) - (T0 >> 1U);
    R1 = (R1 >> 1U) - (T1 >> 1U);

    /* Read yb (real), xb(imag) input */
    T0 = pSrc16[i1 * 2U];
    T1 = pSrc16[(i1 * 2U) + 1U];

    /*  writing the butterfly processed i0 + fftLen/4 sample */
    /* xc' = (xa-xb+xc-xd) */
    /* yc' = (ya-yb+yc-yd) */
    pSrc16[i1 * 2U] = R0;
    pSrc16[(i1 * 2U) + 1U] = R1;

    /* Read yd (real), xd(imag) input */
    U0 = pSrc16[i3 * 2U];
    U1 = pSrc16[(i3 * 2U) + 1U];
    /* T0 = (yb - yd), T1 = (xb - xd) */
    T0 = __SSAT(T0 - U0, 16U);
    T1 = __SSAT(T1 - U1, 16U);

    /*  writing the butterfly processed i0 + fftLen/2 sample */
    /* xb' = (xa-yb-xc+yd) */
    /* yb' = (ya+xb-yc-xd) */
    pSrc16[i2 * 2U] = (S0 >> 1U) - (T1 >> 1U);
    pSrc16[(i2 * 2U) + 1U] = (S1 >> 1U) + (T0 >> 1U);


    /*  writing the butterfly processed i0 + 3fftLen/4 sample */
    /* xd' = (xa+yb-xc-yd) */
    /* yd' = (ya-xb-yc+xd) */
    pSrc16[i3 * 2U] = (S0 >> 1U) + (T1 >> 1U);
    pSrc16[(i3 * 2U) + 1U] = (S1 >> 1U) - (T0 >> 1U);
  }
  /* end of last stage  process */

  /* output is in 11.5(q5) format for the 1024 point */
  /* output is in 9.7(q7) format for the 256 point   */
  /* output is in 7.9(q9) format for the 64 point  */
  /* output is in 5.11(q11) format for the 16 point  */

#endif /* #if defined (ARM_MATH_DSP) */

}

extern void arm_radix4_butterfly_q15(
        q15_t * pSrc,
        uint32_t fftLen,
  const q15_t * pCoef,
        uint32_t twidCoefModifier);

extern void arm_radix4_butterfly_inverse_q15(
        q15_t * pSrc,
        uint32_t fftLen,
  const q15_t * pCoef,
        uint32_t twidCoefModifier);

extern void arm_bitreversal_16(
        uint16_t * pSrc,
  const uint16_t bitRevLen,
  const uint16_t * pBitRevTable);

void arm_cfft_radix4by2_q15(
        q15_t * pSrc,
        uint32_t fftLen,
  const q15_t * pCoef);

void arm_cfft_radix4by2_inverse_q15(
        q15_t * pSrc,
        uint32_t fftLen,
  const q15_t * pCoef);


void arm_cfft_q15(
								const arm_cfft_instance_q15 * S,
											q15_t * p1,
											uint8_t ifftFlag,
											uint8_t bitReverseFlag)
	
{
	  uint32_t L = S->fftLen;

  if (ifftFlag == 1U)
  {
     switch (L)
     {
     case 16:
     case 64:
     case 256:
     case 1024:
     case 4096:
       arm_radix4_butterfly_inverse_q15 ( p1, L, (q15_t*)S->pTwiddle, 1 );
       break;

     case 32:
     case 128:
     case 512:
     case 2048:
       arm_cfft_radix4by2_inverse_q15 ( p1, L, S->pTwiddle );
       break;
     }
  }
  else
  {
     switch (L)
     {
     case 16:
     case 64:
     case 256:
     case 1024:
     case 4096:
       arm_radix4_butterfly_q15  ( p1, L, (q15_t*)S->pTwiddle, 1 );
       break;

     case 32:
     case 128:
     case 512:
     case 2048:
       arm_cfft_radix4by2_q15  ( p1, L, S->pTwiddle );
       break;
     }
  }

		if ( bitReverseFlag )
    arm_bitreversal_16 ((uint16_t*) p1, S->bitRevLength, S->pBitRevTable);
}

void arm_cfft_radix4by2_q15(
        q15_t * pSrc,
        uint32_t fftLen,
  const q15_t * pCoef)
{
	  uint32_t i;
        uint32_t n2;
        q15_t p0, p1, p2, p3;
	
	  uint32_t l;
        q15_t xt, yt, cosVal, sinVal;
	  n2 = fftLen >> 1U;
	
	 for (i = 0; i < n2; i++)	
  {
     cosVal = pCoef[2 * i];
     sinVal = pCoef[2 * i + 1];

     l = i + n2;

     xt =           (pSrc[2 * i] >> 1U) - (pSrc[2 * l] >> 1U);
     pSrc[2 * i] = ((pSrc[2 * i] >> 1U) + (pSrc[2 * l] >> 1U)) >> 1U;

     yt =               (pSrc[2 * i + 1] >> 1U) - (pSrc[2 * l + 1] >> 1U);
     pSrc[2 * i + 1] = ((pSrc[2 * l + 1] >> 1U) + (pSrc[2 * i + 1] >> 1U)) >> 1U;

     pSrc[2 * l]     = (((int16_t) (((q31_t) xt * cosVal) >> 16U)) +
                        ((int16_t) (((q31_t) yt * sinVal) >> 16U))  );

     pSrc[2 * l + 1] = (((int16_t) (((q31_t) yt * cosVal) >> 16U)) -
                        ((int16_t) (((q31_t) xt * sinVal) >> 16U))   );
  }
	arm_radix4_butterfly_q15( pSrc,          n2, (q15_t*)pCoef, 2U);

  /* second col */
  arm_radix4_butterfly_q15( pSrc + fftLen, n2, (q15_t*)pCoef, 2U);

  n2 = fftLen >> 1U;
  for (i = 0; i < n2; i++)
  {
     p0 = pSrc[4 * i + 0];
     p1 = pSrc[4 * i + 1];
     p2 = pSrc[4 * i + 2];
     p3 = pSrc[4 * i + 3];

     p0 <<= 1U;
     p1 <<= 1U;
     p2 <<= 1U;
     p3 <<= 1U;

     pSrc[4 * i + 0] = p0;
     pSrc[4 * i + 1] = p1;
     pSrc[4 * i + 2] = p2;
     pSrc[4 * i + 3] = p3;
  }
}

void arm_cfft_radix4by2_inverse_q15(
        q15_t * pSrc,
        uint32_t fftLen,
  const q15_t * pCoef)
{
        uint32_t i;
        uint32_t n2;
        q15_t p0, p1, p2, p3;

        uint32_t l;
        q15_t xt, yt, cosVal, sinVal;


  n2 = fftLen >> 1U;



  for (i = 0; i < n2; i++)
  {
     cosVal = pCoef[2 * i];
     sinVal = pCoef[2 * i + 1];

     l = i + n2;

     xt =           (pSrc[2 * i] >> 1U) - (pSrc[2 * l] >> 1U);
     pSrc[2 * i] = ((pSrc[2 * i] >> 1U) + (pSrc[2 * l] >> 1U)) >> 1U;

     yt =               (pSrc[2 * i + 1] >> 1U) - (pSrc[2 * l + 1] >> 1U);
     pSrc[2 * i + 1] = ((pSrc[2 * l + 1] >> 1U) + (pSrc[2 * i + 1] >> 1U)) >> 1U;

     pSrc[2 * l]      = (((int16_t) (((q31_t) xt * cosVal) >> 16U)) -
                         ((int16_t) (((q31_t) yt * sinVal) >> 16U))  );

     pSrc[2 * l + 1] = (((int16_t) (((q31_t) yt * cosVal) >> 16U)) +
                        ((int16_t) (((q31_t) xt * sinVal) >> 16U))  );
  }


  /* first col */
  arm_radix4_butterfly_inverse_q15( pSrc,          n2, (q15_t*)pCoef, 2U);

  /* second col */
  arm_radix4_butterfly_inverse_q15( pSrc + fftLen, n2, (q15_t*)pCoef, 2U);

  n2 = fftLen >> 1U;
  for (i = 0; i < n2; i++)
  {
     p0 = pSrc[4 * i + 0];
     p1 = pSrc[4 * i + 1];
     p2 = pSrc[4 * i + 2];
     p3 = pSrc[4 * i + 3];

     p0 <<= 1U;
     p1 <<= 1U;
     p2 <<= 1U;
     p3 <<= 1U;

     pSrc[4 * i + 0] = p0;
     pSrc[4 * i + 1] = p1;
     pSrc[4 * i + 2] = p2;
     pSrc[4 * i + 3] = p3;
  }
}

代码使用查表法对FFT的蝶形运算进行优化，对于32点的FFT算法运行时间390us，64点的FFT算法930us。相对使用C语言实现的FFT没有优化前的速度提高了100倍，算法的强大。空间占有为6kb~7kb之间。满足项目需求。

误差分析

由于使用是Q15类型的FFT算法而不是浮点型算法，空间减少许多和速度大大加快，但是算法的误差必然是一个问题。这是一个难以避免的问题。使用matlab代码对算法的误差进行分析。

matlab的代码如下：

clear;
clc;
% Test data initialization
N = 1:64;
sample_frequecy = 3000;
test_data = 800*sin(2*3.14*100*N/sample_frequecy);
test_data = round(test_data);
plot(test_data);
% fft result
fft_data = fft(test_data);
[max_value,max_index] = max(fft_data(1:33));
% Drawing FFT spectrum
plot(N,abs(fft_data)/64);
data_real = real(fft_data);
data_imag = imag(fft_data);

和单片机运行的结果进行对比，当峰值点时，误差在5%以下时可以保证的，并且当峰值越大，误差越小。而值越小，误差越大。

所以如果计算最大值的位置这种应用，Q15类型FFT完全可以胜任。比如这次的项目，单目标的测距和测速。需要就是最大点的位置信息而不是幅度信息。

参考

【STM32H7的DSP教程】第6章 ARM DSP源码和库移植方法（MDK5的AC5和AC6）（https://blog.csdn.net/Simon223/article/details/105311500）

定点fft理论(https://www.docin.com/p-662084418.html)