{___________________________________________________________________________
IRFFT2.ASM      ADSP-21020 Radix-2 Real IFFT

Does a inverse real radix-2 FFT of length 32 or greater on input data X(n).
X(n) is converted into U(n)+jV(n), where U(n) and V(n) are the real and
imaginary parts of the FFT transform of x(2n)+jx(2n+1).  An inverse FFT is then
performed on U(n)+jV(n) to yield x(n) .  The inverse FFT is accomplished by
switching the real and imaginary parts, taking the forward FFT, then switching
the real and imaginary again.  Note that for a N point real IFFT, only N/2+1 FFT
points are needed because of the Hermitian symmetry of the real FFT.  Input data
is destroyed during the course of this routine.

        N/2+1   real of Frequency domain data in DM,
        N/2+1   imag of Frequency domain data in PM,
        N/2     real of even indexed Time domain data stored in DM, 
        N/2     real of odd indexed Time domain data stored in PM, 
        Two N/4 long Cosine tables stored in PM, 
        Two N/4 long Sine tables in DM.


Based on FFTRAD2 by  Kapriel Karagozian, Analog Devices DSP Div.(617) 461-3672
Author:   16-APR-90, Ronnin Yee, Analog Devices DSP Div.(617) 461-3672


Calling Information:
    pm(cosine[N/4])     - cos(2pi*n/HN) table from IRF2TBL program
    pm(n_cosine[N/4])   - cos(pi*n/HN)/(2*HN) table from IRF2TBL program
    dm(sine[N/4])       - sin(2pi*n/HN) table from IRF2TBL program
    dm(n_sine[N/4])     - cos(pi*n/HN)/(2*HN) table from IRF2TBL program
    dm(refft[N/2+1])    - real input array, bitreved to working array
    pm(imfft[N/2+1])    - imaginary input array, bitreved to working array

    The "cosine" and "sine" tables are identical to the cosine and sine tables
    for RFFT2.ASM and therefore can be shared among the two.  "n_cosine" and
    "n_sine" are the same as "h_cosine" and "h_sine" in RFFT2.ASM multiplied by a
    factor of (2/N).  If you wish to share these tables also and dont mind the
    scaling, use the "h_cosine" and "h_sine" tables and change "f2=(1/2*HN);" to
    "f2=0.5;".  This will cause the output of this routine to be (ifft)*(N/2).

    (Note: Because the bit reversed address mode is used with the arrays
    inrefft and evreal, they must start at addresses that are integer
    multiples of the half length (HN) of the transform, (i.e. 0,HN,2HN,3HN...).
    This is accomplished by specifing two segments starting at those addresses
    in the architecture file and placing the variables alone in their
    respective segments. These addresses must also be reflected in the
    preprocessor variables ORE and IRE in bit reversed format.)


Results:
    dm(evreal[N/2])   - even samples of Time domain results
    pm(odreal[N/2])   - odd samples of Time domain results


Benchmarks: Radix-2, inverse real

                                time (msec)
    FFT Length    cycles     20MHz     25MHZ    (21020 Clock)
    ----------    ------   --------  --------
         64         661       .033      .024
        128        1267       .063      .051
        256        2577       .128      .103
        512        5423       .271      .217
       1024       11597       .580      .464
       2048       24939      1.25       .998
       4096       53641      2.68      2.15
       8192      115125      5.76      4.61

   Conversion Stage  - 7 cycles per 2 elements
   First 2 FFT Stages- 8 cycles per 4 butterflies
   Middle Stages     - 4 cycles per butterfly
   2nd to Last Stage - 9 cycles per 2 butterflies
   Last Stage        - 5 cycles per butterfly group

Memory Usage:
    pm code = 200 words, pm data = 1.5*N+1 words, dm data = 1.5*N+1 words

    (note that pm data usage could be cut by 0.5*N words by re-using the
        "imfft" buffer in place of "odreal".)
____________________________________________________________________________}

#include "demo.h"

{________These constants are independent of the number of points____________}
#define BFLY8           4       {Offset between bf branches in a group of 8}

.SEGMENT/DM             dm_sram;

.EXTERN sine;                     {these variables are defined in rfft2}
.EXTERN h_sine;
.EXTERN real;
.EXTERN refft;

.ENDSEG;

.SEGMENT/PM             pm_sram;
.EXTERN cosine;                     {these variables are defined in rfft2}
.EXTERN h_cosine;
.EXTERN imfft;

.VAR    odreal[HN];                     { odd indexed real output}
.GLOBAL odreal;
.ENDSEG;

.GLOBAL irfft2;

.SEGMENT/PM             pm_sram;
{______________________________begin FFT__________________________________}
irfft2:
{----------------- Conversion Stage ---------------------}

{compute the U(n)+jV(n) for the N/2 transform}

b0=refft;           { real src pntr forward}
l0=HN;
b1=h_sine+1;        { sine pntr}
l1=HN/2;
i2=refft+HN;        { real src pntr backward}
l2=0;
b3=i0;              { real dest pntr forward}
l3=HN;
b4=i0;              { real dest pntr backward (will wrap around backward)}
l4=HN;
b5=real;            { imag dest pntr forward}
l6=HN;
b6=i5;              { imag dest pntr backward (will wrap around backward)}
l6=HN;

b8=imfft;           { imag src pntr forward}
l8=HN;
b9=h_cosine+1;      { cosine pntr}
l9=HN/2;
i10=imfft+HN;       { imag src pntr backward}
l10=0;

m0 = 1;
m1 = -1;
m8 = 1;
m9 = -1;

{-------The following line has been changed to allow table sharing--------}
f2 = 0.5;
         
f14= 0.0;

{do first pair of points}
                                                f8 =dm(i0,m0),  f9 =pm(i8,m8);
                                                f12=dm(i2,m1),  f13=pm(i10,m9);
                    f4=f8+f12,  f5=f8-f12,      f8 =dm(i0,m0),  f9 =pm(i8,m8);
        f11=f2*f4,                              f12=dm(i2,m1),  f13=pm(i10,m9);
        f10=f2*f5;

{do all the rest in pairs [U(n), U(HN-n)] except the mid-point}

LCNTR = (HN/2)-1, DO trans UNTIL LCE;
                    f4=f8+f12,  f5=f8-f12,      f0 =dm(i1,m0),  f1 =pm(i9,m8);
        f8=f0*f5,   f6=f9+f13,  f7=f9-f13,      dm(i3,m0)=f3;   {store real}
        f12=f1*f6,  f10=f10+f14,f15=f10-f14,    dm(i4,m1)=f11;  {store real}
        f11=f2*f4,  f14=f8+f12,                 dm(i5,m0)=f10;  {store imag.}
        f10=f1*f5,  f11=f11+f14,f3=f11-f14,     dm(i6,m1)=f15;  {store imag.}
        f13=f0*f6,                              f8 =dm(i0,m0),  f9 =pm(i8,m8);
trans:  f14=f2*f7,  f10=f10-f13,                f12=dm(i2,m1),  f13=pm(i10,m9);
        
{clean up and do the mid-point}
                    f10=f10+f14,f15=f10-f14,    dm(i3,m0)=f3;   {store real}
                    f2=f2+f2,                   dm(i4,m1)=f11;  {store real}
        f0=f2*f8,                               dm(i5,m0)=f10;  {store imag.}
        f1=f2*f9,                               dm(i6,m1)=f15;  {store imag.}
                    f10=-f1,                    dm(i3,m0)=f0;   {store real}
                                                dm(i5,m0)=f10;  {store imag.}
                                                
{--------------First two stages----------------}

{Do bitrev and real and imaginary swap within first two stages}

        bit set MODE1 BR0;  { enable bit reverse of i0 }

        b0=VORE;            { Points to imag data to be transfer to PM in}
                        { bit reversed order }
        l0=0;
        m0=BRMODH;

        b8=odreal;
        l8=HN;
        m8=1;

    {Move imag data to PM in bitreversed order}
        f0=dm(i0,m0);
        LCNTR=HN-1, DO (PC,1) UNTIL LCE;
            f0=dm(i0,m0), pm(i8,m8)=f0;
        pm(i8,m8)=f0;
        
    {Now bitrev real within first two stages}
        b0=VIRE;            { Points to input real array to be read in }
                        { bit reversed order }
        b2=real;
        l2=HN;
        m1=1;           { This loop increments forward +1}

        i8=odreal;
        b10=odreal;
        l10=HN;

    {Do the first two stages (actually a radix-4 FFT stage)}

                                                f1=dm(i0,m0),   f0=pm(i8,m8);
                                                f3=dm(i0,m0),   f2=pm(i8,m8);
                f0=f0+f2,       f2=f0-f2,       f5=dm(i0,m0),   f4=pm(i8,m8);
                f1=f1+f3,       f3=f1-f3,       f7=dm(i0,m0),   f6=pm(i8,m8);
                f4=f6+f4,       f6=f6-f4;
                f5=f5+f7,       f7=f5-f7;
                f8=f0+f4,       f9=f0-f4;
                f10=f1+f5,      f11=f1-f5;

LCNTR=HN/4,     do FSTAGE until LCE;   {do N/4 simple radix-4 butterflies }
                f12=f2+f7,      f13=f2-f7,      f1=dm(i0,m0),   f0=pm(i8,m8);
                f14=f3+f6,      f15=f3-f6,      f3=dm(i0,m0),   f2=pm(i8,m8);
                f0=f0+f2,       f2=f0-f2,       f5=dm(i0,m0),   f4=pm(i8,m8);
                f1=f1+f3,       f3=f1-f3,       f7=dm(i0,m0),   f6=pm(i8,m8);
                f4=f6+f4,       f6=f6-f4,       dm(i2,m1)=f8,   pm(i10,m8)=f10;
                f5=f5+f7,       f7=f5-f7,       dm(i2,m1)=f12,  pm(i10,m8)=f14;
                f8=f0+f4,       f9=f0-f4,       dm(i2,m1)=f9,   pm(i10,m8)=f11;
FSTAGE:         f10=f1+f5,      f11=f1-f5,      dm(i2,m1)=f13,  pm(i10,m8)=f15;

    {middle stages loop }

        bit clr MODE1 BR0;              {finished with bitreversal}

        b0=real;
        l0=HN;
        b1=sine;
        l1=HN/2;

        b9=cosine;      
        l9=HN/2;
        b10=odreal;
        l10=HN;
        b11=odreal;
        l11=HN;

        m0=-BFLY8;
        m1=-HN/8;
        m2=-BFLY8-1;
        m9=-HN/8;
        m11=-1;

        r2=2;
        r3=-BFLY8;              {initializes m0,10 - incr for butterf branches  }
        r5=BFLY8;               {counts # butterflies per a group               }
        r9=(-2*BFLY8)-1;        {initializes m12 - wrap around to next grp + 1  }
        r10=-2*BFLY8;           {initializes m8 - incr between groups   }
        r13=-BFLY8-1;           {initializes m2,13 - wrap to bgn of 1st group   }
        r15=HN/8;               {# OF GROUPS IN THIRD STAGE}

        f1=dm(i1,m1),   f7=pm(i9,m9);   {set pointers to tables to 1st coeff.   }

LCNTR=STAGES-5, do end_stage until LCE; {# OF STAGES TO BE HANDLED = LOG2(HN)-4}
        m8=r10;
        m10=r3;
        m12=r9;
        i0=real+HN-1;
        i2=real+HN-1;
        i8=odreal+HN-1;
        i10=odreal+HN-1;
        i11=odreal+HN-1;
        r15=r15-r2,     m13=r13;        {CALCULATE # OF CORE }
                                        {BFLIES/GROUP IN THIS STAGE}

                                        f0=dm(i1,m1),   f7=pm(i8,m8);
f12=f0*f7,                              f6=dm(i0,m0),   f1=pm(i9,m9);
f8=f1*f6,                                               modify(i11,m10);
f11=f1*f7,                                              f7=pm(i8,m8);
f14=f0*f6,  f12=f8+f12,                 f8=dm(i0,m0);
f12=f0*f7,  f13=f8+f12, f10=f8-f12,     f6=dm(i0,m0);   

{Each iteration does another set of bttrflys in each group}

LCNTR=r5,   do end_group until LCE;     {# OF BUTTERFLIES/GROUP IN THIS STAGE}

{core butterfly loop}

LCNTR=r15,  do end_bfly until LCE;      {Do a butterfly in each group - 2}
    f8=f1*f6,   f14=f11-f14,                    dm(i2,m0)=f10,  f9=pm(i11,m8);
    f11=f1*f7,  f3=f9+f14,      f9=f9-f14,      dm(i2,m0)=f13,  f7=pm(i8,m8);
    f14=f0*f6,  f12=f8+f12,                     f8=dm(i0,m0),   pm(i10,m10)=f9;
end_bfly:
    f12=f0*f7,  f13=f8+f12,     f10=f8-f12,     f6=dm(i0,m0),   pm(i10,m10)=f3;

{finish up last bttrfly and set up for next stage}

f8=f1*f6,               f14=f11-f14,    dm(i2,m0)=f10,  f9=pm(i11,m8);
f11=f1*f7,  f4=f9+f14,  f9=f9-f14,      dm(i2,m0)=f13,  f14=pm(i8,m11);
f14=f0*f6,  f12=f8+f12,                 f8=dm(i0,m2),   pm(i10,m10)=f9;
            f13=f8+f12, f10=f8-f12,     f0=dm(i1,m1),   f7=pm(i8,m8);  {dm:sin} 
                        f14=f11-f14,    dm(i2,m0)=f10,  f9=pm(i11,m12);
  {start on next butterfly in each group}
f12=f0*f7,  f3=f9+f14,  f9=f9-f14,      f6=dm(i0,m0),   f1=pm(i9,m9);  {pm:cos}
f8=f1*f6,                               dm(i2,m2)=f13,  pm(i10,m10)=f4;
f11=f1*f7,                                              pm(i10,m10)=f9;
f14=f0*f6,  f12=f8+f12,                 f8=dm(i0,m0),   f7=pm(i8,m8);
end_group:
f12=f0*f7,  f13=f8+f12, f10=f8-f12,     f6=dm(i0,m0),   pm(i10,m13)=f3;

                                        
            r4=r15+r2,          i1=b1;          {PREPARE R4 FOR #OF BFLIES CALC}
            r15=ashift r4 by -1;                {# OF BFLIES/GRP IN NEXT STAGE}
            r4=-r15,            i9=b9;
            m1=r4;                                  {update inc for sin & cos }
            m9=r4;
            r5=ashift r5 by 1,  f1=dm(i1,m1);       {update # bttrfly in a grp}
            r3=-r5;                                 {   inc for bttrfly branch}
            r13=r3-1,           m0=r3;              {   wrap to 1st grp       }
            r10=ashift r3 by 1, f7=pm(i9,m9);       {   inc between grps      }
end_stage:  r9=r10-1,           m2=r13;             {   wrap to grp +1        }

{_________ next to last stage__________}
        m1=-2;                  {modifier to sine table pntr    }
        m8=r10;                 {incr between groups            }
        m9=-2;                  {modifier to cosine table pntr  }
        m10=r3;                 {incr between bttrfly branches  }
        m12=r9;                 {wrap around to next grp + 1    }
        m13=r13;                {wrap to bgn of 1st group       }

        i0=real+HN-1;
        i1=sine+(HN/2)-2;       {pntr to 1st sine coeff         }
        i2=real+HN-1;
        i8=odreal+HN-1;
        i9=cosine+(HN/2)-2;     {pntr to 1st cosine coeff       }       
        i10=odreal+HN-1;
        i11=odreal+HN-1;
                                        f0=dm(i1,m1),   f7=pm(i8,m8);
f12=f0*f7,                              f6=dm(i0,m0),   f1=pm(i9,m9);
f8=f1*f6,                                               modify(i11,m10);
f11=f1*f7,                                              f7=pm(i8,m12);
f14=f0*f6, f12=f8+f12,                  f8=dm(i0,m0);
f12=f0*f7, f13=f8+f12,  f10=f8-f12,     f6=dm(i0,m0);                           

{Do the HN/4 butterflies in the two groups of this stage}

LCNTR=HN/4,  do end_group2 until LCE;   
    f8=f1*f6,                   f14=f11-f14,    dm(i2,m0)=f10,  f9=pm(i11,m8);
    f11=f1*f7,  f3=f9+f14,      f9=f9-f14,      dm(i2,m0)=f13,  f1=pm(i9,m9);
    f14=f0*f6,  f12=f8+f12,                     f8=dm(i0,m2),   pm(i10,m10)=f9;
                f13=f8+f12,     f10=f8-f12,     f0=dm(i1,m1),   f7=pm(i8,m8);
    f12=f0*f7,                  f14=f11-f14,    f6=dm(i0,m0),   f9=pm(i11,m12);

    f8=f1*f6,   f3=f9+f14,      f9=f9-f14,      dm(i2,m0)=f10,  pm(i10,m10)=f3;
    f11=f1*f7,                                  dm(i2,m2)=f13,  pm(i10,m10)=f9; 
    f14=f0*f6, f12=f8+f12,                      f8=dm(i0,m0),   f7=pm(i8,m12);
end_group2:
    f12=f0*f7, f13=f8+f12,      f10=f8-f12,     f6=dm(i0,m0),   pm(i10,m13)=f3;


{   The last stage      }

        m0=-HN/2;               
        m2=-HN/2-1;
        m10=m0;
        m13=m2;
        i0=real+HN-1;
        i1=sine+(HN/2)-1;       {pntr to 1st sine coeff         }
        i2=real+HN-1;
        i8=odreal+HN-1;
        i9=cosine+(HN/2)-1;     {pntr to 1st cosine coeff       }       
        i10=odreal+HN-1;
        i11=odreal+HN-1;
        m1=-1;                  {modifiers to coeff tables      }
        m9=-1;

{start first bttrfly}
                                        f0=dm(i1,m1),   f7=pm(i8,m11);
f12=f0*f7,                              f6=dm(i0,m0),   f1=pm(i9,m9);
f8=f1*f6,                                               modify(i11,m10);
f11=f1*f7;                                      
f14=f0*f6, f12=f8+f12,                  f8=dm(i0,m2),   f9=pm(i11,m11);

{do HN/2 bttrflys in the last stage and swap the real and imaginary parts}

LCNTR=HN/2,  do last_stage until LCE;   
                f13=f8+f12,     f10=f8-f12,     f0=dm(i1,m1),   f7=pm(i8,m11);
    f12=f0*f7,  f14=f11-f14,                    f6=dm(i0,m0),   f1=pm(i9,m9);
    f9=f1*f6,   f3=f9+f14,      f15=f9-f14,     f8=dm(i0,m2),   pm(i10,m10)=f10;
    f11=f1*f7,                                  dm(i2,m0)=f15,  pm(i10,m13)=f13;
last_stage:                                             
    f14=f0*f6, f12=f9+f12,                      dm(i2,m2)=f3,   f9=pm(i11,m11);


    rts;        {finished}

{_______________________________________________________________________}
.ENDSEG;