{______________________________________________________________________________
IRFFT4.ASM          ADSP-21020 Inverse Real Radix 4 Fast Fourier Transform 

This routine performs a complex, radix 4 Inverse Real Fast Fourier Transform
(IRFFT).  The IRFFT length (N) must be a (power of 4)*2 and a minimum of 128
points.  The real part of the input fft data is placed in DM and the imaginary
part in PM.  This data is destroyed during the course of the computation.  The
real and complex output of the IRFFT is placed in separate locations in DM.

The IRFFT is performed by passing the data through a "conversion stage" and then
performing a N/2 point inverse FFT.  X(n) is converted into U(n)+jV(n), where
U(n) and V(n) are the real and imaginary parts of the FFT transform of
x(2n)+jx(2n+1).  An inverse FFT is then performed on U(n)+jV(n) to yield x(n).
The inverse FFT is accomplished by switching the real and imaginary parts,
taking the forward FFT, then switching the real and imaginary again.  Note that
for a N point IRFFT, only N/2+1 FFT points are needed because of the Hermitian
symmetry of the real FFT.

Since this routine takes care of all necessary address digit-reversals, the
input and output data are in normal order.  The digit reversal is accomplished
by using a modified radix 4 butterfly throughout which swaps the inner two nodes
resulting with bit-reversed data.  Bit-reversal occurs in writing data to DM
in the last stage and when inserting the odd indexes of x(n) into "real" in DM.

For this routine to work correctly, the program "irf4tbl.c" must be used to 
generate the special twiddle factor tables for this program. 

Based on:
     FFTRAD4 by Karl Schwarz & Raimund Meyer, Universitaet Erlangen Nuernberg   
Author:   25-APR-91, Ronnin Yee, Analog Devices DSP Div.(617) 461-3672


Calling Information:
    costwid table at DM : cosine    length 3*N/4                        
    sintwid table at PM : sine      length 3*N/4                        
    coscnv  table at DM : n_cosine  length N/2                  
    sincnv  table at PM : n_sine    length N/2  

    real  input  at DM : inrefft    length N/2+1, normal order          
    imag  input  at PM : inimfft    length N/2+1, normal order

    The "cosine" and "sine" tables are identical to the cosine and sine tables
    for RFFT4.ASM and therefore can be shared among the two.  "n_cosine" and
    "n_sine" are the same as "h_cosine" and "h_sine" in RFFT4.ASM multiplied by
    a factor of (2/N).  If you wish to share these tables also and do not mind
    the scaling, use the "h_cosine" and "h_sine" tables and change
    "f2=(1/2*HN);" to "f2=0.5;" at the beginning of the conversion stage.  This
    will cause the output of this routine to be (ifft)*(N/2).

Results:
    real  output at DM : real       length N, normal order              


(Note:  Because the bit reversed addressing mode is used with the array real, it
must start at addresses that are integer multiples of the length (N) of the
transform, (i.e.  0,N,2N,3N,...).  This is accomplished by specifying two
segments starting at those addresses in the architecture file and placing the
variables alone in their respective segments.  These addresses must also be
reflected in the preprocessor variables ORE and OREP1 in bit reversed format.)

Altered Registers:
    All I, M, L and R registers.
    Three levels of looping.

Benchmarks: radix-4, inverse real with digit reversal

                                 time (msec)
    FFT Length    cycles       20MHz     25MHz
    ----------   -------     --------  -------- 
       128         1129         .056      .045
       512         4858         .243      .194
      2048        22363        1.12       .895  
      8192       103036        5.15      4.12


    Cnvsn. Stage -   6 cycles per a conversion pair
    First Stage  -   8 cycles per radix-4 butterfly
    Other Stages -  14 cycles per radix-4 butterfly (this is optimal!)

Memory Usage:
    pm code = 236 words, pm data = 1.75*N+1 words, dm data = 2.75*N+1 words

Assembler Preprocessor Variables:
    N      = number of points in the FFT 
    STAGES = log4(N/2) 
    OST    = bitrev(32 bit N/2)
    ORE    = bitrev(32 bit addr of output real), addr is 0,N,2N,3N,...
    OREP1  = bitrev((32 bit addr of output real)+1), addr is 0,N,2N,3N,...
_______________________________________________________________________________}
#define N       128
#define HN      N/2
#define STAGES  3
#define OST     0x02000000
#define ORE     0x00000000
#define OREP1   0x80000000

{ include for symbolic definition of system regster bits }
#include "def21020.h"


.SEGMENT/DM             dm_data;
.VAR    cosine[3*HN/4]= "tc4.dat";    {Cosine twiddle factors, from IRF4TBL}
.VAR    n_sine[HN/2]=   "tns.dat";  { Sine coeff., from IRF4TBL}
.VAR    inrefft[HN+1]=  "inrefft.dat"; { Input real data }
.GLOBAL inrefft;
.ENDSEG;

.SEGMENT/DM             dm_rdat;    { this segment is an int multiple of N }
.VAR    real[N];                    { Output real data }
.GLOBAL real;
.ENDSEG;

.SEGMENT/PM             pm_data;
.VAR    sine[3*HN/4]=   "ts4.dat";   {Sine twiddle factors, from IRF4TBL }
.VAR    n_cosine[HN/2]= "tnc.dat";  {Cosine coeff., from IRF4TBL}
.VAR    inimfft[HN+1]=  "inimfft.dat"; { Input imaginary data }
.GLOBAL inimfft;
.ENDSEG;

.SEGMENT/PM             rst_svc;    { program starts at the reset vector }
        pmwait=0x0021;              {pgsz=0,pmwtstates=0,intrn.wtstates only}
        dmwait=0x008421;            {pgsz=0,dmwtstates=0,intrn.wtstates only}
        call    irfft4;
stop:   idle;   
        nop;
        nop;
.ENDSEG;

.SEGMENT/PM             pm_code;
irfft4:

{----------------- Conversion Stage ---------------------}

{compute the U(n)+jV(n) for the N/2 transform}

b0=inrefft;         { real src pntr forward}
l0=HN;
b1=n_sine+1;        { sine pntr}
l1=@n_sine;
i2=inrefft+HN;      { real src pntr backward}
l2=0;
b3=i0;              { real dest pntr forward}
l3=HN;
b4=i0;              { real dest pntr backward (will wrap around backward)}
l4=HN;

b8=inimfft;         { imag src pntr forward}
l8=HN;
b9=n_cosine+1;      { cosine pntr}
l9=@n_cosine;
i10=inimfft+HN;     { imag src pntr backward}
l10=0;
b11=inimfft;        { imag dest pntr forward}
l11=HN;
b12=i11;            { imag dest pntr backward (will wrap around backward)}
l12=HN;

m0 = 1;
m1 = -1;
m8 = 1;
m9 = -1;
f2 = 1.0/(2.0*HN);   { Divide by 2*HN }
f14= 0.0;

{do first pair of points}
                                                f8 =dm(i0,m0),  f9 =pm(i8,m8);
                                                f12=dm(i2,m1),  f13=pm(i10,m9);
                    f4=f8+f12,  f5=f8-f12,      f8 =dm(i0,m0),  f9 =pm(i8,m8);
        f11=f2*f4,                              f12=dm(i2,m1),  f13=pm(i10,m9);
        f10=f2*f5;

{do all the rest in pairs [U(n), U(HN-n)] except the mid-point}
                    f4=f8+f12,  f5=f8-f12,      f0 =dm(i1,m0),  f1 =pm(i9,m8);

LCNTR = (HN/2)-1, DO trans UNTIL LCE;
        f8=f0*f5,   f6=f9+f13,  f7=f9-f13,      dm(i3,m0)=f3,   f9 =pm(i8,m8);
        f12=f1*f6,  f10=f10+f14,f15=f10-f14,    dm(i4,m1)=f11,  f13=pm(i10,m9);
        f11=f2*f4,  f14=f8+f12,                 f8 =dm(i0,m0),  pm(i11,m8)=f10;
        f10=f1*f5,  f11=f11+f14,f3=f11-f14,     f12=dm(i2,m1),  pm(i12,m9)=f15;
        f12=f0*f6,  f4=f8+f12,  f5=f8-f12,      f0 =dm(i1,m0),  f1 =pm(i9,m8);                          
trans:  f14=f2*f7,  f10=f10-f12;                        
                                                
{clean up and do the mid-point}
        f3=f2*f4,   f10=f10+f14,f15=f10-f14,    dm(i3,m0)=f3;
                    f6=f9+f13,                  dm(i4,m1)=f11,  pm(i11,m8)=f10;
        f1=f2*f6;
                    f10=-f1,                                    pm(i12,m9)=f15;
                                                dm(i3,m0)=f3,   pm(i11,m8)=f10;


{_______first stage radix-4 butterfly without twiddles______}
{ Inverse is accomplished by switching the real & imaginary in this stage}
{  and then switching them again in the last stage}

                        i0=inrefft;
                        i1=inrefft+HN/4;
                        i2=inrefft+HN/2;
                        i3=inrefft+3*HN/4;
                        i4=i0;
                        i5=i1;
                        i6=i2;
                        i7=i3;
                        m0=1;
                        m8=1;
                        i8=inimfft;
                        i9=inimfft+HN/4;
                        i10=inimfft+HN/2;
                        i11=inimfft+3*HN/4;
                        i12=i8;
                        i13=i9;
                        i14=i10;
                        i15=i11;
                        l0 = 0;
                        l1 = l0;
                        l2 = l0;
                        l3 = l0;
                        l4 = l0;
                        l5 = l0;
                        l6 = l0;
                        l7 = l0;
                        l8 = l0;
                        l9 = l0;
                        l10 = l0;
                        l11 = l0;
                        l12 = l0;
                        l13 = l0;
                        l14 = l0;
                        l15 = l0;
                                                f1=dm(i0,m0),   f0=pm(i8,m8);
                                                f3=dm(i2,m0),   f2=pm(i10,m8);
                f0=f0+f2,       f2=f0-f2,       f5=dm(i1,m0),   f4=pm(i9,m8);
                f1=f1+f3,       f3=f1-f3,       f7=dm(i3,m0),   f6=pm(i11,m8);
                f4=f6+f4,       f6=f6-f4;
                f5=f5+f7,       f7=f5-f7;
                f8=f0+f4,       f9=f0-f4;
                f10=f1+f5,      f11=f1-f5;


LCNTR=HN/4,     do fstage until LCE;    { do HN/4 simple radix-4 butterflies }
                f12=f2+f7,      f13=f2-f7,      f1=dm(i0,m0),   f0=pm(i8,m8);
                f14=f3+f6,      f15=f3-f6,      f3=dm(i2,m0),   f2=pm(i10,m8);
                f0=f0+f2,       f2=f0-f2,       f5=dm(i1,m0),   f4=pm(i9,m8);
                f1=f1+f3,       f3=f1-f3,       f7=dm(i3,m0),   f6=pm(i11,m8);
                f4=f6+f4,       f6=f6-f4,       dm(i4,m0)=f8,   pm(i12,m8)=f10;
                f5=f5+f7,       f7=f5-f7,       dm(i5,m0)=f9,   pm(i13,m8)=f11;
                f8=f0+f4,       f9=f0-f4,       dm(i6,m0)=f12,  pm(i14,m8)=f14;
fstage:         f10=f1+f5,      f11=f1-f5,      dm(i7,m0)=f13,  pm(i15,m8)=f15;


{_____________Middle stages with radix-4 main butterfly___________________}

{ m0=1 and m8=1 is still preset }
                        m1=-2;          { reverse step for twiddles }
                        m9=m1;  
                        m2=3;           { forward step for twiddles }
                        m10=m2;
                        m5=4;           { first there are 4 groups }
                        r2=HN/16;       { with HN/16 butterflies in each group }
                        r3=HN/16*3;     { step to next group }
                                
LCNTR=STAGES-2, do mstage until LCE;    { do STAGES-2 stages }

                        i7=cosine;      { first real twiddle }
                        i15=sine;       { first imag twiddle }

                        r8=inrefft;
                        r9=inimfft;

                        i0=r8;          { upper real      path }
        r10=r8+r2;      i8=r9;          { upper imaginary path }
                        i1=r10;         { second real input path }
        r10=r10+r2,     i4=r10;         { second real output path }
                        i2=r10;         { third real input path }
        r10=r10+r2,     i5=r10;         { third real output path }
                        i3=r10;         { fourth real input path }
        r10=r9+r2,      i6=r10;         { fourth real output path }
                        i9=r10;         { second imag input path }
        r10=r10+r2,     i12=r10;        { second imag output path }
                        i10=r10;        { third imag input path }
        r10=r10+r2,     i13=r10;        { third imag output path }
                        i11=r10;        { fourth imag input path }
                        i14=r10;        { fourth imag output path } 
                        m4=r3;
                        m12=r3;
        r4=r3+1,        m6=r2;
                        m3=r4;
        r2=r2-1,        m11=r4;
                        m7=r2;
        
LCNTR=m5,       do mgroup until LCE;    { do m5 groups }
                                                f0=dm(i7,m0),   f5=pm(i9,m8);
f8=f0*f5,                                       f4=dm(i1,m0),   f1=pm(i15,m8);
f9=f0*f4;
f12=f1*f5,                                      f0=dm(i7,m0),   f5=pm(i11,m8);
f13=f1*f4,      f12=f9+f12,                     f4=dm(i3,m0),   f1=pm(i15,m8);
f8=f0*f4,                       f2=f8-f13;
f13=f1*f5;
f9=f0*f5,       f8=f8+f13,                      f0=dm(i7,m1),   f5=pm(i10,m8);
f13=f1*f4,      f12=f8+f12,     f14=f8-f12,     f4=dm(i2,m0),   f1=pm(i15,m9);
f11=f0*f4;
f13=f1*f5,                      f6=f9-f13;
f9=f0*f5,       f13=f11+f13,                    f11=dm(i0,0);
f13=f1*f4,      f8=f11+f13,     f10=f11-f13;
           {___________Do m7 radix-4 butterflies___________}
LCNTR=m7,       do mr4bfly until LCE;
                                f13=f9-f13,     f4=dm(i1,m0),   f5=pm(i9,m8);
                f2=f2+f6,       f15=f2-f6,      f0=dm(i7,m0),   f1=pm(i15,m8);
f8=f0*f4,       f3=f8+f12,      f7=f8-f12,                      f9=pm(i8,0);
f12=f1*f5,      f9=f9+f13,      f11=f9-f13,     f13=f2;
f8=f0*f5,       f12=f8+f12,                     f0=dm(i7,m0),   f5=pm(i11,m8);
f13=f1*f4,      f9=f9+f13,      f6=f9-f13,      f4=dm(i3,m0),   f1=pm(i15,m8);
f8=f0*f4,                       f2=f8-f13,      dm(i0,m0)=f3,   pm(i8,m8)=f9;
f13=f1*f5,      f11=f11+f14,    f7=f11-f14,     dm(i4,m0)=f7,   pm(i12,m8)=f6;
f9=f0*f5,       f8=f8+f13,                      f0=dm(i7,m1),   f5=pm(i10,m8);
f13=f1*f4,      f12=f8+f12,     f14=f8-f12,     f4=dm(i2,m0),   f1=pm(i15,m9);
f11=f0*f4,      f3=f10+f15,     f8=f10-f15,                     pm(i13,m8)=f11;
f13=f1*f5,                      f6=f9-f13,      dm(i6,m0)=f8,   pm(i14,m8)=f7;
f9=f0*f5,       f13=f11+f13,                    f11=dm(i0,0);
mr4bfly:
f13=f1*f4,      f8=f11+f13,     f10=f11-f13,    dm(i5,m0)=f3;
           {___________End radix-4 butterfly_____________}
{                       dummy for address update      *               * }
                                f13=f9-f13,     f0=dm(i7,m2),   f1=pm(i15,m10);
                f2=f2+f6,       f15=f2-f6,      f0=dm(i1,m4),   f1=pm(i9,m12);
                f3=f8+f12,      f7=f8-f12,                      f9=pm(i8,0);
                f9=f9+f13,      f11=f9-f13,     f0=dm(i2,m4);
                f9=f9+f2,       f6=f9-f2,       f0=dm(i3,m4),   f1=pm(i10,m12);
                                                dm(i0,m3)=f3,   pm(i8,m11)=f9;
                f11=f11+f14,    f7=f11-f14,     dm(i4,m3)=f7,   pm(i12,m11)=f6;
                f3=f10+f15,     f8=f10-f15,                     pm(i13,m11)=f11;
                                                dm(i6,m3)=f8,   pm(i14,m11)=f7;
mgroup:                                         dm(i5,m3)=f3,   f1=pm(i11,m12);

                                r3=m4;
                                r1=m5;
                                r2=m6;
        r3=ashift r3 by -2;             { groupstep/4 }
        r1=ashift r1 by 2;              { groups*4 }
                                m5=r1;
mstage: r2=ashift r2 by -2;             { butterflies/4 }
        
{____________________Last radix-4 stage__________________________________}
{ Includes bitreversal of the real data in dm }
{ "real" and "imag" are interchanged once more to achieve inverse FFT}

        bit set mode1 BR0;              { bitreversal in i0 }
{ with: m0=m8=1 preset }
        i4=inrefft;                     { input }
        i1=inrefft+1;
        i2=inrefft+2;
        i3=inrefft+3;
        i0=OREP1;  { real output array base must be an integer multiple of N ...}
        m2=OST;                 { ... we are storing odds first}
        i7=cosine;
        i8=inimfft;                     { input }
        i9=inimfft+1;
        i10=inimfft+2;
        i11=inimfft+3;
        i12=inimfft;                    { output }
        i15=sine;
        m1=4;
        m9=m1;
                                                f0=dm(i7,m0),   f5=pm(i9,m9);
f8=f0*f5,                                       f4=dm(i1,m1),   f1=pm(i15,m8);
f9=f0*f4;
f12=f1*f5,                                      f0=dm(i7,m0),   f5=pm(i11,m9);
f13=f1*f4,      f12=f9+f12,                     f4=dm(i3,m1),   f1=pm(i15,m8);
f8=f0*f4,                       f2=f8-f13;
f13=f1*f5;
f9=f0*f5,       f8=f8+f13,                      f0=dm(i7,m0),   f5=pm(i10,m9);
f13=f1*f4,      f12=f8+f12,     f14=f8-f12,     f4=dm(i2,m1),   f1=pm(i15,m8);
f11=f0*f4;
f13=f1*f5,                      f6=f9-f13;
f9=f0*f5,       f13=f11+f13,                    f11=dm(i4,m1);
f13=f1*f4,      f8=f11+f13,     f10=f11-f13;
            {________Do HN/4-1 radix-4 butterflies_______}
LCNTR=HN/4-1, do lstage until LCE;
                                f13=f9-f13,     f4=dm(i1,m1),   f5=pm(i9,m9);
                f2=f2+f6,       f15=f2-f6,      f0=dm(i7,m0),   f1=pm(i15,m8);
f8=f0*f4,       f3=f8+f12,      f7=f8-f12,                      f9=pm(i8,m9);
f12=f1*f5,      f9=f9+f13,      f11=f9-f13,     f13=f2;
f8=f0*f5,       f12=f8+f12,                     f0=dm(i7,m0),   f5=pm(i11,m9);
f13=f1*f4,      f9=f9+f13,      f6=f9-f13,      f4=dm(i3,m1),   f1=pm(i15,m8);
f8=f0*f4,                       f2=f8-f13,      dm(i0,m2)=f3,   pm(i12,m8)=f9;
f13=f1*f5,      f11=f11+f14,    f7=f11-f14,     dm(i0,m2)=f7,   pm(i12,m8)=f6;
f9=f0*f5,       f8=f8+f13,                      f0=dm(i7,m0),   f5=pm(i10,m9);
f13=f1*f4,      f12=f8+f12,     f14=f8-f12,     f4=dm(i2,m1),   f1=pm(i15,m8);
f11=f0*f4,      f3=f10+f15,     f8=f10-f15,                     pm(i12,m8)=f11;
f13=f1*f5,                      f6=f9-f13,      dm(i0,m2)=f3,   pm(i12,m8)=f7;
f9=f0*f5,       f13=f11+f13,                    f11=dm(i4,m1);
lstage:
f13=f1*f4,      f8=f11+f13,     f10=f11-f13,    dm(i0,m2)=f8;

                                f13=f9-f13;
                f2=f2+f6,       f15=f2-f6;
                f3=f8+f12,      f7=f8-f12,                      f9=pm(i8,m9);
                f9=f9+f13,      f11=f9-f13,     dm(i0,m2)=f3;
                f9=f9+f2,       f6=f9-f2,       dm(i0,m2)=f7;
                                                                pm(i12,m8)=f9;
                f11=f11+f14,    f7=f11-f14,                     pm(i12,m8)=f6;
                f3=f10+f15,     f8=f10-f15,                     pm(i12,m8)=f11;
                                                dm(i0,m2)=f3,   pm(i12,m8)=f7;
                                                dm(i0,m2)=f8;

{________Do the bitreversal of the imaginary part from pm to dm___________}

    {we are sticking the even indexed samples into real}
        i0=ORE;
        i8=inimfft;

        f0=pm(i8,m8);
        LCNTR=HN-1, do pmbr until LCE;    { do HN-1 bitreversals }
pmbr:     dm(i0,m2)=f0, f0=pm(i8,m8);

        rts (db);
        dm(i0,m2)=f0;   
        bit clr MODE1 BR0;               { no bitreversal in i0 any more }      
.ENDSEG;