{----------------------------------------------------------------------------
conv3x3.asm:    Efficient 3 X 3 Convolution
-----------------------------------------------------------------------------
Description:

A 3x3 convolution in which both the kernel and the data are in memory
involves 18 reads from memory, 9 multiplies, 8 additions, and 1 write to
memory.  The bottleneck is the 9 multiplies - each 3x3 convolution must take
a mimimum of 9 cycles. But the 19 memory I/O operations create another
bottleneck - they would take a minimum of 10 cycles even with data in DM,
kernel in PM. Multifunction restrictions will not permit us to store the 9
kernel values in the register file, either.

The solution used here in achieving an effective 9 cycle rate is to observe
that there is much to be gained by taking advantage of data & kernel overlap
between successive convolutions:

DATA ARRAY:
 +-------------------------------------------------------------+
 |                                                             |
 |  0,0     0,1     0,2     0,3     0,4     . . .     0,M-1    |
 |                                                             |
 |  1,0     1,1     1,2     1,3     1,4     . . .       .      |
 |                                                      .      |
 |  2,0     2,1     2,2     2,3     2,4     . . .       .      |
 |                                                             |
 |   .       .       .       .       .                         |
 ~   .       .       .       .       .                         ~
 ~   .       .       .       .       .                         ~ 
 |                                                    N-2,M-1  |
 |                                                             |
 |  N-1,0               . . .               N-1,M-2   N-1,M-1  |
 |                                                             |
 +-------------------------------------------------------------+

I will refer to the data array elements in the upper left as {d00,d01...}.
The convolution kernel elements I will call { c00, c01, ...  , c32, c33 }.
The first two convolutions are formed from these products:

c00*d00    = a0  ---\
c01*d01    = a1      \
c02*d02    = a2       \
c10*d10    = a3        \
c11*d11    = a4         --->  SUM  ---> convolution sum #1
c12*d12    = a5        /
c20*d20    = a6       / 
c21*d21    = a7      /
c22*d22    = a8  ---/


c00*d01    = b0  ---\
c01*d02    = b1      \
c02*d03    = b2       \
c10*d11    = b3        \
c11*d12    = b4         --->  SUM  ---> convolution sum #2
c12*d13    = b5        /
c20*d21    = b6       / 
c21*d22    = b7      /
c22*d23    = b8  ---/

Note that there are only three new data values introduced between the first
and second convolution - d03, d13, and d23. It is this redundancy, and the
availability of a 16-entry register file on the ADSP-21020, that allows an
implementation of the convolution of order 9N^2.

The intermediate products above (a0-a8 and b0-b8) are referred to in the
coding that follows.

The following section is detailed walkthrough of how the looped code was
generated from pseudo code.

-----------------------------------------------------------------------------
Looping the Convolution:

Here is some pseudo-code which implements these two convolutions in a format
similiar to the 21020's multifunction code:

--------------------------------------------
UNROLLED, UNSHARED PSEUDO-CODE
--------------------------------------------

                        c00=pm  d00=dm
            ra=c00*d00  c01=pm  d01=dm      { a0 }
            rb=c00*d01          d02=dm      { b0 }
            pa=c01*d01  c02=pm              { a1 }
ra=ra+pa    pa=c02*d02                      { a2 }
ra=ra+pa    pb=c01*d02          d03=dm      { b1 }
rb=rb+pb    pb=c02*d03                      { b2 }
rb=rb+pb    

                        c10=pm  d10=dm
            pa=c10*d10  c11=pm  d11=dm      { a3 }
ra=ra+pa    pb=c10*d11          d12=dm      { b3 }
rb=rb+pb    pa=c11*d11  c12=pm              { a4 }
ra=ra+pa    pa=c12*d12                      { a5 }
ra=ra+pa    pb=c11*d12          d13=dm      { b4 }
rb=rb+pb    pb=c12*d13                      { b5 }
rb=rb+pb    

                        c20=pm  d20=dm
            pa=c20*d20  c21=pm  d21=dm      { a6 }
ra=ra+pa    pb=c20*d21          d22=dm      { b6 }
rb=rb+pb    pa=c21*d21  c22=pm              { a7 }
ra=ra+pa    pa=c22*d22                      { a8 }
ra=ra+pa    pb=c21*d22          d23=dm      { b7 }
rb=rb+pb    pb=c22*d23                      { b8 }
rb=rb+pb    

                                dm=ra
                                dm=rb


Here, "ra" and "rb" refer to accumulators "a" and "b" for convolutions 1 and 2
respectively. "pa" and "pb" refer to product registers in the same way. data and
coefficients are retrieved from PM and DM; we hold off worrying about index and
modify registers until we implement the algorithm in real assembly code.


Now, I will roll the code together, taking advantage of multifunction operations:

--------------------------------------------
SHARED, UNROLLED PSEUDO-CODE
--------------------------------------------

                        c00=pm  d00=dm
            ra=c00*d00  c01=pm  d01=dm      { a0 }
            rb=c00*d01          d02=dm      { b0 }
            pa=c01*d01  c02=pm              { a1 }
ra=ra+pa    pa=c02*d02                      { a2 }
ra=ra+pa    pb=c01*d02          d03=dm      { b1 }
rb=rb+pb    pb=c02*d03  c10=pm  d10=dm      { b2 }
rb=rb+pb    pa=c10*d10  c11=pm  d11=dm      { a3 }
ra=ra+pa    pb=c10*d11          d12=dm      { b3 }
rb=rb+pb    pa=c11*d11  c12=pm              { a4 }
ra=ra+pa    pa=c12*d12                      { a5 }
ra=ra+pa    pb=c11*d12          d13=dm      { b4 }
rb=rb+pb    pb=c12*d13  c20=pm  d20=dm      { b5 }
rb=rb+pb    pa=c20*d20  c21=pm  d21=dm      { a6 }
ra=ra+pa    pb=c20*d21          d22=dm      { b6 }
rb=rb+pb    pa=c21*d21  c22=pm              { a7 }
ra=ra+pa    pa=c22*d22                      { a8 }
ra=ra+pa    pb=c21*d22          d23=dm      { b7 }
rb=rb+pb    pb=c22*d23          dm=ra       { b8 }
rb=rb+pb
                                dm=rb

Now, I will roll it into a loop. 
--------------------------------------------
SHARED, ROLLED PSEUDO-CODE
--------------------------------------------

                                        c00=pm  d00=dm
                        ra=c00*d00      c01=pm  d01=dm      { a0 }
                        rb=c00*d01              d02=dm      { b0 }
                        pa=c01*d01      c02=pm              { a1 }

do convo until done;
            ra=ra+pa    pa=c02*d02                          { a2 }
            ra=ra+pa    pb=c01*d02              d03=dm      { b1 }
            rb=rb+pb    pb=c02*d03      c10=pm  d10=dm      { b2 }
            rb=rb+pb    pa=c10*d10      c11=pm  d11=dm      { a3 }
            ra=ra+pa    pb=c10*d11              d12=dm      { b3 }
            rb=rb+pb    pa=c11*d11      c12=pm              { a4 }
            ra=ra+pa    pa=c12*d12                          { a5 }
            ra=ra+pa    pb=c11*d12              d13=dm      { b4 }
            rb=rb+pb    pb=c12*d13      c20=pm  d20=dm      { b5 }
            rb=rb+pb    pa=c20*d20      c21=pm  d21=dm      { a6 }
            ra=ra+pa    pb=c20*d21              d22=dm      { b6 }
            rb=rb+pb    pa=c21*d21      c22=pm              { a7 }
            ra=ra+pa    pa=c22*d22              d23=dm      { a8 }
            ra=ra+pa    pb=c21*d22              d00=dm      { b7 }
            rb=rb+pb    pb=c22*d23      c00=pm  d01=dm      { b8 }
                        ra=c00*d00      c01=pm  dm=ra       { a0 }
                        rb=c00*d01              dm=rb       { b0 }
convo:      rb=rb+pb    pa=c01*d01      c02=pm  d02=dm      { a1 }



Finally, I will make some real register assignments:

        ra=f8
        rb=f9
        pa=f12
        pb=f13
        cx0=f0
        cx1=f1
        cx2=f2
        dx0=f4
        dx1=f5
        dx2=f6
        dx2=f7

--------------------------------------------
SHARED, ROLLED SEMI-REAL CODE
--------------------------------------------



                                        f0=(i8,m8)      f4=dm(i0,m0)
                        f8=f0*f4        f1=(i8,m8)      f5=dm(i0,m0)        { a0 }
                        f9=f0*f5                        f6=dm(i0,m0)        { b0 }
                        f12=f1*f5       f2=(i8,m9)                          { a1 }

do convo until done;
            f8=f8+f12    f12=f2*f6                                          { a2 }
            f8=f8+f12    f13=f1*f6                      f7=dm(i0,m1)        { b1 }
            f9=f9+f13    f13=f2*f7      f0=(i8,m8)      f4=dm(i0,m0)        { b2 }
            f9=f9+f13    f12=f0*f4      f1=(i8,m8)      f5=dm(i0,m0)        { a3 }
            f8=f8+f12    f13=f0*f5                      f6=dm(i0,m0)        { b3 }
            f9=f9+f13    f12=f1*f5      f2=(i8,m9)                          { a4 }
            f8=f8+f12    f12=f2*f6                                          { a5 }
            f8=f8+f12    f13=f1*f6                      f7=dm(i0,m1)        { b4 }
            f9=f9+f13    f13=f2*f7      f0=(i8,m8)      f4=dm(i0,m0)        { b5 }
            f9=f9+f13    f12=f0*f4      f1=(i8,m8)      f5=dm(i0,m0)        { a6 }
            f8=f8+f12    f13=f0*f5                      f6=dm(i0,m0)        { b6 }
            f9=f9+f13    f12=f1*f5      f2=(i8,m9)                          { a7 }
            f8=f8+f12    f12=f2*f6                      f7=dm(i0,m1)        { a8 }
            f8=f8+f12    f13=f1*f6                      f4=dm(i0,m0)        { b7 }
            f9=f9+f13    f13=f2*f7      f0=(i8,m8)      f5=dm(i0,m0)        { b8 }
                         f8=f0*f4       f1=(i8,m8)      dm(i1,m0)=f8        { a0 }
                         f9=f0*f5                       dm(i1,m0)=f9        { b0 }
convo:      f9=f9+f13    f12=f1*f5      f2=(i8,m9)      f6=dm(i0,m0)        { a1 }


The real, working code can be found below. A fortunate by-product of this method is
that the input data and output data both wind up in DM; no ping-ponging between
memories is need to perform a series of convolutions.


-----------------------------------------------------------------------------
Program Characteristics:

Calling Registers (NxM array):

    DAG1 (Data Memory) 
    i0      = start of array to be convolved
    m0      = spacing to next element in a row      = +1
    m1      = spacing to first element of next row  = M-3
    m2      = spacing to next 3x3 matrix            = 2M - 1
    m3      = 2

    DAG2 (Program Memory)
    i1      = start of output array
    i8      = start of kernel coefficients
    l8      = 9
    m8      = 1

Registers Altered:
   r0-r9, r12-r15

Return Values:  output matrix in Data Memory pointed to by i1

Computation Time:
    conv3x3     =   7 + (R-2)*(5+(C/2-2)*18+17) + 3
                =   10 + (R-2)*(22+(9C-36))
                =   10 + (R-2)*(9C-14)
                =   9RC - 14R - 18C + 38        for R,C in terms of input
                =   9RC + 4R + 10               for R,C in terms of output
                =   94.4 ms for 512x512 image @ 25MHz


Note about Efficiency:

If the size of the input matrix is R=C=514, this algorithm takes 2,361,354
cycles.  Since 512x512 convolutions are performed, and 9 cycles/convolution
is the peak theoretical performance, the theoretical minimum number of cycles
for a processor with a single multiplier is 9*512*51 = 2,359,296 cycles.
Thus this algorithm requires only .09% more than the theoretical minimum for
N=514.
-----------------------------------------------------------------------------
Author:     Jim Donahue, Analog Devices DSP Division
Revised:    4-APR-91
----------------------------------------------------------------------------}

.EXTERN conv3x3;
.EXTERN num_rows;
.EXTERN num_cols;

.GLOBAL conv3x3;

{----------------------------------------------------------------------------
    Perform 3x3 by NxM convolution
----------------------------------------------------------------------------}
.SEGMENT /pm pm_code;
conv3x3:    r14=dm(num_rows);
            r0=2;
            r14=r14-r0;                 { r14 = num_rows - 2                }
            r15=dm(num_cols);
            r15=lshift r15 by -1;
            r15=r15-r0;                 { r15 = num_cols/2 - 2              }

            lcntr=r14, do rows until lce;
                           f4=dm(i0,m0), f0=pm(i8,m8);
                f8=f0*f4,  f5=dm(i0,m0), f1=pm(i8,m8);
                f9=f0*f5,  f6=dm(i0,m0)              ;
                f12=f1*f5,               f2=pm(i8,m8); 
                lcntr=r15, do cols until lce;
                    f12=f2*f6, f8=f8+f12                            ;                          
                    f13=f1*f6, f8=f8+f12, f7=dm(i0,m1)              ;
                    f13=f2*f7, f9=f9+f13, f4=dm(i0,m0), f0=pm(i8,m8);
                    f12=f0*f4, f9=f9+f13, f5=dm(i0,m0), f1=pm(i8,m8);
                    f13=f0*f5, f8=f8+f12, f6=dm(i0,m0)              ;
                    f12=f1*f5, f9=f9+f13,               f2=pm(i8,m8);
                    f12=f2*f6, f8=f8+f12                            ;                          
                    f13=f1*f6, f8=f8+f12, f7=dm(i0,m1)              ;
                    f13=f2*f7, f9=f9+f13, f4=dm(i0,m0), f0=pm(i8,m8);
                    f12=f0*f4, f9=f9+f13, f5=dm(i0,m0), f1=pm(i8,m8);
                    f13=f0*f5, f8=f8+f12, f6=dm(i0,m0)              ;
                    f12=f1*f5, f9=f9+f13,               f2=pm(i8,m8);
                    f12=f2*f6, f8=f8+f12, f7=dm(i0,m2)              ;
                    f13=f1*f6, f8=f8+f12, f4=dm(i0,m0)              ;
                    f13=f2*f7, f9=f9+f13, f5=dm(i0,m0), f0=pm(i8,m8);
                    f8=f0*f4,  f9=f9+f13, dm(i1,m0)=f8, f1=pm(i8,m8);
                    f9=f0*f5,             dm(i1,m0)=f9              ;
cols:               f12=f1*f5,            f6=dm(i0,m0), f2=pm(i8,m8);

                f12=f2*f6, f8=f8+f12                            ;                          
                f13=f1*f6, f8=f8+f12, f7=dm(i0,m1)              ;
                f13=f2*f7, f9=f9+f13, f4=dm(i0,m0), f0=pm(i8,m8);
                f12=f0*f4, f9=f9+f13, f5=dm(i0,m0), f1=pm(i8,m8);
                f13=f0*f5, f8=f8+f12, f6=dm(i0,m0)              ;
                f12=f1*f5, f9=f9+f13,               f2=pm(i8,m8);
                f12=f2*f6, f8=f8+f12                            ;                          
                f13=f1*f6, f8=f8+f12, f7=dm(i0,m1)              ;
                f13=f2*f7, f9=f9+f13, f4=dm(i0,m0), f0=pm(i8,m8);
                f12=f0*f4, f9=f9+f13, f5=dm(i0,m0), f1=pm(i8,m8);
                f13=f0*f5, f8=f8+f12, f6=dm(i0,m0)              ;
                f12=f1*f5, f9=f9+f13,               f2=pm(i8,m8);
                f12=f2*f6, f8=f8+f12, f7=dm(i0,m2)              ;
                f13=f1*f6, f8=f8+f12;
                f13=f2*f7, f9=f9+f13, modify(i0,m3);
                f9=f9+f13, dm(i1,m0)=f8;
rows:           dm(i1,m0)=f9;

            rts;

.ENDSEG;