{________________________________________________________________________
MxNxNx1.ASM	ADSP-21020 Matrix times a Vector

		C[Mx1]=A[MxN]*B[Nx1]

Matrix dimensions are arbitrary. Matrix A is accessed as a circular buffer so
that the dummy look-ahead reads issued at the end of the inner loop come from
a known location.

Revision: 25-APR-91 , Steven Cox , Analog Devices

Assembler Preprocessor Switches:
	-Dexample is used to include assembly of an example calling routine

Calling Information:
	Constants: m, n
	pm(mat_b[n]) row major, dm(mat_a[m*n]) row major, 
	M1=1;
	M9=1;
	B0=mat_a;	L0=@mat_a;
	B1=mat_c;	L1=0;
	B8=mat_b;	L8=@mat_b;

Results:
	dm(mat_c[m]) row major

Altered Registers:
	F0,F4,F8,F12, I0,I1,I8

Benchmark:  mxnxnx1
	cycles=6+M(3+N)+5	(entrance + core + 5 cache)

Memory Usage: pm code=8 words, pm data=n words, dm data=m*n+m words
________________________________________________________________________}
{ dimension constants: row count and column count of mat_a; the column
  count is also the length of mat_b. Each keeps its default of 4 unless the
  symbol has already been defined before this point, for instance from the
  assembler command line, so the routine can be rebuilt for other sizes
  without editing this file. }
#ifndef M
#define		M 4
#endif
#ifndef N
#define		N 4
#endif

{ Library build (the example harness is not requested): make the routine
  entry point visible to other modules and declare the three buffers as
  external symbols. The routine itself reaches the buffers only through the
  index registers loaded by its caller. }
#ifndef example
.GLOBAL mxnxnx1;
.EXTERN mat_a, mat_b,mat_c;
#endif

{ Stand-alone harness, assembled only when the preprocessor symbol for the
  example build is defined. It provides the three buffers, start-up code in
  the rst_svc segment and a caller that loads the registers listed under
  Calling Information in the file header. }
#ifdef example
{ data memory: matrix operand initialised from mat_a.dat, plus the result vector }
.SEGMENT/DM dm_data;
.VAR		mat_a[M*N]="mat_a.dat";
.VAR		mat_c[M];
.ENDSEG;

{ program memory data: vector operand initialised from mat_bb.dat }
.SEGMENT/PM pm_data;
.VAR		mat_b[N]="mat_bb.dat";
.ENDSEG;

{ start-up code: program the wait state registers, then go to the caller.
  NOTE(review): rst_svc is presumably mapped to the reset vector by the
  architecture file - confirm. }
.SEGMENT/PM	rst_svc;
		dmwait=0x21;	{ set dm waitstates to zero }
		pmwait=0x21;	{ set pm waitstates to zero }
		jump setup;
.ENDSEG;

		{ example calling code }
{ NOTE(review): only the B and L registers are loaded here while the routine
  indexes through I0, I1 and I8; this relies on a write to a B register also
  loading the matching I register - confirm in the user manual. }
.SEGMENT/PM pm_code;
setup:		m1=1;				{ step of 1 for the dm accesses }
		m9=1;				{ step of 1 for the pm accesses }
		b0=mat_a;	l0=@mat_a;	{ matrix operand, circular over its own length }
		b1=mat_c;	l1=0;		{ result vector, zero length as the header requires }
		b8=mat_b;	l8=@mat_b;	{ vector operand, circular so it restarts for every row }
		call mxnxnx1;
		idle;				{ park here once the routine has returned }
.ENDSEG;
#endif

		{ matrix multiply starts here }
{ mxnxnx1 - writes one element of mat_c for every row of mat_a, each being
  the dot product of that row with mat_b.

  Expects M1, M9, B0/L0, B1/L1 and B8/L8 to be loaded by the caller as
  listed under Calling Information in the file header.

  Register roles:
    F0  = element read from dm through I0 (matrix operand)
    F4  = element read from pm through I8 (vector operand)
    F12 = most recent product of F0 and F4
    F8  = running sum for the current row

  The fetch, the multiply and the accumulate are software pipelined: a
  pair is fetched in one instruction, multiplied in the next and added to
  the sum in the one after that. The two instructions ahead of the loops
  prime this pipeline. As a result the last two passes of every inner loop
  already fetch the first two operand pairs of the following row, and F12
  already holds the first product of that row when the inner loop exits.
  After the final row these look-ahead fetches are dummies; the circular
  length set on I0 by the caller keeps them inside mat_a.

  NOTE(review): this reading assumes that every multifunction instruction
  reads its source registers before any of its results are written, which
  is also what lets the last loop instruction store F8 while clearing it -
  confirm against the user manual.
  NOTE(review): the file header quotes 8 words of pm code and three cycles
  of overhead per row, while this listing holds 7 instructions with two
  instructions per row outside the inner loop body - confirm which is
  current. }
.SEGMENT/PM pm_code;
		{ zero the running sum and fetch the first operand pair }
mxnxnx1:	r8=r8 xor r8, f0=dm(i0,m1), f4=pm(i8,m9); { clear f8 }
		{ first product, with the second operand pair fetched alongside }
		f12=f0*f4,   f0=dm(i0,m1), f4=pm(i8,m9);
		{ outer loop: one pass per element of the result vector }
		lcntr=M, do column until lce;
		  { inner loop: one pass per term of the row dot product }
		  lcntr=N, do row until lce;
		    { add the previous product to the sum, form the next product,
		      fetch the pair after that }
row:		    f12=f0*f4, f8=f8+f12, f0=dm(i0,m1), f4=pm(i8,m9);
		  { store the finished sum through I1 and zero the sum for the next row }
column:		  r8=r8 xor r8, dm(i1,m1)=f8;
		rts;
.ENDSEG;