#include <stdlib.h>
#include <stdio.h>
#include "matmuls.h"

/*
   This file contains a collection of routines to multiply two matrices
   and store it in a third. As we have to work with two-dimensional
   arrays, and C is really too stupid to handle that flexibly,
   we can use either of two approaches:
   1. Store it in a 1-D array and compute the indices ourselves.
   2. Store it in an array of vectors.

   In this file only the first approach has been implemented.

   The matrix product C = A B is computed.
   The arrays are stored row-wise.
   A is n1 by n2 (row length n2)
   B is n2 by n3 (row length n3)
   C is n1 by n3 (row length n3)

   Author:	G.D. van Albada
		Section Computational Science
		CSP Lab
		Faculty of Science
		Universiteit van Amsterdam
   Date:	September 6, 2003
   Copyright:	(C) Universiteit van Amsterdam

 */
/* simple straightforward matrix multiplication with n1 in the outer loop and
   n3 in the inner */
void
MatMul_1(double *A, double *B, double *C, int n1, int n2, int n3)
{
    int     i, j, k, iA, iB, iC;

    for (i = 0; i < n1; i++)
    {
	iC = n3 * i;
	for (k = 0; k < n3; k++)
	{
	    C[iC] = 0;
	    iC++;
	}
    }
    for (i = 0; i < n1; i++)
    {
	iA = n2 * i;
	for (j = 0; j < n2; j++)
	{
	    iC = n3 * i;
	    iB = n3 * j;
	    for (k = 0; k < n3; k++)
	    {
		C[iC] += A[iA] * B[iB];
		iC++;
		iB++;
	    }
	    iA++;
	}
    }
}

/* simple straightforward matrix multiplication with n2 in the outer loop and
   n3 in the inner */
void
MatMul_2(double *A, double *B, double *C, int n1, int n2, int n3)
{
    int     i, j, k, iA, iB, iC;

    for (i = 0; i < n1; i++)
    {
	iC = n3 * i;
	for (k = 0; k < n3; k++)
	{
	    C[iC] = 0;
	    iC++;
	}
    }
    for (j = 0; j < n2; j++)
    {
	iA = j;
	for (i = 0; i < n1; i++)
	{
	    iC = n3 * i;
	    iB = n3 * j;
	    for (k = 0; k < n3; k++)
	    {
		C[iC] += A[iA] * B[iB];
		iC++;
		iB++;
	    }
	    iA += n2;
	}
    }
}

/* simple straightforward matrix multiplication with n2 in the outer loop and
   n1 in the inner */
void
MatMul_3(double *A, double *B, double *C, int n1, int n2, int n3)
{
    int     i, j, k, iA, iB, iC;

    for (i = 0; i < n1; i++)
    {
	iC = n3 * i;
	for (k = 0; k < n3; k++)
	{
	    C[iC] = 0;
	    iC++;
	}
    }
    for (j = 0; j < n2; j++)
    {
	iB = n3 * j;
	for (k = 0; k < n3; k++)
	{
	    iA = j;
	    iC = k;
	    for (i = 0; i < n1; i++)
	    {
		C[iC] += A[iA] * B[iB];
		iC += n3;
		iA += n2;
	    }
	    iB++;
	}
    }
}


/* simple straightforward matrix multiplication with n1 in the outer loop and
   n2 in the inner */
void
MatMul_4(double *A, double *B, double *C, int n1, int n2, int n3)
{
    int     i, j, k, iA, iB, iC;

    for (i = 0; i < n1; i++)
    {
	iC = n3 * i;
	for (k = 0; k < n3; k++)
	{
	    C[iC] = 0;
	    iC++;
	}
    }
    for (i = 0; i < n1; i++)
    {
	iC = n3 * i;
	for (k = 0; k < n3; k++)
	{
	    iA = n2 * i;
	    iB = k;
	    for (j = 0; j < n2; j++)
	    {
		C[iC] += A[iA] * B[iB];
		iB += n3;
		iA++;
	    }
	    iC++;
	}
    }
}

/* simple straightforward matrix multiplication with n3 in the outer loop and
   n2 in the inner */
void
MatMul_5(double *A, double *B, double *C, int n1, int n2, int n3)
{
    int     i, j, k, iA, iB, iC;

    for (i = 0; i < n1; i++)
    {
	iC = n3 * i;
	for (k = 0; k < n3; k++)
	{
	    C[iC] = 0;
	    iC++;
	}
    }
    for (k = 0; k < n3; k++)
    {
	for (i = 0; i < n1; i++)
	{
	    iA = n2 * i;
	    iC = n3 * i + k;
	    iB = k;
	    for (j = 0; j < n2; j++)
	    {
		C[iC] += A[iA] * B[iB];
		iB += n3;
		iA++;
	    }
	}
    }
}

/* simple straightforward matrix multiplication with n3 in the outer loop and
   n1 in the inner */
void
MatMul_6(double *A, double *B, double *C, int n1, int n2, int n3)
{
    int     i, j, k, iA, iB, iC;

    for (i = 0; i < n1; i++)
    {
	iC = n3 * i;
	for (k = 0; k < n3; k++)
	{
	    C[iC] = 0;
	    iC++;
	}
    }
    for (k = 0; k < n3; k++)
    {
	iB = k;
	for (j = 0; j < n2; j++)
	{
	    iA = j;
	    iC = k;
	    for (i = 0; i < n1; i++)
	    {
		C[iC] += A[iA] * B[iB];
		iC += n3;
		iA += n2;
	    }
	    iB += n3;
	}
    }
}

/* From here follow a sequence of matrix multipliers that use a sub-block
   approach to optimise the cache access. The sub-block dimensions
   may not be optimal */

#define SUBM_SIZE	(73)

void
MatMul_7(double *A, double *B, double *C, int n1, int n2, int n3)
{
    int     i, j, k, iA, iB, iC;
    int     li, ui, lj, uj, lk, uk;

    for (i = 0; i < n1; i++)
    {
	iC = n3 * i;
	for (k = 0; k < n3; k++)
	{
	    C[iC] = 0;
	    iC++;
	}
    }
    for (li = 0; li < n1; li = ui)
    {
	ui = li + SUBM_SIZE;
	ui = (ui > n1) ? n1 : ui;
	for (lj = 0; lj < n2; lj = uj)
	{
	    uj = lj + SUBM_SIZE;
	    uj = (uj > n2) ? n2 : uj;
	    for (lk = 0; lk < n3; lk = uk)
	    {
		uk = lk + SUBM_SIZE;
		uk = (uk > n3) ? n3 : uk;
		for (i = li; i < ui; i++)
		{
		    iA = n2 * i + lj;
		    for (j = lj; j < uj; j++)
		    {
			iC = n3 * i + lk;
			iB = n3 * j + lk;
			for (k = lk; k < uk; k++)
			{
			    C[iC] += A[iA] * B[iB];
			    iC++;
			    iB++;
			}
			iA++;
		    }
		}
	    }
	}
    }
}

void
MatMul_8(double *A, double *B, double *C, int n1, int n2, int n3)
{
    int     i, j, k, iA, iB, iC;
    int     li, ui, lj, uj, lk, uk;

    for (i = 0; i < n1; i++)
    {
	iC = n3 * i;
	for (k = 0; k < n3; k++)
	{
	    C[iC] = 0;
	    iC++;
	}
    }
    for (lj = 0; lj < n2; lj = uj)
    {
	uj = lj + SUBM_SIZE;
	uj = (uj > n2) ? n2 : uj;
	for (li = 0; li < n1; li = ui)
	{
	    ui = li + SUBM_SIZE;
	    ui = (ui > n1) ? n1 : ui;
	    for (lk = 0; lk < n3; lk = uk)
	    {
		uk = lk + SUBM_SIZE;
		uk = (uk > n3) ? n3 : uk;
		for (j = lj; j < uj; j++)
		{
		    iA = j + li * n2;
		    for (i = li; i < ui; i++)
		    {
			iC = n3 * i + lk;
			iB = n3 * j + lk;
			for (k = lk; k < uk; k++)
			{
			    C[iC] += A[iA] * B[iB];
			    iC++;
			    iB++;
			}
			iA += n2;
		    }
		}
	    }
	}
    }
}


void
MatMul_9(double *A, double *B, double *C, int n1, int n2, int n3)
{
    int     i, j, k, iA, iC;
    int     li, ui, lj, uj, lk, uk;
    double *Cc, *Bb;

    for (i = 0; i < n1; i++)
    {
	iC = n3 * i;
	for (k = 0; k < n3; k++)
	{
	    C[iC] = 0;
	    iC++;
	}
    }
    for (li = 0; li < n1; li = ui)
    {
	ui = li + SUBM_SIZE;
	ui = (ui > n1) ? n1 : ui;
	for (lj = 0; lj < n2; lj = uj)
	{
	    uj = lj + SUBM_SIZE;
	    uj = (uj > n2) ? n2 : uj;
	    for (lk = 0; lk < n3; lk = uk)
	    {
		uk = lk + SUBM_SIZE;
		uk = (uk > n3) ? n3 : uk;
		for (i = li; i < ui; i++)
		{
		    iA = n2 * i + lj;
		    for (j = lj; j < uj; j++)
		    {
			Cc = C + n3 * i + lk;
			Bb = B + n3 * j + lk;
			for (k = lk; k < uk; k++)
			{
			    *Cc++ += A[iA] * *Bb++;
			}
			iA++;
		    }
		}
	    }
	}
    }
}

/* Using a much smaller sub-block size */

#undef  SUBM_SIZE
#define SUBM_SIZE	(12)


void
MatMul_10(double *A, double *B, double *C, int n1, int n2, int n3)
{
    int     i, j, k, iA, iB, iC;
    int     li, ui, lj, uj, lk, uk;

    for (i = 0; i < n1; i++)
    {
	iC = n3 * i;
	for (k = 0; k < n3; k++)
	{
	    C[iC] = 0;
	    iC++;
	}
    }
    for (li = 0; li < n1; li = ui)
    {
	ui = li + SUBM_SIZE;
	ui = (ui > n1) ? n1 : ui;
	for (lj = 0; lj < n2; lj = uj)
	{
	    uj = lj + SUBM_SIZE;
	    uj = (uj > n2) ? n2 : uj;
	    for (lk = 0; lk < n3; lk = uk)
	    {
		uk = lk + SUBM_SIZE;
		uk = (uk > n3) ? n3 : uk;
		for (i = li; i < ui; i++)
		{
		    iA = n2 * i + lj;
		    for (j = lj; j < uj; j++)
		    {
			iC = n3 * i + lk;
			iB = n3 * j + lk;
			for (k = lk; k < uk; k++)
			{
			    C[iC] += A[iA] * B[iB];
			    iC++;
			    iB++;
			}
			iA++;
		    }
		}
	    }
	}
    }
}


void
MatMul_11(double *A, double *B, double *C, int n1, int n2, int n3)
{
    int     i, j, k, iA, iB, iC;
    int     li, ui, lj, uj, lk, uk;

    for (i = 0; i < n1; i++)
    {
	iC = n3 * i;
	for (k = 0; k < n3; k++)
	{
	    C[iC] = 0;
	    iC++;
	}
    }
    for (li = 0; li < n1; li = ui)
    {
	ui = li + SUBM_SIZE;
	ui = (ui > n1) ? n1 : ui;
	for (lj = 0; lj < n2; lj = uj)
	{
	    uj = lj + SUBM_SIZE;
	    uj = (uj > n2) ? n2 : uj;
	    for (lk = 0; lk < n3; lk = uk)
	    {
		uk = lk + SUBM_SIZE;
		uk = (uk > n3) ? n3 : uk;
		for (i = li; i < ui; i++)
		{
		    iA = n2 * i + lj;
		    for (j = lj; j < uj; j++)
		    {
			iC = n3 * i + lk;
			iB = n3 * j + lk;
			if (uk < n3)
			{
			    C[iC++] += A[iA] * B[iB++];
			    C[iC++] += A[iA] * B[iB++];
			    C[iC++] += A[iA] * B[iB++];
			    C[iC++] += A[iA] * B[iB++];
			    C[iC++] += A[iA] * B[iB++];
			    C[iC++] += A[iA] * B[iB++];
			    C[iC++] += A[iA] * B[iB++];
			    C[iC++] += A[iA] * B[iB++];
			    C[iC++] += A[iA] * B[iB++];
			    C[iC++] += A[iA] * B[iB++];
			    C[iC++] += A[iA] * B[iB++];
			    C[iC++] += A[iA] * B[iB++];
			} else
			    for (k = lk; k < uk; k++)
			    {
				C[iC] += A[iA] * B[iB];
				iC++;
				iB++;
			    }
			iA++;
		    }
		}
	    }
	}
    }
}

/* dynamically modifiable sub-block size; this makes it harder for the
   compiler to optimise its code */
static int subm_size = 73;

int
set_subm_size(int val)
{
    int     rv = subm_size;

    subm_size = val;
    return rv;
}

void
MatMul_12(double *A, double *B, double *C, int n1, int n2, int n3)
{
    int     i, j, k, iA, iC;
    int     li, ui, lj, uj, lk, uk;
    double *Cc, *Bb;

    for (i = 0; i < n1; i++)
    {
	iC = n3 * i;
	for (k = 0; k < n3; k++)
	{
	    C[iC] = 0;
	    iC++;
	}
    }
    for (li = 0; li < n1; li = ui)
    {
	ui = li + subm_size;
	ui = (ui > n1) ? n1 : ui;
	for (lj = 0; lj < n2; lj = uj)
	{
	    uj = lj + subm_size;
	    uj = (uj > n2) ? n2 : uj;
	    for (lk = 0; lk < n3; lk = uk)
	    {
		uk = lk + subm_size;
		uk = (uk > n3) ? n3 : uk;
		for (i = li; i < ui; i++)
		{
		    iA = n2 * i + lj;
		    for (j = lj; j < uj; j++)
		    {
			Cc = C + n3 * i + lk;
			Bb = B + n3 * j + lk;
			for (k = lk; k < uk; k++)
			{
			    *Cc++ += A[iA] * *Bb++;
			}
			iA++;
		    }
		}
	    }
	}
    }
}


void
MatMul_13(double *A, double *B, double *C, int n1, int n2, int n3)
{
    int     i, j, k, iA, iC;
    int     li, ui, lj, uj, lk, uk;
    double *Cc, *Bb;
    register double Aa;

    for (i = 0; i < n1; i++)
    {
	iC = n3 * i;
	for (k = 0; k < n3; k++)
	{
	    C[iC] = 0;
	    iC++;
	}
    }
    for (li = 0; li < n1; li = ui)
    {
	ui = li + subm_size;
	ui = (ui > n1) ? n1 : ui;
	for (lj = 0; lj < n2; lj = uj)
	{
	    uj = lj + subm_size;
	    uj = (uj > n2) ? n2 : uj;
	    for (lk = 0; lk < n3; lk = uk)
	    {
		uk = lk + subm_size;
		uk = (uk > n3) ? n3 : uk;
		for (i = li; i < ui; i++)
		{
		    iA = n2 * i + lj;
		    for (j = lj; j < uj; j++)
		    {
			Aa = A[iA];
			Cc = C + n3 * i + lk;
			Bb = B + n3 * j + lk;
			if ((uk - lk) & 3)
			{
			    for (k = lk; k < uk; k++)
			    {
				*Cc++ += Aa * *Bb++;
			    }
			} else
			{
			    for (k = lk; k < uk; k += 4)
			    {
				*Cc++ += Aa * *Bb++;
				*Cc++ += Aa * *Bb++;
				*Cc++ += Aa * *Bb++;
				*Cc++ += Aa * *Bb++;
			    }
			}
			iA++;
		    }
		}
	    }
	}
    }
}


void
MatMul_14(double *A, double *B, double *C, int n1, int n2, int n3)
{
    int     i, j, k, iA, iC;
    int     lj, uj, lk, uk;
    double *Cc, *Bb;
    register double Aa;

    for (i = 0; i < n1; i++)
    {
	iC = n3 * i;
	for (k = 0; k < n3; k++)
	{
	    C[iC] = 0;
	    iC++;
	}
    }
    for (lj = 0; lj < n2; lj = uj)
    {
	uj = lj + subm_size;
	uj = (uj > n2) ? n2 : uj;
	for (lk = 0; lk < n3; lk = uk)
	{
	    uk = lk + subm_size;
	    uk = (uk > n3) ? n3 : uk;
	    for (i = 0; i < n1; i++)
	    {
		iA = n2 * i + lj;
		for (j = lj; j < uj; j++)
		{
		    Aa = A[iA];
		    Cc = C + n3 * i + lk;
		    Bb = B + n3 * j + lk;
		    if ((uk - lk) & 3)
		    {
			for (k = lk; k < uk; k++)
			{
			    *Cc++ += Aa * *Bb++;
			}
		    } else
		    {
			for (k = lk; k < uk; k += 4)
			{
			    *Cc++ += Aa * *Bb++;
			    *Cc++ += Aa * *Bb++;
			    *Cc++ += Aa * *Bb++;
			    *Cc++ += Aa * *Bb++;
			}
		    }
		    iA++;
		}
	    }
	}
    }
}


void
MatMul_15(double *A, double *B, double *C, int n1, int n2, int n3)
{
    int     i, j, k, iC;
    int     li, ui, lj, uj, lk, uk;
    double *Cc, *Bb, *Aa;

    for (i = 0; i < n1; i++)
    {
	iC = n3 * i;
	for (k = 0; k < n3; k++)
	{
	    C[iC] = 0;
	    iC++;
	}
    }
    for (li = 0; li < n1; li = ui)
    {
	ui = li + subm_size;
	ui = (ui > n1) ? n1 : ui;
	for (lj = 0; lj < n2; lj = uj)
	{
	    uj = lj + subm_size;
	    uj = (uj > n2) ? n2 : uj;
	    for (lk = 0; lk < n3; lk = uk)
	    {
		uk = lk + subm_size;
		uk = (uk > n3) ? n3 : uk;
		for (i = li; i < ui; i++)
		{
		    for (k = lk; k < uk; k++)
		    {
			Cc = C + n3 * i + k;
			Bb = B + n3 * lj + k;
			Aa = A + n2 * i + lj;
			for (j = lj; j < uj; j++)
			{
			    *Cc += *Aa++ * *Bb;
			    Bb += n3;
			}
		    }
		}
	    }
	}
    }
}


void
MatMul_16(double *A, double *B, double *C, int n1, int n2, int n3)
{
    int     i, j, k, iC;
    int     li, ui, lj, uj, lk, uk;
    double *Cc, *Bb, *Aa;
    register double Accum;

    for (i = 0; i < n1; i++)
    {
	iC = n3 * i;
	for (k = 0; k < n3; k++)
	{
	    C[iC] = 0;
	    iC++;
	}
    }
    for (li = 0; li < n1; li = ui)
    {
	ui = li + subm_size;
	ui = (ui > n1) ? n1 : ui;
	for (lj = 0; lj < n2; lj = uj)
	{
	    uj = lj + subm_size;
	    uj = (uj > n2) ? n2 : uj;
	    for (lk = 0; lk < n3; lk = uk)
	    {
		uk = lk + subm_size;
		uk = (uk > n3) ? n3 : uk;
		for (i = li; i < ui; i++)
		{
		    for (k = lk; k < uk; k++)
		    {
			Cc = C + n3 * i + k;
			Bb = B + n3 * lj + k;
			Aa = A + n2 * i + lj;
			Accum = 0.0;
			for (j = lj; j < uj; j++)
			{
			    Accum += *Aa++ * *Bb;
			    Bb += n3;
			}
			*Cc += Accum;
		    }
		}
	    }
	}
    }
}


void
MatMul_17(double *A, double *B, double *C, int n1, int n2, int n3)
{
    int     i, j, k, iC;
    int     li, ui, lj, uj, lk, uk;
    double *Cc, *Bb, *Aa;

    for (i = 0; i < n1; i++)
    {
	iC = n3 * i;
	for (k = 0; k < n3; k++)
	{
	    C[iC] = 0;
	    iC++;
	}
    }
    for (li = 0; li < n1; li = ui)
    {
	ui = li + subm_size;
	ui = (ui > n1) ? n1 : ui;
	for (lj = 0; lj < n2; lj = uj)
	{
	    uj = lj + subm_size;
	    uj = (uj > n2) ? n2 : uj;
	    for (lk = 0; lk < n3; lk = uk)
	    {
		uk = lk + subm_size;
		uk = (uk > n3) ? n3 : uk;
		for (j = lj; j < uj; j++)
		{
		    Bb = B + n3 * j + lk;
		    for (k = lk; k < uk; k++)
		    {
			Cc = C + n3 * li + k;
			Aa = A + n2 * li + j;
			for (i = li; i < ui; i++)
			{
			    *Cc += *Aa * *Bb;
			    Aa += n2;
			    Cc += n3;
			}
			Bb++;
		    }
		}
	    }
	}
    }
}

void
MatMul_18(double *A, double *B, double *C, int n1, int n2, int n3)
{
    int     i, j, k, iC;
    int     li, ui, lj, uj, lk, uk;
    double *Aa, *Bb;
    register double Cr;

    for (i = 0; i < n1; i++)
    {
	iC = n3 * i;
	for (k = 0; k < n3; k++)
	{
	    C[iC] = 0;
	    iC++;
	}
    }
    for (li = 0; li < n1; li = ui)
    {
	ui = li + subm_size;
	ui = (ui > n1) ? n1 : ui;
	for (lj = 0; lj < n2; lj = uj)
	{
	    uj = lj + subm_size;
	    uj = (uj > n2) ? n2 : uj;
	    for (lk = 0; lk < n3; lk = uk)
	    {
		uk = lk + subm_size;
		uk = (uk > n3) ? n3 : uk;
		for (i = li; i < ui; i++)
		{
		    iC = n3 * i + lk;
		    for (k = lk; k < uk; k++)
		    {
			Aa = A + n2 * i + lj;
			Bb = B + k + n3 * lj;
			Cr = C[iC];
			for (j = lj; j < uj; j++)
			{
			    Cr += *Aa * *Bb;
			    Bb += n3;
			    Aa++;
			}
			C[iC] = Cr;
			iC++;
		    }
		}
	    }
	}
    }
}


/* Now use a smaller and even sub-block size */

#undef SUBM_SIZE
#define SUBM_SIZE (40)

void
MatMul_19(double *A, double *B, double *C, int n1, int n2, int n3)
{
    int     i, j, k, iA, iC;
    int     li, ui, lj, uj, lk, uk;
    double *Cc, *Bb;
    register double Aa;

    for (i = 0; i < n1; i++)
    {
	iC = n3 * i;
	for (k = 0; k < n3; k++)
	{
	    C[iC] = 0;
	    iC++;
	}
    }
    for (li = 0; li < n1; li = ui)
    {
	ui = li + SUBM_SIZE;
	ui = (ui > n1) ? n1 : ui;
	for (lj = 0; lj < n2; lj = uj)
	{
	    uj = lj + SUBM_SIZE;
	    uj = (uj > n2) ? n2 : uj;
	    for (lk = 0; lk < n3; lk = uk)
	    {
		uk = lk + SUBM_SIZE;
		uk = (uk > n3) ? n3 : uk;
		for (i = li; i < ui; i++)
		{
		    iA = n2 * i + lj;
		    for (j = lj; j < uj; j++)
		    {
			Aa = A[iA];
			Cc = C + n3 * i + lk;
			Bb = B + n3 * j + lk;
			if ((uk - lk) & 3)
			{
			    for (k = lk; k < uk; k++)
			    {
				*Cc++ += Aa * *Bb++;
			    }
			} else
			{
			    for (k = lk; k < uk; k += 4)
			    {
				*Cc++ += Aa * *Bb++;
				*Cc++ += Aa * *Bb++;
				*Cc++ += Aa * *Bb++;
				*Cc++ += Aa * *Bb++;
			    }
			}
			iA++;
		    }
		}
	    }
	}
    }
}

void
MatMul_20(double *A, double *B, double *C, int n1, int n2, int n3)
{
    int     i, j, k, iC;
    int     li, ui, lj, uj, lk, uk;
    double *Cc, *Bb, *Aa;
    register double Accum;

    for (i = 0; i < n1; i++)
    {
	iC = n3 * i;
	for (k = 0; k < n3; k++)
	{
	    C[iC] = 0;
	    iC++;
	}
    }
    for (li = 0; li < n1; li = ui)
    {
	ui = li + SUBM_SIZE;
	ui = (ui > n1) ? n1 : ui;
	for (lj = 0; lj < n2; lj = uj)
	{
	    uj = lj + SUBM_SIZE;
	    uj = (uj > n2) ? n2 : uj;
	    for (lk = 0; lk < n3; lk = uk)
	    {
		uk = lk + SUBM_SIZE;
		uk = (uk > n3) ? n3 : uk;
		for (i = li; i < ui; i++)
		{
		    for (k = lk; k < uk; k++)
		    {
			Cc = C + n3 * i + k;
			Bb = B + n3 * lj + k;
			Aa = A + n2 * i + lj;
			Accum = 0.0;
			if ((uj - lj) & 7)
			{
			    for (j = lj; j < uj; j++)
			    {
			        Accum += *Aa++ * *Bb;
			        Bb += n3;
			    }
			} else
			{
			    for (j = lj; j < uj; j += 8)
			    {
			        Accum += *Aa++ * *Bb;
			        Bb += n3;
			        Accum += *Aa++ * *Bb;
			        Bb += n3;
			        Accum += *Aa++ * *Bb;
			        Bb += n3;
			        Accum += *Aa++ * *Bb;
			        Bb += n3;
			        Accum += *Aa++ * *Bb;
			        Bb += n3;
			        Accum += *Aa++ * *Bb;
			        Bb += n3;
			        Accum += *Aa++ * *Bb;
			        Bb += n3;
			        Accum += *Aa++ * *Bb;
			        Bb += n3;
			    }
			}
			*Cc += Accum;
		    }
		}
	    }
	}
    }
}

/* Here we will multiply by B transpose !! */
void
MatTMul_20(double *A, double *B, double *C, int n1, int n2, int n3)
{
    int     i, j, k, iC;
    int     li, ui, lj, uj, lk, uk;
    double *Cc, *Bb, *Aa;
    register double Accum;

    for (i = 0; i < n1; i++)
    {
	iC = n3 * i;
	for (k = 0; k < n3; k++)
	{
	    C[iC] = 0;
	    iC++;
	}
    }
    for (li = 0; li < n1; li = ui)
    {
	ui = li + SUBM_SIZE;
	ui = (ui > n1) ? n1 : ui;
	for (lj = 0; lj < n2; lj = uj)
	{
	    uj = lj + SUBM_SIZE;
	    uj = (uj > n2) ? n2 : uj;
	    for (lk = 0; lk < n3; lk = uk)
	    {
		uk = lk + SUBM_SIZE;
		uk = (uk > n3) ? n3 : uk;
		for (i = li; i < ui; i++)
		{
		    for (k = lk; k < uk; k++)
		    {
			Cc = C + n3 * i + k;
			Bb = B + n2 * k + lj;
			Aa = A + n2 * i + lj;
			Accum = 0.0;
			if ((uj - lj) & 7)
			{
			    for (j = lj; j < uj; j++)
			    {
			        Accum += *Aa++ * *Bb++;
			    }
			} else
			{
			    for (j = lj; j < uj; j += 8)
			    {
			        Accum += *Aa++ * *Bb++;
			        Accum += *Aa++ * *Bb++;
			        Accum += *Aa++ * *Bb++;
			        Accum += *Aa++ * *Bb++;
			        Accum += *Aa++ * *Bb++;
			        Accum += *Aa++ * *Bb++;
			        Accum += *Aa++ * *Bb++;
			        Accum += *Aa++ * *Bb++;
			    }
			}
			*Cc += Accum;
		    }
		}
	    }
	}
    }
}

void
MatMul_21(double *A, double *B, double *C, int n1, int n2, int n3)
{
    int     i, k, iB;
    // int     li, ui, lj, uj, lk, uk;
    double  *BT, *bt;

    bt = BT = (double *) malloc(n2 * n3 * sizeof(double));

    for (i = 0; i < n3; i++)
    {
	iB = i;
	for (k = 0; k < n2; k++)
	{
	    *bt++ = B[iB];
	    iB += n3;
	}
    }
    MatTMul_20(A, BT, C, n1, n2, n3);
    free(BT);
}

/* slightly modified version of MatTMul_20, uses inequality checking
 * instead of magnitude comparison. */
void
MatTMul_22(double *A, double *B, double *C, int n1, int n2, int n3)
{
    int     i, j, k, iC;
    int     li, ui, lj, uj, lk, uk;
    double *Cc, *Bb, *Aa;
    register double Accum;

    for (i = 0; i != n1; i++)
    {
	iC = n3 * i;
	for (k = 0; k != n3; k++)
	{
	    C[iC] = 0;
	    iC++;
	}
    }
    for (li = 0; li != n1; li = ui)
    {
	ui = li + SUBM_SIZE;
	ui = (ui > n1) ? n1 : ui;
	for (lj = 0; lj != n2; lj = uj)
	{
	    uj = lj + SUBM_SIZE;
	    uj = (uj > n2) ? n2 : uj;
	    for (lk = 0; lk != n3; lk = uk)
	    {
		uk = lk + SUBM_SIZE;
		uk = (uk > n3) ? n3 : uk;
		for (i = li; i != ui; i++)
		{
		    for (k = lk; k != uk; k++)
		    {
			Cc = C + n3 * i + k;
			Bb = B + n2 * k + lj;
			Aa = A + n2 * i + lj;
			Accum = 0.0;
			if ((uj - lj) & 7)
			{
			    for (j = lj; j < uj; j++)
			    {
			        Accum += *Aa++ * *Bb++;
			    }
			} else
			{
			    for (j = lj; j != uj; j += 8)
			    {
			        Accum += *Aa++ * *Bb++;
			        Accum += *Aa++ * *Bb++;
			        Accum += *Aa++ * *Bb++;
			        Accum += *Aa++ * *Bb++;
			        Accum += *Aa++ * *Bb++;
			        Accum += *Aa++ * *Bb++;
			        Accum += *Aa++ * *Bb++;
			        Accum += *Aa++ * *Bb++;
			    }
			}
			*Cc += Accum;
		    }
		}
	    }
	}
    }
}

void
MatMul_22(double *A, double *B, double *C, int n1, int n2, int n3)
{
    int     i, k, iB;
    double  *BT, *bt;

    bt = BT = (double *) malloc(n2 * n3 * sizeof(double));

    for (i = 0; i != n3; i++)
    {
	iB = i;
	for (k = 0; k != n2; k++)
	{
	    *bt++ = B[iB];
	    iB += n3;
	}
    }
    MatTMul_22(A, BT, C, n1, n2, n3);
    free(BT);
}

#ifdef STAND_ALONE_TEST


MatMulFunStruct FunArray[] = {{&MatMul_1, "MatMul_1"},
			      {&MatMul_2, "MatMul_2"},
			      {&MatMul_3, "MatMul_3"},
			      {&MatMul_4, "MatMul_4"},
			      {&MatMul_5, "MatMul_5"},
			      {&MatMul_6, "MatMul_6"},
			      {&MatMul_7, "MatMul_7"},
			      {&MatMul_8, "MatMul_8"},
			      {&MatMul_9, "MatMul_9"},
			      {&MatMul_10, "MatMul_10"},
			      {&MatMul_11, "MatMul_11"},
			      {&MatMul_12, "MatMul_12"},
			      {&MatMul_13, "MatMul_13"},
			      {&MatMul_14, "MatMul_14"},
			      {&MatMul_15, "MatMul_15"},
			      {&MatMul_16, "MatMul_16"},
			      {&MatMul_17, "MatMul_17"},
			      {&MatMul_18, "MatMul_18"},
			      {&MatMul_19, "MatMul_19"},
			      {&MatMul_20, "MatMul_20"},
			      {&MatMul_21, "MatMul_21"},
			      {NULL, ""}};

int
main()
{
    double  A[50], B[50], C[50];

    int     i, j;
    unsigned long Q = 37;

    for (i = 0; i < 50; i++)
    {
	A[i] = Q;
	Q = 73 * i + 5;
	Q %= 41;
	B[i] = Q;
	Q = 73 * i + 5;
	Q %= 41;
    }
    for (j = 0; FunArray[j].MatMul; j++)
    {
	printf("-------------%s\n", FunArray[j].funName);
	(*FunArray[j].MatMul)(A, B, C, 7, 5, 6);
	for (i = 0; i < 42; i++)
	{
	    printf("%g\t", C[i]);
	}
	printf("\n");
    }
    set_subm_size(4);
    for (j = 0; FunArray[j].MatMul; j++)
    {
	printf("-------------%s\n", FunArray[j].funName);
	(*FunArray[j].MatMul)(A, B, C, 7, 5, 6);
	for (i = 0; i < 42; i++)
	{
	    printf("%g\t", C[i]);
	}
	printf("\n");
    }
    printf("\n");
    for (i = 0; i < 50; i++)
    {
	A[i] = 1;
	Q = 73 * i + 5;
	Q %= 41;
	B[i] = 1;
	Q = 73 * i + 5;
	Q %= 41;
    }
    MatMul_1(A, B, C, 7, 5, 6);
    for (i = 0; i < 42; i++)
    {
	printf("%g\t", C[i]);
    }
    printf("\n");
    MatMul_2(A, B, C, 7, 5, 6);
    for (i = 0; i < 42; i++)
    {
	printf("%g\t", C[i]);
    }
    printf("\n");
    MatMul_3(A, B, C, 7, 5, 6);
    for (i = 0; i < 42; i++)
    {
	printf("%g\t", C[i]);
    }
    printf("\n");
    MatMul_4(A, B, C, 7, 5, 6);
    for (i = 0; i < 42; i++)
    {
	printf("%g\t", C[i]);
    }
    printf("\n");
    MatMul_5(A, B, C, 7, 5, 6);
    for (i = 0; i < 42; i++)
    {
	printf("%g\t", C[i]);
    }
    printf("\n");
    MatMul_6(A, B, C, 7, 5, 6);
    for (i = 0; i < 42; i++)
    {
	printf("%g\t", C[i]);
    }
    printf("\n");
    MatMul_7(A, B, C, 7, 5, 6);
    for (i = 0; i < 42; i++)
    {
	printf("%g\t", C[i]);
    }
    printf("\n");
    return 0;
}

#endif
