6 files changed, 1084 insertions, 0 deletions
diff --git a/sci-libs/superlu_mt/ChangeLog b/sci-libs/superlu_mt/ChangeLog
new file mode 100644
index 000000000..f6d1810dd
--- /dev/null
+++ b/sci-libs/superlu_mt/ChangeLog
@@ -0,0 +1,11 @@
+# ChangeLog for sci-libs/superlu_mt
+# Copyright 1999-2013 Gentoo Foundation; Distributed under the GPL v2
+# $Header: $
+
+*superlu_mt-2.1 (15 Jul 2013)
+
+  15 Jul 2013; Sébastien Fabbro <bicatali@gentoo.org>
+  +files/superlu_mt-2.1-duplicate-symbols.patch,
+  +files/superlu_mt-2.1-missing-includes.patch, +metadata.xml,
+  +superlu_mt-2.1.ebuild:
+  sci-libs/superlu_mt: Initial import
diff --git a/sci-libs/superlu_mt/Manifest b/sci-libs/superlu_mt/Manifest
new file mode 100644
index 000000000..929cbbd3a
--- /dev/null
+++ b/sci-libs/superlu_mt/Manifest
@@ -0,0 +1 @@
+DIST superlu_mt_2.1.tar.gz 2718660 SHA256 77fd2a67a789704b566681dc614fa8f759b2925d3ff49cda9e11376b6dc38ed9 SHA512 1abd94c086404a12b82dcf39238a2aef584ba9d11ca24942faad1dbd8a283f257acbc594325ba3a64ec7323b2d738b2dcb8e2551953d01d017ca91f3a2d05890 WHIRLPOOL e7482c9c29e50af0a23bd943a9b80f2cb8bdcb7169f435994e7680b28e9fd9ec0876713ff1ac5b58a325cd000f7ca63fc1caad4eb52599e63922288dcf9e5505
diff --git a/sci-libs/superlu_mt/files/superlu_mt-2.1-duplicate-symbols.patch b/sci-libs/superlu_mt/files/superlu_mt-2.1-duplicate-symbols.patch
new file mode 100644
index 000000000..6b2c5d57b
--- /dev/null
+++ b/sci-libs/superlu_mt/files/superlu_mt-2.1-duplicate-symbols.patch
@@ -0,0 +1,900 @@
+diff -Nur SRC.orig/cmatgen.c SRC/cmatgen.c
+--- SRC.orig/cmatgen.c	2013-07-15 11:47:52.512735420 -0700
++++ SRC/cmatgen.c	2013-07-15 11:49:05.149137948 -0700
+@@ -93,76 +93,4 @@
+     xa[n] = lasta;
+ }
+ 
+-double dlaran_(int *iseed)
+-{
+-/*  -- LAPACK auxiliary routine (version 2.0) --   
+-       Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
+-       Courant Institute, Argonne National Lab, and Rice University   
+-       February 29, 1992   
+-
+-    Purpose   
+-    =======   
+-
+-    DLARAN returns a random real number from a uniform (0,1)   
+-    distribution.   
+-
+-    Arguments   
+-    =========   
+-
+-    ISEED   (input/output) INT array, dimension (4)   
+-            On entry, the seed of the random number generator; the array 
+-  
+-            elements must be between 0 and 4095, and ISEED(4) must be   
+-            odd.   
+-            On exit, the seed is updated.   
+-
+-    Further Details   
+-    ===============   
+-
+-    This routine uses a multiplicative congruential method with modulus   
+-    2**48 and multiplier 33952834046453 (see G.S.Fishman,   
+-    'Multiplicative congruential random number generators with modulus   
+-    2**b: an exhaustive analysis for b = 32 and a partial analysis for   
+-    b = 48', Math. Comp. 189, pp 331-344, 1990).   
+-
+-    48-bit integers are stored in 4 integer array elements with 12 bits   
+-    per element. Hence the routine is portable across machines with   
+-    integers of 32 bits or more.   
+-
+-    ===================================================================== 
+-*/
+-    
+-    /* Local variables */
+-    int it1, it2, it3, it4;
+-
+-    --iseed;
+-
+-    /* multiply the seed by the multiplier modulo 2**48 */
+-    it4 = iseed[4] * 2549;
+-    it3 = it4 / 4096;
+-    it4 -= it3 << 12;
+-    it3 = it3 + iseed[3] * 2549 + iseed[4] * 2508;
+-    it2 = it3 / 4096;
+-    it3 -= it2 << 12;
+-    it2 = it2 + iseed[2] * 2549 + iseed[3] * 2508 + iseed[4] * 322;
+-    it1 = it2 / 4096;
+-    it2 -= it1 << 12;
+-    it1 = it1 + iseed[1] * 2549 + iseed[2] * 2508 + iseed[3] * 322 + iseed[4] 
+-	    * 494;
+-    it1 %= 4096;
+-
+-   /* return updated seed */
+-
+-    iseed[1] = it1;
+-    iseed[2] = it2;
+-    iseed[3] = it3;
+-    iseed[4] = it4;
+-
+-   /* convert 48-bit integer to a real number in the interval (0,1) */
+-
+-    return ((double) it1 +
+-	    ((double) it2 + ((double) it3 + (double) it4 * 2.44140625e-4) *
+-	     2.44140625e-4) * 2.44140625e-4) * 2.44140625e-4;
+-
+-} /* dlaran_ */
+ 
+diff -Nur SRC.orig/cmyblas2.c SRC/cmyblas2.c
+--- SRC.orig/cmyblas2.c	2013-07-15 11:47:52.509735400 -0700
++++ SRC/cmyblas2.c	2013-07-15 11:49:05.149137948 -0700
+@@ -183,3 +183,127 @@
+ 	
+ }
+ 
++/*
++ * Performs dense matrix-vector multiply with 2 vectors:
++ *        y0 = y0 + A * x0
++ *        y1 = y1 + A * x1
++ */
++void cmatvec2 (
++               int lda,     /* leading dimension of A */
++               int m,       /* number of rows of A; length of y0 and y1 */
++               int n,       /* number of columns of A; length of x0 and x1 */
++               complex *A,   /* in - size m-by-n, columns lda elements apart */
++               complex *x0,  /* in - size n-by-1 */
++               complex *x1,  /* in - size n-by-1 */
++               complex *y0,  /* in/out - size m-by-1, A * x0 is added to it */
++               complex *y1   /* in/out - size m-by-1, A * x1 is added to it */
++               )
++
++{
++    complex v00, v10, v20, v30, v40, v50, v60, v70,
++                    v01, v11, v21, v31, v41, v51, v61, v71;
++    complex t0, t1, t2, t3, t4, t5, t6, t7;
++    complex f0, f1;
++    complex *Mki0, *Mki1, *Mki2, *Mki3, *Mki4, *Mki5, *Mki6, *Mki7;
++    register int firstcol = 0;
++    complex *M0, temp;
++    int k;
++
++    M0 = &A[0];
++
++    while ( firstcol < n - 7 ) {        /* Do 8 columns */
++
++        Mki0 = M0;
++        Mki1 = Mki0 + lda;
++        Mki2 = Mki1 + lda;
++        Mki3 = Mki2 + lda;
++        Mki4 = Mki3 + lda;
++        Mki5 = Mki4 + lda;
++        Mki6 = Mki5 + lda;
++        Mki7 = Mki6 + lda;
++
++        v00 = x0[firstcol];   v01 = x1[firstcol++];
++        v10 = x0[firstcol];   v11 = x1[firstcol++];
++        v20 = x0[firstcol];   v21 = x1[firstcol++];
++        v30 = x0[firstcol];   v31 = x1[firstcol++];
++        v40 = x0[firstcol];   v41 = x1[firstcol++];
++        v50 = x0[firstcol];   v51 = x1[firstcol++];
++        v60 = x0[firstcol];   v61 = x1[firstcol++];
++        v70 = x0[firstcol];   v71 = x1[firstcol++];
++
++        for (k = 0; k < m; k++) {       /* each A entry is read once, used for both y0 and y1 */
++            f0 = y0[k];
++            f1 = y1[k];
++            t0 = Mki0[k];  cc_mult(&temp, &v00, &t0);c_add(&f0,&f0,&temp);
++            cc_mult(&temp,&v01,&t0);c_add(&f1,&f1,&temp);
++            t1 = Mki1[k];  cc_mult(&temp,&v10,&t1);c_add(&f0,&f0,&temp);
++            cc_mult(&temp,&v11,&t1);c_add(&f1,&f1,&temp);
++            t2 = Mki2[k];  cc_mult(&temp,&v20,&t2);c_add(&f0,&f0,&temp);
++            cc_mult(&temp,&v21,&t2);c_add(&f1,&f1,&temp);
++            t3 = Mki3[k];  cc_mult(&temp,&v30,&t3);c_add(&f0,&f0,&temp);
++            cc_mult(&temp,&v31,&t3);c_add(&f1,&f1,&temp);
++            t4 = Mki4[k];  cc_mult(&temp,&v40,&t4);c_add(&f0,&f0,&temp);
++            cc_mult(&temp,&v41,&t4);c_add(&f1,&f1,&temp);
++            t5 = Mki5[k];  cc_mult(&temp,&v50,&t5);c_add(&f0,&f0,&temp);
++            cc_mult(&temp,&v51,&t5);c_add(&f1,&f1,&temp);
++            t6 = Mki6[k];  cc_mult(&temp,&v60,&t6);c_add(&f0,&f0,&temp);
++            cc_mult(&temp,&v61,&t6);c_add(&f1,&f1,&temp);
++            t7 = Mki7[k];  cc_mult(&temp,&v70,&t7);c_add(&f0,&f0,&temp);
++            cc_mult(&temp,&v71,&t7);c_add(&f1,&f1,&temp);
++            y0[k] = f0;
++            y1[k] = f1;
++        }
++
++        M0 += 8 * lda;                  /* step past the 8 columns just consumed */
++    }
++
++    while ( firstcol < n - 3 ) {        /* Do 4 columns */
++        Mki0 = M0;
++        Mki1 = Mki0 + lda;
++        Mki2 = Mki1 + lda;
++        Mki3 = Mki2 + lda;
++
++        v00 = x0[firstcol];   v01 = x1[firstcol++];
++        v10 = x0[firstcol];   v11 = x1[firstcol++];
++        v20 = x0[firstcol];   v21 = x1[firstcol++];
++        v30 = x0[firstcol];   v31 = x1[firstcol++];
++
++        for (k = 0; k < m; k++) {
++            f0 = y0[k];
++            f1 = y1[k];
++            t0 = Mki0[k];  cc_mult(&temp,&v00,&t0);c_add(&f0,&f0,&temp);
++            cc_mult(&temp,&v01,&t0);c_add(&f1,&f1,&temp);
++            t1 = Mki1[k];  cc_mult(&temp,&v10,&t1);c_add(&f0,&f0,&temp);
++            cc_mult(&temp,&v11,&t1);c_add(&f1,&f1,&temp);
++            t2 = Mki2[k];  cc_mult(&temp,&v20,&t2);c_add(&f0,&f0,&temp);
++            cc_mult(&temp,&v21,&t2);c_add(&f1,&f1,&temp);
++            t3 = Mki3[k];  cc_mult(&temp,&v30,&t3);c_add(&f0,&f0,&temp);
++            cc_mult(&temp,&v31,&t3);c_add(&f1,&f1,&temp);
++            y0[k] = f0;
++            y1[k] = f1;
++        }
++
++        M0 += 4 * lda;                  /* step past the 4 columns just consumed */
++
++    }
++
++    while ( firstcol < n ) {            /* Do 1 column */
++        Mki0 = M0;
++        v00 = x0[firstcol];   v01 = x1[firstcol++];
++
++        for (k = 0; k < m; k++) {
++            f0 = y0[k];
++            f1 = y1[k];
++            t0 = Mki0[k];
++            cc_mult(&temp,&v00,&t0);c_add(&f0,&f0,&temp);
++            cc_mult(&temp,&v01,&t0);c_add(&f1,&f1,&temp);
++            y0[k] = f0;
++            y1[k] = f1;
++        }
++
++        M0 += lda;
++    }
++
++}
++
++
+diff -Nur SRC.orig/cmyblas2.c.orig SRC/cmyblas2.c.orig
+--- SRC.orig/cmyblas2.c.orig	1969-12-31 16:00:00.000000000 -0800
++++ SRC/cmyblas2.c.orig	2013-07-15 11:49:05.149137948 -0700
+@@ -0,0 +1,185 @@
++
++/*
++ * -- SuperLU routine (version 2.0) --
++ * Lawrence Berkeley National Lab, Univ. of California Berkeley,
++ * and Xerox Palo Alto Research Center.
++ * September 10, 2007
++ *
++ */
++/*
++ * File name:		cmyblas2.c
++ * Purpose:
++ *     Level 2 BLAS operations: solves and matvec, written in C.
++ * Note:
++ *     This is only used when the system lacks an efficient BLAS library.
++ */
++#include "slu_scomplex.h"
++
++
++/*
++ * Solves a dense UNIT lower triangular system. The unit lower 
++ * triangular matrix is stored in a 2D array M(1:nrow,1:ncol). 
++ * The solution will be returned in the rhs vector.
++ */
++void clsolve ( int ldm, int ncol, complex *M, complex *rhs )
++{
++    int k;
++    complex x0, x1, x2, x3, temp;
++    complex *M0;
++    complex *Mki0, *Mki1, *Mki2, *Mki3;
++    register int firstcol = 0;
++
++    M0 = &M[0];
++
++
++    while ( firstcol < ncol - 3 ) { /* Do 4 columns */
++      	Mki0 = M0 + 1;
++      	Mki1 = Mki0 + ldm + 1;
++      	Mki2 = Mki1 + ldm + 1;
++      	Mki3 = Mki2 + ldm + 1;
++
++      	x0 = rhs[firstcol];
++      	cc_mult(&temp, &x0, Mki0); Mki0++;
++      	c_sub(&x1, &rhs[firstcol+1], &temp);
++      	cc_mult(&temp, &x0, Mki0); Mki0++;
++	c_sub(&x2, &rhs[firstcol+2], &temp);
++	cc_mult(&temp, &x1, Mki1); Mki1++;
++	c_sub(&x2, &x2, &temp);
++      	cc_mult(&temp, &x0, Mki0); Mki0++;
++	c_sub(&x3, &rhs[firstcol+3], &temp);
++	cc_mult(&temp, &x1, Mki1); Mki1++;
++	c_sub(&x3, &x3, &temp);
++	cc_mult(&temp, &x2, Mki2); Mki2++;
++	c_sub(&x3, &x3, &temp);
++
++ 	rhs[++firstcol] = x1;
++      	rhs[++firstcol] = x2;
++      	rhs[++firstcol] = x3;
++      	++firstcol;
++    
++      	for (k = firstcol; k < ncol; k++) {
++	    cc_mult(&temp, &x0, Mki0); Mki0++;
++	    c_sub(&rhs[k], &rhs[k], &temp);
++	    cc_mult(&temp, &x1, Mki1); Mki1++;
++	    c_sub(&rhs[k], &rhs[k], &temp);
++	    cc_mult(&temp, &x2, Mki2); Mki2++;
++	    c_sub(&rhs[k], &rhs[k], &temp);
++	    cc_mult(&temp, &x3, Mki3); Mki3++;
++	    c_sub(&rhs[k], &rhs[k], &temp);
++	}
++
++        M0 += 4 * ldm + 4;
++    }
++
++    if ( firstcol < ncol - 1 ) { /* Do 2 columns */
++        Mki0 = M0 + 1;
++        Mki1 = Mki0 + ldm + 1;
++
++        x0 = rhs[firstcol];
++	cc_mult(&temp, &x0, Mki0); Mki0++;
++	c_sub(&x1, &rhs[firstcol+1], &temp);
++
++      	rhs[++firstcol] = x1;
++      	++firstcol;
++    
++      	for (k = firstcol; k < ncol; k++) {
++	    cc_mult(&temp, &x0, Mki0); Mki0++;
++	    c_sub(&rhs[k], &rhs[k], &temp);
++	    cc_mult(&temp, &x1, Mki1); Mki1++;
++	    c_sub(&rhs[k], &rhs[k], &temp);
++	} 
++    }
++    
++}
++
++/*
++ * Solves a dense upper triangular system. The upper triangular matrix is
++ * stored in a 2-dim array M(1:ldm,1:ncol). The solution will be returned
++ * in the rhs vector.
++ */
++void
++cusolve (
++int ldm,	/* in */
++int ncol,	/* in */
++complex *M,	/* in */
++complex *rhs	/* modified */
++)
++{
++    complex xj, temp;
++    int jcol, j, irow;
++
++    jcol = ncol - 1;
++
++    for (j = 0; j < ncol; j++) {
++
++	c_div(&xj, &rhs[jcol], &M[jcol + jcol*ldm]); /* M(jcol, jcol) */
++	rhs[jcol] = xj;
++	
++	for (irow = 0; irow < jcol; irow++) {
++	    cc_mult(&temp, &xj, &M[irow+jcol*ldm]); /* M(irow, jcol) */
++	    c_sub(&rhs[irow], &rhs[irow], &temp);
++	}
++
++	jcol--;
++
++    }
++}
++
++
++/*
++ * Performs a dense matrix-vector multiply: Mxvec = Mxvec + M * vec.
++ * The input matrix is M(1:nrow,1:ncol); The product is returned in Mxvec[].
++ */
++void cmatvec (
++int ldm,	/* in -- leading dimension of M */
++int nrow,	/* in */ 
++int ncol,	/* in */
++complex *M,	/* in */
++complex *vec,	/* in */
++complex *Mxvec	/* in/out */
++)
++{
++    complex vi0, vi1, vi2, vi3;
++    complex *M0, temp;
++    complex *Mki0, *Mki1, *Mki2, *Mki3;
++    register int firstcol = 0;
++    int k;
++
++    M0 = &M[0];
++
++    while ( firstcol < ncol - 3 ) {	/* Do 4 columns */
++	Mki0 = M0;
++	Mki1 = Mki0 + ldm;
++	Mki2 = Mki1 + ldm;
++	Mki3 = Mki2 + ldm;
++
++	vi0 = vec[firstcol++];
++	vi1 = vec[firstcol++];
++	vi2 = vec[firstcol++];
++	vi3 = vec[firstcol++];	
++	for (k = 0; k < nrow; k++) {
++	    cc_mult(&temp, &vi0, Mki0); Mki0++;
++	    c_add(&Mxvec[k], &Mxvec[k], &temp);
++	    cc_mult(&temp, &vi1, Mki1); Mki1++;
++	    c_add(&Mxvec[k], &Mxvec[k], &temp);
++	    cc_mult(&temp, &vi2, Mki2); Mki2++;
++	    c_add(&Mxvec[k], &Mxvec[k], &temp);
++	    cc_mult(&temp, &vi3, Mki3); Mki3++;
++	    c_add(&Mxvec[k], &Mxvec[k], &temp);
++	}
++
++	M0 += 4 * ldm;
++    }
++
++    while ( firstcol < ncol ) {		/* Do 1 column */
++ 	Mki0 = M0;
++	vi0 = vec[firstcol++];
++	for (k = 0; k < nrow; k++) {
++	    cc_mult(&temp, &vi0, Mki0); Mki0++;
++	    c_add(&Mxvec[k], &Mxvec[k], &temp);
++	}
++	M0 += ldm;
++    }
++	
++}
++
+diff -Nur SRC.orig/Makefile SRC/Makefile
+--- SRC.orig/Makefile	2013-07-15 11:47:52.511735412 -0700
++++ SRC/Makefile	2013-07-15 11:53:15.393528085 -0700
+@@ -31,7 +31,7 @@
+ #
+ #######################################################################
+ 
+-ALLAUX = superlu_timer.o dclock.o sp_ienv.o lsame.o xerbla.o \
++ALLAUX = superlu_timer.o sp_ienv.o lsame.o xerbla.o \
+ 	util.o pmemory.o qrnzcnt.o await.o \
+ 	get_perm_c.o mmd.o colamd.o sp_coletree.o \
+ 	pxgstrf_scheduler.o sp_colorder.o \
+diff -Nur SRC.orig/smatgen.c SRC/smatgen.c
+--- SRC.orig/smatgen.c	2013-07-15 11:47:52.512735420 -0700
++++ SRC/smatgen.c	2013-07-15 11:49:05.149137948 -0700
+@@ -93,76 +93,3 @@
+     xa[n] = lasta;
+ }
+ 
+-double dlaran_(int *iseed)
+-{
+-/*  -- LAPACK auxiliary routine (version 2.0) --   
+-       Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
+-       Courant Institute, Argonne National Lab, and Rice University   
+-       February 29, 1992   
+-
+-    Purpose   
+-    =======   
+-
+-    DLARAN returns a random real number from a uniform (0,1)   
+-    distribution.   
+-
+-    Arguments   
+-    =========   
+-
+-    ISEED   (input/output) INT array, dimension (4)   
+-            On entry, the seed of the random number generator; the array 
+-  
+-            elements must be between 0 and 4095, and ISEED(4) must be   
+-            odd.   
+-            On exit, the seed is updated.   
+-
+-    Further Details   
+-    ===============   
+-
+-    This routine uses a multiplicative congruential method with modulus   
+-    2**48 and multiplier 33952834046453 (see G.S.Fishman,   
+-    'Multiplicative congruential random number generators with modulus   
+-    2**b: an exhaustive analysis for b = 32 and a partial analysis for   
+-    b = 48', Math. Comp. 189, pp 331-344, 1990).   
+-
+-    48-bit integers are stored in 4 integer array elements with 12 bits   
+-    per element. Hence the routine is portable across machines with   
+-    integers of 32 bits or more.   
+-
+-    ===================================================================== 
+-*/
+-    
+-    /* Local variables */
+-    int it1, it2, it3, it4;
+-
+-    --iseed;
+-
+-    /* multiply the seed by the multiplier modulo 2**48 */
+-    it4 = iseed[4] * 2549;
+-    it3 = it4 / 4096;
+-    it4 -= it3 << 12;
+-    it3 = it3 + iseed[3] * 2549 + iseed[4] * 2508;
+-    it2 = it3 / 4096;
+-    it3 -= it2 << 12;
+-    it2 = it2 + iseed[2] * 2549 + iseed[3] * 2508 + iseed[4] * 322;
+-    it1 = it2 / 4096;
+-    it2 -= it1 << 12;
+-    it1 = it1 + iseed[1] * 2549 + iseed[2] * 2508 + iseed[3] * 322 + iseed[4] 
+-	    * 494;
+-    it1 %= 4096;
+-
+-   /* return updated seed */
+-
+-    iseed[1] = it1;
+-    iseed[2] = it2;
+-    iseed[3] = it3;
+-    iseed[4] = it4;
+-
+-   /* convert 48-bit integer to a real number in the interval (0,1) */
+-
+-    return ((double) it1 +
+-	    ((double) it2 + ((double) it3 + (double) it4 * 2.44140625e-4) *
+-	     2.44140625e-4) * 2.44140625e-4) * 2.44140625e-4;
+-
+-} /* dlaran_ */
+-
+diff -Nur SRC.orig/xerbla.c SRC/xerbla.c
+--- SRC.orig/xerbla.c	2013-07-15 11:47:52.513735427 -0700
++++ SRC/xerbla.c	2013-07-15 11:49:05.150137959 -0700
+@@ -1,3 +1,4 @@
++#include <stdio.h>
+ /* Subroutine */ int xerbla_(char *srname, int *info)
+ {
+ /*  -- LAPACK auxiliary routine (version 2.0) --   
+diff -Nur SRC.orig/zmatgen.c SRC/zmatgen.c
+--- SRC.orig/zmatgen.c	2013-07-15 11:47:52.513735427 -0700
++++ SRC/zmatgen.c	2013-07-15 11:49:05.150137959 -0700
+@@ -93,76 +93,3 @@
+     xa[n] = lasta;
+ }
+ 
+-double dlaran_(int *iseed)
+-{
+-/*  -- LAPACK auxiliary routine (version 2.0) --   
+-       Univ. of Tennessee, Univ. of California Berkeley, NAG Ltd.,   
+-       Courant Institute, Argonne National Lab, and Rice University   
+-       February 29, 1992   
+-
+-    Purpose   
+-    =======   
+-
+-    DLARAN returns a random real number from a uniform (0,1)   
+-    distribution.   
+-
+-    Arguments   
+-    =========   
+-
+-    ISEED   (input/output) INT array, dimension (4)   
+-            On entry, the seed of the random number generator; the array 
+-  
+-            elements must be between 0 and 4095, and ISEED(4) must be   
+-            odd.   
+-            On exit, the seed is updated.   
+-
+-    Further Details   
+-    ===============   
+-
+-    This routine uses a multiplicative congruential method with modulus   
+-    2**48 and multiplier 33952834046453 (see G.S.Fishman,   
+-    'Multiplicative congruential random number generators with modulus   
+-    2**b: an exhaustive analysis for b = 32 and a partial analysis for   
+-    b = 48', Math. Comp. 189, pp 331-344, 1990).   
+-
+-    48-bit integers are stored in 4 integer array elements with 12 bits   
+-    per element. Hence the routine is portable across machines with   
+-    integers of 32 bits or more.   
+-
+-    ===================================================================== 
+-*/
+-    
+-    /* Local variables */
+-    int it1, it2, it3, it4;
+-
+-    --iseed;
+-
+-    /* multiply the seed by the multiplier modulo 2**48 */
+-    it4 = iseed[4] * 2549;
+-    it3 = it4 / 4096;
+-    it4 -= it3 << 12;
+-    it3 = it3 + iseed[3] * 2549 + iseed[4] * 2508;
+-    it2 = it3 / 4096;
+-    it3 -= it2 << 12;
+-    it2 = it2 + iseed[2] * 2549 + iseed[3] * 2508 + iseed[4] * 322;
+-    it1 = it2 / 4096;
+-    it2 -= it1 << 12;
+-    it1 = it1 + iseed[1] * 2549 + iseed[2] * 2508 + iseed[3] * 322 + iseed[4] 
+-	    * 494;
+-    it1 %= 4096;
+-
+-   /* return updated seed */
+-
+-    iseed[1] = it1;
+-    iseed[2] = it2;
+-    iseed[3] = it3;
+-    iseed[4] = it4;
+-
+-   /* convert 48-bit integer to a real number in the interval (0,1) */
+-
+-    return ((double) it1 +
+-	    ((double) it2 + ((double) it3 + (double) it4 * 2.44140625e-4) *
+-	     2.44140625e-4) * 2.44140625e-4) * 2.44140625e-4;
+-
+-} /* dlaran_ */
+-
+diff -Nur SRC.orig/zmyblas2.c SRC/zmyblas2.c
+--- SRC.orig/zmyblas2.c	2013-07-15 11:47:52.511735412 -0700
++++ SRC/zmyblas2.c	2013-07-15 11:49:05.150137959 -0700
+@@ -183,3 +183,127 @@
+ 	
+ }
+ 
++/*
++ * Performs dense matrix-vector multiply with 2 vectors:
++ *        y0 = y0 + A * x0
++ *        y1 = y1 + A * x1
++ */
++void zmatvec2 (
++               int lda,     /* leading dimension of A */
++               int m,       /* number of rows of A; length of y0 and y1 */
++               int n,       /* number of columns of A; length of x0 and x1 */
++               doublecomplex *A,   /* in - size m-by-n, columns lda elements apart */
++               doublecomplex *x0,  /* in - size n-by-1 */
++               doublecomplex *x1,  /* in - size n-by-1 */
++               doublecomplex *y0,  /* in/out - size m-by-1, A * x0 is added to it */
++               doublecomplex *y1   /* in/out - size m-by-1, A * x1 is added to it */
++               )
++
++{
++    doublecomplex v00, v10, v20, v30, v40, v50, v60, v70,
++                    v01, v11, v21, v31, v41, v51, v61, v71;
++    doublecomplex t0, t1, t2, t3, t4, t5, t6, t7;
++    doublecomplex f0, f1;
++    doublecomplex *Mki0, *Mki1, *Mki2, *Mki3, *Mki4, *Mki5, *Mki6, *Mki7;
++    register int firstcol = 0;
++    doublecomplex *M0, temp;
++    int k;
++
++    M0 = &A[0];
++
++    while ( firstcol < n - 7 ) {        /* Do 8 columns */
++
++        Mki0 = M0;
++        Mki1 = Mki0 + lda;
++        Mki2 = Mki1 + lda;
++        Mki3 = Mki2 + lda;
++        Mki4 = Mki3 + lda;
++        Mki5 = Mki4 + lda;
++        Mki6 = Mki5 + lda;
++        Mki7 = Mki6 + lda;
++
++        v00 = x0[firstcol];   v01 = x1[firstcol++];
++        v10 = x0[firstcol];   v11 = x1[firstcol++];
++        v20 = x0[firstcol];   v21 = x1[firstcol++];
++        v30 = x0[firstcol];   v31 = x1[firstcol++];
++        v40 = x0[firstcol];   v41 = x1[firstcol++];
++        v50 = x0[firstcol];   v51 = x1[firstcol++];
++        v60 = x0[firstcol];   v61 = x1[firstcol++];
++        v70 = x0[firstcol];   v71 = x1[firstcol++];
++
++        for (k = 0; k < m; k++) {       /* each A entry is read once, used for both y0 and y1 */
++            f0 = y0[k];
++            f1 = y1[k];
++            t0 = Mki0[k];  zz_mult(&temp,&v00,&t0);z_add(&f0,&f0,&temp);
++            zz_mult(&temp,&v01,&t0);z_add(&f1,&f1,&temp);
++            t1 = Mki1[k];  zz_mult(&temp,&v10,&t1);z_add(&f0,&f0,&temp);
++            zz_mult(&temp,&v11,&t1);z_add(&f1,&f1,&temp);
++            t2 = Mki2[k];  zz_mult(&temp,&v20,&t2);z_add(&f0,&f0,&temp);
++            zz_mult(&temp,&v21,&t2);z_add(&f1,&f1,&temp);
++            t3 = Mki3[k];  zz_mult(&temp,&v30,&t3);z_add(&f0,&f0,&temp);
++            zz_mult(&temp,&v31,&t3);z_add(&f1,&f1,&temp);
++            t4 = Mki4[k];  zz_mult(&temp,&v40,&t4);z_add(&f0,&f0,&temp);
++            zz_mult(&temp,&v41,&t4);z_add(&f1,&f1,&temp);
++            t5 = Mki5[k];  zz_mult(&temp,&v50,&t5);z_add(&f0,&f0,&temp);
++            zz_mult(&temp,&v51,&t5);z_add(&f1,&f1,&temp);
++            t6 = Mki6[k];  zz_mult(&temp,&v60,&t6);z_add(&f0,&f0,&temp);
++            zz_mult(&temp,&v61,&t6);z_add(&f1,&f1,&temp);
++            t7 = Mki7[k];  zz_mult(&temp,&v70,&t7);z_add(&f0,&f0,&temp);
++            zz_mult(&temp,&v71,&t7);z_add(&f1,&f1,&temp);
++            y0[k] = f0;
++            y1[k] = f1;
++        }
++
++        M0 += 8 * lda;                  /* step past the 8 columns just consumed */
++    }
++
++    while ( firstcol < n - 3 ) {        /* Do 4 columns */
++        Mki0 = M0;
++        Mki1 = Mki0 + lda;
++        Mki2 = Mki1 + lda;
++        Mki3 = Mki2 + lda;
++
++        v00 = x0[firstcol];   v01 = x1[firstcol++];
++        v10 = x0[firstcol];   v11 = x1[firstcol++];
++        v20 = x0[firstcol];   v21 = x1[firstcol++];
++        v30 = x0[firstcol];   v31 = x1[firstcol++];
++
++        for (k = 0; k < m; k++) {
++            f0 = y0[k];
++            f1 = y1[k];
++            t0 = Mki0[k];  zz_mult(&temp,&v00,&t0);z_add(&f0,&f0,&temp);
++            zz_mult(&temp,&v01,&t0);z_add(&f1,&f1,&temp);
++            t1 = Mki1[k];  zz_mult(&temp,&v10,&t1);z_add(&f0,&f0,&temp);
++            zz_mult(&temp,&v11,&t1);z_add(&f1,&f1,&temp);
++            t2 = Mki2[k];  zz_mult(&temp,&v20,&t2);z_add(&f0,&f0,&temp);
++            zz_mult(&temp,&v21,&t2);z_add(&f1,&f1,&temp);
++            t3 = Mki3[k];  zz_mult(&temp,&v30,&t3);z_add(&f0,&f0,&temp);
++            zz_mult(&temp,&v31,&t3);z_add(&f1,&f1,&temp);
++            y0[k] = f0;
++            y1[k] = f1;
++        }
++
++        M0 += 4 * lda;                  /* step past the 4 columns just consumed */
++
++    }
++
++    while ( firstcol < n ) {            /* Do 1 column */
++        Mki0 = M0;
++        v00 = x0[firstcol];   v01 = x1[firstcol++];
++
++        for (k = 0; k < m; k++) {
++            f0 = y0[k];
++            f1 = y1[k];
++            t0 = Mki0[k];
++            zz_mult(&temp,&v00,&t0);z_add(&f0,&f0,&temp);
++            zz_mult(&temp,&v01,&t0);z_add(&f1,&f1,&temp);
++            y0[k] = f0;
++            y1[k] = f1;
++        }
++
++        M0 += lda;
++    }
++
++}
++
++
+diff -Nur SRC.orig/zmyblas2.c.orig SRC/zmyblas2.c.orig
+--- SRC.orig/zmyblas2.c.orig	1969-12-31 16:00:00.000000000 -0800
++++ SRC/zmyblas2.c.orig	2013-07-15 11:49:05.150137959 -0700
+@@ -0,0 +1,185 @@
++
++/*
++ * -- SuperLU routine (version 2.0) --
++ * Lawrence Berkeley National Lab, Univ. of California Berkeley,
++ * and Xerox Palo Alto Research Center.
++ * September 10, 2007
++ *
++ */
++/*
++ * File name:		zmyblas2.c
++ * Purpose:
++ *     Level 2 BLAS operations: solves and matvec, written in C.
++ * Note:
++ *     This is only used when the system lacks an efficient BLAS library.
++ */
++#include "slu_dcomplex.h"
++
++
++/*
++ * Solves a dense UNIT lower triangular system. The unit lower 
++ * triangular matrix is stored in a 2D array M(1:nrow,1:ncol). 
++ * The solution will be returned in the rhs vector.
++ */
++void zlsolve ( int ldm, int ncol, doublecomplex *M, doublecomplex *rhs )
++{
++    int k;
++    doublecomplex x0, x1, x2, x3, temp;
++    doublecomplex *M0;
++    doublecomplex *Mki0, *Mki1, *Mki2, *Mki3;
++    register int firstcol = 0;
++
++    M0 = &M[0];
++
++
++    while ( firstcol < ncol - 3 ) { /* Do 4 columns */
++      	Mki0 = M0 + 1;
++      	Mki1 = Mki0 + ldm + 1;
++      	Mki2 = Mki1 + ldm + 1;
++      	Mki3 = Mki2 + ldm + 1;
++
++      	x0 = rhs[firstcol];
++      	zz_mult(&temp, &x0, Mki0); Mki0++;
++      	z_sub(&x1, &rhs[firstcol+1], &temp);
++      	zz_mult(&temp, &x0, Mki0); Mki0++;
++	z_sub(&x2, &rhs[firstcol+2], &temp);
++	zz_mult(&temp, &x1, Mki1); Mki1++;
++	z_sub(&x2, &x2, &temp);
++      	zz_mult(&temp, &x0, Mki0); Mki0++;
++	z_sub(&x3, &rhs[firstcol+3], &temp);
++	zz_mult(&temp, &x1, Mki1); Mki1++;
++	z_sub(&x3, &x3, &temp);
++	zz_mult(&temp, &x2, Mki2); Mki2++;
++	z_sub(&x3, &x3, &temp);
++
++ 	rhs[++firstcol] = x1;
++      	rhs[++firstcol] = x2;
++      	rhs[++firstcol] = x3;
++      	++firstcol;
++    
++      	for (k = firstcol; k < ncol; k++) {
++	    zz_mult(&temp, &x0, Mki0); Mki0++;
++	    z_sub(&rhs[k], &rhs[k], &temp);
++	    zz_mult(&temp, &x1, Mki1); Mki1++;
++	    z_sub(&rhs[k], &rhs[k], &temp);
++	    zz_mult(&temp, &x2, Mki2); Mki2++;
++	    z_sub(&rhs[k], &rhs[k], &temp);
++	    zz_mult(&temp, &x3, Mki3); Mki3++;
++	    z_sub(&rhs[k], &rhs[k], &temp);
++	}
++
++        M0 += 4 * ldm + 4;
++    }
++
++    if ( firstcol < ncol - 1 ) { /* Do 2 columns */
++        Mki0 = M0 + 1;
++        Mki1 = Mki0 + ldm + 1;
++
++        x0 = rhs[firstcol];
++	zz_mult(&temp, &x0, Mki0); Mki0++;
++	z_sub(&x1, &rhs[firstcol+1], &temp);
++
++      	rhs[++firstcol] = x1;
++      	++firstcol;
++    
++      	for (k = firstcol; k < ncol; k++) {
++	    zz_mult(&temp, &x0, Mki0); Mki0++;
++	    z_sub(&rhs[k], &rhs[k], &temp);
++	    zz_mult(&temp, &x1, Mki1); Mki1++;
++	    z_sub(&rhs[k], &rhs[k], &temp);
++	} 
++    }
++    
++}
++
++/*
++ * Solves a dense upper triangular system. The upper triangular matrix is
++ * stored in a 2-dim array M(1:ldm,1:ncol). The solution will be returned
++ * in the rhs vector.
++ */
++void
++zusolve (
++int ldm,	/* in */
++int ncol,	/* in */
++doublecomplex *M,	/* in */
++doublecomplex *rhs	/* modified */
++)
++{
++    doublecomplex xj, temp;
++    int jcol, j, irow;
++
++    jcol = ncol - 1;
++
++    for (j = 0; j < ncol; j++) {
++
++	z_div(&xj, &rhs[jcol], &M[jcol + jcol*ldm]); /* M(jcol, jcol) */
++	rhs[jcol] = xj;
++	
++	for (irow = 0; irow < jcol; irow++) {
++	    zz_mult(&temp, &xj, &M[irow+jcol*ldm]); /* M(irow, jcol) */
++	    z_sub(&rhs[irow], &rhs[irow], &temp);
++	}
++
++	jcol--;
++
++    }
++}
++
++
++/*
++ * Performs a dense matrix-vector multiply: Mxvec = Mxvec + M * vec.
++ * The input matrix is M(1:nrow,1:ncol); The product is returned in Mxvec[].
++ */
++void zmatvec (
++int ldm,	/* in -- leading dimension of M */
++int nrow,	/* in */ 
++int ncol,	/* in */
++doublecomplex *M,	/* in */
++doublecomplex *vec,	/* in */
++doublecomplex *Mxvec	/* in/out */
++)
++{
++    doublecomplex vi0, vi1, vi2, vi3;
++    doublecomplex *M0, temp;
++    doublecomplex *Mki0, *Mki1, *Mki2, *Mki3;
++    register int firstcol = 0;
++    int k;
++
++    M0 = &M[0];
++
++    while ( firstcol < ncol - 3 ) {	/* Do 4 columns */
++	Mki0 = M0;
++	Mki1 = Mki0 + ldm;
++	Mki2 = Mki1 + ldm;
++	Mki3 = Mki2 + ldm;
++
++	vi0 = vec[firstcol++];
++	vi1 = vec[firstcol++];
++	vi2 = vec[firstcol++];
++	vi3 = vec[firstcol++];	
++	for (k = 0; k < nrow; k++) {
++	    zz_mult(&temp, &vi0, Mki0); Mki0++;
++	    z_add(&Mxvec[k], &Mxvec[k], &temp);
++	    zz_mult(&temp, &vi1, Mki1); Mki1++;
++	    z_add(&Mxvec[k], &Mxvec[k], &temp);
++	    zz_mult(&temp, &vi2, Mki2); Mki2++;
++	    z_add(&Mxvec[k], &Mxvec[k], &temp);
++	    zz_mult(&temp, &vi3, Mki3); Mki3++;
++	    z_add(&Mxvec[k], &Mxvec[k], &temp);
++	}
++
++	M0 += 4 * ldm;
++    }
++
++    while ( firstcol < ncol ) {		/* Do 1 column */
++ 	Mki0 = M0;
++	vi0 = vec[firstcol++];
++	for (k = 0; k < nrow; k++) {
++	    zz_mult(&temp, &vi0, Mki0); Mki0++;
++	    z_add(&Mxvec[k], &Mxvec[k], &temp);
++	}
++	M0 += ldm;
++    }
++	
++}
++
diff --git a/sci-libs/superlu_mt/files/superlu_mt-2.1-missing-includes.patch b/sci-libs/superlu_mt/files/superlu_mt-2.1-missing-includes.patch
new file mode 100644
index 000000000..afeca8fa7
--- /dev/null
+++ b/sci-libs/superlu_mt/files/superlu_mt-2.1-missing-includes.patch
@@ -0,0 +1,44 @@
+diff -Nur TESTING.orig/MATGEN/clatb4.c TESTING/MATGEN/clatb4.c
+--- TESTING.orig/MATGEN/clatb4.c	2013-07-15 11:48:34.285967038 -0700
++++ TESTING/MATGEN/clatb4.c	2013-07-15 11:49:05.150137959 -0700
+@@ -3,6 +3,7 @@
+ 	-lf2c -lm   (in that order)
+ */
+ 
++#include <string.h>
+ #include "f2c.h"
+ 
+ /* Table of constant values */
+diff -Nur TESTING.orig/MATGEN/dlatb4.c TESTING/MATGEN/dlatb4.c
+--- TESTING.orig/MATGEN/dlatb4.c	2013-07-15 11:48:34.288966998 -0700
++++ TESTING/MATGEN/dlatb4.c	2013-07-15 11:49:05.150137959 -0700
+@@ -3,6 +3,7 @@
+ 	-lf2c -lm   (in that order)
+ */
+ 
++#include <string.h>
+ #include "f2c.h"
+ 
+ /* Table of constant values */
+diff -Nur TESTING.orig/MATGEN/slatb4.c TESTING/MATGEN/slatb4.c
+--- TESTING.orig/MATGEN/slatb4.c	2013-07-15 11:48:34.288966998 -0700
++++ TESTING/MATGEN/slatb4.c	2013-07-15 11:49:05.150137959 -0700
+@@ -3,6 +3,7 @@
+ 	-lf2c -lm   (in that order)
+ */
+ 
++#include <string.h>
+ #include "f2c.h"
+ 
+ /* Table of constant values */
+diff -Nur TESTING.orig/MATGEN/zlatb4.c TESTING/MATGEN/zlatb4.c
+--- TESTING.orig/MATGEN/zlatb4.c	2013-07-15 11:48:34.288966998 -0700
++++ TESTING/MATGEN/zlatb4.c	2013-07-15 11:49:05.150137959 -0700
+@@ -3,6 +3,7 @@
+ 	-lf2c -lm   (in that order)
+ */
+ 
++#include <string.h>
+ #include "f2c.h"
+ 
+ /* Table of constant values */
diff --git a/sci-libs/superlu_mt/metadata.xml b/sci-libs/superlu_mt/metadata.xml
new file mode 100644
index 000000000..91de91fe6
--- /dev/null
+++ b/sci-libs/superlu_mt/metadata.xml
@@ -0,0 +1,23 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE pkgmetadata SYSTEM "http://www.gentoo.org/dtd/metadata.dtd">
+<pkgmetadata>
+<herd>sci</herd>
+<longdescription lang="en">
+  SuperLU is a general purpose library for the direct solution of
+  large, sparse, nonsymmetric systems of linear equations on high
+  performance machines. The library is written in C and is callable
+  from either C or Fortran. The library routines will perform an LU
+  decomposition with partial pivoting and triangular system solves
+  through forward and back substitution. The LU factorization routines
+  can handle non-square matrices but the triangular solves are
+  performed only for square matrices. The matrix columns may be
+  preordered (before factorization) either through library or user
+  supplied routines. This preordering for sparsity is completely
+  separate from the factorization. Working precision iterative
+  refinement subroutines are provided for improved backward
+  stability. Routines are also provided to equilibrate the system,
+  estimate the condition number, calculate the relative backward
+  error, and estimate error bounds for the refined solutions.
+  This is the multi-threaded version (POSIX threads or OpenMP).
+</longdescription>
+</pkgmetadata>
diff --git a/sci-libs/superlu_mt/superlu_mt-2.1.ebuild b/sci-libs/superlu_mt/superlu_mt-2.1.ebuild
new file mode 100644
index 000000000..bfd2866d4
--- /dev/null
+++ b/sci-libs/superlu_mt/superlu_mt-2.1.ebuild
@@ -0,0 +1,105 @@
+# Copyright 1999-2013 Gentoo Foundation
+# Distributed under the terms of the GNU General Public License v2
+# $Header: $
+
+EAPI=5
+
+inherit eutils fortran-2 toolchain-funcs
+
+MYPN=SuperLU_MT
+
+DESCRIPTION="Sparse LU factorization library multithreading library"
+HOMEPAGE="http://crd.lbl.gov/~xiaoye/SuperLU/"
+SRC_URI="${HOMEPAGE}/${PN}_${PV}.tar.gz"
+
+LICENSE="BSD"
+SLOT="0"
+KEYWORDS="~amd64 ~ppc ~ppc64 ~x86 ~amd64-linux ~x86-linux"
+IUSE="doc openmp threads examples static-libs test"
+
+RDEPEND="
+	virtual/cblas"
+DEPEND="${RDEPEND}
+	virtual/pkgconfig
+	test? ( app-shells/tcsh )"
+
+S="${WORKDIR}/${MYPN}_${PV}"
+
+pkg_setup() {	# choose the threading backend; CTHREADS/LDTHREADS are handed to make via the environment
+	if use threads; then
+		export CTHREADS="-D__PTHREAD" LDTHREADS="-pthread"
+	elif use openmp; then
+		if [[ $(tc-getCC) == *gcc ]] && ! tc-has-openmp; then
+			ewarn "OpenMP is not available in your current selected gcc"
+			die "need openmp capable gcc"
+		fi
+		FORTRAN_NEED_OPENMP=1	# read by fortran-2_pkg_setup below
+		export CTHREADS="-D__OPENMP"
+		[[ $(tc-getCC) == *gcc ]] && export LDTHREADS="-fopenmp"	# exported: make.inc expands $(LDTHREADS)
+	else
+		ewarn "Neither threads or openmp selected. Forcing threads"
+		export CTHREADS="-D__PTHREAD" LDTHREADS="-pthread"
+	fi
+	fortran-2_pkg_setup
+}
+
+src_prepare() {
+	# Drop the duplicated symbol definitions first, then add the missing #includes.
+	epatch "${FILESDIR}/${P}-duplicate-symbols.patch"
+	epatch "${FILESDIR}/${P}-missing-includes.patch"
+}
+
+src_configure() {	# rewrite make.inc for the Gentoo toolchain, then point EXAMPLE/Makefile at the installed library
+	sed -i \
+		-e 's/^\(PLAT\s*=\).*/\1/' \
+		-e "s:^\(CC\s*=\).*:\1 $(tc-getCC):" \
+		-e "/CFLAGS/s:-O3:${CFLAGS} \$(PIC):" \
+		-e "s:^\(PREDEFS\s*=\).*:\1 ${CPPFLAGS} -DUSE_VENDOR_BLAS \$(CTHREADS):" \
+		-e "s:^\(NOOPTS\s*=.*\):\1 \$(PIC):" \
+		-e "s:^\(FORTRAN\s*=\).*:\1 $(tc-getFC):" \
+		-e "s:^\(FFLAGS\s*=\).*:\1 ${FFLAGS} \$(PIC):" \
+		-e "s:^\(ARCH\s*=\).*:\1 $(tc-getAR):" \
+		-e "s:^\(RANLIB\s*=\).*:\1 $(tc-getRANLIB):" \
+		-e "s:^\(LOADER\s*=\).*:\1 $(tc-getCC):" \
+		-e "s:^\(LOADOPTS\s*=\).*:\1 ${LDFLAGS} \$(LDTHREADS):" \
+		-e "/MPLIB/d" \
+		-e "s:^\(BLASLIB\s*=\).*:\1 $($(tc-getPKG_CONFIG) --libs blas):" \
+		make.inc || die
+	SONAME=libsuperlu_mt.so.0	# also used by src_compile and src_test
+	sed -i \
+		-e 's|../make.inc|make.inc|' \
+		-e "s|../SRC|${EPREFIX}/usr/include/${PN}|" \
+		-e '/:.*$(SUPERLULIB)/s|../lib/$(SUPERLULIB)||g' \
+		-e 's|../lib/$(SUPERLULIB)|-lsuperlu_mt|g' \
+		EXAMPLE/Makefile || die
+}
+
+src_compile() {	# shared library first, optional static archive second
+	emake superlulib \
+		PIC="-fPIC" ARCH="echo" ARCHFLAGS="" RANLIB="echo"
+	$(tc-getCC) ${LDFLAGS} ${LDTHREADS} -shared -Wl,-soname=${SONAME} SRC/*.o \
+		$($(tc-getPKG_CONFIG) --libs blas) -lm -o lib/${SONAME} || die	# link the -fPIC objects by hand
+	ln -s ${SONAME} lib/libsuperlu_mt.so || die
+	# Static pass: discard the -fPIC objects and rebuild with the real archiver and ranlib.
+	use static-libs && rm -f SRC/*.o &&	emake superlulib \
+		PIC="" ARCH="$(tc-getAR)" ARCHFLAGS="cr" RANLIB="$(tc-getRANLIB)"
+}
+
+src_test() {
+	# tmglib is built with -j1 first; the suite then loads the library from ${S}/lib.
+	emake -j1 tmglib
+	LD_LIBRARY_PATH="${S}/lib:${LD_LIBRARY_PATH}" emake SUPERLULIB="${SONAME}" testing
+}
+
+src_install() {	# libraries, headers, docs, and optionally the example sources
+	dolib.so lib/*so*
+	use static-libs && dolib.a lib/*.a
+	insinto /usr/include/${PN}	# same directory src_configure wrote into EXAMPLE/Makefile
+	doins SRC/*.h
+	dodoc README
+	use doc && dodoc DOC/ug.pdf
+	if use examples; then
+		insinto /usr/share/doc/${PF}/examples
+		doins -r EXAMPLE/* make.inc	# make.inc is included by the rewritten EXAMPLE/Makefile
+	fi
+}